chore: add SharePoint sync environment variables to integration test (#5197)

* chore: add SharePoint sync environment variables to integration test workflows * fix cubic comments * test: skip SharePoint permission tests for non-enterprise * test: update SharePoint permission tests to skip for non-enterprise environments
fix sharepoint tests (#5209)
2026-02-16 23:35:46 +00:00 · 2025-08-18 03:21:04 +00:00 · 2025-08-17 22:25:47 +00:00 · 2025-08-16 01:39:16 +00:00 · 2025-08-15 18:19:42 -07:00 · 2025-08-15 15:43:01 -07:00
1119 changed files with 77388 additions and 20462 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1 +1,3 @@
 * @onyx-dot-app/onyx-core-team
+# Helm charts Owners
+/helm/ @justin-tahara
--- a/.github/actions/custom-build-and-push/action.yml
+++ b/.github/actions/custom-build-and-push/action.yml
@@ -25,6 +25,10 @@ inputs:
  tags:
    description: 'Image tags'
    required: true
+  no-cache:
+    description: 'Do not use the build cache when building the image'
+    required: false
+    default: 'false'
  cache-from:
    description: 'Cache sources'
    required: false
@@ -55,6 +59,7 @@ runs:
        push: ${{ inputs.push }}
        load: ${{ inputs.load }}
        tags: ${{ inputs.tags }}
+        no-cache: ${{ inputs.no-cache }}
        cache-from: ${{ inputs.cache-from }}
        cache-to: ${{ inputs.cache-to }}

@@ -77,6 +82,7 @@ runs:
        push: ${{ inputs.push }}
        load: ${{ inputs.load }}
        tags: ${{ inputs.tags }}
+        no-cache: ${{ inputs.no-cache }}
        cache-from: ${{ inputs.cache-from }}
        cache-to: ${{ inputs.cache-to }}

@@ -99,6 +105,7 @@ runs:
        push: ${{ inputs.push }}
        load: ${{ inputs.load }}
        tags: ${{ inputs.tags }}
+        no-cache: ${{ inputs.no-cache }}
        cache-from: ${{ inputs.cache-from }}
        cache-to: ${{ inputs.cache-to }}

--- a/.github/workflows/docker-build-push-backend-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-backend-container-on-tag.yml
@@ -7,18 +7,47 @@ on:

 env:
  REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'onyxdotapp/onyx-backend-cloud' || 'onyxdotapp/onyx-backend' }}
-  LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
+  DEPLOYMENT: ${{ contains(github.ref_name, 'cloud') && 'cloud' || 'standalone' }}
+  
+  # don't tag cloud images with "latest"
+  LATEST_TAG: ${{ contains(github.ref_name, 'latest') && !contains(github.ref_name, 'cloud') }}

 jobs:
  build-and-push:
    # TODO: investigate a matrix build like the web container
    # See https://runs-on.com/runners/linux/
-    runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"]
-
+    runs-on:
+      - runs-on
+      - runner=${{ matrix.platform == 'linux/amd64' && '8cpu-linux-x64' || '8cpu-linux-arm64' }}
+      - run-id=${{ github.run_id }}
+      - tag=platform-${{ matrix.platform }}
+    strategy:
+      fail-fast: false
+      matrix:
+        platform:
+          - linux/amd64
+          - linux/arm64
+          
    steps:
+      - name: Prepare
+        run: |
+          platform=${{ matrix.platform }}
+          echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
+          
      - name: Checkout code
        uses: actions/checkout@v4

+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY_IMAGE }}
+          flavor: |
+            latest=false
+          tags: |
+            type=raw,value=${{ github.ref_name }}
+            type=raw,value=${{ env.LATEST_TAG == 'true' && 'latest' || '' }}
+            
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

@@ -34,18 +63,80 @@ jobs:
          sudo apt-get install -y build-essential

      - name: Backend Image Docker Build and Push
-        uses: docker/build-push-action@v5
+        id: build
+        uses: docker/build-push-action@v6
        with:
          context: ./backend
          file: ./backend/Dockerfile
-          platforms: linux/amd64,linux/arm64
+          platforms: ${{ matrix.platform }}
          push: true
-          tags: |
-            ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
-            ${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
          build-args: |
            ONYX_VERSION=${{ github.ref_name }}
+          labels: ${{ steps.meta.outputs.labels }}
+          outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/backend-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/backend-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max

+      - name: Export digest      
+        run: |
+          mkdir -p /tmp/digests
+          digest="${{ steps.build.outputs.digest }}"
+          touch "/tmp/digests/${digest#sha256:}"
+
+      - name: Upload digest
+        uses: actions/upload-artifact@v4
+        with:
+          name: backend-digests-${{ env.PLATFORM_PAIR }}-${{ github.run_id }}
+          path: /tmp/digests/*
+          if-no-files-found: error
+          retention-days: 1
+          
+  merge:
+    runs-on: ubuntu-latest
+    needs:
+      - build-and-push
+    steps:
+      # Needed for trivyignore
+      - name: Checkout
+        uses: actions/checkout@v4
+        
+      - name: Download digests
+        uses: actions/download-artifact@v4
+        with:
+          path: /tmp/digests
+          pattern: backend-digests-*-${{ github.run_id }}
+          merge-multiple: true
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY_IMAGE }}
+          flavor: |
+            latest=false
+          tags: |
+            type=raw,value=${{ github.ref_name }}
+            type=raw,value=${{ env.LATEST_TAG == 'true' && 'latest' || '' }}
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_TOKEN }}
+
+      - name: Create manifest list and push
+        working-directory: /tmp/digests
+        run: |
+          docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
+            $(printf '${{ env.REGISTRY_IMAGE }}@sha256:%s ' *)
+
+      - name: Inspect image
+        run: |
+          docker buildx imagetools inspect ${{ env.REGISTRY_IMAGE }}:${{ steps.meta.outputs.version }}
+          
      # trivy has their own rate limiting issues causing this action to flake
      # we worked around it by hardcoding to different db repos in env
      # can re-enable when they figure it out
@@ -56,6 +147,8 @@ jobs:
        env:
          TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
          TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
+          TRIVY_USERNAME: ${{ secrets.DOCKER_USERNAME }}
+          TRIVY_PASSWORD: ${{ secrets.DOCKER_TOKEN }}
        with:
          # To run locally: trivy image --severity HIGH,CRITICAL onyxdotapp/onyx-backend
          image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
--- a/.github/workflows/docker-build-push-cloud-web-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-cloud-web-container-on-tag.yml
@@ -4,12 +4,12 @@ name: Build and Push Cloud Web Image on Tag
 on:
  push:
    tags:
-      - "*"
+      - "*cloud*"

 env:
  REGISTRY_IMAGE: onyxdotapp/onyx-web-server-cloud
-  LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
-
+  DEPLOYMENT: cloud
+  
 jobs:
  build:
    runs-on:
@@ -38,9 +38,10 @@ jobs:
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY_IMAGE }}
+          flavor: |
+            latest=false
          tags: |
-            type=raw,value=${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
-            type=raw,value=${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
+            type=raw,value=${{ github.ref_name }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
@@ -53,7 +54,7 @@ jobs:

      - name: Build and push by digest
        id: build
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: ./web
          file: ./web/Dockerfile
@@ -70,10 +71,12 @@ jobs:
            NEXT_PUBLIC_FORGOT_PASSWORD_ENABLED=true
            NEXT_PUBLIC_INCLUDE_ERROR_POPUP_SUPPORT_LINK=true
            NODE_OPTIONS=--max-old-space-size=8192
-          # needed due to weird interactions with the builds for different platforms
-          no-cache: true
          labels: ${{ steps.meta.outputs.labels }}
          outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/cloudweb-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/cloudweb-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+          # no-cache needed due to weird interactions with the builds for different platforms
+          # NOTE(rkuo): this may not be true any more with the proper cache prefixing by architecture - currently testing with it off

      - name: Export digest
        run: |
@@ -84,7 +87,7 @@ jobs:
      - name: Upload digest
        uses: actions/upload-artifact@v4
        with:
-          name: digests-${{ env.PLATFORM_PAIR }}
+          name: cloudweb-digests-${{ env.PLATFORM_PAIR }}-${{ github.run_id }}
          path: /tmp/digests/*
          if-no-files-found: error
          retention-days: 1
@@ -98,7 +101,7 @@ jobs:
        uses: actions/download-artifact@v4
        with:
          path: /tmp/digests
-          pattern: digests-*
+          pattern: cloudweb-digests-*-${{ github.run_id }}
          merge-multiple: true

      - name: Set up Docker Buildx
@@ -109,6 +112,10 @@ jobs:
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY_IMAGE }}
+          flavor: |
+            latest=false
+          tags: |
+            type=raw,value=${{ github.ref_name }}

      - name: Login to Docker Hub
        uses: docker/login-action@v3
@@ -136,6 +143,8 @@ jobs:
        env:
          TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
          TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
+          TRIVY_USERNAME: ${{ secrets.DOCKER_USERNAME }}
+          TRIVY_PASSWORD: ${{ secrets.DOCKER_TOKEN }}
        with:
          image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
          severity: "CRITICAL,HIGH"
--- a/.github/workflows/docker-build-push-model-server-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-model-server-container-on-tag.yml
@@ -7,10 +7,13 @@ on:

 env:
  REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'onyxdotapp/onyx-model-server-cloud' || 'onyxdotapp/onyx-model-server' }}
-  LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
  DOCKER_BUILDKIT: 1
  BUILDKIT_PROGRESS: plain
+  DEPLOYMENT: ${{ contains(github.ref_name, 'cloud') && 'cloud' || 'standalone' }}

+  # don't tag cloud images with "latest"
+  LATEST_TAG: ${{ contains(github.ref_name, 'latest') && !contains(github.ref_name, 'cloud') }}
+  
 jobs:

 #   Bypassing this for now as the idea of not building is glitching
@@ -51,6 +54,8 @@ jobs:
    if: needs.check_model_server_changes.outputs.changed == 'true'
    runs-on:
      [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}-amd64"]
+    env:
+      PLATFORM_PAIR: linux-amd64
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
@@ -75,7 +80,7 @@ jobs:
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and Push AMD64
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: ./backend
          file: ./backend/Dockerfile.model_server
@@ -86,12 +91,17 @@ jobs:
            DANSWER_VERSION=${{ github.ref_name }}
          outputs: type=registry
          provenance: false
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+#           no-cache: true

  build-arm64:
    needs: [check_model_server_changes]
    if: needs.check_model_server_changes.outputs.changed == 'true'
    runs-on:
      [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}-arm64"]
+    env:
+      PLATFORM_PAIR: linux-arm64
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
@@ -116,7 +126,7 @@ jobs:
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and Push ARM64
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: ./backend
          file: ./backend/Dockerfile.model_server
@@ -127,6 +137,8 @@ jobs:
            DANSWER_VERSION=${{ github.ref_name }}
          outputs: type=registry
          provenance: false
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max

  merge-and-scan:
    needs: [build-amd64, build-arm64, check_model_server_changes]
@@ -156,6 +168,8 @@ jobs:
        env:
          TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
          TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
+          TRIVY_USERNAME: ${{ secrets.DOCKER_USERNAME }}
+          TRIVY_PASSWORD: ${{ secrets.DOCKER_TOKEN }}
        with:
          image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
          severity: "CRITICAL,HIGH"
--- a/.github/workflows/docker-build-push-web-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-web-container-on-tag.yml
@@ -8,9 +8,25 @@ on:
 env:
  REGISTRY_IMAGE: onyxdotapp/onyx-web-server
  LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
+  DEPLOYMENT: standalone

 jobs:
+  precheck:
+    runs-on: [runs-on, runner=2cpu-linux-x64, "run-id=${{ github.run_id }}"]
+    outputs:
+      should-run: ${{ steps.set-output.outputs.should-run }}
+    steps:
+      - name: Check if tag contains "cloud"
+        id: set-output
+        run: |
+          if [[ "${{ github.ref_name }}" == *cloud* ]]; then
+            echo "should-run=false" >> "$GITHUB_OUTPUT"
+          else
+            echo "should-run=true" >> "$GITHUB_OUTPUT"
+          fi
  build:
+    needs: precheck
+    if: needs.precheck.outputs.should-run == 'true'
    runs-on:
      - runs-on
      - runner=${{ matrix.platform == 'linux/amd64' && '8cpu-linux-x64' || '8cpu-linux-arm64' }}
@@ -37,9 +53,11 @@ jobs:
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY_IMAGE }}
+          flavor: |
+            latest=false
          tags: |
-            type=raw,value=${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
-            type=raw,value=${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
+            type=raw,value=${{ github.ref_name }}
+            type=raw,value=${{ env.LATEST_TAG == 'true' && 'latest' || '' }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
@@ -52,7 +70,7 @@ jobs:

      - name: Build and push by digest
        id: build
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
        with:
          context: ./web
          file: ./web/Dockerfile
@@ -62,11 +80,13 @@ jobs:
            ONYX_VERSION=${{ github.ref_name }}
            NODE_OPTIONS=--max-old-space-size=8192

-          # needed due to weird interactions with the builds for different platforms
-          no-cache: true
          labels: ${{ steps.meta.outputs.labels }}
          outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true
-
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/web-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/web-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+          # no-cache needed due to weird interactions with the builds for different platforms
+          # NOTE(rkuo): this may not be true any more with the proper cache prefixing by architecture - currently testing with it off
+          
      - name: Export digest
        run: |
          mkdir -p /tmp/digests
@@ -76,21 +96,24 @@ jobs:
      - name: Upload digest
        uses: actions/upload-artifact@v4
        with:
-          name: digests-${{ env.PLATFORM_PAIR }}
+          name: web-digests-${{ env.PLATFORM_PAIR }}-${{ github.run_id }}
          path: /tmp/digests/*
          if-no-files-found: error
          retention-days: 1

  merge:
-    runs-on: ubuntu-latest
    needs:
+      # The needs context only exposes direct dependencies, so precheck must be listed for the condition below to see its output.
+      - precheck
      - build
+    if: needs.precheck.outputs.should-run == 'true'
+    runs-on: ubuntu-latest
    steps:
      - name: Download digests
        uses: actions/download-artifact@v4
        with:
          path: /tmp/digests
-          pattern: digests-*
+          pattern: web-digests-*-${{ github.run_id }}
          merge-multiple: true

      - name: Set up Docker Buildx
@@ -101,6 +122,11 @@ jobs:
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY_IMAGE }}
+          flavor: |
+            latest=false
+          tags: |
+            type=raw,value=${{ github.ref_name }}
+            type=raw,value=${{ env.LATEST_TAG == 'true' && 'latest' || '' }}

      - name: Login to Docker Hub
        uses: docker/login-action@v3
@@ -128,6 +154,8 @@ jobs:
        env:
          TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
          TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
+          TRIVY_USERNAME: ${{ secrets.DOCKER_USERNAME }}
+          TRIVY_PASSWORD: ${{ secrets.DOCKER_TOKEN }}
        with:
          image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
          severity: "CRITICAL,HIGH"
--- a/.github/workflows/helm-chart-releases.yml
+++ b/.github/workflows/helm-chart-releases.yml
@@ -0,0 +1,55 @@
+name: Release Onyx Helm Charts
+
+# Runs on every push to main.
+on:
+  push:
+    branches:
+      - main
+
+# Read-only by default; the release job below opts in to the write access it needs.
+permissions:
+  contents: read
+
+jobs:
+  release:
+    permissions:
+      # The publish step pushes to the gh-pages branch with GITHUB_TOKEN.
+      contents: write
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          # 0 fetches the full history instead of a shallow clone.
+          fetch-depth: 0
+
+      - name: Install Helm CLI
+        uses: azure/setup-helm@v4
+        with:
+          version: v3.12.1
+
+      - name: Add required Helm repositories
+        run: |
+          helm repo add bitnami https://charts.bitnami.com/bitnami
+          helm repo add onyx-vespa https://onyx-dot-app.github.io/vespa-helm-charts
+          helm repo update
+
+      - name: Build chart dependencies
+        run: |
+          set -euo pipefail
+          # Only directories that contain a Chart.yaml are treated as charts.
+          for chart_dir in deployment/helm/charts/*; do
+            if [ -f "$chart_dir/Chart.yaml" ]; then
+              echo "Building dependencies for $chart_dir"
+              helm dependency build "$chart_dir"
+            fi
+          done
+
+      - name: Publish Helm charts to gh-pages
+        uses: stefanprodan/helm-gh-pages@v1.7.0
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          charts_dir: deployment/helm/charts
+          branch: gh-pages
+          commit_username: ${{ github.actor }}
+          commit_email: ${{ github.actor }}@users.noreply.github.com
--- a/.github/workflows/pr-external-dependency-unit-tests.yml
+++ b/.github/workflows/pr-external-dependency-unit-tests.yml
@@ -0,0 +1,98 @@
+name: External Dependency Unit Tests
+
+on:
+  merge_group:
+  pull_request:
+    branches: [main]
+
+env:
+  # AWS
+  S3_AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }}
+  S3_AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }}
+
+  # MinIO
+  S3_ENDPOINT_URL: "http://localhost:9004"
+
+  # Confluence
+  CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
+  CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
+  CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
+  CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
+  CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
+  CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
+
+jobs:
+  # Builds the job matrix: one entry per subdirectory of backend/tests/external_dependency_unit.
+  discover-test-dirs:
+    runs-on: ubuntu-latest
+    outputs:
+      test-dirs: ${{ steps.set-matrix.outputs.test-dirs }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Discover test directories
+        id: set-matrix
+        run: |
+          # Find all subdirectories in backend/tests/external_dependency_unit
+          # jq turns the sorted, newline-separated names into a compact JSON array (dropping the trailing empty entry)
+          dirs=$(find backend/tests/external_dependency_unit -mindepth 1 -maxdepth 1 -type d -exec basename {} \; | sort | jq -R -s -c 'split("\n")[:-1]')
+          echo "test-dirs=$dirs" >> "$GITHUB_OUTPUT"
+
+  # Runs each discovered test directory as its own matrix job.
+  external-dependency-unit-tests:
+    needs: discover-test-dirs
+    # See https://runs-on.com/runners/linux/
+    runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"]
+
+    strategy:
+      fail-fast: false
+      matrix:
+        test-dir: ${{ fromJson(needs.discover-test-dirs.outputs.test-dirs) }}
+
+    env:
+      PYTHONPATH: ./backend
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: "pip"
+          cache-dependency-path: |
+            backend/requirements/default.txt
+            backend/requirements/dev.txt
+
+      - name: Install Dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
+          pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+          playwright install chromium
+          playwright install-deps chromium
+
+      - name: Set up Standard Dependencies
+        run: |
+          cd deployment/docker_compose
+          docker compose -f docker-compose.dev.yml -p onyx-stack up -d minio relational_db cache index
+
+      - name: Run migrations
+        run: |
+          cd backend
+          alembic upgrade head
+
+      - name: Run Tests for ${{ matrix.test-dir }}
+        # NOTE(review): bash is wrapped in `script`, presumably to give the tests a pseudo-terminal - confirm
+        shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
+        run: |
+          py.test \
+            -n 8 \
+            --dist loadfile \
+            --durations=8 \
+            -o junit_family=xunit2 \
+            -xv \
+            --ff \
+            backend/tests/external_dependency_unit/${{ matrix.test-dir }}
--- a/.github/workflows/pr-helm-chart-testing.yml
+++ b/.github/workflows/pr-helm-chart-testing.yml
@@ -37,6 +37,11 @@ jobs:
          echo "changed=true" >> "$GITHUB_OUTPUT"
        fi

+    # uncomment to force run chart-testing
+#     - name: Force run chart-testing (list-changed)
+#       id: list-changed
+#       run: echo "changed=true" >> $GITHUB_OUTPUT
+        
    # lint all charts if any changes were detected
    - name: Run chart-testing (lint)
      if: steps.list-changed.outputs.changed == 'true'
@@ -50,7 +55,28 @@ jobs:

    - name: Run chart-testing (install)
      if: steps.list-changed.outputs.changed == 'true'
-      run: ct install --all --helm-extra-set-args="--set=nginx.enabled=false" --debug --config ct.yaml
+      # Use a literal block scalar: in a plain multi-line scalar YAML folds the
+      # newlines into spaces, leaving stray backslashes in the command.
+      run: |
+        ct install --all \
+          --helm-extra-set-args="\
+            --set=nginx.enabled=false \
+            --set=postgresql.enabled=false \
+            --set=redis.enabled=false \
+            --set=minio.enabled=false \
+            --set=vespa.enabled=false \
+            --set=slackbot.enabled=false \
+            --set=api.replicaCount=0 \
+            --set=inferenceCapability.replicaCount=0 \
+            --set=indexCapability.replicaCount=0 \
+            --set=celery_beat.replicaCount=0 \
+            --set=celery_worker_heavy.replicaCount=0 \
+            --set=celery_worker_docprocessing.replicaCount=0 \
+            --set=celery_worker_light.replicaCount=0 \
+            --set=celery_worker_monitoring.replicaCount=0 \
+            --set=celery_worker_primary.replicaCount=0 \
+            --set=celery_worker_user_files_indexing.replicaCount=0" \
+          --debug --config ct.yaml
      # the following would install only changed charts, but we only have one chart so 
      # don't worry about that for now
      # run: ct install --target-branch ${{ github.event.repository.default_branch }}
--- a/.github/workflows/pr-integration-tests.yml
+++ b/.github/workflows/pr-integration-tests.yml
@@ -16,15 +16,62 @@ env:
  CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
  CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
  CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
+  JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
+  JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
+  JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
+  PERM_SYNC_SHAREPOINT_CLIENT_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_CLIENT_ID }}
+  PERM_SYNC_SHAREPOINT_PRIVATE_KEY: ${{ secrets.PERM_SYNC_SHAREPOINT_PRIVATE_KEY }}
+  PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD: ${{ secrets.PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD }}
+  PERM_SYNC_SHAREPOINT_DIRECTORY_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_DIRECTORY_ID }}
+  PLATFORM_PAIR: linux-amd64

 jobs:
  integration-tests:
    # See https://runs-on.com/runners/linux/
-    runs-on: [runs-on, runner=32cpu-linux-x64, "run-id=${{ github.run_id }}"]
+    runs-on:
+      [
+        runs-on,
+        runner=32cpu-linux-x64,
+        disk=large,
+        "run-id=${{ github.run_id }}",
+      ]
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: "pip"
+          cache-dependency-path: |
+            backend/requirements/default.txt
+            backend/requirements/dev.txt
+            backend/requirements/ee.txt
+      - run: |
+          python -m pip install --upgrade pip
+          pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
+          pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+          pip install --retries 5 --timeout 30 -r backend/requirements/ee.txt
+
+      - name: Generate OpenAPI schema
+        working-directory: ./backend
+        env:
+          PYTHONPATH: "."
+        run: |
+          python scripts/onyx_openapi_schema.py --filename generated/openapi.json
+
+      - name: Generate OpenAPI Python client
+        working-directory: ./backend
+        run: |
+          docker run --rm \
+            -v "${{ github.workspace }}/backend/generated:/local" \
+            openapitools/openapi-generator-cli generate \
+            -i /local/openapi.json \
+            -g python \
+            -o /local/onyx_openapi_client \
+            --package-name onyx_openapi_client
+
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

@@ -61,8 +108,8 @@ jobs:
          tags: onyxdotapp/onyx-backend:test
          push: false
          load: true
-          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
-          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max

      - name: Build Model Server Docker image
        uses: ./.github/actions/custom-build-and-push
@@ -73,8 +120,8 @@ jobs:
          tags: onyxdotapp/onyx-model-server:test
          push: false
          load: true
-          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
-          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max

      - name: Build integration test Docker image
        uses: ./.github/actions/custom-build-and-push
@@ -85,8 +132,8 @@ jobs:
          tags: onyxdotapp/onyx-integration:test
          push: false
          load: true
-          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
-          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max

      # Start containers for multi-tenant tests
      - name: Start Docker containers for multi-tenant tests
@@ -113,6 +160,8 @@ jobs:
            -e POSTGRES_HOST=relational_db \
            -e POSTGRES_USER=postgres \
            -e POSTGRES_PASSWORD=password \
+            -e DB_READONLY_USER=db_readonly_user \
+            -e DB_READONLY_PASSWORD=password \
            -e POSTGRES_DB=postgres \
            -e POSTGRES_USE_NULL_POOL=true \
            -e VESPA_HOST=index \
@@ -158,6 +207,7 @@ jobs:
          DISABLE_TELEMETRY=true \
          IMAGE_TAG=test \
          INTEGRATION_TESTS_MODE=true \
+          CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS=0.001 \
          docker compose -f docker-compose.dev.yml -p onyx-stack up -d
        id: start_docker

@@ -210,6 +260,8 @@ jobs:
            -e POSTGRES_HOST=relational_db \
            -e POSTGRES_USER=postgres \
            -e POSTGRES_PASSWORD=password \
+            -e DB_READONLY_USER=db_readonly_user \
+            -e DB_READONLY_PASSWORD=password \
            -e POSTGRES_DB=postgres \
            -e POSTGRES_POOL_PRE_PING=true \
            -e POSTGRES_USE_NULL_POOL=true \
@@ -221,6 +273,13 @@ jobs:
            -e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
            -e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
            -e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
+            -e JIRA_BASE_URL="${JIRA_BASE_URL}" \
+            -e JIRA_USER_EMAIL="${JIRA_USER_EMAIL}" \
+            -e JIRA_API_TOKEN="${JIRA_API_TOKEN}" \
+            -e PERM_SYNC_SHAREPOINT_CLIENT_ID="${PERM_SYNC_SHAREPOINT_CLIENT_ID}" \
+            -e PERM_SYNC_SHAREPOINT_PRIVATE_KEY="${PERM_SYNC_SHAREPOINT_PRIVATE_KEY}" \
+            -e PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD="${PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD}" \
+            -e PERM_SYNC_SHAREPOINT_DIRECTORY_ID="${PERM_SYNC_SHAREPOINT_DIRECTORY_ID}" \
            -e TEST_WEB_HOSTNAME=test-runner \
            -e MOCK_CONNECTOR_SERVER_HOST=mock_connector_server \
            -e MOCK_CONNECTOR_SERVER_PORT=8001 \
--- a/.github/workflows/pr-labeler.yml
+++ b/.github/workflows/pr-labeler.yml
@@ -0,0 +1,38 @@
+name: PR Labeler
+
+on:
+  pull_request_target:
+    branches:
+      - main
+    types:
+      - opened
+      - reopened
+      - synchronize
+      - edited
+
+permissions:
+  contents: read
+  pull-requests: read  # the title comes from the event payload; nothing here writes to the PR
+
+jobs:
+  validate_pr_title:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check PR title for Conventional Commits
+        env:
+          PR_TITLE: ${{ github.event.pull_request.title }}  # passed via env so the title is never interpolated into the script
+        run: |
+          echo "PR Title: $PR_TITLE"
+          if [[ ! "$PR_TITLE" =~ ^(feat|fix|docs|test|ci|refactor|perf|chore|revert|build)(\(.+\))?:\ .+ ]]; then
+            echo "::error::❌ Your PR title does not follow the Conventional Commits format.
+              This check ensures that all pull requests use clear, consistent titles that help automate changelogs and improve project history.
+
+              Please update your PR title to follow the Conventional Commits style.
+              Here is a link to a blog explaining the reason why we've included the Conventional Commits style into our PR titles: https://xfuture-blog.com/working-with-conventional-commits
+
+              **Here are some examples of valid PR titles:**
+              - feat: add user authentication
+              - fix(login): handle null password error
+              - docs(readme): update installation instructions"
+            exit 1
+          fi
--- a/.github/workflows/pr-mit-integration-tests.yml
+++ b/.github/workflows/pr-mit-integration-tests.yml
@@ -16,15 +16,59 @@ env:
  CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
  CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
  CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
-
+  JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
+  JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
+  JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
+  PERM_SYNC_SHAREPOINT_CLIENT_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_CLIENT_ID }}
+  PERM_SYNC_SHAREPOINT_PRIVATE_KEY: ${{ secrets.PERM_SYNC_SHAREPOINT_PRIVATE_KEY }}
+  PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD: ${{ secrets.PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD }}
+  PERM_SYNC_SHAREPOINT_DIRECTORY_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_DIRECTORY_ID }}
+  PLATFORM_PAIR: linux-amd64
 jobs:
  integration-tests-mit:
    # See https://runs-on.com/runners/linux/
-    runs-on: [runs-on, runner=32cpu-linux-x64, "run-id=${{ github.run_id }}"]
+    runs-on:
+      [
+        runs-on,
+        runner=32cpu-linux-x64,
+        disk=large,
+        "run-id=${{ github.run_id }}",
+      ]
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
+        
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: "pip"
+          cache-dependency-path: |
+            backend/requirements/default.txt
+            backend/requirements/dev.txt
+      - run: |
+          python -m pip install --upgrade pip
+          pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
+          pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt

+      - name: Generate OpenAPI schema
+        working-directory: ./backend
+        env:
+          PYTHONPATH: "."
+        run: |
+          python scripts/onyx_openapi_schema.py --filename generated/openapi.json
+
+      - name: Generate OpenAPI Python client
+        working-directory: ./backend
+        run: |
+          docker run --rm \
+            -v "${{ github.workspace }}/backend/generated:/local" \
+            openapitools/openapi-generator-cli generate \
+            -i /local/openapi.json \
+            -g python \
+            -o /local/onyx_openapi_client \
+            --package-name onyx_openapi_client
+            
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

@@ -61,8 +105,8 @@ jobs:
          tags: onyxdotapp/onyx-backend:test
          push: false
          load: true
-          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
-          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/mit-integration-tests/backend-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/mit-integration-tests/backend-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max

      - name: Build Model Server Docker image
        uses: ./.github/actions/custom-build-and-push
@@ -73,8 +117,8 @@ jobs:
          tags: onyxdotapp/onyx-model-server:test
          push: false
          load: true
-          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
-          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/mit-integration-tests/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/mit-integration-tests/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max

      - name: Build integration test Docker image
        uses: ./.github/actions/custom-build-and-push
@@ -85,8 +129,8 @@ jobs:
          tags: onyxdotapp/onyx-integration:test
          push: false
          load: true
-          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
-          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/mit-integration-tests/integration-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/mit-integration-tests/integration-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max

      # NOTE: Use pre-ping/null pool to reduce flakiness due to dropped connections
      - name: Start Docker containers
@@ -152,6 +196,8 @@ jobs:
            -e POSTGRES_USER=postgres \
            -e POSTGRES_PASSWORD=password \
            -e POSTGRES_DB=postgres \
+            -e DB_READONLY_USER=db_readonly_user \
+            -e DB_READONLY_PASSWORD=password \
            -e POSTGRES_POOL_PRE_PING=true \
            -e POSTGRES_USE_NULL_POOL=true \
            -e VESPA_HOST=index \
@@ -162,6 +208,13 @@ jobs:
            -e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
            -e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
            -e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
+            -e JIRA_BASE_URL="${JIRA_BASE_URL}" \
+            -e JIRA_USER_EMAIL="${JIRA_USER_EMAIL}" \
+            -e JIRA_API_TOKEN="${JIRA_API_TOKEN}" \
+            -e PERM_SYNC_SHAREPOINT_CLIENT_ID="${PERM_SYNC_SHAREPOINT_CLIENT_ID}" \
+            -e PERM_SYNC_SHAREPOINT_PRIVATE_KEY="${PERM_SYNC_SHAREPOINT_PRIVATE_KEY}" \
+            -e PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD="${PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD}" \
+            -e PERM_SYNC_SHAREPOINT_DIRECTORY_ID="${PERM_SYNC_SHAREPOINT_DIRECTORY_ID}" \
            -e TEST_WEB_HOSTNAME=test-runner \
            -e MOCK_CONNECTOR_SERVER_HOST=mock_connector_server \
            -e MOCK_CONNECTOR_SERVER_PORT=8001 \
--- a/.github/workflows/pr-playwright-tests.yml
+++ b/.github/workflows/pr-playwright-tests.yml
@@ -10,6 +10,7 @@ env:
  SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
  GEN_AI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  MOCK_LLM_RESPONSE: true
+  PYTEST_PLAYWRIGHT_SKIP_INITIAL_RESET: true

 jobs:
  playwright-tests:
--- a/.github/workflows/pr-python-checks.yml
+++ b/.github/workflows/pr-python-checks.yml
@@ -31,16 +31,29 @@ jobs:
        pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
        pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt

+    - name: Generate OpenAPI schema
+      working-directory: ./backend
+      env:
+        PYTHONPATH: "."
+      run: |
+        python scripts/onyx_openapi_schema.py --filename generated/openapi.json
+
+    - name: Generate OpenAPI Python client
+      working-directory: ./backend
+      run: |
+        docker run --rm \
+          -v "${{ github.workspace }}/backend/generated:/local" \
+          openapitools/openapi-generator-cli generate \
+          -i /local/openapi.json \
+          -g python \
+          -o /local/onyx_openapi_client \
+          --package-name onyx_openapi_client
+
    - name: Run MyPy
      run: |
        cd backend
        mypy .

-    - name: Run ruff
-      run: |
-        cd backend
-        ruff .
-
    - name: Check import order with reorder-python-imports
      run: |
        cd backend
--- a/.github/workflows/pr-python-connector-tests.yml
+++ b/.github/workflows/pr-python-connector-tests.yml
@@ -12,18 +12,21 @@ env:
  # AWS
  AWS_ACCESS_KEY_ID_DAILY_CONNECTOR_TESTS: ${{ secrets.AWS_ACCESS_KEY_ID_DAILY_CONNECTOR_TESTS }}
  AWS_SECRET_ACCESS_KEY_DAILY_CONNECTOR_TESTS: ${{ secrets.AWS_SECRET_ACCESS_KEY_DAILY_CONNECTOR_TESTS }}
-  
+
  # Confluence
  CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
  CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
-  CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
  CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
+  CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
  CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
  CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
+
  # Jira
+  JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
  JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
  JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}

+  # Gong
  GONG_ACCESS_KEY: ${{ secrets.GONG_ACCESS_KEY }}
  GONG_ACCESS_KEY_SECRET: ${{ secrets.GONG_ACCESS_KEY_SECRET }}

@@ -33,37 +36,66 @@ env:
  GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR }}
  GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR }}
  GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR }}
+
  # Slab
  SLAB_BOT_TOKEN: ${{ secrets.SLAB_BOT_TOKEN }}
+
  # Zendesk
  ZENDESK_SUBDOMAIN: ${{ secrets.ZENDESK_SUBDOMAIN }}
  ZENDESK_EMAIL: ${{ secrets.ZENDESK_EMAIL }}
  ZENDESK_TOKEN: ${{ secrets.ZENDESK_TOKEN }}
+
  # Salesforce
  SF_USERNAME: ${{ secrets.SF_USERNAME }}
  SF_PASSWORD: ${{ secrets.SF_PASSWORD }}
  SF_SECURITY_TOKEN: ${{ secrets.SF_SECURITY_TOKEN }}
+
+  # Hubspot
+  HUBSPOT_ACCESS_TOKEN: ${{ secrets.HUBSPOT_ACCESS_TOKEN }}
+
+  # IMAP
+  IMAP_HOST: ${{ secrets.IMAP_HOST }}
+  IMAP_USERNAME: ${{ secrets.IMAP_USERNAME }}
+  IMAP_PASSWORD: ${{ secrets.IMAP_PASSWORD }}
+  IMAP_MAILBOXES: ${{ secrets.IMAP_MAILBOXES }}
+
  # Airtable
  AIRTABLE_TEST_BASE_ID: ${{ secrets.AIRTABLE_TEST_BASE_ID }}
  AIRTABLE_TEST_TABLE_ID: ${{ secrets.AIRTABLE_TEST_TABLE_ID }}
  AIRTABLE_TEST_TABLE_NAME: ${{ secrets.AIRTABLE_TEST_TABLE_NAME }}
  AIRTABLE_ACCESS_TOKEN: ${{ secrets.AIRTABLE_ACCESS_TOKEN }}
+
  # Sharepoint
  SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
  SHAREPOINT_CLIENT_SECRET: ${{ secrets.SHAREPOINT_CLIENT_SECRET }}
  SHAREPOINT_CLIENT_DIRECTORY_ID: ${{ secrets.SHAREPOINT_CLIENT_DIRECTORY_ID }}
  SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
+
  # Github
  ACCESS_TOKEN_GITHUB: ${{ secrets.ACCESS_TOKEN_GITHUB }}
+
+  # Gitlab
+  GITLAB_ACCESS_TOKEN: ${{ secrets.GITLAB_ACCESS_TOKEN }}
+
  # Gitbook
  GITBOOK_SPACE_ID: ${{ secrets.GITBOOK_SPACE_ID }}
  GITBOOK_API_KEY: ${{ secrets.GITBOOK_API_KEY }}
+
  # Notion
  NOTION_INTEGRATION_TOKEN: ${{ secrets.NOTION_INTEGRATION_TOKEN }}
+
  # Highspot
  HIGHSPOT_KEY: ${{ secrets.HIGHSPOT_KEY }}
  HIGHSPOT_SECRET: ${{ secrets.HIGHSPOT_SECRET }}

+  # Slack
+  SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
+  # Teams
+  TEAMS_APPLICATION_ID: ${{ secrets.TEAMS_APPLICATION_ID }}
+  TEAMS_DIRECTORY_ID: ${{ secrets.TEAMS_DIRECTORY_ID }}
+  TEAMS_SECRET: ${{ secrets.TEAMS_SECRET }}
+
 jobs:
  connectors-check:
    # See https://runs-on.com/runners/linux/
@@ -95,7 +127,15 @@ jobs:

      - name: Run Tests
        shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
-        run: py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/connectors
+        run: |
+          py.test \
+            -n 8 \
+            --dist loadfile \
+            --durations=8 \
+            -o junit_family=xunit2 \
+            -xv \
+            --ff \
+            backend/tests/daily/connectors

      - name: Alert on Failure
        if: failure() && github.event_name == 'schedule'
--- a/.github/workflows/pr-python-tests.yml
+++ b/.github/workflows/pr-python-tests.yml
@@ -15,6 +15,9 @@ jobs:
    env:
      PYTHONPATH: ./backend
      REDIS_CLOUD_PYTEST_PASSWORD: ${{ secrets.REDIS_CLOUD_PYTEST_PASSWORD }}
+      SF_USERNAME: ${{ secrets.SF_USERNAME }}
+      SF_PASSWORD: ${{ secrets.SF_PASSWORD }}
+      SF_SECURITY_TOKEN: ${{ secrets.SF_SECURITY_TOKEN }}
      
    steps:
    - name: Checkout code
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,9 @@
 /web/test-results/
 backend/onyx/agent_search/main/test_data.json
 backend/tests/regression/answer_quality/test_data.json
+backend/tests/regression/search_quality/eval-*
+backend/tests/regression/search_quality/search_eval_config.yaml
+backend/tests/regression/search_quality/*.json

 # secret files
 .env
--- a/.vscode/env_template.txt
+++ b/.vscode/env_template.txt
@@ -45,8 +45,9 @@ PYTHONPATH=../backend
 PYTHONUNBUFFERED=1


-# Internet Search 
+# Internet Search
 BING_API_KEY=<REPLACE THIS>
+EXA_API_KEY=<REPLACE THIS>


 # Enable the full set of Danswer Enterprise Edition features
@@ -58,3 +59,9 @@ AGENT_RETRIEVAL_STATS=False   # Note: This setting will incur substantial re-ran
 AGENT_RERANKING_STATS=True
 AGENT_MAX_QUERY_RETRIEVAL_RESULTS=20
 AGENT_RERANKING_MAX_QUERY_RETRIEVAL_RESULTS=20
+
+# S3 File Store Configuration (MinIO for local development)
+S3_ENDPOINT_URL=http://localhost:9004
+S3_FILE_STORE_BUCKET_NAME=onyx-file-store-bucket
+S3_AWS_ACCESS_KEY_ID=minioadmin
+S3_AWS_SECRET_ACCESS_KEY=minioadmin
--- a/.vscode/launch.template.jsonc
+++ b/.vscode/launch.template.jsonc
@@ -24,8 +24,8 @@
          "Celery primary",
          "Celery light",
          "Celery heavy",
-          "Celery indexing",
-          "Celery user files indexing",
+          "Celery docfetching",
+          "Celery docprocessing",
          "Celery beat",
          "Celery monitoring"
        ],
@@ -46,8 +46,8 @@
          "Celery primary",
          "Celery light",
          "Celery heavy",
-          "Celery indexing",
-          "Celery user files indexing",
+          "Celery docfetching",
+          "Celery docprocessing",
          "Celery beat",
          "Celery monitoring"
        ],
@@ -226,35 +226,66 @@
        "consoleTitle": "Celery heavy Console"
      },
      {
-        "name": "Celery indexing",
+        "name": "Celery docfetching",
        "type": "debugpy",
        "request": "launch",
        "module": "celery",
        "cwd": "${workspaceFolder}/backend",
        "envFile": "${workspaceFolder}/.vscode/.env",
        "env": {
-          "ENABLE_MULTIPASS_INDEXING": "false",
-          "LOG_LEVEL": "DEBUG",
-          "PYTHONUNBUFFERED": "1",
-          "PYTHONPATH": "."
+            "LOG_LEVEL": "DEBUG",
+            "PYTHONUNBUFFERED": "1",
+            "PYTHONPATH": "."
        },
        "args": [
-          "-A",
-          "onyx.background.celery.versioned_apps.indexing",
-          "worker",
-          "--pool=threads",
-          "--concurrency=1",
-          "--prefetch-multiplier=1",
-          "--loglevel=INFO",
-          "--hostname=indexing@%n",
-          "-Q",
-          "connector_indexing"
+            "-A",
+            "onyx.background.celery.versioned_apps.docfetching",
+            "worker",
+            "--pool=threads",
+            "--concurrency=1",
+            "--prefetch-multiplier=1",
+            "--loglevel=INFO",
+            "--hostname=docfetching@%n",
+            "-Q",
+            "connector_doc_fetching,user_files_indexing"
        ],
        "presentation": {
-          "group": "2"
+            "group": "2"
        },
-        "consoleTitle": "Celery indexing Console"
-      },
+        "consoleTitle": "Celery docfetching Console",
+        "justMyCode": false
+    },
+    {
+        "name": "Celery docprocessing",
+        "type": "debugpy",
+        "request": "launch",
+        "module": "celery",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.vscode/.env",
+        "env": {
+            "ENABLE_MULTIPASS_INDEXING": "false",
+            "LOG_LEVEL": "DEBUG",
+            "PYTHONUNBUFFERED": "1",
+            "PYTHONPATH": "."
+        },
+        "args": [
+            "-A",
+            "onyx.background.celery.versioned_apps.docprocessing",
+            "worker",
+            "--pool=threads",
+            "--concurrency=6",
+            "--prefetch-multiplier=1",
+            "--loglevel=INFO",
+            "--hostname=docprocessing@%n",
+            "-Q",
+            "docprocessing"
+        ],
+        "presentation": {
+            "group": "2"
+        },
+        "consoleTitle": "Celery docprocessing Console",
+        "justMyCode": false
+    },
      {
        "name": "Celery monitoring",
        "type": "debugpy",
@@ -303,35 +334,6 @@
        },
        "consoleTitle": "Celery beat Console"
      },
-      {
-        "name": "Celery user files indexing",
-        "type": "debugpy",
-        "request": "launch",
-        "module": "celery",
-        "cwd": "${workspaceFolder}/backend",
-        "envFile": "${workspaceFolder}/.vscode/.env",
-        "env": {
-          "LOG_LEVEL": "DEBUG",
-          "PYTHONUNBUFFERED": "1",
-          "PYTHONPATH": "."
-        },
-        "args": [
-          "-A",
-          "onyx.background.celery.versioned_apps.indexing",
-          "worker",
-          "--pool=threads",
-          "--concurrency=1",
-          "--prefetch-multiplier=1",
-          "--loglevel=INFO",
-          "--hostname=user_files_indexing@%n",
-          "-Q",
-          "user_files_indexing"
-        ],
-        "presentation": {
-          "group": "2"
-        },
-        "consoleTitle": "Celery user files indexing Console"
-      },
      {
        "name": "Pytest",
        "consoleName": "Pytest",
@@ -412,6 +414,46 @@
          "group": "3"
        }
      },
+      {
+        // script to generate the openapi schema
+        "name": "Onyx OpenAPI Schema Generator",
+        "type": "debugpy",
+        "request": "launch",
+        "program": "scripts/onyx_openapi_schema.py",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.env",
+        "env": {
+          "PYTHONUNBUFFERED": "1",
+          "PYTHONPATH": "."
+        },
+        "args": [
+          "--filename",
+          "generated/openapi.json"
+        ]
+      },
+      {
+        // script to debug multi tenant db issues
+        "name": "Onyx DB Manager (Top Chunks)",
+        "type": "debugpy",
+        "request": "launch",
+        "program": "scripts/debugging/onyx_db.py",
+        "cwd": "${workspaceFolder}/backend",
+        "envFile": "${workspaceFolder}/.env",
+        "env": {
+          "PYTHONUNBUFFERED": "1",
+          "PYTHONPATH": "."
+        },
+        "args": [
+          "--password",
+          "your_password_here",
+          "--port",
+          "5433",
+          "--report",
+          "top-chunks",
+          "--filename",
+          "generated/tenants_by_num_docs.csv"
+        ]
+      },
      {
        "name": "Debug React Web App in Chrome",
        "type": "chrome",
--- a/.vscode/tasks.template.jsonc
+++ b/.vscode/tasks.template.jsonc
@@ -0,0 +1,101 @@
+{
+    "version": "2.0.0",
+    "tasks": [
+        {
+            "type": "austin",
+            "label": "Profile celery beat",
+            "envFile": "${workspaceFolder}/.env",
+            "options": {
+                "cwd": "${workspaceFolder}/backend"
+            },
+            "command": [
+                "sudo",
+                "-E"
+            ],
+            "args": [
+                "celery",
+                "-A",
+                "onyx.background.celery.versioned_apps.beat",
+                "beat",
+                "--loglevel=INFO"
+            ]
+        },
+        {
+            "type": "shell",
+            "label": "Generate Onyx OpenAPI Python client",
+            "cwd": "${workspaceFolder}/backend",
+            "envFile": "${workspaceFolder}/.env",
+            "options": {
+                "cwd": "${workspaceFolder}/backend"
+            },
+            "command": [
+                "openapi-generator"
+            ],
+            "args": [
+                "generate",
+                "-i",
+                "generated/openapi.json",
+                "-g",
+                "python",
+                "-o",
+                "generated/onyx_openapi_client",
+                "--package-name",
+                "onyx_openapi_client"
+            ]
+        },
+        {
+            "type": "shell",
+            "label": "Generate Typescript Fetch client (openapi-generator)",
+            "envFile": "${workspaceFolder}/.env",
+            "options": {
+                "cwd": "${workspaceFolder}"
+            },
+            "command": [
+                "openapi-generator"
+            ],
+            "args": [
+                "generate",
+                "-i",
+                "backend/generated/openapi.json",
+                "-g",
+                "typescript-fetch",
+                "-o",
+                "${workspaceFolder}/web/src/lib/generated/onyx_api",
+                "--additional-properties=disallowAdditionalPropertiesIfNotPresent=false,legacyDiscriminatorBehavior=false,supportsES6=true"
+            ]
+        },
+        {
+            "type": "shell",
+            "label": "Generate TypeScript Client (openapi-ts)",
+            "envFile": "${workspaceFolder}/.env",
+            "options": {
+                "cwd": "${workspaceFolder}/web"
+            },
+            "command": [
+                "npx"
+            ],
+            "args": [
+                "openapi-typescript",
+                "../backend/generated/openapi.json",
+                "--output",
+                "./src/lib/generated/onyx-schema.ts"
+            ]
+        },
+        {
+            "type": "shell",
+            "label": "Generate TypeScript Client (orval)",
+            "envFile": "${workspaceFolder}/.env",
+            "options": {
+                "cwd": "${workspaceFolder}/web"
+            },
+            "command": [
+                "npx"
+            ],
+            "args": [
+                "orval",
+                "--config",
+                "orval.config.js"
+            ]
+        }
+    ]
+}
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,4 +1,4 @@
-<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/CONTRIBUTING.md"} -->
+<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/CONTRIBUTING.md"} -->

 # Contributing to Onyx

@@ -12,8 +12,8 @@ As an open source project in a rapidly changing space, we welcome all contributi

 The [GitHub Issues](https://github.com/onyx-dot-app/onyx/issues) page is a great place to start for contribution ideas.

-To ensure that your contribution is aligned with the project's direction, please reach out to Hagen (or any other maintainer) on the Onyx team
-via [Slack](https://join.slack.com/t/onyx-dot-app/shared_invite/zt-2twesxdr6-5iQitKZQpgq~hYIZ~dv3KA) /
+To ensure that your contribution is aligned with the project's direction, please reach out to any maintainer on the Onyx team
+via [Slack](https://join.slack.com/t/onyx-dot-app/shared_invite/zt-34lu4m7xg-TsKGO6h8PDvR5W27zTdyhA) /
 [Discord](https://discord.gg/TDJ59cGV2X) or [email](mailto:founders@onyx.app).

 Issues that have been explicitly approved by the maintainers (aligned with the direction of the project)
@@ -28,7 +28,7 @@ Your input is vital to making sure that Onyx moves in the right direction.
 Before starting on implementation, please raise a GitHub issue.

 Also, always feel free to message the founders (Chris Weaver / Yuhong Sun) on
-[Slack](https://join.slack.com/t/onyx-dot-app/shared_invite/zt-2twesxdr6-5iQitKZQpgq~hYIZ~dv3KA) /
+[Slack](https://join.slack.com/t/onyx-dot-app/shared_invite/zt-34lu4m7xg-TsKGO6h8PDvR5W27zTdyhA) /
 [Discord](https://discord.gg/TDJ59cGV2X) directly about anything at all.

 ### Contributing Code
@@ -59,6 +59,7 @@ Onyx being a fully functional app, relies on some external software, specificall
 - [Postgres](https://www.postgresql.org/) (Relational DB)
 - [Vespa](https://vespa.ai/) (Vector DB/Search Engine)
 - [Redis](https://redis.io/) (Cache)
+- [MinIO](https://min.io/) (File Store)
 - [Nginx](https://nginx.org/) (Not needed for development flows generally)

 > **Note:**
@@ -171,10 +172,10 @@ Otherwise, you can follow the instructions below to run the application for deve

 You will need Docker installed to run these containers.

-First navigate to `onyx/deployment/docker_compose`, then start up Postgres/Vespa/Redis with:
+First navigate to `onyx/deployment/docker_compose`, then start up Postgres/Vespa/Redis/MinIO with:

 ```bash
-docker compose -f docker-compose.dev.yml -p onyx-stack up -d index relational_db cache
+docker compose -f docker-compose.dev.yml -p onyx-stack up -d index relational_db cache minio
 ```

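-(index refers to Vespa, relational_db refers to Postgres, and cache refers to Redis)
+(index refers to Vespa, relational_db refers to Postgres, cache refers to Redis, and minio refers to MinIO)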
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/README.md"} -->
+<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/README.md"} -->

 <a name="readme-top"></a>

@@ -13,7 +13,7 @@
 <a href="https://docs.onyx.app/" target="_blank">
    <img src="https://img.shields.io/badge/docs-view-blue" alt="Documentation">
 </a>
-<a href="https://join.slack.com/t/onyx-dot-app/shared_invite/zt-2twesxdr6-5iQitKZQpgq~hYIZ~dv3KA" target="_blank">
+<a href="https://join.slack.com/t/onyx-dot-app/shared_invite/zt-34lu4m7xg-TsKGO6h8PDvR5W27zTdyhA" target="_blank">
    <img src="https://img.shields.io/badge/slack-join-blue.svg?logo=slack" alt="Slack">
 </a>
 <a href="https://discord.gg/TDJ59cGV2X" target="_blank">
--- a/backend/.gitignore
+++ b/backend/.gitignore
@@ -9,4 +9,6 @@ api_keys.py
 vespa-app.zip
 dynamic_config_storage/
 celerybeat-schedule*
-onyx/connectors/salesforce/data/
+onyx/connectors/salesforce/data/
+.test.env
+/generated
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -77,6 +77,9 @@ RUN apt-get update && \
    rm -rf /var/lib/apt/lists/* && \
    rm -f /usr/local/lib/python3.11/site-packages/tornado/test/test.key

+# Install postgresql-client for easy manual tests
+# Install it here (in its own layer) to avoid it being cleaned up above; drop the apt lists afterwards to keep the layer small
+RUN apt-get update && apt-get install -y postgresql-client && rm -rf /var/lib/apt/lists/*

 # Pre-downloading models for setups with limited egress
 RUN python -c "from tokenizers import Tokenizer; \
@@ -85,7 +88,7 @@ Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"
 # Pre-downloading NLTK for setups with limited egress
 RUN python -c "import nltk; \
 nltk.download('stopwords', quiet=True); \
-nltk.download('punkt', quiet=True);"
+nltk.download('punkt_tab', quiet=True);"
 # nltk.download('wordnet', quiet=True); introduce this back if lemmatization is needed

 # Set up application files
--- a/backend/alembic/README.md
+++ b/backend/alembic/README.md
@@ -1,4 +1,4 @@
-<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/backend/alembic/README.md"} -->
+<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/backend/alembic/README.md"} -->

 # Alembic DB Migrations

@@ -20,3 +20,44 @@ To run all un-applied migrations:
 To undo migrations:
 `alembic downgrade -X`
 where X is the number of migrations you want to undo from the current state
+
+### Multi-tenant migrations
+
+For multi-tenant deployments, you can use additional options:
+
+**Upgrade all tenants:**
+```bash
+alembic -x upgrade_all_tenants=true upgrade head
+```
+
+**Upgrade specific schemas:**
+```bash
+# Single schema
+alembic -x schemas=tenant_12345678-1234-1234-1234-123456789012 upgrade head
+
+# Multiple schemas (comma-separated)
+alembic -x schemas=tenant_12345678-1234-1234-1234-123456789012,public,another_tenant upgrade head
+```
+
+**Upgrade tenants within an alphabetical range:**
+```bash
+# Upgrade tenants 100-200 when sorted alphabetically (positions 100 to 200)
+alembic -x upgrade_all_tenants=true -x tenant_range_start=100 -x tenant_range_end=200 upgrade head
+
+# Upgrade tenants starting from position 1000 alphabetically
+alembic -x upgrade_all_tenants=true -x tenant_range_start=1000 upgrade head
+
+# Upgrade first 500 tenants alphabetically
+alembic -x upgrade_all_tenants=true -x tenant_range_end=500 upgrade head
+```
+
+**Continue on error (for batch operations):**
+```bash
+alembic -x upgrade_all_tenants=true -x continue=true upgrade head
+```
+
+The tenant range filtering works by:
+1. Sorting tenant IDs alphabetically
+2. Using 1-based position numbers (1st, 2nd, 3rd tenant, etc.)
+3. Filtering to the specified range of positions
+4. Always including non-tenant schemas (like 'public'), regardless of the range
--- a/backend/alembic/env.py
+++ b/backend/alembic/env.py
@@ -1,12 +1,12 @@
 from typing import Any, Literal
-from onyx.db.engine import get_iam_auth_token
+from onyx.db.engine.iam_auth import get_iam_auth_token
 from onyx.configs.app_configs import USE_IAM_AUTH
 from onyx.configs.app_configs import POSTGRES_HOST
 from onyx.configs.app_configs import POSTGRES_PORT
 from onyx.configs.app_configs import POSTGRES_USER
 from onyx.configs.app_configs import AWS_REGION_NAME
-from onyx.db.engine import build_connection_string
-from onyx.db.engine import get_all_tenant_ids
+from onyx.db.engine.sql_engine import build_connection_string
+from onyx.db.engine.tenant_utils import get_all_tenant_ids
 from sqlalchemy import event
 from sqlalchemy import pool
 from sqlalchemy import text
@@ -21,9 +21,14 @@ from alembic import context
 from sqlalchemy.ext.asyncio import create_async_engine
 from sqlalchemy.sql.schema import SchemaItem
 from onyx.configs.constants import SSL_CERT_FILE
-from shared_configs.configs import MULTI_TENANT, POSTGRES_DEFAULT_SCHEMA
+from shared_configs.configs import (
+    MULTI_TENANT,
+    POSTGRES_DEFAULT_SCHEMA,
+    TENANT_ID_PREFIX,
+)
 from onyx.db.models import Base
 from celery.backends.database.session import ResultModelBase  # type: ignore
+from onyx.db.engine.sql_engine import SqlEngine

 # Make sure in alembic.ini [logger_root] level=INFO is set or most logging will be
 # hidden! (defaults to level=WARN)
@@ -68,15 +73,67 @@ def include_object(
    return True


-def get_schema_options() -> tuple[str, bool, bool, bool]:
+def filter_tenants_by_range(
+    tenant_ids: list[str], start_range: int | None = None, end_range: int | None = None
+) -> list[str]:
+    """
+    Filter tenant IDs by alphabetical position range.
+
+    Schemas without TENANT_ID_PREFIX (e.g. 'public') are never filtered out.
+
+    Args:
+        tenant_ids: List of tenant IDs to filter
+        start_range: Starting position in alphabetically sorted list (1-based, inclusive)
+        end_range: Ending position in alphabetically sorted list (1-based, inclusive)
+
+    Returns:
+        Filtered list of tenant IDs in their original order
+    """
+    if start_range is None and end_range is None:
+        return tenant_ids
+
+    # Sort tenant schemas alphabetically.
+    # NOTE: can cause missed schemas if a schema is created in between workers
+    # fetching of all tenant IDs. We accept this risk for now. Just re-running
+    # the migration will fix the issue.
+    sorted_tenant_schemas = sorted(
+        tid for tid in tenant_ids if tid.startswith(TENANT_ID_PREFIX)
+    )
+    num_tenants = len(sorted_tenant_schemas)
+
+    # Convert the 1-based inclusive positions into a 0-based half-open slice:
+    # position p lives at index p - 1, and an inclusive end position e is
+    # exactly the exclusive slice bound e. Previously start_range was used as
+    # the index directly, which silently skipped the tenant at that position.
+    start_idx = start_range - 1 if start_range is not None else 0
+    end_idx = end_range if end_range is not None else num_tenants
+
+    # Clamp to bounds; a negative value would otherwise count from the end
+    start_idx = max(0, start_idx)
+    end_idx = max(0, min(num_tenants, end_idx))
+
+    # Set for O(1) membership checks while rebuilding the original order
+    selected = set(sorted_tenant_schemas[start_idx:end_idx])
+
+    return [
+        tid
+        for tid in tenant_ids
+        if tid in selected or not tid.startswith(TENANT_ID_PREFIX)
+    ]
+
+
+def get_schema_options() -> (
+    tuple[bool, bool, bool, int | None, int | None, list[str] | None]
+):
    x_args_raw = context.get_x_argument()
    x_args = {}
    for arg in x_args_raw:
-        for pair in arg.split(","):
-            if "=" in pair:
-                key, value = pair.split("=", 1)
-                x_args[key.strip()] = value.strip()
-    schema_name = x_args.get("schema", POSTGRES_DEFAULT_SCHEMA)
+        if "=" in arg:
+            key, value = arg.split("=", 1)
+            x_args[key.strip()] = value.strip()
+        else:
+            raise ValueError(f"Invalid argument: {arg}")
+
    create_schema = x_args.get("create_schema", "true").lower() == "true"
    upgrade_all_tenants = x_args.get("upgrade_all_tenants", "false").lower() == "true"

@@ -84,17 +141,81 @@ def get_schema_options() -> tuple[str, bool, bool, bool]:
    # only applies to online migrations
    continue_on_error = x_args.get("continue", "false").lower() == "true"

-    if (
-        MULTI_TENANT
-        and schema_name == POSTGRES_DEFAULT_SCHEMA
-        and not upgrade_all_tenants
-    ):
+    # Tenant range filtering
+    tenant_range_start = None
+    tenant_range_end = None
+
+    if "tenant_range_start" in x_args:
+        try:
+            tenant_range_start = int(x_args["tenant_range_start"])
+        except ValueError:
+            raise ValueError(
+                f"Invalid tenant_range_start value: {x_args['tenant_range_start']}. Must be an integer."
+            )
+
+    if "tenant_range_end" in x_args:
+        try:
+            tenant_range_end = int(x_args["tenant_range_end"])
+        except ValueError:
+            raise ValueError(
+                f"Invalid tenant_range_end value: {x_args['tenant_range_end']}. Must be an integer."
+            )
+
+    # Validate range
+    if tenant_range_start is not None and tenant_range_end is not None:
+        if tenant_range_start > tenant_range_end:
+            raise ValueError(
+                f"tenant_range_start ({tenant_range_start}) cannot be greater than tenant_range_end ({tenant_range_end})"
+            )
+
+    # Specific schema names filtering (replaces both schema_name and the old tenant_ids approach)
+    schemas = None
+    if "schemas" in x_args:
+        schema_names_str = x_args["schemas"].strip()
+        if schema_names_str:
+            # Split by comma and strip whitespace
+            schemas = [
+                name.strip() for name in schema_names_str.split(",") if name.strip()
+            ]
+            if schemas:
+                logger.info(f"Specific schema names specified: {schemas}")
+
+    # Validate that only one method is used at a time
+    range_filtering = tenant_range_start is not None or tenant_range_end is not None
+    specific_filtering = schemas is not None and len(schemas) > 0
+
+    if range_filtering and specific_filtering:
        raise ValueError(
-            "Cannot run default migrations in public schema when multi-tenancy is enabled. "
-            "Please specify a tenant-specific schema."
+            "Cannot use both tenant range filtering (tenant_range_start/tenant_range_end) "
+            "and specific schema filtering (schemas) at the same time. "
+            "Please use only one filtering method."
        )

-    return schema_name, create_schema, upgrade_all_tenants, continue_on_error
+    if upgrade_all_tenants and specific_filtering:
+        raise ValueError(
+            "Cannot use both upgrade_all_tenants=true and schemas at the same time. "
+            "Use either upgrade_all_tenants=true for all tenants, or schemas for specific schemas."
+        )
+
+    # If any filtering parameters are specified, we're not doing the default single schema migration
+    if range_filtering:
+        upgrade_all_tenants = True
+
+    # Validate multi-tenant requirements
+    if MULTI_TENANT and not upgrade_all_tenants and not specific_filtering:
+        raise ValueError(
+            "In multi-tenant mode, you must specify either upgrade_all_tenants=true "
+            "or provide schemas. Cannot run default migration."
+        )
+
+    return (
+        create_schema,
+        upgrade_all_tenants,
+        continue_on_error,
+        tenant_range_start,
+        tenant_range_end,
+        schemas,
+    )


 def do_run_migrations(
@@ -141,12 +262,20 @@ def provide_iam_token_for_alembic(

 async def run_async_migrations() -> None:
    (
-        schema_name,
        create_schema,
        upgrade_all_tenants,
        continue_on_error,
+        tenant_range_start,
+        tenant_range_end,
+        schemas,
    ) = get_schema_options()

+    if not schemas and not MULTI_TENANT:
+        schemas = [POSTGRES_DEFAULT_SCHEMA]
+
+    # without init_engine, subsequent engine calls fail hard intentionally
+    SqlEngine.init_engine(pool_size=20, max_overflow=5)
+
    engine = create_async_engine(
        build_connection_string(),
        poolclass=pool.NullPool,
@@ -160,12 +289,50 @@ async def run_async_migrations() -> None:
        ) -> None:
            provide_iam_token_for_alembic(dialect, conn_rec, cargs, cparams)

-    if upgrade_all_tenants:
+    if schemas:
+        # Use specific schema names directly without fetching all tenants
+        logger.info(f"Migrating specific schema names: {schemas}")
+
+        i_schema = 0
+        num_schemas = len(schemas)
+        for schema in schemas:
+            i_schema += 1
+            logger.info(
+                f"Migrating schema: index={i_schema} num_schemas={num_schemas} schema={schema}"
+            )
+            try:
+                async with engine.connect() as connection:
+                    await connection.run_sync(
+                        do_run_migrations,
+                        schema_name=schema,
+                        create_schema=create_schema,
+                    )
+            except Exception as e:
+                logger.error(f"Error migrating schema {schema}: {e}")
+                if not continue_on_error:
+                    logger.error("--continue=true is not set, raising exception!")
+                    raise
+
+                logger.warning("--continue=true is set, continuing to next schema.")
+
+    elif upgrade_all_tenants:
        tenant_schemas = get_all_tenant_ids()

+        filtered_tenant_schemas = filter_tenants_by_range(
+            tenant_schemas, tenant_range_start, tenant_range_end
+        )
+
+        if tenant_range_start is not None or tenant_range_end is not None:
+            logger.info(
+                f"Filtering tenants by range: start={tenant_range_start}, end={tenant_range_end}"
+            )
+            logger.info(
+                f"Total tenants: {len(tenant_schemas)}, Filtered tenants: {len(filtered_tenant_schemas)}"
+            )
+
        i_tenant = 0
-        num_tenants = len(tenant_schemas)
-        for schema in tenant_schemas:
+        num_tenants = len(filtered_tenant_schemas)
+        for schema in filtered_tenant_schemas:
            i_tenant += 1
            logger.info(
                f"Migrating schema: index={i_tenant} num_tenants={num_tenants} schema={schema}"
@@ -180,36 +347,70 @@ async def run_async_migrations() -> None:
            except Exception as e:
                logger.error(f"Error migrating schema {schema}: {e}")
                if not continue_on_error:
-                    logger.error("--continue is not set, raising exception!")
+                    logger.error("--continue=true is not set, raising exception!")
                    raise

-                logger.warning("--continue is set, continuing to next schema.")
+                logger.warning("--continue=true is set, continuing to next schema.")

    else:
-        try:
-            logger.info(f"Migrating schema: {schema_name}")
-            async with engine.connect() as connection:
-                await connection.run_sync(
-                    do_run_migrations,
-                    schema_name=schema_name,
-                    create_schema=create_schema,
-                )
-        except Exception as e:
-            logger.error(f"Error migrating schema {schema_name}: {e}")
-            raise
+        # This should not happen in the new design since we require either
+        # upgrade_all_tenants=true or schemas in multi-tenant mode
+        # and for non-multi-tenant mode, we should use schemas with the default schema
+        raise ValueError(
+            "No migration target specified. Use either upgrade_all_tenants=true for all tenants "
+            "or schemas for specific schemas."
+        )

    await engine.dispose()


 def run_migrations_offline() -> None:
-    """This doesn't really get used when we migrate in the cloud."""
+    """
+    NOTE(rkuo): This generates a sql script that can be used to migrate the database ...
+    instead of migrating the db live via an open connection
+
+    Not clear on when this would be used by us or if it even works.
+
+    If it is offline, then why are there calls to the db engine?
+
+    This doesn't really get used when we migrate in the cloud."""

    logger.info("run_migrations_offline starting.")

-    schema_name, _, upgrade_all_tenants, continue_on_error = get_schema_options()
+    # without init_engine, subsequent engine calls fail hard intentionally
+    SqlEngine.init_engine(pool_size=20, max_overflow=5)
+
+    (
+        create_schema,
+        upgrade_all_tenants,
+        continue_on_error,
+        tenant_range_start,
+        tenant_range_end,
+        schemas,
+    ) = get_schema_options()
    url = build_connection_string()

-    if upgrade_all_tenants:
+    if schemas:
+        # Use specific schema names directly without fetching all tenants
+        logger.info(f"Migrating specific schema names: {schemas}")
+
+        for schema in schemas:
+            logger.info(f"Migrating schema: {schema}")
+            context.configure(
+                url=url,
+                target_metadata=target_metadata,  # type: ignore
+                literal_binds=True,
+                include_object=include_object,
+                version_table_schema=schema,
+                include_schemas=True,
+                script_location=config.get_main_option("script_location"),
+                dialect_opts={"paramstyle": "named"},
+            )
+
+            with context.begin_transaction():
+                context.run_migrations()
+
+    elif upgrade_all_tenants:
        engine = create_async_engine(url)

        if USE_IAM_AUTH:
@@ -223,7 +424,19 @@ def run_migrations_offline() -> None:
        tenant_schemas = get_all_tenant_ids()
        engine.sync_engine.dispose()

-        for schema in tenant_schemas:
+        filtered_tenant_schemas = filter_tenants_by_range(
+            tenant_schemas, tenant_range_start, tenant_range_end
+        )
+
+        if tenant_range_start is not None or tenant_range_end is not None:
+            logger.info(
+                f"Filtering tenants by range: start={tenant_range_start}, end={tenant_range_end}"
+            )
+            logger.info(
+                f"Total tenants: {len(tenant_schemas)}, Filtered tenants: {len(filtered_tenant_schemas)}"
+            )
+
+        for schema in filtered_tenant_schemas:
            logger.info(f"Migrating schema: {schema}")
            context.configure(
                url=url,
@@ -239,21 +452,12 @@ def run_migrations_offline() -> None:
            with context.begin_transaction():
                context.run_migrations()
    else:
-        logger.info(f"Migrating schema: {schema_name}")
-        context.configure(
-            url=url,
-            target_metadata=target_metadata,  # type: ignore
-            literal_binds=True,
-            include_object=include_object,
-            version_table_schema=schema_name,
-            include_schemas=True,
-            script_location=config.get_main_option("script_location"),
-            dialect_opts={"paramstyle": "named"},
+        # This should not happen in the new design
+        raise ValueError(
+            "No migration target specified. Use either upgrade_all_tenants=true for all tenants "
+            "or schemas for specific schemas."
        )

-        with context.begin_transaction():
-            context.run_migrations()
-

 def run_migrations_online() -> None:
    logger.info("run_migrations_online starting.")
--- a/backend/alembic/versions/03bf8be6b53a_rework_kg_config.py
+++ b/backend/alembic/versions/03bf8be6b53a_rework_kg_config.py
@@ -0,0 +1,121 @@
+"""rework-kg-config
+
+Revision ID: 03bf8be6b53a
+Revises: 65bc6e0f8500
+Create Date: 2025-06-16 10:52:34.815335
+
+"""
+
+import json
+
+
+from datetime import datetime
+from datetime import timedelta
+from sqlalchemy.dialects import postgresql
+from sqlalchemy import text
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "03bf8be6b53a"
+down_revision = "65bc6e0f8500"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Collapse the kg_config table into a single 'kg_config' key_value_store row."""
+    bind = op.get_bind()
+    current_configs = bind.execute(
+        text("SELECT kg_variable_name, kg_variable_values FROM kg_config")
+    ).all()
+    current_config_dict = {
+        config.kg_variable_name: (
+            config.kg_variable_values[0]
+            if config.kg_variable_name
+            not in ("KG_VENDOR_DOMAINS", "KG_IGNORE_EMAIL_DOMAINS")
+            else config.kg_variable_values
+        )
+        for config in current_configs
+        if config.kg_variable_values
+    }
+
+    # not using the KGConfigSettings model here in case it changes in the future
+    kg_config_settings = json.dumps(
+        {
+            "KG_EXPOSED": current_config_dict.get("KG_EXPOSED", False),
+            "KG_ENABLED": current_config_dict.get("KG_ENABLED", False),
+            "KG_VENDOR": current_config_dict.get("KG_VENDOR", None),
+            "KG_VENDOR_DOMAINS": current_config_dict.get("KG_VENDOR_DOMAINS", []),
+            "KG_IGNORE_EMAIL_DOMAINS": current_config_dict.get(
+                "KG_IGNORE_EMAIL_DOMAINS", []
+            ),
+            "KG_COVERAGE_START": current_config_dict.get(
+                "KG_COVERAGE_START",
+                (datetime.now() - timedelta(days=90)).strftime("%Y-%m-%d"),
+            ),
+            "KG_MAX_COVERAGE_DAYS": current_config_dict.get("KG_MAX_COVERAGE_DAYS", 90),
+            "KG_MAX_PARENT_RECURSION_DEPTH": current_config_dict.get(
+                "KG_MAX_PARENT_RECURSION_DEPTH", 2
+            ),
+            "KG_BETA_PERSONA_ID": current_config_dict.get("KG_BETA_PERSONA_ID", None),
+        }
+    )
+    # Bind the JSON as a parameter instead of formatting it into the statement:
+    # a single quote inside any value would otherwise break (or alter) the SQL.
+    bind.execute(
+        text("INSERT INTO key_value_store (key, value) VALUES ('kg_config', CAST(:value AS JSONB))"),
+        {"value": kg_config_settings},
+    )
+    op.drop_table("kg_config")
+
+
+def downgrade() -> None:
+    # defaults; overridden below by the stored 'kg_config' entry when one exists
+    current_config_dict = {
+        "KG_EXPOSED": False,
+        "KG_ENABLED": False,
+        "KG_VENDOR": [],
+        "KG_VENDOR_DOMAINS": [],
+        "KG_IGNORE_EMAIL_DOMAINS": [],
+        "KG_COVERAGE_START": (datetime.now() - timedelta(days=90)).strftime("%Y-%m-%d"),
+        "KG_MAX_COVERAGE_DAYS": 90,
+        "KG_MAX_PARENT_RECURSION_DEPTH": 2,
+    }
+    current_configs = (
+        op.get_bind()
+        .execute(text("SELECT value FROM key_value_store WHERE key = 'kg_config'"))
+        .one_or_none()
+    )
+    if current_configs is not None:
+        current_config_dict.update(current_configs[0])
+    insert_values = [
+        {
+            "kg_variable_name": name,
+            "kg_variable_values": (
+                [str(val).lower() if isinstance(val, bool) else str(val)]
+                if not isinstance(val, list)
+                else val
+            ),
+        }
+        for name, val in current_config_dict.items()
+    ]
+
+    op.create_table(
+        "kg_config",
+        sa.Column("id", sa.Integer(), primary_key=True, nullable=False, index=True),
+        sa.Column("kg_variable_name", sa.String(), nullable=False, index=True),
+        sa.Column("kg_variable_values", postgresql.ARRAY(sa.String()), nullable=False),
+        sa.UniqueConstraint("kg_variable_name", name="uq_kg_config_variable_name"),
+    )
+    op.bulk_insert(
+        sa.table(
+            "kg_config",
+            sa.column("kg_variable_name", sa.String),
+            sa.column("kg_variable_values", postgresql.ARRAY(sa.String)),
+        ),
+        insert_values,
+    )
+
+    op.execute("DELETE FROM key_value_store WHERE key = 'kg_config'")
--- a/backend/alembic/versions/0816326d83aa_add_federated_connector_tables.py
+++ b/backend/alembic/versions/0816326d83aa_add_federated_connector_tables.py
@@ -0,0 +1,72 @@
+"""add federated connector tables
+
+Revision ID: 0816326d83aa
+Revises: 12635f6655b7
+Create Date: 2025-06-29 14:09:45.109518
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+
+# revision identifiers, used by Alembic.
+revision = "0816326d83aa"
+down_revision = "12635f6655b7"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # federated_connector: source name plus credentials kept as raw bytes
+    op.create_table(
+        "federated_connector",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("source", sa.String(), nullable=False),
+        sa.Column("credentials", sa.LargeBinary(), nullable=False),
+        sa.PrimaryKeyConstraint("id"),
+    )
+
+    # federated_connector_oauth_token: per-user token, deleted with its connector or user
+    op.create_table(
+        "federated_connector_oauth_token",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("federated_connector_id", sa.Integer(), nullable=False),
+        sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=False),
+        sa.Column("token", sa.LargeBinary(), nullable=False),
+        sa.Column("expires_at", sa.DateTime(), nullable=True),
+        sa.ForeignKeyConstraint(
+            ["federated_connector_id"], ["federated_connector.id"], ondelete="CASCADE"
+        ),
+        sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
+        sa.PrimaryKeyConstraint("id"),
+    )
+
+    # federated_connector__document_set: at most one row per (connector, document set)
+    op.create_table(
+        "federated_connector__document_set",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("federated_connector_id", sa.Integer(), nullable=False),
+        sa.Column("document_set_id", sa.Integer(), nullable=False),
+        sa.Column("entities", postgresql.JSONB(), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["federated_connector_id"], ["federated_connector.id"], ondelete="CASCADE"
+        ),
+        sa.ForeignKeyConstraint(
+            ["document_set_id"], ["document_set.id"], ondelete="CASCADE"
+        ),
+        sa.PrimaryKeyConstraint("id"),
+        sa.UniqueConstraint(
+            "federated_connector_id",
+            "document_set_id",
+            name="uq_federated_connector_document_set",
+        ),
+    )
+
+
+def downgrade() -> None:
+    # Drop the two tables holding FKs to federated_connector before the table itself
+    op.drop_table("federated_connector__document_set")
+    op.drop_table("federated_connector_oauth_token")
+    op.drop_table("federated_connector")
--- a/backend/alembic/versions/12635f6655b7_drive_canonical_ids.py
+++ b/backend/alembic/versions/12635f6655b7_drive_canonical_ids.py
@@ -0,0 +1,596 @@
+"""drive-canonical-ids
+
+Revision ID: 12635f6655b7
+Revises: 58c50ef19f08
+Create Date: 2025-06-20 14:44:54.241159
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from urllib.parse import urlparse, urlunparse
+from httpx import HTTPStatusError
+import httpx
+from onyx.document_index.factory import get_default_document_index
+from onyx.db.search_settings import SearchSettings
+from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
+from onyx.document_index.vespa.shared_utils.utils import (
+    replace_invalid_doc_id_characters,
+)
+from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
+from onyx.utils.logger import setup_logger
+import os
+
+logger = setup_logger()
+
+# revision identifiers, used by Alembic.
+revision = "12635f6655b7"
+down_revision = "58c50ef19f08"
+branch_labels = None
+depends_on = None
+
+SKIP_CANON_DRIVE_IDS = os.environ.get("SKIP_CANON_DRIVE_IDS", "true").lower() == "true"
+
+
+def active_search_settings() -> tuple[SearchSettings, SearchSettings | None]:
+    """Load the newest PRESENT and the newest FUTURE row of search_settings.
+
+    Raises RuntimeError when no PRESENT row exists; the FUTURE entry may be None.
+    """
+    bind = op.get_bind()
+    present_rows = bind.execute(
+        sa.text(
+            """
+        SELECT * FROM search_settings WHERE status = 'PRESENT' ORDER BY id DESC LIMIT 1
+        """
+        )
+    ).fetchall()
+    search_settings = (
+        SearchSettings(**present_rows[0]._asdict()) if present_rows else None
+    )
+
+    future_rows = bind.execute(
+        sa.text(
+            """
+        SELECT * FROM search_settings WHERE status = 'FUTURE' ORDER BY id DESC LIMIT 1
+        """
+        )
+    ).fetchall()
+    search_settings_future = (
+        SearchSettings(**future_rows[0]._asdict()) if future_rows else None
+    )
+
+    # a PRESENT row is mandatory, a FUTURE row is optional
+    if not isinstance(search_settings, SearchSettings):
+        raise RuntimeError(
+            "current search settings is of type " + str(type(search_settings))
+        )
+    if not (
+        search_settings_future is None
+        or isinstance(search_settings_future, SearchSettings)
+    ):
+        raise RuntimeError(
+            "future search settings is of type " + str(type(search_settings_future))
+        )
+
+    return search_settings, search_settings_future
+
+
+def normalize_google_drive_url(url: str) -> str:
+    """Canonicalize a Google Drive URL so it can serve as a document ID.
+
+    The query string and a trailing edit/view/preview path segment are dropped.
+    NOTE: copied from drive doc_conversion.py
+    """
+    stripped = urlparse(url)._replace(query="")
+    segments = stripped.path.split("/")
+    # str.split never returns an empty list, so indexing the last item is safe
+    if segments[-1] in ("edit", "view", "preview"):
+        stripped = stripped._replace(path="/".join(segments[:-1]))
+    return urlunparse(stripped)
+
+
+def get_google_drive_documents_from_database() -> list[dict]:
+    """Collect the IDs of all documents indexed through a Google Drive connector.
+
+    Returns:
+        One {"document_id": <document.id>} dict per row of the join below; the
+        query has no DISTINCT, so a document linked to several
+        connector/credential pairs may be listed more than once.
+    """
+    google_drive_docs = sa.text(
+        """
+            SELECT d.id
+            FROM document d
+            JOIN document_by_connector_credential_pair dcc ON d.id = dcc.id
+            JOIN connector_credential_pair cc ON dcc.connector_id = cc.connector_id
+                AND dcc.credential_id = cc.credential_id
+            JOIN connector c ON cc.connector_id = c.id
+            WHERE c.source = 'GOOGLE_DRIVE'
+        """
+    )
+    rows = op.get_bind().execute(google_drive_docs)
+
+    return [{"document_id": row.id} for row in rows]
+
+
+def update_document_id_in_database(
+    old_doc_id: str, new_doc_id: str, index_name: str
+) -> None:
+    """Update document IDs in all relevant database tables using copy-and-swap approach.
+
+    If a document with new_doc_id already exists, the old document is removed via
+    delete_document_from_db and nothing is copied. Otherwise the document row is
+    copied under the new ID, the referencing tables are re-pointed and the old
+    row is deleted.
+
+    Every statement that is allowed to fail runs inside a SAVEPOINT
+    (bind.begin_nested()): PostgreSQL aborts the enclosing transaction on the
+    first failed statement, so without one the fallback INSERT and everything
+    after a failed optional UPDATE would fail as well.
+
+    Args:
+        old_doc_id: ID the document is currently stored under
+        new_doc_id: ID the document should be stored under afterwards
+        index_name: only forwarded to delete_document_from_db
+    """
+    bind = op.get_bind()
+    ids = {"new_id": new_doc_id, "old_id": old_doc_id}
+
+    # Check if new document ID already exists
+    result = bind.execute(
+        sa.text("SELECT COUNT(*) FROM document WHERE id = :new_id"),
+        {"new_id": new_doc_id},
+    )
+    row = result.fetchone()
+    if row and row[0] > 0:
+        # The target row is already there, so only the old document has to go
+        delete_document_from_db(old_doc_id, index_name)
+        return
+
+    # Step 1: Create a new document row with the new ID (copy all fields from old row)
+    # Use a conservative approach to handle columns that might not exist in all installations
+    try:
+        with bind.begin_nested():
+            bind.execute(
+                sa.text(
+                    """
+                    INSERT INTO document (id, from_ingestion_api, boost, hidden, semantic_id,
+                                        link, doc_updated_at, primary_owners, secondary_owners,
+                                        external_user_emails, external_user_group_ids, is_public,
+                                        chunk_count, last_modified, last_synced, kg_stage, kg_processing_time)
+                    SELECT :new_id, from_ingestion_api, boost, hidden, semantic_id,
+                           link, doc_updated_at, primary_owners, secondary_owners,
+                           external_user_emails, external_user_group_ids, is_public,
+                           chunk_count, last_modified, last_synced, kg_stage, kg_processing_time
+                    FROM document
+                    WHERE id = :old_id
+                """
+                ),
+                ids,
+            )
+    except Exception as e:
+        # If the full INSERT fails, try a more basic version with only core columns
+        logger.warning(f"Full INSERT failed, trying basic version: {e}")
+        bind.execute(
+            sa.text(
+                """
+                INSERT INTO document (id, from_ingestion_api, boost, hidden, semantic_id,
+                                    link, doc_updated_at, primary_owners, secondary_owners)
+                SELECT :new_id, from_ingestion_api, boost, hidden, semantic_id,
+                       link, doc_updated_at, primary_owners, secondary_owners
+                FROM document
+                WHERE id = :old_id
+            """
+            ),
+            ids,
+        )
+
+    # Step 2: Update all foreign key references to point to the new ID
+
+    # Update document_by_connector_credential_pair table
+    bind.execute(
+        sa.text(
+            "UPDATE document_by_connector_credential_pair SET id = :new_id WHERE id = :old_id"
+        ),
+        ids,
+    )
+
+    # Update search_doc table (stores search results for chat replay)
+    # This is critical for agent functionality
+    bind.execute(
+        sa.text(
+            "UPDATE search_doc SET document_id = :new_id WHERE document_id = :old_id"
+        ),
+        ids,
+    )
+
+    # Update document_retrieval_feedback table (user feedback on documents)
+    bind.execute(
+        sa.text(
+            "UPDATE document_retrieval_feedback SET document_id = :new_id WHERE document_id = :old_id"
+        ),
+        ids,
+    )
+
+    # Update document__tag table (document-tag relationships)
+    bind.execute(
+        sa.text(
+            "UPDATE document__tag SET document_id = :new_id WHERE document_id = :old_id"
+        ),
+        ids,
+    )
+
+    # Update user_file table (user uploaded files linked to documents)
+    bind.execute(
+        sa.text(
+            "UPDATE user_file SET document_id = :new_id WHERE document_id = :old_id"
+        ),
+        ids,
+    )
+
+    # Update KG and chunk_stats tables (these may not exist in all installations).
+    # Each statement gets its own savepoint so that one missing table neither
+    # skips the remaining updates nor aborts the surrounding transaction.
+    optional_updates: list[tuple[str, dict[str, str]]] = [
+        # Update kg_entity table
+        (
+            "UPDATE kg_entity SET document_id = :new_id WHERE document_id = :old_id",
+            ids,
+        ),
+        # Update kg_entity_extraction_staging table
+        (
+            "UPDATE kg_entity_extraction_staging SET document_id = :new_id WHERE document_id = :old_id",
+            ids,
+        ),
+        # Update kg_relationship table
+        (
+            "UPDATE kg_relationship SET source_document = :new_id WHERE source_document = :old_id",
+            ids,
+        ),
+        # Update kg_relationship_extraction_staging table
+        (
+            "UPDATE kg_relationship_extraction_staging SET source_document = :new_id WHERE source_document = :old_id",
+            ids,
+        ),
+        # Update chunk_stats table
+        (
+            "UPDATE chunk_stats SET document_id = :new_id WHERE document_id = :old_id",
+            ids,
+        ),
+        # Update chunk_stats ID field which includes document_id
+        (
+            """
+                UPDATE chunk_stats
+                SET id = REPLACE(id, :old_id, :new_id)
+                WHERE id LIKE :old_id_pattern
+            """,
+            {**ids, "old_id_pattern": f"{old_doc_id}__%"},
+        ),
+    ]
+    for statement, params in optional_updates:
+        try:
+            with bind.begin_nested():
+                bind.execute(sa.text(statement), params)
+        except Exception as e:
+            logger.warning(
+                f"Some KG/chunk tables may not exist or failed to update: {e}"
+            )
+
+    # Step 3: Delete the old document row (this should now be safe since all FKs point to new row)
+    bind.execute(
+        sa.text("DELETE FROM document WHERE id = :old_id"), {"old_id": old_doc_id}
+    )
+
+
+def _visit_chunks(
+    *,
+    http_client: httpx.Client,
+    index_name: str,
+    selection: str,
+    continuation: str | None = None,
+) -> tuple[list[dict], str | None]:
+    """Fetch one page from the /document/v1 visit API.
+
+    Returns the page's documents and the continuation token found in the
+    response (None when the response carries no token).
+    """
+    query: dict[str, str] = {"selection": selection, "wantedDocumentCount": "1000"}
+    if continuation:
+        query["continuation"] = continuation
+
+    # Visiting uses the same URL as the document API, only with visit-specific
+    # query parameters. timeout=None is passed through to the client unchanged.
+    response = http_client.get(
+        DOCUMENT_ID_ENDPOINT.format(index_name=index_name),
+        params=query,
+        timeout=None,
+    )
+    response.raise_for_status()
+    body = response.json()
+    return body.get("documents", []), body.get("continuation")
+
+
+def delete_document_chunks_from_vespa(index_name: str, doc_id: str) -> None:
+    """Delete all chunks for *doc_id* from Vespa using continuation-token paging (no offset).
+
+    Best-effort: a chunk that fails to delete is reported and skipped. The number
+    of chunks actually deleted is logged once the visit has finished.
+    """
+    total_deleted = 0
+    # Use exact match instead of contains - Document Selector Language doesn't support contains
+    selection = f'{index_name}.document_id=="{doc_id}"'
+    chunk_url_prefix = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)
+
+    with get_vespa_http_client() as http_client:
+        continuation: str | None = None
+        while True:
+            docs, continuation = _visit_chunks(
+                http_client=http_client,
+                index_name=index_name,
+                selection=selection,
+                continuation=continuation,
+            )
+            for doc in docs:
+                vespa_full_id = doc.get("id")
+                if not vespa_full_id:
+                    continue
+                # The chunk UUID is the last "::"-separated part of the full Vespa id.
+                vespa_doc_uuid = vespa_full_id.split("::")[-1]
+                try:
+                    resp = http_client.delete(f"{chunk_url_prefix}/{vespa_doc_uuid}")
+                    resp.raise_for_status()
+                    total_deleted += 1
+                except Exception as e:
+                    print(f"Failed to delete chunk {vespa_doc_uuid}: {e}")
+            # An empty page or a missing continuation token ends the visit.
+            if not docs or not continuation:
+                break
+
+    logger.info(f"Deleted {total_deleted} Vespa chunks for document {doc_id}")
+
+
+def update_document_id_in_vespa(
+    index_name: str, old_doc_id: str, new_doc_id: str
+) -> None:
+    """Point every Vespa chunk of *old_doc_id* at *new_doc_id*, page by page.
+
+    The new ID is passed through replace_invalid_doc_id_characters before it is
+    assigned. A chunk that fails to update is reported and the error re-raised,
+    which stops the remaining updates.
+    """
+    sanitized_id = replace_invalid_doc_id_characters(new_doc_id)
+    # Every chunk receives the same update request body.
+    payload = {"fields": {"document_id": {"assign": sanitized_id}}}
+
+    # Use exact match instead of contains - Document Selector Language doesn't support contains
+    selection = f'{index_name}.document_id=="{old_doc_id}"'
+    chunk_url_prefix = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)
+
+    with get_vespa_http_client() as http_client:
+        next_token: str | None = None
+        while True:
+            page, next_token = _visit_chunks(
+                http_client=http_client,
+                index_name=index_name,
+                selection=selection,
+                continuation=next_token,
+            )
+
+            for chunk in page:
+                full_id = chunk.get("id")
+                if not full_id:
+                    continue
+
+                chunk_uuid = full_id.split("::")[-1]
+                try:
+                    resp = http_client.put(
+                        f"{chunk_url_prefix}/{chunk_uuid}", json=payload
+                    )
+                    resp.raise_for_status()
+                except Exception as e:
+                    print(f"Failed to update chunk {chunk_uuid}: {e}")
+                    raise
+
+            # An empty page or a missing continuation token ends the visit.
+            if not page or not next_token:
+                break
+
+
+def delete_document_from_db(current_doc_id: str, index_name: str) -> None:
+    """Delete a duplicate document, the rows referencing it, and its Vespa chunks.
+
+    Best-effort: failures are reported and swallowed. The deletes run inside
+    SAVEPOINTs because a failed statement would otherwise leave the PostgreSQL
+    transaction aborted and break every later statement of the migration.
+    """
+    try:
+        bind = op.get_bind()
+        # Delete all foreign key references first, then delete the document
+        with bind.begin_nested():
+            # Delete from agent-related tables first (order matters due to foreign keys)
+            # Delete from agent__sub_query__search_doc first since it references search_doc
+            bind.execute(
+                sa.text(
+                    """
+                    DELETE FROM agent__sub_query__search_doc
+                    WHERE search_doc_id IN (
+                        SELECT id FROM search_doc WHERE document_id = :doc_id
+                    )
+                    """
+                ),
+                {"doc_id": current_doc_id},
+            )
+
+            # Delete from chat_message__search_doc
+            bind.execute(
+                sa.text(
+                    """
+                    DELETE FROM chat_message__search_doc
+                    WHERE search_doc_id IN (
+                        SELECT id FROM search_doc WHERE document_id = :doc_id
+                    )
+                    """
+                ),
+                {"doc_id": current_doc_id},
+            )
+
+            # Now we can safely delete from search_doc
+            bind.execute(
+                sa.text("DELETE FROM search_doc WHERE document_id = :doc_id"),
+                {"doc_id": current_doc_id},
+            )
+
+            # Delete from document_by_connector_credential_pair
+            bind.execute(
+                sa.text(
+                    "DELETE FROM document_by_connector_credential_pair WHERE id = :doc_id"
+                ),
+                {"doc_id": current_doc_id},
+            )
+
+            # Delete from other tables that reference this document
+            bind.execute(
+                sa.text(
+                    "DELETE FROM document_retrieval_feedback WHERE document_id = :doc_id"
+                ),
+                {"doc_id": current_doc_id},
+            )
+            bind.execute(
+                sa.text("DELETE FROM document__tag WHERE document_id = :doc_id"),
+                {"doc_id": current_doc_id},
+            )
+            bind.execute(
+                sa.text("DELETE FROM user_file WHERE document_id = :doc_id"),
+                {"doc_id": current_doc_id},
+            )
+
+            # Delete from KG tables if they exist; own savepoint keeps the deletes above
+            try:
+                with bind.begin_nested():
+                    bind.execute(
+                        sa.text("DELETE FROM kg_entity WHERE document_id = :doc_id"),
+                        {"doc_id": current_doc_id},
+                    )
+                    bind.execute(
+                        sa.text(
+                            "DELETE FROM kg_entity_extraction_staging WHERE document_id = :doc_id"
+                        ),
+                        {"doc_id": current_doc_id},
+                    )
+                    bind.execute(
+                        sa.text(
+                            "DELETE FROM kg_relationship WHERE source_document = :doc_id"
+                        ),
+                        {"doc_id": current_doc_id},
+                    )
+                    bind.execute(
+                        sa.text(
+                            "DELETE FROM kg_relationship_extraction_staging WHERE source_document = :doc_id"
+                        ),
+                        {"doc_id": current_doc_id},
+                    )
+                    bind.execute(
+                        sa.text("DELETE FROM chunk_stats WHERE document_id = :doc_id"),
+                        {"doc_id": current_doc_id},
+                    )
+                    bind.execute(
+                        sa.text(
+                            "DELETE FROM chunk_stats WHERE id LIKE :doc_id_pattern"
+                        ),
+                        {"doc_id_pattern": f"{current_doc_id}__%"},
+                    )
+            except Exception as e:
+                logger.warning(
+                    f"Some KG/chunk tables may not exist or failed to delete from: {e}"
+                )
+            # Finally delete the document itself
+            bind.execute(
+                sa.text("DELETE FROM document WHERE id = :doc_id"),
+                {"doc_id": current_doc_id},
+            )
+        # Delete chunks from vespa
+        delete_document_chunks_from_vespa(index_name, current_doc_id)
+    except Exception as e:
+        print(f"Failed to delete duplicate document {current_doc_id}: {e}")
+        # Continue with other documents instead of failing the entire migration
+
+
+def upgrade() -> None:
+    """Rewrite Google Drive document IDs to their normalized form in the database and Vespa.
+
+    A document whose normalized ID was already seen in this run is deleted as a
+    duplicate. A failure on one document is printed and the loop moves on.
+    """
+    if SKIP_CANON_DRIVE_IDS:
+        return
+    current_search_settings, future_search_settings = active_search_settings()
+    document_index = get_default_document_index(
+        current_search_settings,
+        future_search_settings,
+    )
+
+    # Get the index name
+    if hasattr(document_index, "index_name"):
+        index_name = document_index.index_name
+    else:
+        # Default index name if we can't get it from the document_index
+        index_name = "danswer_index"
+
+    # Get all Google Drive documents from the database (this is faster and more reliable)
+    gdrive_documents = get_google_drive_documents_from_database()
+
+    if not gdrive_documents:
+        return
+
+    # Track normalized document IDs to detect duplicates
+    all_normalized_doc_ids = set()
+    updated_count = 0
+
+    for doc_info in gdrive_documents:
+        current_doc_id = doc_info["document_id"]
+        normalized_doc_id = normalize_google_drive_url(current_doc_id)
+        print(f"Processing document {current_doc_id} -> {normalized_doc_id}")
+        # Check for duplicates: a later document that normalizes to an ID already seen is deleted
+        if normalized_doc_id in all_normalized_doc_ids:
+            delete_document_from_db(current_doc_id, index_name)
+            continue
+
+        all_normalized_doc_ids.add(normalized_doc_id)
+
+        # If the document ID already doesn't have query parameters, skip it
+        if current_doc_id == normalized_doc_id:
+            continue
+
+        try:
+            # Update both database and Vespa in order
+            # Database first to ensure consistency
+            update_document_id_in_database(
+                current_doc_id, normalized_doc_id, index_name
+            )
+
+            # Vespa chunks are selected by exact match on the original (pre-normalization) document ID
+            update_document_id_in_vespa(index_name, current_doc_id, normalized_doc_id)
+            updated_count += 1
+        except Exception as e:
+            print(f"Failed to update document {current_doc_id}: {e}")
+            if isinstance(e, HTTPStatusError):
+                print(f"HTTPStatusError: {e}")
+                print(f"Response: {e.response.text}")
+                print(f"Status: {e.response.status_code}")
+                print(f"Headers: {e.response.headers}")
+                print(f"Request: {e.request.url}")
+                print(f"Request headers: {e.request.headers}")
+            # Note: Rollback is complex with copy-and-swap approach since the old document is already deleted
+            # In case of failure, manual intervention may be required
+            # Continue with other documents instead of failing the entire migration
+            continue
+
+    logger.info(f"Migration complete. Updated {updated_count} Google Drive documents")
+
+
+def downgrade() -> None:
+    """Intentionally a no-op: this is a one-way migration."""
+    # It wouldn't make sense to store the extra query parameters
+    # and duplicate documents to allow a reversal.
+    pass
--- a/backend/alembic/versions/238b84885828_add_foreign_key_to_user__external_user_.py
+++ b/backend/alembic/versions/238b84885828_add_foreign_key_to_user__external_user_.py
@@ -0,0 +1,45 @@
+"""Add foreign key to user__external_user_group_id
+
+Revision ID: 238b84885828
+Revises: a7688ab35c45
+Create Date: 2025-05-19 17:15:33.424584
+
+"""
+
+from alembic import op
+
+
+# revision identifiers, used by Alembic.
+revision = "238b84885828"
+down_revision = "a7688ab35c45"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Link user__external_user_group_id.cc_pair_id to connector_credential_pair.id."""
+    # First, clean up any entries whose cc_pair_id matches no connector_credential_pair row
+    op.execute(
+        """
+        DELETE FROM user__external_user_group_id
+        WHERE cc_pair_id NOT IN (SELECT id FROM connector_credential_pair)
+        """
+    )
+    # Add foreign key constraint with cascade delete
+    op.create_foreign_key(
+        "fk_user__external_user_group_id_cc_pair_id",
+        "user__external_user_group_id",
+        "connector_credential_pair",
+        ["cc_pair_id"],
+        ["id"],
+        ondelete="CASCADE",
+    )
+
+
+def downgrade() -> None:
+    """Drop the foreign key constraint created by upgrade()."""
+    op.drop_constraint(
+        "fk_user__external_user_group_id_cc_pair_id",
+        "user__external_user_group_id",
+        type_="foreignkey",
+    )
--- a/backend/alembic/versions/27c6ecc08586_permission_framework.py
+++ b/backend/alembic/versions/27c6ecc08586_permission_framework.py
@@ -144,27 +144,34 @@ def upgrade() -> None:

 def downgrade() -> None:
    op.execute("TRUNCATE TABLE index_attempt")
-    op.add_column(
-        "index_attempt",
-        sa.Column("input_type", sa.VARCHAR(), autoincrement=False, nullable=False),
-    )
-    op.add_column(
-        "index_attempt",
-        sa.Column("source", sa.VARCHAR(), autoincrement=False, nullable=False),
-    )
-    op.add_column(
-        "index_attempt",
-        sa.Column(
-            "connector_specific_config",
-            postgresql.JSONB(astext_type=sa.Text()),
-            autoincrement=False,
-            nullable=False,
-        ),
-    )
-
-    # Check if the constraint exists before dropping
    conn = op.get_bind()
    inspector = sa.inspect(conn)
+    existing_columns = {col["name"] for col in inspector.get_columns("index_attempt")}
+
+    if "input_type" not in existing_columns:
+        op.add_column(
+            "index_attempt",
+            sa.Column("input_type", sa.VARCHAR(), autoincrement=False, nullable=False),
+        )
+
+    if "source" not in existing_columns:
+        op.add_column(
+            "index_attempt",
+            sa.Column("source", sa.VARCHAR(), autoincrement=False, nullable=False),
+        )
+
+    if "connector_specific_config" not in existing_columns:
+        op.add_column(
+            "index_attempt",
+            sa.Column(
+                "connector_specific_config",
+                postgresql.JSONB(astext_type=sa.Text()),
+                autoincrement=False,
+                nullable=False,
+            ),
+        )
+
+    # Check if the constraint exists before dropping
    constraints = inspector.get_foreign_keys("index_attempt")

    if any(
@@ -183,8 +190,12 @@ def downgrade() -> None:
            "fk_index_attempt_connector_id", "index_attempt", type_="foreignkey"
        )

-    op.drop_column("index_attempt", "credential_id")
-    op.drop_column("index_attempt", "connector_id")
-    op.drop_table("connector_credential_pair")
-    op.drop_table("credential")
-    op.drop_table("connector")
+    if "credential_id" in existing_columns:
+        op.drop_column("index_attempt", "credential_id")
+
+    if "connector_id" in existing_columns:
+        op.drop_column("index_attempt", "connector_id")
+
+    op.execute("DROP TABLE IF EXISTS connector_credential_pair CASCADE")
+    op.execute("DROP TABLE IF EXISTS credential CASCADE")
+    op.execute("DROP TABLE IF EXISTS connector CASCADE")
--- a/backend/alembic/versions/2f95e36923e6_add_indexing_coordination.py
+++ b/backend/alembic/versions/2f95e36923e6_add_indexing_coordination.py
@@ -0,0 +1,115 @@
+"""add_indexing_coordination
+
+Revision ID: 2f95e36923e6
+Revises: 0816326d83aa
+Create Date: 2025-07-10 16:17:57.762182
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "2f95e36923e6"
+down_revision = "0816326d83aa"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Add coordination, batch, progress and heartbeat columns (plus an index) to index_attempt."""
+    # Add database-based coordination fields (replacing Redis fencing)
+    op.add_column(
+        "index_attempt", sa.Column("celery_task_id", sa.String(), nullable=True)
+    )
+    op.add_column(
+        "index_attempt",
+        sa.Column(
+            "cancellation_requested",
+            sa.Boolean(),
+            nullable=False,
+            server_default="false",
+        ),
+    )
+
+    # Add batch coordination fields (replacing FileStore state)
+    op.add_column(
+        "index_attempt", sa.Column("total_batches", sa.Integer(), nullable=True)
+    )
+    op.add_column(
+        "index_attempt",
+        sa.Column(
+            "completed_batches", sa.Integer(), nullable=False, server_default="0"
+        ),
+    )
+    op.add_column(
+        "index_attempt",
+        sa.Column(
+            "total_failures_batch_level",
+            sa.Integer(),
+            nullable=False,
+            server_default="0",
+        ),
+    )
+    op.add_column(
+        "index_attempt",
+        sa.Column("total_chunks", sa.Integer(), nullable=False, server_default="0"),
+    )
+
+    # Progress tracking for stall detection
+    op.add_column(
+        "index_attempt",
+        sa.Column("last_progress_time", sa.DateTime(timezone=True), nullable=True),
+    )
+    op.add_column(
+        "index_attempt",
+        sa.Column(
+            "last_batches_completed_count",
+            sa.Integer(),
+            nullable=False,
+            server_default="0",
+        ),
+    )
+
+    # Heartbeat tracking for worker liveness detection
+    op.add_column(
+        "index_attempt",
+        sa.Column(
+            "heartbeat_counter", sa.Integer(), nullable=False, server_default="0"
+        ),
+    )
+    op.add_column(
+        "index_attempt",
+        sa.Column(
+            "last_heartbeat_value", sa.Integer(), nullable=False, server_default="0"
+        ),
+    )
+    op.add_column(
+        "index_attempt",
+        sa.Column("last_heartbeat_time", sa.DateTime(timezone=True), nullable=True),
+    )
+    # Add composite index (cc_pair, search_settings, status) for coordination queries
+    op.create_index(
+        "ix_index_attempt_active_coordination",
+        "index_attempt",
+        ["connector_credential_pair_id", "search_settings_id", "status"],
+    )
+
+
+def downgrade() -> None:
+    """Drop the coordination index and the eleven index_attempt columns added by upgrade()."""
+    op.drop_index("ix_index_attempt_active_coordination", table_name="index_attempt")
+
+    # Remove the new columns
+    op.drop_column("index_attempt", "last_batches_completed_count")
+    op.drop_column("index_attempt", "last_progress_time")
+    op.drop_column("index_attempt", "last_heartbeat_time")
+    op.drop_column("index_attempt", "last_heartbeat_value")
+    op.drop_column("index_attempt", "heartbeat_counter")
+    op.drop_column("index_attempt", "total_chunks")
+    op.drop_column("index_attempt", "total_failures_batch_level")
+    op.drop_column("index_attempt", "completed_batches")
+    op.drop_column("index_attempt", "total_batches")
+    op.drop_column("index_attempt", "cancellation_requested")
+    op.drop_column("index_attempt", "celery_task_id")
--- a/backend/alembic/versions/36e9220ab794_update_kg_trigger_functions.py
+++ b/backend/alembic/versions/36e9220ab794_update_kg_trigger_functions.py
@@ -0,0 +1,136 @@
+"""update_kg_trigger_functions
+
+Revision ID: 36e9220ab794
+Revises: c9e2cd766c29
+Create Date: 2025-06-22 17:33:25.833733
+
+"""
+
+from alembic import op
+from sqlalchemy.orm import Session
+from sqlalchemy import text
+from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
+
+# revision identifiers, used by Alembic.
+revision = "36e9220ab794"
+down_revision = "c9e2cd766c29"
+branch_labels = None
+depends_on = None
+
+
+def _get_tenant_contextvar(session: Session) -> str:
+    """Return the schema the migration is currently running against."""
+    schema_name = session.execute(text("SELECT current_schema()")).scalar()
+    # scalar() is not guaranteed to yield a str, so validate before returning
+    if not isinstance(schema_name, str):
+        raise ValueError("Current tenant is not a string")
+    return schema_name
+
+
+def upgrade() -> None:
+    """(Re)create the kg_entity and document trigger functions and their triggers."""
+    bind = op.get_bind()
+    session = Session(bind=bind)
+
+    # Create kg_entity trigger to update kg_entity.name and its trigrams
+    tenant_id = _get_tenant_contextvar(session)
+    alphanum_pattern = r"[^a-z0-9]+"
+    truncate_length = 1000
+    function = "update_kg_entity_name"
+    op.execute(
+        text(
+            f"""
+            CREATE OR REPLACE FUNCTION "{tenant_id}".{function}()
+            RETURNS TRIGGER AS $$
+            DECLARE
+                name text;
+                cleaned_name text;
+            BEGIN
+                -- Set name to semantic_id if document_id is not NULL
+                IF NEW.document_id IS NOT NULL THEN
+                    SELECT lower(semantic_id) INTO name
+                    FROM "{tenant_id}".document
+                    WHERE id = NEW.document_id;
+                ELSE
+                    name = lower(NEW.name);
+                END IF;
+
+                -- Clean name and truncate if too long
+                cleaned_name = regexp_replace(
+                    name,
+                    '{alphanum_pattern}', '', 'g'
+                );
+                IF length(cleaned_name) > {truncate_length} THEN
+                    cleaned_name = left(cleaned_name, {truncate_length});
+                END IF;
+
+                -- Set name and name trigrams
+                NEW.name = name;
+                NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name);
+                RETURN NEW;
+            END;
+            $$ LANGUAGE plpgsql;
+            """
+        )
+    )
+    trigger = f"{function}_trigger"
+    op.execute(f'DROP TRIGGER IF EXISTS {trigger} ON "{tenant_id}".kg_entity')
+    op.execute(
+        f"""
+        CREATE TRIGGER {trigger}
+            BEFORE INSERT OR UPDATE OF name
+            ON "{tenant_id}".kg_entity
+            FOR EACH ROW
+            EXECUTE FUNCTION "{tenant_id}".{function}();
+        """
+    )
+
+    # Create document trigger to propagate semantic_id changes to kg_entity names/trigrams
+    function = "update_kg_entity_name_from_doc"
+    op.execute(
+        text(
+            f"""
+            CREATE OR REPLACE FUNCTION "{tenant_id}".{function}()
+            RETURNS TRIGGER AS $$
+            DECLARE
+                doc_name text;
+                cleaned_name text;
+            BEGIN
+                doc_name = lower(NEW.semantic_id);
+
+                -- Clean name and truncate if too long
+                cleaned_name = regexp_replace(
+                    doc_name,
+                    '{alphanum_pattern}', '', 'g'
+                );
+                IF length(cleaned_name) > {truncate_length} THEN
+                    cleaned_name = left(cleaned_name, {truncate_length});
+                END IF;
+
+                -- Set name and name trigrams for all entities referencing this document
+                UPDATE "{tenant_id}".kg_entity
+                SET
+                    name = doc_name,
+                    name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name)
+                WHERE document_id = NEW.id;
+                RETURN NEW;
+            END;
+            $$ LANGUAGE plpgsql;
+            """
+        )
+    )
+    trigger = f"{function}_trigger"
+    op.execute(f'DROP TRIGGER IF EXISTS {trigger} ON "{tenant_id}".document')
+    op.execute(
+        f"""
+        CREATE TRIGGER {trigger}
+            AFTER UPDATE OF semantic_id
+            ON "{tenant_id}".document
+            FOR EACH ROW
+            EXECUTE FUNCTION "{tenant_id}".{function}();
+        """
+    )
+
+
+def downgrade() -> None:
+    """No-op: the trigger functions and triggers set up by upgrade() are left in place."""
--- a/backend/alembic/versions/3bd4c84fe72f_improved_index.py
+++ b/backend/alembic/versions/3bd4c84fe72f_improved_index.py
@@ -21,22 +21,14 @@ depends_on = None
 # an outage by creating an index without using CONCURRENTLY. This migration:
 #
 # 1. Creates more efficient full-text search capabilities using tsvector columns and GIN indexes
-# 2. Uses CONCURRENTLY for all index creation to prevent table locking
-# 3. Explicitly manages transactions with COMMIT statements to allow CONCURRENTLY to work
-# (see: https://www.postgresql.org/docs/9.4/sql-createindex.html#SQL-CREATEINDEX-CONCURRENTLY)
-# (see: https://github.com/sqlalchemy/alembic/issues/277)
-# 4. Adds indexes to both chat_message and chat_session tables for comprehensive search
+# 2. Adds indexes to both chat_message and chat_session tables for comprehensive search
+# 3. Note: CONCURRENTLY was removed due to operational issues


 def upgrade() -> None:
    # First, drop any existing indexes to avoid conflicts
-    op.execute("COMMIT")
-    op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_message_tsv;")
-
-    op.execute("COMMIT")
-    op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_session_desc_tsv;")
-
-    op.execute("COMMIT")
+    op.execute("DROP INDEX IF EXISTS idx_chat_message_tsv;")
+    op.execute("DROP INDEX IF EXISTS idx_chat_session_desc_tsv;")
    op.execute("DROP INDEX IF EXISTS idx_chat_message_message_lower;")

    # Drop existing columns if they exist
@@ -52,12 +44,9 @@ def upgrade() -> None:
        """
    )

-    # Commit the current transaction before creating concurrent indexes
-    op.execute("COMMIT")
-
    op.execute(
        """
-        CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_message_tsv
+        CREATE INDEX IF NOT EXISTS idx_chat_message_tsv
        ON chat_message
        USING GIN (message_tsv)
        """
@@ -72,12 +61,9 @@ def upgrade() -> None:
        """
    )

-    # Commit again before creating the second concurrent index
-    op.execute("COMMIT")
-
    op.execute(
        """
-        CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_session_desc_tsv
+        CREATE INDEX IF NOT EXISTS idx_chat_session_desc_tsv
        ON chat_session
        USING GIN (description_tsv)
        """
@@ -85,12 +71,9 @@ def upgrade() -> None:


 def downgrade() -> None:
-    # Drop the indexes first (use CONCURRENTLY for dropping too)
-    op.execute("COMMIT")
-    op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_message_tsv;")
-
-    op.execute("COMMIT")
-    op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_session_desc_tsv;")
+    # Drop the indexes first
+    op.execute("DROP INDEX IF EXISTS idx_chat_message_tsv;")
+    op.execute("DROP INDEX IF EXISTS idx_chat_session_desc_tsv;")

    # Then drop the columns
    op.execute("ALTER TABLE chat_message DROP COLUMN IF EXISTS message_tsv;")
--- a/backend/alembic/versions/3fc5d75723b3_add_doc_metadata_field_in_document_model.py
+++ b/backend/alembic/versions/3fc5d75723b3_add_doc_metadata_field_in_document_model.py
@@ -0,0 +1,30 @@
+"""add_doc_metadata_field_in_document_model
+
+Revision ID: 3fc5d75723b3
+Revises: 2f95e36923e6
+Create Date: 2025-07-28 18:45:37.985406
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "3fc5d75723b3"
+down_revision = "2f95e36923e6"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Add the nullable JSONB column `doc_metadata` to the `document` table."""
+    doc_metadata_column = sa.Column(
+        "doc_metadata", postgresql.JSONB(astext_type=sa.Text()), nullable=True
+    )
+
+    op.add_column("document", doc_metadata_column)
+
+
+def downgrade() -> None:
+    op.drop_column("document", "doc_metadata")  # reverses the column added by upgrade()
--- a/backend/alembic/versions/47a07e1a38f1_fix_invalid_model_configurations_state.py
+++ b/backend/alembic/versions/47a07e1a38f1_fix_invalid_model_configurations_state.py
@@ -0,0 +1,150 @@
+"""Fix invalid model-configurations state
+
+Revision ID: 47a07e1a38f1
+Revises: 7a70b7664e37
+Create Date: 2025-04-23 15:39:43.159504
+
+"""
+
+from alembic import op
+from pydantic import BaseModel, ConfigDict
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+from onyx.llm.llm_provider_options import (
+    fetch_model_names_for_provider_as_set,
+    fetch_visible_model_names_for_provider_as_set,
+)
+
+
+# revision identifiers, used by Alembic.
+revision = "47a07e1a38f1"
+down_revision = "7a70b7664e37"
+branch_labels = None
+depends_on = None
+
+
+class _SimpleModelConfiguration(BaseModel):
+    """Typed copy of a `model_configuration` row, built with `model_validate`."""
+    model_config = ConfigDict(from_attributes=True)  # read fields from attributes
+
+    id: int
+    llm_provider_id: int
+    name: str
+    is_visible: bool
+    max_input_tokens: int | None
+
+
+def upgrade() -> None:
+    """Repair LLM providers whose model configurations are missing or all hidden.
+
+    Providers unknown to `fetch_model_names_for_provider_as_set` are skipped. For
+    the others:
+    - no configuration rows: insert one per default model, visible only if the
+      model is one of the provider's display models;
+    - rows exist but none is visible: upsert every display model as visible;
+    - at least one visible row: leave the provider untouched.
+    """
+
+    llm_provider_table = sa.sql.table(
+        "llm_provider",
+        sa.column("id", sa.Integer),
+        sa.column("provider", sa.String),
+        sa.column("model_names", postgresql.ARRAY(sa.String)),
+        sa.column("display_model_names", postgresql.ARRAY(sa.String)),
+        sa.column("default_model_name", sa.String),
+        sa.column("fast_default_model_name", sa.String),
+    )
+    model_configuration_table = sa.sql.table(
+        "model_configuration",
+        sa.column("id", sa.Integer),
+        sa.column("llm_provider_id", sa.Integer),
+        sa.column("name", sa.String),
+        sa.column("is_visible", sa.Boolean),
+        sa.column("max_input_tokens", sa.Integer),
+    )
+
+    connection = op.get_bind()
+
+    llm_providers = connection.execute(
+        sa.select(
+            llm_provider_table.c.id,
+            llm_provider_table.c.provider,
+        )
+    ).fetchall()
+
+    for llm_provider_id, provider_name in llm_providers:
+        default_models = fetch_model_names_for_provider_as_set(provider_name)
+        display_models = fetch_visible_model_names_for_provider_as_set(
+            provider_name=provider_name
+        )
+
+        # if `fetch_model_names_for_provider_as_set` returns `None`, then
+        # that means that `provider_name` is not a well-known llm provider.
+        if not default_models:
+            continue
+
+        if not display_models:
+            raise RuntimeError(
+                "If `default_models` is non-None, `display_models` must be non-None too."
+            )
+
+        # Load this provider's existing configuration rows as typed objects.
+        model_configurations = [
+            _SimpleModelConfiguration.model_validate(model_configuration)
+            for model_configuration in connection.execute(
+                sa.select(
+                    model_configuration_table.c.id,
+                    model_configuration_table.c.llm_provider_id,
+                    model_configuration_table.c.name,
+                    model_configuration_table.c.is_visible,
+                    model_configuration_table.c.max_input_tokens,
+                ).where(model_configuration_table.c.llm_provider_id == llm_provider_id)
+            ).fetchall()
+        ]
+
+        if not model_configurations:
+            # Nothing is configured for this provider yet: seed its default models.
+            for model_name in default_models:
+                connection.execute(
+                    model_configuration_table.insert().values(
+                        llm_provider_id=llm_provider_id,
+                        name=model_name,
+                        is_visible=model_name in display_models,
+                        max_input_tokens=None,
+                    )
+                )
+            continue
+
+        # If there is at least one model which is public, this is a valid state.
+        # Therefore, don't touch it and move on to the next one.
+        if any(
+            model_configuration.is_visible
+            for model_configuration in model_configurations
+        ):
+            continue
+
+        # No existing row is visible here, so there is nothing to subtract from
+        # `display_models`: every display model must end up visible.
+        for model_name in display_models:
+            if not model_name:
+                continue
+
+            insert_statement = postgresql.insert(model_configuration_table).values(
+                llm_provider_id=llm_provider_id,
+                name=model_name,
+                is_visible=True,
+                max_input_tokens=None,
+            )
+
+            # Upsert: a row that already exists for this model is flipped to visible.
+            connection.execute(
+                insert_statement.on_conflict_do_update(
+                    index_elements=["llm_provider_id", "name"],
+                    set_={"is_visible": insert_statement.excluded.is_visible},
+                )
+            )
+
+
+def downgrade() -> None:
+    """No-op: rows inserted or updated by upgrade() are not reverted."""
--- a/backend/alembic/versions/495cb26ce93e_create_knowlege_graph_tables.py
+++ b/backend/alembic/versions/495cb26ce93e_create_knowlege_graph_tables.py
@@ -0,0 +1,691 @@
+"""create knowledge graph tables
+
+Revision ID: 495cb26ce93e
+Revises: ca04500b9ee8
+Create Date: 2025-03-19 08:51:14.341989
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+from sqlalchemy import text
+from datetime import datetime, timedelta
+
+from onyx.configs.app_configs import DB_READONLY_USER
+from onyx.configs.app_configs import DB_READONLY_PASSWORD
+from shared_configs.configs import MULTI_TENANT
+from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
+
+
+# revision identifiers, used by Alembic.
+# `revision` is this migration's own id; `down_revision` is the migration it
+# is applied on top of (the "Revises" entry in the module docstring).
+revision = "495cb26ce93e"
+down_revision = "ca04500b9ee8"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Create the knowledge-graph (KG) schema objects.
+
+    Steps, in order:
+
+    1. Single-tenant only (``MULTI_TENANT`` is falsy): enable ``pg_trgm`` and
+       create the ``DB_READONLY_USER`` role (if missing) with nothing but the
+       CONNECT privilege on the current database.
+    2. Grant that role USAGE on the current schema, if the role exists.
+    3. Drop (``IF EXISTS ... CASCADE``) and re-create the ``kg_*`` tables and
+       seed ``kg_config`` with its initial rows.
+    4. Add the ``kg_stage`` / ``kg_processing_time`` columns to ``document``
+       and ``kg_processing_enabled`` / ``kg_coverage_days`` to ``connector``.
+    5. Create the trigram GIN indexes on ``kg_entity`` and install the two
+       triggers that maintain ``kg_entity.name`` and ``kg_entity.name_trigrams``.
+
+    Raises:
+        Exception: in single-tenant mode, when ``DB_READONLY_USER`` or
+            ``DB_READONLY_PASSWORD`` is unset/empty.
+    """
+
+    # Create a new permission-less user to be later used for knowledge graph queries.
+    # The user will later get temporary read privileges for a specific view that will be
+    # ad hoc generated specific to a knowledge graph query.
+    #
+    # Note: in order for the migration to run, the DB_READONLY_USER and DB_READONLY_PASSWORD
+    # environment variables MUST be set. Otherwise, an exception will be raised.
+
+    if not MULTI_TENANT:
+
+        # Enable pg_trgm extension if not already enabled
+        op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")
+
+        # Create read-only db user here only in single tenant mode. For multi-tenant mode,
+        # the user is created in the alembic_tenants migration.
+        if not (DB_READONLY_USER and DB_READONLY_PASSWORD):
+            raise Exception("DB_READONLY_USER or DB_READONLY_PASSWORD is not set")
+
+        # NOTE(review): the role name and password are spliced into the SQL text
+        # with an f-string. A value containing a single quote, "$$", or a
+        # ":word" sequence (which SQLAlchemy's text() presumably treats as a
+        # bind parameter) would likely break this statement -- confirm that the
+        # configured values are constrained accordingly.
+        op.execute(
+            text(
+                f"""
+                DO $$
+                BEGIN
+                    -- Check if the read-only user already exists
+                    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
+                        -- Create the read-only user with the specified password
+                        EXECUTE format('CREATE USER %I WITH PASSWORD %L', '{DB_READONLY_USER}', '{DB_READONLY_PASSWORD}');
+                        -- First revoke all privileges to ensure a clean slate
+                        EXECUTE format('REVOKE ALL ON DATABASE %I FROM %I', current_database(), '{DB_READONLY_USER}');
+                        -- Grant only the CONNECT privilege to allow the user to connect to the database
+                        -- but not perform any operations without additional specific grants
+                        EXECUTE format('GRANT CONNECT ON DATABASE %I TO %I', current_database(), '{DB_READONLY_USER}');
+                    END IF;
+                END
+                $$;
+                """
+            )
+        )
+
+    # Grant usage on current schema to readonly user
+    # (runs in both single- and multi-tenant mode; it is a no-op when the role
+    # does not exist).
+    op.execute(
+        text(
+            f"""
+            DO $$
+            BEGIN
+                IF EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
+                    EXECUTE format('GRANT USAGE ON SCHEMA %I TO %I', current_schema(), '{DB_READONLY_USER}');
+                END IF;
+            END
+            $$;
+            """
+        )
+    )
+
+    # Every KG table below is dropped (IF EXISTS ... CASCADE) before being
+    # created, so any pre-existing table of the same name is discarded.
+    op.execute("DROP TABLE IF EXISTS kg_config CASCADE")
+    op.create_table(
+        "kg_config",
+        sa.Column("id", sa.Integer(), primary_key=True, nullable=False, index=True),
+        sa.Column("kg_variable_name", sa.String(), nullable=False, index=True),
+        sa.Column("kg_variable_values", postgresql.ARRAY(sa.String()), nullable=False),
+        sa.UniqueConstraint("kg_variable_name", name="uq_kg_config_variable_name"),
+    )
+
+    # Insert initial data into kg_config table
+    # (every value is stored as an array of strings, including booleans and
+    # numbers).
+    op.bulk_insert(
+        sa.table(
+            "kg_config",
+            sa.column("kg_variable_name", sa.String),
+            sa.column("kg_variable_values", postgresql.ARRAY(sa.String)),
+        ),
+        [
+            {"kg_variable_name": "KG_EXPOSED", "kg_variable_values": ["false"]},
+            {"kg_variable_name": "KG_ENABLED", "kg_variable_values": ["false"]},
+            {"kg_variable_name": "KG_VENDOR", "kg_variable_values": []},
+            {"kg_variable_name": "KG_VENDOR_DOMAINS", "kg_variable_values": []},
+            {"kg_variable_name": "KG_IGNORE_EMAIL_DOMAINS", "kg_variable_values": []},
+            {
+                "kg_variable_name": "KG_EXTRACTION_IN_PROGRESS",
+                "kg_variable_values": ["false"],
+            },
+            {
+                "kg_variable_name": "KG_CLUSTERING_IN_PROGRESS",
+                "kg_variable_values": ["false"],
+            },
+            {
+                "kg_variable_name": "KG_COVERAGE_START",
+                # Computed once, at migration time: 90 days before the
+                # migration host's local "now", formatted as YYYY-MM-DD.
+                "kg_variable_values": [
+                    (datetime.now() - timedelta(days=90)).strftime("%Y-%m-%d")
+                ],
+            },
+            {"kg_variable_name": "KG_MAX_COVERAGE_DAYS", "kg_variable_values": ["90"]},
+            {
+                "kg_variable_name": "KG_MAX_PARENT_RECURSION_DEPTH",
+                "kg_variable_values": ["2"],
+            },
+        ],
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_entity_type CASCADE")
+    op.create_table(
+        "kg_entity_type",
+        sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
+        sa.Column("description", sa.String(), nullable=True),
+        sa.Column("grounding", sa.String(), nullable=False),
+        sa.Column(
+            "attributes",
+            postgresql.JSONB,
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
+        # NOTE(review): `default=` (here and on the other boolean columns in
+        # this migration) is a Python-side default, unlike the `server_default=`
+        # used elsewhere -- confirm no database-level default is intended.
+        sa.Column("active", sa.Boolean(), nullable=False, default=False),
+        sa.Column("deep_extraction", sa.Boolean(), nullable=False, default=False),
+        sa.Column(
+            "time_updated",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            onupdate=sa.text("now()"),
+        ),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+        sa.Column("grounded_source_name", sa.String(), nullable=True),
+        sa.Column("entity_values", postgresql.ARRAY(sa.String()), nullable=True),
+        sa.Column(
+            "clustering",
+            postgresql.JSONB,
+            nullable=False,
+            server_default="{}",
+        ),
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_relationship_type CASCADE")
+    # Create KGRelationshipType table
+    # (both entity-type columns reference kg_entity_type.id_name).
+    op.create_table(
+        "kg_relationship_type",
+        sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
+        sa.Column("name", sa.String(), nullable=False, index=True),
+        sa.Column(
+            "source_entity_type_id_name", sa.String(), nullable=False, index=True
+        ),
+        sa.Column(
+            "target_entity_type_id_name", sa.String(), nullable=False, index=True
+        ),
+        sa.Column("definition", sa.Boolean(), nullable=False, default=False),
+        sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
+        sa.Column("type", sa.String(), nullable=False, index=True),
+        sa.Column("active", sa.Boolean(), nullable=False, default=True),
+        sa.Column(
+            "time_updated",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            onupdate=sa.text("now()"),
+        ),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+        sa.Column(
+            "clustering",
+            postgresql.JSONB,
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.ForeignKeyConstraint(
+            ["source_entity_type_id_name"], ["kg_entity_type.id_name"]
+        ),
+        sa.ForeignKeyConstraint(
+            ["target_entity_type_id_name"], ["kg_entity_type.id_name"]
+        ),
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_relationship_type_extraction_staging CASCADE")
+    # Create KGRelationshipTypeExtractionStaging table
+    # (same columns as kg_relationship_type minus time_updated, plus a
+    # `transferred` flag).
+    op.create_table(
+        "kg_relationship_type_extraction_staging",
+        sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
+        sa.Column("name", sa.String(), nullable=False, index=True),
+        sa.Column(
+            "source_entity_type_id_name", sa.String(), nullable=False, index=True
+        ),
+        sa.Column(
+            "target_entity_type_id_name", sa.String(), nullable=False, index=True
+        ),
+        sa.Column("definition", sa.Boolean(), nullable=False, default=False),
+        sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
+        sa.Column("type", sa.String(), nullable=False, index=True),
+        sa.Column("active", sa.Boolean(), nullable=False, default=True),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+        sa.Column(
+            "clustering",
+            postgresql.JSONB,
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column("transferred", sa.Boolean(), nullable=False, server_default="false"),
+        sa.ForeignKeyConstraint(
+            ["source_entity_type_id_name"], ["kg_entity_type.id_name"]
+        ),
+        sa.ForeignKeyConstraint(
+            ["target_entity_type_id_name"], ["kg_entity_type.id_name"]
+        ),
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_entity CASCADE")
+
+    # Create KGEntity table
+    # (references kg_entity_type and document; unique per
+    # (name, entity_type_id_name, document_id)).
+    op.create_table(
+        "kg_entity",
+        sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
+        sa.Column("name", sa.String(), nullable=False, index=True),
+        sa.Column("entity_class", sa.String(), nullable=True, index=True),
+        sa.Column("entity_subtype", sa.String(), nullable=True, index=True),
+        sa.Column("entity_key", sa.String(), nullable=True, index=True),
+        sa.Column("name_trigrams", postgresql.ARRAY(sa.String(3)), nullable=True),
+        sa.Column("document_id", sa.String(), nullable=True, index=True),
+        sa.Column(
+            "alternative_names",
+            postgresql.ARRAY(sa.String()),
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column("entity_type_id_name", sa.String(), nullable=False, index=True),
+        sa.Column("description", sa.String(), nullable=True),
+        sa.Column(
+            "keywords",
+            postgresql.ARRAY(sa.String()),
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
+        sa.Column(
+            "acl", postgresql.ARRAY(sa.String()), nullable=False, server_default="{}"
+        ),
+        sa.Column("boosts", postgresql.JSONB, nullable=False, server_default="{}"),
+        sa.Column("attributes", postgresql.JSONB, nullable=False, server_default="{}"),
+        sa.Column("event_time", sa.DateTime(timezone=True), nullable=True),
+        sa.Column(
+            "time_updated",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            onupdate=sa.text("now()"),
+        ),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+        sa.ForeignKeyConstraint(["entity_type_id_name"], ["kg_entity_type.id_name"]),
+        sa.ForeignKeyConstraint(["document_id"], ["document.id"]),
+        sa.UniqueConstraint(
+            "name",
+            "entity_type_id_name",
+            "document_id",
+            name="uq_kg_entity_name_type_doc",
+        ),
+    )
+    op.create_index("ix_entity_type_acl", "kg_entity", ["entity_type_id_name", "acl"])
+    op.create_index(
+        "ix_entity_name_search", "kg_entity", ["name", "entity_type_id_name"]
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_entity_extraction_staging CASCADE")
+    # Create KGEntityExtractionStaging table
+    # (staging counterpart of kg_entity; adds transferred_id_name and
+    # parent_key, and has no name_trigrams / time_updated columns).
+    op.create_table(
+        "kg_entity_extraction_staging",
+        sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
+        sa.Column("name", sa.String(), nullable=False, index=True),
+        sa.Column("document_id", sa.String(), nullable=True, index=True),
+        sa.Column(
+            "alternative_names",
+            postgresql.ARRAY(sa.String()),
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column("entity_type_id_name", sa.String(), nullable=False, index=True),
+        sa.Column("description", sa.String(), nullable=True),
+        sa.Column(
+            "keywords",
+            postgresql.ARRAY(sa.String()),
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
+        sa.Column(
+            "acl", postgresql.ARRAY(sa.String()), nullable=False, server_default="{}"
+        ),
+        sa.Column("boosts", postgresql.JSONB, nullable=False, server_default="{}"),
+        sa.Column("attributes", postgresql.JSONB, nullable=False, server_default="{}"),
+        sa.Column("transferred_id_name", sa.String(), nullable=True, default=None),
+        sa.Column("entity_class", sa.String(), nullable=True, index=True),
+        sa.Column("entity_key", sa.String(), nullable=True, index=True),
+        sa.Column("entity_subtype", sa.String(), nullable=True, index=True),
+        sa.Column("parent_key", sa.String(), nullable=True, index=True),
+        sa.Column("event_time", sa.DateTime(timezone=True), nullable=True),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+        sa.ForeignKeyConstraint(["entity_type_id_name"], ["kg_entity_type.id_name"]),
+        sa.ForeignKeyConstraint(["document_id"], ["document.id"]),
+    )
+    op.create_index(
+        "ix_entity_extraction_staging_acl",
+        "kg_entity_extraction_staging",
+        ["entity_type_id_name", "acl"],
+    )
+    op.create_index(
+        "ix_entity_extraction_staging_name_search",
+        "kg_entity_extraction_staging",
+        ["name", "entity_type_id_name"],
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_relationship CASCADE")
+    # Create KGRelationship table
+    # (nodes reference kg_entity, node types reference kg_entity_type, and the
+    # relationship type references kg_relationship_type).
+    op.create_table(
+        "kg_relationship",
+        sa.Column("id_name", sa.String(), nullable=False, index=True),
+        sa.Column("source_node", sa.String(), nullable=False, index=True),
+        sa.Column("target_node", sa.String(), nullable=False, index=True),
+        sa.Column("source_node_type", sa.String(), nullable=False, index=True),
+        sa.Column("target_node_type", sa.String(), nullable=False, index=True),
+        sa.Column("source_document", sa.String(), nullable=True, index=True),
+        sa.Column("type", sa.String(), nullable=False, index=True),
+        sa.Column("relationship_type_id_name", sa.String(), nullable=False, index=True),
+        sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
+        sa.Column(
+            "time_updated",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            onupdate=sa.text("now()"),
+        ),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+        sa.ForeignKeyConstraint(["source_node"], ["kg_entity.id_name"]),
+        sa.ForeignKeyConstraint(["target_node"], ["kg_entity.id_name"]),
+        sa.ForeignKeyConstraint(["source_node_type"], ["kg_entity_type.id_name"]),
+        sa.ForeignKeyConstraint(["target_node_type"], ["kg_entity_type.id_name"]),
+        sa.ForeignKeyConstraint(["source_document"], ["document.id"]),
+        sa.ForeignKeyConstraint(
+            ["relationship_type_id_name"], ["kg_relationship_type.id_name"]
+        ),
+        sa.UniqueConstraint(
+            "source_node",
+            "target_node",
+            "type",
+            name="uq_kg_relationship_source_target_type",
+        ),
+        # NOTE(review): source_document is declared nullable=True above but is
+        # part of this primary key -- confirm which of the two is intended.
+        sa.PrimaryKeyConstraint("id_name", "source_document"),
+    )
+    op.create_index(
+        "ix_kg_relationship_nodes", "kg_relationship", ["source_node", "target_node"]
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_relationship_extraction_staging CASCADE")
+    # Create KGRelationshipExtractionStaging table
+    # (staging counterpart of kg_relationship; its foreign keys point at the
+    # *_extraction_staging entity and relationship-type tables).
+    op.create_table(
+        "kg_relationship_extraction_staging",
+        sa.Column("id_name", sa.String(), nullable=False, index=True),
+        sa.Column("source_node", sa.String(), nullable=False, index=True),
+        sa.Column("target_node", sa.String(), nullable=False, index=True),
+        sa.Column("source_node_type", sa.String(), nullable=False, index=True),
+        sa.Column("target_node_type", sa.String(), nullable=False, index=True),
+        sa.Column("source_document", sa.String(), nullable=True, index=True),
+        sa.Column("type", sa.String(), nullable=False, index=True),
+        sa.Column("relationship_type_id_name", sa.String(), nullable=False, index=True),
+        sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
+        sa.Column("transferred", sa.Boolean(), nullable=False, server_default="false"),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+        sa.ForeignKeyConstraint(
+            ["source_node"], ["kg_entity_extraction_staging.id_name"]
+        ),
+        sa.ForeignKeyConstraint(
+            ["target_node"], ["kg_entity_extraction_staging.id_name"]
+        ),
+        sa.ForeignKeyConstraint(["source_node_type"], ["kg_entity_type.id_name"]),
+        sa.ForeignKeyConstraint(["target_node_type"], ["kg_entity_type.id_name"]),
+        sa.ForeignKeyConstraint(["source_document"], ["document.id"]),
+        sa.ForeignKeyConstraint(
+            ["relationship_type_id_name"],
+            ["kg_relationship_type_extraction_staging.id_name"],
+        ),
+        sa.UniqueConstraint(
+            "source_node",
+            "target_node",
+            "type",
+            name="uq_kg_relationship_extraction_staging_source_target_type",
+        ),
+        # NOTE(review): as in kg_relationship, source_document is nullable=True
+        # yet part of the primary key -- confirm.
+        sa.PrimaryKeyConstraint("id_name", "source_document"),
+    )
+    op.create_index(
+        "ix_kg_relationship_extraction_staging_nodes",
+        "kg_relationship_extraction_staging",
+        ["source_node", "target_node"],
+    )
+
+    op.execute("DROP TABLE IF EXISTS kg_term CASCADE")
+    # Create KGTerm table
+    op.create_table(
+        "kg_term",
+        sa.Column("id_term", sa.String(), primary_key=True, nullable=False, index=True),
+        sa.Column(
+            "entity_types",
+            postgresql.ARRAY(sa.String()),
+            nullable=False,
+            server_default="{}",
+        ),
+        sa.Column(
+            "time_updated",
+            sa.DateTime(timezone=True),
+            server_default=sa.text("now()"),
+            onupdate=sa.text("now()"),
+        ),
+        sa.Column(
+            "time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
+        ),
+    )
+    op.create_index("ix_search_term_entities", "kg_term", ["entity_types"])
+    op.create_index("ix_search_term_term", "kg_term", ["id_term"])
+
+    # Add the KG bookkeeping columns to the pre-existing document and
+    # connector tables (all nullable, so existing rows need no backfill).
+    op.add_column(
+        "document",
+        sa.Column("kg_stage", sa.String(), nullable=True, index=True),
+    )
+    op.add_column(
+        "document",
+        sa.Column("kg_processing_time", sa.DateTime(timezone=True), nullable=True),
+    )
+    op.add_column(
+        "connector",
+        sa.Column(
+            "kg_processing_enabled",
+            sa.Boolean(),
+            nullable=True,
+            server_default="false",
+        ),
+    )
+
+    op.add_column(
+        "connector",
+        sa.Column(
+            "kg_coverage_days",
+            sa.Integer(),
+            nullable=True,
+            server_default=None,
+        ),
+    )
+
+    # Create GIN index for clustering and normalization
+    # The trigram operator class is schema-qualified with
+    # POSTGRES_DEFAULT_SCHEMA -- presumably the schema pg_trgm is installed in.
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS idx_kg_entity_clustering_trigrams "
+        f"ON kg_entity USING GIN (name {POSTGRES_DEFAULT_SCHEMA}.gin_trgm_ops)"
+    )
+    op.execute(
+        "CREATE INDEX IF NOT EXISTS idx_kg_entity_normalization_trigrams "
+        "ON kg_entity USING GIN (name_trigrams)"
+    )
+
+    # Create kg_entity trigger to update kg_entity.name and its trigrams
+    # The function lower-cases the name (taken from document.semantic_id when
+    # the entity has a document_id), strips every character outside [a-z0-9],
+    # truncates the result to `truncate_length` characters and stores its
+    # trigrams. `alphanum_pattern` and `truncate_length` are reused by the
+    # document-side trigger function further down.
+    alphanum_pattern = r"[^a-z0-9]+"
+    truncate_length = 1000
+    function = "update_kg_entity_name"
+    op.execute(
+        text(
+            f"""
+            CREATE OR REPLACE FUNCTION {function}()
+            RETURNS TRIGGER AS $$
+            DECLARE
+                name text;
+                cleaned_name text;
+            BEGIN
+                -- Set name to semantic_id if document_id is not NULL
+                IF NEW.document_id IS NOT NULL THEN
+                    SELECT lower(semantic_id) INTO name
+                    FROM document
+                    WHERE id = NEW.document_id;
+                ELSE
+                    name = lower(NEW.name);
+                END IF;
+
+                -- Clean name and truncate if too long
+                cleaned_name = regexp_replace(
+                    name,
+                    '{alphanum_pattern}', '', 'g'
+                );
+                IF length(cleaned_name) > {truncate_length} THEN
+                    cleaned_name = left(cleaned_name, {truncate_length});
+                END IF;
+
+                -- Set name and name trigrams
+                NEW.name = name;
+                NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name);
+                RETURN NEW;
+            END;
+            $$ LANGUAGE plpgsql;
+            """
+        )
+    )
+    # Re-create the trigger from scratch so the migration can be re-applied.
+    trigger = f"{function}_trigger"
+    op.execute(f"DROP TRIGGER IF EXISTS {trigger} ON kg_entity")
+    op.execute(
+        f"""
+        CREATE TRIGGER {trigger}
+            BEFORE INSERT OR UPDATE OF name
+            ON kg_entity
+            FOR EACH ROW
+            EXECUTE FUNCTION {function}();
+        """
+    )
+
+    # Create document trigger that propagates a changed document.semantic_id
+    # to the name and trigrams of every kg_entity row referencing that document
+    function = "update_kg_entity_name_from_doc"
+    op.execute(
+        text(
+            f"""
+            CREATE OR REPLACE FUNCTION {function}()
+            RETURNS TRIGGER AS $$
+            DECLARE
+                doc_name text;
+                cleaned_name text;
+            BEGIN
+                doc_name = lower(NEW.semantic_id);
+
+                -- Clean name and truncate if too long
+                cleaned_name = regexp_replace(
+                    doc_name,
+                    '{alphanum_pattern}', '', 'g'
+                );
+                IF length(cleaned_name) > {truncate_length} THEN
+                    cleaned_name = left(cleaned_name, {truncate_length});
+                END IF;
+
+                -- Set name and name trigrams for all entities referencing this document
+                UPDATE kg_entity
+                SET
+                    name = doc_name,
+                    name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name)
+                WHERE document_id = NEW.id;
+                RETURN NEW;
+            END;
+            $$ LANGUAGE plpgsql;
+            """
+        )
+    )
+    trigger = f"{function}_trigger"
+    op.execute(f"DROP TRIGGER IF EXISTS {trigger} ON document")
+    op.execute(
+        f"""
+        CREATE TRIGGER {trigger}
+            AFTER UPDATE OF semantic_id
+            ON document
+            FOR EACH ROW
+            EXECUTE FUNCTION {function}();
+        """
+    )
+
+
+def downgrade() -> None:
+    """Undo the knowledge-graph migration.
+
+    In order: drops the matching views in the current schema, the two
+    kg_entity-name triggers and their functions, the trigram indexes, the
+    ``kg_*`` tables, the KG columns on ``connector`` / ``document`` and the
+    ``kg_config`` table; then revokes all schema privileges from
+    ``DB_READONLY_USER`` (if that role exists). In single-tenant mode it also
+    drops that role and the ``pg_trgm`` extension.
+    """
+    # Drop every view in the current schema whose name matches one of these
+    # LIKE patterns (one DO block per pattern).
+    for view_pattern in ("kg_relationships_with_access%", "allowed_docs%"):
+        op.execute(
+            f"""
+                DO $$
+                DECLARE
+                    view_name text;
+                BEGIN
+                    FOR view_name IN
+                        SELECT c.relname
+                        FROM pg_catalog.pg_class c
+                        JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
+                        WHERE c.relkind = 'v'
+                        AND n.nspname = current_schema()
+                        AND c.relname LIKE '{view_pattern}'
+                    LOOP
+                        EXECUTE 'DROP VIEW IF EXISTS ' || quote_ident(view_name);
+                    END LOOP;
+                END $$;
+            """
+        )
+
+    # (table the trigger is attached to, trigger function name)
+    trigger_functions = (
+        ("kg_entity", "update_kg_entity_name"),
+        ("document", "update_kg_entity_name_from_doc"),
+    )
+    for owning_table, function_name in trigger_functions:
+        op.execute(f"DROP TRIGGER IF EXISTS {function_name}_trigger ON {owning_table}")
+        op.execute(f"DROP FUNCTION IF EXISTS {function_name}()")
+
+    # Trigram indexes on kg_entity
+    for index_name in (
+        "idx_kg_entity_clustering_trigrams",
+        "idx_kg_entity_normalization_trigrams",
+    ):
+        op.execute(f"DROP INDEX IF EXISTS {index_name}")
+
+    # Tables are dropped so that each table goes before the tables its foreign
+    # keys reference.
+    for kg_table in (
+        "kg_term",
+        "kg_relationship",
+        "kg_entity",
+        "kg_relationship_type",
+        "kg_relationship_extraction_staging",
+        "kg_relationship_type_extraction_staging",
+        "kg_entity_extraction_staging",
+        "kg_entity_type",
+    ):
+        op.drop_table(kg_table)
+
+    # KG columns that the upgrade added to pre-existing tables
+    for owning_table, column_name in (
+        ("connector", "kg_processing_enabled"),
+        ("connector", "kg_coverage_days"),
+        ("document", "kg_stage"),
+        ("document", "kg_processing_time"),
+    ):
+        op.drop_column(owning_table, column_name)
+
+    op.drop_table("kg_config")
+
+    # Revoke usage on current schema for the readonly user
+    revoke_schema_privileges = text(
+        f"""
+            DO $$
+            BEGIN
+                IF EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
+                    EXECUTE format('REVOKE ALL ON SCHEMA %I FROM %I', current_schema(), '{DB_READONLY_USER}');
+                END IF;
+            END
+            $$;
+            """
+    )
+    op.execute(revoke_schema_privileges)
+
+    # In multi-tenant mode the read-only user is dropped in the alembic_tenants
+    # migration, so there is nothing left to do here.
+    if MULTI_TENANT:
+        return
+
+    # Single-tenant mode only: drop the read-only db user, then the extension.
+    drop_readonly_user = text(
+        f"""
+            DO $$
+            BEGIN
+                IF EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
+                    -- First revoke all privileges from the database
+                    EXECUTE format('REVOKE ALL ON DATABASE %I FROM %I', current_database(), '{DB_READONLY_USER}');
+                    -- Then drop the user
+                    EXECUTE format('DROP USER %I', '{DB_READONLY_USER}');
+                END IF;
+            END
+            $$;
+        """
+    )
+    op.execute(drop_readonly_user)
+    op.execute(text("DROP EXTENSION IF EXISTS pg_trgm"))
--- a/backend/alembic/versions/58c50ef19f08_add_stale_column_to_user__external_user_.py
+++ b/backend/alembic/versions/58c50ef19f08_add_stale_column_to_user__external_user_.py
@@ -0,0 +1,90 @@
+"""add stale column to external user group tables
+
+Revision ID: 58c50ef19f08
+Revises: 7b9b952abdf6
+Create Date: 2025-06-25 14:08:14.162380
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+# `revision` is this migration's own id; `down_revision` is the migration it
+# is applied on top of (the "Revises" entry in the module docstring).
+revision = "58c50ef19f08"
+down_revision = "7b9b952abdf6"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Add a ``stale`` flag and its two indexes to both external-group tables.
+
+    For each table, in order: add a non-null boolean ``stale`` column with
+    server default ``false``, create a non-unique index on
+    ``(cc_pair_id, stale)``, then a non-unique index on ``stale`` alone.
+    """
+    # (table, index on (cc_pair_id, stale), index on stale only)
+    targets = (
+        (
+            "user__external_user_group_id",
+            "ix_user__external_user_group_id_cc_pair_id_stale",
+            "ix_user__external_user_group_id_stale",
+        ),
+        (
+            "public_external_user_group",
+            "ix_public_external_user_group_cc_pair_id_stale",
+            "ix_public_external_user_group_stale",
+        ),
+    )
+
+    for table_name, cc_pair_stale_index, stale_index in targets:
+        # A fresh Column object is built for each table.
+        stale_column = sa.Column(
+            "stale", sa.Boolean(), nullable=False, server_default="false"
+        )
+        op.add_column(table_name, stale_column)
+
+        # Efficient lookup of stale rows for one cc_pair_id
+        op.create_index(
+            cc_pair_stale_index, table_name, ["cc_pair_id", "stale"], unique=False
+        )
+
+        # Efficient lookup of all stale rows
+        op.create_index(stale_index, table_name, ["stale"], unique=False)
+
+
+def downgrade() -> None:
+    """Remove the ``stale`` indexes and column from both external-group tables.
+
+    ``public_external_user_group`` is handled first, then
+    ``user__external_user_group_id``; within each table the ``stale``-only
+    index is dropped, then the ``(cc_pair_id, stale)`` index, then the column.
+    """
+    # (table, index on stale only, index on (cc_pair_id, stale))
+    targets = (
+        (
+            "public_external_user_group",
+            "ix_public_external_user_group_stale",
+            "ix_public_external_user_group_cc_pair_id_stale",
+        ),
+        (
+            "user__external_user_group_id",
+            "ix_user__external_user_group_id_stale",
+            "ix_user__external_user_group_id_cc_pair_id_stale",
+        ),
+    )
+
+    for table_name, stale_index, cc_pair_stale_index in targets:
+        # Indexes go first, then the column they cover.
+        for index_name in (stale_index, cc_pair_stale_index):
+            op.drop_index(index_name, table_name=table_name)
+        op.drop_column(table_name, "stale")
--- a/backend/alembic/versions/5c448911b12f_add_content_type_to_userfile.py
+++ b/backend/alembic/versions/5c448911b12f_add_content_type_to_userfile.py
@@ -0,0 +1,24 @@
+"""Add content type to UserFile
+
+Revision ID: 5c448911b12f
+Revises: 47a07e1a38f1
+Create Date: 2025-04-25 16:59:48.182672
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+# `revision` is this migration's own id; `down_revision` is the migration it
+# is applied on top of (the "Revises" entry in the module docstring).
+revision = "5c448911b12f"
+down_revision = "47a07e1a38f1"
+branch_labels: None = None
+depends_on: None = None
+
+
+def upgrade() -> None:
+    """Add a nullable string ``content_type`` column to ``user_file``."""
+    content_type_column = sa.Column("content_type", sa.String(), nullable=True)
+    op.add_column("user_file", content_type_column)
+
+
+def downgrade() -> None:
+    """Remove the ``content_type`` column from ``user_file``."""
+    op.drop_column(table_name="user_file", column_name="content_type")
--- a/backend/alembic/versions/62c3a055a141_add_file_names_to_file_connector_config.py
+++ b/backend/alembic/versions/62c3a055a141_add_file_names_to_file_connector_config.py
@@ -0,0 +1,132 @@
+"""add file names to file connector config
+
+Revision ID: 62c3a055a141
+Revises: 3fc5d75723b3
+Create Date: 2025-07-30 17:01:24.417551
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+import json
+import os
+import logging
+
+
+# revision identifiers, used by Alembic.
+revision = "62c3a055a141"
+down_revision = "3fc5d75723b3"
+branch_labels = None
+depends_on = None
+
+SKIP_FILE_NAME_MIGRATION = (
+    os.environ.get("SKIP_FILE_NAME_MIGRATION", "true").lower() == "true"
+)
+
+logger = logging.getLogger("alembic.runtime.migration")
+
+
+def upgrade() -> None:
+    """Add a file_names list to each FILE connector config; no-op if the skip flag is set."""
+    if SKIP_FILE_NAME_MIGRATION:
+        logger.info(
+            "Skipping file name migration. Hint: set SKIP_FILE_NAME_MIGRATION=false to run this migration"
+        )
+        return
+    logger.info("Running file name migration")
+    conn = op.get_bind()
+
+    # Get all FILE connectors with their configs
+    file_connectors = conn.execute(
+        sa.text(
+            """
+            SELECT id, connector_specific_config
+            FROM connector
+            WHERE source = 'FILE'
+        """
+        )
+    ).fetchall()
+
+    for connector_id, config in file_connectors:
+        # The driver may hand the config back as raw JSON text; decode it if so
+        if isinstance(config, str):
+            config = json.loads(config)
+
+        # Get file_locations list (treated as empty when the key is absent)
+        file_locations = config.get("file_locations", [])
+
+        # Build file_names in the same order as file_locations, one lookup per file_id
+        file_names = []
+        for file_id in file_locations:
+            result = conn.execute(
+                sa.text(
+                    """
+                    SELECT display_name
+                    FROM file_record
+                    WHERE file_id = :file_id
+                """
+                ),
+                {"file_id": file_id},
+            ).fetchone()
+
+            if result:
+                file_names.append(result[0])
+            else:
+                file_names.append(file_id)  # no file_record row: fall back to the id
+
+        # Add file_names to a copy of the config
+        new_config = dict(config)
+        new_config["file_names"] = file_names
+
+        # Write the config back, serialized as JSON text
+        conn.execute(
+            sa.text(
+                """
+                UPDATE connector
+                SET connector_specific_config = :new_config
+                WHERE id = :connector_id
+            """
+            ),
+            {"connector_id": connector_id, "new_config": json.dumps(new_config)},
+        )
+
+
+def downgrade() -> None:
+    """Strip the file_names key from every FILE connector config that has one."""
+    conn = op.get_bind()
+
+    # Fetch all FILE connectors with their configs
+    file_connectors = conn.execute(
+        sa.text(
+            """
+            SELECT id, connector_specific_config
+            FROM connector
+            WHERE source = 'FILE'
+        """
+        )
+    ).fetchall()
+
+    for connector_id, config in file_connectors:
+        # The driver may hand the config back as raw JSON text; decode it if so
+        if isinstance(config, str):
+            config = json.loads(config)
+
+        # Only rows that actually carry file_names are rewritten
+        if "file_names" in config:
+            new_config = dict(config)
+            del new_config["file_names"]
+
+            # Write the trimmed config back, serialized as JSON text
+            conn.execute(
+                sa.text(
+                    """
+                    UPDATE connector
+                    SET connector_specific_config = :new_config
+                    WHERE id = :connector_id
+                """
+                ),
+                {
+                    "connector_id": connector_id,
+                    "new_config": json.dumps(new_config),
+                },
+            )
--- a/backend/alembic/versions/65bc6e0f8500_remove_kg_subtype_from_db.py
+++ b/backend/alembic/versions/65bc6e0f8500_remove_kg_subtype_from_db.py
@@ -0,0 +1,41 @@
+"""remove kg subtype from db
+
+Revision ID: 65bc6e0f8500
+Revises: cec7ec36c505
+Create Date: 2025-06-13 10:04:27.705976
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "65bc6e0f8500"
+down_revision = "cec7ec36c505"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:  # drop entity_class/entity_subtype from both KG entity tables
+    op.drop_column("kg_entity", "entity_class")
+    op.drop_column("kg_entity", "entity_subtype")
+    op.drop_column("kg_entity_extraction_staging", "entity_class")
+    op.drop_column("kg_entity_extraction_staging", "entity_subtype")
+
+
+def downgrade() -> None:  # re-add the four columns as nullable strings; no data is restored
+    op.add_column(
+        "kg_entity_extraction_staging",
+        sa.Column("entity_subtype", sa.String(), nullable=True, index=True),
+    )
+    op.add_column(
+        "kg_entity_extraction_staging",
+        sa.Column("entity_class", sa.String(), nullable=True, index=True),
+    )
+    op.add_column(
+        "kg_entity", sa.Column("entity_subtype", sa.String(), nullable=True, index=True)
+    )
+    op.add_column(
+        "kg_entity", sa.Column("entity_class", sa.String(), nullable=True, index=True)
+    )
--- a/backend/alembic/versions/6a804aeb4830_duplicated_no_harm_user_file_migration.py
+++ b/backend/alembic/versions/6a804aeb4830_duplicated_no_harm_user_file_migration.py
@@ -6,12 +6,6 @@ Create Date: 2025-04-01 07:26:10.539362

 """

-from alembic import op
-import sqlalchemy as sa
-from sqlalchemy import inspect
-import datetime
-
-
 # revision identifiers, used by Alembic.
 revision = "6a804aeb4830"
 down_revision = "8e1ac4f39a9f"
@@ -19,99 +13,10 @@ branch_labels = None
 depends_on = None


+# Kept as a no-op only because some deployments may already be on this revision;
+# it was originally a duplicate of the user files migration.
 def upgrade() -> None:
-    # Check if user_file table already exists
-    conn = op.get_bind()
-    inspector = inspect(conn)
-
-    if not inspector.has_table("user_file"):
-        # Create user_folder table without parent_id
-        op.create_table(
-            "user_folder",
-            sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
-            sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
-            sa.Column("name", sa.String(length=255), nullable=True),
-            sa.Column("description", sa.String(length=255), nullable=True),
-            sa.Column("display_priority", sa.Integer(), nullable=True, default=0),
-            sa.Column(
-                "created_at", sa.DateTime(timezone=True), server_default=sa.func.now()
-            ),
-        )
-
-        # Create user_file table with folder_id instead of parent_folder_id
-        op.create_table(
-            "user_file",
-            sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
-            sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
-            sa.Column(
-                "folder_id",
-                sa.Integer(),
-                sa.ForeignKey("user_folder.id"),
-                nullable=True,
-            ),
-            sa.Column("link_url", sa.String(), nullable=True),
-            sa.Column("token_count", sa.Integer(), nullable=True),
-            sa.Column("file_type", sa.String(), nullable=True),
-            sa.Column("file_id", sa.String(length=255), nullable=False),
-            sa.Column("document_id", sa.String(length=255), nullable=False),
-            sa.Column("name", sa.String(length=255), nullable=False),
-            sa.Column(
-                "created_at",
-                sa.DateTime(),
-                default=datetime.datetime.utcnow,
-            ),
-            sa.Column(
-                "cc_pair_id",
-                sa.Integer(),
-                sa.ForeignKey("connector_credential_pair.id"),
-                nullable=True,
-                unique=True,
-            ),
-        )
-
-        # Create persona__user_file table
-        op.create_table(
-            "persona__user_file",
-            sa.Column(
-                "persona_id",
-                sa.Integer(),
-                sa.ForeignKey("persona.id"),
-                primary_key=True,
-            ),
-            sa.Column(
-                "user_file_id",
-                sa.Integer(),
-                sa.ForeignKey("user_file.id"),
-                primary_key=True,
-            ),
-        )
-
-        # Create persona__user_folder table
-        op.create_table(
-            "persona__user_folder",
-            sa.Column(
-                "persona_id",
-                sa.Integer(),
-                sa.ForeignKey("persona.id"),
-                primary_key=True,
-            ),
-            sa.Column(
-                "user_folder_id",
-                sa.Integer(),
-                sa.ForeignKey("user_folder.id"),
-                primary_key=True,
-            ),
-        )
-
-        op.add_column(
-            "connector_credential_pair",
-            sa.Column("is_user_file", sa.Boolean(), nullable=True, default=False),
-        )
-
-        # Update existing records to have is_user_file=False instead of NULL
-        op.execute(
-            "UPDATE connector_credential_pair SET is_user_file = FALSE WHERE is_user_file IS NULL"
-        )
+    pass


 def downgrade() -> None:
--- a/backend/alembic/versions/703313b75876_add_tokenratelimit_tables.py
+++ b/backend/alembic/versions/703313b75876_add_tokenratelimit_tables.py
@@ -6,11 +6,8 @@ Create Date: 2024-04-15 01:36:02.952809

 """

-import json
-from typing import cast
 from alembic import op
 import sqlalchemy as sa
-from onyx.key_value_store.factory import get_kv_store

 # revision identifiers, used by Alembic.
 revision = "703313b75876"
@@ -54,27 +51,10 @@ def upgrade() -> None:
        sa.PrimaryKeyConstraint("rate_limit_id", "user_group_id"),
    )

-    try:
-        settings_json = cast(str, get_kv_store().load("token_budget_settings"))
-        settings = json.loads(settings_json)
-
-        is_enabled = settings.get("enable_token_budget", False)
-        token_budget = settings.get("token_budget", -1)
-        period_hours = settings.get("period_hours", -1)
-
-        if is_enabled and token_budget > 0 and period_hours > 0:
-            op.execute(
-                f"INSERT INTO token_rate_limit \
-                    (enabled, token_budget, period_hours, scope) VALUES \
-                        ({is_enabled}, {token_budget}, {period_hours}, 'GLOBAL')"
-            )
-
-        # Delete the dynamic config
-        get_kv_store().delete("token_budget_settings")
-
-    except Exception:
-        # Ignore if the dynamic config is not found
-        pass
+    # NOTE: rate limit settings used to be stored in the "token_budget_settings" key in the
+    # KeyValueStore. This will now be lost. The KV store works differently than it used to
+    # so the migration is fairly complicated and likely not worth it to support (pretty much
+    # nobody will have it set)


 def downgrade() -> None:
--- a/backend/alembic/versions/7a70b7664e37_add_model_configuration_table.py
+++ b/backend/alembic/versions/7a70b7664e37_add_model_configuration_table.py
@@ -0,0 +1,237 @@
+"""Add model-configuration table
+
+Revision ID: 7a70b7664e37
+Revises: d961aca62eb3
+Create Date: 2025-04-10 15:00:35.984669
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+from onyx.llm.llm_provider_options import (
+    fetch_model_names_for_provider_as_set,
+    fetch_visible_model_names_for_provider_as_set,
+)
+
+# revision identifiers, used by Alembic.
+revision = "7a70b7664e37"
+down_revision = "d961aca62eb3"
+branch_labels = None
+depends_on = None
+
+
+def _resolve(
+    provider_name: str,
+    model_names: list[str] | None,
+    display_model_names: list[str] | None,
+    default_model_name: str,
+    fast_default_model_name: str | None,
+) -> set[tuple[str, bool]]:
+    """Return the (model_name, is_visible) pairs to store for one llm_provider row."""
+    models = set(model_names) if model_names else None
+    display_models = set(display_model_names) if display_model_names else None
+
+    # If both are defined, we need to make sure that `model_names` is a superset of `display_model_names`.
+    if models and display_models:
+        models = display_models.union(models)
+
+    # If only `model_names` is defined, then:
+    #   - If default-model-names are available for the `provider_name`, then set `display_model_names` to it
+    #     and set `model_names` to the union of those default-model-names with itself.
+    #   - If no default-model-names are available, then set `display_models` to `models`.
+    #
+    # This preserves the invariant that `display_models` is a subset of `models`.
+    elif models and not display_models:
+        visible_default_models = fetch_visible_model_names_for_provider_as_set(
+            provider_name=provider_name
+        )
+        if visible_default_models:
+            display_models = set(visible_default_models)
+            models = display_models.union(models)
+        else:
+            display_models = set(models)
+
+    # If only the `display_model_names` are defined, then set `models` to the union of `display_model_names`
+    # and the default-model-names for that provider.
+    #
+    # This will also preserve the invariant that `display_models` is a subset of `models`.
+    elif not models and display_models:
+        default_models = fetch_model_names_for_provider_as_set(
+            provider_name=provider_name
+        )
+        if default_models:
+            models = display_models.union(default_models)
+        else:
+            models = set(display_models)
+
+    # If neither are defined, then set `models` and `display_models` to the default-model-names for the given provider.
+    #
+    # This will also preserve the invariant that `display_models` is a subset of `models`.
+    else:
+        default_models = fetch_model_names_for_provider_as_set(
+            provider_name=provider_name
+        )
+        visible_default_models = fetch_visible_model_names_for_provider_as_set(
+            provider_name=provider_name
+        )
+
+        if default_models:
+            if not visible_default_models:
+                raise RuntimeError(
+                    "If `default_models` is non-None, `visible_default_models` must be non-None too."
+                )
+            # Copy, so the `.add()` calls below never mutate the sets the helpers returned.
+            models = set(default_models)
+            display_models = set(visible_default_models)
+
+        # This is not a well-known llm-provider; we can't provide any model suggestions.
+        # Therefore, we set to the empty set and continue
+        else:
+            models = set()
+            display_models = set()
+
+    # It is possible that `default_model_name` is not in `models` and is not in `display_models`.
+    # It is also possible that `fast_default_model_name` is not in `models` and is not in `display_models`.
+    models.add(default_model_name)
+    display_models.add(default_model_name)
+    if fast_default_model_name:
+        models.add(fast_default_model_name)
+        display_models.add(fast_default_model_name)
+
+    return {(model, model in display_models) for model in models}
+
+
+def upgrade() -> None:
+    """Create model_configuration, backfill it from llm_provider, then drop the old array columns."""
+    op.create_table(
+        "model_configuration",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("llm_provider_id", sa.Integer(), nullable=False),
+        sa.Column("name", sa.String(), nullable=False),
+        sa.Column("is_visible", sa.Boolean(), nullable=False),
+        sa.Column("max_input_tokens", sa.Integer(), nullable=True),
+        sa.ForeignKeyConstraint(
+            ["llm_provider_id"], ["llm_provider.id"], ondelete="CASCADE"
+        ),
+        sa.PrimaryKeyConstraint("id"),
+        sa.UniqueConstraint("llm_provider_id", "name"),
+    )
+
+    # Create temporary sqlalchemy references to tables for data migration
+    llm_provider_table = sa.sql.table(
+        "llm_provider",
+        sa.column("id", sa.Integer),
+        sa.column("provider", sa.String),
+        sa.column("model_names", postgresql.ARRAY(sa.String)),
+        sa.column("display_model_names", postgresql.ARRAY(sa.String)),
+        sa.column("default_model_name", sa.String),
+        sa.column("fast_default_model_name", sa.String),
+    )
+    model_configuration_table = sa.sql.table(
+        "model_configuration",
+        sa.column("id", sa.Integer),
+        sa.column("llm_provider_id", sa.Integer),
+        sa.column("name", sa.String),
+        sa.column("is_visible", sa.Boolean),
+        sa.column("max_input_tokens", sa.Integer),
+    )
+    connection = op.get_bind()
+    llm_providers = connection.execute(
+        sa.select(
+            llm_provider_table.c.id,
+            llm_provider_table.c.provider,
+            llm_provider_table.c.model_names,
+            llm_provider_table.c.display_model_names,
+            llm_provider_table.c.default_model_name,
+            llm_provider_table.c.fast_default_model_name,
+        )
+    ).fetchall()
+
+    for llm_provider in llm_providers:
+        provider_id = llm_provider[0]
+        provider_name = llm_provider[1]
+        model_names = llm_provider[2]
+        display_model_names = llm_provider[3]
+        default_model_name = llm_provider[4]
+        fast_default_model_name = llm_provider[5]
+        model_configurations = _resolve(
+            provider_name=provider_name,
+            model_names=model_names,
+            display_model_names=display_model_names,
+            default_model_name=default_model_name,
+            fast_default_model_name=fast_default_model_name,
+        )
+
+        for model_name, is_visible in model_configurations:
+            connection.execute(
+                model_configuration_table.insert().values(
+                    llm_provider_id=provider_id,
+                    name=model_name,
+                    is_visible=is_visible,
+                    max_input_tokens=None,
+                )
+            )
+
+    op.drop_column("llm_provider", "model_names")
+    op.drop_column("llm_provider", "display_model_names")
+
+
+def downgrade() -> None:
+    """Re-create the llm_provider array columns from model_configuration, then drop that table."""
+    llm_provider = sa.table(
+        "llm_provider",
+        sa.column("id", sa.Integer),
+        sa.column("model_names", postgresql.ARRAY(sa.String)),
+        sa.column("display_model_names", postgresql.ARRAY(sa.String)),
+    )
+
+    model_configuration = sa.table(
+        "model_configuration",
+        sa.column("id", sa.Integer),
+        sa.column("llm_provider_id", sa.Integer),
+        sa.column("name", sa.String),
+        sa.column("is_visible", sa.Boolean),
+        sa.column("max_input_tokens", sa.Integer),
+    )
+    op.add_column(
+        "llm_provider",
+        sa.Column(
+            "model_names",
+            postgresql.ARRAY(sa.VARCHAR()),
+            autoincrement=False,
+            nullable=True,
+        ),
+    )
+    op.add_column(
+        "llm_provider",
+        sa.Column(
+            "display_model_names",
+            postgresql.ARRAY(sa.VARCHAR()),
+            autoincrement=False,
+            nullable=True,
+        ),
+    )
+
+    connection = op.get_bind()
+    provider_ids = connection.execute(sa.select(llm_provider.c.id)).fetchall()
+
+    for (provider_id,) in provider_ids:
+        models = connection.execute(
+            sa.select(
+                model_configuration.c.name, model_configuration.c.is_visible
+            ).where(model_configuration.c.llm_provider_id == provider_id)
+        ).fetchall()
+
+        all_models = [model[0] for model in models]
+        visible_models = [model[0] for model in models if model[1]]
+
+        # model_names gets every model; display_model_names only the visible ones
+        op.execute(
+            llm_provider.update()
+            .where(llm_provider.c.id == provider_id)
+            .values(model_names=all_models, display_model_names=visible_models)
+        )
+
+    op.drop_table("model_configuration")
--- a/backend/alembic/versions/7b9b952abdf6_update_entities.py
+++ b/backend/alembic/versions/7b9b952abdf6_update_entities.py
@@ -0,0 +1,318 @@
+"""update-entities
+
+Revision ID: 7b9b952abdf6
+Revises: 36e9220ab794
+Create Date: 2025-06-23 20:24:08.139201
+
+"""
+
+import json
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "7b9b952abdf6"
+down_revision = "36e9220ab794"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    conn = op.get_bind()
+
+    # new entity type metadata_attribute_conversion
+    new_entity_type_conversion = {
+        "LINEAR": {
+            "team": {"name": "team", "keep": True, "implication_property": None},
+            "state": {"name": "state", "keep": True, "implication_property": None},
+            "priority": {
+                "name": "priority",
+                "keep": True,
+                "implication_property": None,
+            },
+            "estimate": {
+                "name": "estimate",
+                "keep": True,
+                "implication_property": None,
+            },
+            "created_at": {
+                "name": "created_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "started_at": {
+                "name": "started_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "completed_at": {
+                "name": "completed_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "due_date": {
+                "name": "due_date",
+                "keep": True,
+                "implication_property": None,
+            },
+            "creator": {
+                "name": "creator",
+                "keep": False,
+                "implication_property": {
+                    "implied_entity_type": "from_email",
+                    "implied_relationship_name": "is_creator_of",
+                },
+            },
+            "assignee": {
+                "name": "assignee",
+                "keep": False,
+                "implication_property": {
+                    "implied_entity_type": "from_email",
+                    "implied_relationship_name": "is_assignee_of",
+                },
+            },
+        },
+        "JIRA": {
+            "issuetype": {
+                "name": "subtype",
+                "keep": True,
+                "implication_property": None,
+            },
+            "status": {"name": "status", "keep": True, "implication_property": None},
+            "priority": {
+                "name": "priority",
+                "keep": True,
+                "implication_property": None,
+            },
+            "project_name": {
+                "name": "project",
+                "keep": True,
+                "implication_property": None,
+            },
+            "created": {
+                "name": "created_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "updated": {
+                "name": "updated_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "resolution_date": {
+                "name": "completed_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "duedate": {"name": "due_date", "keep": True, "implication_property": None},
+            "reporter_email": {
+                "name": "creator",
+                "keep": False,
+                "implication_property": {
+                    "implied_entity_type": "from_email",
+                    "implied_relationship_name": "is_creator_of",
+                },
+            },
+            "assignee_email": {
+                "name": "assignee",
+                "keep": False,
+                "implication_property": {
+                    "implied_entity_type": "from_email",
+                    "implied_relationship_name": "is_assignee_of",
+                },
+            },
+            "key": {"name": "key", "keep": True, "implication_property": None},
+            "parent": {"name": "parent", "keep": True, "implication_property": None},
+        },
+        "GITHUB_PR": {
+            "repo": {"name": "repository", "keep": True, "implication_property": None},
+            "state": {"name": "state", "keep": True, "implication_property": None},
+            "num_commits": {
+                "name": "num_commits",
+                "keep": True,
+                "implication_property": None,
+            },
+            "num_files_changed": {
+                "name": "num_files_changed",
+                "keep": True,
+                "implication_property": None,
+            },
+            "labels": {"name": "labels", "keep": True, "implication_property": None},
+            "merged": {"name": "merged", "keep": True, "implication_property": None},
+            "merged_at": {
+                "name": "merged_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "closed_at": {
+                "name": "closed_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "created_at": {
+                "name": "created_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "updated_at": {
+                "name": "updated_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "user": {
+                "name": "creator",
+                "keep": False,
+                "implication_property": {
+                    "implied_entity_type": "from_email",
+                    "implied_relationship_name": "is_creator_of",
+                },
+            },
+            "assignees": {
+                "name": "assignees",
+                "keep": False,
+                "implication_property": {
+                    "implied_entity_type": "from_email",
+                    "implied_relationship_name": "is_assignee_of",
+                },
+            },
+        },
+        "GITHUB_ISSUE": {
+            "repo": {"name": "repository", "keep": True, "implication_property": None},
+            "state": {"name": "state", "keep": True, "implication_property": None},
+            "labels": {"name": "labels", "keep": True, "implication_property": None},
+            "closed_at": {
+                "name": "closed_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "created_at": {
+                "name": "created_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "updated_at": {
+                "name": "updated_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "user": {
+                "name": "creator",
+                "keep": False,
+                "implication_property": {
+                    "implied_entity_type": "from_email",
+                    "implied_relationship_name": "is_creator_of",
+                },
+            },
+            "assignees": {
+                "name": "assignees",
+                "keep": False,
+                "implication_property": {
+                    "implied_entity_type": "from_email",
+                    "implied_relationship_name": "is_assignee_of",
+                },
+            },
+        },
+        "FIREFLIES": {},
+        "ACCOUNT": {},
+        "OPPORTUNITY": {
+            "name": {"name": "name", "keep": True, "implication_property": None},
+            "stage_name": {"name": "stage", "keep": True, "implication_property": None},
+            "type": {"name": "type", "keep": True, "implication_property": None},
+            "amount": {"name": "amount", "keep": True, "implication_property": None},
+            "fiscal_year": {
+                "name": "fiscal_year",
+                "keep": True,
+                "implication_property": None,
+            },
+            "fiscal_quarter": {
+                "name": "fiscal_quarter",
+                "keep": True,
+                "implication_property": None,
+            },
+            "is_closed": {
+                "name": "is_closed",
+                "keep": True,
+                "implication_property": None,
+            },
+            "close_date": {
+                "name": "close_date",
+                "keep": True,
+                "implication_property": None,
+            },
+            "probability": {
+                "name": "close_probability",
+                "keep": True,
+                "implication_property": None,
+            },
+            "created_date": {
+                "name": "created_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "last_modified_date": {
+                "name": "updated_at",
+                "keep": True,
+                "implication_property": None,
+            },
+            "account": {
+                "name": "account",
+                "keep": False,
+                "implication_property": {
+                    "implied_entity_type": "ACCOUNT",
+                    "implied_relationship_name": "is_account_of",
+                },
+            },
+        },
+        "VENDOR": {},
+        "EMPLOYEE": {},
+    }
+
+    current_entity_types = conn.execute(
+        sa.text("SELECT id_name, attributes from kg_entity_type")
+    ).all()
+    for entity_type, attributes in current_entity_types:
+        # delete removed entity types (the name is bound, never interpolated into SQL)
+        if entity_type not in new_entity_type_conversion:
+            conn.execute(
+                sa.text("DELETE FROM kg_entity_type WHERE id_name = :id_name"),
+                {"id_name": entity_type},
+            )
+            continue
+
+        attributes.pop("metadata_attributes", None)
+        conversion = new_entity_type_conversion[entity_type]
+        attributes["metadata_attribute_conversion"] = conversion
+        # Bound parameters: JSON containing ":name" or quotes can no longer corrupt the SQL.
+        # NOTE(review): the CAST target assumes the attributes column is JSONB - confirm.
+        conn.execute(
+            sa.text(
+                "UPDATE kg_entity_type SET attributes = CAST(:attributes AS jsonb) "
+                "WHERE id_name = :id_name"
+            ),
+            {"attributes": json.dumps(attributes), "id_name": entity_type},
+        )
+
+
+def downgrade() -> None:
+    """Rebuild metadata_attributes from the kept entries of metadata_attribute_conversion."""
+    conn = op.get_bind()
+
+    current_entity_types = conn.execute(
+        sa.text("SELECT id_name, attributes from kg_entity_type")
+    ).all()
+    for entity_type, attributes in current_entity_types:
+        conversion = attributes.pop("metadata_attribute_conversion", {})
+        attributes["metadata_attributes"] = {
+            attr: prop["name"] for attr, prop in conversion.items() if prop["keep"]
+        }
+        # Bound parameters: JSON containing ":name" or quotes can no longer corrupt the SQL.
+        # NOTE(review): the CAST target assumes the attributes column is JSONB - confirm.
+        conn.execute(
+            sa.text(
+                "UPDATE kg_entity_type SET attributes = CAST(:attributes AS jsonb) "
+                "WHERE id_name = :id_name"
+            ),
+            {"attributes": json.dumps(attributes), "id_name": entity_type},
+        )
--- a/backend/alembic/versions/90e3b9af7da4_tag_fix.py
+++ b/backend/alembic/versions/90e3b9af7da4_tag_fix.py
@@ -0,0 +1,341 @@
+"""tag-fix
+
+Revision ID: 90e3b9af7da4
+Revises: 62c3a055a141
+Create Date: 2025-08-01 20:58:14.607624
+
+"""
+
+import json
+import logging
+import os
+
+from typing import cast
+from typing import Generator
+
+from alembic import op
+import sqlalchemy as sa
+
+from onyx.document_index.factory import get_default_document_index
+from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
+from onyx.db.search_settings import SearchSettings
+from onyx.configs.app_configs import AUTH_TYPE
+from onyx.configs.constants import AuthType
+from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
+
+logger = logging.getLogger("alembic.runtime.migration")
+
+
+# revision identifiers, used by Alembic.
+revision = "90e3b9af7da4"
+down_revision = "62c3a055a141"
+branch_labels = None
+depends_on = None
+
+# The Vespa-backed removal of old tags is skipped unless the SKIP_TAG_FIX
+# environment variable is set to something other than "true" (case-insensitive);
+# it defaults to "true".
+SKIP_TAG_FIX = os.environ.get("SKIP_TAG_FIX", "true").lower() == "true"
+
+# override for cloud: always skip, regardless of the environment variable
+if AUTH_TYPE == AuthType.CLOUD:
+    SKIP_TAG_FIX = True
+
+
+def set_is_list_for_known_tags() -> None:
+    """
+    Marks as lists the tags whose (source, key) pair is known to be a list.
+    """
+    # source -> tag keys known to be lists
+    list_keys_by_source: dict[str, list[str]] = {
+        "CLICKUP": ["tags"],
+        "CONFLUENCE": ["labels"],
+        "DISCOURSE": ["tags"],
+        "FRESHDESK": ["emails"],
+        "GITHUB": ["assignees", "labels"],
+        "GURU": ["tags", "folders"],
+        "HUBSPOT": [
+            "associated_contact_ids",
+            "associated_company_ids",
+            "associated_deal_ids",
+            "associated_ticket_ids",
+        ],
+        "JIRA": ["labels"],
+        "MEDIAWIKI": ["categories"],
+        "ZENDESK": ["labels", "content_tags"],
+    }
+    known_pairs = [
+        (src, tag_key) for src, keys in list_keys_by_source.items() for tag_key in keys
+    ]
+
+    connection = op.get_bind()
+    for src, tag_key in known_pairs:
+        update_sql = f"""
+                UPDATE tag
+                SET is_list = true
+                WHERE tag_key = '{tag_key}'
+                AND source = '{src}'
+                """
+        connection.execute(sa.text(update_sql))
+
+
+def set_is_list_for_list_tags() -> None:
+    """
+    Flags as lists every (tag_key, source) pair for which at least one document
+    carries more than one tag. This only works if old tags have been removed
+    from the database first.
+    """
+    # a single statement: the subquery finds the multi-valued (tag_key, source) pairs
+    mark_multi_valued = sa.text(
+        """
+            UPDATE tag
+            SET is_list = true
+            FROM (
+                SELECT DISTINCT tag.tag_key, tag.source
+                FROM tag
+                JOIN document__tag ON tag.id = document__tag.tag_id
+                GROUP BY tag.tag_key, tag.source, document__tag.document_id
+                HAVING count(*) > 1
+            ) AS list_tags
+            WHERE tag.tag_key = list_tags.tag_key
+            AND tag.source = list_tags.source
+            """
+    )
+    op.get_bind().execute(mark_multi_valued)
+
+
+def log_list_tags() -> None:
+    """Logs every distinct (source, tag_key) pair currently flagged as a list."""
+    rows = op.get_bind().execute(
+        sa.text(
+            """
+            SELECT DISTINCT source, tag_key
+            FROM tag
+            WHERE is_list
+            ORDER BY source, tag_key
+            """
+        )
+    ).fetchall()
+    # one indented "source: key" line per pair
+    lines = [f"  {source}: {key}" for source, key in rows]
+    logger.info("List tags:\n" + "\n".join(lines))
+
+
+def remove_old_tags() -> None:
+    """
+    Removes old tags from the database.
+    Previously, there was a bug where if a document got indexed with a tag and then
+    the document got reindexed, the old tag would not be removed.
+    This function removes those old tags by comparing it against the tags in vespa.
+
+    A document__tag row is deleted when the tag's value is missing from (list
+    metadata) or differs from (string metadata) the value vespa holds for that key.
+    """
+    current_search_settings, future_search_settings = active_search_settings()
+    document_index = get_default_document_index(
+        current_search_settings, future_search_settings
+    )
+
+    # fall back to the default index name if the document index does not expose one
+    index_name = getattr(document_index, "index_name", "danswer_index")
+
+    # document ids are arbitrary strings (may contain quotes or ":word"), so they
+    # are passed as bound parameters instead of being formatted into the SQL
+    bind = op.get_bind()
+    delete_stmt = sa.text(
+        "DELETE FROM document__tag "
+        "WHERE document_id = :document_id AND tag_id IN :tag_ids"
+    ).bindparams(sa.bindparam("tag_ids", expanding=True))
+
+    for batch in _get_batch_documents_with_multiple_tags():
+        n_deleted = 0
+
+        for document_id in batch:
+            true_metadata = _get_vespa_metadata(document_id, index_name)
+            tags = _get_document_tags(document_id)
+
+            # a tag is stale when vespa no longer holds its value for that key
+            to_delete: list[int] = []
+            for tag_id, tag_key, tag_value in tags:
+                true_val = true_metadata.get(tag_key, "")
+                if (isinstance(true_val, list) and tag_value not in true_val) or (
+                    isinstance(true_val, str) and tag_value != true_val
+                ):
+                    to_delete.append(tag_id)
+
+            if not to_delete:
+                continue
+
+            # delete old document__tags
+            result = bind.execute(
+                delete_stmt, {"document_id": document_id, "tag_ids": to_delete}
+            )
+            n_deleted += result.rowcount
+        logger.info(f"Processed {len(batch)} documents and deleted {n_deleted} tags")
+
+
+def active_search_settings() -> tuple[SearchSettings, SearchSettings | None]:
+    """
+    Loads the most recent PRESENT search settings row and, when there is one, the
+    most recent FUTURE row, each built into a SearchSettings object.
+
+    Raises RuntimeError if no PRESENT row is found.
+    """
+
+    def _latest_with_status(status: str) -> SearchSettings | None:
+        # highest id wins; None when no row has this status
+        rows = (
+            op.get_bind()
+            .execute(
+                sa.text(
+                    f"""
+        SELECT * FROM search_settings WHERE status = '{status}' ORDER BY id DESC LIMIT 1
+        """
+                )
+            )
+            .fetchall()
+        )
+        return SearchSettings(**rows[0]._asdict()) if rows else None
+
+    search_settings = _latest_with_status("PRESENT")
+    search_settings_future = _latest_with_status("FUTURE")
+
+    if not isinstance(search_settings, SearchSettings):
+        raise RuntimeError(
+            "current search settings is of type " + str(type(search_settings))
+        )
+    if search_settings_future is not None and not isinstance(
+        search_settings_future, SearchSettings
+    ):
+        raise RuntimeError(
+            "future search settings is of type " + str(type(search_settings_future))
+        )
+
+    return search_settings, search_settings_future
+
+
+def _get_batch_documents_with_multiple_tags(
+    batch_size: int = 128,
+) -> Generator[list[str], None, None]:
+    """
+    Yields batches (at most batch_size ids, ordered by id) of document ids that have
+    more than one tag for the same key and source. The document may either contain
+    a list metadata value, or may contain leftover old tags from reindexing.
+    Each query resumes after the last id of the previous batch.
+    """
+    offset_clause = ""
+    params: dict[str, str] = {}
+    bind = op.get_bind()
+
+    while True:
+        query = f"""
+                SELECT DISTINCT document__tag.document_id
+                FROM tag
+                JOIN document__tag ON tag.id = document__tag.tag_id
+                GROUP BY tag.tag_key, tag.source, document__tag.document_id
+                HAVING count(*) > 1 {offset_clause}
+                ORDER BY document__tag.document_id
+                LIMIT {batch_size}
+                """
+        batch = bind.execute(sa.text(query), params).fetchall()
+        if not batch:
+            break
+        doc_ids = [document_id for document_id, in batch]
+        yield doc_ids
+        # the last id is a bound parameter: document ids may contain quotes, which
+        # would break (or inject into) a statement built by string formatting
+        offset_clause = "AND document__tag.document_id > :last_document_id"
+        params = {"last_document_id": doc_ids[-1]}
+
+
+def _get_vespa_metadata(
+    document_id: str, index_name: str
+) -> dict[str, str | list[str]]:
+    """
+    Fetches the metadata field of chunk 0 of the given document from vespa.
+    Raises RuntimeError if vespa returns no matching document.
+    """
+    endpoint = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)
+
+    # Document-Selector language
+    id_match = f"{index_name}.document_id=='{document_id}'"
+    first_chunk = f"{index_name}.chunk_id==0"
+    query: dict[str, str | int] = {
+        "selection": f"{id_match} and {first_chunk}",
+        "wantedDocumentCount": 1,
+        "fieldSet": f"{index_name}:metadata",
+    }
+
+    with get_vespa_http_client() as client:
+        response = client.get(endpoint, params=query)
+        response.raise_for_status()
+
+    hits = response.json().get("documents", [])
+    if not hits:
+        raise RuntimeError(f"No chunk-0 found for document {document_id}")
+
+    # the metadata field comes back as a string, hence the json.loads
+    return json.loads(hits[0]["fields"]["metadata"])
+
+
+def _get_document_tags(document_id: str) -> list[tuple[int, str, str]]:
+    """Returns (tag id, tag_key, tag_value) for every tag linked to the document."""
+    # bound parameter: document ids may contain quotes or ":word" sequences
+    query = sa.text(
+        """
+        SELECT tag.id, tag.tag_key, tag.tag_value
+        FROM tag
+        JOIN document__tag ON tag.id = document__tag.tag_id
+        WHERE document__tag.document_id = :document_id
+        """
+    )
+    result = op.get_bind().execute(query, {"document_id": document_id}).fetchall()
+    return cast(list[tuple[int, str, str]], result)
+
+
+def upgrade() -> None:
+    """
+    Adds tag.is_list, swaps the tag unique constraint for one that includes it, and
+    flags list tags. The removal of old tags is skipped when SKIP_TAG_FIX is set.
+    """
+    is_list = sa.Column("is_list", sa.Boolean(), nullable=False, server_default="false")
+    op.add_column("tag", is_list)
+
+    op.drop_constraint(
+        constraint_name="_tag_key_value_source_uc", table_name="tag", type_="unique"
+    )
+    op.create_unique_constraint(
+        constraint_name="_tag_key_value_source_list_uc",
+        table_name="tag",
+        columns=["tag_key", "tag_value", "source", "is_list"],
+    )
+    set_is_list_for_known_tags()
+
+    if not SKIP_TAG_FIX:
+        # old tags are removed first; the multi-value detection relies on it
+        remove_old_tags()
+        set_is_list_for_list_tags()
+    else:
+        logger.warning(
+            "Skipping removal of old tags. "
+            "This can cause issues when using the knowledge graph, or "
+            "when filtering for documents by tags."
+        )
+
+    # debug
+    log_list_tags()
+
+
+def downgrade() -> None:
+    """
+    Reverts the schema change only: the unique constraint goes back to
+    (tag_key, tag_value, source) and the is_list column is dropped.
+
+    The old bugged tags removed by the upgrade are not restored, as there isn't
+    a point in adding them back.
+    """
+    op.drop_constraint("_tag_key_value_source_list_uc", "tag", type_="unique")
+    # NOTE(review): presumably fails if two tags differ only in is_list - confirm
+    op.create_unique_constraint(
+        "_tag_key_value_source_uc", "tag", ["tag_key", "tag_value", "source"]
+    )
+    op.drop_column("tag", "is_list")
--- a/backend/alembic/versions/9aadf32dfeb4_add_user_files.py
+++ b/backend/alembic/versions/9aadf32dfeb4_add_user_files.py
@@ -103,6 +103,7 @@ def upgrade() -> None:


 def downgrade() -> None:
+    op.drop_column("connector_credential_pair", "is_user_file")
    # Drop the persona__user_folder table
    op.drop_table("persona__user_folder")
    # Drop the persona__user_file table
@@ -111,4 +112,3 @@ def downgrade() -> None:
    op.drop_table("user_file")
    # Drop the user_folder table
    op.drop_table("user_folder")
-    op.drop_column("connector_credential_pair", "is_user_file")
--- a/backend/alembic/versions/a7688ab35c45_add_public_external_user_group_table.py
+++ b/backend/alembic/versions/a7688ab35c45_add_public_external_user_group_table.py
@@ -0,0 +1,32 @@
+"""Add public_external_user_group table
+
+Revision ID: a7688ab35c45
+Revises: 5c448911b12f
+Create Date: 2025-05-06 20:55:12.747875
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = "a7688ab35c45"
+down_revision = "5c448911b12f"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Create public_external_user_group, keyed by (external_user_group_id, cc_pair_id)."""
+    # rows are deleted together with the connector_credential_pair they point to
+    cc_pair_fk = sa.ForeignKeyConstraint(
+        ["cc_pair_id"], ["connector_credential_pair.id"], ondelete="CASCADE"
+    )
+    op.create_table(
+        "public_external_user_group",
+        sa.Column("external_user_group_id", sa.String(), nullable=False),
+        sa.Column("cc_pair_id", sa.Integer(), nullable=False),
+        sa.PrimaryKeyConstraint("external_user_group_id", "cc_pair_id"),
+        cc_pair_fk,
+    )
+
+
+def downgrade() -> None:
+    """Drop the public_external_user_group table created by the upgrade."""
+    op.drop_table(table_name="public_external_user_group")
--- a/backend/alembic/versions/b558f51620b4_pause_finished_user_file_connectors.py
+++ b/backend/alembic/versions/b558f51620b4_pause_finished_user_file_connectors.py
@@ -0,0 +1,33 @@
+"""Pause finished user file connectors
+
+Revision ID: b558f51620b4
+Revises: 90e3b9af7da4
+Create Date: 2025-08-15 17:17:02.456704
+
+"""
+
+from alembic import op
+
+
+# revision identifiers, used by Alembic.
+revision = "b558f51620b4"
+down_revision = "90e3b9af7da4"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Pause every user file connector credential pair that is currently ACTIVE."""
+    # This ensures user files don't continue to run indexing tasks after processing
+    pause_active_user_files = """
+        UPDATE connector_credential_pair
+        SET status = 'PAUSED'
+        WHERE is_user_file = true
+        AND status = 'ACTIVE'
+        """
+    op.execute(pause_active_user_files)
+
+
+def downgrade() -> None:
+    """No-op: the upgrade keeps no record of which pairs it paused."""
--- a/backend/alembic/versions/c9e2cd766c29_add_s3_file_store_table.py
+++ b/backend/alembic/versions/c9e2cd766c29_add_s3_file_store_table.py
@@ -0,0 +1,315 @@
+"""modify_file_store_for_external_storage
+
+Revision ID: c9e2cd766c29
+Revises: 03bf8be6b53a
+Create Date: 2025-06-13 14:02:09.867679
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.orm import Session
+from sqlalchemy import text
+from typing import cast, Any
+
+from botocore.exceptions import ClientError
+
+from onyx.db._deprecated.pg_file_store import delete_lobj_by_id, read_lobj
+from onyx.file_store.file_store import get_s3_file_store
+from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
+
+# revision identifiers, used by Alembic.
+revision = "c9e2cd766c29"
+down_revision = "03bf8be6b53a"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Turn `file_store` into `file_record` and move file contents to external storage.
+
+    Schema changes come first (table rename, nullable `lobj_oid`, new `bucket_name`,
+    `object_key`, `created_at` and `updated_at` columns, `file_name` -> `file_id`),
+    then the content copy, and finally `lobj_oid` is dropped.
+    """
+    try:
+        # Modify existing file_store table to support external storage
+        op.rename_table("file_store", "file_record")
+
+        # Make lobj_oid nullable (for external storage files)
+        op.alter_column("file_record", "lobj_oid", nullable=True)
+
+        # Add external storage columns with generic names
+        op.add_column(
+            "file_record", sa.Column("bucket_name", sa.String(), nullable=True)
+        )
+        op.add_column(
+            "file_record", sa.Column("object_key", sa.String(), nullable=True)
+        )
+
+        # Add timestamps for tracking
+        op.add_column(
+            "file_record",
+            sa.Column(
+                "created_at",
+                sa.DateTime(timezone=True),
+                server_default=sa.func.now(),
+                nullable=False,
+            ),
+        )
+        op.add_column(
+            "file_record",
+            sa.Column(
+                "updated_at",
+                sa.DateTime(timezone=True),
+                server_default=sa.func.now(),
+                nullable=False,
+            ),
+        )
+
+        op.alter_column("file_record", "file_name", new_column_name="file_id")
+    except Exception as e:
+        # Tolerate re-runs after a partially applied attempt: any "does not exist"
+        # error is taken to mean the schema step already happened. The second
+        # condition contains the first as a substring, so it never adds anything.
+        # NOTE(review): on PostgreSQL a failed statement normally aborts the
+        # enclosing transaction - confirm the statements below can still run.
+        if "does not exist" in str(e) or 'relation "file_store" does not exist' in str(
+            e
+        ):
+            print(
+                f"Ran into error - {e}. Likely means we had a partial success in the past, continuing..."
+            )
+        else:
+            raise
+
+    print(
+        "External storage configured - migrating files from PostgreSQL to external storage..."
+    )
+    # if we fail midway through this, we'll have a partial success. Running the migration
+    # again should allow us to continue.
+    _migrate_files_to_external_storage()
+    print("File migration completed successfully!")
+
+    # Remove lobj_oid column
+    op.drop_column("file_record", "lobj_oid")
+
+
+def downgrade() -> None:
+    """Copy file contents back into PostgreSQL large objects, then undo the schema changes."""
+    table = "file_record"
+
+    print(
+        "Reverting to PostgreSQL-backed file store – migrating files from external storage …"
+    )
+
+    # `lobj_oid` has to exist (nullable for now) before the copy can fill it in.
+    op.add_column(table, sa.Column("lobj_oid", sa.Integer(), nullable=True))
+
+    # The table keeps the name `file_record` during the copy so application code
+    # continues to work.
+    try:
+        _migrate_files_to_postgres()
+    except Exception:
+        print("Error during downgrade migration, rolling back …")
+        op.drop_column(table, "lobj_oid")
+        raise
+
+    # After the copy every row should have `lobj_oid` populated.
+    op.alter_column(table, "lobj_oid", nullable=False)
+
+    # Columns that only make sense for external storage.
+    for external_only_column in ("updated_at", "created_at", "object_key", "bucket_name"):
+        op.drop_column(table, external_only_column)
+
+    # Restore the legacy column name, then the legacy table name.
+    op.alter_column(table, "file_id", new_column_name="file_name")
+    op.rename_table(table, "file_store")
+
+    print(
+        "Downgrade migration completed – files are now stored inside PostgreSQL again."
+    )
+
+
+# -----------------------------------------------------------------------------
+# Helper: migrate from external storage (S3/MinIO) back into PostgreSQL large objects
+
+
+def _migrate_files_to_postgres() -> None:
+    """Move any files whose content lives in external S3-compatible storage back into PostgreSQL.
+
+    The logic mirrors *inverse* of `_migrate_files_to_external_storage` used on upgrade.
+    Rows whose object is missing from external storage (`NoSuchKey`) are deleted
+    and are not counted or reported as migrated.
+    """
+
+    # Obtain DB session from Alembic context
+    bind = op.get_bind()
+    session = Session(bind=bind)
+
+    # Fetch rows that have external storage pointers (bucket/object_key not NULL)
+    result = session.execute(
+        text(
+            "SELECT file_id, bucket_name, object_key FROM file_record "
+            "WHERE bucket_name IS NOT NULL AND object_key IS NOT NULL"
+        )
+    )
+
+    files_to_migrate = [row[0] for row in result.fetchall()]
+    total_files = len(files_to_migrate)
+
+    if total_files == 0:
+        print("No files found in external storage to migrate back to PostgreSQL.")
+        return
+
+    print(f"Found {total_files} files to migrate back to PostgreSQL large objects.")
+
+    _set_tenant_contextvar(session)
+    migrated_count = 0
+
+    # only create external store if we have files to migrate. This line
+    # makes it so we need to have S3/MinIO configured to run this migration.
+    external_store = get_s3_file_store()
+
+    # Import lazily to avoid circular deps at Alembic runtime (done once, not per file)
+    from onyx.db._deprecated.pg_file_store import (
+        create_populate_lobj,
+    )  # noqa: E402
+
+    for i, file_id in enumerate(files_to_migrate, 1):
+        print(f"Migrating file {i}/{total_files}: {file_id}")
+
+        # Read file content from external storage (always binary)
+        try:
+            file_io = external_store.read_file(
+                file_id=file_id, mode="b", use_tempfile=True
+            )
+            file_io.seek(0)
+
+            # Create new Postgres large object and populate it
+            lobj_oid = create_populate_lobj(content=file_io, db_session=session)
+
+            # Update DB row: set lobj_oid, clear bucket/object_key
+            session.execute(
+                text(
+                    "UPDATE file_record SET lobj_oid = :lobj_oid, bucket_name = NULL, "
+                    "object_key = NULL WHERE file_id = :file_id"
+                ),
+                {"lobj_oid": lobj_oid, "file_id": file_id},
+            )
+        except ClientError as e:
+            if "NoSuchKey" not in str(e):
+                raise
+            print(
+                f"File {file_id} not found in external storage. Deleting from database."
+            )
+            session.execute(
+                text("DELETE FROM file_record WHERE file_id = :file_id"),
+                {"file_id": file_id},
+            )
+            # the row was dropped, not migrated: skip the success bookkeeping
+            continue
+
+        migrated_count += 1
+        print(f"✓ Successfully migrated file {i}/{total_files}: {file_id}")
+
+    # Flush the SQLAlchemy session so statements are sent to the DB, but **do not**
+    # commit the transaction.  The surrounding Alembic migration will commit once
+    # the *entire* downgrade succeeds.  This keeps the whole downgrade atomic and
+    # avoids leaving the database in a partially-migrated state if a later schema
+    # operation fails.
+    session.flush()
+
+    print(
+        f"Migration back to PostgreSQL completed: {migrated_count} files staged for commit."
+    )
+
+
+def _migrate_files_to_external_storage() -> None:
+    """Migrate files from PostgreSQL large objects to external storage.
+
+    Every `file_record` row that still has a `lobj_oid` and no bucket/object key is
+    read from its large object, saved through the S3 file store together with its
+    display name, origin, type and metadata, and the large object is then deleted.
+    Rows whose large object no longer exists are skipped.
+    """
+    # Get database session
+    bind = op.get_bind()
+    session = Session(bind=bind)
+    external_store = get_s3_file_store()
+
+    # Find all files currently stored in PostgreSQL (lobj_oid is not null)
+    result = session.execute(
+        text(
+            "SELECT file_id FROM file_record WHERE lobj_oid IS NOT NULL "
+            "AND bucket_name IS NULL AND object_key IS NULL"
+        )
+    )
+
+    files_to_migrate = [row[0] for row in result.fetchall()]
+    total_files = len(files_to_migrate)
+
+    if total_files == 0:
+        print("No files found in PostgreSQL storage to migrate.")
+        return
+
+    # might need to move this above the if statement when creating a new multi-tenant
+    # system. VERY extreme edge case.
+    external_store.initialize()
+    print(f"Found {total_files} files to migrate from PostgreSQL to external storage.")
+
+    _set_tenant_contextvar(session)
+    migrated_count = 0
+
+    for i, file_id in enumerate(files_to_migrate, 1):
+        print(f"Migrating file {i}/{total_files}: {file_id}")
+
+        # Read file record to get metadata
+        file_record = session.execute(
+            text("SELECT * FROM file_record WHERE file_id = :file_id"),
+            {"file_id": file_id},
+        ).fetchone()
+
+        if file_record is None:
+            print(f"File {file_id} not found in PostgreSQL storage.")
+            continue
+
+        lobj_id = cast(int, file_record.lobj_oid)  # type: ignore
+        file_metadata = cast(Any, file_record.file_metadata)  # type: ignore
+
+        # Read file content from PostgreSQL
+        try:
+            file_content = read_lobj(
+                lobj_id, db_session=session, mode="b", use_tempfile=True
+            )
+        except Exception as e:
+            if "large object" in str(e) and "does not exist" in str(e):
+                print(f"File {file_id} not found in PostgreSQL storage.")
+                continue
+            else:
+                raise
+
+        # Handle file_metadata type conversion. The value read from the row is kept
+        # (it used to be reset to None here, which dropped every file's metadata);
+        # dicts pass through, anything else is converted if possible, otherwise None.
+        if file_metadata is not None and not isinstance(file_metadata, dict):
+            try:
+                file_metadata = dict(file_metadata)
+            except (TypeError, ValueError):
+                file_metadata = None
+
+        # Save to external storage (this will handle the database record update and cleanup)
+        # NOTE: this WILL .commit() the transaction.
+        external_store.save_file(
+            file_id=file_id,
+            content=file_content,
+            display_name=file_record.display_name,
+            file_origin=file_record.file_origin,
+            file_type=file_record.file_type,
+            file_metadata=file_metadata,
+        )
+        delete_lobj_by_id(lobj_id, db_session=session)
+
+        migrated_count += 1
+        print(f"✓ Successfully migrated file {i}/{total_files}: {file_id}")
+
+    # Flush pending statements without committing here.
+    # NOTE(review): save_file above is documented as committing per file, so this
+    # path is presumably not atomic despite the flush-only ending - confirm.
+    session.flush()
+
+    print(
+        f"Migration completed: {migrated_count} files staged for commit to external storage."
+    )
+
+
+def _set_tenant_contextvar(session: Session) -> None:
+    """Store the session's current schema in the tenant contextvar."""
+    schema_name = session.execute(text("SELECT current_schema()")).scalar()
+    print(f"Migrating files for tenant: {schema_name}")
+    CURRENT_TENANT_ID_CONTEXTVAR.set(schema_name)
--- a/backend/alembic/versions/ca04500b9ee8_add_cascade_deletes_to_agent_tables.py
+++ b/backend/alembic/versions/ca04500b9ee8_add_cascade_deletes_to_agent_tables.py
@@ -0,0 +1,128 @@
+"""add_cascade_deletes_to_agent_tables
+
+Revision ID: ca04500b9ee8
+Revises: 238b84885828
+Create Date: 2025-05-30 16:03:51.112263
+
+"""
+
+from alembic import op
+
+
+# revision identifiers, used by Alembic.
+revision = "ca04500b9ee8"
+down_revision = "238b84885828"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Recreate four foreign keys so that they cascade on delete."""
+    # (constraint name, source table, referent table, local column);
+    # every constraint references the referent table's `id` column
+    foreign_keys = [
+        (
+            "agent__sub_question_primary_question_id_fkey",
+            "agent__sub_question",
+            "chat_message",
+            "primary_question_id",
+        ),
+        (
+            "agent__sub_query_parent_question_id_fkey",
+            "agent__sub_query",
+            "agent__sub_question",
+            "parent_question_id",
+        ),
+        (
+            "chat_message__standard_answer_chat_message_id_fkey",
+            "chat_message__standard_answer",
+            "chat_message",
+            "chat_message_id",
+        ),
+        (
+            "agent__sub_query__search_doc_sub_query_id_fkey",
+            "agent__sub_query__search_doc",
+            "agent__sub_query",
+            "sub_query_id",
+        ),
+    ]
+
+    # Drop existing foreign key constraints
+    for fk_name, source_table, _referent, _column in foreign_keys:
+        op.drop_constraint(fk_name, source_table, type_="foreignkey")
+
+    # Recreate foreign key constraints with CASCADE delete
+    for fk_name, source_table, referent_table, local_column in foreign_keys:
+        op.create_foreign_key(
+            fk_name,
+            source_table,
+            referent_table,
+            [local_column],
+            ["id"],
+            ondelete="CASCADE",
+        )
+
+
+def downgrade() -> None:
+    """Recreate the same four foreign keys without the cascade on delete."""
+    # (constraint name, source table, referent table, local column);
+    # every constraint references the referent table's `id` column
+    foreign_keys = [
+        (
+            "agent__sub_question_primary_question_id_fkey",
+            "agent__sub_question",
+            "chat_message",
+            "primary_question_id",
+        ),
+        (
+            "agent__sub_query_parent_question_id_fkey",
+            "agent__sub_query",
+            "agent__sub_question",
+            "parent_question_id",
+        ),
+        (
+            "chat_message__standard_answer_chat_message_id_fkey",
+            "chat_message__standard_answer",
+            "chat_message",
+            "chat_message_id",
+        ),
+        (
+            "agent__sub_query__search_doc_sub_query_id_fkey",
+            "agent__sub_query__search_doc",
+            "agent__sub_query",
+            "sub_query_id",
+        ),
+    ]
+
+    # Drop CASCADE foreign key constraints
+    for fk_name, source_table, _referent, _column in foreign_keys:
+        op.drop_constraint(fk_name, source_table, type_="foreignkey")
+
+    # Recreate foreign key constraints without CASCADE delete
+    for fk_name, source_table, referent_table, local_column in foreign_keys:
+        op.create_foreign_key(
+            fk_name, source_table, referent_table, [local_column], ["id"]
+        )
--- a/backend/alembic/versions/cec7ec36c505_kgentity_parent.py
+++ b/backend/alembic/versions/cec7ec36c505_kgentity_parent.py
@@ -0,0 +1,29 @@
+"""kgentity_parent
+
+Revision ID: cec7ec36c505
+Revises: 495cb26ce93e
+Create Date: 2025-06-07 20:07:46.400770
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "cec7ec36c505"
+down_revision = "495cb26ce93e"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Add the nullable, indexed kg_entity.parent_key column."""
+    parent_key = sa.Column("parent_key", sa.String(), nullable=True, index=True)
+    op.add_column("kg_entity", parent_key)
+    # NOTE: you will have to reindex the KG after this migration as the parent_key will be null
+
+
+def downgrade() -> None:
+    """Drop the kg_entity.parent_key column added by the upgrade."""
+    op.drop_column(table_name="kg_entity", column_name="parent_key")
--- a/backend/alembic/versions/d961aca62eb3_update_status_length.py
+++ b/backend/alembic/versions/d961aca62eb3_update_status_length.py
@@ -0,0 +1,57 @@
+"""Update status length
+
+Revision ID: d961aca62eb3
+Revises: cf90764725d8
+Create Date: 2025-03-23 16:10:05.683965
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "d961aca62eb3"
+down_revision = "cf90764725d8"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Re-type connector_credential_pair.status and add in_repeated_error_state."""
+    # Convert status to an unconstrained varchar (drops the existing enum type constraint)
+    op.execute("ALTER TABLE connector_credential_pair ALTER COLUMN status TYPE varchar")
+
+    # Then narrow it to VARCHAR(20)
+    op.execute(
+        "ALTER TABLE connector_credential_pair ALTER COLUMN status TYPE VARCHAR(20) USING status::varchar(20)"
+    )
+
+    # Non-native enum (native_enum=False) that includes all possible values
+    status_enum = sa.Enum(
+        "SCHEDULED",
+        "INITIAL_INDEXING",
+        "ACTIVE",
+        "PAUSED",
+        "DELETING",
+        "INVALID",
+        name="connectorcredentialpairstatus",
+        native_enum=False,
+    )
+    op.alter_column(
+        "connector_credential_pair",
+        "status",
+        type_=status_enum,
+        existing_type=sa.String(20),
+        nullable=False,
+    )
+
+    error_state_flag = sa.Column(
+        "in_repeated_error_state", sa.Boolean, default=False, server_default="false"
+    )
+    op.add_column("connector_credential_pair", error_state_flag)
+
+
+def downgrade() -> None:
+    """Drop in_repeated_error_state; the status column type is left as is."""
+    # no need to convert back to the old enum type, since we're not using it anymore
+    op.drop_column(
+        table_name="connector_credential_pair", column_name="in_repeated_error_state"
+    )
--- a/backend/alembic/versions/da42808081e3_migrate_jira_connectors_to_new_format.py
+++ b/backend/alembic/versions/da42808081e3_migrate_jira_connectors_to_new_format.py
@@ -11,7 +11,7 @@ import sqlalchemy as sa
 import json

 from onyx.configs.constants import DocumentSource
-from onyx.connectors.onyx_jira.utils import extract_jira_project
+from onyx.connectors.jira.utils import extract_jira_project


 # revision identifiers, used by Alembic.
@@ -21,6 +21,9 @@ branch_labels = None
 depends_on = None


+PRESERVED_CONFIG_KEYS = ["comment_email_blacklist", "batch_size", "labels_to_skip"]
+
+
 def upgrade() -> None:
    # Get all Jira connectors
    conn = op.get_bind()
@@ -62,6 +65,9 @@ def upgrade() -> None:
                f"WARNING: Jira connector {connector_id} has no project URL configured"
            )
            continue
+        for old_key in PRESERVED_CONFIG_KEYS:
+            if old_key in old_config:
+                new_config[old_key] = old_config[old_key]

        # Update the connector config
        conn.execute(
@@ -108,6 +114,10 @@ def downgrade() -> None:
        else:
            continue

+        for old_key in PRESERVED_CONFIG_KEYS:
+            if old_key in new_config:
+                old_config[old_key] = new_config[old_key]
+
        # Update the connector config
        conn.execute(
            sa.text(
@@ -117,5 +127,5 @@ def downgrade() -> None:
                WHERE id = :id
                """
            ),
-            {"id": connector_id, "old_config": old_config},
+            {"id": connector_id, "old_config": json.dumps(old_config)},
        )
--- a/backend/alembic/versions/dbaa756c2ccf_embedding_models.py
+++ b/backend/alembic/versions/dbaa756c2ccf_embedding_models.py
@@ -10,12 +10,19 @@ from alembic import op
 import sqlalchemy as sa
 from sqlalchemy import table, column, String, Integer, Boolean

-from onyx.db.search_settings import (
-    get_new_default_embedding_model,
-    get_old_default_embedding_model,
-    user_has_overridden_embedding_model,
-)
+from onyx.configs.model_configs import ASYM_PASSAGE_PREFIX
+from onyx.configs.model_configs import ASYM_QUERY_PREFIX
+from onyx.configs.model_configs import DOC_EMBEDDING_DIM
+from onyx.configs.model_configs import DOCUMENT_ENCODER_MODEL
+from onyx.configs.model_configs import NORMALIZE_EMBEDDINGS
+from onyx.configs.model_configs import OLD_DEFAULT_DOCUMENT_ENCODER_MODEL
+from onyx.configs.model_configs import OLD_DEFAULT_MODEL_DOC_EMBEDDING_DIM
+from onyx.configs.model_configs import OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS
+from onyx.db.enums import EmbeddingPrecision
 from onyx.db.models import IndexModelStatus
+from onyx.db.search_settings import user_has_overridden_embedding_model
+from onyx.indexing.models import IndexingSetting
+from onyx.natural_language_processing.search_nlp_models import clean_model_name

 # revision identifiers, used by Alembic.
 revision = "dbaa756c2ccf"
@@ -24,6 +31,47 @@ branch_labels: None = None
 depends_on: None = None


+def _get_old_default_embedding_model() -> IndexingSetting:
+    """Build the IndexingSetting for the "old" default embedding model.
+
+    When the user has overridden the embedding model, the currently
+    configured values are used; otherwise the OLD_DEFAULT_* constants and
+    empty prefixes are used. The index name is always "danswer_chunk".
+    """
+    if user_has_overridden_embedding_model():
+        old_model_name = DOCUMENT_ENCODER_MODEL
+        old_model_dim = DOC_EMBEDDING_DIM
+        old_normalize = NORMALIZE_EMBEDDINGS
+        old_query_prefix = ASYM_QUERY_PREFIX
+        old_passage_prefix = ASYM_PASSAGE_PREFIX
+    else:
+        old_model_name = OLD_DEFAULT_DOCUMENT_ENCODER_MODEL
+        old_model_dim = OLD_DEFAULT_MODEL_DOC_EMBEDDING_DIM
+        old_normalize = OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS
+        old_query_prefix = ""
+        old_passage_prefix = ""
+
+    return IndexingSetting(
+        model_name=old_model_name,
+        model_dim=old_model_dim,
+        embedding_precision=EmbeddingPrecision.FLOAT,
+        normalize=old_normalize,
+        query_prefix=old_query_prefix,
+        passage_prefix=old_passage_prefix,
+        index_name="danswer_chunk",
+        multipass_indexing=False,
+        enable_contextual_rag=False,
+        api_url=None,
+    )
+
+
+def _get_new_default_embedding_model() -> IndexingSetting:
+    """Build the IndexingSetting for the "new" default embedding model.
+
+    All values come from the currently configured model constants; the
+    index name is derived from the cleaned encoder model name.
+    """
+    encoder_name = DOCUMENT_ENCODER_MODEL
+    new_index_name = f"danswer_chunk_{clean_model_name(encoder_name)}"
+    new_default = IndexingSetting(
+        model_name=encoder_name,
+        model_dim=DOC_EMBEDDING_DIM,
+        embedding_precision=EmbeddingPrecision.BFLOAT16,
+        normalize=NORMALIZE_EMBEDDINGS,
+        query_prefix=ASYM_QUERY_PREFIX,
+        passage_prefix=ASYM_PASSAGE_PREFIX,
+        index_name=new_index_name,
+        multipass_indexing=False,
+        enable_contextual_rag=False,
+        api_url=None,
+    )
+    return new_default
+
+
 def upgrade() -> None:
    op.create_table(
        "embedding_model",
@@ -61,7 +109,7 @@ def upgrade() -> None:
    # the user selected via env variables before this change. This is needed since
    # all index_attempts must be associated with an embedding model, so without this
    # we will run into violations of non-null contraints
-    old_embedding_model = get_old_default_embedding_model()
+    old_embedding_model = _get_old_default_embedding_model()
    op.bulk_insert(
        EmbeddingModel,
        [
@@ -79,7 +127,7 @@ def upgrade() -> None:
    # if the user has not overridden the default embedding model via env variables,
    # insert the new default model into the database to auto-upgrade them
    if not user_has_overridden_embedding_model():
-        new_embedding_model = get_new_default_embedding_model()
+        new_embedding_model = _get_new_default_embedding_model()
        op.bulk_insert(
            EmbeddingModel,
            [
--- a/backend/alembic/versions/df0c7ad8a076_added_deletion_attempt_table.py
+++ b/backend/alembic/versions/df0c7ad8a076_added_deletion_attempt_table.py
@@ -18,11 +18,13 @@ depends_on: None = None


 def upgrade() -> None:
+    op.execute("DROP TABLE IF EXISTS document CASCADE")
    op.create_table(
        "document",
        sa.Column("id", sa.String(), nullable=False),
        sa.PrimaryKeyConstraint("id"),
    )
+    op.execute("DROP TABLE IF EXISTS chunk CASCADE")
    op.create_table(
        "chunk",
        sa.Column("id", sa.String(), nullable=False),
@@ -43,6 +45,7 @@ def upgrade() -> None:
        ),
        sa.PrimaryKeyConstraint("id", "document_store_type"),
    )
+    op.execute("DROP TABLE IF EXISTS deletion_attempt CASCADE")
    op.create_table(
        "deletion_attempt",
        sa.Column("id", sa.Integer(), nullable=False),
@@ -84,6 +87,7 @@ def upgrade() -> None:
        ),
        sa.PrimaryKeyConstraint("id"),
    )
+    op.execute("DROP TABLE IF EXISTS document_by_connector_credential_pair CASCADE")
    op.create_table(
        "document_by_connector_credential_pair",
        sa.Column("id", sa.String(), nullable=False),
@@ -106,7 +110,10 @@ def upgrade() -> None:


 def downgrade() -> None:
+    # upstream tables first
    op.drop_table("document_by_connector_credential_pair")
    op.drop_table("deletion_attempt")
    op.drop_table("chunk")
-    op.drop_table("document")
+
+    # Alembic op.drop_table() has no "cascade" flag – issue raw SQL
+    op.execute("DROP TABLE IF EXISTS document CASCADE")
--- a/backend/onyx/connectors/onyx_jira/init.py
+++ b/backend/onyx/connectors/onyx_jira/init.py
--- a/backend/alembic_tenants/env.py
+++ b/backend/alembic_tenants/env.py
@@ -8,7 +8,7 @@ from sqlalchemy.ext.asyncio import create_async_engine
 from sqlalchemy.schema import SchemaItem

 from alembic import context
-from onyx.db.engine import build_connection_string
+from onyx.db.engine.sql_engine import build_connection_string
 from onyx.db.models import PublicBase

 # this is the Alembic Config object, which provides
--- a/backend/alembic_tenants/versions/3b9f09038764_add_read_only_kg_user.py
+++ b/backend/alembic_tenants/versions/3b9f09038764_add_read_only_kg_user.py
@@ -0,0 +1,80 @@
+"""add_db_readonly_user
+
+Revision ID: 3b9f09038764
+Revises: 3b45e0018bf1
+Create Date: 2025-05-11 11:05:11.436977
+
+"""
+
+from sqlalchemy import text
+
+from alembic import op
+from onyx.configs.app_configs import DB_READONLY_PASSWORD
+from onyx.configs.app_configs import DB_READONLY_USER
+from shared_configs.configs import MULTI_TENANT
+
+
+# revision identifiers, used by Alembic.
+revision = "3b9f09038764"
+down_revision = "3b45e0018bf1"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """In multi-tenant mode, enable pg_trgm and create the read-only DB user.
+
+    Does nothing when MULTI_TENANT is falsy.
+
+    Raises:
+        Exception: if DB_READONLY_USER or DB_READONLY_PASSWORD is unset/empty.
+    """
+    if MULTI_TENANT:
+
+        # Enable pg_trgm extension if not already enabled
+        op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")
+
+        # Create read-only db user here only in multi-tenant mode. For single-tenant mode,
+        # the user is created in the standard migration.
+        if not (DB_READONLY_USER and DB_READONLY_PASSWORD):
+            raise Exception("DB_READONLY_USER or DB_READONLY_PASSWORD is not set")
+
+        # The DO block only acts when the role does not exist yet, so re-running
+        # the migration leaves an existing role untouched.
+        # NOTE(review): the user name and password are interpolated by the f-string
+        # into single-quoted SQL literals; a value containing a single quote would
+        # break the statement - confirm these config values are constrained.
+        op.execute(
+            text(
+                f"""
+                DO $$
+                BEGIN
+                    -- Check if the read-only user already exists
+                    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
+                        -- Create the read-only user with the specified password
+                        EXECUTE format('CREATE USER %I WITH PASSWORD %L', '{DB_READONLY_USER}', '{DB_READONLY_PASSWORD}');
+                        -- First revoke all privileges to ensure a clean slate
+                        EXECUTE format('REVOKE ALL ON DATABASE %I FROM %I', current_database(), '{DB_READONLY_USER}');
+                        -- Grant only the CONNECT privilege to allow the user to connect to the database
+                        -- but not perform any operations without additional specific grants
+                        EXECUTE format('GRANT CONNECT ON DATABASE %I TO %I', current_database(), '{DB_READONLY_USER}');
+                    END IF;
+                END
+                $$;
+                """
+            )
+        )
+
+
+def downgrade() -> None:
+    """In multi-tenant mode, drop the read-only DB user (if present) and the
+    pg_trgm extension. Does nothing when MULTI_TENANT is falsy."""
+    if MULTI_TENANT:
+        # Drop read-only db user here only in multi-tenant mode. For single-tenant mode,
+        # the user is presumably dropped by the standard migration (verify there).
+
+        # Privileges are revoked before DROP USER; the whole block is skipped
+        # when the role does not exist.
+        op.execute(
+            text(
+                f"""
+            DO $$
+            BEGIN
+                IF EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
+                    -- First revoke all privileges from the database
+                    EXECUTE format('REVOKE ALL ON DATABASE %I FROM %I', current_database(), '{DB_READONLY_USER}');
+                    -- Then revoke all privileges from the public schema
+                    EXECUTE format('REVOKE ALL ON SCHEMA public FROM %I', '{DB_READONLY_USER}');
+                    -- Then drop the user
+                    EXECUTE format('DROP USER %I', '{DB_READONLY_USER}');
+                END IF;
+            END
+            $$;
+        """
+            )
+        )
+        # Also removes the pg_trgm extension that upgrade() enabled
+        op.execute(text("DROP EXTENSION IF EXISTS pg_trgm"))
--- a/backend/ee/onyx/access/access.py
+++ b/backend/ee/onyx/access/access.py
@@ -1,12 +1,10 @@
 from sqlalchemy.orm import Session

 from ee.onyx.db.external_perm import fetch_external_groups_for_user
+from ee.onyx.db.external_perm import fetch_public_external_group_ids
 from ee.onyx.db.user_group import fetch_user_groups_for_documents
 from ee.onyx.db.user_group import fetch_user_groups_for_user
-from ee.onyx.external_permissions.post_query_censoring import (
-    DOC_SOURCE_TO_CHUNK_CENSORING_FUNCTION,
-)
-from ee.onyx.external_permissions.sync_params import DOC_PERMISSIONS_FUNC_MAP
+from ee.onyx.external_permissions.sync_params import get_source_perm_sync_config
 from onyx.access.access import (
    _get_access_for_documents as get_access_for_documents_without_groups,
 )
@@ -17,6 +15,10 @@ from onyx.access.utils import prefix_user_group
 from onyx.db.document import get_document_sources
 from onyx.db.document import get_documents_by_ids
 from onyx.db.models import User
+from onyx.utils.logger import setup_logger
+
+
+logger = setup_logger()


 def _get_access_for_document(
@@ -63,13 +65,21 @@ def _get_access_for_documents(
        document_ids=document_ids,
    )

+    all_public_ext_u_group_ids = set(fetch_public_external_group_ids(db_session))
+
    access_map = {}
    for document_id, non_ee_access in non_ee_access_dict.items():
        document = doc_id_map[document_id]
        source = doc_id_to_source_map.get(document_id)
+        if source is None:
+            logger.error(f"Document {document_id} has no source")
+            continue
+
+        perm_sync_config = get_source_perm_sync_config(source)
        is_only_censored = (
-            source in DOC_SOURCE_TO_CHUNK_CENSORING_FUNCTION
-            and source not in DOC_PERMISSIONS_FUNC_MAP
+            perm_sync_config
+            and perm_sync_config.censoring_config is not None
+            and perm_sync_config.doc_sync_config is None
        )

        ext_u_emails = (
@@ -89,7 +99,10 @@ def _get_access_for_documents(
        # If its censored, then it's public anywhere during the search and then permissions are
        # applied after the search
        is_public_anywhere = (
-            document.is_public or non_ee_access.is_public or is_only_censored
+            document.is_public
+            or non_ee_access.is_public
+            or is_only_censored
+            or any(u_group in all_public_ext_u_group_ids for u_group in ext_u_groups)
        )

        # To avoid collisions of group namings between connectors, they need to be prefixed
--- a/backend/ee/onyx/background/celery/apps/heavy.py
+++ b/backend/ee/onyx/background/celery/apps/heavy.py
@@ -0,0 +1,129 @@
+import csv
+import io
+from datetime import datetime
+
+from celery import shared_task
+from celery import Task
+
+from ee.onyx.server.query_history.api import fetch_and_process_chat_session_history
+from ee.onyx.server.query_history.api import ONYX_ANONYMIZED_EMAIL
+from ee.onyx.server.query_history.models import QuestionAnswerPairSnapshot
+from onyx.background.celery.apps.heavy import celery_app
+from onyx.background.task_utils import construct_query_history_report_name
+from onyx.configs.app_configs import JOB_TIMEOUT
+from onyx.configs.app_configs import ONYX_QUERY_HISTORY_TYPE
+from onyx.configs.constants import FileOrigin
+from onyx.configs.constants import FileType
+from onyx.configs.constants import OnyxCeleryTask
+from onyx.configs.constants import QueryHistoryType
+from onyx.db.engine.sql_engine import get_session_with_current_tenant
+from onyx.db.tasks import delete_task_with_id
+from onyx.db.tasks import mark_task_as_finished_with_id
+from onyx.db.tasks import mark_task_as_started_with_id
+from onyx.file_store.file_store import get_default_file_store
+from onyx.utils.logger import setup_logger
+
+
+logger = setup_logger()
+
+
+@shared_task(
+    name=OnyxCeleryTask.EXPORT_QUERY_HISTORY_TASK,
+    ignore_result=True,
+    soft_time_limit=JOB_TIMEOUT,
+    bind=True,
+    trail=False,
+)
+def export_query_history_task(
+    self: Task,
+    *,
+    start: datetime,
+    end: datetime,
+    start_time: datetime,
+    # Need to include the tenant_id since the TenantAwareTask needs this
+    tenant_id: str,
+) -> None:
+    """Export the chat query history between `start` and `end` as a CSV file.
+
+    The CSV is built in memory, then saved through the default file store
+    under the report name derived from the Celery task id. On success the
+    task row is deleted; on any failure the task row is marked as finished
+    with success=False and the exception is re-raised.
+
+    Raises:
+        RuntimeError: if the bound task has no request id.
+    """
+    task_id = self.request.id
+    if not task_id:
+        raise RuntimeError("No task id defined for this task; cannot identify it")
+
+    csv_buffer = io.StringIO()
+    csv_writer = csv.DictWriter(
+        csv_buffer,
+        fieldnames=list(QuestionAnswerPairSnapshot.model_fields.keys()),
+    )
+    csv_writer.writeheader()
+
+    # Phase 1: mark the task as started and write one CSV row per QA pair
+    with get_session_with_current_tenant() as db_session:
+        try:
+            mark_task_as_started_with_id(
+                db_session=db_session,
+                task_id=task_id,
+            )
+
+            for chat_snapshot in fetch_and_process_chat_session_history(
+                db_session=db_session,
+                start=start,
+                end=end,
+            ):
+                if ONYX_QUERY_HISTORY_TYPE == QueryHistoryType.ANONYMIZED:
+                    chat_snapshot.user_email = ONYX_ANONYMIZED_EMAIL
+
+                qa_pairs = QuestionAnswerPairSnapshot.from_chat_session_snapshot(
+                    chat_snapshot
+                )
+                for qa_pair in qa_pairs:
+                    csv_writer.writerow(qa_pair.to_json())
+
+        except Exception:
+            logger.exception(f"Failed to export query history with {task_id=}")
+            mark_task_as_finished_with_id(
+                db_session=db_session,
+                task_id=task_id,
+                success=False,
+            )
+            raise
+
+    # Phase 2: persist the CSV and remove the task row
+    report_name = construct_query_history_report_name(task_id)
+    with get_session_with_current_tenant() as db_session:
+        try:
+            # rewind so the file store reads the buffer from the beginning
+            csv_buffer.seek(0)
+            report_metadata = {
+                "start": start.isoformat(),
+                "end": end.isoformat(),
+                "start_time": start_time.isoformat(),
+            }
+            file_store = get_default_file_store()
+            file_store.save_file(
+                content=csv_buffer,
+                display_name=report_name,
+                file_origin=FileOrigin.QUERY_HISTORY_CSV,
+                file_type=FileType.CSV,
+                file_metadata=report_metadata,
+                file_id=report_name,
+            )
+
+            delete_task_with_id(
+                db_session=db_session,
+                task_id=task_id,
+            )
+        except Exception:
+            logger.exception(
+                f"Failed to save query history export file; {report_name=}"
+            )
+            mark_task_as_finished_with_id(
+                db_session=db_session,
+                task_id=task_id,
+                success=False,
+            )
+            raise
+
+
+# Register the EE task packages listed below with the celery_app imported
+# from onyx.background.celery.apps.heavy.
+celery_app.autodiscover_tasks(
+    [
+        "ee.onyx.background.celery.tasks.doc_permission_syncing",
+        "ee.onyx.background.celery.tasks.external_group_syncing",
+        "ee.onyx.background.celery.tasks.cleanup",
+    ]
+)
--- a/backend/ee/onyx/background/celery/apps/light.py
+++ b/backend/ee/onyx/background/celery/apps/light.py
@@ -0,0 +1,8 @@
+from onyx.background.celery.apps.light import celery_app
+
+# Register the EE task packages listed below with the celery_app imported
+# from onyx.background.celery.apps.light.
+celery_app.autodiscover_tasks(
+    [
+        "ee.onyx.background.celery.tasks.doc_permission_syncing",
+        "ee.onyx.background.celery.tasks.external_group_syncing",
+    ]
+)
--- a/backend/ee/onyx/background/celery/apps/monitoring.py
+++ b/backend/ee/onyx/background/celery/apps/monitoring.py
@@ -0,0 +1,7 @@
+from onyx.background.celery.apps.monitoring import celery_app
+
+# Register the EE tenant provisioning task package with the celery_app
+# imported from onyx.background.celery.apps.monitoring.
+celery_app.autodiscover_tasks(
+    [
+        "ee.onyx.background.celery.tasks.tenant_provisioning",
+    ]
+)
--- a/backend/ee/onyx/background/celery/apps/primary.py
+++ b/backend/ee/onyx/background/celery/apps/primary.py
@@ -1,12 +1,22 @@
+from datetime import datetime
+from datetime import timezone
+from uuid import UUID
+
+from celery import shared_task
+from celery import Task
+
 from ee.onyx.background.celery_utils import should_perform_chat_ttl_check
 from ee.onyx.background.task_name_builders import name_chat_ttl_task
 from ee.onyx.server.reporting.usage_export_generation import create_new_usage_report
 from onyx.background.celery.apps.primary import celery_app
-from onyx.background.task_utils import build_celery_task_wrapper
 from onyx.configs.app_configs import JOB_TIMEOUT
+from onyx.configs.constants import OnyxCeleryTask
 from onyx.db.chat import delete_chat_session
 from onyx.db.chat import get_chat_sessions_older_than
-from onyx.db.engine import get_session_with_current_tenant
+from onyx.db.engine.sql_engine import get_session_with_current_tenant
+from onyx.db.enums import TaskStatus
+from onyx.db.tasks import mark_task_as_finished_with_id
+from onyx.db.tasks import register_task
 from onyx.server.settings.store import load_settings
 from onyx.utils.logger import setup_logger

@@ -15,18 +25,42 @@ logger = setup_logger()
 # mark as EE for all tasks in this file


-@build_celery_task_wrapper(name_chat_ttl_task)
-@celery_app.task(soft_time_limit=JOB_TIMEOUT)
-def perform_ttl_management_task(retention_limit_days: int, *, tenant_id: str) -> None:
-    with get_session_with_current_tenant() as db_session:
-        old_chat_sessions = get_chat_sessions_older_than(
-            retention_limit_days, db_session
-        )
+@shared_task(
+    name=OnyxCeleryTask.PERFORM_TTL_MANAGEMENT_TASK,
+    ignore_result=True,
+    soft_time_limit=JOB_TIMEOUT,
+    bind=True,
+    trail=False,
+)
+def perform_ttl_management_task(
+    self: Task, retention_limit_days: int, *, tenant_id: str
+) -> None:
+    task_id = self.request.id
+    if not task_id:
+        raise RuntimeError("No task id defined for this task; cannot identify it")

-    for user_id, session_id in old_chat_sessions:
-        # one session per delete so that we don't blow up if a deletion fails.
+    start_time = datetime.now(tz=timezone.utc)
+
+    user_id: UUID | None = None
+    session_id: UUID | None = None
+    try:
        with get_session_with_current_tenant() as db_session:
-            try:
+            # we generally want to move off this, but keeping for now
+            register_task(
+                db_session=db_session,
+                task_name=name_chat_ttl_task(retention_limit_days, tenant_id),
+                task_id=task_id,
+                status=TaskStatus.STARTED,
+                start_time=start_time,
+            )
+
+            old_chat_sessions = get_chat_sessions_older_than(
+                retention_limit_days, db_session
+            )
+
+        for user_id, session_id in old_chat_sessions:
+            # one session per delete so that we don't blow up if a deletion fails.
+            with get_session_with_current_tenant() as db_session:
                delete_chat_session(
                    user_id,
                    session_id,
@@ -34,11 +68,26 @@ def perform_ttl_management_task(retention_limit_days: int, *, tenant_id: str) ->
                    include_deleted=True,
                    hard_delete=True,
                )
-            except Exception:
-                logger.exception(
-                    "delete_chat_session exceptioned. "
-                    f"user_id={user_id} session_id={session_id}"
-                )
+
+        with get_session_with_current_tenant() as db_session:
+            mark_task_as_finished_with_id(
+                db_session=db_session,
+                task_id=task_id,
+                success=True,
+            )
+
+    except Exception:
+        logger.exception(
+            "delete_chat_session exceptioned. "
+            f"user_id={user_id} session_id={session_id}"
+        )
+        with get_session_with_current_tenant() as db_session:
+            mark_task_as_finished_with_id(
+                db_session=db_session,
+                task_id=task_id,
+                success=False,
+            )
+        raise


 #####
@@ -47,7 +96,7 @@ def perform_ttl_management_task(retention_limit_days: int, *, tenant_id: str) ->


@celery_app.task(
-    name="check_ttl_management_task",
+    name=OnyxCeleryTask.CHECK_TTL_MANAGEMENT_TASK,
    ignore_result=True,
    soft_time_limit=JOB_TIMEOUT,
 )
@@ -67,7 +116,7 @@ def check_ttl_management_task(*, tenant_id: str) -> None:


@celery_app.task(
-    name="autogenerate_usage_report_task",
+    name=OnyxCeleryTask.AUTOGENERATE_USAGE_REPORT_TASK,
    ignore_result=True,
    soft_time_limit=JOB_TIMEOUT,
 )
@@ -79,3 +128,12 @@ def autogenerate_usage_report_task(*, tenant_id: str) -> None:
            user_id=None,
            period=None,
        )
+
+
+# Register the EE task packages listed below with the celery_app imported
+# from onyx.background.celery.apps.primary.
+celery_app.autodiscover_tasks(
+    [
+        "ee.onyx.background.celery.tasks.doc_permission_syncing",
+        "ee.onyx.background.celery.tasks.external_group_syncing",
+        "ee.onyx.background.celery.tasks.cloud",
+    ]
+)
--- a/backend/ee/onyx/background/celery/tasks/beat_schedule.py
+++ b/backend/ee/onyx/background/celery/tasks/beat_schedule.py
@@ -1,6 +1,7 @@
 from datetime import timedelta
 from typing import Any

+from ee.onyx.configs.app_configs import CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS
 from onyx.background.celery.tasks.beat_schedule import (
    beat_cloud_tasks as base_beat_system_tasks,
 )
@@ -13,34 +14,42 @@ from onyx.background.celery.tasks.beat_schedule import (
    get_tasks_to_schedule as base_get_tasks_to_schedule,
 )
 from onyx.configs.constants import OnyxCeleryPriority
+from onyx.configs.constants import OnyxCeleryQueues
 from onyx.configs.constants import OnyxCeleryTask
 from shared_configs.configs import MULTI_TENANT

 ee_beat_system_tasks: list[dict] = []

-ee_beat_task_templates: list[dict] = []
-ee_beat_task_templates.extend(
-    [
-        {
-            "name": "autogenerate-usage-report",
-            "task": OnyxCeleryTask.AUTOGENERATE_USAGE_REPORT_TASK,
-            "schedule": timedelta(days=30),
-            "options": {
-                "priority": OnyxCeleryPriority.MEDIUM,
-                "expires": BEAT_EXPIRES_DEFAULT,
-            },
+ee_beat_task_templates: list[dict] = [
+    {
+        "name": "autogenerate-usage-report",
+        "task": OnyxCeleryTask.AUTOGENERATE_USAGE_REPORT_TASK,
+        "schedule": timedelta(days=30),
+        "options": {
+            "priority": OnyxCeleryPriority.MEDIUM,
+            "expires": BEAT_EXPIRES_DEFAULT,
        },
-        {
-            "name": "check-ttl-management",
-            "task": OnyxCeleryTask.CHECK_TTL_MANAGEMENT_TASK,
-            "schedule": timedelta(hours=1),
-            "options": {
-                "priority": OnyxCeleryPriority.MEDIUM,
-                "expires": BEAT_EXPIRES_DEFAULT,
-            },
+    },
+    {
+        "name": "check-ttl-management",
+        "task": OnyxCeleryTask.CHECK_TTL_MANAGEMENT_TASK,
+        "schedule": timedelta(hours=CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS),
+        "options": {
+            "priority": OnyxCeleryPriority.MEDIUM,
+            "expires": BEAT_EXPIRES_DEFAULT,
        },
-    ]
-)
+    },
+    {
+        "name": "export-query-history-cleanup-task",
+        "task": OnyxCeleryTask.EXPORT_QUERY_HISTORY_CLEANUP_TASK,
+        "schedule": timedelta(hours=1),
+        "options": {
+            "priority": OnyxCeleryPriority.MEDIUM,
+            "expires": BEAT_EXPIRES_DEFAULT,
+            "queue": OnyxCeleryQueues.CSV_GENERATION,
+        },
+    },
+]

 ee_tasks_to_schedule: list[dict] = []

@@ -58,10 +67,20 @@ if not MULTI_TENANT:
        {
            "name": "check-ttl-management",
            "task": OnyxCeleryTask.CHECK_TTL_MANAGEMENT_TASK,
+            "schedule": timedelta(hours=CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS),
+            "options": {
+                "priority": OnyxCeleryPriority.MEDIUM,
+                "expires": BEAT_EXPIRES_DEFAULT,
+            },
+        },
+        {
+            "name": "export-query-history-cleanup-task",
+            "task": OnyxCeleryTask.EXPORT_QUERY_HISTORY_CLEANUP_TASK,
            "schedule": timedelta(hours=1),
            "options": {
                "priority": OnyxCeleryPriority.MEDIUM,
                "expires": BEAT_EXPIRES_DEFAULT,
+                "queue": OnyxCeleryQueues.CSV_GENERATION,
            },
        },
    ]
--- a/backend/ee/onyx/background/celery/tasks/cleanup/tasks.py
+++ b/backend/ee/onyx/background/celery/tasks/cleanup/tasks.py
@@ -0,0 +1,40 @@
+from datetime import datetime
+from datetime import timedelta
+
+from celery import shared_task
+
+from ee.onyx.db.query_history import get_all_query_history_export_tasks
+from onyx.configs.app_configs import JOB_TIMEOUT
+from onyx.configs.constants import OnyxCeleryTask
+from onyx.db.engine.sql_engine import get_session_with_tenant
+from onyx.db.enums import TaskStatus
+from onyx.db.tasks import delete_task_with_id
+from onyx.utils.logger import setup_logger
+
+
+logger = setup_logger()
+
+
+@shared_task(
+    name=OnyxCeleryTask.EXPORT_QUERY_HISTORY_CLEANUP_TASK,
+    ignore_result=True,
+    soft_time_limit=JOB_TIMEOUT,
+)
+def export_query_history_cleanup_task(*, tenant_id: str) -> None:
+    """Delete finished query-history export task rows for one tenant.
+
+    Successful tasks are deleted immediately. Failed tasks are kept for 24
+    hours after their start time and then deleted; failed tasks without a
+    start time are deleted right away. Tasks in any other status are left
+    untouched.
+
+    Args:
+        tenant_id: tenant whose database session is used for the cleanup.
+    """
+    with get_session_with_tenant(tenant_id=tenant_id) as db_session:
+        tasks = get_all_query_history_export_tasks(db_session=db_session)
+
+        for task in tasks:
+            if task.status == TaskStatus.SUCCESS:
+                delete_task_with_id(db_session=db_session, task_id=task.task_id)
+            elif task.status == TaskStatus.FAILURE:
+                if task.start_time:
+                    deadline = task.start_time + timedelta(hours=24)
+                    # Build "now" with the same tzinfo as start_time: comparing a
+                    # naive datetime with an aware one raises TypeError. With a
+                    # naive start_time (tzinfo is None) this is plain datetime.now().
+                    now = datetime.now(tz=task.start_time.tzinfo)
+                    if now < deadline:
+                        continue
+
+                logger.error(
+                    f"Task with {task.task_id=} failed; it is being deleted now"
+                )
+                delete_task_with_id(db_session=db_session, task_id=task.task_id)
--- a/backend/ee/onyx/background/celery/tasks/cloud/tasks.py
+++ b/backend/ee/onyx/background/celery/tasks/cloud/tasks.py
@@ -0,0 +1,104 @@
+import time
+
+from celery import shared_task
+from celery import Task
+from celery.exceptions import SoftTimeLimitExceeded
+from redis.lock import Lock as RedisLock
+
+from ee.onyx.server.tenants.product_gating import get_gated_tenants
+from onyx.background.celery.apps.app_base import task_logger
+from onyx.background.celery.tasks.beat_schedule import BEAT_EXPIRES_DEFAULT
+from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
+from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
+from onyx.configs.constants import OnyxCeleryPriority
+from onyx.configs.constants import OnyxCeleryTask
+from onyx.configs.constants import OnyxRedisLocks
+from onyx.db.engine.tenant_utils import get_all_tenant_ids
+from onyx.redis.redis_pool import get_redis_client
+from onyx.redis.redis_pool import redis_lock_dump
+from shared_configs.configs import IGNORED_SYNCING_TENANT_LIST
+
+
+@shared_task(
+    name=OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
+    ignore_result=True,
+    trail=False,
+    bind=True,
+)
+def cloud_beat_task_generator(
+    self: Task,
+    task_name: str,
+    # NOTE(review): the default is taken from OnyxCeleryTask rather than a
+    # queue-name constant - confirm this is the intended default queue.
+    queue: str = OnyxCeleryTask.DEFAULT,
+    priority: int = OnyxCeleryPriority.MEDIUM,
+    expires: int = BEAT_EXPIRES_DEFAULT,
+) -> bool | None:
+    """a lightweight task used to kick off individual beat tasks per tenant.
+
+    Sends `task_name` once per tenant id (with `tenant_id` as the only kwarg),
+    skipping gated tenants and tenants in IGNORED_SYNCING_TENANT_LIST. A redis
+    lock keyed by `task_name` prevents overlapping runs for the same task.
+
+    Args:
+        task_name: name of the Celery task to send for each tenant.
+        queue: queue passed to send_task.
+        priority: priority passed to send_task.
+        expires: expiry passed to send_task.
+
+    Returns:
+        None if the lock could not be acquired; True otherwise. Exceptions
+        raised inside the loop are logged and not re-raised, so True is
+        returned in that case as well.
+    """
+    time_start = time.monotonic()
+
+    redis_client = get_redis_client(tenant_id=ONYX_CLOUD_TENANT_ID)
+
+    lock_beat: RedisLock = redis_client.lock(
+        f"{OnyxRedisLocks.CLOUD_BEAT_TASK_GENERATOR_LOCK}:{task_name}",
+        timeout=CELERY_GENERIC_BEAT_LOCK_TIMEOUT,
+    )
+
+    # these tasks should never overlap
+    if not lock_beat.acquire(blocking=False):
+        return None
+
+    last_lock_time = time.monotonic()
+    tenant_ids: list[str] = []
+    num_processed_tenants = 0
+
+    try:
+        tenant_ids = get_all_tenant_ids()
+        gated_tenants = get_gated_tenants()
+        for tenant_id in tenant_ids:
+            if tenant_id in gated_tenants:
+                continue
+
+            # refresh the lock once a quarter of its timeout has elapsed since
+            # the last (re)acquire
+            current_time = time.monotonic()
+            if current_time - last_lock_time >= (CELERY_GENERIC_BEAT_LOCK_TIMEOUT / 4):
+                lock_beat.reacquire()
+                last_lock_time = current_time
+
+            # needed in the cloud
+            if IGNORED_SYNCING_TENANT_LIST and tenant_id in IGNORED_SYNCING_TENANT_LIST:
+                continue
+
+            self.app.send_task(
+                task_name,
+                kwargs=dict(
+                    tenant_id=tenant_id,
+                ),
+                queue=queue,
+                priority=priority,
+                expires=expires,
+                ignore_result=True,
+            )
+
+            # counts only tenants for which a task was actually sent
+            num_processed_tenants += 1
+    except SoftTimeLimitExceeded:
+        task_logger.info(
+            "Soft time limit exceeded, task is being terminated gracefully."
+        )
+    except Exception:
+        task_logger.exception("Unexpected exception during cloud_beat_task_generator")
+    finally:
+        # release only if we still own the lock; otherwise log and dump its state
+        if not lock_beat.owned():
+            task_logger.error(
+                "cloud_beat_task_generator - Lock not owned on completion"
+            )
+            redis_lock_dump(lock_beat, redis_client)
+        else:
+            lock_beat.release()
+
+    time_elapsed = time.monotonic() - time_start
+    task_logger.info(
+        f"cloud_beat_task_generator finished: "
+        f"task={task_name} "
+        f"num_processed_tenants={num_processed_tenants} "
+        f"num_tenants={len(tenant_ids)} "
+        f"elapsed={time_elapsed:.2f}"
+    )
+    return True
--- a/backend/ee/onyx/background/celery/tasks/doc_permission_syncing/tasks.py
+++ b/backend/ee/onyx/background/celery/tasks/doc_permission_syncing/tasks.py
@@ -16,22 +16,21 @@ from redis import Redis
 from redis.exceptions import LockError
 from redis.lock import Lock as RedisLock
 from sqlalchemy.orm import Session
+from tenacity import retry
+from tenacity import retry_if_exception
+from tenacity import stop_after_delay
+from tenacity import wait_random_exponential

-from ee.onyx.configs.app_configs import DEFAULT_PERMISSION_DOC_SYNC_FREQUENCY
 from ee.onyx.db.connector_credential_pair import get_all_auto_sync_cc_pairs
 from ee.onyx.db.document import upsert_document_external_perms
-from ee.onyx.external_permissions.sync_params import DOC_PERMISSION_SYNC_PERIODS
-from ee.onyx.external_permissions.sync_params import DOC_PERMISSIONS_FUNC_MAP
-from ee.onyx.external_permissions.sync_params import (
-    DOC_SOURCE_TO_CHUNK_CENSORING_FUNCTION,
-)
+from ee.onyx.external_permissions.sync_params import get_source_perm_sync_config
 from onyx.access.models import DocExternalAccess
 from onyx.background.celery.apps.app_base import task_logger
 from onyx.background.celery.celery_redis import celery_find_task
 from onyx.background.celery.celery_redis import celery_get_queue_length
 from onyx.background.celery.celery_redis import celery_get_queued_task_ids
 from onyx.background.celery.celery_redis import celery_get_unacked_task_ids
-from onyx.background.celery.tasks.shared.tasks import OnyxCeleryTaskCompletionStatus
+from onyx.background.celery.tasks.beat_schedule import CLOUD_BEAT_MULTIPLIER_DEFAULT
 from onyx.configs.app_configs import JOB_TIMEOUT
 from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
 from onyx.configs.constants import CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT
@@ -47,8 +46,11 @@ from onyx.configs.constants import OnyxRedisSignals
 from onyx.connectors.factory import validate_ccpair_for_user
 from onyx.db.connector import mark_cc_pair_as_permissions_synced
 from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
+from onyx.db.document import get_document_ids_for_connector_credential_pair
+from onyx.db.document import get_documents_for_connector_credential_pair_limited_columns
 from onyx.db.document import upsert_document_by_connector_credential_pair
-from onyx.db.engine import get_session_with_current_tenant
+from onyx.db.engine.sql_engine import get_session_with_current_tenant
+from onyx.db.engine.sql_engine import get_session_with_tenant
 from onyx.db.enums import AccessType
 from onyx.db.enums import ConnectorCredentialPairStatus
 from onyx.db.enums import SyncStatus
@@ -57,6 +59,9 @@ from onyx.db.models import ConnectorCredentialPair
 from onyx.db.sync_record import insert_sync_record
 from onyx.db.sync_record import update_sync_record_status
 from onyx.db.users import batch_add_ext_perm_user_if_not_exists
+from onyx.db.utils import DocumentRow
+from onyx.db.utils import is_retryable_sqlalchemy_error
+from onyx.db.utils import SortOrder
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from onyx.redis.redis_connector import RedisConnector
 from onyx.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync
@@ -72,12 +77,14 @@ from onyx.utils.logger import LoggerContextVars
 from onyx.utils.logger import setup_logger
 from onyx.utils.telemetry import optional_telemetry
 from onyx.utils.telemetry import RecordType
-
+from shared_configs.configs import MULTI_TENANT

 logger = setup_logger()


 DOCUMENT_PERMISSIONS_UPDATE_MAX_RETRIES = 3
+DOCUMENT_PERMISSIONS_UPDATE_STOP_AFTER = 10 * 60
+DOCUMENT_PERMISSIONS_UPDATE_MAX_WAIT = 60


 # 5 seconds more than RetryDocumentIndex STOP_AFTER+MAX_WAIT
@@ -85,6 +92,24 @@ LIGHT_SOFT_TIME_LIMIT = 105
 LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15


+def _get_fence_validation_block_expiration() -> int:
+    """
+    Compute the expiration time for the fence validation block signal.
+    Base expiration is 300 seconds, multiplied by the beat multiplier only in MULTI_TENANT mode.
+    """
+    base_expiration = 300  # seconds
+
+    if not MULTI_TENANT:
+        return base_expiration
+
+    try:
+        beat_multiplier = OnyxRuntime.get_beat_multiplier()
+    except Exception:
+        beat_multiplier = CLOUD_BEAT_MULTIPLIER_DEFAULT
+
+    return int(base_expiration * beat_multiplier)
+
+
 """Jobs / utils for kicking off doc permissions sync tasks."""


@@ -98,16 +123,29 @@ def _is_external_doc_permissions_sync_due(cc_pair: ConnectorCredentialPair) -> b
    if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE:
        return False

+    sync_config = get_source_perm_sync_config(cc_pair.connector.source)
+    if sync_config is None:
+        logger.error(f"No sync config found for {cc_pair.connector.source}")
+        return False
+
+    if sync_config.doc_sync_config is None:
+        logger.error(f"No doc sync config found for {cc_pair.connector.source}")
+        return False
+
+    # if indexing also does perm sync, don't start running doc_sync until at
+    # least one indexing is done
+    if (
+        sync_config.doc_sync_config.initial_index_should_sync
+        and cc_pair.last_successful_index_time is None
+    ):
+        return False
+
    # If the last sync is None, it has never been run so we run the sync
    last_perm_sync = cc_pair.last_time_perm_sync
    if last_perm_sync is None:
        return True

-    source_sync_period = DOC_PERMISSION_SYNC_PERIODS.get(cc_pair.connector.source)
-
-    if not source_sync_period:
-        source_sync_period = DEFAULT_PERMISSION_DOC_SYNC_FREQUENCY
-
+    source_sync_period = sync_config.doc_sync_config.doc_sync_frequency
    source_sync_period *= int(OnyxRuntime.get_doc_permission_sync_multiplier())

    # If the last sync is greater than the full fetch period, we run the sync
@@ -179,7 +217,11 @@ def check_for_doc_permissions_sync(self: Task, *, tenant_id: str) -> bool | None
                    "Exception while validating permission sync fences"
                )

-            r.set(OnyxRedisSignals.BLOCK_VALIDATE_PERMISSION_SYNC_FENCES, 1, ex=300)
+            r.set(
+                OnyxRedisSignals.BLOCK_VALIDATE_PERMISSION_SYNC_FENCES,
+                1,
+                ex=_get_fence_validation_block_expiration(),
+            )

        # use a lookup table to find active fences. We still have to verify the fence
        # exists since it is an optimization and not the source of truth.
@@ -383,7 +425,7 @@ def connector_permission_sync_generator_task(

    lock: RedisLock = r.lock(
        OnyxRedisLocks.CONNECTOR_DOC_PERMISSIONS_SYNC_LOCK_PREFIX
-        + f"_{redis_connector.id}",
+        + f"_{redis_connector.cc_pair_id}",
        timeout=CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT,
        thread_local=False,
    )
@@ -410,6 +452,7 @@ def connector_permission_sync_generator_task(
                created = validate_ccpair_for_user(
                    cc_pair.connector.id,
                    cc_pair.credential.id,
+                    cc_pair.access_type,
                    db_session,
                    enforce_creation=False,
                )
@@ -425,11 +468,15 @@ def connector_permission_sync_generator_task(
                raise

            source_type = cc_pair.connector.source
+            sync_config = get_source_perm_sync_config(source_type)
+            if sync_config is None:
+                logger.error(f"No sync config found for {source_type}")
+                return None

-            doc_sync_func = DOC_PERMISSIONS_FUNC_MAP.get(source_type)
-            if doc_sync_func is None:
-                if source_type in DOC_SOURCE_TO_CHUNK_CENSORING_FUNCTION:
+            if sync_config.doc_sync_config is None:
+                if sync_config.censoring_config:
                    return None
+
                raise ValueError(
                    f"No doc sync func found for {source_type} with cc_pair={cc_pair_id}"
                )
@@ -449,7 +496,37 @@ def connector_permission_sync_generator_task(
            redis_connector.permissions.set_fence(new_payload)

            callback = PermissionSyncCallback(redis_connector, lock, r)
-            document_external_accesses = doc_sync_func(cc_pair, callback)
+
+            # pass in the capability to fetch all existing docs for the cc_pair
+            # this can be used to determine documents that are "missing" and thus
+            # should no longer be accessible. The decision as to whether we should find
+            # every document during the doc sync process is connector-specific.
+            def fetch_all_existing_docs_fn(
+                sort_order: SortOrder | None = None,
+            ) -> list[DocumentRow]:
+                result = get_documents_for_connector_credential_pair_limited_columns(
+                    db_session=db_session,
+                    connector_id=cc_pair.connector.id,
+                    credential_id=cc_pair.credential.id,
+                    sort_order=sort_order,
+                )
+                return list(result)
+
+            def fetch_all_existing_docs_ids_fn() -> list[str]:
+                result = get_document_ids_for_connector_credential_pair(
+                    db_session=db_session,
+                    connector_id=cc_pair.connector.id,
+                    credential_id=cc_pair.credential.id,
+                )
+                return result
+
+            doc_sync_func = sync_config.doc_sync_config.doc_sync_func
+            document_external_accesses = doc_sync_func(
+                cc_pair,
+                fetch_all_existing_docs_fn,
+                fetch_all_existing_docs_ids_fn,
+                callback,
+            )

            task_logger.info(
                f"RedisConnector.permissions.generate_tasks starting. cc_pair={cc_pair_id}"
@@ -457,13 +534,13 @@ def connector_permission_sync_generator_task(

            tasks_generated = 0
            for doc_external_access in document_external_accesses:
-                redis_connector.permissions.generate_tasks(
-                    celery_app=self.app,
+                redis_connector.permissions.update_db(
                    lock=lock,
                    new_permissions=[doc_external_access],
                    source_string=source_type,
                    connector_id=cc_pair.connector.id,
                    credential_id=cc_pair.credential.id,
+                    task_logger=task_logger,
                )
                tasks_generated += 1

@@ -476,6 +553,7 @@ def connector_permission_sync_generator_task(

    except Exception as e:
        error_msg = format_error_for_logging(e)
+
        task_logger.warning(
            f"Permission sync exceptioned: cc_pair={cc_pair_id} payload_id={payload_id} {error_msg}"
        )
@@ -496,33 +574,28 @@ def connector_permission_sync_generator_task(
    )


-@shared_task(
-    name=OnyxCeleryTask.UPDATE_EXTERNAL_DOCUMENT_PERMISSIONS_TASK,
-    soft_time_limit=LIGHT_SOFT_TIME_LIMIT,
-    time_limit=LIGHT_TIME_LIMIT,
-    max_retries=DOCUMENT_PERMISSIONS_UPDATE_MAX_RETRIES,
-    bind=True,
+# NOTE(rkuo): this should probably move to the db layer
+@retry(
+    retry=retry_if_exception(is_retryable_sqlalchemy_error),
+    wait=wait_random_exponential(
+        multiplier=1, max=DOCUMENT_PERMISSIONS_UPDATE_MAX_WAIT
+    ),
+    stop=stop_after_delay(DOCUMENT_PERMISSIONS_UPDATE_STOP_AFTER),
 )
-def update_external_document_permissions_task(
-    self: Task,
+def document_update_permissions(
    tenant_id: str,
-    serialized_doc_external_access: dict,
-    source_string: str,
+    permissions: DocExternalAccess,
+    source_type_str: str,
    connector_id: int,
    credential_id: int,
 ) -> bool:
    start = time.monotonic()

-    completion_status = OnyxCeleryTaskCompletionStatus.UNDEFINED
-
-    document_external_access = DocExternalAccess.from_dict(
-        serialized_doc_external_access
-    )
-    doc_id = document_external_access.doc_id
-    external_access = document_external_access.external_access
+    doc_id = permissions.doc_id
+    external_access = permissions.external_access

    try:
-        with get_session_with_current_tenant() as db_session:
+        with get_session_with_tenant(tenant_id=tenant_id) as db_session:
            # Add the users to the DB if they don't exist
            batch_add_ext_perm_user_if_not_exists(
                db_session=db_session,
@@ -534,7 +607,7 @@ def update_external_document_permissions_task(
                db_session=db_session,
                doc_id=doc_id,
                external_access=external_access,
-                source_type=DocumentSource(source_string),
+                source_type=DocumentSource(source_type_str),
            )

            if created_new_doc:
@@ -553,29 +626,17 @@ def update_external_document_permissions_task(
                f"action=update_permissions "
                f"elapsed={elapsed:.2f}"
            )
-
-        completion_status = OnyxCeleryTaskCompletionStatus.SUCCEEDED
    except Exception as e:
-        error_msg = format_error_for_logging(e)
-        task_logger.warning(
-            f"Exception in update_external_document_permissions_task: connector_id={connector_id} doc_id={doc_id} {error_msg}"
-        )
        task_logger.exception(
-            f"update_external_document_permissions_task exceptioned: "
+            f"document_update_permissions exceptioned: "
            f"connector_id={connector_id} doc_id={doc_id}"
        )
-        completion_status = OnyxCeleryTaskCompletionStatus.NON_RETRYABLE_EXCEPTION
+        raise e
    finally:
        task_logger.info(
-            f"update_external_document_permissions_task completed: status={completion_status.value} doc={doc_id}"
+            f"document_update_permissions completed: connector_id={connector_id} doc={doc_id}"
        )

-    if completion_status != OnyxCeleryTaskCompletionStatus.SUCCEEDED:
-        return False
-
-    task_logger.info(
-        f"update_external_document_permissions_task finished: connector_id={connector_id} doc_id={doc_id}"
-    )
    return True


--- a/backend/ee/onyx/background/celery/tasks/external_group_syncing/group_sync_utils.py
+++ b/backend/ee/onyx/background/celery/tasks/external_group_syncing/group_sync_utils.py
@@ -0,0 +1,30 @@
+from sqlalchemy.orm import Session
+
+from ee.onyx.external_permissions.sync_params import (
+    source_group_sync_is_cc_pair_agnostic,
+)
+from onyx.db.connector import mark_cc_pair_as_external_group_synced
+from onyx.db.connector_credential_pair import get_connector_credential_pairs_for_source
+from onyx.db.models import ConnectorCredentialPair
+
+
+def _get_all_cc_pair_ids_to_mark_as_group_synced(
+    db_session: Session, cc_pair: ConnectorCredentialPair
+) -> list[int]:
+    if not source_group_sync_is_cc_pair_agnostic(cc_pair.connector.source):
+        return [cc_pair.id]
+
+    cc_pairs = get_connector_credential_pairs_for_source(
+        db_session, cc_pair.connector.source
+    )
+    return [cc_pair.id for cc_pair in cc_pairs]
+
+
+def mark_all_relevant_cc_pairs_as_external_group_synced(
+    db_session: Session, cc_pair: ConnectorCredentialPair
+) -> None:
+    """For some source types, one successful group sync run should count for all
+    cc pairs of that type. This function handles that case."""
+    cc_pair_ids = _get_all_cc_pair_ids_to_mark_as_group_synced(db_session, cc_pair)
+    for cc_pair_id in cc_pair_ids:
+        mark_cc_pair_as_external_group_synced(db_session, cc_pair_id)
--- a/backend/ee/onyx/background/celery/tasks/external_group_syncing/tasks.py
+++ b/backend/ee/onyx/background/celery/tasks/external_group_syncing/tasks.py
@@ -14,18 +14,23 @@ from pydantic import ValidationError
 from redis import Redis
 from redis.lock import Lock as RedisLock

+from ee.onyx.background.celery.tasks.external_group_syncing.group_sync_utils import (
+    mark_all_relevant_cc_pairs_as_external_group_synced,
+)
 from ee.onyx.db.connector_credential_pair import get_all_auto_sync_cc_pairs
 from ee.onyx.db.connector_credential_pair import get_cc_pairs_by_source
 from ee.onyx.db.external_perm import ExternalUserGroup
-from ee.onyx.db.external_perm import replace_user__ext_group_for_cc_pair
-from ee.onyx.external_permissions.sync_params import EXTERNAL_GROUP_SYNC_PERIODS
-from ee.onyx.external_permissions.sync_params import GROUP_PERMISSIONS_FUNC_MAP
+from ee.onyx.db.external_perm import mark_old_external_groups_as_stale
+from ee.onyx.db.external_perm import remove_stale_external_groups
+from ee.onyx.db.external_perm import upsert_external_groups
 from ee.onyx.external_permissions.sync_params import (
-    GROUP_PERMISSIONS_IS_CC_PAIR_AGNOSTIC,
+    get_all_cc_pair_agnostic_group_sync_sources,
 )
+from ee.onyx.external_permissions.sync_params import get_source_perm_sync_config
 from onyx.background.celery.apps.app_base import task_logger
 from onyx.background.celery.celery_redis import celery_find_task
 from onyx.background.celery.celery_redis import celery_get_unacked_task_ids
+from onyx.background.celery.tasks.beat_schedule import CLOUD_BEAT_MULTIPLIER_DEFAULT
 from onyx.background.error_logging import emit_background_error
 from onyx.configs.app_configs import JOB_TIMEOUT
 from onyx.configs.constants import CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT
@@ -37,11 +42,8 @@ from onyx.configs.constants import OnyxCeleryTask
 from onyx.configs.constants import OnyxRedisConstants
 from onyx.configs.constants import OnyxRedisLocks
 from onyx.configs.constants import OnyxRedisSignals
-from onyx.connectors.exceptions import ConnectorValidationError
-from onyx.connectors.factory import validate_ccpair_for_user
-from onyx.db.connector import mark_cc_pair_as_external_group_synced
 from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
-from onyx.db.engine import get_session_with_current_tenant
+from onyx.db.engine.sql_engine import get_session_with_current_tenant
 from onyx.db.enums import AccessType
 from onyx.db.enums import ConnectorCredentialPairStatus
 from onyx.db.enums import SyncStatus
@@ -56,19 +58,34 @@ from onyx.redis.redis_connector_ext_group_sync import (
 )
 from onyx.redis.redis_pool import get_redis_client
 from onyx.redis.redis_pool import get_redis_replica_client
+from onyx.server.runtime.onyx_runtime import OnyxRuntime
 from onyx.server.utils import make_short_id
 from onyx.utils.logger import format_error_for_logging
 from onyx.utils.logger import setup_logger
+from shared_configs.configs import MULTI_TENANT

 logger = setup_logger()


-EXTERNAL_GROUPS_UPDATE_MAX_RETRIES = 3
+_EXTERNAL_GROUP_BATCH_SIZE = 100


-# 5 seconds more than RetryDocumentIndex STOP_AFTER+MAX_WAIT
-LIGHT_SOFT_TIME_LIMIT = 105
-LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15
+def _get_fence_validation_block_expiration() -> int:
+    """
+    Compute the expiration time for the fence validation block signal.
+    Base expiration is 300 seconds, multiplied by the beat multiplier only in MULTI_TENANT mode.
+    """
+    base_expiration = 300  # seconds
+
+    if not MULTI_TENANT:
+        return base_expiration
+
+    try:
+        beat_multiplier = OnyxRuntime.get_beat_multiplier()
+    except Exception:
+        beat_multiplier = CLOUD_BEAT_MULTIPLIER_DEFAULT
+
+    return int(base_expiration * beat_multiplier)


 def _is_external_group_sync_due(cc_pair: ConnectorCredentialPair) -> bool:
@@ -88,12 +105,20 @@ def _is_external_group_sync_due(cc_pair: ConnectorCredentialPair) -> bool:
        )
        return False

-    # If there is not group sync function for the connector, we don't run the sync
-    # This is fine because all sources dont necessarily have a concept of groups
-    if not GROUP_PERMISSIONS_FUNC_MAP.get(cc_pair.connector.source):
+    sync_config = get_source_perm_sync_config(cc_pair.connector.source)
+    if sync_config is None:
        task_logger.debug(
            f"Skipping group sync for CC Pair {cc_pair.id} - "
-            f"no group sync function for {cc_pair.connector.source}"
+            f"no sync config found for {cc_pair.connector.source}"
+        )
+        return False
+
+    # If there is no group sync config for the connector, we don't run the sync
+    # This is fine because not all sources have a concept of groups
+    if sync_config.group_sync_config is None:
+        task_logger.debug(
+            f"Skipping group sync for CC Pair {cc_pair.id} - "
+            f"no group sync config found for {cc_pair.connector.source}"
        )
        return False

@@ -102,11 +127,7 @@ def _is_external_group_sync_due(cc_pair: ConnectorCredentialPair) -> bool:
    if last_ext_group_sync is None:
        return True

-    source_sync_period = EXTERNAL_GROUP_SYNC_PERIODS.get(cc_pair.connector.source)
-
-    # If EXTERNAL_GROUP_SYNC_PERIODS is None, we always run the sync.
-    if not source_sync_period:
-        return True
+    source_sync_period = sync_config.group_sync_config.group_sync_frequency

    # If the last sync is greater than the full fetch period, we run the sync
    next_sync = last_ext_group_sync + timedelta(seconds=source_sync_period)
@@ -146,9 +167,8 @@ def check_for_external_group_sync(self: Task, *, tenant_id: str) -> bool | None:
        with get_session_with_current_tenant() as db_session:
            cc_pairs = get_all_auto_sync_cc_pairs(db_session)

-            # We only want to sync one cc_pair per source type in
-            # GROUP_PERMISSIONS_IS_CC_PAIR_AGNOSTIC
-            for source in GROUP_PERMISSIONS_IS_CC_PAIR_AGNOSTIC:
+            # For some sources, we only want to sync one cc_pair per source type
+            for source in get_all_cc_pair_agnostic_group_sync_sources():
                # These are ordered by cc_pair id so the first one is the one we want
                cc_pairs_to_dedupe = get_cc_pairs_by_source(
                    db_session,
@@ -156,8 +176,7 @@ def check_for_external_group_sync(self: Task, *, tenant_id: str) -> bool | None:
                    access_type=AccessType.SYNC,
                    status=ConnectorCredentialPairStatus.ACTIVE,
                )
-                # We only want to sync one cc_pair per source type
-                # in GROUP_PERMISSIONS_IS_CC_PAIR_AGNOSTIC so we dedupe here
+                # dedupe cc_pairs to only keep the first one
                for cc_pair_to_remove in cc_pairs_to_dedupe[1:]:
                    cc_pairs = [
                        cc_pair
@@ -196,7 +215,11 @@ def check_for_external_group_sync(self: Task, *, tenant_id: str) -> bool | None:
                    "Exception while validating external group sync fences"
                )

-            r.set(OnyxRedisSignals.BLOCK_VALIDATE_EXTERNAL_GROUP_SYNC_FENCES, 1, ex=300)
+            r.set(
+                OnyxRedisSignals.BLOCK_VALIDATE_EXTERNAL_GROUP_SYNC_FENCES,
+                1,
+                ex=_get_fence_validation_block_expiration(),
+            )
    except SoftTimeLimitExceeded:
        task_logger.info(
            "Soft time limit exceeded, task is being terminated gracefully."
@@ -360,7 +383,7 @@ def connector_external_group_sync_generator_task(

    lock: RedisLock = r.lock(
        OnyxRedisLocks.CONNECTOR_EXTERNAL_GROUP_SYNC_LOCK_PREFIX
-        + f"_{redis_connector.id}",
+        + f"_{redis_connector.cc_pair_id}",
        timeout=CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT,
    )

@@ -375,73 +398,12 @@ def connector_external_group_sync_generator_task(
        payload.started = datetime.now(timezone.utc)
        redis_connector.external_group_sync.set_fence(payload)

+        _perform_external_group_sync(
+            cc_pair_id=cc_pair_id,
+            tenant_id=tenant_id,
+        )
+
        with get_session_with_current_tenant() as db_session:
-            cc_pair = get_connector_credential_pair_from_id(
-                db_session=db_session,
-                cc_pair_id=cc_pair_id,
-                eager_load_credential=True,
-            )
-            if cc_pair is None:
-                raise ValueError(
-                    f"No connector credential pair found for id: {cc_pair_id}"
-                )
-
-            try:
-                created = validate_ccpair_for_user(
-                    cc_pair.connector.id,
-                    cc_pair.credential.id,
-                    db_session,
-                    enforce_creation=False,
-                )
-                if not created:
-                    task_logger.warning(
-                        f"Unable to create connector credential pair for id: {cc_pair_id}"
-                    )
-            except Exception:
-                task_logger.exception(
-                    f"validate_ccpair_permissions_sync exceptioned: cc_pair={cc_pair_id}"
-                )
-                # TODO: add some notification to the admins here
-                raise
-
-            source_type = cc_pair.connector.source
-
-            ext_group_sync_func = GROUP_PERMISSIONS_FUNC_MAP.get(source_type)
-            if ext_group_sync_func is None:
-                msg = f"No external group sync func found for {source_type} for cc_pair: {cc_pair_id}"
-                emit_background_error(msg, cc_pair_id=cc_pair_id)
-                raise ValueError(msg)
-
-            logger.info(
-                f"Syncing external groups for {source_type} for cc_pair: {cc_pair_id}"
-            )
-            external_user_groups: list[ExternalUserGroup] = []
-            try:
-                external_user_groups = ext_group_sync_func(tenant_id, cc_pair)
-            except ConnectorValidationError as e:
-                # TODO: add some notification to the admins here
-                logger.exception(
-                    f"Error syncing external groups for {source_type} for cc_pair: {cc_pair_id} {e}"
-                )
-                raise e
-
-            logger.info(
-                f"Syncing {len(external_user_groups)} external user groups for {source_type}"
-            )
-            logger.debug(f"New external user groups: {external_user_groups}")
-
-            replace_user__ext_group_for_cc_pair(
-                db_session=db_session,
-                cc_pair_id=cc_pair.id,
-                group_defs=external_user_groups,
-                source=cc_pair.connector.source,
-            )
-            logger.info(
-                f"Synced {len(external_user_groups)} external user groups for {source_type}"
-            )
-
-            mark_cc_pair_as_external_group_synced(db_session, cc_pair.id)
-
            update_sync_record_status(
                db_session=db_session,
                entity_id=cc_pair_id,
@@ -483,6 +445,81 @@ def connector_external_group_sync_generator_task(
    )


+def _perform_external_group_sync(
+    cc_pair_id: int,
+    tenant_id: str,
+) -> None:
+    with get_session_with_current_tenant() as db_session:
+        cc_pair = get_connector_credential_pair_from_id(
+            db_session=db_session,
+            cc_pair_id=cc_pair_id,
+            eager_load_credential=True,
+        )
+        if cc_pair is None:
+            raise ValueError(f"No connector credential pair found for id: {cc_pair_id}")
+
+        source_type = cc_pair.connector.source
+        sync_config = get_source_perm_sync_config(source_type)
+        if sync_config is None:
+            msg = f"No sync config found for {source_type} for cc_pair: {cc_pair_id}"
+            emit_background_error(msg, cc_pair_id=cc_pair_id)
+            raise ValueError(msg)
+
+        if sync_config.group_sync_config is None:
+            msg = f"No group sync config found for {source_type} for cc_pair: {cc_pair_id}"
+            emit_background_error(msg, cc_pair_id=cc_pair_id)
+            raise ValueError(msg)
+
+        ext_group_sync_func = sync_config.group_sync_config.group_sync_func
+
+        logger.info(
+            f"Marking old external groups as stale for {source_type} for cc_pair: {cc_pair_id}"
+        )
+        mark_old_external_groups_as_stale(db_session, cc_pair_id)
+
+        logger.info(
+            f"Syncing external groups for {source_type} for cc_pair: {cc_pair_id}"
+        )
+        external_user_group_batch: list[ExternalUserGroup] = []
+        try:
+            external_user_group_generator = ext_group_sync_func(tenant_id, cc_pair)
+            for external_user_group in external_user_group_generator:
+                external_user_group_batch.append(external_user_group)
+                if len(external_user_group_batch) >= _EXTERNAL_GROUP_BATCH_SIZE:
+                    logger.debug(
+                        f"New external user groups: {external_user_group_batch}"
+                    )
+                    upsert_external_groups(
+                        db_session=db_session,
+                        cc_pair_id=cc_pair_id,
+                        external_groups=external_user_group_batch,
+                        source=cc_pair.connector.source,
+                    )
+                    external_user_group_batch = []
+
+            if external_user_group_batch:
+                logger.debug(f"New external user groups: {external_user_group_batch}")
+                upsert_external_groups(
+                    db_session=db_session,
+                    cc_pair_id=cc_pair_id,
+                    external_groups=external_user_group_batch,
+                    source=cc_pair.connector.source,
+                )
+        except Exception as e:
+            # TODO: add some notification to the admins here
+            logger.exception(
+                f"Error syncing external groups for {source_type} for cc_pair: {cc_pair_id} {e}"
+            )
+            raise e
+
+        logger.info(
+            f"Removing stale external groups for {source_type} for cc_pair: {cc_pair_id}"
+        )
+        remove_stale_external_groups(db_session, cc_pair_id)
+
+        mark_all_relevant_cc_pairs_as_external_group_synced(db_session, cc_pair)
+
+
 def validate_external_group_sync_fences(
    tenant_id: str,
    celery_app: Celery,
--- a/backend/ee/onyx/background/celery/tasks/tenant_provisioning/tasks.py
+++ b/backend/ee/onyx/background/celery/tasks/tenant_provisioning/tasks.py
@@ -14,13 +14,12 @@ from ee.onyx.server.tenants.provisioning import setup_tenant
 from ee.onyx.server.tenants.schema_management import create_schema_if_not_exists
 from ee.onyx.server.tenants.schema_management import get_current_alembic_version
 from onyx.background.celery.apps.app_base import task_logger
-from onyx.configs.app_configs import JOB_TIMEOUT
 from onyx.configs.app_configs import TARGET_AVAILABLE_TENANTS
-from onyx.configs.constants import OnyxCeleryPriority
+from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
 from onyx.configs.constants import OnyxCeleryQueues
 from onyx.configs.constants import OnyxCeleryTask
 from onyx.configs.constants import OnyxRedisLocks
-from onyx.db.engine import get_session_with_shared_schema
+from onyx.db.engine.sql_engine import get_session_with_shared_schema
 from onyx.db.models import AvailableTenant
 from onyx.redis.redis_pool import get_redis_client
 from shared_configs.configs import MULTI_TENANT
@@ -39,7 +38,8 @@ _TENANT_PROVISIONING_TIME_LIMIT = 60 * 10  # 10 minutes
    name=OnyxCeleryTask.CLOUD_CHECK_AVAILABLE_TENANTS,
    queue=OnyxCeleryQueues.MONITORING,
    ignore_result=True,
-    soft_time_limit=JOB_TIMEOUT,
+    soft_time_limit=_TENANT_PROVISIONING_SOFT_TIME_LIMIT,
+    time_limit=_TENANT_PROVISIONING_TIME_LIMIT,
    trail=False,
    bind=True,
 )
@@ -55,7 +55,7 @@ def check_available_tenants(self: Task) -> None:
        )
        return

-    r = get_redis_client()
+    r = get_redis_client(tenant_id=ONYX_CLOUD_TENANT_ID)
    lock_check: RedisLock = r.lock(
        OnyxRedisLocks.CHECK_AVAILABLE_TENANTS_LOCK,
        timeout=_TENANT_PROVISIONING_SOFT_TIME_LIMIT,
@@ -71,32 +71,28 @@ def check_available_tenants(self: Task) -> None:
    try:
        # Get the current count of available tenants
        with get_session_with_shared_schema() as db_session:
-            available_tenants_count = db_session.query(AvailableTenant).count()
+            num_available_tenants = db_session.query(AvailableTenant).count()

        # Get the target number of available tenants
-        target_available_tenants = getattr(
+        num_minimum_available_tenants = getattr(
            TARGET_AVAILABLE_TENANTS, "value", DEFAULT_TARGET_AVAILABLE_TENANTS
        )

        # Calculate how many new tenants we need to provision
-        tenants_to_provision = max(
-            0, target_available_tenants - available_tenants_count
-        )
+        if num_available_tenants < num_minimum_available_tenants:
+            tenants_to_provision = num_minimum_available_tenants - num_available_tenants
+        else:
+            tenants_to_provision = 0

        task_logger.info(
-            f"Available tenants: {available_tenants_count}, "
-            f"Target: {target_available_tenants}, "
+            f"Available tenants: {num_available_tenants}, "
+            f"Target minimum available tenants: {num_minimum_available_tenants}, "
            f"To provision: {tenants_to_provision}"
        )

-        # Trigger pre-provisioning tasks for each tenant needed
-        for _ in range(tenants_to_provision):
-            from celery import current_app
-
-            current_app.send_task(
-                OnyxCeleryTask.PRE_PROVISION_TENANT,
-                priority=OnyxCeleryPriority.LOW,
-            )
+        # just provision one tenant each time we run this ... increase if needed.
+        if tenants_to_provision > 0:
+            pre_provision_tenant()

    except Exception:
        task_logger.exception("Error in check_available_tenants task")
@@ -105,15 +101,7 @@ def check_available_tenants(self: Task) -> None:
        lock_check.release()


-@shared_task(
-    name=OnyxCeleryTask.PRE_PROVISION_TENANT,
-    ignore_result=True,
-    soft_time_limit=_TENANT_PROVISIONING_SOFT_TIME_LIMIT,
-    time_limit=_TENANT_PROVISIONING_TIME_LIMIT,
-    queue=OnyxCeleryQueues.MONITORING,
-    bind=True,
-)
-def pre_provision_tenant(self: Task) -> None:
+def pre_provision_tenant() -> None:
    """
    Pre-provision a new tenant and store it in the NewAvailableTenant table.
    This function fully sets up the tenant with all necessary configurations,
@@ -122,9 +110,9 @@ def pre_provision_tenant(self: Task) -> None:
    # The MULTI_TENANT check is now done at the caller level (check_available_tenants)
    # rather than inside this function

-    r = get_redis_client()
+    r = get_redis_client(tenant_id=ONYX_CLOUD_TENANT_ID)
    lock_provision: RedisLock = r.lock(
-        OnyxRedisLocks.PRE_PROVISION_TENANT_LOCK,
+        OnyxRedisLocks.CLOUD_PRE_PROVISION_TENANT_LOCK,
        timeout=_TENANT_PROVISIONING_SOFT_TIME_LIMIT,
    )

--- a/backend/ee/onyx/background/celery_utils.py
+++ b/backend/ee/onyx/background/celery_utils.py
@@ -9,7 +9,7 @@ logger = setup_logger()


 def should_perform_chat_ttl_check(
-    retention_limit_days: int | None, db_session: Session
+    retention_limit_days: float | None, db_session: Session
 ) -> bool:
    # TODO: make this a check for None and add behavior for 0 day TTL
    if not retention_limit_days:
--- a/backend/ee/onyx/background/task_name_builders.py
+++ b/backend/ee/onyx/background/task_name_builders.py
@@ -1,2 +1,16 @@
-def name_chat_ttl_task(retention_limit_days: int, tenant_id: str | None = None) -> str:
+from datetime import datetime
+
+from onyx.configs.constants import OnyxCeleryTask
+
+
+QUERY_HISTORY_TASK_NAME_PREFIX = OnyxCeleryTask.EXPORT_QUERY_HISTORY_TASK
+
+
+def name_chat_ttl_task(
+    retention_limit_days: float, tenant_id: str | None = None
+) -> str:
    return f"chat_ttl_{retention_limit_days}_days"
+
+
+def query_history_task_name(start: datetime, end: datetime) -> str:
+    return f"{QUERY_HISTORY_TASK_NAME_PREFIX}_{start}_{end}"
--- a/backend/ee/onyx/configs/app_configs.py
+++ b/backend/ee/onyx/configs/app_configs.py
@@ -25,13 +25,25 @@ SAML_CONF_DIR = os.environ.get("SAML_CONF_DIR") or "/app/ee/onyx/configs/saml_co
 #####
 # Auto Permission Sync
 #####
+# should generally only be used for sources that support polling of permissions
+# e.g. can pull in only permission changes rather than having to go through all
+# documents every time
 DEFAULT_PERMISSION_DOC_SYNC_FREQUENCY = int(
    os.environ.get("DEFAULT_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
 )

-# In seconds, default is 5 minutes
+
+#####
+# Confluence
+#####
+
+# In seconds, default is 30 minutes
 CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY = int(
-    os.environ.get("CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
+    os.environ.get("CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY") or 30 * 60
+)
+# In seconds, default is 30 minutes
+CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY = int(
+    os.environ.get("CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY") or 30 * 60
 )
 # This is a boolean that determines if anonymous access is public
 # Default behavior is to not make the page public and instead add a group
@@ -39,14 +51,79 @@ CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY = int(
 CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC = (
    os.environ.get("CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC", "").lower() == "true"
 )
+
+
+#####
+# JIRA
+#####
+
+# In seconds, default is 30 minutes
+JIRA_PERMISSION_DOC_SYNC_FREQUENCY = int(
+    os.environ.get("JIRA_PERMISSION_DOC_SYNC_FREQUENCY") or 30 * 60
+)
+
+
+#####
+# Google Drive
+#####
+GOOGLE_DRIVE_PERMISSION_GROUP_SYNC_FREQUENCY = int(
+    os.environ.get("GOOGLE_DRIVE_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
+)
+
+
+#####
+# GitHub
+#####
 # In seconds, default is 5 minutes
-CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY = int(
-    os.environ.get("CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
+GITHUB_PERMISSION_DOC_SYNC_FREQUENCY = int(
+    os.environ.get("GITHUB_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
+)
+# In seconds, default is 5 minutes
+GITHUB_PERMISSION_GROUP_SYNC_FREQUENCY = int(
+    os.environ.get("GITHUB_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
+)
+
+
+#####
+# Slack
+#####
+SLACK_PERMISSION_DOC_SYNC_FREQUENCY = int(
+    os.environ.get("SLACK_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
 )

 NUM_PERMISSION_WORKERS = int(os.environ.get("NUM_PERMISSION_WORKERS") or 2)


+#####
+# Teams
+#####
+# In seconds, default is 5 minutes
+TEAMS_PERMISSION_DOC_SYNC_FREQUENCY = int(
+    os.environ.get("TEAMS_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
+)
+
+#####
+# SharePoint
+#####
+# In seconds, default is 30 minutes
+SHAREPOINT_PERMISSION_DOC_SYNC_FREQUENCY = int(
+    os.environ.get("SHAREPOINT_PERMISSION_DOC_SYNC_FREQUENCY") or 30 * 60
+)
+
+# In seconds, default is 5 minutes
+SHAREPOINT_PERMISSION_GROUP_SYNC_FREQUENCY = int(
+    os.environ.get("SHAREPOINT_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
+)
+
+
+####
+# Celery Job Frequency
+####
+CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS = float(
+    os.environ.get("CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS") or 1
+)  # float for easier testing
+
+
 STRIPE_SECRET_KEY = os.environ.get("STRIPE_SECRET_KEY")
 STRIPE_PRICE_ID = os.environ.get("STRIPE_PRICE")

@@ -62,29 +139,6 @@ JWT_PUBLIC_KEY_URL: str | None = os.getenv("JWT_PUBLIC_KEY_URL", None)
 SUPER_USERS = json.loads(os.environ.get("SUPER_USERS", "[]"))
 SUPER_CLOUD_API_KEY = os.environ.get("SUPER_CLOUD_API_KEY", "api_key")

-OAUTH_SLACK_CLIENT_ID = os.environ.get("OAUTH_SLACK_CLIENT_ID", "")
-OAUTH_SLACK_CLIENT_SECRET = os.environ.get("OAUTH_SLACK_CLIENT_SECRET", "")
-OAUTH_CONFLUENCE_CLOUD_CLIENT_ID = os.environ.get(
-    "OAUTH_CONFLUENCE_CLOUD_CLIENT_ID", ""
-)
-OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET = os.environ.get(
-    "OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET", ""
-)
-OAUTH_JIRA_CLOUD_CLIENT_ID = os.environ.get("OAUTH_JIRA_CLOUD_CLIENT_ID", "")
-OAUTH_JIRA_CLOUD_CLIENT_SECRET = os.environ.get("OAUTH_JIRA_CLOUD_CLIENT_SECRET", "")
-OAUTH_GOOGLE_DRIVE_CLIENT_ID = os.environ.get("OAUTH_GOOGLE_DRIVE_CLIENT_ID", "")
-OAUTH_GOOGLE_DRIVE_CLIENT_SECRET = os.environ.get(
-    "OAUTH_GOOGLE_DRIVE_CLIENT_SECRET", ""
-)
-
-GOOGLE_DRIVE_PERMISSION_GROUP_SYNC_FREQUENCY = int(
-    os.environ.get("GOOGLE_DRIVE_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
-)
-
-SLACK_PERMISSION_DOC_SYNC_FREQUENCY = int(
-    os.environ.get("SLACK_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
-)
-
 # The posthog client does not accept empty API keys or hosts however it fails silently
 # when the capture is called. These defaults prevent Posthog issues from breaking the Onyx app
 POSTHOG_API_KEY = os.environ.get("POSTHOG_API_KEY") or "FooBar"
@@ -92,6 +146,4 @@ POSTHOG_HOST = os.environ.get("POSTHOG_HOST") or "https://us.i.posthog.com"

 HUBSPOT_TRACKING_URL = os.environ.get("HUBSPOT_TRACKING_URL")

-ANONYMOUS_USER_COOKIE_NAME = "onyx_anonymous_user"
-
 GATED_TENANTS_KEY = "gated_tenants"
--- a/backend/ee/onyx/connectors/perm_sync_valid.py
+++ b/backend/ee/onyx/connectors/perm_sync_valid.py
@@ -0,0 +1,28 @@
+from onyx.connectors.confluence.connector import ConfluenceConnector
+from onyx.connectors.google_drive.connector import GoogleDriveConnector
+from onyx.connectors.interfaces import BaseConnector
+
+
+def validate_confluence_perm_sync(connector: ConfluenceConnector) -> None:
+    """
+    Validate that the connector is configured correctly for permissions syncing.
+    """
+
+
+def validate_drive_perm_sync(connector: GoogleDriveConnector) -> None:
+    """
+    Validate that the connector is configured correctly for permissions syncing.
+    """
+
+
+def validate_perm_sync(connector: BaseConnector) -> None:
+    """
+    Dispatch to the connector-specific permission sync validator, if one exists.
+    A validator is expected to raise an exception if invalid, otherwise do nothing.
+
+    Connector types without a specific validator are a no-op (always successful).
+    """
+    if isinstance(connector, ConfluenceConnector):
+        validate_confluence_perm_sync(connector)
+    elif isinstance(connector, GoogleDriveConnector):
+        validate_drive_perm_sync(connector)
--- a/backend/ee/onyx/db/analytics.py
+++ b/backend/ee/onyx/db/analytics.py
@@ -140,7 +140,7 @@ def fetch_onyxbot_analytics(
                    (
                        or_(
                            ChatMessageFeedback.is_positive.is_(False),
-                            ChatMessageFeedback.required_followup,
+                            ChatMessageFeedback.required_followup.is_(True),
                        ),
                        1,
                    ),
@@ -173,7 +173,7 @@ def fetch_onyxbot_analytics(
        .all()
    )

-    return results
+    return [tuple(row) for row in results]


 def fetch_persona_message_analytics(
--- a/backend/ee/onyx/db/external_perm.py
+++ b/backend/ee/onyx/db/external_perm.py
@@ -4,10 +4,12 @@ from uuid import UUID
 from pydantic import BaseModel
 from sqlalchemy import delete
 from sqlalchemy import select
+from sqlalchemy import update
 from sqlalchemy.orm import Session

 from onyx.access.utils import build_ext_group_name_for_onyx
 from onyx.configs.constants import DocumentSource
+from onyx.db.models import PublicExternalUserGroup
 from onyx.db.models import User
 from onyx.db.models import User__ExternalUserGroupId
 from onyx.db.users import batch_add_ext_perm_user_if_not_exists
@@ -20,6 +22,12 @@ logger = setup_logger()
 class ExternalUserGroup(BaseModel):
    id: str
    user_emails: list[str]
+    # `True` for cases like a Folder in Google Drive that give domain-wide
+    # or "Anyone with link" access to all files in the folder.
+    # If this is set, the contents of `user_emails` don't really matter.
+    # When this is `True`, this `ExternalUserGroup` object doesn't really represent
+    # an actual "group" in the source.
+    gives_anyone_access: bool = False


 def delete_user__ext_group_for_user__no_commit(
@@ -44,20 +52,52 @@ def delete_user__ext_group_for_cc_pair__no_commit(
    )


-def replace_user__ext_group_for_cc_pair(
+def delete_public_external_group_for_cc_pair__no_commit(
    db_session: Session,
    cc_pair_id: int,
-    group_defs: list[ExternalUserGroup],
+) -> None:
+    db_session.execute(
+        delete(PublicExternalUserGroup).where(
+            PublicExternalUserGroup.cc_pair_id == cc_pair_id
+        )
+    )
+
+
+def mark_old_external_groups_as_stale(
+    db_session: Session,
+    cc_pair_id: int,
+) -> None:
+    db_session.execute(
+        update(User__ExternalUserGroupId)
+        .where(User__ExternalUserGroupId.cc_pair_id == cc_pair_id)
+        .values(stale=True)
+    )
+    db_session.execute(
+        update(PublicExternalUserGroup)
+        .where(PublicExternalUserGroup.cc_pair_id == cc_pair_id)
+        .values(stale=True)
+    )
+
+
+def upsert_external_groups(
+    db_session: Session,
+    cc_pair_id: int,
+    external_groups: list[ExternalUserGroup],
    source: DocumentSource,
 ) -> None:
    """
-    This function clears all existing external user group relations for a given cc_pair_id
-    and replaces them with the new group definitions and commits the changes.
+    Performs a true upsert operation for external user groups:
+    - For existing groups (same user_id, external_user_group_id, cc_pair_id), updates the stale flag to False
+    - For new groups, inserts them with stale=False
+    - For public groups, uses upsert logic as well
    """
+    # If there are no groups to add, return early
+    if not external_groups:
+        return

    # collect all emails from all groups to batch add all users at once for efficiency
    all_group_member_emails = set()
-    for external_group in group_defs:
+    for external_group in external_groups:
        for user_email in external_group.user_emails:
            all_group_member_emails.add(user_email)

@@ -68,17 +108,17 @@ def replace_user__ext_group_for_cc_pair(
        emails=list(all_group_member_emails),
    )

-    delete_user__ext_group_for_cc_pair__no_commit(
-        db_session=db_session,
-        cc_pair_id=cc_pair_id,
-    )
-
    # map emails to ids
-    email_id_map = {user.email: user.id for user in all_group_members}
+    email_id_map = {user.email.lower(): user.id for user in all_group_members}

-    # use these ids to create new external user group relations relating group_id to user_ids
-    new_external_permissions = []
-    for external_group in group_defs:
+    # Process each external group
+    for external_group in external_groups:
+        external_group_id = build_ext_group_name_for_onyx(
+            ext_group_name=external_group.id,
+            source=source,
+        )
+
+        # Handle user-group mappings
        for user_email in external_group.user_emails:
            user_id = email_id_map.get(user_email.lower())
            if user_id is None:
@@ -87,19 +127,71 @@ def replace_user__ext_group_for_cc_pair(
                    f" with email {user_email} not found"
                )
                continue
-            external_group_id = build_ext_group_name_for_onyx(
-                ext_group_name=external_group.id,
-                source=source,
-            )
-            new_external_permissions.append(
-                User__ExternalUserGroupId(
-                    user_id=user_id,
-                    external_user_group_id=external_group_id,
-                    cc_pair_id=cc_pair_id,
+
+            # Check if the user-group mapping already exists
+            existing_user_group = db_session.scalar(
+                select(User__ExternalUserGroupId).where(
+                    User__ExternalUserGroupId.user_id == user_id,
+                    User__ExternalUserGroupId.external_user_group_id
+                    == external_group_id,
+                    User__ExternalUserGroupId.cc_pair_id == cc_pair_id,
                )
            )

-    db_session.add_all(new_external_permissions)
+            if existing_user_group:
+                # Update existing record
+                existing_user_group.stale = False
+            else:
+                # Insert new record
+                new_user_group = User__ExternalUserGroupId(
+                    user_id=user_id,
+                    external_user_group_id=external_group_id,
+                    cc_pair_id=cc_pair_id,
+                    stale=False,
+                )
+                db_session.add(new_user_group)
+
+        # Handle public group if needed
+        if external_group.gives_anyone_access:
+            # Check if the public group already exists
+            existing_public_group = db_session.scalar(
+                select(PublicExternalUserGroup).where(
+                    PublicExternalUserGroup.external_user_group_id == external_group_id,
+                    PublicExternalUserGroup.cc_pair_id == cc_pair_id,
+                )
+            )
+
+            if existing_public_group:
+                # Update existing record
+                existing_public_group.stale = False
+            else:
+                # Insert new record
+                new_public_group = PublicExternalUserGroup(
+                    external_user_group_id=external_group_id,
+                    cc_pair_id=cc_pair_id,
+                    stale=False,
+                )
+                db_session.add(new_public_group)
+
+    db_session.commit()
+
+
+def remove_stale_external_groups(
+    db_session: Session,
+    cc_pair_id: int,
+) -> None:
+    db_session.execute(
+        delete(User__ExternalUserGroupId).where(
+            User__ExternalUserGroupId.cc_pair_id == cc_pair_id,
+            User__ExternalUserGroupId.stale.is_(True),
+        )
+    )
+    db_session.execute(
+        delete(PublicExternalUserGroup).where(
+            PublicExternalUserGroup.cc_pair_id == cc_pair_id,
+            PublicExternalUserGroup.stale.is_(True),
+        )
+    )
    db_session.commit()


@@ -130,3 +222,11 @@ def fetch_external_groups_for_user_email_and_group_ids(
        )
    ).all()
    return list(user_ext_groups)
+
+
+def fetch_public_external_group_ids(
+    db_session: Session,
+) -> list[str]:
+    return list(
+        db_session.scalars(select(PublicExternalUserGroup.external_user_group_id)).all()
+    )
--- a/backend/ee/onyx/db/persona.py
+++ b/backend/ee/onyx/db/persona.py
@@ -11,6 +11,7 @@ from onyx.server.features.persona.models import PersonaSharedNotificationData

 def make_persona_private(
    persona_id: int,
+    creator_user_id: UUID | None,
    user_ids: list[UUID] | None,
    group_ids: list[int] | None,
    db_session: Session,
@@ -29,15 +30,15 @@ def make_persona_private(
        user_ids_set = set(user_ids)
        for user_id in user_ids_set:
            db_session.add(Persona__User(persona_id=persona_id, user_id=user_id))
-
-            create_notification(
-                user_id=user_id,
-                notif_type=NotificationType.PERSONA_SHARED,
-                db_session=db_session,
-                additional_data=PersonaSharedNotificationData(
-                    persona_id=persona_id,
-                ).model_dump(),
-            )
+            if user_id != creator_user_id:
+                create_notification(
+                    user_id=user_id,
+                    notif_type=NotificationType.PERSONA_SHARED,
+                    db_session=db_session,
+                    additional_data=PersonaSharedNotificationData(
+                        persona_id=persona_id,
+                    ).model_dump(),
+                )

    if group_ids:
        group_ids_set = set(group_ids)
--- a/backend/ee/onyx/db/query_history.py
+++ b/backend/ee/onyx/db/query_history.py
@@ -15,10 +15,13 @@ from sqlalchemy.sql import select
 from sqlalchemy.sql.expression import literal
 from sqlalchemy.sql.expression import UnaryExpression

+from ee.onyx.background.task_name_builders import QUERY_HISTORY_TASK_NAME_PREFIX
 from onyx.configs.constants import QAFeedbackType
 from onyx.db.models import ChatMessage
 from onyx.db.models import ChatMessageFeedback
 from onyx.db.models import ChatSession
+from onyx.db.models import TaskQueueState
+from onyx.db.tasks import get_all_tasks_with_prefix


 def _build_filter_conditions(
@@ -171,3 +174,9 @@ def fetch_chat_sessions_eagerly_by_time(
    chat_sessions = query.all()

    return chat_sessions
+
+
+def get_all_query_history_export_tasks(
+    db_session: Session,
+) -> list[TaskQueueState]:
+    return get_all_tasks_with_prefix(db_session, QUERY_HISTORY_TASK_NAME_PREFIX)
--- a/backend/ee/onyx/db/usage_export.py
+++ b/backend/ee/onyx/db/usage_export.py
@@ -5,6 +5,8 @@ from typing import IO
 from typing import Optional

 from fastapi_users_db_sqlalchemy import UUID_ID
+from sqlalchemy import cast
+from sqlalchemy.dialects.postgresql import UUID
 from sqlalchemy.orm import Session

 from ee.onyx.db.query_history import fetch_chat_sessions_eagerly_by_time
@@ -13,6 +15,7 @@ from ee.onyx.server.reporting.usage_export_models import FlowType
 from ee.onyx.server.reporting.usage_export_models import UsageReportMetadata
 from onyx.configs.constants import MessageType
 from onyx.db.models import UsageReport
+from onyx.db.models import User
 from onyx.file_store.file_store import get_default_file_store


@@ -86,25 +89,49 @@ def get_all_empty_chat_message_entries(


 def get_all_usage_reports(db_session: Session) -> list[UsageReportMetadata]:
+    # Get the user emails
+    usage_reports = db_session.query(UsageReport).all()
+    user_ids = {r.requestor_user_id for r in usage_reports if r.requestor_user_id}
+    user_emails = {
+        user.id: user.email
+        for user in db_session.query(User)
+        .filter(cast(User.id, UUID).in_(user_ids))
+        .all()
+    }
+
    return [
        UsageReportMetadata(
            report_name=r.report_name,
-            requestor=str(r.requestor_user_id) if r.requestor_user_id else None,
+            requestor=(
+                user_emails.get(r.requestor_user_id) if r.requestor_user_id else None
+            ),
            time_created=r.time_created,
            period_from=r.period_from,
            period_to=r.period_to,
        )
-        for r in db_session.query(UsageReport).all()
+        for r in usage_reports
    ]


 def get_usage_report_data(
-    db_session: Session,
-    report_name: str,
+    report_display_name: str,
 ) -> IO:
-    file_store = get_default_file_store(db_session)
+    """
+    Get the usage report data from the file store.
+
+    Args:
+        report_display_name: The display name of the usage report. The file
+                             is assumed to be stored in the file store with
+                             this display name as its file ID.
+
+    Returns:
+        The usage report data.
+    """
+    file_store = get_default_file_store()
    # usage report may be very large, so don't load it all into memory
-    return file_store.read_file(file_name=report_name, mode="b", use_tempfile=True)
+    return file_store.read_file(
+        file_id=report_display_name, mode="b", use_tempfile=True
+    )


 def write_usage_report(
--- a/backend/ee/onyx/db/user_group.py
+++ b/backend/ee/onyx/db/user_group.py
@@ -128,11 +128,14 @@ def validate_object_creation_for_user(
    target_group_ids: list[int] | None = None,
    object_is_public: bool | None = None,
    object_is_perm_sync: bool | None = None,
+    object_is_owned_by_user: bool = False,
+    object_is_new: bool = False,
 ) -> None:
    """
    All users can create/edit permission synced objects if they don't specify a group
    All admin actions are allowed.
-    Prevents non-admins from creating/editing:
+    Curators and global curators can create/edit public objects that are new or owned by them.
+    Prevents other non-admins from creating/editing:
    - public objects
    - objects with no groups
    - objects that belong to a group they don't curate
@@ -143,13 +146,23 @@ def validate_object_creation_for_user(
    if not user or user.role == UserRole.ADMIN:
        return

-    if object_is_public:
-        detail = "User does not have permission to create public credentials"
+    # Allow curators and global curators to create public objects
+    # w/o associated groups IF the object is new/owned by them
+    if (
+        object_is_public
+        and user.role in [UserRole.CURATOR, UserRole.GLOBAL_CURATOR]
+        and (object_is_new or object_is_owned_by_user)
+    ):
+        return
+
+    if object_is_public and user.role == UserRole.BASIC:
+        detail = "User does not have permission to create public objects"
        logger.error(detail)
        raise HTTPException(
            status_code=400,
            detail=detail,
        )
+
    if not target_group_ids:
        detail = "Curators must specify 1+ groups"
        logger.error(detail)
--- a/backend/ee/onyx/document_index/vespa/app_config/cloud-services.xml.jinja
+++ b/backend/ee/onyx/document_index/vespa/app_config/cloud-services.xml.jinja
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="utf-8"?>
+<services version="1.0">
+    <container id="default" version="1.0">
+        <document-api />
+        <search />
+        <http>
+            <server id="default" port="4080" />
+        </http>
+        <nodes count="[2, 4]">
+            <resources vcpu="4.0" memory="16Gb" architecture="arm64" storage-type="remote"
+                disk="48Gb" />
+        </nodes>
+
+
+    </container>
+    <content id="danswer_index" version="1.0">
+        <documents>
+            <!-- <document type="danswer_chunk" mode="index" /> -->
+{{ document_elements }}
+        </documents>
+        <nodes count="60">
+            <resources vcpu="8.0" memory="128.0Gb" architecture="arm64" storage-type="local"
+                disk="475.0Gb" />
+        </nodes>
+        <engine>
+            <proton>
+                <tuning>
+                    <searchnode>
+                        <requestthreads>
+                            <persearch>2</persearch>
+                        </requestthreads>
+                    </searchnode>
+                </tuning>
+            </proton>
+        </engine>
+
+        <config name="vespa.config.search.summary.juniperrc">
+            <max_matches>3</max_matches>
+            <length>750</length>
+            <surround_max>350</surround_max>
+            <min_length>300</min_length>
+        </config>
+
+
+        <min-redundancy>2</min-redundancy>
+
+    </content>
+</services>
--- a/backend/ee/onyx/external_permissions/confluence/constants.py
+++ b/backend/ee/onyx/external_permissions/confluence/constants.py
@@ -2,3 +2,6 @@
 # Instead of setting a page to public, we just add this group so that the page
 # is only accessible to users who have confluence accounts.
 ALL_CONF_EMAILS_GROUP_NAME = "All_Confluence_Users_Found_By_Onyx"
+
+VIEWSPACE_PERMISSION_TYPE = "VIEWSPACE"
+REQUEST_PAGINATION_LIMIT = 5000
--- a/backend/ee/onyx/external_permissions/confluence/doc_sync.py
+++ b/backend/ee/onyx/external_permissions/confluence/doc_sync.py
@@ -4,19 +4,14 @@ https://confluence.atlassian.com/conf85/check-who-can-view-a-page-1283360557.htm
 """

 from collections.abc import Generator
-from typing import Any

-from ee.onyx.configs.app_configs import CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC
-from ee.onyx.external_permissions.confluence.constants import ALL_CONF_EMAILS_GROUP_NAME
+from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
+from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
+from ee.onyx.external_permissions.utils import generic_doc_sync
 from onyx.access.models import DocExternalAccess
-from onyx.access.models import ExternalAccess
+from onyx.configs.constants import DocumentSource
 from onyx.connectors.confluence.connector import ConfluenceConnector
-from onyx.connectors.confluence.onyx_confluence import (
-    get_user_email_from_username__server,
-)
-from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
 from onyx.connectors.credentials_provider import OnyxDBCredentialsProvider
-from onyx.connectors.models import SlimDocument
 from onyx.db.models import ConnectorCredentialPair
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from onyx.utils.logger import setup_logger
@@ -24,336 +19,21 @@ from shared_configs.contextvars import get_current_tenant_id

 logger = setup_logger()

-_VIEWSPACE_PERMISSION_TYPE = "VIEWSPACE"
-_REQUEST_PAGINATION_LIMIT = 5000

-
-def _get_server_space_permissions(
-    confluence_client: OnyxConfluence, space_key: str
-) -> ExternalAccess:
-    space_permissions = confluence_client.get_all_space_permissions_server(
-        space_key=space_key
-    )
-
-    viewspace_permissions = []
-    for permission_category in space_permissions:
-        if permission_category.get("type") == _VIEWSPACE_PERMISSION_TYPE:
-            viewspace_permissions.extend(
-                permission_category.get("spacePermissions", [])
-            )
-
-    is_public = False
-    user_names = set()
-    group_names = set()
-    for permission in viewspace_permissions:
-        user_name = permission.get("userName")
-        if user_name:
-            user_names.add(user_name)
-        group_name = permission.get("groupName")
-        if group_name:
-            group_names.add(group_name)
-
-        # It seems that if anonymous access is turned on for the site and space,
-        # then the space is publicly accessible.
-        # For confluence server, we make a group that contains all users
-        # that exist in confluence and then just add that group to the space permissions
-        # if anonymous access is turned on for the site and space or we set is_public = True
-        # if they set the env variable CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC to True so
-        # that we can support confluence server deployments that want anonymous access
-        # to be public (we cant test this because its paywalled)
-        if user_name is None and group_name is None:
-            # Defaults to False
-            if CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC:
-                is_public = True
-            else:
-                group_names.add(ALL_CONF_EMAILS_GROUP_NAME)
-
-    user_emails = set()
-    for user_name in user_names:
-        user_email = get_user_email_from_username__server(confluence_client, user_name)
-        if user_email:
-            user_emails.add(user_email)
-        else:
-            logger.warning(f"Email for user {user_name} not found in Confluence")
-
-    if not user_emails and not group_names:
-        logger.warning(
-            "No user emails or group names found in Confluence space permissions"
-            f"\nSpace key: {space_key}"
-            f"\nSpace permissions: {space_permissions}"
-        )
-
-    return ExternalAccess(
-        external_user_emails=user_emails,
-        external_user_group_ids=group_names,
-        is_public=is_public,
-    )
-
-
-def _get_cloud_space_permissions(
-    confluence_client: OnyxConfluence, space_key: str
-) -> ExternalAccess:
-    space_permissions_result = confluence_client.get_space(
-        space_key=space_key, expand="permissions"
-    )
-    space_permissions = space_permissions_result.get("permissions", [])
-
-    user_emails = set()
-    group_names = set()
-    is_externally_public = False
-    for permission in space_permissions:
-        subs = permission.get("subjects")
-        if subs:
-            # If there are subjects, then there are explicit users or groups with access
-            if email := subs.get("user", {}).get("results", [{}])[0].get("email"):
-                user_emails.add(email)
-            if group_name := subs.get("group", {}).get("results", [{}])[0].get("name"):
-                group_names.add(group_name)
-        else:
-            # If there are no subjects, then the permission is for everyone
-            if permission.get("operation", {}).get(
-                "operation"
-            ) == "read" and permission.get("anonymousAccess", False):
-                # If the permission specifies read access for anonymous users, then
-                # the space is publicly accessible
-                is_externally_public = True
-
-    return ExternalAccess(
-        external_user_emails=user_emails,
-        external_user_group_ids=group_names,
-        is_public=is_externally_public,
-    )
-
-
-def _get_space_permissions(
-    confluence_client: OnyxConfluence,
-    is_cloud: bool,
-) -> dict[str, ExternalAccess]:
-    logger.debug("Getting space permissions")
-    # Gets all the spaces in the Confluence instance
-    all_space_keys = []
-    start = 0
-    while True:
-        spaces_batch = confluence_client.get_all_spaces(
-            start=start, limit=_REQUEST_PAGINATION_LIMIT
-        )
-        for space in spaces_batch.get("results", []):
-            all_space_keys.append(space.get("key"))
-
-        if len(spaces_batch.get("results", [])) < _REQUEST_PAGINATION_LIMIT:
-            break
-
-        start += len(spaces_batch.get("results", []))
-
-    # Gets the permissions for each space
-    logger.debug(f"Got {len(all_space_keys)} spaces from confluence")
-    space_permissions_by_space_key: dict[str, ExternalAccess] = {}
-    for space_key in all_space_keys:
-        if is_cloud:
-            space_permissions = _get_cloud_space_permissions(
-                confluence_client=confluence_client, space_key=space_key
-            )
-        else:
-            space_permissions = _get_server_space_permissions(
-                confluence_client=confluence_client, space_key=space_key
-            )
-
-        # Stores the permissions for each space
-        space_permissions_by_space_key[space_key] = space_permissions
-        logger.info(
-            f"Found space permissions for space '{space_key}': {space_permissions}"
-        )
-
-    return space_permissions_by_space_key
-
-
-def _extract_read_access_restrictions(
-    confluence_client: OnyxConfluence, restrictions: dict[str, Any]
-) -> tuple[set[str], set[str]]:
-    """
-    Converts a page's restrictions dict into an ExternalAccess object.
-    If there are no restrictions, then return None
-    """
-    read_access = restrictions.get("read", {})
-    read_access_restrictions = read_access.get("restrictions", {})
-
-    # Extract the users with read access
-    read_access_user = read_access_restrictions.get("user", {})
-    read_access_user_jsons = read_access_user.get("results", [])
-    read_access_user_emails = []
-    for user in read_access_user_jsons:
-        # If the user has an email, then add it to the list
-        if user.get("email"):
-            read_access_user_emails.append(user["email"])
-        # If the user has a username and not an email, then get the email from Confluence
-        elif user.get("username"):
-            email = get_user_email_from_username__server(
-                confluence_client=confluence_client, user_name=user["username"]
-            )
-            if email:
-                read_access_user_emails.append(email)
-            else:
-                logger.warning(
-                    f"Email for user {user['username']} not found in Confluence"
-                )
-        else:
-            if user.get("email") is not None:
-                logger.warning(f"Cant find email for user {user.get('displayName')}")
-                logger.warning(
-                    "This user needs to make their email accessible in Confluence Settings"
-                )
-
-            logger.warning(f"no user email or username for {user}")
-
-    # Extract the groups with read access
-    read_access_group = read_access_restrictions.get("group", {})
-    read_access_group_jsons = read_access_group.get("results", [])
-    read_access_group_names = [
-        group["name"] for group in read_access_group_jsons if group.get("name")
-    ]
-
-    return set(read_access_user_emails), set(read_access_group_names)
-
-
-def _get_all_page_restrictions(
-    confluence_client: OnyxConfluence,
-    perm_sync_data: dict[str, Any],
-) -> ExternalAccess | None:
-    """
-    This function gets the restrictions for a page by taking the intersection
-    of the page's restrictions and the restrictions of all the ancestors
-    of the page.
-    If the page/ancestor has no restrictions, then it is ignored (no intersection).
-    If no restrictions are found anywhere, then return None, indicating that the page
-    should inherit the space's restrictions.
-    """
-    found_user_emails: set[str] = set()
-    found_group_names: set[str] = set()
-
-    found_user_emails, found_group_names = _extract_read_access_restrictions(
-        confluence_client=confluence_client,
-        restrictions=perm_sync_data.get("restrictions", {}),
-    )
-
-    ancestors: list[dict[str, Any]] = perm_sync_data.get("ancestors", [])
-    for ancestor in ancestors:
-        ancestor_user_emails, ancestor_group_names = _extract_read_access_restrictions(
-            confluence_client=confluence_client,
-            restrictions=ancestor.get("restrictions", {}),
-        )
-        if not ancestor_user_emails and not ancestor_group_names:
-            # This ancestor has no restrictions, so it has no effect on
-            # the page's restrictions, so we ignore it
-            continue
-
-        found_user_emails.intersection_update(ancestor_user_emails)
-        found_group_names.intersection_update(ancestor_group_names)
-
-    # If there are no restrictions found, then the page
-    # inherits the space's restrictions so return None
-    if not found_user_emails and not found_group_names:
-        return None
-
-    return ExternalAccess(
-        external_user_emails=found_user_emails,
-        external_user_group_ids=found_group_names,
-        # there is no way for a page to be individually public if the space isn't public
-        is_public=False,
-    )
-
-
-def _fetch_all_page_restrictions(
-    confluence_client: OnyxConfluence,
-    slim_docs: list[SlimDocument],
-    space_permissions_by_space_key: dict[str, ExternalAccess],
-    is_cloud: bool,
-    callback: IndexingHeartbeatInterface | None,
-) -> Generator[DocExternalAccess, None, None]:
-    """
-    For all pages, if a page has restrictions, then use those restrictions.
-    Otherwise, use the space's restrictions.
-    """
-    for slim_doc in slim_docs:
-        if callback:
-            if callback.should_stop():
-                raise RuntimeError("confluence_doc_sync: Stop signal detected")
-
-            callback.progress("confluence_doc_sync:fetch_all_page_restrictions", 1)
-
-        if slim_doc.perm_sync_data is None:
-            raise ValueError(
-                f"No permission sync data found for document {slim_doc.id}"
-            )
-
-        if restrictions := _get_all_page_restrictions(
-            confluence_client=confluence_client,
-            perm_sync_data=slim_doc.perm_sync_data,
-        ):
-            yield DocExternalAccess(
-                doc_id=slim_doc.id,
-                external_access=restrictions,
-            )
-            # If there are restrictions, then we don't need to use the space's restrictions
-            continue
-
-        space_key = slim_doc.perm_sync_data.get("space_key")
-        if not (space_permissions := space_permissions_by_space_key.get(space_key)):
-            logger.debug(
-                f"Individually fetching space permissions for space {space_key}"
-            )
-            try:
-                # If the space permissions are not in the cache, then fetch them
-                if is_cloud:
-                    retrieved_space_permissions = _get_cloud_space_permissions(
-                        confluence_client=confluence_client, space_key=space_key
-                    )
-                else:
-                    retrieved_space_permissions = _get_server_space_permissions(
-                        confluence_client=confluence_client, space_key=space_key
-                    )
-                space_permissions_by_space_key[space_key] = retrieved_space_permissions
-                space_permissions = retrieved_space_permissions
-            except Exception as e:
-                logger.warning(
-                    f"Error fetching space permissions for space {space_key}: {e}"
-                )
-
-        if not space_permissions:
-            logger.warning(
-                f"No permissions found for document {slim_doc.id} in space {space_key}"
-            )
-            continue
-
-        # If there are no restrictions, then use the space's restrictions
-        yield DocExternalAccess(
-            doc_id=slim_doc.id,
-            external_access=space_permissions,
-        )
-        if (
-            not space_permissions.is_public
-            and not space_permissions.external_user_emails
-            and not space_permissions.external_user_group_ids
-        ):
-            logger.warning(
-                f"Permissions are empty for document: {slim_doc.id}\n"
-                "This means space permissions are may be wrong for"
-                f" Space key: {space_key}"
-            )
-
-    logger.debug("Finished fetching all page restrictions for space")
+CONFLUENCE_DOC_SYNC_LABEL = "confluence_doc_sync"


 def confluence_doc_sync(
    cc_pair: ConnectorCredentialPair,
+    fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
+    fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
    callback: IndexingHeartbeatInterface | None,
 ) -> Generator[DocExternalAccess, None, None]:
    """
-    Adds the external permissions to the documents in postgres
-    if the document doesn't already exists in postgres, we create
-    it in postgres so that when it gets created later, the permissions are
-    already populated
+    Fetches document permissions from Confluence and yields DocExternalAccess objects.
+    Compares fetched documents against existing documents in the DB for the connector.
+    If a document exists in the DB but not in the Confluence fetch, it's marked as restricted.
    """
-    logger.debug("Starting confluence doc sync")
    confluence_connector = ConfluenceConnector(
        **cc_pair.connector.connector_specific_config
    )
@@ -363,32 +43,11 @@ def confluence_doc_sync(
    )
    confluence_connector.set_credentials_provider(provider)

-    is_cloud = cc_pair.connector.connector_specific_config.get("is_cloud", False)
-
-    space_permissions_by_space_key = _get_space_permissions(
-        confluence_client=confluence_connector.confluence_client,
-        is_cloud=is_cloud,
-    )
-
-    slim_docs = []
-    logger.debug("Fetching all slim documents from confluence")
-    for doc_batch in confluence_connector.retrieve_all_slim_documents(
-        callback=callback
-    ):
-        logger.debug(f"Got {len(doc_batch)} slim documents from confluence")
-        if callback:
-            if callback.should_stop():
-                raise RuntimeError("confluence_doc_sync: Stop signal detected")
-
-            callback.progress("confluence_doc_sync", 1)
-
-        slim_docs.extend(doc_batch)
-
-    logger.debug("Fetching all page restrictions for space")
-    yield from _fetch_all_page_restrictions(
-        confluence_client=confluence_connector.confluence_client,
-        slim_docs=slim_docs,
-        space_permissions_by_space_key=space_permissions_by_space_key,
-        is_cloud=is_cloud,
+    yield from generic_doc_sync(
+        cc_pair=cc_pair,
+        fetch_all_existing_docs_ids_fn=fetch_all_existing_docs_ids_fn,
        callback=callback,
+        doc_source=DocumentSource.CONFLUENCE,
+        slim_connector=confluence_connector,
+        label=CONFLUENCE_DOC_SYNC_LABEL,
    )
--- a/backend/ee/onyx/external_permissions/confluence/group_sync.py
+++ b/backend/ee/onyx/external_permissions/confluence/group_sync.py
@@ -1,3 +1,5 @@
+from collections.abc import Generator
+
 from ee.onyx.db.external_perm import ExternalUserGroup
 from ee.onyx.external_permissions.confluence.constants import ALL_CONF_EMAILS_GROUP_NAME
 from onyx.background.error_logging import emit_background_error
@@ -65,7 +67,7 @@ def _build_group_member_email_map(
 def confluence_group_sync(
    tenant_id: str,
    cc_pair: ConnectorCredentialPair,
-) -> list[ExternalUserGroup]:
+) -> Generator[ExternalUserGroup, None, None]:
    provider = OnyxDBCredentialsProvider(tenant_id, "confluence", cc_pair.credential_id)
    is_cloud = cc_pair.connector.connector_specific_config.get("is_cloud", False)
    wiki_base: str = cc_pair.connector.connector_specific_config["wiki_base"]
@@ -89,10 +91,10 @@ def confluence_group_sync(
        confluence_client=confluence_client,
        cc_pair_id=cc_pair.id,
    )
-    onyx_groups: list[ExternalUserGroup] = []
+
    all_found_emails = set()
    for group_id, group_member_emails in group_member_email_map.items():
-        onyx_groups.append(
+        yield (
            ExternalUserGroup(
                id=group_id,
                user_emails=list(group_member_emails),
@@ -107,6 +109,4 @@ def confluence_group_sync(
            id=ALL_CONF_EMAILS_GROUP_NAME,
            user_emails=list(all_found_emails),
        )
-        onyx_groups.append(all_found_group)
-
-    return onyx_groups
+        yield all_found_group
--- a/backend/ee/onyx/external_permissions/confluence/page_access.py
+++ b/backend/ee/onyx/external_permissions/confluence/page_access.py
@@ -0,0 +1,133 @@
+from typing import Any
+
+from onyx.access.models import ExternalAccess
+from onyx.connectors.confluence.onyx_confluence import (
+    get_user_email_from_username__server,
+)
+from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def _extract_read_access_restrictions(
+    confluence_client: OnyxConfluence, restrictions: dict[str, Any]
+) -> tuple[set[str], set[str], bool]:
+    """
+    Extracts the "read" entries of a restrictions dict. Returns a tuple of
+    (user emails, group names, whether any user or group entry was present).
+    """
+    read_access = restrictions.get("read", {})
+    read_access_restrictions = read_access.get("restrictions", {})
+
+    # Extract the users with read access
+    read_access_user = read_access_restrictions.get("user", {})
+    read_access_user_jsons = read_access_user.get("results", [])
+    # any items found means that there is a restriction
+    found_any_restriction = bool(read_access_user_jsons)
+
+    read_access_user_emails = []
+    for user in read_access_user_jsons:
+        # If the user has an email, then add it to the list
+        if user.get("email"):
+            read_access_user_emails.append(user["email"])
+        # If the user has a username and not an email, then get the email from Confluence
+        elif user.get("username"):
+            email = get_user_email_from_username__server(
+                confluence_client=confluence_client, user_name=user["username"]
+            )
+            if email:
+                read_access_user_emails.append(email)
+            else:
+                logger.warning(
+                    f"Email for user {user['username']} not found in Confluence"
+                )
+        else:
+            if user.get("email") is not None:  # email key present but value is empty
+                logger.warning(f"Cant find email for user {user.get('displayName')}")
+                logger.warning(
+                    "This user needs to make their email accessible in Confluence Settings"
+                )
+
+            logger.warning(f"no user email or username for {user}")
+
+    # Extract the groups with read access
+    read_access_group = read_access_restrictions.get("group", {})
+    read_access_group_jsons = read_access_group.get("results", [])
+    # any items found means that there is a restriction
+    found_any_restriction |= bool(read_access_group_jsons)
+    read_access_group_names = [
+        group["name"] for group in read_access_group_jsons if group.get("name")
+    ]
+
+    return (
+        set(read_access_user_emails),
+        set(read_access_group_names),
+        found_any_restriction,
+    )
+
+
+def get_page_restrictions(
+    confluence_client: OnyxConfluence,
+    page_id: str,
+    page_restrictions: dict[str, Any],
+    ancestors: list[dict[str, Any]],
+) -> ExternalAccess | None:
+    """
+    This function gets the restrictions for a page. In Confluence, a child can have
+    at MOST the same level accessibility as its immediate parent.
+
+    If no restrictions are found anywhere, then return None, indicating that the page
+    should inherit the space's restrictions.
+    """
+    found_user_emails: set[str] = set()
+    found_group_names: set[str] = set()
+
+    # NOTE: need the found_any_restriction, since we can find restrictions
+    # but not be able to extract any user emails or group names
+    # in this case, we should just give no access
+    found_user_emails, found_group_names, found_any_page_level_restriction = (
+        _extract_read_access_restrictions(
+            confluence_client=confluence_client,
+            restrictions=page_restrictions,
+        )
+    )
+    # if there are individual page-level restrictions, then this is the accurate
+    # restriction for the page. You cannot both have page-level restrictions AND
+    # inherit restrictions from the parent.
+    if found_any_page_level_restriction:
+        return ExternalAccess(
+            external_user_emails=found_user_emails,
+            external_user_group_ids=found_group_names,
+            is_public=False,
+        )
+
+    # ancestors seem to be in order from root to immediate parent
+    # https://community.atlassian.com/forums/Confluence-questions/Order-of-ancestors-in-REST-API-response-Confluence-Server-amp/qaq-p/2385981
+    # we want the restrictions from the immediate parent to take precedence, so we should
+    # reverse the list
+    for ancestor in reversed(ancestors):
+        (
+            ancestor_user_emails,
+            ancestor_group_names,
+            found_any_restrictions_in_ancestor,
+        ) = _extract_read_access_restrictions(
+            confluence_client=confluence_client,
+            restrictions=ancestor.get("restrictions", {}),
+        )
+        if found_any_restrictions_in_ancestor:
+            # if inheriting restrictions from the parent, then the first one we run into
+            # should be applied (the reason why we'd traverse more than one ancestor is if
+            # the ancestor also is in "inherit" mode.)
+            logger.debug(
+                f"Found user restrictions {ancestor_user_emails} and group restrictions {ancestor_group_names}"
+                f" for document {page_id} based on ancestor {ancestor}"
+            )
+            return ExternalAccess(
+                external_user_emails=ancestor_user_emails,
+                external_user_group_ids=ancestor_group_names,
+                is_public=False,
+            )
+
+    # we didn't find any restrictions, so the page inherits the space's restrictions
+    return None
--- a/backend/ee/onyx/external_permissions/confluence/space_access.py
+++ b/backend/ee/onyx/external_permissions/confluence/space_access.py
@@ -0,0 +1,165 @@
+from ee.onyx.configs.app_configs import CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC
+from ee.onyx.external_permissions.confluence.constants import ALL_CONF_EMAILS_GROUP_NAME
+from ee.onyx.external_permissions.confluence.constants import REQUEST_PAGINATION_LIMIT
+from ee.onyx.external_permissions.confluence.constants import VIEWSPACE_PERMISSION_TYPE
+from onyx.access.models import ExternalAccess
+from onyx.connectors.confluence.onyx_confluence import (
+    get_user_email_from_username__server,
+)
+from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
+from onyx.utils.logger import setup_logger
+
+
+logger = setup_logger()
+
+
+def _get_server_space_permissions(
+    confluence_client: OnyxConfluence, space_key: str
+) -> ExternalAccess:
+    space_permissions = confluence_client.get_all_space_permissions_server(
+        space_key=space_key
+    )
+
+    viewspace_permissions = []
+    for permission_category in space_permissions:
+        if permission_category.get("type") == VIEWSPACE_PERMISSION_TYPE:
+            viewspace_permissions.extend(
+                permission_category.get("spacePermissions", [])
+            )
+
+    is_public = False
+    user_names = set()
+    group_names = set()
+    for permission in viewspace_permissions:
+        if user_name := permission.get("userName"):
+            user_names.add(user_name)
+        if group_name := permission.get("groupName"):
+            group_names.add(group_name)
+
+        # A VIEWSPACE entry with neither a user nor a group appears to be the
+        # anonymous-access grant (anonymous access enabled for the site and space).
+        # By default we do NOT treat that as public: instead we add
+        # ALL_CONF_EMAILS_GROUP_NAME, the group we build from all users that exist
+        # in Confluence, to the space permissions. Deployments that want anonymous
+        # access to be public can set the env variable
+        # CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC to True, and then we set
+        # is_public = True instead (we can't test this because it's paywalled).
+        if user_name is None and group_name is None:
+            # Defaults to False
+            if CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC:
+                is_public = True
+            else:
+                group_names.add(ALL_CONF_EMAILS_GROUP_NAME)
+
+    user_emails = set()
+    for user_name in user_names:
+        user_email = get_user_email_from_username__server(confluence_client, user_name)
+        if user_email:
+            user_emails.add(user_email)
+        else:
+            logger.warning(f"Email for user {user_name} not found in Confluence")
+
+    if not user_emails and not group_names:
+        logger.warning(
+            "No user emails or group names found in Confluence space permissions"
+            f"\nSpace key: {space_key}"
+            f"\nSpace permissions: {space_permissions}"
+        )
+
+    return ExternalAccess(
+        external_user_emails=user_emails,
+        external_user_group_ids=group_names,
+        is_public=is_public,
+    )
+
+
+def _get_cloud_space_permissions(
+    confluence_client: OnyxConfluence, space_key: str
+) -> ExternalAccess:
+    space_permissions_result = confluence_client.get_space(
+        space_key=space_key, expand="permissions"
+    )
+    space_permissions = space_permissions_result.get("permissions", [])
+
+    user_emails = set()
+    group_names = set()
+    is_externally_public = False
+    for permission in space_permissions:
+        subs = permission.get("subjects")
+        if subs:
+            # Explicit user/group grant; `or [{}]` keeps an empty results list from raising
+            if email := (subs.get("user", {}).get("results") or [{}])[0].get("email"):
+                user_emails.add(email)
+            if group := (subs.get("group", {}).get("results") or [{}])[0].get("name"):
+                group_names.add(group)
+        else:
+            # If there are no subjects, then the permission is for everyone
+            if permission.get("operation", {}).get(
+                "operation"
+            ) == "read" and permission.get("anonymousAccess", False):
+                # If the permission specifies read access for anonymous users, then
+                # the space is publicly accessible
+                is_externally_public = True
+
+    return ExternalAccess(
+        external_user_emails=user_emails,
+        external_user_group_ids=group_names,
+        is_public=is_externally_public,
+    )
+
+
+def get_space_permission(
+    confluence_client: OnyxConfluence,
+    space_key: str,
+    is_cloud: bool,
+) -> ExternalAccess:
+    if is_cloud:
+        space_permissions = _get_cloud_space_permissions(confluence_client, space_key)
+    else:
+        space_permissions = _get_server_space_permissions(confluence_client, space_key)
+
+    if (  # warn when the space ended up with no access at all: not public, no users, no groups
+        not space_permissions.is_public
+        and not space_permissions.external_user_emails
+        and not space_permissions.external_user_group_ids
+    ):
+        logger.warning(
+            f"No permissions found for space '{space_key}'. This is very unlikely "
+            "to be correct and is more likely caused by an access token with "
+            "insufficient permissions. Make sure that the access token has Admin "
+            f"permissions for space '{space_key}'"
+        )
+
+    return space_permissions
+
+
+def get_all_space_permissions(
+    confluence_client: OnyxConfluence,
+    is_cloud: bool,
+) -> dict[str, ExternalAccess]:
+    logger.debug("Getting space permissions")
+    # Page through get_all_spaces to collect the key of every space returned
+    all_space_keys = []
+    start = 0
+    while True:
+        spaces_batch = confluence_client.get_all_spaces(
+            start=start, limit=REQUEST_PAGINATION_LIMIT
+        )
+        for space in spaces_batch.get("results", []):
+            all_space_keys.append(space.get("key"))
+
+        if len(spaces_batch.get("results", [])) < REQUEST_PAGINATION_LIMIT:
+            break  # a short page means there is nothing left to fetch
+
+        start += len(spaces_batch.get("results", []))
+
+    # Resolve the permissions of each space, one request path per space key
+    logger.debug(f"Got {len(all_space_keys)} spaces from confluence")
+    space_permissions_by_space_key: dict[str, ExternalAccess] = {}
+    for space_key in all_space_keys:
+        space_permissions = get_space_permission(confluence_client, space_key, is_cloud)
+
+        # Index the resolved permissions by space key for the caller
+        space_permissions_by_space_key[space_key] = space_permissions
+
+    return space_permissions_by_space_key
--- a/backend/ee/onyx/external_permissions/github/doc_sync.py
+++ b/backend/ee/onyx/external_permissions/github/doc_sync.py
@@ -0,0 +1,294 @@
+import json
+from collections.abc import Generator
+
+from github import Github
+from github.Repository import Repository
+
+from ee.onyx.external_permissions.github.utils import fetch_repository_team_slugs
+from ee.onyx.external_permissions.github.utils import form_collaborators_group_id
+from ee.onyx.external_permissions.github.utils import form_organization_group_id
+from ee.onyx.external_permissions.github.utils import (
+    form_outside_collaborators_group_id,
+)
+from ee.onyx.external_permissions.github.utils import get_external_access_permission
+from ee.onyx.external_permissions.github.utils import get_repository_visibility
+from ee.onyx.external_permissions.github.utils import GitHubVisibility
+from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
+from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
+from onyx.access.models import DocExternalAccess
+from onyx.access.utils import build_ext_group_name_for_onyx
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.github.connector import DocMetadata
+from onyx.connectors.github.connector import GithubConnector
+from onyx.db.models import ConnectorCredentialPair
+from onyx.db.utils import DocumentRow
+from onyx.db.utils import SortOrder
+from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+GITHUB_DOC_SYNC_LABEL = "github_doc_sync"
+
+
+def github_doc_sync(
+    cc_pair: ConnectorCredentialPair,
+    fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
+    fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
+    callback: IndexingHeartbeatInterface | None = None,
+) -> Generator[DocExternalAccess, None, None]:
+    """
+    Sync GitHub documents with external access permissions.
+
+    This function checks each repository for visibility/team changes and updates
+    document permissions accordingly without using checkpoints.
+
+    Yields a DocExternalAccess for every stored document of each changed repository.
+    Raises ValueError when no GitHub client is available after loading credentials.
+    """
+    logger.info(f"Starting GitHub document sync for CC pair ID: {cc_pair.id}")
+
+    # Initialize GitHub connector with credentials
+    github_connector: GithubConnector = GithubConnector(
+        **cc_pair.connector.connector_specific_config
+    )
+
+    github_connector.load_credentials(cc_pair.credential.credential_json)
+    logger.info("GitHub connector credentials loaded successfully")
+
+    if not github_connector.github_client:
+        logger.error("GitHub client initialization failed")
+        raise ValueError("github_client is required")
+
+    # Get all repositories from GitHub API
+    logger.info("Fetching all repositories from GitHub API")
+    try:
+        if github_connector.repositories:
+            if "," in github_connector.repositories:
+                # Multiple repositories specified
+                repos = github_connector.get_github_repos(
+                    github_connector.github_client
+                )
+            else:
+                # Single repository
+                repos = [
+                    github_connector.get_github_repo(github_connector.github_client)
+                ]
+        else:
+            # All repositories
+            repos = github_connector.get_all_repos(github_connector.github_client)
+
+        logger.info(f"Found {len(repos)} repositories to check")
+    except Exception as e:
+        logger.error(f"Failed to fetch repositories: {e}")
+        raise
+
+    repo_to_doc_list_map: dict[str, list[DocumentRow]] = {}
+    # sort order is ascending because we want to get the oldest documents first
+    existing_docs: list[DocumentRow] = fetch_all_existing_docs_fn(
+        sort_order=SortOrder.ASC
+    )
+    logger.info(f"Found {len(existing_docs)} documents to check")
+    for doc in existing_docs:
+        try:
+            doc_metadata = DocMetadata.model_validate_json(json.dumps(doc.doc_metadata))
+            repo_to_doc_list_map.setdefault(doc_metadata.repo, []).append(doc)
+        except Exception as e:
+            # A document with unparsable metadata is skipped, not fatal
+            logger.error(f"Failed to parse doc metadata: {e} for doc {doc.id}")
+    logger.info(f"Found {len(repo_to_doc_list_map)} repositories with documents")
+    # Process each repository individually
+    for repo in repos:
+        try:
+            logger.info(f"Processing repository: {repo.id} (name: {repo.name})")
+            repo_doc_list: list[DocumentRow] = repo_to_doc_list_map.get(
+                repo.full_name, []
+            )
+            if not repo_doc_list:
+                logger.warning(
+                    f"No documents found for repository {repo.id} ({repo.name})"
+                )
+                continue
+
+            # The first document's groups stand in for the whole repository
+            current_external_group_ids = repo_doc_list[0].external_user_group_ids or []
+            has_changes = _check_repository_for_changes(
+                repo=repo,
+                github_client=github_connector.github_client,
+                current_external_group_ids=current_external_group_ids,
+            )
+
+            if has_changes:
+                logger.info(
+                    f"Repository {repo.id} ({repo.name}) has changes, updating documents"
+                )
+
+                # Get new external access permissions for this repository
+                new_external_access = get_external_access_permission(
+                    repo, github_connector.github_client
+                )
+
+                logger.info(
+                    f"Found {len(repo_doc_list)} documents for repository {repo.full_name}"
+                )
+
+                # Yield updated external access for each document
+                for doc in repo_doc_list:
+                    if callback:
+                        callback.progress(GITHUB_DOC_SYNC_LABEL, 1)
+
+                    yield DocExternalAccess(
+                        doc_id=doc.id,
+                        external_access=new_external_access,
+                    )
+            else:
+                logger.info(
+                    f"Repository {repo.id} ({repo.name}) has no changes, skipping"
+                )
+        except Exception as e:
+            logger.error(f"Error processing repository {repo.id} ({repo.name}): {e}")
+
+    logger.info(f"GitHub document sync completed for CC pair ID: {cc_pair.id}")
+
+
+def _check_repository_for_changes(
+    repo: Repository,
+    github_client: Github,
+    current_external_group_ids: list[str],
+) -> bool:
+    """
+    Report whether the repository's visibility or, if private, its teams changed.
+    """
+    logger.info(f"Checking repository {repo.id} ({repo.name}) for changes")
+
+    # Compare the live visibility with what the stored group IDs imply
+    if _is_repo_visibility_changed_from_groups(
+        repo=repo,
+        current_external_group_ids=current_external_group_ids,
+    ):
+        logger.info(f"Repository {repo.id} ({repo.name}) has visibility changes")
+        return True
+
+    # Teams are only compared for private repositories
+    if get_repository_visibility(repo) == GitHubVisibility.PRIVATE:
+        teams_changed = _teams_updated_from_groups(
+            repo=repo,
+            github_client=github_client,
+            current_external_group_ids=current_external_group_ids,
+        )
+        if teams_changed:
+            logger.info(f"Repository {repo.id} ({repo.name}) has team changes")
+            return True
+
+    logger.info(f"Repository {repo.id} ({repo.name}) has no changes")
+    return False
+
+
+def _is_repo_visibility_changed_from_groups(
+    repo: Repository,
+    current_external_group_ids: list[str],
+) -> bool:
+    """
+    Compare the repository's live visibility with the one implied by stored groups.
+
+    Args:
+        repo: GitHub repository object
+        current_external_group_ids: List of external group IDs from existing document
+
+    Returns:
+        True if visibility has changed
+    """
+    live_visibility = get_repository_visibility(repo)
+    logger.info(f"Current repository visibility: {live_visibility.value}")
+
+    # Prefixed ID of the collaborators group, which marks a private repository
+    private_marker = build_ext_group_name_for_onyx(
+        source=DocumentSource.GITHUB,
+        ext_group_name=form_collaborators_group_id(repo.id),
+    )
+
+    # Prefixed ID of the organization group, which marks an internal repository
+    internal_marker = None
+    if repo.organization:
+        internal_marker = build_ext_group_name_for_onyx(
+            source=DocumentSource.GITHUB,
+            ext_group_name=form_organization_group_id(repo.organization.id),
+        )
+
+    # Infer the stored visibility: collaborators group -> private,
+    # organization group -> internal, neither -> public
+    if private_marker in current_external_group_ids:
+        stored_visibility = GitHubVisibility.PRIVATE
+    elif internal_marker and internal_marker in current_external_group_ids:
+        stored_visibility = GitHubVisibility.INTERNAL
+    else:
+        stored_visibility = GitHubVisibility.PUBLIC
+
+    logger.info(f"Inferred existing visibility: {stored_visibility.value}")
+
+    # Nothing more to log when both visibilities agree
+    if stored_visibility == live_visibility:
+        return False
+
+    logger.info(
+        f"Visibility changed for repo {repo.id} ({repo.name}): "
+        f"{stored_visibility.value} -> {live_visibility.value}"
+    )
+    return True
+
+
+def _teams_updated_from_groups(
+    repo: Repository,
+    github_client: Github,
+    current_external_group_ids: list[str],
+) -> bool:
+    """
+    Report whether the repository's teams differ from the stored team group IDs.
+    """
+    # Team slugs that currently have access to the repository
+    team_slugs = fetch_repository_team_slugs(repo=repo, github_client=github_client)
+    logger.info(
+        f"Current teams for repository {repo.id} (name: {repo.name}): {team_slugs}"
+    )
+
+    # The two collaborator groups are not teams, so leave them out of the comparison
+    non_team_group_ids = {
+        build_ext_group_name_for_onyx(
+            source=DocumentSource.GITHUB,
+            ext_group_name=form_group_id(repo.id),
+        )
+        for form_group_id in (
+            form_collaborators_group_id,
+            form_outside_collaborators_group_id,
+        )
+    }
+
+    # Stored IDs already carry the source prefix (e.g., "github__team-slug")
+    existing_team_ids = {
+        group_id
+        for group_id in current_external_group_ids
+        if group_id not in non_team_group_ids
+    }
+
+    # Slugs from the API are raw, so prefix them before comparing
+    current_team_ids = {
+        build_ext_group_name_for_onyx(
+            source=DocumentSource.GITHUB,
+            ext_group_name=slug,
+        )
+        for slug in team_slugs
+    }
+
+    logger.info(
+        f"Existing team IDs: {existing_team_ids}, Current team IDs: {current_team_ids}"
+    )
+
+    # Any difference between the two sets counts as a team change
+    if current_team_ids == existing_team_ids:
+        return False
+
+    logger.info(
+        f"Team changes detected for repo {repo.id} (name: {repo.name}): "
+        f"existing={existing_team_ids}, current={current_team_ids}"
+    )
+    return True
--- a/backend/ee/onyx/external_permissions/github/group_sync.py
+++ b/backend/ee/onyx/external_permissions/github/group_sync.py
@@ -0,0 +1,46 @@
+from collections.abc import Generator
+
+from github import Repository
+
+from ee.onyx.db.external_perm import ExternalUserGroup
+from ee.onyx.external_permissions.github.utils import get_external_user_group
+from onyx.connectors.github.connector import GithubConnector
+from onyx.db.models import ConnectorCredentialPair
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def github_group_sync(
+    tenant_id: str,
+    cc_pair: ConnectorCredentialPair,
+) -> Generator[ExternalUserGroup, None, None]:
+    """Yield the external user groups of every repository the connector covers."""
+    github_connector: GithubConnector = GithubConnector(
+        **cc_pair.connector.connector_specific_config
+    )
+    github_connector.load_credentials(cc_pair.credential.credential_json)
+    github_client = github_connector.github_client
+    if not github_client:
+        raise ValueError("github_client is required")
+
+    logger.info("Starting GitHub group sync...")
+    repos: list[Repository.Repository]
+    if not github_connector.repositories:
+        # No repository filter configured: sync every repository
+        repos = github_connector.get_all_repos(github_client)
+    elif "," in github_connector.repositories:
+        # Comma-separated list of repositories
+        repos = github_connector.get_github_repos(github_client)
+    else:
+        # Single repository (backward compatibility)
+        repos = [github_connector.get_github_repo(github_client)]
+
+    for repo in repos:
+        try:
+            for group in get_external_user_group(repo, github_client):
+                logger.info(f"External group: {group}")
+                yield group
+        except Exception as e:
+            # Keep syncing the remaining repositories when one fails
+            logger.error(f"Error processing repository {repo.id} ({repo.name}): {e}")
--- a/backend/ee/onyx/external_permissions/github/utils.py
+++ b/backend/ee/onyx/external_permissions/github/utils.py
@@ -0,0 +1,488 @@
+from collections.abc import Callable
+from enum import Enum
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import TypeVar
+
+from github import Github
+from github import RateLimitExceededException
+from github.GithubException import GithubException
+from github.NamedUser import NamedUser
+from github.Organization import Organization
+from github.PaginatedList import PaginatedList
+from github.Repository import Repository
+from github.Team import Team
+from pydantic import BaseModel
+
+from ee.onyx.db.external_perm import ExternalUserGroup
+from onyx.access.models import ExternalAccess
+from onyx.access.utils import build_ext_group_name_for_onyx
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.github.rate_limit_utils import sleep_after_rate_limit_exception
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+class GitHubVisibility(Enum):
+    """Repository visibility levels; `get_repository_visibility` maps onto these."""
+
+    PUBLIC = "public"
+    PRIVATE = "private"
+    INTERNAL = "internal"
+
+
+MAX_RETRY_COUNT = 3
+
+T = TypeVar("T")
+
+# Higher-order function to wrap GitHub operations with retry and exception handling
+
+
+def _run_with_retry(
+    operation: Callable[[], T],
+    description: str,
+    github_client: Github,
+    retry_count: int = 0,
+) -> Optional[T]:
+    """Run a GitHub call, retrying on rate limits; other errors return None."""
+    while True:
+        logger.debug(f"Starting operation '{description}', attempt {retry_count + 1}")
+        try:
+            result = operation()
+            logger.debug(f"Operation '{description}' completed successfully")
+            return result
+        except RateLimitExceededException:
+            # Out of retries: raise instead of returning None like other failures
+            if retry_count >= MAX_RETRY_COUNT:
+                error_msg = f"Max retries exceeded for {description}"
+                logger.exception(error_msg)
+                raise RuntimeError(error_msg)
+            # Wait out the rate limit, then loop around for another attempt
+            sleep_after_rate_limit_exception(github_client)
+            logger.warning(
+                f"Rate limit exceeded while {description}. Retrying... "
+                f"(attempt {retry_count + 1}/{MAX_RETRY_COUNT})"
+            )
+            retry_count += 1
+        except GithubException as e:
+            logger.warning(f"GitHub API error during {description}: {e}")
+            return None
+        except Exception as e:
+            logger.exception(f"Unexpected error during {description}: {e}")
+            return None
+
+
+class UserInfo(BaseModel):
+    """A GitHub user's login plus optional display name and email."""
+
+    login: str
+    name: Optional[str] = None
+    email: Optional[str] = None
+
+
+class TeamInfo(BaseModel):
+    """A GitHub team's name and slug together with its members."""
+
+    name: str
+    slug: str
+    members: List[UserInfo]
+
+
+def _fetch_organization_members(
+    github_client: Github, org_name: str, retry_count: int = 0
+) -> List[UserInfo]:
+    """Fetch every organization member (member filter "all") as UserInfo records."""
+    logger.info(f"Fetching organization members for {org_name}")
+
+    org = _run_with_retry(
+        lambda: github_client.get_organization(org_name),
+        f"get organization {org_name}",
+        github_client,
+    )
+    if not org:
+        logger.error(f"Failed to fetch organization {org_name}")
+        raise RuntimeError(f"Failed to fetch organization {org_name}")
+
+    # If the listing call comes back as None, fall back to an empty list
+    member_objs: PaginatedList[NamedUser] | list[NamedUser] = (
+        _run_with_retry(
+            lambda: org.get_members(filter_="all"),
+            f"get members for organization {org_name}",
+            github_client,
+        )
+        or []
+    )
+    org_members = [
+        UserInfo(login=member.login, name=member.name, email=member.email)
+        for member in member_objs
+    ]
+
+    logger.info(f"Fetched {len(org_members)} members for organization {org_name}")
+    return org_members
+
+
+def _fetch_repository_teams_detailed(
+    repo: Repository, github_client: Github, retry_count: int = 0
+) -> List[TeamInfo]:
+    """Collect name, slug and members of each team with access to the repository."""
+    logger.info(f"Fetching teams for repository {repo.full_name}")
+
+    # Either listing falls back to an empty list when its call returns None
+    team_objs: PaginatedList[Team] | list[Team] = (
+        _run_with_retry(
+            lambda: repo.get_teams(),
+            f"get teams for repository {repo.full_name}",
+            github_client,
+        )
+        or []
+    )
+
+    teams_data: List[TeamInfo] = []
+    for team in team_objs:
+        logger.info(
+            f"Processing team {team.name} (slug: {team.slug}) for repository {repo.full_name}"
+        )
+
+        members: PaginatedList[NamedUser] | list[NamedUser] = (
+            _run_with_retry(
+                lambda: team.get_members(),
+                f"get members for team {team.name}",
+                github_client,
+            )
+            or []
+        )
+
+        team_members = [
+            UserInfo(login=m.login, name=m.name, email=m.email) for m in members
+        ]
+        teams_data.append(
+            TeamInfo(name=team.name, slug=team.slug, members=team_members)
+        )
+        logger.info(f"Team {team.name} has {len(team_members)} members")
+
+    logger.info(f"Fetched {len(teams_data)} teams for repository {repo.full_name}")
+    return teams_data
+
+
+def fetch_repository_team_slugs(
+    repo: Repository, github_client: Github, retry_count: int = 0
+) -> List[str]:
+    """Return the slug of each team with access to the repository."""
+    logger.info(f"Fetching team slugs for repository {repo.full_name}")
+
+    # A failed team listing is treated the same as "no teams": _run_with_retry
+    # returns None on GitHub errors, which `or []` turns into an empty list
+    team_objs: PaginatedList[Team] | list[Team] = (
+        _run_with_retry(
+            lambda: repo.get_teams(),
+            f"get teams for repository {repo.full_name}",
+            github_client,
+        )
+        or []
+    )
+
+    team_slugs: List[str] = [team.slug for team in team_objs]
+
+    logger.info(f"Fetched {len(team_slugs)} team slugs for repository {repo.full_name}")
+    return team_slugs
+
+
+def _get_collaborators_and_outside_collaborators(
+    github_client: Github,
+    repo: Repository,
+) -> Tuple[List[UserInfo], List[UserInfo]]:
+    """Fetch and categorize collaborators into regular and outside collaborators."""
+    collaborators: List[UserInfo] = []
+    outside_collaborators: List[UserInfo] = []
+    logger.info(f"Fetching collaborators for repository {repo.full_name}")
+
+    repo_collaborators: PaginatedList[NamedUser] | list[NamedUser] = (
+        _run_with_retry(
+            lambda: repo.get_collaborators(),
+            f"get collaborators for repository {repo.full_name}",
+            github_client,
+        )
+        or []
+    )
+
+    # Look the organization up once; it is the same for every collaborator
+    org: Organization | None = None
+    if repo.organization:
+        org = _run_with_retry(
+            lambda: github_client.get_organization(repo.organization.login),
+            f"get organization {repo.organization.login}",
+            github_client,
+        )
+
+    for collaborator in repo_collaborators:
+        is_outside = False
+        if org is not None:
+            org_obj = org
+            # A None result means the check itself failed; keep such users regular
+            membership = _run_with_retry(
+                lambda: org_obj.has_in_members(collaborator),
+                f"check membership for {collaborator.login} in org {org_obj.login}",
+                github_client,
+            )
+            is_outside = membership is not None and not membership
+        info = UserInfo(
+            login=collaborator.login, name=collaborator.name, email=collaborator.email
+        )
+        if is_outside:
+            outside_collaborators.append(info)
+        else:
+            collaborators.append(info)
+
+    logger.info(
+        f"Categorized {len(collaborators)} regular and {len(outside_collaborators)} outside collaborators for {repo.full_name}"
+    )
+    return collaborators, outside_collaborators
+
+
+def form_collaborators_group_id(repository_id: int) -> str:
+    """Return "<repository_id>_collaborators"; raise ValueError on a falsy ID."""
+    if not repository_id:
+        # logger.error, not logger.exception: no exception is being handled here
+        logger.error("Repository ID is required to generate collaborators group ID")
+        raise ValueError("Repository ID must be set to generate group ID.")
+    return f"{repository_id}_collaborators"
+
+
+def form_organization_group_id(organization_id: int) -> str:
+    """Return "<organization_id>_organization"; raise ValueError on a falsy ID."""
+    if not organization_id:
+        # logger.error, not logger.exception: no exception is being handled here,
+        # so there is no traceback to attach to the log record
+        logger.error("Organization ID is required to generate organization group ID")
+        raise ValueError("Organization ID must be set to generate group ID.")
+    group_id = f"{organization_id}_organization"
+    return group_id
+
+
+def form_outside_collaborators_group_id(repository_id: int) -> str:
+    """Return "<repository_id>_outside_collaborators"; ValueError on a falsy ID."""
+    if not repository_id:
+        # logger.error, not logger.exception: no exception is being handled here
+        logger.error(
+            "Repository ID is required to generate outside collaborators group ID"
+        )
+        raise ValueError("Repository ID must be set to generate group ID.")
+    return f"{repository_id}_outside_collaborators"
+
+
+def get_repository_visibility(repo: Repository) -> GitHubVisibility:
+    """
+    Map the repository's `visibility` attribute onto a GitHubVisibility member.
+    A missing attribute or an unrecognized value falls back to PRIVATE.
+    """
+    if not hasattr(repo, "visibility"):
+        logger.info(f"Repository {repo.full_name} is private")
+        return GitHubVisibility.PRIVATE
+
+    visibility = repo.visibility
+    logger.info(
+        f"Repository {repo.full_name} visibility from attribute: {visibility}"
+    )
+    try:
+        return GitHubVisibility(visibility)
+    except ValueError:
+        logger.warning(
+            f"Unknown visibility '{visibility}' for repo {repo.full_name}, defaulting to private"
+        )
+        return GitHubVisibility.PRIVATE
+
+
+def get_external_access_permission(
+    repo: Repository, github_client: Github, add_prefix: bool = False
+) -> ExternalAccess:
+    """
+    Build the group-based ExternalAccess for a repository from its visibility.
+
+    - public: marked public, no groups
+    - private: the collaborators group, the outside-collaborators group and one
+      group per team slug
+    - any other visibility (internal): the organization group
+
+    No individual user emails are set; access is expressed through groups only.
+
+    github_client: used to list the repository's teams (private repositories only).
+    add_prefix: Set to True when called from the connector during the initial
+                permission sync, where the group ID is stored on the document
+                record without the source prefix, so this function must add it.
+                Leave False when called from doc_sync: there the prefix is added
+                later, while the ExternalAccess object is processed.
+    """
+    # We maintain collaborators, and outside collaborators as two separate groups
+    # instead of adding individual user emails to ExternalAccess.external_user_emails for two reasons:
+    # 1. Changes in repo collaborators (additions/removals) would require updating all documents.
+    # 2. Repo permissions can change without updating the repo's updated_at timestamp,
+    #    forcing full permission syncs for all documents every time, which is inefficient.
+
+    def _maybe_prefixed(raw_group_id: str) -> str:
+        # Apply the source prefix only when the caller requested it
+        if not add_prefix:
+            return raw_group_id
+        return build_ext_group_name_for_onyx(
+            source=DocumentSource.GITHUB,
+            ext_group_name=raw_group_id,
+        )
+
+    repo_visibility = get_repository_visibility(repo)
+    logger.info(
+        f"Generating ExternalAccess for {repo.full_name}: visibility={repo_visibility.value}"
+    )
+
+    if repo_visibility == GitHubVisibility.PUBLIC:
+        logger.info(
+            f"Repository {repo.full_name} is public - allowing access to all users"
+        )
+        return ExternalAccess(
+            external_user_emails=set(),
+            external_user_group_ids=set(),
+            is_public=True,
+        )
+    elif repo_visibility == GitHubVisibility.PRIVATE:
+        logger.info(
+            f"Repository {repo.full_name} is private - setting up restricted access"
+        )
+
+        # The two collaborator groups are always part of a private repo's access
+        collaborators_group_id = form_collaborators_group_id(repo.id)
+        outside_collaborators_group_id = form_outside_collaborators_group_id(repo.id)
+        group_ids = {
+            _maybe_prefixed(collaborators_group_id),
+            _maybe_prefixed(outside_collaborators_group_id),
+        }
+
+        # Every team with access to the repository contributes its slug as a group
+        team_slugs = fetch_repository_team_slugs(repo, github_client)
+        group_ids.update(_maybe_prefixed(slug) for slug in team_slugs)
+
+        logger.info(f"ExternalAccess groups for {repo.full_name}: {group_ids}")
+        return ExternalAccess(
+            external_user_emails=set(),
+            external_user_group_ids=group_ids,
+            is_public=False,
+        )
+    else:
+        # Internal repositories - accessible to organization members
+        # NOTE(review): assumes repo.organization is set on this path - confirm
+        logger.info(
+            f"Repository {repo.full_name} is internal - accessible to org members"
+        )
+        org_group_id = form_organization_group_id(repo.organization.id)
+        group_ids = {_maybe_prefixed(org_group_id)}
+        logger.info(f"ExternalAccess groups for {repo.full_name}: {group_ids}")
+        return ExternalAccess(
+            external_user_emails=set(),
+            external_user_group_ids=group_ids,
+            is_public=False,
+        )
+
+
+def get_external_user_group(
+    repo: Repository, github_client: Github
+) -> list[ExternalUserGroup]:
+    """
+    Build the ExternalUserGroups (group ID plus member emails) for a repository.
+
+    - private: a collaborators group, an outside-collaborators group and one group
+      per team; a group is only emitted when at least one member email is known
+    - internal: a single organization group (emitted even if it has no emails)
+    - public: no groups
+
+    Users whose email is unset are logged at error level and left out.
+    """
+
+    def _known_emails(users: List[UserInfo], label: str) -> set[str]:
+        # Collect the emails that are set; log every user (under `label`) without one
+        emails: set[str] = set()
+        for user in users:
+            if user.email:
+                emails.add(user.email)
+            else:
+                logger.error(f"{label} {user.login} has no email")
+        return emails
+
+    repo_visibility = get_repository_visibility(repo)
+    logger.info(
+        f"Generating ExternalUserGroups for {repo.full_name}: visibility={repo_visibility.value}"
+    )
+
+    if repo_visibility == GitHubVisibility.PRIVATE:
+        logger.info(f"Processing private repository {repo.full_name}")
+
+        # Gather everyone with access before building the groups
+        collaborators, outside_collaborators = (
+            _get_collaborators_and_outside_collaborators(github_client, repo)
+        )
+        teams = _fetch_repository_teams_detailed(repo, github_client)
+        external_user_groups = []
+
+        # Group for regular collaborators
+        user_emails = _known_emails(collaborators, "Collaborator")
+        if user_emails:
+            external_user_groups.append(
+                ExternalUserGroup(
+                    id=form_collaborators_group_id(repo.id),
+                    user_emails=list(user_emails),
+                )
+            )
+            logger.info(f"Created collaborators group with {len(user_emails)} emails")
+
+        # Group for outside collaborators
+        user_emails = _known_emails(outside_collaborators, "Outside collaborator")
+        if user_emails:
+            external_user_groups.append(
+                ExternalUserGroup(
+                    id=form_outside_collaborators_group_id(repo.id),
+                    user_emails=list(user_emails),
+                )
+            )
+            logger.info(
+                f"Created outside collaborators group with {len(user_emails)} emails"
+            )
+
+        # One group per team, identified by the team slug
+        for team in teams:
+            user_emails = _known_emails(team.members, "Team member")
+            if user_emails:
+                external_user_groups.append(
+                    ExternalUserGroup(
+                        id=team.slug,
+                        user_emails=list(user_emails),
+                    )
+                )
+                logger.info(
+                    f"Created team group {team.name} with {len(user_emails)} emails"
+                )
+
+        logger.info(
+            f"Created {len(external_user_groups)} ExternalUserGroups for private repository {repo.full_name}"
+        )
+        return external_user_groups
+
+    if repo_visibility == GitHubVisibility.INTERNAL:
+        logger.info(f"Processing internal repository {repo.full_name}")
+
+        # All organization members share a single group
+        org_group_id = form_organization_group_id(repo.organization.id)
+        org_members = _fetch_organization_members(
+            github_client, repo.organization.login
+        )
+
+        user_emails = _known_emails(org_members, "Org member")
+        org_group = ExternalUserGroup(
+            id=org_group_id,
+            user_emails=list(user_emails),
+        )
+        logger.info(
+            f"Created organization group with {len(user_emails)} emails for internal repository {repo.full_name}"
+        )
+        return [org_group]
+
+    # Remaining case: public repository
+    logger.info(f"Repository {repo.full_name} is public - no user groups needed")
+    return []
--- a/backend/ee/onyx/external_permissions/gmail/doc_sync.py
+++ b/backend/ee/onyx/external_permissions/gmail/doc_sync.py
@@ -2,8 +2,9 @@ from collections.abc import Generator
 from datetime import datetime
 from datetime import timezone

+from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
+from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
 from onyx.access.models import DocExternalAccess
-from onyx.access.models import ExternalAccess
 from onyx.connectors.gmail.connector import GmailConnector
 from onyx.connectors.interfaces import GenerateSlimDocumentOutput
 from onyx.db.models import ConnectorCredentialPair
@@ -34,6 +35,8 @@ def _get_slim_doc_generator(

 def gmail_doc_sync(
    cc_pair: ConnectorCredentialPair,
+    fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
+    fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
    callback: IndexingHeartbeatInterface | None,
 ) -> Generator[DocExternalAccess, None, None]:
    """
@@ -57,17 +60,11 @@ def gmail_doc_sync(

                callback.progress("gmail_doc_sync", 1)

-            if slim_doc.perm_sync_data is None:
+            if slim_doc.external_access is None:
                logger.warning(f"No permissions found for document {slim_doc.id}")
                continue

-            if user_email := slim_doc.perm_sync_data.get("user_email"):
-                ext_access = ExternalAccess(
-                    external_user_emails=set([user_email]),
-                    external_user_group_ids=set(),
-                    is_public=False,
-                )
-                yield DocExternalAccess(
-                    doc_id=slim_doc.id,
-                    external_access=ext_access,
-                )
+            yield DocExternalAccess(
+                doc_id=slim_doc.id,
+                external_access=slim_doc.external_access,
+            )
--- a/backend/ee/onyx/external_permissions/google_drive/doc_sync.py
+++ b/backend/ee/onyx/external_permissions/google_drive/doc_sync.py
@@ -1,23 +1,26 @@
 from collections.abc import Generator
 from datetime import datetime
 from datetime import timezone
-from typing import Any

+from ee.onyx.external_permissions.google_drive.models import GoogleDrivePermission
+from ee.onyx.external_permissions.google_drive.models import PermissionType
+from ee.onyx.external_permissions.google_drive.permission_retrieval import (
+    get_permissions_by_ids,
+)
+from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
+from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
 from onyx.access.models import DocExternalAccess
 from onyx.access.models import ExternalAccess
 from onyx.connectors.google_drive.connector import GoogleDriveConnector
-from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
-from onyx.connectors.google_utils.resources import get_drive_service
+from onyx.connectors.google_drive.models import GoogleDriveFileType
+from onyx.connectors.google_utils.resources import GoogleDriveService
 from onyx.connectors.interfaces import GenerateSlimDocumentOutput
-from onyx.connectors.models import SlimDocument
 from onyx.db.models import ConnectorCredentialPair
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from onyx.utils.logger import setup_logger

 logger = setup_logger()

-_PERMISSION_ID_PERMISSION_MAP: dict[str, dict[str, Any]] = {}
-

 def _get_slim_doc_generator(
    cc_pair: ConnectorCredentialPair,
@@ -38,105 +41,124 @@ def _get_slim_doc_generator(
    )


-def _fetch_permissions_for_permission_ids(
-    google_drive_connector: GoogleDriveConnector,
-    permission_ids: list[str],
-    permission_info: dict[str, Any],
-) -> list[dict[str, Any]]:
-    doc_id = permission_info.get("doc_id")
-    if not permission_info or not doc_id:
-        return []
+def _merge_permissions_lists(
+    permission_lists: list[list[GoogleDrivePermission]],
+) -> list[GoogleDrivePermission]:
+    """
+    Merge a list of permission lists into a single list of permissions.
+    """
+    seen_permission_ids: set[str] = set()
+    merged_permissions: list[GoogleDrivePermission] = []
+    for permission_list in permission_lists:
+        for permission in permission_list:
+            if permission.id not in seen_permission_ids:
+                merged_permissions.append(permission)
+                seen_permission_ids.add(permission.id)

-    permissions = [
-        _PERMISSION_ID_PERMISSION_MAP[pid]
-        for pid in permission_ids
-        if pid in _PERMISSION_ID_PERMISSION_MAP
-    ]
-
-    if len(permissions) == len(permission_ids):
-        return permissions
-
-    owner_email = permission_info.get("owner_email")
-
-    drive_service = get_drive_service(
-        creds=google_drive_connector.creds,
-        user_email=(owner_email or google_drive_connector.primary_admin_email),
-    )
-
-    # We continue on 404 or 403 because the document may not exist or the user may not have access to it
-    fetched_permissions = execute_paginated_retrieval(
-        retrieval_function=drive_service.permissions().list,
-        list_key="permissions",
-        fileId=doc_id,
-        fields="permissions(id, emailAddress, type, domain)",
-        supportsAllDrives=True,
-        continue_on_404_or_403=True,
-    )
-
-    permissions_for_doc_id = []
-    for permission in fetched_permissions:
-        permissions_for_doc_id.append(permission)
-        _PERMISSION_ID_PERMISSION_MAP[permission["id"]] = permission
-
-    return permissions_for_doc_id
+    return merged_permissions


-def _get_permissions_from_slim_doc(
-    google_drive_connector: GoogleDriveConnector,
-    slim_doc: SlimDocument,
+def get_external_access_for_raw_gdrive_file(
+    file: GoogleDriveFileType,
+    company_domain: str,
+    retriever_drive_service: GoogleDriveService | None,
+    admin_drive_service: GoogleDriveService,
 ) -> ExternalAccess:
-    permission_info = slim_doc.perm_sync_data or {}
+    """
+    Get the external access for a raw Google Drive file.

-    permissions_list = permission_info.get("permissions", [])
-    if not permissions_list:
-        if permission_ids := permission_info.get("permission_ids"):
-            permissions_list = _fetch_permissions_for_permission_ids(
-                google_drive_connector=google_drive_connector,
+    Assumes the file we retrieved has EITHER `permissions` or `permissionIds`
+    """
+    doc_id = file.get("id")
+    if not doc_id:
+        raise ValueError("No doc_id found in file")
+
+    permissions = file.get("permissions")
+    permission_ids = file.get("permissionIds")
+    drive_id = file.get("driveId")
+
+    permissions_list: list[GoogleDrivePermission] = []
+    if permissions:
+        permissions_list = [
+            GoogleDrivePermission.from_drive_permission(p) for p in permissions
+        ]
+    elif permission_ids:
+
+        def _get_permissions(
+            drive_service: GoogleDriveService,
+        ) -> list[GoogleDrivePermission]:
+            return get_permissions_by_ids(
+                drive_service=drive_service,
+                doc_id=doc_id,
                permission_ids=permission_ids,
-                permission_info=permission_info,
-            )
-        if not permissions_list:
-            logger.warning(f"No permissions found for document {slim_doc.id}")
-            return ExternalAccess(
-                external_user_emails=set(),
-                external_user_group_ids=set(),
-                is_public=False,
            )

-    company_domain = google_drive_connector.google_domain
+        permissions_list = _get_permissions(
+            retriever_drive_service or admin_drive_service
+        )
+        if len(permissions_list) != len(permission_ids) and retriever_drive_service:
+            logger.warning(
+                f"Failed to get all permissions for file {doc_id} with retriever service, "
+                "trying admin service"
+            )
+            backup_permissions_list = _get_permissions(admin_drive_service)
+            permissions_list = _merge_permissions_lists(
+                [permissions_list, backup_permissions_list]
+            )
+
+    folder_ids_to_inherit_permissions_from: set[str] = set()
    user_emails: set[str] = set()
    group_emails: set[str] = set()
    public = False
-    skipped_permissions = 0

    for permission in permissions_list:
-        if not permission:
-            skipped_permissions += 1
-            continue
+        # if the permission is inherited, do not add it directly to the file
+        # instead, add the folder ID as a group that has access to the file
+        # we will then handle mapping that folder to the list of Onyx users
+        # in the group sync job
+        # NOTE: this doesn't handle the case where a folder initially has no
+        # permissioning, but then later that folder is shared with a user or group.
+        # We could fetch all ancestors of the file to get the list of folders that
+        # might affect the permissions of the file, but this will get replaced with
+        # an audit-log based approach in the future so not doing it now.
+        if permission.inherited_from:
+            folder_ids_to_inherit_permissions_from.add(permission.inherited_from)

-        permission_type = permission["type"]
-        if permission_type == "user":
-            user_emails.add(permission["emailAddress"])
-        elif permission_type == "group":
-            group_emails.add(permission["emailAddress"])
-        elif permission_type == "domain" and company_domain:
-            if permission.get("domain") == company_domain:
+        if permission.type == PermissionType.USER:
+            if permission.email_address:
+                user_emails.add(permission.email_address)
+            else:
+                logger.error(
+                    "Permission is type `user` but no email address is "
+                    f"provided for document {doc_id}"
+                    f"\n {permission}"
+                )
+        elif permission.type == PermissionType.GROUP:
+            # groups are represented as email addresses within Drive
+            if permission.email_address:
+                group_emails.add(permission.email_address)
+            else:
+                logger.error(
+                    "Permission is type `group` but no email address is "
+                    f"provided for document {doc_id}"
+                    f"\n {permission}"
+                )
+        elif permission.type == PermissionType.DOMAIN and company_domain:
+            if permission.domain == company_domain:
                public = True
            else:
                logger.warning(
                    "Permission is type domain but does not match company domain:"
                    f"\n {permission}"
                )
-        elif permission_type == "anyone":
+        elif permission.type == PermissionType.ANYONE:
            public = True

-    if skipped_permissions > 0:
-        logger.warning(
-            f"Skipped {skipped_permissions} permissions of {len(permissions_list)} for document {slim_doc.id}"
-        )
-
-    drive_id = permission_info.get("drive_id")
-    group_ids = group_emails | ({drive_id} if drive_id is not None else set())
+    group_ids = (
+        group_emails
+        | folder_ids_to_inherit_permissions_from
+        | ({drive_id} if drive_id is not None else set())
+    )

    return ExternalAccess(
        external_user_emails=user_emails,
@@ -147,6 +169,8 @@ def _get_permissions_from_slim_doc(

 def gdrive_doc_sync(
    cc_pair: ConnectorCredentialPair,
+    fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
+    fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
    callback: IndexingHeartbeatInterface | None,
 ) -> Generator[DocExternalAccess, None, None]:
    """
@@ -162,7 +186,9 @@ def gdrive_doc_sync(

    slim_doc_generator = _get_slim_doc_generator(cc_pair, google_drive_connector)

+    total_processed = 0
    for slim_doc_batch in slim_doc_generator:
+        logger.info(f"Drive perm sync: Processing {len(slim_doc_batch)} documents")
        for slim_doc in slim_doc_batch:
            if callback:
                if callback.should_stop():
@@ -170,11 +196,14 @@ def gdrive_doc_sync(

                callback.progress("gdrive_doc_sync", 1)

-            ext_access = _get_permissions_from_slim_doc(
-                google_drive_connector=google_drive_connector,
-                slim_doc=slim_doc,
-            )
+            if slim_doc.external_access is None:
+                raise ValueError(
+                    f"Drive perm sync: No external access for document {slim_doc.id}"
+                )
+
            yield DocExternalAccess(
-                external_access=ext_access,
+                external_access=slim_doc.external_access,
                doc_id=slim_doc.id,
            )
+        total_processed += len(slim_doc_batch)
+        logger.info(f"Drive perm sync: Processed {total_processed} total documents")
--- a/backend/ee/onyx/external_permissions/google_drive/folder_retrieval.py
+++ b/backend/ee/onyx/external_permissions/google_drive/folder_retrieval.py
@@ -0,0 +1,84 @@
+from collections.abc import Iterator
+
+from googleapiclient.discovery import Resource  # type: ignore
+
+from ee.onyx.external_permissions.google_drive.models import GoogleDrivePermission
+from ee.onyx.external_permissions.google_drive.permission_retrieval import (
+    get_permissions_by_ids,
+)
+from onyx.connectors.google_drive.constants import DRIVE_FOLDER_TYPE
+from onyx.connectors.google_drive.file_retrieval import generate_time_range_filter
+from onyx.connectors.google_drive.models import GoogleDriveFileType
+from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
+from onyx.connectors.interfaces import SecondsSinceUnixEpoch
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+# Only include fields we need - folder ID and permissions
+# IMPORTANT: must fetch permissionIds, since sometimes the drive API
+# seems to miss permissions when requesting them directly
+FOLDER_PERMISSION_FIELDS = (
+    "nextPageToken, files(id, name, permissionIds, "
+    "permissions(id, emailAddress, type, domain, permissionDetails))"
+)
+
+
+def get_folder_permissions_by_ids(
+    service: Resource,
+    folder_id: str,
+    permission_ids: list[str],
+) -> list[GoogleDrivePermission]:
+    """
+    Retrieves permissions for a specific folder filtered by permission IDs.
+
+    Args:
+        service: The Google Drive service instance
+        folder_id: The ID of the folder to fetch permissions for
+        permission_ids: A list of permission IDs to filter by
+
+    Returns:
+        A list of permissions matching the provided permission IDs
+    """
+    return get_permissions_by_ids(
+        drive_service=service,
+        doc_id=folder_id,
+        permission_ids=permission_ids,
+    )
+
+
+def get_modified_folders(
+    service: Resource,
+    start: SecondsSinceUnixEpoch | None = None,
+    end: SecondsSinceUnixEpoch | None = None,
+) -> Iterator[GoogleDriveFileType]:
+    """
+    Retrieves all folders that were modified within the specified time range.
+    Only includes folder ID and permission information, not any contained files.
+
+    Args:
+        service: The Google Drive service instance
+        start: The start time as seconds since Unix epoch (inclusive)
+        end: The end time as seconds since Unix epoch (inclusive)
+
+    Returns:
+        An iterator yielding folder information including ID and permissions
+    """
+    # Build query for folders
+    query = f"mimeType = '{DRIVE_FOLDER_TYPE}'"
+    query += " and trashed = false"
+    query += generate_time_range_filter(start, end)
+
+    # Retrieve and yield folders
+    for folder in execute_paginated_retrieval(
+        retrieval_function=service.files().list,
+        list_key="files",
+        continue_on_404_or_403=True,
+        corpora="allDrives",
+        supportsAllDrives=True,
+        includeItemsFromAllDrives=True,
+        includePermissionsForView="published",
+        fields=FOLDER_PERMISSION_FIELDS,
+        q=query,
+    ):
+        yield folder
--- a/backend/ee/onyx/external_permissions/google_drive/group_sync.py
+++ b/backend/ee/onyx/external_permissions/google_drive/group_sync.py
@@ -1,4 +1,17 @@
+from collections.abc import Generator
+
+from googleapiclient.errors import HttpError  # type: ignore
+from pydantic import BaseModel
+
 from ee.onyx.db.external_perm import ExternalUserGroup
+from ee.onyx.external_permissions.google_drive.folder_retrieval import (
+    get_folder_permissions_by_ids,
+)
+from ee.onyx.external_permissions.google_drive.folder_retrieval import (
+    get_modified_folders,
+)
+from ee.onyx.external_permissions.google_drive.models import GoogleDrivePermission
+from ee.onyx.external_permissions.google_drive.models import PermissionType
 from onyx.connectors.google_drive.connector import GoogleDriveConnector
 from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
 from onyx.connectors.google_utils.resources import AdminService
@@ -10,8 +23,148 @@ from onyx.utils.logger import setup_logger
 logger = setup_logger()


+"""
+Folder Permission Sync.
+
+Each folder is treated as a group. Each file has all ancestor folders
+as groups.
+"""
+
+
+class FolderInfo(BaseModel):
+    id: str
+    permissions: list[GoogleDrivePermission]
+
+
+def _get_all_folders(
+    google_drive_connector: GoogleDriveConnector, skip_folders_without_permissions: bool
+) -> list[FolderInfo]:
+    """Have to get all folders since the group syncing system assumes all groups
+    are returned every time.
+
+    TODO: tweak things so we can fetch deltas.
+    """
+    MAX_FAILED_PERCENTAGE = 0.5
+
+    all_folders: list[FolderInfo] = []
+    seen_folder_ids: set[str] = set()
+
+    def _get_all_folders_for_user(
+        google_drive_connector: GoogleDriveConnector,
+        skip_folders_without_permissions: bool,
+        user_email: str,
+    ) -> None:
+        """Helper to get folders for a specific user + update shared seen_folder_ids"""
+        drive_service = get_drive_service(
+            google_drive_connector.creds,
+            user_email,
+        )
+
+        for folder in get_modified_folders(
+            service=drive_service,
+        ):
+            folder_id = folder["id"]
+            if folder_id in seen_folder_ids:
+                logger.debug(f"Folder {folder_id} has already been seen. Skipping.")
+                continue
+
+            seen_folder_ids.add(folder_id)
+
+            # Check if the folder has permission IDs but no permissions
+            permission_ids = folder.get("permissionIds", [])
+            raw_permissions = folder.get("permissions", [])
+
+            if not raw_permissions and permission_ids:
+                # Fetch permissions using the IDs
+                permissions = get_folder_permissions_by_ids(
+                    drive_service, folder_id, permission_ids
+                )
+            else:
+                permissions = [
+                    GoogleDrivePermission.from_drive_permission(permission)
+                    for permission in raw_permissions
+                ]
+
+            # Don't include inherited permissions, those will be captured
+            # by the folder/shared drive itself
+            permissions = [
+                permission
+                for permission in permissions
+                if permission.inherited_from is None
+            ]
+
+            if not permissions and skip_folders_without_permissions:
+                logger.debug(f"Folder {folder_id} has no permissions. Skipping.")
+                continue
+
+            all_folders.append(
+                FolderInfo(
+                    id=folder_id,
+                    permissions=permissions,
+                )
+            )
+
+    failed_count = 0
+    user_emails = google_drive_connector._get_all_user_emails()
+    for user_email in user_emails:
+        try:
+            _get_all_folders_for_user(
+                google_drive_connector, skip_folders_without_permissions, user_email
+            )
+        except Exception:
+            logger.exception(f"Error getting folders for user {user_email}")
+            failed_count += 1
+
+            if failed_count > MAX_FAILED_PERCENTAGE * len(user_emails):
+                raise RuntimeError("Too many failed folder fetches during group sync")
+
+    return all_folders
+
+
+def _drive_folder_to_onyx_group(
+    folder: FolderInfo,
+    group_email_to_member_emails_map: dict[str, list[str]],
+) -> ExternalUserGroup:
+    """
+    Converts a folder into an Onyx group.
+    """
+    anyone_can_access = False
+    folder_member_emails: set[str] = set()
+
+    for permission in folder.permissions:
+        if permission.type == PermissionType.USER:
+            if permission.email_address is None:
+                logger.warning(
+                    f"User email is None for folder {folder.id} permission {permission}"
+                )
+                continue
+            folder_member_emails.add(permission.email_address)
+        elif permission.type == PermissionType.GROUP:
+            if permission.email_address not in group_email_to_member_emails_map:
+                logger.warning(
+                    f"Group email {permission.email_address} for folder {folder.id} "
+                    "not found in group_email_to_member_emails_map"
+                )
+                continue
+            folder_member_emails.update(
+                group_email_to_member_emails_map[permission.email_address]
+            )
+        elif permission.type == PermissionType.ANYONE:
+            anyone_can_access = True
+
+    return ExternalUserGroup(
+        id=folder.id,
+        user_emails=list(folder_member_emails),
+        gives_anyone_access=anyone_can_access,
+    )
+
+
+"""Individual Shared Drive / My Drive Permission Sync"""
+
+
 def _get_drive_members(
    google_drive_connector: GoogleDriveConnector,
+    admin_service: AdminService,
 ) -> dict[str, tuple[set[str], set[str]]]:
    """
    This builds a map of drive ids to their members (group and user emails).
@@ -20,6 +173,8 @@ def _get_drive_members(
        "drive_id_2": ({"group_email_3"}, {"user_email_3"}),
    }
    """
+
+    # fetches shared drives only
    drive_ids = google_drive_connector.get_all_drive_ids()

    drive_id_to_members_map: dict[str, tuple[set[str], set[str]]] = {}
@@ -28,25 +183,73 @@ def _get_drive_members(
        google_drive_connector.primary_admin_email,
    )

+    admin_user_info = (
+        admin_service.users()
+        .get(userKey=google_drive_connector.primary_admin_email)
+        .execute()
+    )
+    is_admin = admin_user_info.get("isAdmin", False) or admin_user_info.get(
+        "isDelegatedAdmin", False
+    )
+
    for drive_id in drive_ids:
        group_emails: set[str] = set()
        user_emails: set[str] = set()
-        for permission in execute_paginated_retrieval(
-            drive_service.permissions().list,
-            list_key="permissions",
-            fileId=drive_id,
-            fields="permissions(emailAddress, type)",
-            supportsAllDrives=True,
-        ):
-            if permission["type"] == "group":
-                group_emails.add(permission["emailAddress"])
-            elif permission["type"] == "user":
-                user_emails.add(permission["emailAddress"])
+
+        try:
+            for permission in execute_paginated_retrieval(
+                drive_service.permissions().list,
+                list_key="permissions",
+                fileId=drive_id,
+                fields="permissions(emailAddress, type),nextPageToken",
+                supportsAllDrives=True,
+                # can only set `useDomainAdminAccess` to true if the user
+                # is an admin
+                useDomainAdminAccess=is_admin,
+            ):
+                # NOTE: don't need to check for PermissionType.ANYONE since
+                # you can't share a drive with the internet
+                if permission["type"] == PermissionType.GROUP:
+                    group_emails.add(permission["emailAddress"])
+                elif permission["type"] == PermissionType.USER:
+                    user_emails.add(permission["emailAddress"])
+        except HttpError as e:
+            if e.status_code == 404:
+                logger.warning(
+                    f"Error getting permissions for drive id {drive_id}. "
+                    f"User '{google_drive_connector.primary_admin_email}' likely "
+                    f"does not have access to this drive. Exception: {e}"
+                )
+            else:
+                raise e
+
        drive_id_to_members_map[drive_id] = (group_emails, user_emails)
    return drive_id_to_members_map


-def _get_all_groups(
+def _drive_member_map_to_onyx_groups(
+    drive_id_to_members_map: dict[str, tuple[set[str], set[str]]],
+    group_email_to_member_emails_map: dict[str, list[str]],
+) -> Generator[ExternalUserGroup, None, None]:
+    """The `user_emails` for the Shared Drive should be all individuals in the
+    Shared Drive + the union of all flattened group emails."""
+    for drive_id, (group_emails, user_emails) in drive_id_to_members_map.items():
+        drive_member_emails: set[str] = user_emails
+        for group_email in group_emails:
+            if group_email not in group_email_to_member_emails_map:
+                logger.warning(
+                    f"Group email {group_email} for drive {drive_id} not found in "
+                    "group_email_to_member_emails_map"
+                )
+                continue
+            drive_member_emails.update(group_email_to_member_emails_map[group_email])
+        yield ExternalUserGroup(
+            id=drive_id,
+            user_emails=list(drive_member_emails),
+        )
+
+
+def _get_all_google_groups(
    admin_service: AdminService,
    google_domain: str,
 ) -> set[str]:
@@ -58,12 +261,34 @@ def _get_all_groups(
        admin_service.groups().list,
        list_key="groups",
        domain=google_domain,
-        fields="groups(email)",
+        fields="groups(email),nextPageToken",
    ):
        group_emails.add(group["email"])
    return group_emails


+def _google_group_to_onyx_group(
+    admin_service: AdminService,
+    group_email: str,
+) -> ExternalUserGroup:
+    """
+    Fetches the member emails of a single Google group and wraps them in an Onyx group.
+    """
+    group_member_emails: set[str] = set()
+    for member in execute_paginated_retrieval(
+        admin_service.members().list,
+        list_key="members",
+        groupKey=group_email,
+        fields="members(email),nextPageToken",
+    ):
+        group_member_emails.add(member["email"])
+
+    return ExternalUserGroup(
+        id=group_email,
+        user_emails=list(group_member_emails),
+    )
+
+
 def _map_group_email_to_member_emails(
    admin_service: AdminService,
    group_emails: set[str],
@@ -78,7 +303,7 @@ def _map_group_email_to_member_emails(
            admin_service.members().list,
            list_key="members",
            groupKey=group_email,
-            fields="members(email)",
+            fields="members(email),nextPageToken",
        ):
            group_member_emails.add(member["email"])

@@ -89,6 +314,7 @@ def _map_group_email_to_member_emails(
 def _build_onyx_groups(
    drive_id_to_members_map: dict[str, tuple[set[str], set[str]]],
    group_email_to_member_emails_map: dict[str, set[str]],
+    folder_info: list[FolderInfo],
 ) -> list[ExternalUserGroup]:
    onyx_groups: list[ExternalUserGroup] = []

@@ -96,13 +322,52 @@ def _build_onyx_groups(
    # This is because having drive level access means you have
    # irrevocable access to all the files in the drive.
    for drive_id, (group_emails, user_emails) in drive_id_to_members_map.items():
-        all_member_emails: set[str] = user_emails
+        drive_member_emails: set[str] = user_emails
        for group_email in group_emails:
-            all_member_emails.update(group_email_to_member_emails_map[group_email])
+            if group_email not in group_email_to_member_emails_map:
+                logger.warning(
+                    f"Group email {group_email} for drive {drive_id} not found in "
+                    "group_email_to_member_emails_map"
+                )
+                continue
+            drive_member_emails.update(group_email_to_member_emails_map[group_email])
        onyx_groups.append(
            ExternalUserGroup(
                id=drive_id,
-                user_emails=list(all_member_emails),
+                user_emails=list(drive_member_emails),
+            )
+        )
+
+    # Convert all folder permissions to onyx groups
+    for folder in folder_info:
+        anyone_can_access = False
+        folder_member_emails: set[str] = set()
+        for permission in folder.permissions:
+            if permission.type == PermissionType.USER:
+                if permission.email_address is None:
+                    logger.warning(
+                        f"User email is None for folder {folder.id} permission {permission}"
+                    )
+                    continue
+                folder_member_emails.add(permission.email_address)
+            elif permission.type == PermissionType.GROUP:
+                if permission.email_address not in group_email_to_member_emails_map:
+                    logger.warning(
+                        f"Group email {permission.email_address} for folder {folder.id} "
+                        "not found in group_email_to_member_emails_map"
+                    )
+                    continue
+                folder_member_emails.update(
+                    group_email_to_member_emails_map[permission.email_address]
+                )
+            elif permission.type == PermissionType.ANYONE:
+                anyone_can_access = True
+
+        onyx_groups.append(
+            ExternalUserGroup(
+                id=folder.id,
+                user_emails=list(folder_member_emails),
+                gives_anyone_access=anyone_can_access,
            )
        )

@@ -121,7 +386,7 @@ def _build_onyx_groups(
 def gdrive_group_sync(
    tenant_id: str,
    cc_pair: ConnectorCredentialPair,
-) -> list[ExternalUserGroup]:
+) -> Generator[ExternalUserGroup, None, None]:
    # Initialize connector and build credential/service objects
    google_drive_connector = GoogleDriveConnector(
        **cc_pair.connector.connector_specific_config
@@ -132,22 +397,30 @@ def gdrive_group_sync(
    )

    # Get all drive members
-    drive_id_to_members_map = _get_drive_members(google_drive_connector)
+    drive_id_to_members_map = _get_drive_members(google_drive_connector, admin_service)

    # Get all group emails
-    all_group_emails = _get_all_groups(
+    all_group_emails = _get_all_google_groups(
        admin_service, google_drive_connector.google_domain
    )

-    # Map group emails to their members
-    group_email_to_member_emails_map = _map_group_email_to_member_emails(
-        admin_service, all_group_emails
-    )
+    # Each google group is an Onyx group, yield those
+    group_email_to_member_emails_map: dict[str, list[str]] = {}
+    for group_email in all_group_emails:
+        onyx_group = _google_group_to_onyx_group(admin_service, group_email)
+        group_email_to_member_emails_map[group_email] = onyx_group.user_emails
+        yield onyx_group

-    # Convert the maps to onyx groups
-    onyx_groups = _build_onyx_groups(
-        drive_id_to_members_map=drive_id_to_members_map,
-        group_email_to_member_emails_map=group_email_to_member_emails_map,
-    )
+    # Each drive is a group, yield those
+    for onyx_group in _drive_member_map_to_onyx_groups(
+        drive_id_to_members_map, group_email_to_member_emails_map
+    ):
+        yield onyx_group

-    return onyx_groups
+    # Get all folder permissions
+    folder_info = _get_all_folders(
+        google_drive_connector=google_drive_connector,
+        skip_folders_without_permissions=True,
+    )
+    for folder in folder_info:
+        yield _drive_folder_to_onyx_group(folder, group_email_to_member_emails_map)
--- a/backend/ee/onyx/external_permissions/google_drive/models.py
+++ b/backend/ee/onyx/external_permissions/google_drive/models.py
@@ -0,0 +1,64 @@
+from enum import Enum
+from typing import Any
+
+from pydantic import BaseModel
+
+
+# Allowed values for the `type` field of a Drive permission.
+# Members also subclass `str`, so each compares equal to its raw string value.
+class PermissionType(str, Enum):
+    USER = "user"
+    GROUP = "group"
+    DOMAIN = "domain"
+    ANYONE = "anyone"
+
+
+# A single entry of a Drive permission's `permissionDetails` list.
+class GoogleDrivePermissionDetails(BaseModel):
+    # this is "file", "member", etc.
+    # different from the `type` field within `GoogleDrivePermission`
+    # Sometimes can be None, although not sure why...
+    permission_type: str | None
+    # this is "reader", "writer", "owner", etc.
+    role: str
+    # raw `inheritedFrom` value: the id this permission is inherited from
+    # NOTE(review): presumably the id of the parent item — confirm against the Drive API docs
+    inherited_from: str | None
+
+
+# One permission entry of a Google Drive item, parsed from the raw API dict.
+class GoogleDrivePermission(BaseModel):
+    id: str
+    # Drive identifies groups by email address as well;
+    # domain/global permissions carry no email address (None)
+    email_address: str | None
+    type: PermissionType
+    domain: str | None  # only set for domain permissions
+    permission_details: GoogleDrivePermissionDetails | None
+
+    @classmethod
+    def from_drive_permission(
+        cls, drive_permission: dict[str, Any]
+    ) -> "GoogleDrivePermission":
+        """
+        Builds a `GoogleDrivePermission` from a raw Drive permission dict.
+
+        `id` and `type` are required keys; `emailAddress`, `domain` and
+        `permissionDetails` are optional. Only the first `permissionDetails`
+        entry is kept.
+        """
+        # Details seem to be returned only for inherited permissions, and there
+        # can be several when a permission is inherited from multiple places.
+        raw_details_list = drive_permission.get("permissionDetails", [])
+        first_details: dict[str, Any] | None = None
+        if raw_details_list:
+            first_details = raw_details_list[0]
+
+        perm_id = drive_permission["id"]
+        email = drive_permission.get("emailAddress")
+        perm_type = PermissionType(drive_permission["type"])
+        perm_domain = drive_permission.get("domain")
+
+        # A missing or empty details entry maps to `None`.
+        details_model: GoogleDrivePermissionDetails | None = None
+        if first_details:
+            details_model = GoogleDrivePermissionDetails(
+                permission_type=first_details.get("type"),
+                role=first_details.get("role", ""),
+                inherited_from=first_details.get("inheritedFrom"),
+            )
+
+        return cls(
+            id=perm_id,
+            email_address=email,
+            type=perm_type,
+            domain=perm_domain,
+            permission_details=details_model,
+        )
+
+    @property
+    def inherited_from(self) -> str | None:
+        """Shortcut for `permission_details.inherited_from`; `None` without details."""
+        details = self.permission_details
+        return details.inherited_from if details else None
--- a/backend/ee/onyx/external_permissions/google_drive/permission_retrieval.py
+++ b/backend/ee/onyx/external_permissions/google_drive/permission_retrieval.py
@@ -0,0 +1,62 @@
+from retry import retry
+
+from ee.onyx.external_permissions.google_drive.models import GoogleDrivePermission
+from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
+from onyx.connectors.google_utils.resources import GoogleDriveService
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+@retry(tries=3, delay=2, backoff=2)
+def get_permissions_by_ids(
+    drive_service: GoogleDriveService,
+    doc_id: str,
+    permission_ids: list[str],
+) -> list[GoogleDrivePermission]:
+    """
+    Fetches permissions for a document based on a list of permission IDs.
+
+    All permissions of the document are listed and then filtered locally
+    against `permission_ids`. The function is wrapped in `@retry`, so a
+    raised exception triggers up to 3 attempts in total.
+
+    Args:
+        drive_service: The Google Drive service instance
+        doc_id: The ID of the document to fetch permissions for
+        permission_ids: A list of permission IDs to filter by
+
+    Returns:
+        A list of GoogleDrivePermission objects matching the provided permission IDs.
+        Empty when `permission_ids` is empty (no API call is made in that case).
+    """
+    if not permission_ids:
+        return []
+
+    # Create a set for faster lookup
+    permission_id_set = set(permission_ids)
+
+    # Fetch all permissions for the document
+    # NOTE(review): `continue_on_404_or_403` presumably makes the helper tolerate
+    # 404/403 responses instead of raising — confirm in `execute_paginated_retrieval`
+    fetched_permissions = execute_paginated_retrieval(
+        retrieval_function=drive_service.permissions().list,
+        list_key="permissions",
+        fileId=doc_id,
+        fields="permissions(id, emailAddress, type, domain, permissionDetails),nextPageToken",
+        supportsAllDrives=True,
+        continue_on_404_or_403=True,
+    )
+
+    # Filter permissions by ID and convert to GoogleDrivePermission objects
+    filtered_permissions = [
+        GoogleDrivePermission.from_drive_permission(permission)
+        for permission in fetched_permissions
+        if permission.get("id") in permission_id_set
+    ]
+
+    # Log if we couldn't find all requested permission IDs.
+    # Compare ID sets rather than list lengths: duplicates in `permission_ids`
+    # would otherwise trigger a warning with an empty "missing" set.
+    missing_ids = permission_id_set - {p.id for p in filtered_permissions}
+    if missing_ids:
+        logger.warning(
+            f"Could not find all requested permission IDs for document {doc_id}. "
+            f"Missing IDs: {missing_ids}"
+        )
+
+    return filtered_permissions
--- a/backend/ee/onyx/external_permissions/jira/__init__.py
+++ b/backend/ee/onyx/external_permissions/jira/__init__.py
--- a/backend/ee/onyx/external_permissions/jira/doc_sync.py
+++ b/backend/ee/onyx/external_permissions/jira/doc_sync.py
@@ -0,0 +1,36 @@
+from collections.abc import Generator
+
+from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
+from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
+from ee.onyx.external_permissions.utils import generic_doc_sync
+from onyx.access.models import DocExternalAccess
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.jira.connector import JiraConnector
+from onyx.db.models import ConnectorCredentialPair
+from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+# Passed as the `label` argument to `generic_doc_sync` by `jira_doc_sync`.
+JIRA_DOC_SYNC_TAG = "jira_doc_sync"
+
+
+def jira_doc_sync(
+    cc_pair: ConnectorCredentialPair,
+    fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
+    fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
+    callback: IndexingHeartbeatInterface | None = None,
+) -> Generator[DocExternalAccess, None, None]:
+    """
+    Yields the external access of JIRA documents for the given cc-pair.
+
+    A `JiraConnector` is built from the connector's stored config, loaded with
+    the pair's credentials, and handed to `generic_doc_sync`, which produces
+    the yielded values.
+
+    Args:
+        cc_pair: Connector/credential pair to sync.
+        fetch_all_existing_docs_fn: Accepted but not used by this function.
+        fetch_all_existing_docs_ids_fn: Forwarded to `generic_doc_sync`.
+        callback: Optional heartbeat, forwarded to `generic_doc_sync`.
+    """
+    connector_kwargs = cc_pair.connector.connector_specific_config
+    slim_connector = JiraConnector(**connector_kwargs)
+    slim_connector.load_credentials(cc_pair.credential.credential_json)
+
+    yield from generic_doc_sync(
+        cc_pair=cc_pair,
+        slim_connector=slim_connector,
+        doc_source=DocumentSource.JIRA,
+        label=JIRA_DOC_SYNC_TAG,
+        fetch_all_existing_docs_ids_fn=fetch_all_existing_docs_ids_fn,
+        callback=callback,
+    )
--- a/backend/ee/onyx/external_permissions/jira/models.py
+++ b/backend/ee/onyx/external_permissions/jira/models.py
@@ -0,0 +1,25 @@
+from typing import Any
+
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic.alias_generators import to_camel
+
+
+# Raw "holder" payload of a JIRA permission, kept as an untyped dict.
+Holder = dict[str, Any]
+
+
+# One permission entry of a JIRA permission scheme.
+class Permission(BaseModel):
+    id: int
+    # name of the permission being granted
+    permission: str
+    # raw holder payload (who the permission is granted to); may be None
+    holder: Holder | None
+
+
+# Subset of a JIRA user payload; all four fields are required.
+class User(BaseModel):
+    account_id: str
+    email_address: str
+    display_name: str
+    active: bool
+
+    # Field aliases are generated with `to_camel`, so input is validated
+    # against camelCase keys (`accountId`, `emailAddress`, `displayName`).
+    model_config = ConfigDict(
+        alias_generator=to_camel,
+    )
--- a/backend/ee/onyx/external_permissions/jira/page_access.py
+++ b/backend/ee/onyx/external_permissions/jira/page_access.py
@@ -0,0 +1,209 @@
+from collections import defaultdict
+
+from jira import JIRA
+from jira.resources import PermissionScheme
+from pydantic import ValidationError
+
+from ee.onyx.external_permissions.jira.models import Holder
+from ee.onyx.external_permissions.jira.models import Permission
+from ee.onyx.external_permissions.jira.models import User
+from onyx.access.models import ExternalAccess
+from onyx.utils.logger import setup_logger
+
+# Maps a holder "type" string (e.g. "user", "projectRole") to all holders of that type.
+HolderMap = dict[str, list[Holder]]
+
+
+logger = setup_logger()
+
+
+def _build_holder_map(permissions: list[dict]) -> HolderMap:
+    """
+    Groups the holders of a project's "BROWSE_PROJECTS" permissions by holder type.
+
+    A "Holder" in JIRA is a person / entity who "holds" the corresponding permission.
+    It can have different types. They can be one of (but not limited to):
+        - user (an explicitly whitelisted user)
+        - projectRole (for project level "roles")
+        - reporter (the reporter of an issue)
+
+    A "Holder" usually has following structure:
+        - `{ "type": "user", "value": "$USER_ID", "user": { .. }, .. }`
+        - `{ "type": "projectRole", "value": "$PROJECT_ID", ..  }`
+
+    When we fetch the PermissionScheme from JIRA, we retrieve a list of "Holder"s.
+    The list of "Holder"s can have multiple "Holder"s of the same type in the list (e.g., you can have two `"type": "user"`s in
+    there, each corresponding to a different user).
+    This function constructs a map of "Holder" types to a list of the "Holder"s which contained that type.
+
+    Args:
+        permissions: Entries of a JIRA permission scheme. Despite the `list[dict]`
+            annotation, each entry is read through its `.raw` attribute; entries
+            without one are logged and skipped.
+
+    Returns:
+        A dict from the "Holder" type to the list of "Holder"s of that type.
+        Only holders of "BROWSE_PROJECTS" permissions are included; permissions
+        without a holder, or whose holder has no type, are logged and skipped.
+
+    Example:
+        ```
+        {
+            "user": [
+                { "type": "user", "value": "10000", "user": { .. }, .. },
+                { "type": "user", "value": "10001", "user": { .. }, .. },
+            ],
+            "projectRole": [
+                { "type": "projectRole", "value": "10010", ..  },
+                { "type": "projectRole", "value": "10011", ..  },
+            ],
+            "applicationRole": [
+                { "type": "applicationRole" },
+            ],
+            ..
+        }
+        ```
+    """
+
+    holder_map: defaultdict[str, list[Holder]] = defaultdict(list)
+
+    for raw_perm in permissions:
+        if not hasattr(raw_perm, "raw"):
+            logger.warning(f"Expected a 'raw' field, but none was found: {raw_perm=}")
+            continue
+
+        permission = Permission(**raw_perm.raw)
+
+        # We only care about ability to browse through projects + issues (not other permissions such as read/write).
+        if permission.permission != "BROWSE_PROJECTS":
+            continue
+
+        # In order to associate this permission to some Atlassian entity, we need the "Holder".
+        # If this doesn't exist, then we cannot associate this permission to anyone; just skip.
+        if not permission.holder:
+            logger.warning(
+                f"Expected to find a permission holder, but none was found: {permission=}"
+            )
+            continue
+
+        # Named `holder_type` so the builtin `type` is not shadowed.
+        holder_type = permission.holder.get("type")
+        if not holder_type:
+            logger.warning(
+                f"Expected to find the type of permission holder, but none was found: {permission=}"
+            )
+            continue
+
+        holder_map[holder_type].append(permission.holder)
+
+    return holder_map
+
+
+def _get_user_emails(user_holders: list[Holder]) -> list[str]:
+    """
+    Extracts the email address of every "user" holder that embeds a valid user.
+
+    Holders without a "user" key are skipped silently. Holders whose "user"
+    payload fails validation against `User` are logged as errors and skipped.
+    """
+    collected: list[str] = []
+
+    for holder in user_holders:
+        if "user" not in holder:
+            continue
+
+        # The local keeps this exact name because the log line below echoes it
+        # through the `{raw_user_dict=}` f-string specifier.
+        raw_user_dict = holder["user"]
+        try:
+            parsed_user = User.model_validate(raw_user_dict)
+        except ValidationError:
+            logger.error(
+                "Expected to be able to serialize the raw-user-dict into an instance of `User`, but validation failed;"
+                f"{raw_user_dict=}"
+            )
+        else:
+            collected.append(parsed_user.email_address)
+
+    return collected
+
+
+def _get_user_emails_from_project_roles(
+    jira_client: JIRA,
+    jira_project: str,
+    project_role_holders: list[Holder],
+) -> list[str]:
+    """
+    Resolves the email addresses of the users acting in the given project roles.
+
+    Each holder with a "value" key is looked up as a project role; every actor
+    of that role that exposes `actorUser.accountId` is then fetched as a user.
+    Only users whose `accountType` is "atlassian" are kept. Users without an
+    `emailAddress` are logged and skipped.
+
+    Args:
+        jira_client: Client used for the role and user lookups.
+        jira_project: Project the roles belong to.
+        project_role_holders: Holders of type "projectRole"; the role id is read
+            from each holder's "value" key.
+
+    Returns:
+        The collected email addresses. The list is not de-duplicated, so a user
+        acting in several roles appears once per role.
+    """
+    # NOTE (@raunakab) a `parallel_yield` may be helpful here...?
+    roles = [
+        jira_client.project_role(project=jira_project, id=project_role_holder["value"])
+        for project_role_holder in project_role_holders
+        if "value" in project_role_holder
+    ]
+
+    emails = []
+
+    for role in roles:
+        if not hasattr(role, "actors"):
+            continue
+
+        for actor in role.actors:
+            if not hasattr(actor, "actorUser") or not hasattr(
+                actor.actorUser, "accountId"
+            ):
+                continue
+
+            user = jira_client.user(id=actor.actorUser.accountId)
+            if not hasattr(user, "accountType") or user.accountType != "atlassian":
+                continue
+
+            if not hasattr(user, "emailAddress"):
+                msg = f"User's email address was not able to be retrieved;  {actor.actorUser.accountId=}"
+                # Read the name from `user`, the object the guard actually checks;
+                # `actor` is not guaranteed to expose `displayName`.
+                if hasattr(user, "displayName"):
+                    msg += f" {user.displayName=}"
+                logger.warning(msg)
+                continue
+
+            emails.append(user.emailAddress)
+
+    return emails
+
+
+def _build_external_access_from_holder_map(
+    jira_client: JIRA, jira_project: str, holder_map: HolderMap
+) -> ExternalAccess:
+    """
+    Translates a holder map into the `ExternalAccess` of a JIRA project.
+
+    # Note:
+        A holder of type "anyone" marks the project as public; no emails are
+        collected in that case. Otherwise access is the union of the emails of
+        "user" holders and of the users behind "projectRole" holders.
+        No external group ids are emitted in either case.
+    """
+
+    # Public project: nothing else needs to be resolved.
+    if "anyone" in holder_map:
+        return ExternalAccess(
+            external_user_emails=set(), external_user_group_ids=set(), is_public=True
+        )
+
+    direct_emails: list[str] = []
+    if "user" in holder_map:
+        direct_emails = _get_user_emails(user_holders=holder_map["user"])
+
+    role_emails: list[str] = []
+    if "projectRole" in holder_map:
+        role_emails = _get_user_emails_from_project_roles(
+            jira_client=jira_client,
+            jira_project=jira_project,
+            project_role_holders=holder_map["projectRole"],
+        )
+
+    return ExternalAccess(
+        external_user_emails={*direct_emails, *role_emails},
+        external_user_group_ids=set(),
+        is_public=False,
+    )
+
+
+def get_project_permissions(
+    jira_client: JIRA,
+    jira_project: str,
+) -> ExternalAccess | None:
+    """
+    Computes the external access of a JIRA project from its permission scheme.
+
+    Returns:
+        The project's `ExternalAccess`, or `None` when the fetched scheme has no
+        `permissions` attribute or that attribute is not a list.
+    """
+    scheme: PermissionScheme = jira_client.project_permissionscheme(
+        project=jira_project
+    )
+
+    # A missing attribute falls back to `None`, which fails the list check too.
+    scheme_permissions = getattr(scheme, "permissions", None)
+    if not isinstance(scheme_permissions, list):
+        return None
+
+    return _build_external_access_from_holder_map(
+        jira_client=jira_client,
+        jira_project=jira_project,
+        holder_map=_build_holder_map(permissions=scheme_permissions),
+    )
--- a/Show More
+++ b/Show More