more experimentation changes

experimentation
Fit Score & more suitable rewrite
2026-02-27 04:35:50 +00:00 · 2024-12-13 12:57:58 -08:00 · 2024-12-09 11:14:18 -08:00 · 2024-12-08 09:20:03 -08:00 · 2024-12-07 22:07:48 -08:00 · 2024-12-07 12:25:54 -08:00
844 changed files with 47025 additions and 27381 deletions
--- a/.github/workflows/docker-build-push-backend-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-backend-container-on-tag.yml
@@ -3,61 +3,61 @@ name: Build and Push Backend Image on Tag
 on:
  push:
    tags:
-      - '*'
+      - "*"

 env:
-  REGISTRY_IMAGE: danswer/danswer-backend
+  REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'danswer/danswer-backend-cloud' || 'danswer/danswer-backend' }}
  LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
-  
+
 jobs:
  build-and-push:
-    # TODO: investigate a matrix build like the web container 
+    # TODO: investigate a matrix build like the web container
    # See https://runs-on.com/runners/linux/
-    runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}"]
+    runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"]

    steps:
-    - name: Checkout code
-      uses: actions/checkout@v4
+      - name: Checkout code
+        uses: actions/checkout@v4

-    - name: Set up Docker Buildx
-      uses: docker/setup-buildx-action@v3
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3

-    - name: Login to Docker Hub
-      uses: docker/login-action@v3
-      with:
-        username: ${{ secrets.DOCKER_USERNAME }}
-        password: ${{ secrets.DOCKER_TOKEN }}
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_TOKEN }}

-    - name: Install build-essential
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y build-essential
-          
-    - name: Backend Image Docker Build and Push
-      uses: docker/build-push-action@v5
-      with:
-        context: ./backend
-        file: ./backend/Dockerfile
-        platforms: linux/amd64,linux/arm64
-        push: true
-        tags: |
-          ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
-          ${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
-        build-args: |
-          DANSWER_VERSION=${{ github.ref_name }}
+      - name: Install build-essential
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential

-    # trivy has their own rate limiting issues causing this action to flake
-    # we worked around it by hardcoding to different db repos in env
-    # can re-enable when they figure it out
-    # https://github.com/aquasecurity/trivy/discussions/7538
-    # https://github.com/aquasecurity/trivy-action/issues/389
-    - name: Run Trivy vulnerability scanner
-      uses: aquasecurity/trivy-action@master
-      env:
-        TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2'
-        TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1'
-      with:
-        # To run locally: trivy image --severity HIGH,CRITICAL danswer/danswer-backend
-        image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
-        severity: 'CRITICAL,HIGH'
-        trivyignores: ./backend/.trivyignore
+      - name: Backend Image Docker Build and Push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./backend
+          file: ./backend/Dockerfile
+          platforms: linux/amd64,linux/arm64
+          push: true
+          tags: |
+            ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
+            ${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
+          build-args: |
+            DANSWER_VERSION=${{ github.ref_name }}
+
+      # trivy has their own rate limiting issues causing this action to flake
+      # we worked around it by hardcoding to different db repos in env
+      # can re-enable when they figure it out
+      # https://github.com/aquasecurity/trivy/discussions/7538
+      # https://github.com/aquasecurity/trivy-action/issues/389
+      - name: Run Trivy vulnerability scanner
+        uses: aquasecurity/trivy-action@master
+        env:
+          TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
+          TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
+        with:
+          # To run locally: trivy image --severity HIGH,CRITICAL danswer/danswer-backend
+          image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
+          severity: "CRITICAL,HIGH"
+          trivyignores: ./backend/.trivyignore
--- a/.github/workflows/docker-build-push-cloud-web-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-cloud-web-container-on-tag.yml
@@ -4,12 +4,12 @@ name: Build and Push Cloud Web Image on Tag
 on:
  push:
    tags:
-      - '*'
+      - "*"

 env:
-  REGISTRY_IMAGE: danswer/danswer-cloud-web-server
+  REGISTRY_IMAGE: danswer/danswer-web-server-cloud
  LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
-  
+
 jobs:
  build:
    runs-on:
@@ -28,11 +28,11 @@ jobs:
      - name: Prepare
        run: |
          platform=${{ matrix.platform }}
-          echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV          
-      
+          echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
+
      - name: Checkout
        uses: actions/checkout@v4
-      
+
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
@@ -41,16 +41,16 @@ jobs:
          tags: |
            type=raw,value=${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
            type=raw,value=${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
-      
+
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
-      
+
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_TOKEN }}
-    
+
      - name: Build and push by digest
        id: build
        uses: docker/build-push-action@v5
@@ -65,17 +65,18 @@ jobs:
            NEXT_PUBLIC_POSTHOG_KEY=${{ secrets.POSTHOG_KEY }}
            NEXT_PUBLIC_POSTHOG_HOST=${{ secrets.POSTHOG_HOST }}
            NEXT_PUBLIC_SENTRY_DSN=${{ secrets.SENTRY_DSN }}
-          # needed due to weird interactions with the builds for different platforms  
+            NEXT_PUBLIC_GTM_ENABLED=true
+          # needed due to weird interactions with the builds for different platforms
          no-cache: true
          labels: ${{ steps.meta.outputs.labels }}
          outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true
-      
+
      - name: Export digest
        run: |
          mkdir -p /tmp/digests
          digest="${{ steps.build.outputs.digest }}"
-          touch "/tmp/digests/${digest#sha256:}"          
-      
+          touch "/tmp/digests/${digest#sha256:}"
+
      - name: Upload digest
        uses: actions/upload-artifact@v4
        with:
@@ -95,42 +96,42 @@ jobs:
          path: /tmp/digests
          pattern: digests-*
          merge-multiple: true
-      
+
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
-      
+
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY_IMAGE }}
-      
+
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_TOKEN }}
-      
+
      - name: Create manifest list and push
        working-directory: /tmp/digests
        run: |
          docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
-            $(printf '${{ env.REGISTRY_IMAGE }}@sha256:%s ' *)          
-      
+            $(printf '${{ env.REGISTRY_IMAGE }}@sha256:%s ' *)
+
      - name: Inspect image
        run: |
          docker buildx imagetools inspect ${{ env.REGISTRY_IMAGE }}:${{ steps.meta.outputs.version }}

-    # trivy has their own rate limiting issues causing this action to flake
-    # we worked around it by hardcoding to different db repos in env
-    # can re-enable when they figure it out
-    # https://github.com/aquasecurity/trivy/discussions/7538
-    # https://github.com/aquasecurity/trivy-action/issues/389
+      # trivy has their own rate limiting issues causing this action to flake
+      # we worked around it by hardcoding to different db repos in env
+      # can re-enable when they figure it out
+      # https://github.com/aquasecurity/trivy/discussions/7538
+      # https://github.com/aquasecurity/trivy-action/issues/389
      - name: Run Trivy vulnerability scanner
        uses: aquasecurity/trivy-action@master
        env:
-          TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2'
-          TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1'
+          TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
+          TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
        with:
          image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
-          severity: 'CRITICAL,HIGH'
+          severity: "CRITICAL,HIGH"
--- a/.github/workflows/docker-build-push-model-server-container-on-tag.yml
+++ b/.github/workflows/docker-build-push-model-server-container-on-tag.yml
@@ -3,53 +3,53 @@ name: Build and Push Model Server Image on Tag
 on:
  push:
    tags:
-      - '*'
+      - "*"

 env:
-  REGISTRY_IMAGE: danswer/danswer-model-server
+  REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'danswer/danswer-model-server-cloud' || 'danswer/danswer-model-server' }}
  LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
-  
+
 jobs:
  build-and-push:
    # See https://runs-on.com/runners/linux/
-    runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}"]
+    runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"]

    steps:
-    - name: Checkout code
-      uses: actions/checkout@v4
+      - name: Checkout code
+        uses: actions/checkout@v4

-    - name: Set up Docker Buildx
-      uses: docker/setup-buildx-action@v3
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3

-    - name: Login to Docker Hub
-      uses: docker/login-action@v3
-      with:
-        username: ${{ secrets.DOCKER_USERNAME }}
-        password: ${{ secrets.DOCKER_TOKEN }}
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_TOKEN }}

-    - name: Model Server Image Docker Build and Push
-      uses: docker/build-push-action@v5
-      with:
-        context: ./backend
-        file: ./backend/Dockerfile.model_server
-        platforms: linux/amd64,linux/arm64
-        push: true
-        tags: |
-          ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
-          ${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
-        build-args: |
-          DANSWER_VERSION=${{ github.ref_name }}
+      - name: Model Server Image Docker Build and Push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./backend
+          file: ./backend/Dockerfile.model_server
+          platforms: linux/amd64,linux/arm64
+          push: true
+          tags: |
+            ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
+            ${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
+          build-args: |
+            DANSWER_VERSION=${{ github.ref_name }}

-    # trivy has their own rate limiting issues causing this action to flake
-    # we worked around it by hardcoding to different db repos in env
-    # can re-enable when they figure it out
-    # https://github.com/aquasecurity/trivy/discussions/7538
-    # https://github.com/aquasecurity/trivy-action/issues/389
-    - name: Run Trivy vulnerability scanner
-      uses: aquasecurity/trivy-action@master
-      env:
-        TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2'
-        TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1'
-      with:
-        image-ref: docker.io/danswer/danswer-model-server:${{ github.ref_name }}
-        severity: 'CRITICAL,HIGH'
+      # trivy has their own rate limiting issues causing this action to flake
+      # we worked around it by hardcoding to different db repos in env
+      # can re-enable when they figure it out
+      # https://github.com/aquasecurity/trivy/discussions/7538
+      # https://github.com/aquasecurity/trivy-action/issues/389
+      - name: Run Trivy vulnerability scanner
+        uses: aquasecurity/trivy-action@master
+        env:
+          TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
+          TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
+        with:
+          image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} # scan the image this run pushed (cloud or standard)
+          severity: "CRITICAL,HIGH"
--- a/.github/workflows/nightly-scan-licenses.yml
+++ b/.github/workflows/nightly-scan-licenses.yml
@@ -0,0 +1,76 @@
+# Scan for problematic software licenses
+
+# trivy has their own rate limiting issues causing this action to flake
+# we worked around it by hardcoding to different db repos in env
+# can re-enable when they figure it out
+# https://github.com/aquasecurity/trivy/discussions/7538
+# https://github.com/aquasecurity/trivy-action/issues/389
+
+name: 'Nightly - Scan licenses'
+on:
+#   schedule:
+#     - cron: '0 14 * * *'  # Runs every day at 6 AM PST / 7 AM PDT / 2 PM UTC
+  workflow_dispatch:  # Allows manual triggering
+
+permissions:
+  actions: read
+  contents: read
+  security-events: write
+  
+jobs:
+  scan-licenses:
+    # See https://runs-on.com/runners/linux/
+    runs-on: [runs-on,runner=2cpu-linux-x64,"run-id=${{ github.run_id }}"]
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+          cache-dependency-path: |
+            backend/requirements/default.txt
+            backend/requirements/dev.txt
+            backend/requirements/model_server.txt
+      
+      - name: Get explicit and transitive dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
+          pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+          pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt
+          pip freeze > requirements-all.txt
+                    
+      - name: Check python
+        id: license_check_report
+        uses: pilosus/action-pip-license-checker@v2
+        with:
+          requirements: 'requirements-all.txt'
+          fail: 'Copyleft'
+          exclude: '(?i)^(pylint|aio[-_]*).*'
+          
+      - name: Print report
+        if: ${{ always() }}
+        run: echo "${{ steps.license_check_report.outputs.report }}"
+      
+      - name: Install npm dependencies
+        working-directory: ./web
+        run: npm ci
+        
+      - name: Run Trivy vulnerability scanner in repo mode
+        uses: aquasecurity/trivy-action@0.28.0
+        with:
+          scan-type: fs
+          scanners: license
+          format: table
+#           format: sarif
+#           output: trivy-results.sarif
+          severity: HIGH,CRITICAL
+
+#       - name: Upload Trivy scan results to GitHub Security tab
+#         uses: github/codeql-action/upload-sarif@v3
+#         with:
+#           sarif_file: trivy-results.sarif
--- a/.github/workflows/pr-chromatic-tests.yml
+++ b/.github/workflows/pr-chromatic-tests.yml
@@ -0,0 +1,225 @@
+name: Run Chromatic Tests
+concurrency:
+  group: Run-Chromatic-Tests-${{ github.workflow }}-${{ github.head_ref || github.event.workflow_run.head_branch || github.run_id }}
+  cancel-in-progress: true
+
+on: push
+
+env:
+  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+  SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
+jobs:
+  playwright-tests:
+    name: Playwright Tests
+
+    # See https://runs-on.com/runners/linux/
+    runs-on: [runs-on,runner=8cpu-linux-x64,ram=16,"run-id=${{ github.run_id }}"]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+          cache: 'pip'
+          cache-dependency-path: |
+            backend/requirements/default.txt
+            backend/requirements/dev.txt
+            backend/requirements/model_server.txt
+      - run: |
+          python -m pip install --upgrade pip
+          pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
+          pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+          pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt
+        
+      - name: Setup node
+        uses: actions/setup-node@v4
+        with:
+          node-version: 22
+
+      - name: Install node dependencies
+        working-directory: ./web
+        run: npm ci
+
+      - name: Install playwright browsers
+        working-directory: ./web
+        run: npx playwright install --with-deps
+        
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_TOKEN }}
+
+      # tag every docker image with "test" so that we can spin up the correct set
+      # of images during testing
+      
+      # we use the runs-on cache for docker builds
+      # in conjunction with runs-on runners, it has better speed and unlimited caching
+      # https://runs-on.com/caching/s3-cache-for-github-actions/
+      # https://runs-on.com/caching/docker/
+      # https://github.com/moby/buildkit#s3-cache-experimental
+      
+      # images are built and run locally for testing purposes. Not pushed.
+
+      - name: Build Web Docker image
+        uses: ./.github/actions/custom-build-and-push
+        with:
+          context: ./web
+          file: ./web/Dockerfile
+          platforms: linux/amd64
+          tags: danswer/danswer-web-server:test
+          push: false
+          load: true
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/web-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/web-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+
+      - name: Build Backend Docker image
+        uses: ./.github/actions/custom-build-and-push
+        with:
+          context: ./backend
+          file: ./backend/Dockerfile
+          platforms: linux/amd64
+          tags: danswer/danswer-backend:test
+          push: false
+          load: true
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+
+      - name: Build Model Server Docker image
+        uses: ./.github/actions/custom-build-and-push
+        with:
+          context: ./backend
+          file: ./backend/Dockerfile.model_server
+          platforms: linux/amd64
+          tags: danswer/danswer-model-server:test
+          push: false
+          load: true
+          cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
+          cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
+
+      - name: Start Docker containers
+        run: |
+          cd deployment/docker_compose
+          ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \
+          AUTH_TYPE=basic \
+          REQUIRE_EMAIL_VERIFICATION=false \
+          DISABLE_TELEMETRY=true \
+          IMAGE_TAG=test \
+          docker compose -f docker-compose.dev.yml -p danswer-stack up -d
+        id: start_docker
+
+      - name: Wait for service to be ready
+        run: |
+          echo "Starting wait-for-service script..."
+          
+          docker logs -f danswer-stack-api_server-1 &
+
+          start_time=$(date +%s)
+          timeout=300  # 5 minutes in seconds
+          
+          while true; do
+            current_time=$(date +%s)
+            elapsed_time=$((current_time - start_time))
+            
+            if [ $elapsed_time -ge $timeout ]; then
+              echo "Timeout reached. Service did not become ready in 5 minutes."
+              exit 1
+            fi
+            
+            # Use curl with error handling to ignore specific exit code 56
+            response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/health || echo "curl_error")
+            
+            if [ "$response" = "200" ]; then
+              echo "Service is ready!"
+              break
+            elif [ "$response" = "curl_error" ]; then
+              echo "Curl encountered an error, possibly exit code 56. Continuing to retry..."
+            else
+              echo "Service not ready yet (HTTP status $response). Retrying in 5 seconds..."
+            fi
+            
+            sleep 5
+          done
+          echo "Finished waiting for service."
+
+      - name: Run pytest playwright test init
+        working-directory: ./backend
+        env: 
+          PYTEST_IGNORE_SKIP: true
+        run: pytest -s tests/integration/tests/playwright/test_playwright.py
+
+      - name: Run Playwright tests
+        working-directory: ./web
+        run: npx playwright test
+
+      - uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          # Chromatic automatically defaults to the test-results directory.
+          # Replace with the path to your custom directory and adjust the CHROMATIC_ARCHIVE_LOCATION environment variable accordingly.
+          name: test-results
+          path: ./web/test-results
+          retention-days: 30
+                    
+      # save before stopping the containers so the logs can be captured
+      - name: Save Docker logs
+        if: success() || failure()
+        run: |
+          cd deployment/docker_compose
+          docker compose -f docker-compose.dev.yml -p danswer-stack logs > docker-compose.log
+          mv docker-compose.log ${{ github.workspace }}/docker-compose.log
+      
+      - name: Upload logs
+        if: success() || failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: docker-logs
+          path: ${{ github.workspace }}/docker-compose.log
+
+      - name: Stop Docker containers
+        run: |
+          cd deployment/docker_compose
+          docker compose -f docker-compose.dev.yml -p danswer-stack down -v
+
+  chromatic-tests:
+    name: Chromatic Tests
+    
+    needs: playwright-tests
+    runs-on: [runs-on,runner=8cpu-linux-x64,ram=16,"run-id=${{ github.run_id }}"]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          
+      - name: Setup node
+        uses: actions/setup-node@v4
+        with:
+          node-version: 22
+          
+      - name: Install node dependencies
+        working-directory: ./web
+        run: npm ci
+        
+      - name: Download Playwright test results
+        uses: actions/download-artifact@v4
+        with:
+          name: test-results
+          path: ./web/test-results
+          
+      - name: Run Chromatic
+        uses: chromaui/action@latest
+        with:
+          playwright: true
+          projectToken: ${{ secrets.CHROMATIC_PROJECT_TOKEN }}
+          workingDir: ./web
+        env: 
+          CHROMATIC_ARCHIVE_LOCATION: ./test-results
--- a/.github/workflows/pr-helm-chart-testing.yml
+++ b/.github/workflows/pr-helm-chart-testing.yml
@@ -0,0 +1,72 @@
+name: Helm - Lint and Test Charts
+
+on:
+  merge_group:
+  pull_request:
+    branches: [ main ]
+  workflow_dispatch:  # Allows manual triggering
+  
+jobs:
+  helm-chart-check:
+    # See https://runs-on.com/runners/linux/
+    runs-on: [runs-on,runner=8cpu-linux-x64,hdd=256,"run-id=${{ github.run_id }}"]
+
+    # fetch-depth 0 is required for helm/chart-testing-action
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+      with:
+        fetch-depth: 0
+        
+    - name: Set up Helm
+      uses: azure/setup-helm@v4.2.0
+      with:
+        version: v3.14.4
+      
+    - name: Set up chart-testing
+      uses: helm/chart-testing-action@v2.6.1
+
+    # even though we specify chart-dirs in ct.yaml, it isn't used by ct for the list-changed command...
+    - name: Run chart-testing (list-changed)
+      id: list-changed
+      run: |
+        echo "default_branch: ${{ github.event.repository.default_branch }}"
+        changed=$(ct list-changed --remote origin --target-branch ${{ github.event.repository.default_branch }} --chart-dirs deployment/helm/charts)
+        echo "list-changed output: $changed"
+        if [[ -n "$changed" ]]; then
+          echo "changed=true" >> "$GITHUB_OUTPUT"
+        fi
+
+#     rkuo: I don't think we need python?
+#     - name: Set up Python
+#       uses: actions/setup-python@v5
+#       with:
+#         python-version: '3.11'
+#         cache: 'pip'
+#         cache-dependency-path: |
+#           backend/requirements/default.txt
+#           backend/requirements/dev.txt
+#           backend/requirements/model_server.txt
+#     - run: |
+#         python -m pip install --upgrade pip
+#         pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
+#         pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+#         pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt
+
+    # lint all charts if any changes were detected
+    - name: Run chart-testing (lint)
+      if: steps.list-changed.outputs.changed == 'true'
+      run: ct lint --config ct.yaml --all
+      # the following would lint only changed charts, but linting isn't expensive
+      # run: ct lint --config ct.yaml --target-branch ${{ github.event.repository.default_branch }}
+
+    - name: Create kind cluster
+      if: steps.list-changed.outputs.changed == 'true'
+      uses: helm/kind-action@v1.10.0
+
+    - name: Run chart-testing (install)
+      if: steps.list-changed.outputs.changed == 'true'
+      run: ct install --all --helm-extra-set-args="--set=nginx.enabled=false" --debug --config ct.yaml
+      # the following would install only changed charts, but we only have one chart so 
+      # don't worry about that for now
+      # run: ct install --target-branch ${{ github.event.repository.default_branch }}
--- a/.github/workflows/pr-helm-chart-testing.yml.disabled.txt
+++ b/.github/workflows/pr-helm-chart-testing.yml.disabled.txt
@@ -1,68 +0,0 @@
-# This workflow is intentionally disabled while we're still working on it
-# It's close to ready, but a race condition needs to be fixed with
-# API server and Vespa startup, and it needs to have a way to build/test against
-# local containers
-
-name: Helm - Lint and Test Charts
-
-on:
-  merge_group:
-  pull_request:
-    branches: [ main ]
-
-jobs:
-  lint-test:
-    # See https://runs-on.com/runners/linux/
-    runs-on: [runs-on,runner=8cpu-linux-x64,hdd=256,"run-id=${{ github.run_id }}"]
-
-    # fetch-depth 0 is required for helm/chart-testing-action
-    steps:
-    - name: Checkout code
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 0
-        
-    - name: Set up Helm
-      uses: azure/setup-helm@v4.2.0
-      with:
-        version: v3.14.4
-      
-    - name: Set up Python
-      uses: actions/setup-python@v4
-      with:
-        python-version: '3.11'
-        cache: 'pip'
-        cache-dependency-path: |
-          backend/requirements/default.txt
-          backend/requirements/dev.txt
-          backend/requirements/model_server.txt
-    - run: |
-        python -m pip install --upgrade pip
-        pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
-        pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
-        pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt
-
-    - name: Set up chart-testing
-      uses: helm/chart-testing-action@v2.6.1
-
-    - name: Run chart-testing (list-changed)
-      id: list-changed
-      run: |
-        changed=$(ct list-changed --target-branch ${{ github.event.repository.default_branch }})
-        if [[ -n "$changed" ]]; then
-          echo "changed=true" >> "$GITHUB_OUTPUT"
-        fi
-
-    - name: Run chart-testing (lint)
-#       if: steps.list-changed.outputs.changed == 'true'
-      run: ct lint --all --config ct.yaml --target-branch ${{ github.event.repository.default_branch }}
-
-    - name: Create kind cluster
-#       if: steps.list-changed.outputs.changed == 'true'
-      uses: helm/kind-action@v1.10.0
-
-    - name: Run chart-testing (install)
-#       if: steps.list-changed.outputs.changed == 'true'
-      run: ct install --all --config ct.yaml
-#       run: ct install --target-branch ${{ github.event.repository.default_branch }}
-      
--- a/.github/workflows/pr-integration-tests.yml
+++ b/.github/workflows/pr-integration-tests.yml
@@ -13,7 +13,10 @@ on:
 env:
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
-
+  CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
+  CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
+  CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
+  
 jobs:
  integration-tests:
    # See https://runs-on.com/runners/linux/
@@ -195,9 +198,13 @@ jobs:
            -e API_SERVER_HOST=api_server \
            -e OPENAI_API_KEY=${OPENAI_API_KEY} \
            -e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN} \
+            -e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
+            -e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
+            -e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
            -e TEST_WEB_HOSTNAME=test-runner \
            danswer/danswer-integration:test \
-            /app/tests/integration/tests
+            /app/tests/integration/tests \
+            /app/tests/integration/connector_job_tests
        continue-on-error: true
        id: run_tests

@@ -210,17 +217,18 @@ jobs:
            echo "All integration tests passed successfully."
          fi

-      - name: Stop Docker containers
-        run: |
-          cd deployment/docker_compose
-          docker compose -f docker-compose.dev.yml -p danswer-stack down -v
-
+      # save before stopping the containers so the logs can be captured
      - name: Save Docker logs
        if: success() || failure()
        run: |
          cd deployment/docker_compose
          docker compose -f docker-compose.dev.yml -p danswer-stack logs > docker-compose.log
          mv docker-compose.log ${{ github.workspace }}/docker-compose.log
+
+      - name: Stop Docker containers
+        run: |
+          cd deployment/docker_compose
+          docker compose -f docker-compose.dev.yml -p danswer-stack down -v
      
      - name: Upload logs
        if: success() || failure()
--- a/.github/workflows/pr-python-connector-tests.yml
+++ b/.github/workflows/pr-python-connector-tests.yml
@@ -18,6 +18,14 @@ env:
  # Jira
  JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
  JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
+  # Google
+  GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR }}
+  GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1 }}
+  GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR }}
+  GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR }}
+  GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR }}
+  # Slab
+  SLAB_BOT_TOKEN: ${{ secrets.SLAB_BOT_TOKEN }}

 jobs:
  connectors-check:
--- a/.github/workflows/pr-python-model-tests.yml
+++ b/.github/workflows/pr-python-model-tests.yml
@@ -15,7 +15,7 @@ env:
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

 jobs:
-  connectors-check:
+  model-check:
    # See https://runs-on.com/runners/linux/
    runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}"]

--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,4 @@
 .vscode/
 *.sw?
 /backend/tests/regression/answer_quality/search_test_config.yaml
+/web/test-results/
--- a/.vscode/launch.template.jsonc
+++ b/.vscode/launch.template.jsonc
@@ -203,7 +203,7 @@
                "--loglevel=INFO",
                "--hostname=light@%n",
                "-Q",
-                "vespa_metadata_sync,connector_deletion",
+                "vespa_metadata_sync,connector_deletion,doc_permissions_upsert",
            ],
            "presentation": {
 				 "group": "2",
@@ -232,7 +232,7 @@
                "--loglevel=INFO",
                "--hostname=heavy@%n",
                "-Q",
-                "connector_pruning",
+                "connector_pruning,connector_doc_permissions_sync,connector_external_group_sync",
            ],
            "presentation": {
 				 "group": "2",
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -32,7 +32,7 @@ To contribute to this project, please follow the
 When opening a pull request, mention related issues and feel free to tag relevant maintainers.

 Before creating a pull request please make sure that the new changes conform to the formatting and linting requirements.
-See the [Formatting and Linting](#-formatting-and-linting) section for how to run these checks locally.
+See the [Formatting and Linting](#formatting-and-linting) section for how to run these checks locally.


 ### Getting Help 🙋
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
 <!-- DANSWER_METADATA={"link": "https://github.com/danswer-ai/danswer/blob/main/README.md"} -->
+<a name="readme-top"></a>

 <h2 align="center">
 <a href="https://www.danswer.ai/"> <img width="50%" src="https://github.com/danswer-owners/danswer/blob/1fabd9372d66cd54238847197c33f091a724803b/DanswerWithName.png?raw=true)" /></a>
@@ -11,7 +12,7 @@
 <a href="https://docs.danswer.dev/" target="_blank">
    <img src="https://img.shields.io/badge/docs-view-blue" alt="Documentation">
 </a>
-<a href="https://join.slack.com/t/danswer/shared_invite/zt-2lcmqw703-071hBuZBfNEOGUsLa5PXvQ" target="_blank">
+<a href="https://join.slack.com/t/danswer/shared_invite/zt-2twesxdr6-5iQitKZQpgq~hYIZ~dv3KA" target="_blank">
    <img src="https://img.shields.io/badge/slack-join-blue.svg?logo=slack" alt="Slack">
 </a>
 <a href="https://discord.gg/TDJ59cGV2X" target="_blank">
@@ -127,3 +128,19 @@ To try the Danswer Enterprise Edition:

 ## 💡 Contributing
 Looking to contribute? Please check out the [Contribution Guide](CONTRIBUTING.md) for more details.
+
+## ⭐Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=danswer-ai/danswer&type=Date)](https://star-history.com/#danswer-ai/danswer&Date)
+
+## ✨Contributors
+
+<a href="https://github.com/danswer-ai/danswer/graphs/contributors">
+  <img alt="contributors" src="https://contrib.rocks/image?repo=danswer-ai/danswer"/>
+</a>
+
+<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
+    <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
+        ↑ Back to Top ↑
+    </a>
+</p>
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -12,7 +12,6 @@ ARG DANSWER_VERSION=0.8-dev
 ENV DANSWER_VERSION=${DANSWER_VERSION} \
    DANSWER_RUNNING_IN_DOCKER="true"

-ARG CA_CERT_CONTENT=""

 RUN echo "DANSWER_VERSION: ${DANSWER_VERSION}"
 # Install system dependencies
@@ -39,15 +38,6 @@ RUN apt-get update && \
    apt-get clean


-# Conditionally write the CA certificate and update certificates
-RUN if [ -n "$CA_CERT_CONTENT" ]; then \
-    echo "Adding custom CA certificate"; \
-    echo "$CA_CERT_CONTENT" > /usr/local/share/ca-certificates/my-ca.crt && \
-    chmod 644 /usr/local/share/ca-certificates/my-ca.crt && \
-    update-ca-certificates; \
-else \
-    echo "No custom CA certificate provided"; \
-fi

 # Install Python dependencies
 # Remove py which is pulled in by retry, py is not needed and is a CVE
@@ -83,11 +73,11 @@ RUN apt-get update && \
    rm -rf /var/lib/apt/lists/* && \
    rm -f /usr/local/lib/python3.11/site-packages/tornado/test/test.key

+
 # Pre-downloading models for setups with limited egress
 RUN python -c "from tokenizers import Tokenizer; \
 Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"

-
 # Pre-downloading NLTK for setups with limited egress
 RUN python -c "import nltk; \
 nltk.download('stopwords', quiet=True); \
--- a/backend/alembic/env.py
+++ b/backend/alembic/env.py
@@ -1,5 +1,5 @@
 from sqlalchemy.engine.base import Connection
-from typing import Any
+from typing import Literal
 import asyncio
 from logging.config import fileConfig
 import logging
@@ -8,6 +8,7 @@ from alembic import context
 from sqlalchemy import pool
 from sqlalchemy.ext.asyncio import create_async_engine
 from sqlalchemy.sql import text
+from sqlalchemy.sql.schema import SchemaItem

 from shared_configs.configs import MULTI_TENANT
 from danswer.db.engine import build_connection_string
@@ -35,7 +36,18 @@ logger = logging.getLogger(__name__)


 def include_object(
-    object: Any, name: str, type_: str, reflected: bool, compare_to: Any
+    object: SchemaItem,
+    name: str | None,
+    type_: Literal[
+        "schema",
+        "table",
+        "column",
+        "index",
+        "unique_constraint",
+        "foreign_key_constraint",
+    ],
+    reflected: bool,
+    compare_to: SchemaItem | None,
 ) -> bool:
    """
    Determines whether a database object should be included in migrations.
--- a/backend/alembic/versions/177de57c21c9_display_custom_llm_models.py
+++ b/backend/alembic/versions/177de57c21c9_display_custom_llm_models.py
@@ -0,0 +1,69 @@
+"""display custom llm models
+
+Revision ID: 177de57c21c9
+Revises: 4ee1287bd26a
+Create Date: 2024-11-21 11:49:04.488677
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+from sqlalchemy import and_
+
+# revision identifiers, used by Alembic.
+revision = "177de57c21c9"
+down_revision = "4ee1287bd26a"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Merge `model_names` into `display_model_names` for custom LLM providers.
+
+    Applies to every `llm_provider` row whose provider is not in
+    `excluded_providers` and whose `model_names` is not NULL.
+    """
+    conn = op.get_bind()
+    # Ad-hoc table construct listing only the columns this migration touches
+    llm_provider = sa.table(
+        "llm_provider",
+        sa.column("id", sa.Integer),
+        sa.column("provider", sa.String),
+        sa.column("model_names", postgresql.ARRAY(sa.String)),
+        sa.column("display_model_names", postgresql.ARRAY(sa.String)),
+    )
+
+    excluded_providers = ["openai", "bedrock", "anthropic", "azure"]
+
+    providers_to_update = sa.select(
+        llm_provider.c.id,
+        llm_provider.c.model_names,
+        llm_provider.c.display_model_names,
+    ).where(
+        and_(
+            ~llm_provider.c.provider.in_(excluded_providers),
+            llm_provider.c.model_names.isnot(None),
+        )
+    )
+
+    results = conn.execute(providers_to_update).fetchall()
+
+    for provider_id, model_names, display_model_names in results:
+        if display_model_names is None:
+            display_model_names = []
+
+        # dict.fromkeys de-duplicates while keeping first-seen order, so existing
+        # display names stay in place and the stored array is deterministic
+        # (a plain set() would yield an arbitrary order).
+        combined_model_names = list(dict.fromkeys(display_model_names + model_names))
+        update_stmt = (
+            llm_provider.update()
+            .where(llm_provider.c.id == provider_id)
+            .values(display_model_names=combined_model_names)
+        )
+        conn.execute(update_stmt)
+
+
+def downgrade() -> None:
+    """No-op: upgrade() does not save the previous `display_model_names` values."""
+    pass
--- a/backend/alembic/versions/26b931506ecb_default_chosen_assistants_to_none.py
+++ b/backend/alembic/versions/26b931506ecb_default_chosen_assistants_to_none.py
@@ -0,0 +1,68 @@
+"""default chosen assistants to none
+
+Revision ID: 26b931506ecb
+Revises: 2daa494a0851
+Create Date: 2024-11-12 13:23:29.858995
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "26b931506ecb"
+down_revision = "2daa494a0851"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.add_column(
+        "user", sa.Column("chosen_assistants_new", postgresql.JSONB(), nullable=True)
+    )
+
+    op.execute(
+        """
+    UPDATE "user"
+    SET chosen_assistants_new =
+        CASE
+            WHEN chosen_assistants = '[-2, -1, 0]' THEN NULL
+            ELSE chosen_assistants
+        END
+    """
+    )
+
+    op.drop_column("user", "chosen_assistants")
+
+    op.alter_column(
+        "user", "chosen_assistants_new", new_column_name="chosen_assistants"
+    )
+
+
+def downgrade() -> None:
+    op.add_column(
+        "user",
+        sa.Column(
+            "chosen_assistants_old",
+            postgresql.JSONB(),
+            nullable=False,
+            server_default="[-2, -1, 0]",
+        ),
+    )
+
+    op.execute(
+        """
+    UPDATE "user"
+    SET chosen_assistants_old =
+        CASE
+            WHEN chosen_assistants IS NULL THEN '[-2, -1, 0]'::jsonb
+            ELSE chosen_assistants
+        END
+    """
+    )
+
+    op.drop_column("user", "chosen_assistants")
+
+    op.alter_column(
+        "user", "chosen_assistants_old", new_column_name="chosen_assistants"
+    )
--- a/backend/alembic/versions/2daa494a0851_add_group_sync_time.py
+++ b/backend/alembic/versions/2daa494a0851_add_group_sync_time.py
@@ -0,0 +1,30 @@
+"""add-group-sync-time
+
+Revision ID: 2daa494a0851
+Revises: c0fd6e4da83a
+Create Date: 2024-11-11 10:57:22.991157
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = "2daa494a0851"
+down_revision = "c0fd6e4da83a"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.add_column(
+        "connector_credential_pair",
+        sa.Column(
+            "last_time_external_group_sync",
+            sa.DateTime(timezone=True),
+            nullable=True,
+        ),
+    )
+
+
+def downgrade() -> None:
+    op.drop_column("connector_credential_pair", "last_time_external_group_sync")
--- a/backend/alembic/versions/33cb72ea4d80_single_tool_call_per_message.py
+++ b/backend/alembic/versions/33cb72ea4d80_single_tool_call_per_message.py
@@ -0,0 +1,50 @@
+"""single tool call per message
+
+Revision ID: 33cb72ea4d80
+Revises: 5b29123cd710
+Create Date: 2024-11-01 12:51:01.535003
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "33cb72ea4d80"
+down_revision = "5b29123cd710"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # Step 1: Delete extraneous ToolCall entries. Rows with a NULL 'message_id'
+    # are deleted too; otherwise only the smallest 'id' per 'message_id' is kept.
+    op.execute(
+        sa.text(
+            """
+            DELETE FROM tool_call
+            WHERE id NOT IN (
+                SELECT MIN(id)
+                FROM tool_call
+                WHERE message_id IS NOT NULL
+                GROUP BY message_id
+            );
+        """
+        )
+    )
+
+    # Step 2: Add a unique constraint on message_id
+    op.create_unique_constraint(
+        constraint_name="uq_tool_call_message_id",
+        table_name="tool_call",
+        columns=["message_id"],
+    )
+
+
+def downgrade() -> None:
+    # Step 1: Drop the unique constraint on message_id
+    op.drop_constraint(
+        constraint_name="uq_tool_call_message_id",
+        table_name="tool_call",
+        type_="unique",
+    )
--- a/backend/alembic/versions/47e5bef3a1d7_add_persona_categories.py
+++ b/backend/alembic/versions/47e5bef3a1d7_add_persona_categories.py
@@ -0,0 +1,45 @@
+"""add persona categories
+
+Revision ID: 47e5bef3a1d7
+Revises: dfbe9e93d3c7
+Create Date: 2024-11-05 18:55:02.221064
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "47e5bef3a1d7"
+down_revision = "dfbe9e93d3c7"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # Create the persona_category table
+    op.create_table(
+        "persona_category",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("name", sa.String(), nullable=False),
+        sa.Column("description", sa.String(), nullable=True),
+        sa.PrimaryKeyConstraint("id"),
+        sa.UniqueConstraint("name"),
+    )
+
+    # Add category_id to persona table
+    op.add_column("persona", sa.Column("category_id", sa.Integer(), nullable=True))
+    op.create_foreign_key(
+        "fk_persona_category",
+        "persona",
+        "persona_category",
+        ["category_id"],
+        ["id"],
+        ondelete="SET NULL",
+    )
+
+
+def downgrade() -> None:
+    op.drop_constraint("fk_persona_category", "persona", type_="foreignkey")
+    op.drop_column("persona", "category_id")
+    op.drop_table("persona_category")
--- a/backend/alembic/versions/4ee1287bd26a_add_multiple_slack_bot_support.py
+++ b/backend/alembic/versions/4ee1287bd26a_add_multiple_slack_bot_support.py
@@ -0,0 +1,280 @@
+"""add_multiple_slack_bot_support
+
+Revision ID: 4ee1287bd26a
+Revises: 47e5bef3a1d7
+Create Date: 2024-11-06 13:15:53.302644
+
+"""
+import logging
+from typing import cast
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.orm import Session
+from danswer.key_value_store.factory import get_kv_store
+from danswer.db.models import SlackBot
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "4ee1287bd26a"
+down_revision = "47e5bef3a1d7"
+branch_labels: None = None
+depends_on: None = None
+
+# Configure logging
+logger = logging.getLogger("alembic.runtime.migration")
+logger.setLevel(logging.INFO)
+
+
+def upgrade() -> None:
+    logger.info(f"{revision}: create_table: slack_bot")
+    # Create new slack_bot table
+    op.create_table(
+        "slack_bot",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("name", sa.String(), nullable=False),
+        sa.Column("enabled", sa.Boolean(), nullable=False, server_default="true"),
+        sa.Column("bot_token", sa.LargeBinary(), nullable=False),
+        sa.Column("app_token", sa.LargeBinary(), nullable=False),
+        sa.PrimaryKeyConstraint("id"),
+        sa.UniqueConstraint("bot_token"),
+        sa.UniqueConstraint("app_token"),
+    )
+
+    # Create new slack_channel_config table
+    op.create_table(
+        "slack_channel_config",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("slack_bot_id", sa.Integer(), nullable=True),
+        sa.Column("persona_id", sa.Integer(), nullable=True),
+        sa.Column("channel_config", postgresql.JSONB(), nullable=False),
+        sa.Column("response_type", sa.String(), nullable=False),
+        sa.Column(
+            "enable_auto_filters", sa.Boolean(), nullable=False, server_default="false"
+        ),
+        sa.ForeignKeyConstraint(
+            ["slack_bot_id"],
+            ["slack_bot.id"],
+        ),
+        sa.ForeignKeyConstraint(
+            ["persona_id"],
+            ["persona.id"],
+        ),
+        sa.PrimaryKeyConstraint("id"),
+    )
+
+    # Handle existing Slack bot tokens first
+    logger.info(f"{revision}: Checking for existing Slack bot.")
+    bot_token = None
+    app_token = None
+    first_row_id = None
+
+    try:
+        tokens = cast(dict, get_kv_store().load("slack_bot_tokens_config_key"))
+    except Exception:
+        logger.warning("No existing Slack bot tokens found.")
+        tokens = {}
+
+    bot_token = tokens.get("bot_token")
+    app_token = tokens.get("app_token")
+
+    if bot_token and app_token:
+        logger.info(f"{revision}: Found bot and app tokens.")
+
+        session = Session(bind=op.get_bind())
+        new_slack_bot = SlackBot(
+            name="Slack Bot (Migrated)",
+            enabled=True,
+            bot_token=bot_token,
+            app_token=app_token,
+        )
+        session.add(new_slack_bot)
+        session.commit()
+        first_row_id = new_slack_bot.id
+
+    # Create a default bot if none exists
+    # This is in case there are no slack tokens but there are channels configured
+    op.execute(
+        sa.text(
+            """
+            INSERT INTO slack_bot (name, enabled, bot_token, app_token)
+            SELECT 'Default Bot', true, '', ''
+            WHERE NOT EXISTS (SELECT 1 FROM slack_bot)
+            RETURNING id;
+            """
+        )
+    )
+
+    # Get the bot ID to use (either from existing migration or newly created)
+    bot_id_query = sa.text(
+        """
+        SELECT COALESCE(
+            :first_row_id,
+            (SELECT id FROM slack_bot ORDER BY id ASC LIMIT 1)
+        ) as bot_id;
+        """
+    )
+    result = op.get_bind().execute(bot_id_query, {"first_row_id": first_row_id})
+    bot_id = result.scalar()
+
+    # CTE (Common Table Expression) that transforms the old slack_bot_config table data
+    # This splits up the channel_names into their own rows
+    channel_names_cte = """
+        WITH channel_names AS (
+            SELECT
+                sbc.id as config_id,
+                sbc.persona_id,
+                sbc.response_type,
+                sbc.enable_auto_filters,
+                jsonb_array_elements_text(sbc.channel_config->'channel_names') as channel_name,
+                sbc.channel_config->>'respond_tag_only' as respond_tag_only,
+                sbc.channel_config->>'respond_to_bots' as respond_to_bots,
+                sbc.channel_config->'respond_member_group_list' as respond_member_group_list,
+                sbc.channel_config->'answer_filters' as answer_filters,
+                sbc.channel_config->'follow_up_tags' as follow_up_tags
+            FROM slack_bot_config sbc
+        )
+    """
+
+    # Insert the channel names into the new slack_channel_config table
+    insert_statement = """
+        INSERT INTO slack_channel_config (
+            slack_bot_id,
+            persona_id,
+            channel_config,
+            response_type,
+            enable_auto_filters
+        )
+        SELECT
+            :bot_id,
+            channel_name.persona_id,
+            jsonb_build_object(
+                'channel_name', channel_name.channel_name,
+                'respond_tag_only',
+                COALESCE((channel_name.respond_tag_only)::boolean, false),
+                'respond_to_bots',
+                COALESCE((channel_name.respond_to_bots)::boolean, false),
+                'respond_member_group_list',
+                COALESCE(channel_name.respond_member_group_list, '[]'::jsonb),
+                'answer_filters',
+                COALESCE(channel_name.answer_filters, '[]'::jsonb),
+                'follow_up_tags',
+                COALESCE(channel_name.follow_up_tags, '[]'::jsonb)
+            ),
+            channel_name.response_type,
+            channel_name.enable_auto_filters
+        FROM channel_names channel_name;
+    """
+
+    op.execute(sa.text(channel_names_cte + insert_statement).bindparams(bot_id=bot_id))
+
+    # Clean up old tokens if they existed
+    try:
+        if bot_token and app_token:
+            logger.info(f"{revision}: Removing old bot and app tokens.")
+            get_kv_store().delete("slack_bot_tokens_config_key")
+    except Exception:
+        logger.warning("tried to delete tokens in dynamic config but failed")
+    # Rename the table
+    op.rename_table(
+        "slack_bot_config__standard_answer_category",
+        "slack_channel_config__standard_answer_category",
+    )
+
+    # Rename the column
+    op.alter_column(
+        "slack_channel_config__standard_answer_category",
+        "slack_bot_config_id",
+        new_column_name="slack_channel_config_id",
+    )
+
+    # Drop the table with CASCADE to handle dependent objects
+    op.execute("DROP TABLE slack_bot_config CASCADE")
+
+    logger.info(f"{revision}: Migration complete.")
+
+
+def downgrade() -> None:
+    # Recreate the old slack_bot_config table
+    op.create_table(
+        "slack_bot_config",
+        sa.Column("id", sa.Integer(), nullable=False),
+        sa.Column("persona_id", sa.Integer(), nullable=True),
+        sa.Column("channel_config", postgresql.JSONB(), nullable=False),
+        sa.Column("response_type", sa.String(), nullable=False),
+        sa.Column("enable_auto_filters", sa.Boolean(), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["persona_id"],
+            ["persona.id"],
+        ),
+        sa.PrimaryKeyConstraint("id"),
+    )
+
+    # Migrate data back to the old format, one row per persona_id (DISTINCT ON picks
+    # an arbitrary config, no ORDER BY); channel names are re-aggregated into an array
+    op.execute(
+        sa.text(
+            """
+            INSERT INTO slack_bot_config (
+                persona_id,
+                channel_config,
+                response_type,
+                enable_auto_filters
+            )
+            SELECT DISTINCT ON (persona_id)
+                persona_id,
+                jsonb_build_object(
+                    'channel_names', (
+                        SELECT jsonb_agg(c.channel_config->>'channel_name')
+                        FROM slack_channel_config c
+                        WHERE c.persona_id = scc.persona_id
+                    ),
+                    'respond_tag_only', (channel_config->>'respond_tag_only')::boolean,
+                    'respond_to_bots', (channel_config->>'respond_to_bots')::boolean,
+                    'respond_member_group_list', channel_config->'respond_member_group_list',
+                    'answer_filters', channel_config->'answer_filters',
+                    'follow_up_tags', channel_config->'follow_up_tags'
+                ),
+                response_type,
+                enable_auto_filters
+            FROM slack_channel_config scc
+            WHERE persona_id IS NOT NULL;
+            """
+        )
+    )
+
+    # Rename the table back
+    op.rename_table(
+        "slack_channel_config__standard_answer_category",
+        "slack_bot_config__standard_answer_category",
+    )
+
+    # Rename the column back
+    op.alter_column(
+        "slack_bot_config__standard_answer_category",
+        "slack_channel_config_id",
+        new_column_name="slack_bot_config_id",
+    )
+
+    # Try to save the first bot's tokens back to KV store
+    try:
+        first_bot = (
+            op.get_bind()
+            .execute(
+                sa.text(
+                    "SELECT bot_token, app_token FROM slack_bot ORDER BY id LIMIT 1"
+                )
+            )
+            .first()
+        )
+        if first_bot and first_bot.bot_token and first_bot.app_token:
+            tokens = {
+                "bot_token": first_bot.bot_token,
+                "app_token": first_bot.app_token,
+            }
+            get_kv_store().store("slack_bot_tokens_config_key", tokens)
+    except Exception:
+        logger.warning("Failed to save tokens back to KV store")
+
+    # Drop the new tables in reverse order
+    op.drop_table("slack_channel_config")
+    op.drop_table("slack_bot")
--- a/backend/alembic/versions/5b29123cd710_nullable_search_settings_for_historic_.py
+++ b/backend/alembic/versions/5b29123cd710_nullable_search_settings_for_historic_.py
@@ -0,0 +1,70 @@
+"""nullable search settings for historic index attempts
+
+Revision ID: 5b29123cd710
+Revises: 949b4a92a401
+Create Date: 2024-10-30 19:37:59.630704
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "5b29123cd710"
+down_revision = "949b4a92a401"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # Drop the existing foreign key constraint
+    op.drop_constraint(
+        "fk_index_attempt_search_settings", "index_attempt", type_="foreignkey"
+    )
+
+    # Modify the column to be nullable
+    op.alter_column(
+        "index_attempt", "search_settings_id", existing_type=sa.INTEGER(), nullable=True
+    )
+
+    # Add back the foreign key with ON DELETE SET NULL
+    op.create_foreign_key(
+        "fk_index_attempt_search_settings",
+        "index_attempt",
+        "search_settings",
+        ["search_settings_id"],
+        ["id"],
+        ondelete="SET NULL",
+    )
+
+
+def downgrade() -> None:
+    # Warning: This will delete all index attempts that don't have search settings
+    op.execute(
+        """
+        DELETE FROM index_attempt
+        WHERE search_settings_id IS NULL
+    """
+    )
+
+    # Drop foreign key constraint
+    op.drop_constraint(
+        "fk_index_attempt_search_settings", "index_attempt", type_="foreignkey"
+    )
+
+    # Modify the column to be not nullable
+    op.alter_column(
+        "index_attempt",
+        "search_settings_id",
+        existing_type=sa.INTEGER(),
+        nullable=False,
+    )
+
+    # Add back the foreign key without ON DELETE SET NULL
+    op.create_foreign_key(
+        "fk_index_attempt_search_settings",
+        "index_attempt",
+        "search_settings",
+        ["search_settings_id"],
+        ["id"],
+    )
--- a/backend/alembic/versions/6d562f86c78b_remove_default_bot.py
+++ b/backend/alembic/versions/6d562f86c78b_remove_default_bot.py
@@ -0,0 +1,45 @@
+"""remove default bot
+
+Revision ID: 6d562f86c78b
+Revises: 177de57c21c9
+Create Date: 2024-11-22 11:51:29.331336
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = "6d562f86c78b"
+down_revision = "177de57c21c9"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.execute(
+        sa.text(
+            """
+            DELETE FROM slack_bot
+            WHERE name = 'Default Bot'
+            AND bot_token = ''
+            AND app_token = ''
+            AND NOT EXISTS (
+                SELECT 1 FROM slack_channel_config
+                WHERE slack_channel_config.slack_bot_id = slack_bot.id
+            )
+            """
+        )
+    )
+
+
+def downgrade() -> None:
+    op.execute(
+        sa.text(
+            """
+            INSERT INTO slack_bot (name, enabled, bot_token, app_token)
+            SELECT 'Default Bot', true, '', ''
+            WHERE NOT EXISTS (SELECT 1 FROM slack_bot)
+            RETURNING id;
+            """
+        )
+    )
--- a/backend/alembic/versions/776b3bbe9092_remove_remaining_enums.py
+++ b/backend/alembic/versions/776b3bbe9092_remove_remaining_enums.py
@@ -9,8 +9,8 @@ from alembic import op
 import sqlalchemy as sa

 from danswer.db.models import IndexModelStatus
-from danswer.search.enums import RecencyBiasSetting
-from danswer.search.enums import SearchType
+from danswer.context.search.enums import RecencyBiasSetting
+from danswer.context.search.enums import SearchType

 # revision identifiers, used by Alembic.
 revision = "776b3bbe9092"
--- a/backend/alembic/versions/93560ba1b118_add_web_ui_option_to_slack_config.py
+++ b/backend/alembic/versions/93560ba1b118_add_web_ui_option_to_slack_config.py
@@ -0,0 +1,35 @@
+"""add web ui option to slack config
+
+Revision ID: 93560ba1b118
+Revises: 6d562f86c78b
+Create Date: 2024-11-24 06:36:17.490612
+
+"""
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision = "93560ba1b118"
+down_revision = "6d562f86c78b"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # Add show_continue_in_web_ui with default False to all existing channel_configs
+    op.execute(
+        """
+        UPDATE slack_channel_config
+        SET channel_config = channel_config || '{"show_continue_in_web_ui": false}'::jsonb
+        WHERE NOT channel_config ? 'show_continue_in_web_ui'
+        """
+    )
+
+
+def downgrade() -> None:
+    # Remove show_continue_in_web_ui from all channel_configs
+    op.execute(
+        """
+        UPDATE slack_channel_config
+        SET channel_config = channel_config - 'show_continue_in_web_ui'
+        """
+    )
--- a/backend/alembic/versions/949b4a92a401_remove_rt.py
+++ b/backend/alembic/versions/949b4a92a401_remove_rt.py
@@ -7,6 +7,7 @@ Create Date: 2024-10-26 13:06:06.937969
 """
 from alembic import op
 from sqlalchemy.orm import Session
+from sqlalchemy import text

 # Import your models and constants
 from danswer.db.models import (
@@ -15,7 +16,6 @@ from danswer.db.models import (
    Credential,
    IndexAttempt,
 )
-from danswer.configs.constants import DocumentSource


 # revision identifiers, used by Alembic.
@@ -30,13 +30,11 @@ def upgrade() -> None:
    bind = op.get_bind()
    session = Session(bind=bind)

-    connectors_to_delete = (
-        session.query(Connector)
-        .filter(Connector.source == DocumentSource.REQUESTTRACKER)
-        .all()
+    # Get connectors using raw SQL
+    result = bind.execute(
+        text("SELECT id FROM connector WHERE source = 'requesttracker'")
    )
-
-    connector_ids = [connector.id for connector in connectors_to_delete]
+    connector_ids = [row[0] for row in result]

    if connector_ids:
        cc_pairs_to_delete = (
--- a/backend/alembic/versions/9cf5c00f72fe_add_creator_to_cc_pair.py
+++ b/backend/alembic/versions/9cf5c00f72fe_add_creator_to_cc_pair.py
@@ -0,0 +1,30 @@
+"""add creator to cc pair
+
+Revision ID: 9cf5c00f72fe
+Revises: 26b931506ecb
+Create Date: 2024-11-12 15:16:42.682902
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = "9cf5c00f72fe"
+down_revision = "26b931506ecb"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.add_column(
+        "connector_credential_pair",
+        sa.Column(
+            "creator_id",
+            sa.UUID(as_uuid=True),
+            nullable=True,
+        ),
+    )
+
+
+def downgrade() -> None:
+    op.drop_column("connector_credential_pair", "creator_id")
--- a/backend/alembic/versions/9f696734098f_combine_search_and_chat.py
+++ b/backend/alembic/versions/9f696734098f_combine_search_and_chat.py
@@ -0,0 +1,36 @@
+"""Combine Search and Chat
+
+Revision ID: 9f696734098f
+Revises: a8c2065484e6
+Create Date: 2024-11-27 15:32:19.694972
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = "9f696734098f"
+down_revision = "a8c2065484e6"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.alter_column("chat_session", "description", nullable=True)
+    op.drop_column("chat_session", "one_shot")
+    op.drop_column("slack_channel_config", "response_type")
+
+
+def downgrade() -> None:
+    op.execute("UPDATE chat_session SET description = '' WHERE description IS NULL")
+    op.alter_column("chat_session", "description", nullable=False)
+    op.add_column(
+        "chat_session",
+        sa.Column("one_shot", sa.Boolean(), nullable=False, server_default=sa.false()),
+    )
+    op.add_column(
+        "slack_channel_config",
+        sa.Column(
+            "response_type", sa.String(), nullable=False, server_default="citations"
+        ),
+    )
--- a/backend/alembic/versions/a8c2065484e6_add_auto_scroll_to_user_model.py
+++ b/backend/alembic/versions/a8c2065484e6_add_auto_scroll_to_user_model.py
@@ -0,0 +1,27 @@
+"""add auto scroll to user model
+
+Revision ID: a8c2065484e6
+Revises: abe7378b8217
+Create Date: 2024-11-22 17:34:09.690295
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "a8c2065484e6"
+down_revision = "abe7378b8217"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Add a nullable boolean ``auto_scroll`` column (no server default) to ``user``."""
+    auto_scroll_column = sa.Column(
+        "auto_scroll", sa.Boolean(), nullable=True, server_default=None
+    )
+    op.add_column("user", auto_scroll_column)
+
+
+def downgrade() -> None:
+    """Remove the ``auto_scroll`` column from ``user``."""
+    op.drop_column(table_name="user", column_name="auto_scroll")
--- a/backend/alembic/versions/abe7378b8217_add_indexing_trigger_to_cc_pair.py
+++ b/backend/alembic/versions/abe7378b8217_add_indexing_trigger_to_cc_pair.py
@@ -0,0 +1,30 @@
+"""add indexing trigger to cc_pair
+
+Revision ID: abe7378b8217
+Revises: 93560ba1b118
+Create Date: 2024-11-26 19:09:53.481171
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = "abe7378b8217"
+down_revision = "93560ba1b118"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Add a nullable ``indexing_trigger`` column to ``connector_credential_pair``.
+
+    The column is a non-native enum named ``indexingmode`` with the values
+    ``UPDATE`` and ``REINDEX``.
+    """
+    trigger_type = sa.Enum("UPDATE", "REINDEX", name="indexingmode", native_enum=False)
+    op.add_column(
+        "connector_credential_pair",
+        sa.Column("indexing_trigger", trigger_type, nullable=True),
+    )
+
+
+def downgrade() -> None:
+    """Remove the ``indexing_trigger`` column from ``connector_credential_pair``."""
+    op.drop_column(
+        table_name="connector_credential_pair", column_name="indexing_trigger"
+    )
--- a/backend/alembic/versions/b156fa702355_chat_reworked.py
+++ b/backend/alembic/versions/b156fa702355_chat_reworked.py
@@ -288,6 +288,15 @@ def upgrade() -> None:


 def downgrade() -> None:
+    # NOTE: you will lose all chat history. This is to satisfy the non-nullable constraints
+    # below
+    op.execute("DELETE FROM chat_feedback")
+    op.execute("DELETE FROM chat_message__search_doc")
+    op.execute("DELETE FROM document_retrieval_feedback")
+    op.execute("DELETE FROM document_retrieval_feedback")
+    op.execute("DELETE FROM chat_message")
+    op.execute("DELETE FROM chat_session")
+
    op.drop_constraint(
        "chat_feedback__chat_message_fk", "chat_feedback", type_="foreignkey"
    )
--- a/backend/alembic/versions/b72ed7a5db0e_remove_description_from_starter_messages.py
+++ b/backend/alembic/versions/b72ed7a5db0e_remove_description_from_starter_messages.py
@@ -0,0 +1,48 @@
+"""remove description from starter messages
+
+Revision ID: b72ed7a5db0e
+Revises: 33cb72ea4d80
+Create Date: 2024-11-03 15:55:28.944408
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "b72ed7a5db0e"
+down_revision = "33cb72ea4d80"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Strip the ``description`` key from every ``persona.starter_messages`` element.
+
+    Only rows whose ``starter_messages`` is a non-NULL JSON array are touched.
+    """
+    # jsonb_agg() returns NULL when it aggregates zero rows, which would turn
+    # an empty array ([]) into NULL; COALESCE keeps empty arrays as [].
+    op.execute(
+        sa.text(
+            """
+            UPDATE persona
+            SET starter_messages = (
+                SELECT COALESCE(
+                    jsonb_agg(elem - 'description'),
+                    CAST('[]' AS jsonb)
+                )
+                FROM jsonb_array_elements(starter_messages) elem
+            )
+            WHERE starter_messages IS NOT NULL
+              AND jsonb_typeof(starter_messages) = 'array'
+            """
+        )
+    )
+
+
+def downgrade() -> None:
+    """Re-add an empty ``description`` key to every ``persona.starter_messages`` element.
+
+    Only rows whose ``starter_messages`` is a non-NULL JSON array are touched.
+    The original description text is not recoverable; every element gets ``""``.
+    """
+    # jsonb_agg() returns NULL when it aggregates zero rows, which would turn
+    # an empty array ([]) into NULL; COALESCE keeps empty arrays as [].
+    op.execute(
+        sa.text(
+            """
+            UPDATE persona
+            SET starter_messages = (
+                SELECT COALESCE(
+                    jsonb_agg(elem || '{"description": ""}'),
+                    CAST('[]' AS jsonb)
+                )
+                FROM jsonb_array_elements(starter_messages) elem
+            )
+            WHERE starter_messages IS NOT NULL
+              AND jsonb_typeof(starter_messages) = 'array'
+            """
+        )
+    )
--- a/backend/alembic/versions/c0fd6e4da83a_add_recent_assistants.py
+++ b/backend/alembic/versions/c0fd6e4da83a_add_recent_assistants.py
@@ -0,0 +1,29 @@
+"""add recent assistants
+
+Revision ID: c0fd6e4da83a
+Revises: b72ed7a5db0e
+Create Date: 2024-11-03 17:28:54.916618
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "c0fd6e4da83a"
+down_revision = "b72ed7a5db0e"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Add a NOT NULL JSONB ``recent_assistants`` column to ``user``, defaulting to ``[]``."""
+    recent_assistants_column = sa.Column(
+        "recent_assistants",
+        postgresql.JSONB(),
+        server_default="[]",
+        nullable=False,
+    )
+    op.add_column("user", recent_assistants_column)
+
+
+def downgrade() -> None:
+    """Remove the ``recent_assistants`` column from ``user``."""
+    op.drop_column(table_name="user", column_name="recent_assistants")
--- a/backend/alembic/versions/c99d76fcd298_add_nullable_to_persona_id_in_chat_.py
+++ b/backend/alembic/versions/c99d76fcd298_add_nullable_to_persona_id_in_chat_.py
@@ -23,6 +23,56 @@ def upgrade() -> None:


 def downgrade() -> None:
+    # Delete chat messages and feedback first since they reference chat sessions
+    # Get chat messages from sessions with null persona_id
+    chat_messages_query = """
+        SELECT id
+        FROM chat_message
+        WHERE chat_session_id IN (
+            SELECT id
+            FROM chat_session
+            WHERE persona_id IS NULL
+        )
+    """
+
+    # Delete dependent records first
+    op.execute(
+        f"""
+        DELETE FROM document_retrieval_feedback
+        WHERE chat_message_id IN (
+            {chat_messages_query}
+        )
+    """
+    )
+    op.execute(
+        f"""
+        DELETE FROM chat_message__search_doc
+        WHERE chat_message_id IN (
+            {chat_messages_query}
+        )
+    """
+    )
+
+    # Delete chat messages
+    op.execute(
+        """
+        DELETE FROM chat_message
+        WHERE chat_session_id IN (
+            SELECT id
+            FROM chat_session
+            WHERE persona_id IS NULL
+        )
+    """
+    )
+
+    # Now we can safely delete the chat sessions
+    op.execute(
+        """
+        DELETE FROM chat_session
+        WHERE persona_id IS NULL
+    """
+    )
+
    op.alter_column(
        "chat_session",
        "persona_id",
--- a/backend/alembic/versions/dfbe9e93d3c7_extended_role_for_non_web.py
+++ b/backend/alembic/versions/dfbe9e93d3c7_extended_role_for_non_web.py
@@ -0,0 +1,42 @@
+"""extended_role_for_non_web
+
+Revision ID: dfbe9e93d3c7
+Revises: 9cf5c00f72fe
+Create Date: 2024-11-16 07:54:18.727906
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = "dfbe9e93d3c7"
+down_revision = "9cf5c00f72fe"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Replace the ``has_web_login`` flag with the ``EXT_PERM_USER`` role.
+
+    Users with ``has_web_login = false`` are given the ``EXT_PERM_USER`` role,
+    after which the ``has_web_login`` column is dropped.
+    """
+    assign_ext_perm_role = """
+        UPDATE "user"
+        SET role = 'EXT_PERM_USER'
+        WHERE has_web_login = false
+    """
+    op.execute(assign_ext_perm_role)
+    op.drop_column(table_name="user", column_name="has_web_login")
+
+
+def downgrade() -> None:
+    """Re-create ``has_web_login`` and fold the non-web roles back into ``BASIC``.
+
+    The column is added as NOT NULL with a server default of ``true``; users
+    whose role is ``SLACK_USER`` or ``EXT_PERM_USER`` are then set to
+    ``has_web_login = false`` with role ``BASIC``.
+    """
+    has_web_login_column = sa.Column(
+        "has_web_login", sa.Boolean(), nullable=False, server_default="true"
+    )
+    op.add_column("user", has_web_login_column)
+
+    restore_non_web_users = """
+        UPDATE "user"
+        SET has_web_login = false,
+            role = 'BASIC'
+        WHERE role IN ('SLACK_USER', 'EXT_PERM_USER')
+    """
+    op.execute(restore_non_web_users)
--- a/backend/alembic_tenants/env.py
+++ b/backend/alembic_tenants/env.py
@@ -1,5 +1,6 @@
 import asyncio
 from logging.config import fileConfig
+from typing import Literal

 from sqlalchemy import pool
 from sqlalchemy.engine import Connection
@@ -37,8 +38,15 @@ EXCLUDE_TABLES = {"kombu_queue", "kombu_message"}

 def include_object(
    object: SchemaItem,
-    name: str,
-    type_: str,
+    name: str | None,
+    type_: Literal[
+        "schema",
+        "table",
+        "column",
+        "index",
+        "unique_constraint",
+        "foreign_key_constraint",
+    ],
    reflected: bool,
    compare_to: SchemaItem | None,
 ) -> bool:
--- a/backend/danswer/access/models.py
+++ b/backend/danswer/access/models.py
@@ -16,6 +16,46 @@ class ExternalAccess:
    is_public: bool


+@dataclass(frozen=True)
+class DocExternalAccess:
+    """
+    Wraps a document ID together with its external access information.
+    Used for syncing document permissions to Redis.
+    """
+
+    external_access: ExternalAccess
+    # ID of the document the access information belongs to
+    doc_id: str
+
+    def to_dict(self) -> dict:
+        """Return a plain-dict form; the email and group-id collections become lists."""
+        access = self.external_access
+        access_payload = {
+            "external_user_emails": list(access.external_user_emails),
+            "external_user_group_ids": list(access.external_user_group_ids),
+            "is_public": access.is_public,
+        }
+        return {"external_access": access_payload, "doc_id": self.doc_id}
+
+    @classmethod
+    def from_dict(cls, data: dict) -> "DocExternalAccess":
+        """Rebuild an instance from the dict layout produced by ``to_dict``.
+
+        Missing email / group-id entries default to empty sets; ``is_public``
+        and ``doc_id`` are required keys.
+        """
+        access_payload = data["external_access"]
+        user_emails = set(access_payload.get("external_user_emails", []))
+        group_ids = set(access_payload.get("external_user_group_ids", []))
+        rebuilt_access = ExternalAccess(
+            external_user_emails=user_emails,
+            external_user_group_ids=group_ids,
+            is_public=access_payload["is_public"],
+        )
+        return cls(external_access=rebuilt_access, doc_id=data["doc_id"])
+
+
@dataclass(frozen=True)
 class DocumentAccess(ExternalAccess):
    # User emails for Danswer users, None indicates admin
--- a/backend/danswer/agent_search/core_qa_graph/edges.py
+++ b/backend/danswer/agent_search/core_qa_graph/edges.py
@@ -0,0 +1,42 @@
+from collections.abc import Hashable
+from typing import Union
+
+from langgraph.types import Send
+
+from danswer.agent_search.core_qa_graph.states import BaseQAState
+from danswer.agent_search.primary_graph.states import RetrieverState
+from danswer.agent_search.primary_graph.states import VerifierState
+
+
+def sub_continue_to_verifier(state: BaseQAState) -> Union[Hashable, list[Hashable]]:
+    """Route every deduped retrieved doc to the ``sub_verifier`` node.
+
+    One ``Send`` is emitted per document so the verifier runs once per doc;
+    each carries the sub-question string and the graph start time.
+    """
+    verifier_sends = []
+    for retrieved_doc in state["sub_question_deduped_retrieval_docs"]:
+        verifier_input = VerifierState(
+            document=retrieved_doc,
+            question=state["sub_question_str"],
+            graph_start_time=state["graph_start_time"],
+        )
+        verifier_sends.append(Send("sub_verifier", verifier_input))
+    return verifier_sends
+
+
+def sub_continue_to_retrieval(state: BaseQAState) -> Union[Hashable, list[Hashable]]:
+    """Route each search query to its own ``sub_custom_retrieve`` invocation.
+
+    The queries are the rewritten queries followed by the sub-question string
+    itself; one ``Send`` is emitted per query.
+    """
+    search_queries = [
+        *state["sub_question_search_queries"].rewritten_queries,
+        state["sub_question_str"],
+    ]
+
+    retrieval_sends = []
+    for search_query in search_queries:
+        retriever_input = RetrieverState(
+            rewritten_query=search_query,
+            graph_start_time=state["graph_start_time"],
+        )
+        retrieval_sends.append(Send("sub_custom_retrieve", retriever_input))
+    return retrieval_sends
--- a/backend/danswer/agent_search/core_qa_graph/graph_builder.py
+++ b/backend/danswer/agent_search/core_qa_graph/graph_builder.py
@@ -0,0 +1,132 @@
+from langgraph.graph import END
+from langgraph.graph import START
+from langgraph.graph import StateGraph
+
+from danswer.agent_search.core_qa_graph.edges import sub_continue_to_retrieval
+from danswer.agent_search.core_qa_graph.edges import sub_continue_to_verifier
+from danswer.agent_search.core_qa_graph.nodes.combine_retrieved_docs import (
+    sub_combine_retrieved_docs,
+)
+from danswer.agent_search.core_qa_graph.nodes.custom_retrieve import (
+    sub_custom_retrieve,
+)
+from danswer.agent_search.core_qa_graph.nodes.dummy import sub_dummy
+from danswer.agent_search.core_qa_graph.nodes.final_format import (
+    sub_final_format,
+)
+from danswer.agent_search.core_qa_graph.nodes.generate import sub_generate
+from danswer.agent_search.core_qa_graph.nodes.qa_check import sub_qa_check
+from danswer.agent_search.core_qa_graph.nodes.rewrite import sub_rewrite
+from danswer.agent_search.core_qa_graph.nodes.verifier import sub_verifier
+from danswer.agent_search.core_qa_graph.states import BaseQAOutputState
+from danswer.agent_search.core_qa_graph.states import BaseQAState
+from danswer.agent_search.core_qa_graph.states import CoreQAInputState
+
+
+def build_core_qa_graph() -> StateGraph:
+    """Assemble the (uncompiled) core QA sub-graph.
+
+    Flow: START -> sub_dummy -> sub_rewrite -> (fan-out) sub_custom_retrieve
+    -> sub_combine_retrieved_docs -> (fan-out) sub_verifier -> sub_generate
+    -> sub_qa_check -> sub_final_format -> END.
+
+    Returns:
+        The ``StateGraph`` builder; callers are expected to ``compile()`` it.
+    """
+    graph = StateGraph(state_schema=BaseQAState, output=BaseQAOutputState)
+
+    # --- nodes ---
+    node_actions = {
+        "sub_dummy": sub_dummy,
+        "sub_rewrite": sub_rewrite,
+        "sub_custom_retrieve": sub_custom_retrieve,
+        "sub_combine_retrieved_docs": sub_combine_retrieved_docs,
+        "sub_verifier": sub_verifier,
+        "sub_generate": sub_generate,
+        "sub_qa_check": sub_qa_check,
+        "sub_final_format": sub_final_format,
+    }
+    for node_name, node_action in node_actions.items():
+        graph.add_node(node=node_name, action=node_action)
+
+    # --- edges ---
+    graph.add_edge(START, "sub_dummy")
+    graph.add_edge("sub_dummy", "sub_rewrite")
+
+    # Fan out: one retrieval per search query.
+    graph.add_conditional_edges(source="sub_rewrite", path=sub_continue_to_retrieval)
+    graph.add_edge(
+        start_key="sub_custom_retrieve", end_key="sub_combine_retrieved_docs"
+    )
+
+    # Fan out: one verification per deduped document.
+    graph.add_conditional_edges(
+        source="sub_combine_retrieved_docs",
+        path=sub_continue_to_verifier,
+        path_map=["sub_verifier"],
+    )
+
+    # Linear tail of the pipeline.
+    for upstream, downstream in (
+        ("sub_verifier", "sub_generate"),
+        ("sub_generate", "sub_qa_check"),
+        ("sub_qa_check", "sub_final_format"),
+        ("sub_final_format", END),
+    ):
+        graph.add_edge(start_key=upstream, end_key=downstream)
+
+    return graph
+
+
+if __name__ == "__main__":
+    # Manual smoke run: build, compile and invoke the sub-graph on one question.
+    # Alternative questions that can be swapped in:
+    #   "Whose music is kind of hard to easily enjoy?"
+    #   "What is voice leading?"
+    #   "What are the types of motions in music?"
+    #   "What are key elements of music theory?"
+    #   "How can I best understand music theory using voice leading?"
+    #   "types of motions in music"
+    #   "What is the relationship between music and physics?"
+    #   "Can you compare various grunge styles?"
+    #   "Why is quantum gravity so hard?"
+    q = "What makes good music?"
+
+    inputs = CoreQAInputState(original_question=q, sub_question_str=q)
+    compiled_sub_answers = build_core_qa_graph().compile()
+    output = compiled_sub_answers.invoke(inputs)
+
+    print("\nOUTPUT:")
+    print(output.keys())
+
+    # Only these result keys are echoed in full.
+    keys_to_show = (
+        "sub_question_answer",
+        "sub_question_str",
+        "sub_qas",
+        "initial_sub_qas",
+    )
+    for key, value in output.items():
+        if key in keys_to_show:
+            print(f"{key}: {value}")
--- a/backend/danswer/agent_search/core_qa_graph/nodes/init.py
+++ b/backend/danswer/agent_search/core_qa_graph/nodes/init.py
--- a/backend/danswer/agent_search/core_qa_graph/nodes/combine_retrieved_docs.py
+++ b/backend/danswer/agent_search/core_qa_graph/nodes/combine_retrieved_docs.py
@@ -0,0 +1,36 @@
+from datetime import datetime
+from typing import Any
+
+from danswer.agent_search.core_qa_graph.states import BaseQAState
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+from danswer.context.search.models import InferenceSection
+
+
+def sub_combine_retrieved_docs(state: BaseQAState) -> dict[str, Any]:
+    """
+    Dedupe the docs retrieved by the parallel retrieval steps.
+
+    Docs are compared by ``center_chunk.chunk_id``; the first occurrence wins
+    and the original order is preserved.
+
+    Args:
+        state: Graph state; reads ``sub_question_base_retrieval_docs`` and
+            ``graph_start_time``.
+
+    Returns:
+        dict with ``sub_question_deduped_retrieval_docs`` and a ``log_messages``
+        entry for this node.
+    """
+    node_start_time = datetime.now()
+
+    sub_question_base_retrieval_docs = state["sub_question_base_retrieval_docs"]
+    print(f"Number of docs from steps: {len(sub_question_base_retrieval_docs)}")
+
+    # Track already-seen ids in a set so the dedupe is linear instead of
+    # rescanning the output list for every doc.
+    # NOTE(review): chunk_id alone is used as the key - confirm it is unique
+    # across documents, otherwise chunks of different docs may be collapsed.
+    seen_chunk_ids = set()
+    dedupe_docs: list[InferenceSection] = []
+    for base_retrieval_doc in sub_question_base_retrieval_docs:
+        chunk_id = base_retrieval_doc.center_chunk.chunk_id
+        if chunk_id in seen_chunk_ids:
+            continue
+        seen_chunk_ids.add(chunk_id)
+        dedupe_docs.append(base_retrieval_doc)
+
+    print(f"Number of deduped docs: {len(dedupe_docs)}")
+
+    return {
+        "sub_question_deduped_retrieval_docs": dedupe_docs,
+        "log_messages": generate_log_message(
+            message="sub - combine_retrieved_docs (dedupe)",
+            node_start_time=node_start_time,
+            graph_start_time=state["graph_start_time"],
+        ),
+    }
--- a/backend/danswer/agent_search/core_qa_graph/nodes/custom_retrieve.py
+++ b/backend/danswer/agent_search/core_qa_graph/nodes/custom_retrieve.py
@@ -0,0 +1,66 @@
+import datetime
+from typing import Any
+
+from danswer.agent_search.primary_graph.states import RetrieverState
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+from danswer.context.search.models import InferenceSection
+from danswer.context.search.models import SearchRequest
+from danswer.context.search.pipeline import SearchPipeline
+from danswer.db.engine import get_session_context_manager
+from danswer.llm.factory import get_default_llms
+
+
+def _mean_center_chunk_score(sections: list[InferenceSection], top_n: int) -> float:
+    """Mean ``center_chunk.score`` over the first ``top_n`` sections (0.0 if none)."""
+    top_sections = sections[:top_n]
+    if not top_sections:
+        return 0.0
+    # NOTE(review): a missing (None) score is counted as 0.0 - confirm whether
+    # scores can actually be unset here.
+    return sum(section.center_chunk.score or 0.0 for section in top_sections) / len(
+        top_sections
+    )
+
+
+def sub_custom_retrieve(state: RetrieverState) -> dict[str, Any]:
+    """
+    Retrieve documents for a single (rewritten) query.
+
+    Args:
+        state: Retriever state; reads ``rewritten_query`` and ``graph_start_time``.
+
+    Returns:
+        dict with the reranked sections (``sub_question_base_retrieval_docs``),
+        a one-element ``sub_chunk_ids`` list mapping the query to the chunk ids
+        it retrieved, and a ``log_messages`` entry that includes the fit score.
+    """
+    print("---RETRIEVE SUB---")
+
+    node_start_time = datetime.datetime.now()
+
+    rewritten_query = state["rewritten_query"]
+
+    llm, fast_llm = get_default_llms()
+    with get_session_context_manager() as db_session:
+        search_pipeline = SearchPipeline(
+            search_request=SearchRequest(
+                query=rewritten_query,
+            ),
+            user=None,
+            llm=llm,
+            fast_llm=fast_llm,
+            db_session=db_session,
+        )
+
+        # Read the results while the DB session is still open.
+        reranked_docs = search_pipeline.reranked_sections
+
+        # Initial metric to measure fit. TODO: implement metric properly.
+        # Each average is taken over the docs actually returned, so an empty
+        # result no longer raises and a short result is not diluted by a
+        # fixed divisor of 5 / 10.
+        top_1_score = _mean_center_chunk_score(reranked_docs, 1)
+        top_5_score = _mean_center_chunk_score(reranked_docs, 5)
+        top_10_score = _mean_center_chunk_score(reranked_docs, 10)
+
+        fit_score = 1 / 3 * (top_1_score + top_5_score + top_10_score)
+
+        chunk_ids = {
+            "query": rewritten_query,
+            "chunk_ids": [doc.center_chunk.chunk_id for doc in reranked_docs],
+        }
+
+    return {
+        "sub_question_base_retrieval_docs": reranked_docs,
+        "sub_chunk_ids": [chunk_ids],
+        "log_messages": generate_log_message(
+            message=f"sub - custom_retrieve, fit_score: {fit_score}",
+            node_start_time=node_start_time,
+            graph_start_time=state["graph_start_time"],
+        ),
+    }
--- a/backend/danswer/agent_search/core_qa_graph/nodes/dummy.py
+++ b/backend/danswer/agent_search/core_qa_graph/nodes/dummy.py
@@ -0,0 +1,24 @@
+import datetime
+from typing import Any
+
+from danswer.agent_search.core_qa_graph.states import BaseQAState
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def sub_dummy(state: BaseQAState) -> dict[str, Any]:
+    """
+    No-op entry node: records the current time as ``graph_start_time`` and
+    emits the first log message.
+    """
+    print("---Sub Dummy---")
+
+    started_at = datetime.datetime.now()
+    dummy_log = generate_log_message(
+        message="sub - dummy",
+        node_start_time=started_at,
+        graph_start_time=started_at,
+    )
+    return {"graph_start_time": started_at, "log_messages": dummy_log}
--- a/backend/danswer/agent_search/core_qa_graph/nodes/final_format.py
+++ b/backend/danswer/agent_search/core_qa_graph/nodes/final_format.py
@@ -0,0 +1,22 @@
+from typing import Any
+
+from danswer.agent_search.core_qa_graph.states import BaseQAState
+
+
+def sub_final_format(state: BaseQAState) -> dict[str, Any]:
+    """
+    Package the sub-question, its answer and the answer check into the
+    final output of the QA subgraph.
+    """
+    print("---BASE FINAL FORMAT---")
+
+    sub_qa_record = {
+        "sub_question": state["sub_question_str"],
+        "sub_answer": state["sub_question_answer"],
+        "sub_answer_check": state["sub_question_answer_check"],
+    }
+    return {"sub_qas": [sub_qa_record], "log_messages": state["log_messages"]}
--- a/backend/danswer/agent_search/core_qa_graph/nodes/generate.py
+++ b/backend/danswer/agent_search/core_qa_graph/nodes/generate.py
@@ -0,0 +1,91 @@
+from datetime import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+from langchain_core.messages import merge_message_runs
+
+from danswer.agent_search.core_qa_graph.states import BaseQAState
+from danswer.agent_search.shared_graph_utils.prompts import BASE_RAG_PROMPT
+from danswer.agent_search.shared_graph_utils.utils import format_docs
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+from danswer.llm.factory import get_default_llms
+
+
+def sub_generate(state: BaseQAState) -> dict[str, Any]:
+    """
+    Generate an answer to the sub-question from the verified documents.
+
+    Also prints per-query statistics on how many retrieved chunks survived
+    verification.
+
+    Args:
+        state: Graph state; reads ``sub_question_str``, ``original_question``,
+            ``sub_question_verified_retrieval_docs``, ``sub_chunk_ids`` and
+            ``graph_start_time``.
+
+    Returns:
+        dict with the generated ``sub_question_answer`` and a ``log_messages``
+        entry for this node.
+    """
+    print("---GENERATE---")
+
+    node_start_time = datetime.now()
+
+    question = state["sub_question_str"]
+    docs = state["sub_question_verified_retrieval_docs"]
+
+    # --- sub-query statistics (diagnostic output only) ---
+    # A set gives O(1) membership tests in the loop below.
+    verified_chunk_ids = {doc.center_chunk.chunk_id for doc in docs}
+    result_dict: dict[str, int] = {}
+    expanded_chunks = []
+
+    for chunk_id_dict in state["sub_chunk_ids"]:
+        query = chunk_id_dict["query"]
+        verified_sq_chunks = [
+            chunk_id
+            for chunk_id in chunk_id_dict["chunk_ids"]
+            if chunk_id in verified_chunk_ids
+        ]
+
+        # NOTE(review): queries are compared against original_question, while
+        # the retrieval fan-out appends sub_question_str - confirm these are
+        # meant to be the same string.
+        if query != state["original_question"]:
+            expanded_chunks += verified_sq_chunks
+        else:
+            result_dict["ORIGINAL"] = len(verified_sq_chunks)
+        result_dict[query[:30]] = len(verified_sq_chunks)
+
+    # Distinct verified chunks contributed by the expanded (non-original) queries.
+    result_dict["expansion_chunks"] = len(set(expanded_chunks))
+
+    print(result_dict)
+
+    print(f"Number of verified retrieval docs: {len(docs)}")
+
+    # Only take the top 10 docs, i.e. the first ten in the list; the previous
+    # slice docs[-10:] took the last ten instead.
+    # TODO: Make this dynamic or use config param?
+    top_10_docs = docs[:10]
+
+    msg = [
+        HumanMessage(
+            content=BASE_RAG_PROMPT.format(
+                question=question, context=format_docs(top_10_docs)
+            )
+        )
+    ]
+
+    # Stream the answer from the fast LLM and merge the chunks into one message.
+    _, fast_llm = get_default_llms()
+    response = list(
+        fast_llm.stream(
+            prompt=msg,
+            # structured_response_format=None,
+        )
+    )
+
+    answer_str = merge_message_runs(response, chunk_separator="")[0].content
+    return {
+        "sub_question_answer": answer_str,
+        "log_messages": generate_log_message(
+            message="base - generate",
+            node_start_time=node_start_time,
+            graph_start_time=state["graph_start_time"],
+        ),
+    }
--- a/backend/danswer/agent_search/core_qa_graph/nodes/qa_check.py
+++ b/backend/danswer/agent_search/core_qa_graph/nodes/qa_check.py
@@ -0,0 +1,51 @@
+import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+from langchain_core.messages import merge_message_runs
+
+from danswer.agent_search.core_qa_graph.states import BaseQAState
+from danswer.agent_search.shared_graph_utils.prompts import BASE_CHECK_PROMPT
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+from danswer.llm.factory import get_default_llms
+
+
+def sub_qa_check(state: BaseQAState) -> dict[str, Any]:
+    """
+    Check if the sub-question answer is satisfactory.
+
+    Args:
+        state: Graph state; reads ``sub_question_str``, ``sub_question_answer``
+            and ``graph_start_time``.
+
+    Returns:
+        dict containing the raw check result (``sub_question_answer_check``)
+        and a ``log_messages`` entry for this node.
+    """
+    node_start_time = datetime.datetime.now()
+
+    msg = [
+        HumanMessage(
+            content=BASE_CHECK_PROMPT.format(
+                question=state["sub_question_str"],
+                base_answer=state["sub_question_answer"],
+            )
+        )
+    ]
+
+    # Stream the verdict from the fast LLM and merge the chunks into one message.
+    _, fast_llm = get_default_llms()
+    response = list(
+        fast_llm.stream(
+            prompt=msg,
+            # structured_response_format=None,
+        )
+    )
+
+    response_str = merge_message_runs(response, chunk_separator="")[0].content
+
+    return {
+        "sub_question_answer_check": response_str,
+        # Emitted under "log_messages" like every other node in this sub-graph;
+        # the previous "base_answer_messages" key is not a field of BaseQAState
+        # as far as this module shows.
+        "log_messages": generate_log_message(
+            message="sub - qa_check",
+            node_start_time=node_start_time,
+            graph_start_time=state["graph_start_time"],
+        ),
+    }
--- a/backend/danswer/agent_search/core_qa_graph/nodes/rewrite.py
+++ b/backend/danswer/agent_search/core_qa_graph/nodes/rewrite.py
@@ -0,0 +1,74 @@
+import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+from langchain_core.messages import merge_message_runs
+
+from danswer.agent_search.core_qa_graph.states import BaseQAState
+from danswer.agent_search.shared_graph_utils.models import RewrittenQueries
+from danswer.agent_search.shared_graph_utils.prompts import (
+    REWRITE_PROMPT_MULTI_ORIGINAL,
+)
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+from danswer.llm.factory import get_default_llms
+
+
+def sub_rewrite(state: BaseQAState) -> dict[str, Any]:
+    """
+    Transform the sub-question into more suitable search queries.
+
+    The fast LLM is expected to answer with queries separated by ``--``; the
+    answer is split on that separator.
+
+    Args:
+        state: Graph state; reads ``sub_question_str`` and ``graph_start_time``.
+
+    Returns:
+        dict with the rewritten queries (``sub_question_search_queries``) and a
+        ``log_messages`` entry for this node.
+    """
+
+    print("---SUB TRANSFORM QUERY---")
+
+    node_start_time = datetime.datetime.now()
+
+    question = state["sub_question_str"]
+
+    msg = [
+        HumanMessage(
+            content=REWRITE_PROMPT_MULTI_ORIGINAL.format(question=question),
+        )
+    ]
+
+    _, fast_llm = get_default_llms()
+    llm_response_list = list(
+        fast_llm.stream(
+            prompt=msg,
+            # structured_response_format={"type": "json_object", "schema": RewrittenQueries.model_json_schema()},
+            # structured_response_format=RewrittenQueries.model_json_schema(),
+        )
+    )
+    llm_response = merge_message_runs(llm_response_list, chunk_separator="")[0].content
+
+    print(f"llm_response: {llm_response}")
+
+    # Strip surrounding whitespace/newlines and drop empty fragments so that
+    # no blank or padded query is sent to retrieval.
+    query_strings = [
+        fragment.strip() for fragment in llm_response.split("--") if fragment.strip()
+    ]
+
+    print(f"rewritten_queries: {query_strings}")
+
+    rewritten_queries = RewrittenQueries(rewritten_queries=query_strings)
+
+    return {
+        "sub_question_search_queries": rewritten_queries,
+        "log_messages": generate_log_message(
+            message="sub - rewrite",
+            node_start_time=node_start_time,
+            graph_start_time=state["graph_start_time"],
+        ),
+    }
--- a/backend/danswer/agent_search/core_qa_graph/nodes/verifier.py
+++ b/backend/danswer/agent_search/core_qa_graph/nodes/verifier.py
@@ -0,0 +1,64 @@
+import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+from langchain_core.messages import merge_message_runs
+
+from danswer.agent_search.primary_graph.states import VerifierState
+from danswer.agent_search.shared_graph_utils.models import BinaryDecision
+from danswer.agent_search.shared_graph_utils.prompts import VERIFIER_PROMPT
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+from danswer.llm.factory import get_default_llms
+
+
+def sub_verifier(state: VerifierState) -> dict[str, Any]:
+    """
+    Check whether a single document is relevant for the question in ``state``.
+
+    Args:
+        state (VerifierState): reads ``question``, ``document`` and
+            ``graph_start_time``.
+
+    Returns:
+        dict: ``sub_question_verified_retrieval_docs`` holding the document if
+        the decision is "yes" (empty list otherwise), plus a ``log_messages``
+        entry recording the decision.
+    """
+
+    node_start_time = datetime.datetime.now()
+
+    question = state["question"]
+    document_content = state["document"].combined_content
+
+    msg = [
+        HumanMessage(
+            content=VERIFIER_PROMPT.format(
+                question=question, document_content=document_content
+            )
+        )
+    ]
+
+    # Grader: only the primary LLM is used here.
+    llm, _ = get_default_llms()
+    response = list(
+        llm.stream(
+            prompt=msg,
+            # structured_response_format=BinaryDecision.model_json_schema(),
+        )
+    )
+
+    response_string = merge_message_runs(response, chunk_separator="")[0].content
+    # Normalise case and surrounding whitespace so an answer such as "Yes\n"
+    # is handled the same as "yes" in the validation and comparison below.
+    decision_dict = {"decision": response_string.strip().lower()}
+    formatted_response = BinaryDecision.model_validate(decision_dict)
+
+    print(f"Verification end time: {datetime.datetime.now()}")
+
+    return {
+        "sub_question_verified_retrieval_docs": [state["document"]]
+        if formatted_response.decision == "yes"
+        else [],
+        "log_messages": generate_log_message(
+            message=f"sub - verifier: {formatted_response.decision}",
+            node_start_time=node_start_time,
+            graph_start_time=state["graph_start_time"],
+        ),
+    }
--- a/backend/danswer/agent_search/core_qa_graph/states.py
+++ b/backend/danswer/agent_search/core_qa_graph/states.py
@@ -0,0 +1,90 @@
+import operator
+from collections.abc import Sequence
+from datetime import datetime
+from typing import Annotated
+from typing import TypedDict
+
+from langchain_core.messages import BaseMessage
+from langgraph.graph.message import add_messages
+
+from danswer.agent_search.shared_graph_utils.models import RewrittenQueries
+from danswer.context.search.models import InferenceSection
+from danswer.llm.interfaces import LLM
+
+
+class SubQuestionRetrieverState(TypedDict):
+    # State for the parallel retrievers; each of them needs to see only one query
+    sub_question_rewritten_query: str
+
+
+class SubQuestionVerifierState(TypedDict):
+    # State for the parallel verification step; each run sees one question/doc pair
+    sub_question_document: InferenceSection
+    sub_question: str
+
+
+class CoreQAInputState(TypedDict):  # presumably the sub-graph's input schema; confirm
+    sub_question_str: str
+    original_question: str
+
+
+class BaseQAState(TypedDict):
+    # The 'core SubQuestion' state; Annotated fields carry the reducer used on updates
+    original_question: str
+    graph_start_time: datetime
+    # start time for the parallel initial sub-question thread
+    sub_query_start_time: datetime
+    sub_question_rewritten_queries: list[str]
+    sub_question_str: str
+    sub_question_search_queries: RewrittenQueries  # NOTE(review): list[str] in the output state
+    sub_question_nr: int
+    sub_chunk_ids: Annotated[Sequence[dict], operator.add]
+    sub_question_base_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_deduped_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_verified_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_reranked_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_top_chunks: Annotated[Sequence[dict], operator.add]
+    sub_question_answer: str
+    sub_question_answer_check: str
+    log_messages: Annotated[Sequence[BaseMessage], add_messages]
+    sub_qas: Annotated[Sequence[dict], operator.add]
+    # Answers sent back to core
+    initial_sub_qas: Annotated[Sequence[dict], operator.add]
+    primary_llm: LLM
+    fast_llm: LLM
+
+
+class BaseQAOutputState(TypedDict):
+    # The 'SubQuestion' output state: no LLM handles or timing keys, results only
+    sub_question_rewritten_queries: list[str]
+    sub_question_str: str
+    sub_question_search_queries: list[str]
+    sub_question_nr: int
+    # Answers sent back to core
+    sub_qas: Annotated[Sequence[dict], operator.add]
+    # Answers of the initial sub-questions sent back to core
+    initial_sub_qas: Annotated[Sequence[dict], operator.add]
+    sub_question_base_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_deduped_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_verified_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_reranked_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_top_chunks: Annotated[Sequence[dict], operator.add]
+    sub_question_answer: str
+    sub_question_answer_check: str
+    log_messages: Annotated[Sequence[BaseMessage], add_messages]
+    log_messages: Annotated[Sequence[BaseMessage], add_messages]
--- a/backend/danswer/agent_search/deep_qa_graph/edges.py
+++ b/backend/danswer/agent_search/deep_qa_graph/edges.py
@@ -0,0 +1,46 @@
+from collections.abc import Hashable
+from typing import Union
+
+from langgraph.types import Send
+
+from danswer.agent_search.deep_qa_graph.states import ResearchQAState
+from danswer.agent_search.primary_graph.states import RetrieverState
+from danswer.agent_search.primary_graph.states import VerifierState
+
+
+def sub_continue_to_verifier(state: ResearchQAState) -> Union[Hashable, list[Hashable]]:
+    # Routes each de-duped retrieved doc to the verifier step - in parallel
+    # Notice the 'Send()' API that takes care of the parallelization
+    # The docs are read from the dedupe node's output, not from the raw retrieval
+    return [
+        Send(
+            "sub_verifier",
+            VerifierState(
+                document=doc,
+                question=state["sub_question"],
+                primary_llm=state["primary_llm"],
+                fast_llm=state["fast_llm"],
+                graph_start_time=state["graph_start_time"],
+            ),
+        )
+        for doc in state["sub_question_deduped_retrieval_docs"]
+    ]
+
+
+def sub_continue_to_retrieval(
+    state: ResearchQAState,
+) -> Union[Hashable, list[Hashable]]:
+    # Fan-out edge: every re-written query gets its own "sub_custom_retrieve" run.
+    # Each run is requested through a 'Send()' object carrying a RetrieverState,
+    # which is what lets the retrievals execute in parallel.
+    sends: list[Hashable] = []
+    for rewritten_query in state["sub_question_rewritten_queries"]:
+        retriever_state = RetrieverState(
+            rewritten_query=rewritten_query,
+            primary_llm=state["primary_llm"],
+            fast_llm=state["fast_llm"],
+            graph_start_time=state["graph_start_time"],
+        )
+        sends.append(Send("sub_custom_retrieve", retriever_state))
+
+    return sends
--- a/backend/danswer/agent_search/deep_qa_graph/graph_builder.py
+++ b/backend/danswer/agent_search/deep_qa_graph/graph_builder.py
@@ -0,0 +1,93 @@
+from langgraph.graph import END
+from langgraph.graph import START
+from langgraph.graph import StateGraph
+
+from danswer.agent_search.deep_qa_graph.edges import sub_continue_to_retrieval
+from danswer.agent_search.deep_qa_graph.edges import sub_continue_to_verifier
+from danswer.agent_search.deep_qa_graph.nodes.combine_retrieved_docs import (
+    sub_combine_retrieved_docs,
+)
+from danswer.agent_search.deep_qa_graph.nodes.custom_retrieve import sub_custom_retrieve
+from danswer.agent_search.deep_qa_graph.nodes.dummy import sub_dummy
+from danswer.agent_search.deep_qa_graph.nodes.final_format import sub_final_format
+from danswer.agent_search.deep_qa_graph.nodes.generate import sub_generate
+from danswer.agent_search.deep_qa_graph.nodes.qa_check import sub_qa_check
+from danswer.agent_search.deep_qa_graph.nodes.verifier import sub_verifier
+from danswer.agent_search.deep_qa_graph.states import ResearchQAOutputState
+from danswer.agent_search.deep_qa_graph.states import ResearchQAState
+
+
+def build_deep_qa_graph() -> StateGraph:
+    """Build the (uncompiled) StateGraph that answers a single sub-question."""
+    # Imported locally; this node is the target of the START edge below
+    from danswer.agent_search.deep_qa_graph.nodes.rewrite import sub_rewrite
+
+    sub_answers = StateGraph(state_schema=ResearchQAState, output=ResearchQAOutputState)
+
+    ### Add Nodes ###
+
+    # Dummy node for initial processing (NOTE(review): no edge reaches it yet)
+    sub_answers.add_node(node="sub_dummy", action=sub_dummy)
+
+    # Re-writing the sub-question into search queries
+    sub_answers.add_node(node="sub_rewrite", action=sub_rewrite)
+
+    # The retrieval step
+    sub_answers.add_node(node="sub_custom_retrieve", action=sub_custom_retrieve)
+
+    # The dedupe step
+    sub_answers.add_node(
+        node="sub_combine_retrieved_docs", action=sub_combine_retrieved_docs
+    )
+
+    # Verifying retrieved information
+    sub_answers.add_node(node="sub_verifier", action=sub_verifier)
+
+    # Generating the response and checking the quality of the answer
+    sub_answers.add_node(node="sub_generate", action=sub_generate)
+    sub_answers.add_node(node="sub_qa_check", action=sub_qa_check)
+
+    # Final formatting of the response
+    sub_answers.add_node(node="sub_final_format", action=sub_final_format)
+
+    ### Add Edges ###
+
+    # Start by re-writing the sub-question into search queries
+    sub_answers.add_edge(start_key=START, end_key="sub_rewrite")
+
+    # For each re-written query, perform a retrieval in parallel
+    sub_answers.add_conditional_edges(
+        source="sub_rewrite",
+        path=sub_continue_to_retrieval,
+        path_map=["sub_custom_retrieve"],
+    )
+
+    # Combine the retrieved docs from the parallel retrievals
+    sub_answers.add_edge(
+        start_key="sub_custom_retrieve", end_key="sub_combine_retrieved_docs"
+    )
+
+    # Go over all of the combined retrieved docs and verify them against the question
+    sub_answers.add_conditional_edges(
+        source="sub_combine_retrieved_docs",
+        path=sub_continue_to_verifier,
+        path_map=["sub_verifier"],
+    )
+
+    # Generate an answer from the verified docs, check it, format it and finish
+    sub_answers.add_edge(start_key="sub_verifier", end_key="sub_generate")
+    sub_answers.add_edge(start_key="sub_generate", end_key="sub_qa_check")
+    sub_answers.add_edge(start_key="sub_qa_check", end_key="sub_final_format")
+    sub_answers.add_edge(start_key="sub_final_format", end_key=END)
+
+    return sub_answers
+
+
+if __name__ == "__main__":
+    # TODO: add the actual question (NOTE(review): nodes also read "fast_llm" etc.)
+    compiled_graph = build_deep_qa_graph().compile()
+    result = compiled_graph.invoke(
+        {"sub_question": "Whose music is kind of hard to easily enjoy?"}
+    )
+    print("\nOUTPUT:")
+    print(result)
--- a/backend/danswer/agent_search/deep_qa_graph/nodes/init.py
+++ b/backend/danswer/agent_search/deep_qa_graph/nodes/init.py
--- a/backend/danswer/agent_search/deep_qa_graph/nodes/combine_retrieved_docs.py
+++ b/backend/danswer/agent_search/deep_qa_graph/nodes/combine_retrieved_docs.py
@@ -0,0 +1,31 @@
+from datetime import datetime
+from typing import Any
+
+from danswer.agent_search.deep_qa_graph.states import ResearchQAState
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def sub_combine_retrieved_docs(state: ResearchQAState) -> dict[str, Any]:
+    """Dedupe the retrieved docs, keeping the first section seen per document id."""
+    node_start_time = datetime.now()
+    sub_question_base_retrieval_docs = state["sub_question_base_retrieval_docs"]
+    print(f"Number of docs from steps: {len(sub_question_base_retrieval_docs)}")
+
+    # Same key as the primary graph's dedupe; the set keeps this a single pass
+    seen_document_ids: set[str] = set()
+    dedupe_docs = []
+    for base_retrieval_doc in sub_question_base_retrieval_docs:
+        document_id = base_retrieval_doc.center_chunk.document_id
+        if document_id not in seen_document_ids:
+            seen_document_ids.add(document_id)
+            dedupe_docs.append(base_retrieval_doc)
+    print(f"Number of deduped docs: {len(dedupe_docs)}")
+
+    return {
+        "sub_question_deduped_retrieval_docs": dedupe_docs,
+        "log_messages": generate_log_message(
+            message="sub - combine_retrieved_docs (dedupe)",
+            node_start_time=node_start_time,
+            graph_start_time=state["graph_start_time"],
+        ),
+    }
--- a/backend/danswer/agent_search/deep_qa_graph/nodes/custom_retrieve.py
+++ b/backend/danswer/agent_search/deep_qa_graph/nodes/custom_retrieve.py
@@ -0,0 +1,33 @@
+from datetime import datetime
+from typing import Any
+
+from danswer.agent_search.primary_graph.states import RetrieverState
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+from danswer.context.search.models import InferenceSection
+
+
+def sub_custom_retrieve(state: RetrieverState) -> dict[str, Any]:
+    """
+    Placeholder retrieval node: for now it yields an empty document list
+
+    Args:
+        state (RetrieverState): The current state ("graph_start_time" is read)
+
+    Returns:
+        dict: "sub_question_base_retrieval_docs" with the retrieved documents
+            plus a "log_messages" entry
+    """
+    print("---RETRIEVE SUB---")
+    started_at = datetime.now()
+
+    # TODO: add the actual retrieval, probably from search_tool.run()
+    retrieved: list[InferenceSection] = []
+    log_entry = generate_log_message(
+        message="sub - custom_retrieve",
+        node_start_time=started_at,
+        graph_start_time=state["graph_start_time"],
+    )
+    return {
+        "sub_question_base_retrieval_docs": retrieved,
+        "log_messages": log_entry,
+    }
--- a/backend/danswer/agent_search/deep_qa_graph/nodes/dummy.py
+++ b/backend/danswer/agent_search/deep_qa_graph/nodes/dummy.py
@@ -0,0 +1,21 @@
+from datetime import datetime
+from typing import Any
+
+from danswer.agent_search.core_qa_graph.states import BaseQAState
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def sub_dummy(state: BaseQAState) -> dict[str, Any]:
+    """
+    No-op step: prints a marker and contributes nothing but a log message
+    """
+    print("---Sub Dummy---")
+
+    started_at = datetime.now()
+    log_entry = generate_log_message(
+        message="sub - dummy",
+        node_start_time=started_at,
+        graph_start_time=state["graph_start_time"],
+    )
+
+    return {"log_messages": log_entry}
--- a/backend/danswer/agent_search/deep_qa_graph/nodes/final_format.py
+++ b/backend/danswer/agent_search/deep_qa_graph/nodes/final_format.py
@@ -0,0 +1,31 @@
+from datetime import datetime
+from typing import Any
+
+from danswer.agent_search.deep_qa_graph.states import ResearchQAState
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def sub_final_format(state: ResearchQAState) -> dict[str, Any]:
+    """
+    Package the sub-question, its answer and the answer check as one "sub_qas" entry
+    """
+    print("---SUB  FINAL FORMAT---")
+    started_at = datetime.now()
+
+    # TODO: Type this
+    sub_qa = {
+        "sub_question": state["sub_question"],
+        "sub_answer": state["sub_question_answer"],
+        "sub_question_nr": state["sub_question_nr"],
+        "sub_answer_check": state["sub_question_answer_check"],
+    }
+
+    log_entry = generate_log_message(
+        message="sub - final format",
+        node_start_time=started_at,
+        graph_start_time=state["graph_start_time"],
+    )
+    return {
+        "sub_qas": [sub_qa],
+        "log_messages": log_entry,
+    }
--- a/backend/danswer/agent_search/deep_qa_graph/nodes/generate.py
+++ b/backend/danswer/agent_search/deep_qa_graph/nodes/generate.py
@@ -0,0 +1,56 @@
+from datetime import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+from langchain_core.messages import merge_message_runs
+
+from danswer.agent_search.deep_qa_graph.states import ResearchQAState
+from danswer.agent_search.shared_graph_utils.prompts import BASE_RAG_PROMPT
+from danswer.agent_search.shared_graph_utils.utils import format_docs
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def sub_generate(state: ResearchQAState) -> dict[str, Any]:
+    """
+    Generate the answer to the sub-question from the verified docs
+
+    Args:
+        state (ResearchQAState): The current state
+
+    Returns:
+        dict: The updated state with the sub-question answer ("" without docs)
+    """
+    print("---SUB GENERATE---")
+    node_start_time = datetime.now()
+
+    question = state["sub_question"]
+    docs = state["sub_question_verified_retrieval_docs"]
+
+    print(f"Number of verified retrieval docs for sub-question: {len(docs)}")
+
+    # The prompt is only needed when there are verified docs to answer from;
+    # without any, no prompt is built, no LLM is called and the answer is empty
+    if len(docs) > 0:
+        msg = [
+            HumanMessage(
+                content=BASE_RAG_PROMPT.format(
+                    question=question, context=format_docs(docs)
+                )
+            )
+        ]
+
+        model = state["fast_llm"]
+        response = list(model.stream(prompt=msg))
+        # Join the streamed chunks into the full answer text
+        response_str = merge_message_runs(response, chunk_separator="")[0].content
+    else:
+        response_str = ""
+
+    return {
+        "sub_question_answer": response_str,
+        "log_messages": generate_log_message(
+            message="sub - generate",
+            node_start_time=node_start_time,
+            graph_start_time=state["graph_start_time"],
+        ),
+    }
--- a/backend/danswer/agent_search/deep_qa_graph/nodes/qa_check.py
+++ b/backend/danswer/agent_search/deep_qa_graph/nodes/qa_check.py
@@ -0,0 +1,57 @@
+import json
+from datetime import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+
+from danswer.agent_search.deep_qa_graph.prompts import SUB_CHECK_PROMPT
+from danswer.agent_search.deep_qa_graph.states import ResearchQAState
+from danswer.agent_search.shared_graph_utils.models import BinaryDecision
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def sub_qa_check(state: ResearchQAState) -> dict[str, Any]:
+    """
+    Check whether the generated answer addresses the sub-question
+
+    Args:
+        state (ResearchQAState): The current state
+
+    Returns:
+        dict: The updated state with the decision of the check
+    """
+    from langchain_core.messages import merge_message_runs
+
+    print("---CHECK SUB QUTPUT---")
+    node_start_time = datetime.now()
+
+    sub_answer = state["sub_question_answer"]
+    sub_question = state["sub_question"]
+    # SUB_CHECK_PROMPT's placeholders are named {question} and {base_answer}
+    msg = [
+        HumanMessage(
+            content=SUB_CHECK_PROMPT.format(
+                question=sub_question, base_answer=sub_answer
+            )
+        )
+    ]
+
+    model = state["fast_llm"]
+    response = list(
+        model.stream(
+            prompt=msg,
+            structured_response_format=BinaryDecision.model_json_schema(),
+        )
+    )
+    # The JSON may be spread over several streamed chunks: join, then parse
+    response_str = merge_message_runs(response, chunk_separator="")[0].content
+    formatted_response = BinaryDecision.model_validate(json.loads(response_str))
+
+    return {
+        "sub_question_answer_check": formatted_response.decision,
+        "log_messages": generate_log_message(
+            message=f"sub - qa check: {formatted_response.decision}",
+            node_start_time=node_start_time,
+            graph_start_time=state["graph_start_time"],
+        ),
+    }
--- a/backend/danswer/agent_search/deep_qa_graph/nodes/rewrite.py
+++ b/backend/danswer/agent_search/deep_qa_graph/nodes/rewrite.py
@@ -0,0 +1,64 @@
+import json
+from datetime import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+
+from danswer.agent_search.deep_qa_graph.states import ResearchQAState
+from danswer.agent_search.shared_graph_utils.models import RewrittenQueries
+from danswer.agent_search.shared_graph_utils.prompts import REWRITE_PROMPT_MULTI
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+from danswer.llm.interfaces import LLM
+
+
+def sub_rewrite(state: ResearchQAState) -> dict[str, Any]:
+    """
+    Transform the sub-question into more suitable search queries.
+
+    Args:
+        state (ResearchQAState): The current state
+
+    Returns:
+        dict: The updated state with the re-written queries (a list of strings)
+            produced by the LLM, plus a log message
+    """
+    # Imported locally: used to join the streamed chunks below
+    from langchain_core.messages import merge_message_runs
+
+    print("---SUB TRANSFORM QUERY---")
+    node_start_time = datetime.now()
+
+    question = state["sub_question"]
+
+    msg = [
+        HumanMessage(
+            content=REWRITE_PROMPT_MULTI.format(question=question),
+        )
+    ]
+
+    # Ask the fast LLM for a reply in the RewrittenQueries JSON schema
+    fast_llm: LLM = state["fast_llm"]
+    llm_response = list(
+        fast_llm.stream(
+            prompt=msg,
+            structured_response_format=RewrittenQueries.model_json_schema(),
+        )
+    )
+
+    # The JSON may be spread over several streamed chunks: join them, parse the
+    # text and validate it against the RewrittenQueries model
+    response_str = merge_message_runs(llm_response, chunk_separator="")[0].content
+    rewritten_queries = RewrittenQueries.model_validate(json.loads(response_str))
+
+    print(f"rewritten_queries: {rewritten_queries}")
+
+    return {
+        # The state field is a list[str] that the retrieval fan-out iterates over,
+        # so hand over the plain list rather than the RewrittenQueries wrapper
+        "sub_question_rewritten_queries": rewritten_queries.rewritten_queries,
+        "log_messages": generate_log_message(
+            message="sub - rewrite",
+            node_start_time=node_start_time,
+            graph_start_time=state["graph_start_time"],
+        ),
+    }
--- a/backend/danswer/agent_search/deep_qa_graph/nodes/verifier.py
+++ b/backend/danswer/agent_search/deep_qa_graph/nodes/verifier.py
@@ -0,0 +1,59 @@
+import json
+from datetime import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+
+from danswer.agent_search.primary_graph.states import VerifierState
+from danswer.agent_search.shared_graph_utils.models import BinaryDecision
+from danswer.agent_search.shared_graph_utils.prompts import VERIFIER_PROMPT
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def sub_verifier(state: VerifierState) -> dict[str, Any]:
+    """
+    Check whether the document is relevant for the sub-question
+
+    Args:
+        state (VerifierState): The current state
+
+    Returns:
+        dict: "sub_question_verified_retrieval_docs" (the doc on "yes") and a log message
+    """
+    from langchain_core.messages import merge_message_runs
+
+    print("---SUB VERIFY QUTPUT---")
+    node_start_time = datetime.now()
+
+    question = state["question"]
+    document_content = state["document"].combined_content
+
+    msg = [
+        HumanMessage(
+            content=VERIFIER_PROMPT.format(
+                question=question, document_content=document_content
+            )
+        )
+    ]
+
+    model = state["fast_llm"]
+    response = list(
+        model.stream(
+            prompt=msg,
+            structured_response_format=BinaryDecision.model_json_schema(),
+        )
+    )
+    # The JSON may be spread over several streamed chunks: join, then parse
+    response_str = merge_message_runs(response, chunk_separator="")[0].content
+    formatted_response = BinaryDecision.model_validate(json.loads(response_str))
+
+    return {
+        "sub_question_verified_retrieval_docs": [state["document"]]
+        if formatted_response.decision == "yes"
+        else [],
+        "log_messages": generate_log_message(
+            message=f"sub - verifier: {formatted_response.decision}",
+            node_start_time=node_start_time,
+            graph_start_time=state["graph_start_time"],
+        ),
+    }
--- a/backend/danswer/agent_search/deep_qa_graph/prompts.py
+++ b/backend/danswer/agent_search/deep_qa_graph/prompts.py
@@ -0,0 +1,13 @@
+SUB_CHECK_PROMPT = """ \n
+    Please check whether the suggested answer seems to address the original question.
+
+    Please only answer with 'yes' or 'no' \n
+    Here is the initial question:
+    \n ------- \n
+    {question}
+    \n ------- \n
+    Here is the proposed answer:
+    \n ------- \n
+    {base_answer}
+    \n ------- \n
+    Please answer with yes or no:"""  # format fields: `question`, `base_answer`
--- a/backend/danswer/agent_search/deep_qa_graph/states.py
+++ b/backend/danswer/agent_search/deep_qa_graph/states.py
@@ -0,0 +1,64 @@
+import operator
+from collections.abc import Sequence
+from datetime import datetime
+from typing import Annotated
+from typing import TypedDict
+
+from langchain_core.messages import BaseMessage
+from langgraph.graph.message import add_messages
+
+from danswer.context.search.models import InferenceSection
+from danswer.llm.interfaces import LLM
+
+
+class ResearchQAState(TypedDict):
+    # State schema of the sub-question (deep QA) graph, read by its nodes and edges
+    original_question: str
+    graph_start_time: datetime
+    sub_question_rewritten_queries: list[str]
+    sub_question: str
+    sub_question_nr: int
+    sub_question_base_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_deduped_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_verified_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_reranked_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_top_chunks: Annotated[Sequence[dict], operator.add]
+    sub_question_answer: str
+    sub_question_answer_check: str
+    log_messages: Annotated[Sequence[BaseMessage], add_messages]
+    sub_qas: Annotated[Sequence[dict], operator.add]
+    primary_llm: LLM
+    fast_llm: LLM
+
+
+class ResearchQAOutputState(TypedDict):
+    # Output schema of the sub-question graph (passed as `output=` to its StateGraph)
+    sub_question_rewritten_queries: list[str]
+    sub_question: str
+    sub_question_nr: int
+    # Answers sent back to core
+    sub_qas: Annotated[Sequence[dict], operator.add]
+    sub_question_base_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_deduped_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_verified_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_reranked_retrieval_docs: Annotated[
+        Sequence[InferenceSection], operator.add
+    ]
+    sub_question_top_chunks: Annotated[Sequence[dict], operator.add]
+    sub_question_answer: str
+    sub_question_answer_check: str
+    log_messages: Annotated[Sequence[BaseMessage], add_messages]
--- a/backend/danswer/agent_search/primary_graph/edges.py
+++ b/backend/danswer/agent_search/primary_graph/edges.py
@@ -0,0 +1,75 @@
+from collections.abc import Hashable
+from typing import Union
+
+from langchain_core.messages import HumanMessage
+from langgraph.types import Send
+
+from danswer.agent_search.core_qa_graph.states import BaseQAState
+from danswer.agent_search.deep_qa_graph.states import ResearchQAState
+from danswer.agent_search.primary_graph.states import QAState
+from danswer.agent_search.shared_graph_utils.prompts import BASE_CHECK_PROMPT
+
+
+def continue_to_initial_sub_questions(
+    state: QAState,
+) -> Union[Hashable, list[Hashable]]:
+    # Fan-out edge: every initial sub-question gets its own run of the
+    # "sub_answers_graph_initial" node, requested through a 'Send()' object
+
+    sends: list[Hashable] = []
+    for initial_sub_question in state["initial_sub_questions"]:
+        sub_graph_state = BaseQAState(
+            sub_question_str=initial_sub_question["sub_question_str"],
+            sub_question_search_queries=initial_sub_question[
+                "sub_question_search_queries"
+            ],
+            sub_question_nr=initial_sub_question["sub_question_nr"],
+            primary_llm=state["primary_llm"],
+            fast_llm=state["fast_llm"],
+            graph_start_time=state["graph_start_time"],
+        )
+        sends.append(Send("sub_answers_graph_initial", sub_graph_state))
+
+    return sends
+
+
+def continue_to_answer_sub_questions(state: QAState) -> Union[Hashable, list[Hashable]]:
+    # Fan-out edge: every sub-question gets its own run of the "sub_answers_graph"
+    # node, requested through a 'Send()' object that carries a ResearchQAState
+
+    sends: list[Hashable] = []
+    for sub_question in state["sub_questions"]:
+        research_state = ResearchQAState(
+            sub_question=sub_question["sub_question_str"],
+            sub_question_nr=sub_question["sub_question_nr"],
+            graph_start_time=state["graph_start_time"],
+            primary_llm=state["primary_llm"],
+            fast_llm=state["fast_llm"],
+        )
+        sends.append(Send("sub_answers_graph", research_state))
+
+    return sends
+
+
+def continue_to_deep_answer(state: QAState) -> Union[Hashable, list[Hashable]]:
+    # Asks the fast LLM to check the base answer; returns "decompose" or "end"
+    print("---GO TO DEEP ANSWER OR END---")
+
+    base_answer = state["base_answer"]
+    question = state["original_question"]
+
+    base_check_message = [
+        HumanMessage(
+            content=BASE_CHECK_PROMPT.format(question=question, base_answer=base_answer)
+        )
+    ]
+
+    model = state["fast_llm"]
+    response = model.invoke(base_check_message)
+
+    print(f"CAN WE CONTINUE W/O GENERATING A DEEP ANSWER? - {response.pretty_repr()}")
+
+    # Decide on the message content: pretty_repr() adds a title banner, so it never
+    # equals "no". Whitespace, case and surrounding punctuation are normalized away.
+    decision = str(response.content).strip().strip(".!'\"").lower()
+    return "decompose" if decision == "no" else "end"
--- a/backend/danswer/agent_search/primary_graph/graph_builder.py
+++ b/backend/danswer/agent_search/primary_graph/graph_builder.py
@@ -0,0 +1,171 @@
+from langgraph.graph import END
+from langgraph.graph import START
+from langgraph.graph import StateGraph
+
+from danswer.agent_search.core_qa_graph.graph_builder import build_core_qa_graph
+from danswer.agent_search.deep_qa_graph.graph_builder import build_deep_qa_graph
+from danswer.agent_search.primary_graph.edges import continue_to_answer_sub_questions
+from danswer.agent_search.primary_graph.edges import continue_to_deep_answer
+from danswer.agent_search.primary_graph.edges import continue_to_initial_sub_questions
+from danswer.agent_search.primary_graph.nodes.base_wait import base_wait
+from danswer.agent_search.primary_graph.nodes.combine_retrieved_docs import (
+    combine_retrieved_docs,
+)
+from danswer.agent_search.primary_graph.nodes.custom_retrieve import custom_retrieve
+from danswer.agent_search.primary_graph.nodes.decompose import decompose
+from danswer.agent_search.primary_graph.nodes.deep_answer_generation import (
+    deep_answer_generation,
+)
+from danswer.agent_search.primary_graph.nodes.dummy_start import dummy_start
+from danswer.agent_search.primary_graph.nodes.entity_term_extraction import (
+    entity_term_extraction,
+)
+from danswer.agent_search.primary_graph.nodes.final_stuff import final_stuff
+from danswer.agent_search.primary_graph.nodes.generate_initial import generate_initial
+from danswer.agent_search.primary_graph.nodes.main_decomp_base import main_decomp_base
+from danswer.agent_search.primary_graph.nodes.rewrite import rewrite
+from danswer.agent_search.primary_graph.nodes.sub_qa_level_aggregator import (
+    sub_qa_level_aggregator,
+)
+from danswer.agent_search.primary_graph.nodes.sub_qa_manager import sub_qa_manager
+from danswer.agent_search.primary_graph.nodes.verifier import verifier
+from danswer.agent_search.primary_graph.states import QAState
+
+
+def build_core_graph() -> StateGraph:
+    """
+    Assemble the primary (core) answer graph.
+
+    The graph is returned uncompiled, so the caller decides when to call
+    `.compile()` on it (the two sub-graphs are compiled here before embedding).
+
+    Returns:
+        StateGraph: the graph builder with all nodes and edges registered
+    """
+    core_answer_graph = StateGraph(state_schema=QAState)
+
+    ### Add Nodes ###
+
+    # NOTE(review): "dummy_start", "rewrite", "custom_retrieve" and
+    # "combine_retrieved_docs" are registered but no edge below leads to them
+    core_answer_graph.add_node(node="dummy_start", action=dummy_start)
+
+    # Re-writing the question
+    core_answer_graph.add_node(node="rewrite", action=rewrite)
+
+    # The retrieval step
+    core_answer_graph.add_node(node="custom_retrieve", action=custom_retrieve)
+
+    # Combine and dedupe retrieved docs.
+    core_answer_graph.add_node(
+        node="combine_retrieved_docs", action=combine_retrieved_docs
+    )
+
+    # Extract entities, terms and relationships
+    core_answer_graph.add_node(
+        node="entity_term_extraction", action=entity_term_extraction
+    )
+
+    # Verifying that a retrieved doc is relevant
+    core_answer_graph.add_node(node="verifier", action=verifier)
+
+    # Initial question decomposition
+    core_answer_graph.add_node(node="main_decomp_base", action=main_decomp_base)
+
+    # Build the base QA sub-graph and compile it
+    compiled_core_qa_graph = build_core_qa_graph().compile()
+    # Add the compiled base QA sub-graph as a node to the core graph
+    core_answer_graph.add_node(
+        node="sub_answers_graph_initial", action=compiled_core_qa_graph
+    )
+
+    # Checking whether the initial answer is in the ballpark
+    core_answer_graph.add_node(node="base_wait", action=base_wait)
+
+    # Decompose the question into sub-questions
+    core_answer_graph.add_node(node="decompose", action=decompose)
+
+    # Manage the sub-questions
+    core_answer_graph.add_node(node="sub_qa_manager", action=sub_qa_manager)
+
+    # Build the research QA sub-graph and compile it
+    compiled_deep_qa_graph = build_deep_qa_graph().compile()
+    # Add the compiled research QA sub-graph as a node to the core graph
+    core_answer_graph.add_node(
+        node="sub_answers_graph", action=compiled_deep_qa_graph
+    )
+
+    # Aggregate the sub-questions
+    core_answer_graph.add_node(
+        node="sub_qa_level_aggregator", action=sub_qa_level_aggregator
+    )
+
+    # aggregate sub questions and answers
+    core_answer_graph.add_node(
+        node="deep_answer_generation", action=deep_answer_generation
+    )
+
+    # A final clean-up step
+    core_answer_graph.add_node(node="final_stuff", action=final_stuff)
+
+    # Generating a response after we know the documents are relevant
+    core_answer_graph.add_node(node="generate_initial", action=generate_initial)
+
+    ### Add Edges ###
+
+    # start the initial sub-question decomposition
+    core_answer_graph.add_edge(start_key=START, end_key="main_decomp_base")
+
+    # fan out to one "sub_answers_graph_initial" run per initial sub-question
+    core_answer_graph.add_conditional_edges(
+        source="main_decomp_base",
+        path=continue_to_initial_sub_questions,
+    )
+
+    # use the retrieved information to generate the answer
+    # NOTE(review): this join also lists "verifier", which has no incoming edge
+    # in this builder - confirm that "generate_initial" can actually be reached
+    core_answer_graph.add_edge(
+        start_key=["verifier", "sub_answers_graph_initial"],
+        end_key="generate_initial",
+    )
+    core_answer_graph.add_edge(start_key="generate_initial", end_key="base_wait")
+
+    # either go on to the deep answer branch or finish with the base answer
+    core_answer_graph.add_conditional_edges(
+        source="base_wait",
+        path=continue_to_deep_answer,
+        path_map={"decompose": "entity_term_extraction", "end": "final_stuff"},
+    )
+
+    # deep answer branch: entity/term extraction runs before the decomposition
+    core_answer_graph.add_edge(start_key="entity_term_extraction", end_key="decompose")
+
+    # fan out to one "sub_answers_graph" run per sub-question
+    core_answer_graph.add_edge(start_key="decompose", end_key="sub_qa_manager")
+    core_answer_graph.add_conditional_edges(
+        source="sub_qa_manager",
+        path=continue_to_answer_sub_questions,
+    )
+
+    # results of the sub-graph runs are passed to the aggregator
+    core_answer_graph.add_edge(
+        start_key="sub_answers_graph", end_key="sub_qa_level_aggregator"
+    )
+
+    # the aggregator feeds the deep answer generation
+    core_answer_graph.add_edge(
+        start_key="sub_qa_level_aggregator", end_key="deep_answer_generation"
+    )
+
+    # the deep answer is handed to the final step
+    core_answer_graph.add_edge(
+        start_key="deep_answer_generation", end_key="final_stuff"
+    )
+
+    # the final step ends the graph
+    core_answer_graph.add_edge(start_key="final_stuff", end_key=END)
+
+    # Not compiled here: the result of a compile() call at this point would be
+    # discarded; compiling is left to whoever uses the returned graph
+    return core_answer_graph
--- a/backend/danswer/agent_search/primary_graph/nodes/init.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/init.py
--- a/backend/danswer/agent_search/primary_graph/nodes/base_wait.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/base_wait.py
@@ -0,0 +1,27 @@
+from datetime import datetime
+from typing import Any
+
+from danswer.agent_search.primary_graph.states import QAState
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def base_wait(state: QAState) -> dict[str, Any]:
+    """
+    Pass-through node: changes no answer state and only records a timing log entry
+
+    Args:
+        state (QAState): The current graph state; only "graph_start_time" is read
+
+    Returns:
+        dict: State update containing just this node's "log_messages" entry
+    """
+
+    print("---Base Wait ---")
+    node_start_time = datetime.now()
+    # Build the log entry first so the return statement stays a plain literal
+    log_entry = generate_log_message(
+        message="core - base_wait",
+        node_start_time=node_start_time,
+        graph_start_time=state["graph_start_time"],
+    )
+    return {"log_messages": log_entry}
--- a/backend/danswer/agent_search/primary_graph/nodes/combine_retrieved_docs.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/combine_retrieved_docs.py
@@ -0,0 +1,36 @@
+from collections.abc import Sequence
+from datetime import datetime
+from typing import Any
+
+from danswer.agent_search.primary_graph.states import QAState
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+from danswer.context.search.models import InferenceSection
+
+
+def combine_retrieved_docs(state: QAState) -> dict[str, Any]:
+    """
+    Dedupe the retrieved docs by center-chunk document id, keeping the first hit.
+    """
+    node_start_time = datetime.now()
+    base_retrieval_docs: Sequence[InferenceSection] = state["base_retrieval_docs"]
+    print(f"Number of docs from steps: {len(base_retrieval_docs)}")
+
+    # Remember emitted ids in a set (ids assumed hashable): O(1) lookups, order kept
+    seen_document_ids = set()
+    dedupe_docs: list[InferenceSection] = []
+    for base_retrieval_doc in base_retrieval_docs:
+        document_id = base_retrieval_doc.center_chunk.document_id
+        if document_id not in seen_document_ids:
+            seen_document_ids.add(document_id)
+            dedupe_docs.append(base_retrieval_doc)
+
+    print(f"Number of deduped docs: {len(dedupe_docs)}")
+
+    return {
+        "deduped_retrieval_docs": dedupe_docs,
+        "log_messages": generate_log_message(
+            message="core - combine_retrieved_docs (dedupe)",
+            node_start_time=node_start_time,
+            graph_start_time=state["graph_start_time"],
+        ),
+    }
--- a/backend/danswer/agent_search/primary_graph/nodes/custom_retrieve.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/custom_retrieve.py
@@ -0,0 +1,52 @@
+from datetime import datetime
+from typing import Any
+
+from danswer.agent_search.primary_graph.states import RetrieverState
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+from danswer.context.search.models import InferenceSection
+from danswer.context.search.models import SearchRequest
+from danswer.context.search.pipeline import SearchPipeline
+from danswer.db.engine import get_session_context_manager
+from danswer.llm.factory import get_default_llms
+
+
+def custom_retrieve(state: RetrieverState) -> dict[str, Any]:
+    """
+    Retrieve documents for the rewritten query and hand them back to the graph
+
+    Args:
+        state (RetrieverState): Graph state; reads "rewritten_query" and "graph_start_time"
+
+    Returns:
+        dict: "base_retrieval_docs" with the reranked sections, plus a "log_messages" entry
+    """
+    print("---RETRIEVE---")
+
+    node_start_time = datetime.now()
+
+    query = state["rewritten_query"]
+
+    # TODO: add the actual retrieval, probably from search_tool.run()
+    llm, fast_llm = get_default_llms()
+    with get_session_context_manager() as db_session:
+        top_sections = SearchPipeline(
+            search_request=SearchRequest(
+                query=query,
+            ),
+            user=None,
+            llm=llm,
+            fast_llm=fast_llm,
+            db_session=db_session,
+        ).reranked_sections
+        print(len(top_sections))
+    # Pass the retrieved sections on; an empty list here would discard the search result
+    documents: list[InferenceSection] = list(top_sections)
+
+    return {
+        "base_retrieval_docs": documents,
+        "log_messages": generate_log_message(
+            message="core - custom_retrieve",
+            node_start_time=node_start_time,
+            graph_start_time=state["graph_start_time"],
+        ),
+    }
--- a/backend/danswer/agent_search/primary_graph/nodes/decompose.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/decompose.py
@@ -0,0 +1,78 @@
+import json
+import re
+from datetime import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+
+from danswer.agent_search.primary_graph.states import QAState
+from danswer.agent_search.shared_graph_utils.prompts import DEEP_DECOMPOSE_PROMPT
+from danswer.agent_search.shared_graph_utils.utils import format_entity_term_extraction
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def decompose(state: QAState) -> dict[str, Any]:
+    """
+    Ask the fast LLM for deeper sub-questions, given the base answer so far
+
+    Args:
+        state (QAState): Graph state; reads "original_question", "base_answer",
+            "retrieved_entities_relationships", "initial_sub_qas", "fast_llm"
+            and "graph_start_time"
+
+    Returns:
+        dict: "decomposed_sub_questions_dict" mapping a running index to each
+            parsed sub-question (with "answered" and "verified" set to False),
+            plus a "log_messages" entry
+    """
+    node_start_time = datetime.now()
+
+    question = state["original_question"]
+    base_answer = state["base_answer"]
+
+    # get the entity term extraction dict and properly format it
+    entity_terms = state["retrieved_entities_relationships"][
+        "retrieved_entities_relationships"
+    ]
+    entity_term_extraction_str = format_entity_term_extraction(entity_terms)
+
+    # split the initial sub-questions by whether their answer passed the check
+    sub_qas = state["initial_sub_qas"]
+    answered = [qa["sub_question"] for qa in sub_qas if qa["sub_answer_check"] == "yes"]
+    failed = [qa["sub_question"] for qa in sub_qas if qa["sub_answer_check"] == "no"]
+
+    msg = [
+        HumanMessage(
+            content=DEEP_DECOMPOSE_PROMPT.format(
+                question=question,
+                entity_term_extraction_str=entity_term_extraction_str,
+                base_answer=base_answer,
+                answered_sub_questions="\n - ".join(answered),
+                failed_sub_questions="\n - ".join(failed),
+            ),
+        )
+    ]
+
+    model = state["fast_llm"]
+    response = model.invoke(msg)
+
+    # Parse the message text itself: pretty_repr() prepends a title banner, which
+    # makes json.loads fail even on a well-formed reply.
+    cleaned_response = re.sub(r"```json\n|\n```", "", response.content)
+    parsed_response = json.loads(cleaned_response)
+
+    # index the sub-questions and mark each one as not yet answered / verified
+    sub_questions_dict = {}
+    for sub_question_nr, sub_question in enumerate(parsed_response["sub_questions"]):
+        sub_question["answered"] = False
+        sub_question["verified"] = False
+        sub_questions_dict[sub_question_nr] = sub_question
+
+    return {
+        "decomposed_sub_questions_dict": sub_questions_dict,
+        "log_messages": generate_log_message(
+            message="deep - decompose",
+            node_start_time=node_start_time,
+            graph_start_time=state["graph_start_time"],
+        ),
+    }
--- a/backend/danswer/agent_search/primary_graph/nodes/deep_answer_generation.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/deep_answer_generation.py
@@ -0,0 +1,61 @@
+from datetime import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+
+from danswer.agent_search.primary_graph.states import QAState
+from danswer.agent_search.shared_graph_utils.prompts import COMBINED_CONTEXT
+from danswer.agent_search.shared_graph_utils.prompts import MODIFIED_RAG_PROMPT
+from danswer.agent_search.shared_graph_utils.utils import format_docs
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+from danswer.agent_search.shared_graph_utils.utils import normalize_whitespace
+
+
+# generate the deep answer from the aggregated sub-answer context and the docs
+def deep_answer_generation(state: QAState) -> dict[str, Any]:
+    """
+    Generate the deep answer
+
+    Args:
+        state (QAState): The current graph state; reads "original_question",
+            "deduped_retrieval_docs", "core_answer_dynamic_context", "fast_llm"
+            and "graph_start_time"
+
+    Returns:
+         dict: State update with "deep_answer" and a "log_messages" entry
+    """
+    print("---DEEP GENERATE---")
+
+    node_start_time = datetime.now()
+
+    question = state["original_question"]
+    docs = state["deduped_retrieval_docs"]
+    deep_answer_context = state["core_answer_dynamic_context"]
+
+    print(f"Number of verified retrieval docs - deep: {len(docs)}")
+
+    # merge the sub-answer context and the formatted docs into one context block
+    raw_context = COMBINED_CONTEXT.format(
+        deep_answer_context=deep_answer_context, formated_docs=format_docs(docs)
+    )
+    combined_context = normalize_whitespace(raw_context)
+
+    prompt = MODIFIED_RAG_PROMPT.format(
+        question=question, combined_context=combined_context
+    )
+    msg = [HumanMessage(content=prompt)]
+
+    # single invoke() call on the LLM stored under "fast_llm"
+    model = state["fast_llm"]
+    response = model.invoke(msg)
+    deep_answer = response.content
+
+    log_entry = generate_log_message(
+        message="deep - deep answer generation",
+        node_start_time=node_start_time,
+        graph_start_time=state["graph_start_time"],
+    )
+    return {
+        "deep_answer": deep_answer,
+        "log_messages": log_entry,
+    }
--- a/backend/danswer/agent_search/primary_graph/nodes/dummy_start.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/dummy_start.py
@@ -0,0 +1,11 @@
+from datetime import datetime
+from typing import Any
+
+from danswer.agent_search.primary_graph.states import QAState
+
+
+def dummy_start(state: QAState) -> dict[str, Any]:
+    """
+    No-op entry node: ignores `state` and only stamps "start_time" with the current time
+    """
+    return dict(start_time=datetime.now())
--- a/backend/danswer/agent_search/primary_graph/nodes/entity_term_extraction.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/entity_term_extraction.py
@@ -0,0 +1,51 @@
+import json
+import re
+from datetime import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+from langchain_core.messages import merge_message_runs
+
+from danswer.agent_search.primary_graph.prompts import ENTITY_TERM_PROMPT
+from danswer.agent_search.primary_graph.states import QAState
+from danswer.agent_search.shared_graph_utils.utils import format_docs
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+from danswer.llm.factory import get_default_llms
+
+
+def entity_term_extraction(state: QAState) -> dict[str, Any]:
+    """Extract entities, terms and their relations from the question and doc context
+
+    Returns a state update with the parsed JSON under
+    "retrieved_entities_relationships" plus a "log_messages" entry.
+    """
+    node_start_time = datetime.now()
+
+    question = state["original_question"]
+    docs = state["deduped_retrieval_docs"]
+
+    prompt = ENTITY_TERM_PROMPT.format(question=question, context=format_docs(docs))
+    msg = [HumanMessage(content=prompt)]
+
+    # the LLM comes from get_default_llms(), not from the graph state
+    _, fast_llm = get_default_llms()
+    # no structured_response_format is passed; the JSON shape comes from the prompt
+    streamed_chunks = list(fast_llm.stream(prompt=msg))
+    # stitch the streamed items back into one message before reading its text
+    merged_runs = merge_message_runs(streamed_chunks, chunk_separator="")
+    llm_response = merged_runs[0].content
+
+    # remove ```json fence markers from the reply, then parse it
+    cleaned_response = re.sub(r"```json\n|\n```", "", llm_response)
+    parsed_response = json.loads(cleaned_response)
+
+    log_entry = generate_log_message(
+        message="deep - entity term extraction",
+        node_start_time=node_start_time,
+        graph_start_time=state["graph_start_time"],
+    )
+
+    return {
+        "retrieved_entities_relationships": parsed_response,
+        "log_messages": log_entry,
+    }
--- a/backend/danswer/agent_search/primary_graph/nodes/final_stuff.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/final_stuff.py
@@ -0,0 +1,85 @@
+from datetime import datetime
+from typing import Any
+
+from danswer.agent_search.primary_graph.states import QAState
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def _format_verified_sub_qas(sub_qas: Any) -> str:
+    """
+    Render the entries whose "sub_answer_check" is "yes" as a printable block
+
+    Args:
+        sub_qas: Iterable of dicts with "sub_question", "sub_answer" and
+            "sub_answer_check" keys
+
+    Returns:
+        str: One question/answer section per verified entry, newline-joined
+    """
+    return "\n".join(
+        f'  Question:\n  {sub_qa["sub_question"]}\n  --\n  Answer:\n  {sub_qa["sub_answer"]}\n  -----'
+        for sub_qa in sub_qas
+        if sub_qa["sub_answer_check"] == "yes"
+    )
+
+
+def final_stuff(state: QAState) -> dict[str, Any]:
+    """
+    Terminal node: prints the collected log messages, the base answer and the
+    verified initial sub-questions, followed - when a deep answer exists - by the
+    deep answer and its verified sub-questions.
+
+    Args:
+        state (QAState): The current graph state
+
+    Returns:
+        dict: State update containing only this node's "log_messages" entry
+    """
+    print("---FINAL---")
+    node_start_time = datetime.now()
+
+    messages = state["log_messages"]
+    # the rendered strings are sorted lexicographically before printing
+    time_ordered_messages = [x.pretty_repr() for x in messages]
+    time_ordered_messages.sort()
+
+    print("Message Log:")
+    print("\n".join(time_ordered_messages))
+
+    initial_sub_qa_context = _format_verified_sub_qas(state["initial_sub_qas"])
+
+    log_message = generate_log_message(
+        message="all - final_stuff",
+        node_start_time=node_start_time,
+        graph_start_time=state["graph_start_time"],
+    )
+
+    print(log_message)
+    print("--------------------------------")
+
+    base_answer = state["base_answer"]
+
+    print(f"Final Base Answer:\n{base_answer}")
+    print("--------------------------------")
+    print(f"Initial Answered Sub Questions:\n{initial_sub_qa_context}")
+    print("--------------------------------")
+
+    if not state.get("deep_answer"):
+        print("No Deep Answer was required")
+        return {
+            "log_messages": log_message,
+        }
+
+    deep_answer = state["deep_answer"]
+    sub_qa_context = _format_verified_sub_qas(state["sub_qas"])
+
+    # the base answer was already printed above, so it is not repeated here
+    print(f"Final Deep Answer:\n{deep_answer}")
+    print("--------------------------------")
+    print("Sub Questions and Answers:")
+    print(sub_qa_context)
+
+    # reuse the entry printed above so the stored log matches what was shown
+    return {
+        "log_messages": log_message,
+    }
--- a/backend/danswer/agent_search/primary_graph/nodes/generate.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/generate.py
@@ -0,0 +1,52 @@
+from datetime import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+
+from danswer.agent_search.primary_graph.states import QAState
+from danswer.agent_search.shared_graph_utils.prompts import BASE_RAG_PROMPT
+from danswer.agent_search.shared_graph_utils.utils import format_docs
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def generate(state: QAState) -> dict[str, Any]:
+    """
+    Produce the base answer from the question and the deduped retrieval docs
+
+    Args:
+        state (QAState): The current graph state; reads "original_question",
+            "deduped_retrieval_docs", "fast_llm" and
+            "graph_start_time"
+
+    Returns:
+         dict: State update with "base_answer" and a "log_messages" entry
+    """
+    print("---GENERATE---")
+    node_start_time = datetime.now()
+
+    question = state["original_question"]
+    docs = state["deduped_retrieval_docs"]
+
+    print(f"Number of verified retrieval docs: {len(docs)}")
+
+    prompt = BASE_RAG_PROMPT.format(question=question, context=format_docs(docs))
+    msg = [HumanMessage(content=prompt)]
+
+    # Stream the completion from the LLM in the state and collect every item
+    llm = state["fast_llm"]
+    streamed = list(llm.stream(prompt=msg, structured_response_format=None))
+
+    # NOTE(review): only the first streamed item is used for the answer - confirm
+    # that the stream delivers the full message as its first element
+    base_answer = streamed[0].pretty_repr()
+
+    log_entry = generate_log_message(
+        message="core - generate",
+        node_start_time=node_start_time,
+        graph_start_time=state["graph_start_time"],
+    )
+
+    return {
+        "base_answer": base_answer,
+        "log_messages": log_entry,
+    }
--- a/backend/danswer/agent_search/primary_graph/nodes/generate_initial.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/generate_initial.py
@@ -0,0 +1,72 @@
+from datetime import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+
+from danswer.agent_search.primary_graph.prompts import INITIAL_RAG_PROMPT
+from danswer.agent_search.primary_graph.states import QAState
+from danswer.agent_search.shared_graph_utils.utils import format_docs
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def generate_initial(state: QAState) -> dict[str, Any]:
+    """
+    Generate the initial (base) answer from the retrieved docs and the usable
+    answers to the initial sub-questions
+
+    Args:
+        state (QAState): The current graph state; reads "original_question",
+            "deduped_retrieval_docs", "initial_sub_qas", "fast_llm" and
+            "graph_start_time"
+
+    Returns:
+         dict: State update with "base_answer" and a "log_messages" entry
+    """
+    print("---GENERATE INITIAL---")
+    node_start_time = datetime.now()
+
+    question = state["original_question"]
+    docs = state["deduped_retrieval_docs"]
+    print(f"Number of verified retrieval docs - base: {len(docs)}")
+
+    sub_question_answers = state["initial_sub_qas"]
+
+    _SUB_QUESTION_ANSWER_TEMPLATE = """
+    Sub-Question:\n  - {sub_question}\n  --\nAnswer:\n  - {sub_answer}\n\n
+    """
+
+    def _is_usable(sub_qa: dict) -> bool:
+        # verified, non-empty, and not the literal "I don't know" fallback
+        if sub_qa["sub_answer_check"] != "yes":
+            return False
+        sub_answer = sub_qa["sub_answer"]
+        return len(sub_answer) > 0 and sub_answer != "I don't know"
+
+    sub_question_answers_list = [
+        _SUB_QUESTION_ANSWER_TEMPLATE.format(
+            sub_question=sub_qa["sub_question"], sub_answer=sub_qa["sub_answer"]
+        )
+        for sub_qa in sub_question_answers
+        if _is_usable(sub_qa)
+    ]
+    sub_question_answer_str = "\n\n------\n\n".join(sub_question_answers_list)
+
+    prompt = INITIAL_RAG_PROMPT.format(
+        question=question,
+        context=format_docs(docs),
+        answered_sub_questions=sub_question_answer_str,
+    )
+    msg = [HumanMessage(content=prompt)]
+
+    # single invoke() call on the LLM stored under "fast_llm"
+    model = state["fast_llm"]
+    response = model.invoke(msg)
+    base_answer = response.pretty_repr()
+
+    log_entry = generate_log_message(
+        message="core - generate initial",
+        node_start_time=node_start_time,
+        graph_start_time=state["graph_start_time"],
+    )
+
+    return {"base_answer": base_answer, "log_messages": log_entry}
--- a/backend/danswer/agent_search/primary_graph/nodes/main_decomp_base.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/main_decomp_base.py
@@ -0,0 +1,64 @@
+from datetime import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+
+from danswer.agent_search.primary_graph.prompts import INITIAL_DECOMPOSITION_PROMPT
+from danswer.agent_search.primary_graph.states import QAState
+from danswer.agent_search.shared_graph_utils.utils import clean_and_parse_list_string
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def main_decomp_base(state: QAState) -> dict[str, Any]:
+    """
+    Decompose the original question into initial sub-questions, each paired with
+    a single search term
+
+    Args:
+        state (QAState): The current graph state; reads "original_question",
+            "fast_llm" and "graph_start_time"
+
+    Returns:
+        dict: State update with "initial_sub_questions" (one dict per
+            sub-question), "sub_query_start_time" and a "log_messages"
+            entry
+    """
+
+    print("---INITIAL DECOMP---")
+    node_start_time = datetime.now()
+
+    question = state["original_question"]
+
+    # build the single-message prompt
+    prompt = INITIAL_DECOMPOSITION_PROMPT.format(question=question)
+    msg = [HumanMessage(content=prompt)]
+
+    # Ask the LLM for the decomposition
+    model = state["fast_llm"]
+    response = model.invoke(msg)
+
+    content = response.pretty_repr()
+    list_of_subquestions = clean_and_parse_list_string(content)
+
+    decomp_list = [
+        {
+            "sub_question_str": sub_question["sub_question"].strip(),
+            # temporarily: exactly one search query per sub-question
+            "sub_question_search_queries": [sub_question["search_term"]],
+            "sub_question_nr": sub_question_nr,
+        }
+        for sub_question_nr, sub_question in enumerate(list_of_subquestions)
+    ]
+
+    log_entry = generate_log_message(
+        message="core - initial decomp",
+        node_start_time=node_start_time,
+        graph_start_time=state["graph_start_time"],
+    )
+
+    # this node's start time is also stored as "sub_query_start_time"
+    return {
+        "initial_sub_questions": decomp_list,
+        "sub_query_start_time": node_start_time,
+        "log_messages": log_entry,
+    }
--- a/backend/danswer/agent_search/primary_graph/nodes/rewrite.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/rewrite.py
@@ -0,0 +1,55 @@
+import json
+from datetime import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+
+from danswer.agent_search.primary_graph.states import QAState
+from danswer.agent_search.shared_graph_utils.models import RewrittenQueries
+from danswer.agent_search.shared_graph_utils.prompts import REWRITE_PROMPT_MULTI
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def rewrite(state: QAState) -> dict[str, Any]:
+    """
+    Transform the initial question into more suitable search queries.
+
+    Args:
+        state (QAState): The current graph state; reads "original_question"
+            and "fast_llm"
+
+    Returns:
+        dict: State update with "rewritten_queries" and a "log_messages" entry
+    """
+    print("---STARTING GRAPH---")
+    graph_start_time = datetime.now()
+
+    print("---TRANSFORM QUERY---")
+    node_start_time = datetime.now()
+
+    question = state["original_question"]
+
+    msg = [HumanMessage(content=REWRITE_PROMPT_MULTI.format(question=question))]
+
+    # Get the rewritten queries in a defined format
+    fast_llm = state["fast_llm"]
+    llm_response = list(
+        fast_llm.stream(
+            prompt=msg,
+            structured_response_format=RewrittenQueries.model_json_schema(),
+        )
+    )
+
+    # json.loads yields a plain dict; validate it into the model so that the
+    # attribute access in the return value works instead of raising.
+    raw_response = json.loads(llm_response[0].pretty_repr())
+    formatted_response = RewrittenQueries.model_validate(raw_response)
+
+    return {
+        "rewritten_queries": formatted_response.rewritten_queries,
+        "log_messages": generate_log_message(
+            message="core - rewrite",
+            node_start_time=node_start_time,
+            graph_start_time=graph_start_time,
+        ),
+    }
--- a/backend/danswer/agent_search/primary_graph/nodes/sub_qa_level_aggregator.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/sub_qa_level_aggregator.py
@@ -0,0 +1,39 @@
+from datetime import datetime
+from typing import Any
+
+from danswer.agent_search.primary_graph.states import QAState
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+# aggregate sub questions and answers
+def sub_qa_level_aggregator(state: QAState) -> dict[str, Any]:
+    """Collect the verified sub-question answers into one context string."""
+    sub_qas = state["sub_qas"]
+    node_start_time = datetime.now()
+
+    # the context always opens with this fixed introduction line
+    context_parts = [
+        "Below you will find useful information to answer the original question:"
+    ]
+    checked_sub_qas = []
+
+    for sub_qa in sub_qas:
+        question, answer = sub_qa["sub_question"], sub_qa["sub_answer"]
+        # entries that did not pass the answer check are left out entirely
+        if sub_qa["sub_answer_check"] != "yes":
+            continue
+        context_parts.append(
+            f"Question:\n{question}\n\nAnswer:\n{answer}\n\n---\n\n"
+        )
+        checked_sub_qas.append({"sub_question": question, "sub_answer": answer})
+
+    log_entry = generate_log_message(
+        message="deep - sub qa level aggregator",
+        node_start_time=node_start_time,
+        graph_start_time=state["graph_start_time"],
+    )
+    return {
+        "core_answer_dynamic_context": "\n".join(context_parts),
+        "checked_sub_qas": checked_sub_qas,
+        "log_messages": log_entry,
+    }
--- a/backend/danswer/agent_search/primary_graph/nodes/sub_qa_manager.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/sub_qa_manager.py
@@ -0,0 +1,28 @@
+from datetime import datetime
+from typing import Any
+
+from danswer.agent_search.primary_graph.states import QAState
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def sub_qa_manager(state: QAState) -> dict[str, Any]:
+    """
+    Map each decomposed sub-question number to its question text and set the
+    new-question iteration counter to 0.
+    """
+    node_start_time = datetime.now()
+
+    decomposed = state["decomposed_sub_questions_dict"]
+    # keep only the question text; other per-question fields are not carried over
+    sub_questions = {nr: entry["sub_question"] for nr, entry in decomposed.items()}
+
+    log_entry = generate_log_message(
+        message="deep - sub qa manager",
+        node_start_time=node_start_time,
+        graph_start_time=state["graph_start_time"],
+    )
+    return {
+        "sub_questions": sub_questions,
+        "num_new_question_iterations": 0,
+        "log_messages": log_entry,
+    }
--- a/backend/danswer/agent_search/primary_graph/nodes/verifier.py
+++ b/backend/danswer/agent_search/primary_graph/nodes/verifier.py
@@ -0,0 +1,59 @@
+import json
+from datetime import datetime
+from typing import Any
+
+from langchain_core.messages import HumanMessage
+
+from danswer.agent_search.primary_graph.states import VerifierState
+from danswer.agent_search.shared_graph_utils.models import BinaryDecision
+from danswer.agent_search.shared_graph_utils.prompts import VERIFIER_PROMPT
+from danswer.agent_search.shared_graph_utils.utils import generate_log_message
+
+
+def verifier(state: VerifierState) -> dict[str, Any]:
+    """
+    Check whether the document is relevant for the question held in the state
+
+    Args:
+        state (VerifierState): The current state; reads "question", "document",
+            "fast_llm" and "graph_start_time"
+
+    Returns:
+        dict: "deduped_retrieval_docs" holding the document when the decision is
+            "yes" (otherwise an empty list), plus a "log_messages" entry
+    """
+    print("---VERIFY QUTPUT---")
+    node_start_time = datetime.now()
+
+    question = state["question"]
+    document_content = state["document"].combined_content
+
+    prompt = VERIFIER_PROMPT.format(
+        question=question, document_content=document_content
+    )
+    msg = [HumanMessage(content=prompt)]
+
+    # Request a reply shaped like the BinaryDecision JSON schema
+    llm = state["fast_llm"]
+    decision_schema = BinaryDecision.model_json_schema()
+    response = list(llm.stream(prompt=msg, structured_response_format=decision_schema))
+
+    # NOTE(review): only the first streamed item is parsed - confirm the stream
+    # delivers the complete JSON in its first element
+    raw_response = json.loads(response[0].pretty_repr())
+    formatted_response = BinaryDecision.model_validate(raw_response)
+
+    if formatted_response.decision == "yes":
+        kept_docs = [state["document"]]
+    else:
+        kept_docs = []
+
+    log_entry = generate_log_message(
+        message=f"core - verifier: {formatted_response.decision}",
+        node_start_time=node_start_time,
+        graph_start_time=state["graph_start_time"],
+    )
+    return {
+        "deduped_retrieval_docs": kept_docs,
+        "log_messages": log_entry,
+    }
--- a/backend/danswer/agent_search/primary_graph/prompts.py
+++ b/backend/danswer/agent_search/primary_graph/prompts.py
@@ -0,0 +1,86 @@
+INITIAL_DECOMPOSITION_PROMPT = """ \n
+    Please decompose an initial user question into not more than 4 appropriate sub-questions that help to
+    answer the original question. The purpose for this decomposition is to isolate individual entities
+    (i.e., 'compare sales of company A and company B' -> 'what are sales for company A' + 'what are sales
+    for company B'), split ambiguous terms (i.e., 'what is our success with company A' -> 'what are our
+    sales with company A' + 'what is our market share with company A' + 'is company A a reference customer
+    for us'), etc. Each sub-question should realistically be answerable by a good RAG system. \n
+
+    For each sub-question, please also create one search term that can be used to retrieve relevant
+    documents from a document store.
+
+    Here is the initial question:
+    \n ------- \n
+    {question}
+    \n ------- \n
+
+    Please formulate your answer as a list of json objects with the following format:
+
+   [{{"sub_question": <sub-question>, "search_term": <search term>}}, ...]
+
+    Answer:
+    """
+
+INITIAL_RAG_PROMPT = """ \n
+    You are an assistant for question-answering tasks. Use the information provided below - and only the
+    provided information - to answer the provided question.
+
+    The information provided below consists of:
+     1) a number of answered sub-questions - these are very important(!) and definitely should be
+     considered to answer the question.
+     2) a number of documents that were also deemed relevant for the question.
+
+    If you don't know the answer or if the provided information is empty or insufficient, just say
+    "I don't know". Do not use your internal knowledge!
+
+    Again, only use the provided information and do not use your internal knowledge! It is a matter of life
+    and death that you do NOT use your internal knowledge, just the provided information!
+
+    Try to keep your answer concise.
+
+    And here is the question and the provided information:
+    \n
+    \nQuestion:\n {question}
+
+    \nAnswered Sub-questions:\n {answered_sub_questions}
+
+    \nContext:\n {context} \n\n
+    \n\n
+
+    Answer:"""
+
+ENTITY_TERM_PROMPT = """ \n
+    Based on the original question and the context retrieved from a dataset, please generate a list of
+    entities (e.g. companies, organizations, industries, products, locations, etc.), terms and concepts
+    (e.g. sales, revenue, etc.) that are relevant for the question, plus their relations to each other.
+
+    \n\n
+    Here is the original question:
+    \n ------- \n
+    {question}
+    \n ------- \n
+   And here is the context retrieved:
+    \n ------- \n
+    {context}
+    \n ------- \n
+
+    Please format your answer as a json object in the following format:
+
+    {{"retrieved_entities_relationships": {{
+        "entities": [{{
+            "entity_name": <assign a name for the entity>,
+            "entity_type": <specify a short type name for the entity, such as 'company', 'location',...>
+        }}],
+        "relationships": [{{
+            "name": <assign a name for the relationship>,
+            "type": <specify a short type name for the relationship, such as 'sales_to', 'is_location_of',...>,
+            "entities": [<related entity name 1>, <related entity name 2>]
+        }}],
+        "terms": [{{
+            "term_name": <assign a name for the term>,
+            "term_type": <specify a short type name for the term, such as 'revenue', 'market_share',...>,
+            "similar_to": <list terms that are similar to this term>
+        }}]
+    }}
+    }}
+   """
--- a/backend/danswer/agent_search/primary_graph/states.py
+++ b/backend/danswer/agent_search/primary_graph/states.py
@@ -0,0 +1,73 @@
+import operator
+from collections.abc import Sequence
+from datetime import datetime
+from typing import Annotated
+from typing import TypedDict
+
+from langchain_core.messages import BaseMessage
+from langgraph.graph.message import add_messages
+
+from danswer.agent_search.shared_graph_utils.models import RewrittenQueries
+from danswer.context.search.models import InferenceSection
+
+
+class QAState(TypedDict):
+    """The 'main' state of the answer graph.
+
+    Keys whose annotation carries a second ``Annotated`` argument
+    (``operator.add`` or ``add_messages``) name the function used to combine
+    successive values written to that key; in LangGraph this second argument
+    is the key's reducer. Keys without one are plain values.
+    """
+
+    original_question: str
+    graph_start_time: datetime
+    # start time for the parallel initial sub-question thread
+    sub_query_start_time: datetime
+    log_messages: Annotated[Sequence[BaseMessage], add_messages]
+    rewritten_queries: RewrittenQueries
+    sub_questions: list[dict]
+    initial_sub_questions: list[dict]
+    ranked_subquestion_ids: list[int]
+    decomposed_sub_questions_dict: dict
+    rejected_sub_questions: Annotated[list[str], operator.add]
+    rejected_sub_questions_handled: bool
+    sub_qas: Annotated[Sequence[dict], operator.add]
+    initial_sub_qas: Annotated[Sequence[dict], operator.add]
+    checked_sub_qas: Annotated[Sequence[dict], operator.add]
+    base_retrieval_docs: Annotated[Sequence[InferenceSection], operator.add]
+    deduped_retrieval_docs: Annotated[Sequence[InferenceSection], operator.add]
+    reranked_retrieval_docs: Annotated[Sequence[InferenceSection], operator.add]
+    retrieved_entities_relationships: dict
+    questions_context: list[dict]
+    qa_level: int
+    top_chunks: list[InferenceSection]
+    sub_question_top_chunks: Annotated[Sequence[dict], operator.add]
+    num_new_question_iterations: int
+    core_answer_dynamic_context: str
+    dynamic_context: str
+    initial_base_answer: str
+    base_answer: str
+    deep_answer: str
+
+
+class QAOuputState(TypedDict):
+    """The 'main' output state of the answer graph.
+
+    Declares a subset of the keys declared on QAState, with identical
+    annotations; the intermediate keys are left out.
+
+    NOTE(review): the class name is misspelled ("Ouput"). It is left as-is
+    here because other modules may import it under this name.
+    """
+
+    original_question: str
+    log_messages: Annotated[Sequence[BaseMessage], add_messages]
+    sub_questions: list[dict]
+    sub_qas: Annotated[Sequence[dict], operator.add]
+    initial_sub_qas: Annotated[Sequence[dict], operator.add]
+    checked_sub_qas: Annotated[Sequence[dict], operator.add]
+    reranked_retrieval_docs: Annotated[Sequence[InferenceSection], operator.add]
+    retrieved_entities_relationships: dict
+    top_chunks: list[InferenceSection]
+    sub_question_top_chunks: Annotated[Sequence[dict], operator.add]
+    base_answer: str
+    deep_answer: str
+
+
+class RetrieverState(TypedDict):
+    """State for the parallel retrievers; each one needs to see only one query."""
+
+    # The single rewritten query this retriever instance works on
+    rewritten_query: str
+    graph_start_time: datetime
+
+
+class VerifierState(TypedDict):
+    """State for the parallel verification step.
+
+    Each node execution needs to see only one question/document pair.
+    """
+
+    document: InferenceSection
+    question: str
+    graph_start_time: datetime
--- a/backend/danswer/agent_search/run_graph.py
+++ b/backend/danswer/agent_search/run_graph.py
@@ -0,0 +1,22 @@
+from danswer.agent_search.primary_graph.graph_builder import build_core_graph
+from danswer.llm.answering.answer import AnswerStream
+from danswer.llm.interfaces import LLM
+from danswer.tools.tool import Tool
+
+
+def run_graph(
+    query: str,
+    llm: LLM,
+    tools: list[Tool],
+) -> AnswerStream:
+    graph = build_core_graph()
+
+    inputs = {
+        "original_question": query,
+        "messages": [],
+        "tools": tools,
+        "llm": llm,
+    }
+    compiled_graph = graph.compile()
+    output = compiled_graph.invoke(input=inputs)
+    yield from output
--- a/backend/danswer/agent_search/shared_graph_utils/models.py
+++ b/backend/danswer/agent_search/shared_graph_utils/models.py
@@ -0,0 +1,16 @@
+from typing import Literal
+
+from pydantic import BaseModel
+
+
+# Pydantic models for structured outputs
+class RewrittenQueries(BaseModel):
+    """Structured output holding a list of rewritten query strings."""
+
+    rewritten_queries: list[str]
+
+
+class BinaryDecision(BaseModel):
+    """Structured output for a yes/no decision."""
+
+    # Only the exact strings "yes" and "no" are accepted
+    decision: Literal["yes", "no"]
+
+
+class SubQuestions(BaseModel):
+    """Structured output holding a list of sub-question strings."""
+
+    sub_questions: list[str]
--- a/backend/danswer/agent_search/shared_graph_utils/prompts.py
+++ b/backend/danswer/agent_search/shared_graph_utils/prompts.py
@@ -0,0 +1,342 @@
+REWRITE_PROMPT_MULTI_ORIGINAL = """ \n
+    Please convert an initial user question into a 2-3 more appropriate short and pointed search queries for retrievel from a
+    document store. Particularly, try to think about resolving ambiguities and make the search queries more specific,
+    enabling the system to search more broadly.
+    Also, try to make the search queries not redundant, i.e. not too similar! \n\n
+    Here is the initial question:
+    \n ------- \n
+    {question}
+    \n ------- \n
+
+    Formulate the queries separated by '--' (Do not say 'Query 1: ...', just write the querytext): """
+
+
+# Prompt that asks for 2-3 short sample documents that could answer the
+# question, returned separated by '--'. Placeholder: {question}.
+REWRITE_PROMPT_MULTI = """ \n
+    Please create a list of 2-3 sample documents that could answer an original question. Each document
+    should be about as long as the original question. \n
+    Here is the initial question:
+    \n ------- \n
+    {question}
+    \n ------- \n
+
+    Formulate the sample documents separated by '--' (Do not say 'Document 1: ...', just write the text): """
+
+BASE_RAG_PROMPT = """ \n
+    You are an assistant for question-answering tasks. Use the context provided below - and only the
+    provided context - to answer the question. If you don't know the answer or if the provided context is
+    empty, just say "I don't know". Do not use your internal knowledge!
+
+    Again, only use the provided context and do not use your internal knowledge! If you cannot answer the
+    question based on the context, say "I don't know". It is a matter of life and death that you do NOT
+    use your internal knowledge, just the provided information!
+
+    Use three sentences maximum and keep the answer concise.
+    answer concise.\nQuestion:\n {question} \nContext:\n {context} \n\n
+    \n\n
+    Answer:"""
+
+# Prompt asking for a 'yes'/'no' verdict on whether the proposed answer fully
+# addresses a simple, factual question. Placeholders: {question}, {base_answer}.
+BASE_CHECK_PROMPT = """ \n
+    Please check whether 1) the suggested answer seems to fully address the original question AND 2)the
+    original question requests a simple, factual answer, and there are no ambiguities, judgements,
+    aggregations, or any other complications that may require extra context. (I.e., if the question is
+    somewhat addressed, but the answer would benefit from more context, then answer with 'no'.)
+
+    Please only answer with 'yes' or 'no' \n
+    Here is the initial question:
+    \n ------- \n
+    {question}
+    \n ------- \n
+    Here is the proposed answer:
+    \n ------- \n
+    {base_answer}
+    \n ------- \n
+    Please answer with yes or no:"""
+
+# Prompt asking for a 'yes'/'no' verdict on whether a document is relevant to
+# the question. Placeholders: {question}, {document_content}.
+VERIFIER_PROMPT = """ \n
+    Please check whether the document seems to be relevant for the answer of the original question. Please
+    only answer with 'yes' or 'no' \n
+    Here is the initial question:
+    \n ------- \n
+    {question}
+    \n ------- \n
+    Here is the document text:
+    \n ------- \n
+    {document_content}
+    \n ------- \n
+    Please answer with yes or no:"""
+
+INITIAL_DECOMPOSITION_PROMPT_BASIC = """ \n
+    Please decompose an initial user question into not more than 4 appropriate sub-questions that help to
+    answer the original question. The purpose for this decomposition is to isolate individulal entities
+    (i.e., 'compare sales of company A and company B' -> 'what are sales for company A' + 'what are sales
+    for company B'), split ambiguous terms (i.e., 'what is our success with company A' -> 'what are our
+    sales with company A' + 'what is our market share with company A' + 'is company A a reference customer
+    for us'), etc. Each sub-question should be realistically be answerable by a good RAG system. \n
+
+    Here is the initial question:
+    \n ------- \n
+    {question}
+    \n ------- \n
+
+    Please formulate your answer as a list of subquestions:
+
+    Answer:
+    """
+
+REWRITE_PROMPT_SINGLE = """ \n
+    Please convert an initial user question into a more appropriate search query for retrievel from a
+    document store. \n
+    Here is the initial question:
+    \n ------- \n
+    {question}
+    \n ------- \n
+
+    Formulate the query: """
+
+# RAG prompt variant that also points the model at the sub-questions and their
+# answers contained in the context. Placeholders: {question}, {combined_context}.
+MODIFIED_RAG_PROMPT = """You are an assistant for question-answering tasks. Use the context provided below
+    - and only this context - to answer the question. If you don't know the answer, just say "I don't know".
+    Use three sentences maximum and keep the answer concise.
+    Pay also particular attention to the sub-questions and their answers, at least it may enrich the answer.
+    Again, only use the provided context and do not use your internal knowledge! If you cannot answer the
+    question based on the context, say "I don't know". It is a matter of life and death that you do NOT
+    use your internal knowledge, just the provided information!
+
+    \nQuestion: {question}
+    \nContext: {combined_context} \n
+
+    Answer:"""
+
+ORIG_DEEP_DECOMPOSE_PROMPT = """ \n
+    An initial user question needs to be answered. An initial answer has been provided but it wasn't quite
+    good enough. Also, some sub-questions had been answered and this information has been used to provide
+    the initial answer. Some other subquestions may have been suggested based on little knowledge, but they
+    were not directly answerable. Also, some entities, relationships and terms are givenm to you so that
+    you have an idea of how the avaiolable data looks like.
+
+    Your role is to generate 3-5 new sub-questions that would help to answer the initial question,
+    considering:
+
+    1) The initial question
+    2) The initial answer that was found to be unsatisfactory
+    3) The sub-questions that were answered
+    4) The sub-questions that were suggested but not answered
+    5) The entities, relationships and terms that were extracted from the context
+
+    The individual questions should be answerable by a good RAG system.
+    So a good idea would be to use the sub-questions to resolve ambiguities and/or to separate the
+    question for different entities that may be involved in the original question, but in a way that does
+    not duplicate questions that were already tried.
+
+    Additional Guidelines:
+    - The sub-questions should be specific to the question and provide richer context for the question,
+    resolve ambiguities, or address shortcoming of the initial answer
+    - Each sub-question - when answered - should be relevant for the answer to the original question
+    - The sub-questions should be free from comparisions, ambiguities,judgements, aggregations, or any
+    other complications that may require extra context.
+    - The sub-questions MUST have the full context of the original question so that it can be executed by
+    a RAG system independently without the original question available
+      (Example:
+        - initial question: "What is the capital of France?"
+        - bad sub-question: "What is the name of the river there?"
+        - good sub-question: "What is the name of the river that flows through Paris?"
+    - For each sub-question, please provide a short explanation for why it is a good sub-question. So
+    generate a list of dictionaries with the following format:
+      [{{"sub_question": <sub-question>, "explanation": <explanation>, "search_term": <rewrite the
+      sub-question using as a search phrase for the document store>}}, ...]
+
+    \n\n
+    Here is the initial question:
+    \n ------- \n
+    {question}
+    \n ------- \n
+
+    Here is the initial sub-optimal answer:
+    \n ------- \n
+    {base_answer}
+    \n ------- \n
+
+    Here are the sub-questions that were answered:
+    \n ------- \n
+    {answered_sub_questions}
+    \n ------- \n
+
+    Here are the sub-questions that were suggested but not answered:
+    \n ------- \n
+    {failed_sub_questions}
+    \n ------- \n
+
+    And here are the entities, relationships and terms extracted from the context:
+    \n ------- \n
+    {entity_term_extraction_str}
+    \n ------- \n
+
+   Please generate the list of good, fully contextualized sub-questions that would help to address the
+   main question. Again, please find questions that are NOT overlapping too much with the already answered
+   sub-questions or those that already were suggested and failed.
+   In other words - what can we try in addition to what has been tried so far?
+
+   Please think through it step by step and then generate the list of json dictionaries with the following
+   format:
+
+   {{"sub_questions": [{{"sub_question": <sub-question>,
+        "explanation": <explanation>,
+        "search_term": <rewrite the sub-question using as a search phrase for the document store>}},
+        ...]}} """
+
+DEEP_DECOMPOSE_PROMPT = """ \n
+    An initial user question needs to be answered. An initial answer has been provided but it wasn't quite
+    good enough. Also, some sub-questions had been answered and this information has been used to provide
+    the initial answer. Some other subquestions may have been suggested based on little knowledge, but they
+    were not directly answerable. Also, some entities, relationships and terms are givenm to you so that
+    you have an idea of how the avaiolable data looks like.
+
+    Your role is to generate 4-6 new sub-questions that would help to answer the initial question,
+    considering:
+
+    1) The initial question
+    2) The initial answer that was found to be unsatisfactory
+    3) The sub-questions that were answered
+    4) The sub-questions that were suggested but not answered
+    5) The entities, relationships and terms that were extracted from the context
+
+    The individual questions should be answerable by a good RAG system.
+    So a good idea would be to use the sub-questions to resolve ambiguities and/or to separate the
+    question for different entities that may be involved in the original question, but in a way that does
+    not duplicate questions that were already tried.
+
+    Additional Guidelines:
+    - The sub-questions should be specific to the question and provide richer context for the question,
+    resolve ambiguities, or address shortcoming of the initial answer
+    - Each sub-question - when answered - should be relevant for the answer to the original question
+    - The sub-questions should be free from comparisions, ambiguities,judgements, aggregations, or any
+    other complications that may require extra context.
+    - The sub-questions MUST have the full context of the original question so that it can be executed by
+    a RAG system independently without the original question available
+      (Example:
+        - initial question: "What is the capital of France?"
+        - bad sub-question: "What is the name of the river there?"
+        - good sub-question: "What is the name of the river that flows through Paris?"
+    - For each sub-question, please also provide a search term that can be used to retrieve relevant
+    documents from a document store.
+    \n\n
+    Here is the initial question:
+    \n ------- \n
+    {question}
+    \n ------- \n
+
+    Here is the initial sub-optimal answer:
+    \n ------- \n
+    {base_answer}
+    \n ------- \n
+
+    Here are the sub-questions that were answered:
+    \n ------- \n
+    {answered_sub_questions}
+    \n ------- \n
+
+    Here are the sub-questions that were suggested but not answered:
+    \n ------- \n
+    {failed_sub_questions}
+    \n ------- \n
+
+    And here are the entities, relationships and terms extracted from the context:
+    \n ------- \n
+    {entity_term_extraction_str}
+    \n ------- \n
+
+   Please generate the list of good, fully contextualized sub-questions that would help to address the
+   main question. Again, please find questions that are NOT overlapping too much with the already answered
+   sub-questions or those that already were suggested and failed.
+   In other words - what can we try in addition to what has been tried so far?
+
+   Generate the list of json dictionaries with the following format:
+
+   {{"sub_questions": [{{"sub_question": <sub-question>,
+        "search_term": <rewrite the sub-question using as a search phrase for the document store>}},
+        ...]}} """
+
+DECOMPOSE_PROMPT = """ \n
+    For an initial user question, please generate at 5-10 individual sub-questions whose answers would help
+    \n to answer the initial question. The individual questions should be answerable by a good RAG system.
+    So a good idea would be to \n use the sub-questions to resolve ambiguities and/or to separate the
+    question for different entities that may be involved in the original question.
+
+    In order to arrive at meaningful sub-questions, please also consider the context retrieved from the
+    document store, expressed as entities, relationships and terms. You can also think about the types
+    mentioned in brackets
+
+    Guidelines:
+    - The sub-questions should be specific to the question and provide richer context for the question,
+    and or resolve ambiguities
+    - Each sub-question - when answered - should be relevant for the answer to the original question
+    - The sub-questions should be free from comparisions, ambiguities,judgements, aggregations, or any
+    other complications that may require extra context.
+    - The sub-questions MUST have the full context of the original question so that it can be executed by
+    a RAG system independently without the original question available
+      (Example:
+        - initial question: "What is the capital of France?"
+        - bad sub-question: "What is the name of the river there?"
+        - good sub-question: "What is the name of the river that flows through Paris?"
+    - For each sub-question, please provide a short explanation for why it is a good sub-question. So
+    generate a list of dictionaries with the following format:
+      [{{"sub_question": <sub-question>, "explanation": <explanation>, "search_term": <rewrite the
+      sub-question using as a search phrase for the document store>}}, ...]
+
+    \n\n
+    Here is the initial question:
+    \n ------- \n
+    {question}
+    \n ------- \n
+
+    And here are the entities, relationships and terms extracted from the context:
+    \n ------- \n
+    {entity_term_extraction_str}
+    \n ------- \n
+
+   Please generate the list of good, fully contextualized sub-questions that would help to address the
+   main question. Don't be too specific unless the original question is specific.
+   Please think through it step by step and then generate the list of json dictionaries with the following
+   format:
+   {{"sub_questions": [{{"sub_question": <sub-question>,
+        "explanation": <explanation>,
+        "search_term": <rewrite the sub-question using as a search phrase for the document store>}},
+        ...]}} """
+
+#### Consolidations
+COMBINED_CONTEXT = """-------
+    Below you will find useful information to answer the original question. First, you see a number of
+    sub-questions with their answers. This information should be considered to be more focussed and
+    somewhat more specific to the original question as it tries to contextualized facts.
+    After that will see the documents that were considered to be relevant to answer the original question.
+
+    Here are the sub-questions and their answers:
+    \n\n {deep_answer_context} \n\n
+    \n\n Here are the documents that were considered to be relevant to answer the original question:
+    \n\n {formated_docs} \n\n
+    ----------------
+    """
+
+# Prompt that ranks the motivations of generated sub-questions by relevance
+# and asks for a JSON object in return.
+# Placeholders: {original_question}, {sub_question_explanations}.
+# NOTE(review): the requested JSON key "reasonning" is misspelled; it is left
+# unchanged because code reading the model output may depend on the exact key.
+SUB_QUESTION_EXPLANATION_RANKER_PROMPT = """-------
+    Below you will find a question that we ultimately want to answer (the original question) and a list of
+    motivations in arbitrary order for generated sub-questions that are supposed to help us answering the
+    original question. The motivations are formatted as <motivation number>:  <motivation explanation>.
+    (Again, the numbering is arbitrary and does not necessarily mean that 1 is the most relevant
+    motivation and 2 is less relevant.)
+
+    Please rank the motivations in order of relevance for answering the original question. Also, try to
+    ensure that the top questions do not duplicate too much, i.e. that they are not too similar.
+    Ultimately, create a list with the motivation numbers where the number of the most relevant
+    motivations comes first.
+
+    Here is the original question:
+    \n\n {original_question} \n\n
+    \n\n Here is the list of sub-question motivations:
+    \n\n {sub_question_explanations} \n\n
+    ----------------
+
+    Please think step by step and then generate the ranked list of motivations.
+
+    Please format your answer as a json object in the following format:
+    {{"reasonning": <explain your reasoning for the ranking>,
+      "ranked_motivations": <ranked list of motivation numbers>}}
+    """
--- a/backend/danswer/agent_search/shared_graph_utils/utils.py
+++ b/backend/danswer/agent_search/shared_graph_utils/utils.py
@@ -0,0 +1,91 @@
+import ast
+import json
+import re
+from collections.abc import Sequence
+from datetime import datetime
+from datetime import timedelta
+from typing import Any
+
+from danswer.context.search.models import InferenceSection
+
+
+def normalize_whitespace(text: str) -> str:
+    """Normalize whitespace in text to single spaces and strip leading/trailing whitespace."""
+    import re
+
+    return re.sub(r"\s+", " ", text.strip())
+
+
+# Post-processing
+def format_docs(docs: Sequence[InferenceSection]) -> str:
+    return "\n\n".join(doc.combined_content for doc in docs)
+
+
+def clean_and_parse_list_string(json_string: str) -> list[dict]:
+    # Remove markdown code block markers and any newline prefixes
+    cleaned_string = re.sub(r"```json\n|\n```", "", json_string)
+    cleaned_string = cleaned_string.replace("\\n", " ").replace("\n", " ")
+    cleaned_string = " ".join(cleaned_string.split())
+    # Parse the cleaned string into a Python dictionary
+    return ast.literal_eval(cleaned_string)
+
+
+def clean_and_parse_json_string(json_string: str) -> dict[str, Any]:
+    # Remove markdown code block markers and any newline prefixes
+    cleaned_string = re.sub(r"```json\n|\n```", "", json_string)
+    cleaned_string = cleaned_string.replace("\\n", " ").replace("\n", " ")
+    cleaned_string = " ".join(cleaned_string.split())
+    # Parse the cleaned string into a Python dictionary
+    return json.loads(cleaned_string)
+
+
+def format_entity_term_extraction(entity_term_extraction_dict: dict[str, Any]) -> str:
+    entities = entity_term_extraction_dict["entities"]
+    terms = entity_term_extraction_dict["terms"]
+    relationships = entity_term_extraction_dict["relationships"]
+
+    entity_strs = ["\nEntities:\n"]
+    for entity in entities:
+        entity_str = f"{entity['entity_name']} ({entity['entity_type']})"
+        entity_strs.append(entity_str)
+
+    entity_str = "\n - ".join(entity_strs)
+
+    relationship_strs = ["\n\nRelationships:\n"]
+    for relationship in relationships:
+        relationship_str = f"{relationship['name']} ({relationship['type']}): {relationship['entities']}"
+        relationship_strs.append(relationship_str)
+
+    relationship_str = "\n - ".join(relationship_strs)
+
+    term_strs = ["\n\nTerms:\n"]
+    for term in terms:
+        term_str = f"{term['term_name']} ({term['term_type']}): similar to {term['similar_to']}"
+        term_strs.append(term_str)
+
+    term_str = "\n - ".join(term_strs)
+
+    return "\n".join(entity_strs + relationship_strs + term_strs)
+
+
+def _format_time_delta(time: timedelta) -> str:
+    seconds_from_start = f"{((time).seconds):03d}"
+    microseconds_from_start = f"{((time).microseconds):06d}"
+    return f"{seconds_from_start}.{microseconds_from_start}"
+
+
+def generate_log_message(
+    message: str,
+    node_start_time: datetime,
+    graph_start_time: datetime | None = None,
+) -> str:
+    current_time = datetime.now()
+
+    if graph_start_time is not None:
+        graph_time_str = _format_time_delta(current_time - graph_start_time)
+    else:
+        graph_time_str = "N/A"
+
+    node_time_str = _format_time_delta(current_time - node_start_time)
+
+    return f"{graph_time_str} ({node_time_str} s): {message}"
--- a/backend/danswer/auth/api_key.py
+++ b/backend/danswer/auth/api_key.py
@@ -0,0 +1,89 @@
+import secrets
+import uuid
+from urllib.parse import quote
+from urllib.parse import unquote
+
+from fastapi import Request
+from passlib.hash import sha256_crypt
+from pydantic import BaseModel
+
+from danswer.auth.schemas import UserRole
+from danswer.configs.app_configs import API_KEY_HASH_ROUNDS
+
+
+# Standard header that may carry the API key; read only if the alternative
+# header below is absent or empty.
+_API_KEY_HEADER_NAME = "Authorization"
+# NOTE for others who are curious: In the context of a header, "X-" often refers
+# to non-standard, experimental, or custom headers in HTTP or other protocols. It
+# indicates that the header is not part of the official standards defined by
+# organizations like the Internet Engineering Task Force (IETF).
+_API_KEY_HEADER_ALTERNATIVE_NAME = "X-Danswer-Authorization"
+# Scheme prefix that is stripped from the header value to obtain the key.
+_BEARER_PREFIX = "Bearer "
+# Literal prefix that every generated API key starts with.
+_API_KEY_PREFIX = "dn_"
+# Passed to secrets.token_urlsafe() as the number of random BYTES, so the
+# generated text is longer than this value (about 1.3 characters per byte).
+_API_KEY_LEN = 192
+
+
+class ApiKeyDescriptor(BaseModel):
+    """Pydantic model describing an API key and the user it belongs to."""
+
+    api_key_id: int
+    # Display form of the key (see build_displayable_api_key for a masked form)
+    api_key_display: str
+    api_key: str | None = None  # only present on initial creation
+    api_key_name: str | None = None
+    api_key_role: UserRole
+
+    user_id: uuid.UUID
+
+
+def generate_api_key(tenant_id: str | None = None) -> str:
+    # For backwards compatibility, if no tenant_id, generate old style key
+    if not tenant_id:
+        return _API_KEY_PREFIX + secrets.token_urlsafe(_API_KEY_LEN)
+
+    encoded_tenant = quote(tenant_id)  # URL encode the tenant ID
+    return f"{_API_KEY_PREFIX}{encoded_tenant}.{secrets.token_urlsafe(_API_KEY_LEN)}"
+
+
+def extract_tenant_from_api_key_header(request: Request) -> str | None:
+    """Extract tenant ID from request. Returns None if auth is disabled or invalid format."""
+    raw_api_key_header = request.headers.get(
+        _API_KEY_HEADER_ALTERNATIVE_NAME
+    ) or request.headers.get(_API_KEY_HEADER_NAME)
+
+    if not raw_api_key_header or not raw_api_key_header.startswith(_BEARER_PREFIX):
+        return None
+
+    api_key = raw_api_key_header[len(_BEARER_PREFIX) :].strip()
+
+    if not api_key.startswith(_API_KEY_PREFIX):
+        return None
+
+    parts = api_key[len(_API_KEY_PREFIX) :].split(".", 1)
+    if len(parts) != 2:
+        return None
+
+    tenant_id = parts[0]
+    return unquote(tenant_id) if tenant_id else None
+
+
+def hash_api_key(api_key: str) -> str:
+    """Hash an API key with passlib's sha256_crypt.
+
+    The salt is fixed to the empty string and the number of rounds comes from
+    API_KEY_HASH_ROUNDS.
+    NOTE(review): presumably the fixed salt keeps the hash reproducible so it
+    can be matched against a stored value — confirm against the lookup code.
+    """
+    # NOTE: no salt is needed, as the API key is randomly generated
+    # and overlaps are impossible
+    return sha256_crypt.hash(api_key, salt="", rounds=API_KEY_HASH_ROUNDS)
+
+
+def build_displayable_api_key(api_key: str) -> str:
+    if api_key.startswith(_API_KEY_PREFIX):
+        api_key = api_key[len(_API_KEY_PREFIX) :]
+
+    return _API_KEY_PREFIX + api_key[:4] + "********" + api_key[-4:]
+
+
+def get_hashed_api_key_from_request(request: Request) -> str | None:
+    raw_api_key_header = request.headers.get(
+        _API_KEY_HEADER_ALTERNATIVE_NAME
+    ) or request.headers.get(_API_KEY_HEADER_NAME)
+    if raw_api_key_header is None:
+        return None
+
+    if raw_api_key_header.startswith(_BEARER_PREFIX):
+        raw_api_key_header = raw_api_key_header[len(_BEARER_PREFIX) :].strip()
+
+    return hash_api_key(raw_api_key_header)
--- a/backend/danswer/auth/invited_users.py
+++ b/backend/danswer/auth/invited_users.py
@@ -2,8 +2,8 @@ from typing import cast

 from danswer.configs.constants import KV_USER_STORE_KEY
 from danswer.key_value_store.factory import get_kv_store
-from danswer.key_value_store.interface import JSON_ro
 from danswer.key_value_store.interface import KvKeyNotFoundError
+from danswer.utils.special_types import JSON_ro


 def get_invited_users() -> list[str]:
--- a/backend/danswer/auth/noauth_user.py
+++ b/backend/danswer/auth/noauth_user.py
@@ -23,7 +23,9 @@ def load_no_auth_user_preferences(store: KeyValueStore) -> UserPreferences:
        )
        return UserPreferences(**preferences_data)
    except KvKeyNotFoundError:
-        return UserPreferences(chosen_assistants=None, default_model=None)
+        return UserPreferences(
+            chosen_assistants=None, default_model=None, auto_scroll=True
+        )


 def fetch_no_auth_user(store: KeyValueStore) -> UserInfo:
--- a/backend/danswer/auth/schemas.py
+++ b/backend/danswer/auth/schemas.py
@@ -13,12 +13,24 @@ class UserRole(str, Enum):
        groups they are curators of
    - Global Curator can perform admin actions
        for all groups they are a member of
+    - Limited can access a limited set of basic api endpoints
+    - Slack users have used danswer via Slack but don't have a web login
+    - External-permissioned users have been picked up during the external permissions sync process but don't have a web login
    """

+    LIMITED = "limited"
    BASIC = "basic"
    ADMIN = "admin"
    CURATOR = "curator"
    GLOBAL_CURATOR = "global_curator"
+    SLACK_USER = "slack_user"
+    EXT_PERM_USER = "ext_perm_user"
+
+    def is_web_login(self) -> bool:
+        return self not in [
+            UserRole.SLACK_USER,
+            UserRole.EXT_PERM_USER,
+        ]


 class UserStatus(str, Enum):
@@ -33,10 +45,8 @@ class UserRead(schemas.BaseUser[uuid.UUID]):

 class UserCreate(schemas.BaseUserCreate):
    role: UserRole = UserRole.BASIC
-    has_web_login: bool | None = True
    tenant_id: str | None = None


 class UserUpdate(schemas.BaseUserUpdate):
    role: UserRole
-    has_web_login: bool | None = True
--- a/backend/danswer/auth/users.py
+++ b/backend/danswer/auth/users.py
@@ -48,11 +48,10 @@ from httpx_oauth.integrations.fastapi import OAuth2AuthorizeCallback
 from httpx_oauth.oauth2 import BaseOAuth2
 from httpx_oauth.oauth2 import OAuth2Token
 from pydantic import BaseModel
-from sqlalchemy import select
 from sqlalchemy import text
-from sqlalchemy.orm import attributes
-from sqlalchemy.orm import Session
+from sqlalchemy.ext.asyncio import AsyncSession

+from danswer.auth.api_key import get_hashed_api_key_from_request
 from danswer.auth.invited_users import get_invited_users
 from danswer.auth.schemas import UserCreate
 from danswer.auth.schemas import UserRole
@@ -75,28 +74,28 @@ from danswer.configs.constants import AuthType
 from danswer.configs.constants import DANSWER_API_KEY_DUMMY_EMAIL_DOMAIN
 from danswer.configs.constants import DANSWER_API_KEY_PREFIX
 from danswer.configs.constants import UNNAMED_KEY_PLACEHOLDER
+from danswer.db.api_key import fetch_user_for_api_key
 from danswer.db.auth import get_access_token_db
 from danswer.db.auth import get_default_admin_user_emails
 from danswer.db.auth import get_user_count
 from danswer.db.auth import get_user_db
 from danswer.db.auth import SQLAlchemyUserAdminDB
+from danswer.db.engine import get_async_session
 from danswer.db.engine import get_async_session_with_tenant
-from danswer.db.engine import get_session
 from danswer.db.engine import get_session_with_tenant
-from danswer.db.engine import get_sqlalchemy_engine
 from danswer.db.models import AccessToken
 from danswer.db.models import OAuthAccount
 from danswer.db.models import User
-from danswer.db.models import UserTenantMapping
 from danswer.db.users import get_user_by_email
+from danswer.server.utils import BasicAuthenticationError
 from danswer.utils.logger import setup_logger
 from danswer.utils.telemetry import optional_telemetry
 from danswer.utils.telemetry import RecordType
+from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop
 from danswer.utils.variable_functionality import fetch_versioned_implementation
-from shared_configs.configs import CURRENT_TENANT_ID_CONTEXTVAR
+from shared_configs.configs import async_return_default_schema
 from shared_configs.configs import MULTI_TENANT
-from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
-
+from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR

 logger = setup_logger()

@@ -190,20 +189,6 @@ def verify_email_domain(email: str) -> None:
            )


-def get_tenant_id_for_email(email: str) -> str:
-    if not MULTI_TENANT:
-        return POSTGRES_DEFAULT_SCHEMA
-    # Implement logic to get tenant_id from the mapping table
-    with Session(get_sqlalchemy_engine()) as db_session:
-        result = db_session.execute(
-            select(UserTenantMapping.tenant_id).where(UserTenantMapping.email == email)
-        )
-        tenant_id = result.scalar_one_or_none()
-    if tenant_id is None:
-        raise exceptions.UserNotExists()
-    return tenant_id
-
-
 def send_user_verification_email(
    user_email: str,
    token: str,
@@ -232,25 +217,26 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
    reset_password_token_secret = USER_AUTH_SECRET
    verification_token_secret = USER_AUTH_SECRET

+    user_db: SQLAlchemyUserDatabase[User, uuid.UUID]
+
    async def create(
        self,
        user_create: schemas.UC | UserCreate,
        safe: bool = False,
        request: Optional[Request] = None,
    ) -> User:
-        try:
-            tenant_id = (
-                get_tenant_id_for_email(user_create.email)
-                if MULTI_TENANT
-                else POSTGRES_DEFAULT_SCHEMA
-            )
-        except exceptions.UserNotExists:
-            raise HTTPException(status_code=401, detail="User not found")
+        referral_source = None
+        if request is not None:
+            referral_source = request.cookies.get("referral_source", None)

-        if not tenant_id:
-            raise HTTPException(
-                status_code=401, detail="User does not belong to an organization"
-            )
+        tenant_id = await fetch_ee_implementation_or_noop(
+            "danswer.server.tenants.provisioning",
+            "get_or_create_tenant_id",
+            async_return_default_schema,
+        )(
+            email=user_create.email,
+            referral_source=referral_source,
+        )

        async with get_async_session_with_tenant(tenant_id) as db_session:
            token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
@@ -258,7 +244,9 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
            verify_email_is_invited(user_create.email)
            verify_email_domain(user_create.email)
            if MULTI_TENANT:
-                tenant_user_db = SQLAlchemyUserAdminDB(db_session, User, OAuthAccount)
+                tenant_user_db = SQLAlchemyUserAdminDB[User, uuid.UUID](
+                    db_session, User, OAuthAccount
+                )
                self.user_db = tenant_user_db
                self.database = tenant_user_db

@@ -271,20 +259,15 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                    user_create.role = UserRole.ADMIN
                else:
                    user_create.role = UserRole.BASIC
-            user = None
+
            try:
                user = await super().create(user_create, safe=safe, request=request)  # type: ignore
            except exceptions.UserAlreadyExists:
                user = await self.get_by_email(user_create.email)
                # Handle case where user has used product outside of web and is now creating an account through web
-                if (
-                    not user.has_web_login
-                    and hasattr(user_create, "has_web_login")
-                    and user_create.has_web_login
-                ):
+                if not user.role.is_web_login() and user_create.role.is_web_login():
                    user_update = UserUpdate(
                        password=user_create.password,
-                        has_web_login=True,
                        role=user_create.role,
                        is_verified=user_create.is_verified,
                    )
@@ -292,11 +275,13 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                else:
                    raise exceptions.UserAlreadyExists()

-            CURRENT_TENANT_ID_CONTEXTVAR.reset(token)
+            finally:
+                CURRENT_TENANT_ID_CONTEXTVAR.reset(token)
+
            return user

    async def oauth_callback(
-        self: "BaseUserManager[models.UOAP, models.ID]",
+        self,
        oauth_name: str,
        access_token: str,
        account_id: str,
@@ -307,20 +292,24 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
        *,
        associate_by_email: bool = False,
        is_verified_by_default: bool = False,
-    ) -> models.UOAP:
-        # Get tenant_id from mapping table
-        try:
-            tenant_id = (
-                get_tenant_id_for_email(account_email)
-                if MULTI_TENANT
-                else POSTGRES_DEFAULT_SCHEMA
-            )
-        except exceptions.UserNotExists:
-            raise HTTPException(status_code=401, detail="User not found")
+    ) -> User:
+        referral_source = None
+        if request:
+            referral_source = getattr(request.state, "referral_source", None)
+
+        tenant_id = await fetch_ee_implementation_or_noop(
+            "danswer.server.tenants.provisioning",
+            "get_or_create_tenant_id",
+            async_return_default_schema,
+        )(
+            email=account_email,
+            referral_source=referral_source,
+        )

        if not tenant_id:
            raise HTTPException(status_code=401, detail="User not found")

+        # Proceed with the tenant context
        token = None
        async with get_async_session_with_tenant(tenant_id) as db_session:
            token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
@@ -329,9 +318,11 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
            verify_email_domain(account_email)

            if MULTI_TENANT:
-                tenant_user_db = SQLAlchemyUserAdminDB(db_session, User, OAuthAccount)
+                tenant_user_db = SQLAlchemyUserAdminDB[User, uuid.UUID](
+                    db_session, User, OAuthAccount
+                )
                self.user_db = tenant_user_db
-                self.database = tenant_user_db  # type: ignore
+                self.database = tenant_user_db

            oauth_account_dict = {
                "oauth_name": oauth_name,
@@ -371,9 +362,9 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                    # Explicitly set the Postgres schema for this session to ensure
                    # OAuth account creation happens in the correct tenant schema
                    await db_session.execute(text(f'SET search_path = "{tenant_id}"'))
-                    user = await self.user_db.add_oauth_account(
-                        user, oauth_account_dict
-                    )
+
+                    # Add OAuth account
+                    await self.user_db.add_oauth_account(user, oauth_account_dict)
                    await self.on_after_register(user, request)

            else:
@@ -383,7 +374,11 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                        and existing_oauth_account.oauth_name == oauth_name
                    ):
                        user = await self.user_db.update_oauth_account(
-                            user, existing_oauth_account, oauth_account_dict
+                            user,
+                            # NOTE: OAuthAccount DOES implement the OAuthAccountProtocol
+                            # but the type checker doesn't know that :(
+                            existing_oauth_account,  # type: ignore
+                            oauth_account_dict,
                        )

            # NOTE: Most IdPs have very short expiry times, and we don't want to force the user to
@@ -396,16 +391,15 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                )

            # Handle case where user has used product outside of web and is now creating an account through web
-            if not user.has_web_login:  # type: ignore
+            if not user.role.is_web_login():
                await self.user_db.update(
                    user,
                    {
                        "is_verified": is_verified_by_default,
-                        "has_web_login": True,
+                        "role": UserRole.BASIC,
                    },
                )
                user.is_verified = is_verified_by_default
-                user.has_web_login = True  # type: ignore

            # this is needed if an organization goes from `TRACK_EXTERNAL_IDP_EXPIRY=true` to `false`
            # otherwise, the oidc expiry will always be old, and the user will never be able to login
@@ -453,7 +447,13 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
        email = credentials.username

        # Get tenant_id from mapping table
-        tenant_id = get_tenant_id_for_email(email)
+        tenant_id = await fetch_ee_implementation_or_noop(
+            "danswer.server.tenants.provisioning",
+            "get_or_create_tenant_id",
+            async_return_default_schema,
+        )(
+            email=email,
+        )
        if not tenant_id:
            # User not found in mapping
            self.password_helper.hash(credentials.password)
@@ -474,11 +474,8 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                self.password_helper.hash(credentials.password)
                return None

-            has_web_login = attributes.get_attribute(user, "has_web_login")
-
-            if not has_web_login:
-                raise HTTPException(
-                    status_code=status.HTTP_403_FORBIDDEN,
+            if not user.role.is_web_login():
+                raise BasicAuthenticationError(
                    detail="NO_WEB_LOGIN_AND_HAS_NO_PASSWORD",
                )

@@ -510,19 +507,30 @@ cookie_transport = CookieTransport(

 # This strategy is used to add tenant_id to the JWT token
 class TenantAwareJWTStrategy(JWTStrategy):
-    async def write_token(self, user: User) -> str:
-        tenant_id = get_tenant_id_for_email(user.email)
+    async def _create_token_data(self, user: User, impersonate: bool = False) -> dict:
+        tenant_id = await fetch_ee_implementation_or_noop(
+            "danswer.server.tenants.provisioning",
+            "get_or_create_tenant_id",
+            async_return_default_schema,
+        )(
+            email=user.email,
+        )
+
        data = {
            "sub": str(user.id),
            "aud": self.token_audience,
            "tenant_id": tenant_id,
        }
+        return data
+
+    async def write_token(self, user: User) -> str:
+        data = await self._create_token_data(user)
        return generate_jwt(
            data, self.encode_key, self.lifetime_seconds, algorithm=self.algorithm
        )


-def get_jwt_strategy() -> JWTStrategy:
+def get_jwt_strategy() -> TenantAwareJWTStrategy:
    return TenantAwareJWTStrategy(
        secret=USER_AUTH_SECRET,
        lifetime_seconds=SESSION_EXPIRE_TIME_SECONDS,
@@ -597,7 +605,7 @@ optional_fastapi_current_user = fastapi_users.current_user(active=True, optional
 async def optional_user_(
    request: Request,
    user: User | None,
-    db_session: Session,
+    async_db_session: AsyncSession,
 ) -> User | None:
    """NOTE: `request` and `db_session` are not used here, but are included
    for the EE version of this function."""
@@ -606,13 +614,21 @@ async def optional_user_(

 async def optional_user(
    request: Request,
-    db_session: Session = Depends(get_session),
+    async_db_session: AsyncSession = Depends(get_async_session),
    user: User | None = Depends(optional_fastapi_current_user),
 ) -> User | None:
    versioned_fetch_user = fetch_versioned_implementation(
        "danswer.auth.users", "optional_user_"
    )
-    return await versioned_fetch_user(request, user, db_session)
+    user = await versioned_fetch_user(request, user, async_db_session)
+
+    # check if an API key is present
+    if user is None:
+        hashed_api_key = get_hashed_api_key_from_request(request)
+        if hashed_api_key:
+            user = await fetch_user_for_api_key(hashed_api_key, async_db_session)
+
+    return user


 async def double_check_user(
@@ -624,14 +640,12 @@ async def double_check_user(
        return None

    if user is None:
-        raise HTTPException(
-            status_code=status.HTTP_403_FORBIDDEN,
+        raise BasicAuthenticationError(
            detail="Access denied. User is not authenticated.",
        )

    if user_needs_to_be_verified() and not user.is_verified:
-        raise HTTPException(
-            status_code=status.HTTP_403_FORBIDDEN,
+        raise BasicAuthenticationError(
            detail="Access denied. User is not verified.",
        )

@@ -640,8 +654,7 @@ async def double_check_user(
        and user.oidc_expiry < datetime.now(timezone.utc)
        and not include_expired
    ):
-        raise HTTPException(
-            status_code=status.HTTP_403_FORBIDDEN,
+        raise BasicAuthenticationError(
            detail="Access denied. User's OIDC token has expired.",
        )

@@ -654,12 +667,26 @@ async def current_user_with_expired_token(
    return await double_check_user(user, include_expired=True)


-async def current_user(
+async def current_limited_user(
    user: User | None = Depends(optional_user),
 ) -> User | None:
    return await double_check_user(user)


+async def current_user(
+    user: User | None = Depends(optional_user),
+) -> User | None:
+    user = await double_check_user(user)
+    if not user:
+        return None
+    # LIMITED users pass double_check_user but are rejected here; current_limited_user admits them
+    if user.role == UserRole.LIMITED:
+        raise BasicAuthenticationError(
+            detail="Access denied. User role is LIMITED. BASIC or higher permissions are required.",
+        )
+    return user
+
+
 async def current_curator_or_admin_user(
    user: User | None = Depends(current_user),
 ) -> User | None:
@@ -667,15 +694,13 @@ async def current_curator_or_admin_user(
        return None

    if not user or not hasattr(user, "role"):
-        raise HTTPException(
-            status_code=status.HTTP_403_FORBIDDEN,
+        raise BasicAuthenticationError(
            detail="Access denied. User is not authenticated or lacks role information.",
        )

    allowed_roles = {UserRole.GLOBAL_CURATOR, UserRole.CURATOR, UserRole.ADMIN}
    if user.role not in allowed_roles:
-        raise HTTPException(
-            status_code=status.HTTP_403_FORBIDDEN,
+        raise BasicAuthenticationError(
            detail="Access denied. User is not a curator or admin.",
        )

@@ -687,8 +712,7 @@ async def current_admin_user(user: User | None = Depends(current_user)) -> User
        return None

    if not user or not hasattr(user, "role") or user.role != UserRole.ADMIN:
-        raise HTTPException(
-            status_code=status.HTTP_403_FORBIDDEN,
+        raise BasicAuthenticationError(
            detail="Access denied. User must be an admin to perform this action.",
        )

@@ -716,8 +740,6 @@ def generate_state_token(


 # refer to https://github.com/fastapi-users/fastapi-users/blob/42ddc241b965475390e2bce887b084152ae1a2cd/fastapi_users/fastapi_users.py#L91
-
-
 def create_danswer_oauth_router(
    oauth_client: BaseOAuth2,
    backend: AuthenticationBackend,
@@ -767,15 +789,22 @@ def get_oauth_router(
        response_model=OAuth2AuthorizeResponse,
    )
    async def authorize(
-        request: Request, scopes: List[str] = Query(None)
+        request: Request,
+        scopes: List[str] = Query(None),
    ) -> OAuth2AuthorizeResponse:
+        referral_source = request.cookies.get("referral_source", None)
+
        if redirect_url is not None:
            authorize_redirect_url = redirect_url
        else:
            authorize_redirect_url = str(request.url_for(callback_route_name))

        next_url = request.query_params.get("next", "/")
-        state_data: Dict[str, str] = {"next_url": next_url}
+
+        state_data: Dict[str, str] = {
+            "next_url": next_url,
+            "referral_source": referral_source or "default_referral",
+        }
        state = generate_state_token(state_data, state_secret)
        authorization_url = await oauth_client.get_authorization_url(
            authorize_redirect_url,
@@ -834,8 +863,11 @@ def get_oauth_router(
            raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST)

        next_url = state_data.get("next_url", "/")
+        referral_source = state_data.get("referral_source", None)

-        # Authenticate user
+        request.state.referral_source = referral_source
+
+        # Proceed to authenticate or create the user
        try:
            user = await user_manager.oauth_callback(
                oauth_client.name,
@@ -877,7 +909,25 @@ def get_oauth_router(
            redirect_response.status_code = response.status_code
        if hasattr(response, "media_type"):
            redirect_response.media_type = response.media_type
-
        return redirect_response

    return router
+
+
+async def api_key_dep(
+    request: Request, async_db_session: AsyncSession = Depends(get_async_session)
+) -> User | None:
+    """Returns the user that owns the request's API key, or None when auth is disabled.
+    Raises a 401 HTTPException if the key is missing or does not map to a user."""
+    if AUTH_TYPE == AuthType.DISABLED:
+        return None
+
+    hashed_api_key = get_hashed_api_key_from_request(request)
+    if not hashed_api_key:
+        raise HTTPException(status_code=401, detail="Missing API key")
+
+    user = await fetch_user_for_api_key(hashed_api_key, async_db_session)
+    if user is None:
+        raise HTTPException(status_code=401, detail="Invalid API key")
+
+    return user
--- a/backend/danswer/background/celery/apps/app_base.py
+++ b/backend/danswer/background/celery/apps/app_base.py
@@ -3,6 +3,7 @@ import multiprocessing
 import time
 from typing import Any

+import requests
 import sentry_sdk
 from celery import Task
 from celery.app import trace
@@ -10,19 +11,26 @@ from celery.exceptions import WorkerShutdown
 from celery.states import READY_STATES
 from celery.utils.log import get_task_logger
 from celery.worker import strategy  # type: ignore
+from redis.lock import Lock as RedisLock
 from sentry_sdk.integrations.celery import CeleryIntegration
+from sqlalchemy import text
+from sqlalchemy.orm import Session

 from danswer.background.celery.apps.task_formatters import CeleryTaskColoredFormatter
 from danswer.background.celery.apps.task_formatters import CeleryTaskPlainFormatter
-from danswer.background.celery.celery_redis import RedisConnectorCredentialPair
-from danswer.background.celery.celery_redis import RedisConnectorDeletion
-from danswer.background.celery.celery_redis import RedisConnectorPruning
-from danswer.background.celery.celery_redis import RedisDocumentSet
-from danswer.background.celery.celery_redis import RedisUserGroup
 from danswer.background.celery.celery_utils import celery_is_worker_primary
 from danswer.configs.constants import DanswerRedisLocks
-from danswer.db.engine import get_all_tenant_ids
+from danswer.db.engine import get_sqlalchemy_engine
+from danswer.document_index.vespa_constants import VESPA_CONFIG_SERVER_URL
+from danswer.redis.redis_connector import RedisConnector
+from danswer.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
+from danswer.redis.redis_connector_delete import RedisConnectorDelete
+from danswer.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync
+from danswer.redis.redis_connector_ext_group_sync import RedisConnectorExternalGroupSync
+from danswer.redis.redis_connector_prune import RedisConnectorPrune
+from danswer.redis.redis_document_set import RedisDocumentSet
 from danswer.redis.redis_pool import get_redis_client
+from danswer.redis.redis_usergroup import RedisUserGroup
 from danswer.utils.logger import ColoredFormatter
 from danswer.utils.logger import PlainFormatter
 from danswer.utils.logger import setup_logger
@@ -108,29 +116,43 @@ def on_task_postrun(
    if task_id.startswith(RedisDocumentSet.PREFIX):
        document_set_id = RedisDocumentSet.get_id_from_task_id(task_id)
        if document_set_id is not None:
-            rds = RedisDocumentSet(int(document_set_id))
+            rds = RedisDocumentSet(tenant_id, int(document_set_id))
            r.srem(rds.taskset_key, task_id)
        return

    if task_id.startswith(RedisUserGroup.PREFIX):
        usergroup_id = RedisUserGroup.get_id_from_task_id(task_id)
        if usergroup_id is not None:
-            rug = RedisUserGroup(int(usergroup_id))
+            rug = RedisUserGroup(tenant_id, int(usergroup_id))
            r.srem(rug.taskset_key, task_id)
        return

-    if task_id.startswith(RedisConnectorDeletion.PREFIX):
-        cc_pair_id = RedisConnectorDeletion.get_id_from_task_id(task_id)
+    if task_id.startswith(RedisConnectorDelete.PREFIX):
+        cc_pair_id = RedisConnector.get_id_from_task_id(task_id)
        if cc_pair_id is not None:
-            rcd = RedisConnectorDeletion(int(cc_pair_id))
-            r.srem(rcd.taskset_key, task_id)
+            RedisConnectorDelete.remove_from_taskset(int(cc_pair_id), task_id, r)
        return

-    if task_id.startswith(RedisConnectorPruning.SUBTASK_PREFIX):
-        cc_pair_id = RedisConnectorPruning.get_id_from_task_id(task_id)
+    if task_id.startswith(RedisConnectorPrune.SUBTASK_PREFIX):
+        cc_pair_id = RedisConnector.get_id_from_task_id(task_id)
        if cc_pair_id is not None:
-            rcp = RedisConnectorPruning(int(cc_pair_id))
-            r.srem(rcp.taskset_key, task_id)
+            RedisConnectorPrune.remove_from_taskset(int(cc_pair_id), task_id, r)
+        return
+
+    if task_id.startswith(RedisConnectorPermissionSync.SUBTASK_PREFIX):
+        cc_pair_id = RedisConnector.get_id_from_task_id(task_id)
+        if cc_pair_id is not None:
+            RedisConnectorPermissionSync.remove_from_taskset(
+                int(cc_pair_id), task_id, r
+            )
+        return
+
+    if task_id.startswith(RedisConnectorExternalGroupSync.SUBTASK_PREFIX):
+        cc_pair_id = RedisConnector.get_id_from_task_id(task_id)
+        if cc_pair_id is not None:
+            RedisConnectorExternalGroupSync.remove_from_taskset(
+                int(cc_pair_id), task_id, r
+            )
        return


@@ -140,77 +162,154 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None


 def wait_for_redis(sender: Any, **kwargs: Any) -> None:
+    """Waits for redis to become ready subject to a hardcoded timeout.
+    Will raise WorkerShutdown to kill the celery worker if the timeout is reached."""
+
    r = get_redis_client(tenant_id=None)

    WAIT_INTERVAL = 5
    WAIT_LIMIT = 60

+    ready = False
    time_start = time.monotonic()
-    logger.info("Redis: Readiness check starting.")
+    logger.info("Redis: Readiness probe starting.")
    while True:
        try:
            if r.ping():
+                ready = True
                break
        except Exception:
            pass

        time_elapsed = time.monotonic() - time_start
-        logger.info(
-            f"Redis: Ping failed. elapsed={time_elapsed:.1f} timeout={WAIT_LIMIT:.1f}"
-        )
        if time_elapsed > WAIT_LIMIT:
-            msg = (
-                f"Redis: Readiness check did not succeed within the timeout "
-                f"({WAIT_LIMIT} seconds). Exiting..."
-            )
-            logger.error(msg)
-            raise WorkerShutdown(msg)
+            break
+
+        logger.info(
+            f"Redis: Readiness probe ongoing. elapsed={time_elapsed:.1f} timeout={WAIT_LIMIT:.1f}"
+        )

        time.sleep(WAIT_INTERVAL)

-    logger.info("Redis: Readiness check succeeded. Continuing...")
+    if not ready:
+        msg = (
+            f"Redis: Readiness probe did not succeed within the timeout "
+            f"({WAIT_LIMIT} seconds). Exiting..."
+        )
+        logger.error(msg)
+        raise WorkerShutdown(msg)
+
+    logger.info("Redis: Readiness probe succeeded. Continuing...")
+    return
+
+
+def wait_for_db(sender: Any, **kwargs: Any) -> None:
+    """Probes the database with `SELECT NOW()` every WAIT_INTERVAL seconds until it answers.
+    Raises WorkerShutdown to kill the celery worker if no probe succeeds within WAIT_LIMIT seconds."""
+
+    WAIT_INTERVAL = 5
+    WAIT_LIMIT = 60
+
+    ready = False
+    time_start = time.monotonic()
+    logger.info("Database: Readiness probe starting.")
+    while True:
+        try:
+            with Session(get_sqlalchemy_engine()) as db_session:
+                result = db_session.execute(text("SELECT NOW()")).scalar()
+                if result:
+                    ready = True
+                    break
+        except Exception:
+            pass  # any failure is treated as "not ready yet"; the loop retries until the timeout
+
+        time_elapsed = time.monotonic() - time_start
+        if time_elapsed > WAIT_LIMIT:
+            break
+
+        logger.info(
+            f"Database: Readiness probe ongoing. elapsed={time_elapsed:.1f} timeout={WAIT_LIMIT:.1f}"
+        )
+
+        time.sleep(WAIT_INTERVAL)
+
+    if not ready:
+        msg = (
+            f"Database: Readiness probe did not succeed within the timeout "
+            f"({WAIT_LIMIT} seconds). Exiting..."
+        )
+        logger.error(msg)
+        raise WorkerShutdown(msg)
+
+    logger.info("Database: Readiness probe succeeded. Continuing...")
+    return
+
+
+def wait_for_vespa(sender: Any, **kwargs: Any) -> None:
+    """Polls the Vespa config server's health endpoint until it reports status code "up".
+    Raises WorkerShutdown to kill the celery worker if it is still not up after WAIT_LIMIT seconds."""
+    # WAIT_INTERVAL is also the per-request timeout, so a hung probe cannot stall the loop
+    WAIT_INTERVAL = 5
+    WAIT_LIMIT = 60
+
+    ready = False
+    time_start = time.monotonic()
+    logger.info("Vespa: Readiness probe starting.")
+    while True:
+        try:
+            health_url = f"{VESPA_CONFIG_SERVER_URL}/state/v1/health"
+            response = requests.get(health_url, timeout=WAIT_INTERVAL)
+            response.raise_for_status()
+            response_dict = response.json()
+            if response_dict["status"]["code"] == "up":
+                ready = True
+                break
+        except Exception:
+            pass  # connection errors, timeouts and bad payloads all mean "not ready yet"
+
+        time_elapsed = time.monotonic() - time_start
+        if time_elapsed > WAIT_LIMIT:
+            break
+
+        logger.info(
+            f"Vespa: Readiness probe ongoing. elapsed={time_elapsed:.1f} timeout={WAIT_LIMIT:.1f}"
+        )
+
+        time.sleep(WAIT_INTERVAL)
+
+    if not ready:
+        msg = (
+            f"Vespa: Readiness probe did not succeed within the timeout "
+            f"({WAIT_LIMIT} seconds). Exiting..."
+        )
+        logger.error(msg)
+        raise WorkerShutdown(msg)
+
+    logger.info("Vespa: Readiness probe succeeded. Continuing...")
+    return


 def on_secondary_worker_init(sender: Any, **kwargs: Any) -> None:
+    logger.info("Running as a secondary celery worker.")
+
+    # Set up variables for waiting on primary worker
    WAIT_INTERVAL = 5
    WAIT_LIMIT = 60
-
-    logger.info("Running as a secondary celery worker.")
-    logger.info("Waiting for all tenant primary workers to be ready...")
+    r = get_redis_client(tenant_id=None)
    time_start = time.monotonic()

+    logger.info("Waiting for primary worker to be ready...")
    while True:
-        tenant_ids = get_all_tenant_ids()
-        # Check if we have a primary worker lock for each tenant
-        all_tenants_ready = all(
-            get_redis_client(tenant_id=tenant_id).exists(
-                DanswerRedisLocks.PRIMARY_WORKER
-            )
-            for tenant_id in tenant_ids
-        )
-
-        if all_tenants_ready:
+        if r.exists(DanswerRedisLocks.PRIMARY_WORKER):
            break

        time_elapsed = time.monotonic() - time_start
-        ready_tenants = sum(
-            1
-            for tenant_id in tenant_ids
-            if get_redis_client(tenant_id=tenant_id).exists(
-                DanswerRedisLocks.PRIMARY_WORKER
-            )
-        )
-
        logger.info(
-            f"Not all tenant primary workers are ready yet. "
-            f"Ready tenants: {ready_tenants}/{len(tenant_ids)} "
-            f"elapsed={time_elapsed:.1f} timeout={WAIT_LIMIT:.1f}"
+            f"Primary worker is not ready yet. elapsed={time_elapsed:.1f} timeout={WAIT_LIMIT:.1f}"
        )
-
        if time_elapsed > WAIT_LIMIT:
            msg = (
-                f"Not all tenant primary workers were ready within the timeout "
+                f"Primary worker was not ready within the timeout. "
                f"({WAIT_LIMIT} seconds). Exiting..."
            )
            logger.error(msg)
@@ -218,7 +317,7 @@ def on_secondary_worker_init(sender: Any, **kwargs: Any) -> None:

        time.sleep(WAIT_INTERVAL)

-    logger.info("All tenant primary workers are ready. Continuing...")
+    logger.info("Wait for primary worker completed successfully. Continuing...")
    return


@@ -230,26 +329,20 @@ def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
    if not celery_is_worker_primary(sender):
        return

-    if not hasattr(sender, "primary_worker_locks"):
+    if not sender.primary_worker_lock:
        return

-    for tenant_id, lock in sender.primary_worker_locks.items():
-        try:
-            if lock and lock.owned():
-                logger.debug(f"Attempting to release lock for tenant {tenant_id}")
-                try:
-                    lock.release()
-                    logger.debug(f"Successfully released lock for tenant {tenant_id}")
-                except Exception as e:
-                    logger.error(
-                        f"Failed to release lock for tenant {tenant_id}. Error: {str(e)}"
-                    )
-                finally:
-                    sender.primary_worker_locks[tenant_id] = None
-        except Exception as e:
-            logger.error(
-                f"Error checking lock status for tenant {tenant_id}. Error: {str(e)}"
-            )
+    logger.info("Releasing primary worker lock.")
+    lock: RedisLock = sender.primary_worker_lock
+    try:
+        if lock.owned():
+            try:
+                lock.release()
+                sender.primary_worker_lock = None
+            except Exception:
+                logger.exception("Failed to release primary worker lock")
+    except Exception:
+        logger.exception("Failed to check if primary worker lock is owned")


 def on_setup_logging(
--- a/backend/danswer/background/celery/apps/beat.py
+++ b/backend/danswer/background/celery/apps/beat.py
@@ -3,28 +3,224 @@ from typing import Any

 from celery import Celery
 from celery import signals
+from celery.beat import PersistentScheduler  # type: ignore
 from celery.signals import beat_init

 import danswer.background.celery.apps.app_base as app_base
-from danswer.configs.constants import DanswerCeleryPriority
 from danswer.configs.constants import POSTGRES_CELERY_BEAT_APP_NAME
 from danswer.db.engine import get_all_tenant_ids
 from danswer.db.engine import SqlEngine
 from danswer.utils.logger import setup_logger
+from danswer.utils.variable_functionality import fetch_versioned_implementation
+from shared_configs.configs import IGNORED_SYNCING_TENANT_LIST
+from shared_configs.configs import MULTI_TENANT

-logger = setup_logger()
+logger = setup_logger(__name__)

 celery_app = Celery(__name__)
 celery_app.config_from_object("danswer.background.celery.configs.beat")


+class DynamicTenantScheduler(PersistentScheduler):
+    """Persistent beat scheduler holding one copy of each periodic task per tenant.
+
+    The tenant list is re-read from within tick() once per reload interval, and
+    the stored schedule is rebuilt whenever its set of task names changes.
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        logger.info("Initializing DynamicTenantScheduler")
+        super().__init__(*args, **kwargs)
+        self._reload_interval = timedelta(minutes=2)
+        # Backdated by one interval so that the first tick() is already due
+        self._last_reload = self.app.now() - self._reload_interval
+        # Let the parent class handle store initialization
+        # NOTE(review): celery's Scheduler.__init__ appears to call
+        # setup_schedule() itself unless lazy=True - confirm this call is needed
+        self.setup_schedule()
+        self._update_tenant_tasks()
+        logger.info(f"Set reload interval to {self._reload_interval}")
+
+    def setup_schedule(self) -> None:
+        """Run the parent's schedule setup, logging before and after."""
+        logger.info("Setting up initial schedule")
+        super().setup_schedule()
+        logger.info("Initial schedule setup complete")
+
+    def tick(self) -> float:
+        """Run the parent tick, then reload tenant tasks once the interval elapsed.
+
+        Returns:
+            The value returned by the parent's tick(), unchanged.
+        """
+        retval = super().tick()
+        now = self.app.now()
+        if (
+            self._last_reload is None
+            or (now - self._last_reload) > self._reload_interval
+        ):
+            logger.info("Reload interval reached, initiating tenant task update")
+            self._update_tenant_tasks()
+            self._last_reload = now
+            logger.info("Tenant task update completed, reset reload timer")
+        return retval
+
+    @staticmethod
+    def _tenant_id_from_task_name(task_name: str, base_names: list[str]) -> str | None:
+        """Recover the tenant id from a "<base name>-<tenant id>" schedule key.
+
+        Splitting on the last "-" breaks as soon as a tenant id itself contains a
+        hyphen, so the longest known base name that prefixes task_name is stripped
+        instead.
+
+        Returns:
+            The text after "<base name>-", or None if no base name matches.
+        """
+        best_prefix = ""
+        for base_name in base_names:
+            prefix = f"{base_name}-"
+            if (
+                task_name.startswith(prefix)
+                and len(task_name) > len(prefix)
+                and len(prefix) > len(best_prefix)
+            ):
+                best_prefix = prefix
+        return task_name[len(best_prefix) :] if best_prefix else None
+
+    def _update_tenant_tasks(self) -> None:
+        """Rebuild the beat schedule from the current tenants and task templates.
+
+        Tenants in IGNORED_SYNCING_TENANT_LIST are skipped. The stored schedule is
+        replaced and synced only when the set of task names differs. Exceptions
+        are logged rather than raised, so a failed reload does not propagate.
+        """
+        logger.info("Starting tenant task update process")
+        try:
+            logger.info("Fetching all tenant IDs")
+            tenant_ids = get_all_tenant_ids()
+            logger.info(f"Found {len(tenant_ids)} tenants")
+
+            logger.info("Fetching tasks to schedule")
+            tasks_to_schedule = fetch_versioned_implementation(
+                "danswer.background.celery.tasks.beat_schedule", "get_tasks_to_schedule"
+            )
+            # Called without a tenant argument, so fetch the templates only once
+            task_templates = list(tasks_to_schedule())
+            base_names = [task["name"] for task in task_templates]
+
+            new_beat_schedule: dict[str, dict[str, Any]] = {}
+
+            current_schedule = self.schedule.items()
+
+            existing_tenants = set()
+            for task_name, _ in current_schedule:
+                existing_tenant = self._tenant_id_from_task_name(task_name, base_names)
+                if existing_tenant is not None:
+                    existing_tenants.add(existing_tenant)
+            logger.info(f"Found {len(existing_tenants)} existing tenants in schedule")
+
+            for tenant_id in tenant_ids:
+                if (
+                    IGNORED_SYNCING_TENANT_LIST
+                    and tenant_id in IGNORED_SYNCING_TENANT_LIST
+                ):
+                    logger.info(
+                        f"Skipping tenant {tenant_id} as it is in the ignored syncing list"
+                    )
+                    continue
+
+                if tenant_id not in existing_tenants:
+                    logger.info(f"Processing new tenant: {tenant_id}")
+
+                for task in task_templates:
+                    task_name = f"{task['name']}-{tenant_id}"
+                    logger.debug(f"Creating task configuration for {task_name}")
+                    new_task = {
+                        "task": task["task"],
+                        "schedule": task["schedule"],
+                        "kwargs": {"tenant_id": tenant_id},
+                    }
+                    if options := task.get("options"):
+                        logger.debug(f"Adding options to task {task_name}: {options}")
+                        new_task["options"] = options
+                    new_beat_schedule[task_name] = new_task
+
+            if self._should_update_schedule(current_schedule, new_beat_schedule):
+                logger.info(
+                    "Schedule update required",
+                    extra={
+                        "new_tasks": len(new_beat_schedule),
+                        "current_tasks": len(current_schedule),
+                    },
+                )
+
+                # Create schedule entries
+                entries = {}
+                for name, entry in new_beat_schedule.items():
+                    entries[name] = self.Entry(
+                        name=name,
+                        app=self.app,
+                        task=entry["task"],
+                        schedule=entry["schedule"],
+                        options=entry.get("options", {}),
+                        kwargs=entry.get("kwargs", {}),
+                    )
+
+                # Update the schedule using the scheduler's methods
+                self.schedule.clear()
+                self.schedule.update(entries)
+
+                # Ensure changes are persisted
+                self.sync()
+
+                logger.info("Schedule update completed successfully")
+            else:
+                logger.info("Schedule is up to date, no changes needed")
+
+        except (AttributeError, KeyError):
+            logger.exception("Failed to process task configuration")
+        except Exception:
+            logger.exception("Unexpected error updating tenant tasks")
+
+    def _should_update_schedule(
+        self, current_schedule: Any, new_schedule: dict
+    ) -> bool:
+        """Compare schedules to determine if an update is needed.
+
+        Args:
+            current_schedule: the live schedule, given either as a dict keyed
+                by task name or as an iterable of (name, entry) pairs.
+            new_schedule: the freshly built schedule keyed by task name.
+
+        Returns:
+            True when the two sets of task names differ; entry contents are
+            not compared.
+        """
+        logger.debug("Comparing current and new schedules")
+        if isinstance(current_schedule, dict):
+            current_tasks = set(current_schedule)
+        else:
+            current_tasks = set(name for name, _ in current_schedule)
+        new_tasks = set(new_schedule.keys())
+        needs_update = current_tasks != new_tasks
+        logger.debug(f"Schedule update needed: {needs_update}")
+        return needs_update
+
+
@beat_init.connect
 def on_beat_init(sender: Any, **kwargs: Any) -> None:
+    """Initialise the SQL engine for beat and, unless MULTI_TENANT is set, run
+    the app_base.wait_for_redis startup check."""
    logger.info("beat_init signal received.")

-    # celery beat shouldn't touch the db at all. But just setting a low minimum here.
+    # Celery beat shouldn't touch the db at all. But just setting a low minimum here.
    SqlEngine.set_app_name(POSTGRES_CELERY_BEAT_APP_NAME)
    SqlEngine.init_engine(pool_size=2, max_overflow=0)
+
+    # Startup checks are not needed in multi-tenant case
+    if MULTI_TENANT:
+        return
+
    app_base.wait_for_redis(sender, **kwargs)


@@ -35,68 +169,4 @@ def on_setup_logging(
    app_base.on_setup_logging(loglevel, logfile, format, colorize, **kwargs)


-#####
-# Celery Beat (Periodic Tasks) Settings
-#####
-
-tenant_ids = get_all_tenant_ids()
-
-tasks_to_schedule = [
-    {
-        "name": "check-for-vespa-sync",
-        "task": "check_for_vespa_sync_task",
-        "schedule": timedelta(seconds=5),
-        "options": {"priority": DanswerCeleryPriority.HIGH},
-    },
-    {
-        "name": "check-for-connector-deletion",
-        "task": "check_for_connector_deletion_task",
-        "schedule": timedelta(seconds=60),
-        "options": {"priority": DanswerCeleryPriority.HIGH},
-    },
-    {
-        "name": "check-for-indexing",
-        "task": "check_for_indexing",
-        "schedule": timedelta(seconds=10),
-        "options": {"priority": DanswerCeleryPriority.HIGH},
-    },
-    {
-        "name": "check-for-prune",
-        "task": "check_for_pruning",
-        "schedule": timedelta(seconds=10),
-        "options": {"priority": DanswerCeleryPriority.HIGH},
-    },
-    {
-        "name": "kombu-message-cleanup",
-        "task": "kombu_message_cleanup_task",
-        "schedule": timedelta(seconds=3600),
-        "options": {"priority": DanswerCeleryPriority.LOWEST},
-    },
-    {
-        "name": "monitor-vespa-sync",
-        "task": "monitor_vespa_sync",
-        "schedule": timedelta(seconds=5),
-        "options": {"priority": DanswerCeleryPriority.HIGH},
-    },
-]
-
-
-# Build the celery beat schedule dynamically
-beat_schedule = {}
-
-for tenant_id in tenant_ids:
-    for task in tasks_to_schedule:
-        task_name = f"{task['name']}-{tenant_id}"  # Unique name for each scheduled task
-        beat_schedule[task_name] = {
-            "task": task["task"],
-            "schedule": task["schedule"],
-            "options": task["options"],
-            "kwargs": {"tenant_id": tenant_id},  # Must pass tenant_id as an argument
-        }
-
-# Include any existing beat schedules
-existing_beat_schedule = celery_app.conf.beat_schedule or {}
-beat_schedule.update(existing_beat_schedule)
-
-# Update the Celery app configuration once
-celery_app.conf.beat_schedule = beat_schedule
+celery_app.conf.beat_scheduler = DynamicTenantScheduler
--- a/backend/danswer/background/celery/apps/heavy.py
+++ b/backend/danswer/background/celery/apps/heavy.py
@@ -13,6 +13,7 @@ import danswer.background.celery.apps.app_base as app_base
 from danswer.configs.constants import POSTGRES_CELERY_WORKER_HEAVY_APP_NAME
 from danswer.db.engine import SqlEngine
 from danswer.utils.logger import setup_logger
+from shared_configs.configs import MULTI_TENANT


 logger = setup_logger()
@@ -60,7 +61,13 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
    SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_HEAVY_APP_NAME)
    SqlEngine.init_engine(pool_size=4, max_overflow=12)

+    # Startup checks are not needed in multi-tenant case
+    if MULTI_TENANT:
+        return
+
    app_base.wait_for_redis(sender, **kwargs)
+    app_base.wait_for_db(sender, **kwargs)
+    app_base.wait_for_vespa(sender, **kwargs)
    app_base.on_secondary_worker_init(sender, **kwargs)


@@ -84,5 +91,7 @@ def on_setup_logging(
 celery_app.autodiscover_tasks(
    [
        "danswer.background.celery.tasks.pruning",
+        "danswer.background.celery.tasks.doc_permission_syncing",
+        "danswer.background.celery.tasks.external_group_syncing",
    ]
 )
--- a/backend/danswer/background/celery/apps/indexing.py
+++ b/backend/danswer/background/celery/apps/indexing.py
@@ -6,6 +6,7 @@ from celery import signals
 from celery import Task
 from celery.signals import celeryd_init
 from celery.signals import worker_init
+from celery.signals import worker_process_init
 from celery.signals import worker_ready
 from celery.signals import worker_shutdown

@@ -13,6 +14,7 @@ import danswer.background.celery.apps.app_base as app_base
 from danswer.configs.constants import POSTGRES_CELERY_WORKER_INDEXING_APP_NAME
 from danswer.db.engine import SqlEngine
 from danswer.utils.logger import setup_logger
+from shared_configs.configs import MULTI_TENANT


 logger = setup_logger()
@@ -58,9 +60,15 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
    logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")

    SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME)
-    SqlEngine.init_engine(pool_size=8, max_overflow=0)
+    SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=sender.concurrency)
+
+    # Startup checks are not needed in multi-tenant case
+    if MULTI_TENANT:
+        return

    app_base.wait_for_redis(sender, **kwargs)
+    app_base.wait_for_db(sender, **kwargs)
+    app_base.wait_for_vespa(sender, **kwargs)
    app_base.on_secondary_worker_init(sender, **kwargs)


@@ -74,6 +82,16 @@ def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
    app_base.on_worker_shutdown(sender, **kwargs)


+@worker_process_init.connect
+def init_worker(**kwargs: Any) -> None:
+    """Reset the SqlEngine whenever a worker process is initialised.
+
+    NOTE(review): presumably so each pool process builds its own engine instead
+    of reusing connections set up in the parent - confirm against reset_engine.
+    """
+    SqlEngine.reset_engine()
+
+
@signals.setup_logging.connect
 def on_setup_logging(
    loglevel: Any, logfile: Any, format: Any, colorize: Any, **kwargs: Any
--- a/backend/danswer/background/celery/apps/light.py
+++ b/backend/danswer/background/celery/apps/light.py
@@ -13,6 +13,7 @@ import danswer.background.celery.apps.app_base as app_base
 from danswer.configs.constants import POSTGRES_CELERY_WORKER_LIGHT_APP_NAME
 from danswer.db.engine import SqlEngine
 from danswer.utils.logger import setup_logger
+from shared_configs.configs import MULTI_TENANT


 logger = setup_logger()
@@ -59,8 +60,13 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:

    SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_LIGHT_APP_NAME)
    SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8)
+    # Startup checks are not needed in multi-tenant case
+    if MULTI_TENANT:
+        return

    app_base.wait_for_redis(sender, **kwargs)
+    app_base.wait_for_db(sender, **kwargs)
+    app_base.wait_for_vespa(sender, **kwargs)
    app_base.on_secondary_worker_init(sender, **kwargs)


@@ -85,5 +91,7 @@ celery_app.autodiscover_tasks(
    [
        "danswer.background.celery.tasks.shared",
        "danswer.background.celery.tasks.vespa",
+        "danswer.background.celery.tasks.connector_deletion",
+        "danswer.background.celery.tasks.doc_permission_syncing",
    ]
 )
--- a/backend/danswer/background/celery/apps/primary.py
+++ b/backend/danswer/background/celery/apps/primary.py
@@ -1,5 +1,6 @@
 import multiprocessing
 from typing import Any
+from typing import cast

 from celery import bootsteps  # type: ignore
 from celery import Celery
@@ -10,25 +11,33 @@ from celery.signals import celeryd_init
 from celery.signals import worker_init
 from celery.signals import worker_ready
 from celery.signals import worker_shutdown
+from redis.lock import Lock as RedisLock

 import danswer.background.celery.apps.app_base as app_base
 from danswer.background.celery.apps.app_base import task_logger
-from danswer.background.celery.celery_redis import RedisConnectorCredentialPair
-from danswer.background.celery.celery_redis import RedisConnectorDeletion
-from danswer.background.celery.celery_redis import RedisConnectorIndexing
-from danswer.background.celery.celery_redis import RedisConnectorPruning
-from danswer.background.celery.celery_redis import RedisConnectorStop
-from danswer.background.celery.celery_redis import RedisDocumentSet
-from danswer.background.celery.celery_redis import RedisUserGroup
 from danswer.background.celery.celery_utils import celery_is_worker_primary
+from danswer.background.celery.tasks.indexing.tasks import (
+    get_unfenced_index_attempt_ids,
+)
 from danswer.configs.constants import CELERY_PRIMARY_WORKER_LOCK_TIMEOUT
 from danswer.configs.constants import DanswerRedisLocks
 from danswer.configs.constants import POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME
-from danswer.db.engine import get_all_tenant_ids
+from danswer.db.engine import get_session_with_default_tenant
 from danswer.db.engine import SqlEngine
+from danswer.db.index_attempt import get_index_attempt
+from danswer.db.index_attempt import mark_attempt_canceled
+from danswer.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
+from danswer.redis.redis_connector_delete import RedisConnectorDelete
+from danswer.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync
+from danswer.redis.redis_connector_ext_group_sync import RedisConnectorExternalGroupSync
+from danswer.redis.redis_connector_index import RedisConnectorIndex
+from danswer.redis.redis_connector_prune import RedisConnectorPrune
+from danswer.redis.redis_connector_stop import RedisConnectorStop
+from danswer.redis.redis_document_set import RedisDocumentSet
 from danswer.redis.redis_pool import get_redis_client
+from danswer.redis.redis_usergroup import RedisUserGroup
 from danswer.utils.logger import setup_logger
-
+from shared_configs.configs import MULTI_TENANT

 logger = setup_logger()

@@ -75,95 +84,98 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
    SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME)
    SqlEngine.init_engine(pool_size=8, max_overflow=0)

+    # Startup checks are not needed in multi-tenant case
+    if MULTI_TENANT:
+        return
+
    app_base.wait_for_redis(sender, **kwargs)
+    app_base.wait_for_db(sender, **kwargs)
+    app_base.wait_for_vespa(sender, **kwargs)

    logger.info("Running as the primary celery worker.")

-    sender.primary_worker_locks = {}
-
    # This is singleton work that should be done on startup exactly once
-    # by the primary worker
-    tenant_ids = get_all_tenant_ids()
-    for tenant_id in tenant_ids:
-        r = get_redis_client(tenant_id=tenant_id)
+    # by the primary worker. This is unnecessary in the multi tenant scenario
+    r = get_redis_client(tenant_id=None)

-        # For the moment, we're assuming that we are the only primary worker
-        # that should be running.
-        # TODO: maybe check for or clean up another zombie primary worker if we detect it
-        r.delete(DanswerRedisLocks.PRIMARY_WORKER)
+    # Log the role and slave count - being connected to a slave or slave count > 0 could be problematic
+    info: dict[str, Any] = cast(dict, r.info("replication"))
+    role: str = cast(str, info.get("role"))
+    connected_slaves: int = info.get("connected_slaves", 0)

-        # this process wide lock is taken to help other workers start up in order.
-        # it is planned to use this lock to enforce singleton behavior on the primary
-        # worker, since the primary worker does redis cleanup on startup, but this isn't
-        # implemented yet.
-        lock = r.lock(
-            DanswerRedisLocks.PRIMARY_WORKER,
-            timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT,
-        )
+    logger.info(
+        f"Redis INFO REPLICATION: role={role} connected_slaves={connected_slaves}"
+    )

-        logger.info("Primary worker lock: Acquire starting.")
-        acquired = lock.acquire(blocking_timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT / 2)
-        if acquired:
-            logger.info("Primary worker lock: Acquire succeeded.")
-        else:
-            logger.error("Primary worker lock: Acquire failed!")
-            raise WorkerShutdown("Primary worker lock could not be acquired!")
+    # For the moment, we're assuming that we are the only primary worker
+    # that should be running.
+    # TODO: maybe check for or clean up another zombie primary worker if we detect it
+    r.delete(DanswerRedisLocks.PRIMARY_WORKER)

-        # tacking on our own user data to the sender
-        sender.primary_worker_locks[tenant_id] = lock
+    # this process wide lock is taken to help other workers start up in order.
+    # it is planned to use this lock to enforce singleton behavior on the primary
+    # worker, since the primary worker does redis cleanup on startup, but this isn't
+    # implemented yet.

-        # As currently designed, when this worker starts as "primary", we reinitialize redis
-        # to a clean state (for our purposes, anyway)
-        r.delete(DanswerRedisLocks.CHECK_VESPA_SYNC_BEAT_LOCK)
-        r.delete(DanswerRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)
+    # set thread_local=False since we don't control what thread the periodic task might
+    # reacquire the lock with
+    lock: RedisLock = r.lock(
+        DanswerRedisLocks.PRIMARY_WORKER,
+        timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT,
+        thread_local=False,
+    )

-        r.delete(RedisConnectorCredentialPair.get_taskset_key())
-        r.delete(RedisConnectorCredentialPair.get_fence_key())
+    logger.info("Primary worker lock: Acquire starting.")
+    acquired = lock.acquire(blocking_timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT / 2)
+    if acquired:
+        logger.info("Primary worker lock: Acquire succeeded.")
+    else:
+        logger.error("Primary worker lock: Acquire failed!")
+        raise WorkerShutdown("Primary worker lock could not be acquired!")

-        for key in r.scan_iter(RedisDocumentSet.TASKSET_PREFIX + "*"):
-            r.delete(key)
+    # tacking on our own user data to the sender
+    sender.primary_worker_lock = lock

-        for key in r.scan_iter(RedisDocumentSet.FENCE_PREFIX + "*"):
-            r.delete(key)
+    # As currently designed, when this worker starts as "primary", we reinitialize redis
+    # to a clean state (for our purposes, anyway)
+    r.delete(DanswerRedisLocks.CHECK_VESPA_SYNC_BEAT_LOCK)
+    r.delete(DanswerRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)

-        for key in r.scan_iter(RedisUserGroup.TASKSET_PREFIX + "*"):
-            r.delete(key)
+    r.delete(RedisConnectorCredentialPair.get_taskset_key())
+    r.delete(RedisConnectorCredentialPair.get_fence_key())

-        for key in r.scan_iter(RedisUserGroup.FENCE_PREFIX + "*"):
-            r.delete(key)
+    RedisDocumentSet.reset_all(r)

-        for key in r.scan_iter(RedisConnectorDeletion.TASKSET_PREFIX + "*"):
-            r.delete(key)
+    RedisUserGroup.reset_all(r)

-        for key in r.scan_iter(RedisConnectorDeletion.FENCE_PREFIX + "*"):
-            r.delete(key)
+    RedisConnectorDelete.reset_all(r)

-        for key in r.scan_iter(RedisConnectorPruning.TASKSET_PREFIX + "*"):
-            r.delete(key)
+    RedisConnectorPrune.reset_all(r)

-        for key in r.scan_iter(RedisConnectorPruning.GENERATOR_COMPLETE_PREFIX + "*"):
-            r.delete(key)
+    RedisConnectorIndex.reset_all(r)

-        for key in r.scan_iter(RedisConnectorPruning.GENERATOR_PROGRESS_PREFIX + "*"):
-            r.delete(key)
+    RedisConnectorStop.reset_all(r)

-        for key in r.scan_iter(RedisConnectorPruning.FENCE_PREFIX + "*"):
-            r.delete(key)
+    RedisConnectorPermissionSync.reset_all(r)

-        for key in r.scan_iter(RedisConnectorIndexing.TASKSET_PREFIX + "*"):
-            r.delete(key)
+    RedisConnectorExternalGroupSync.reset_all(r)

-        for key in r.scan_iter(RedisConnectorIndexing.GENERATOR_COMPLETE_PREFIX + "*"):
-            r.delete(key)
+    # mark orphaned index attempts as failed
+    with get_session_with_default_tenant() as db_session:
+        unfenced_attempt_ids = get_unfenced_index_attempt_ids(db_session, r)
+        for attempt_id in unfenced_attempt_ids:
+            attempt = get_index_attempt(db_session, attempt_id)
+            if not attempt:
+                continue

-        for key in r.scan_iter(RedisConnectorIndexing.GENERATOR_PROGRESS_PREFIX + "*"):
-            r.delete(key)
-
-        for key in r.scan_iter(RedisConnectorIndexing.FENCE_PREFIX + "*"):
-            r.delete(key)
-
-        for key in r.scan_iter(RedisConnectorStop.FENCE_PREFIX + "*"):
-            r.delete(key)
+            failure_reason = (
+                f"Canceling leftover index attempt found on startup: "
+                f"index_attempt={attempt.id} "
+                f"cc_pair={attempt.connector_credential_pair_id} "
+                f"search_settings={attempt.search_settings_id}"
+            )
+            logger.warning(failure_reason)
+            mark_attempt_canceled(attempt.id, db_session, failure_reason)


@worker_ready.connect
@@ -216,52 +228,36 @@ class HubPeriodicTask(bootsteps.StartStopStep):
            if not celery_is_worker_primary(worker):
                return

-            if not hasattr(worker, "primary_worker_locks"):
+            if not hasattr(worker, "primary_worker_lock"):
                return

-            # Retrieve all tenant IDs
-            tenant_ids = get_all_tenant_ids()
+            lock: RedisLock = worker.primary_worker_lock

-            for tenant_id in tenant_ids:
-                lock = worker.primary_worker_locks.get(tenant_id)
-                if not lock:
-                    continue  # Skip if no lock for this tenant
+            r = get_redis_client(tenant_id=None)

-                r = get_redis_client(tenant_id=tenant_id)
+            if lock.owned():
+                task_logger.debug("Reacquiring primary worker lock.")
+                lock.reacquire()
+            else:
+                task_logger.warning(
+                    "Full acquisition of primary worker lock. "
+                    "Reasons could be worker restart or lock expiration."
+                )
+                lock = r.lock(
+                    DanswerRedisLocks.PRIMARY_WORKER,
+                    timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT,
+                )

-                if lock.owned():
-                    task_logger.debug(
-                        f"Reacquiring primary worker lock for tenant {tenant_id}."
-                    )
-                    lock.reacquire()
+                task_logger.info("Primary worker lock: Acquire starting.")
+                acquired = lock.acquire(
+                    blocking_timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT / 2
+                )
+                if acquired:
+                    task_logger.info("Primary worker lock: Acquire succeeded.")
+                    worker.primary_worker_lock = lock
                else:
-                    task_logger.warning(
-                        f"Full acquisition of primary worker lock for tenant {tenant_id}. "
-                        "Reasons could be worker restart or lock expiration."
-                    )
-                    lock = r.lock(
-                        DanswerRedisLocks.PRIMARY_WORKER,
-                        timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT,
-                    )
-
-                    task_logger.info(
-                        f"Primary worker lock for tenant {tenant_id}: Acquire starting."
-                    )
-                    acquired = lock.acquire(
-                        blocking_timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT / 2
-                    )
-                    if acquired:
-                        task_logger.info(
-                            f"Primary worker lock for tenant {tenant_id}: Acquire succeeded."
-                        )
-                        worker.primary_worker_locks[tenant_id] = lock
-                    else:
-                        task_logger.error(
-                            f"Primary worker lock for tenant {tenant_id}: Acquire failed!"
-                        )
-                        raise TimeoutError(
-                            f"Primary worker lock for tenant {tenant_id} could not be acquired!"
-                        )
+                    task_logger.error("Primary worker lock: Acquire failed!")
+                    raise TimeoutError("Primary worker lock could not be acquired!")

        except Exception:
            task_logger.exception("Periodic task failed.")
@@ -280,6 +276,8 @@ celery_app.autodiscover_tasks(
        "danswer.background.celery.tasks.connector_deletion",
        "danswer.background.celery.tasks.indexing",
        "danswer.background.celery.tasks.periodic",
+        "danswer.background.celery.tasks.doc_permission_syncing",
+        "danswer.background.celery.tasks.external_group_syncing",
        "danswer.background.celery.tasks.pruning",
        "danswer.background.celery.tasks.shared",
        "danswer.background.celery.tasks.vespa",
--- a/backend/danswer/background/celery/celery_redis.py
+++ b/backend/danswer/background/celery/celery_redis.py
@@ -1,568 +1,10 @@
 # These are helper objects for tracking the keys we need to write in redis
-import time
-from abc import ABC
-from abc import abstractmethod
 from typing import cast
-from uuid import uuid4

-import redis
-from celery import Celery
 from redis import Redis
-from sqlalchemy.orm import Session

 from danswer.background.celery.configs.base import CELERY_SEPARATOR
-from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
 from danswer.configs.constants import DanswerCeleryPriority
-from danswer.configs.constants import DanswerCeleryQueues
-from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id
-from danswer.db.document import construct_document_select_for_connector_credential_pair
-from danswer.db.document import (
-    construct_document_select_for_connector_credential_pair_by_needs_sync,
-)
-from danswer.db.document_set import construct_document_select_by_docset
-from danswer.utils.variable_functionality import fetch_versioned_implementation
-from danswer.utils.variable_functionality import global_version
-
-
-class RedisObjectHelper(ABC):
-    PREFIX = "base"
-    FENCE_PREFIX = PREFIX + "_fence"
-    TASKSET_PREFIX = PREFIX + "_taskset"
-
-    def __init__(self, id: str):
-        self._id: str = id
-
-    @property
-    def task_id_prefix(self) -> str:
-        return f"{self.PREFIX}_{self._id}"
-
-    @property
-    def fence_key(self) -> str:
-        # example: documentset_fence_1
-        return f"{self.FENCE_PREFIX}_{self._id}"
-
-    @property
-    def taskset_key(self) -> str:
-        # example: documentset_taskset_1
-        return f"{self.TASKSET_PREFIX}_{self._id}"
-
-    @staticmethod
-    def get_id_from_fence_key(key: str) -> str | None:
-        """
-        Extracts the object ID from a fence key in the format `PREFIX_fence_X`.
-
-        Args:
-            key (str): The fence key string.
-
-        Returns:
-            Optional[int]: The extracted ID if the key is in the correct format, otherwise None.
-        """
-        parts = key.split("_")
-        if len(parts) != 3:
-            return None
-
-        object_id = parts[2]
-        return object_id
-
-    @staticmethod
-    def get_id_from_task_id(task_id: str) -> str | None:
-        """
-        Extracts the object ID from a task ID string.
-
-        This method assumes the task ID is formatted as `prefix_objectid_suffix`, where:
-        - `prefix` is an arbitrary string (e.g., the name of the task or entity),
-        - `objectid` is the ID you want to extract,
-        - `suffix` is another arbitrary string (e.g., a UUID).
-
-        Example:
-            If the input `task_id` is `documentset_1_cbfdc96a-80ca-4312-a242-0bb68da3c1dc`,
-            this method will return the string `"1"`.
-
-        Args:
-            task_id (str): The task ID string from which to extract the object ID.
-
-        Returns:
-            str | None: The extracted object ID if the task ID is in the correct format, otherwise None.
-        """
-        # example: task_id=documentset_1_cbfdc96a-80ca-4312-a242-0bb68da3c1dc
-        parts = task_id.split("_")
-        if len(parts) != 3:
-            return None
-
-        object_id = parts[1]
-        return object_id
-
-    @abstractmethod
-    def generate_tasks(
-        self,
-        celery_app: Celery,
-        db_session: Session,
-        redis_client: Redis,
-        lock: redis.lock.Lock,
-        tenant_id: str | None,
-    ) -> int | None:
-        pass
-
-
-class RedisDocumentSet(RedisObjectHelper):
-    PREFIX = "documentset"
-    FENCE_PREFIX = PREFIX + "_fence"
-    TASKSET_PREFIX = PREFIX + "_taskset"
-
-    def __init__(self, id: int) -> None:
-        super().__init__(str(id))
-
-    def generate_tasks(
-        self,
-        celery_app: Celery,
-        db_session: Session,
-        redis_client: Redis,
-        lock: redis.lock.Lock,
-        tenant_id: str | None,
-    ) -> int | None:
-        last_lock_time = time.monotonic()
-
-        async_results = []
-        stmt = construct_document_select_by_docset(int(self._id), current_only=False)
-        for doc in db_session.scalars(stmt).yield_per(1):
-            current_time = time.monotonic()
-            if current_time - last_lock_time >= (
-                CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
-            ):
-                lock.reacquire()
-                last_lock_time = current_time
-
-            # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            # the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            # we prefix the task id so it's easier to keep track of who created the task
-            # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            custom_task_id = f"{self.task_id_prefix}_{uuid4()}"
-
-            # add to the set BEFORE creating the task.
-            redis_client.sadd(self.taskset_key, custom_task_id)
-
-            result = celery_app.send_task(
-                "vespa_metadata_sync_task",
-                kwargs=dict(document_id=doc.id, tenant_id=tenant_id),
-                queue=DanswerCeleryQueues.VESPA_METADATA_SYNC,
-                task_id=custom_task_id,
-                priority=DanswerCeleryPriority.LOW,
-            )
-
-            async_results.append(result)
-
-        return len(async_results)
-
-
-class RedisUserGroup(RedisObjectHelper):
-    PREFIX = "usergroup"
-    FENCE_PREFIX = PREFIX + "_fence"
-    TASKSET_PREFIX = PREFIX + "_taskset"
-
-    def __init__(self, id: int) -> None:
-        super().__init__(str(id))
-
-    def generate_tasks(
-        self,
-        celery_app: Celery,
-        db_session: Session,
-        redis_client: Redis,
-        lock: redis.lock.Lock,
-        tenant_id: str | None,
-    ) -> int | None:
-        last_lock_time = time.monotonic()
-
-        async_results = []
-
-        if not global_version.is_ee_version():
-            return 0
-
-        try:
-            construct_document_select_by_usergroup = fetch_versioned_implementation(
-                "danswer.db.user_group",
-                "construct_document_select_by_usergroup",
-            )
-        except ModuleNotFoundError:
-            return 0
-
-        stmt = construct_document_select_by_usergroup(int(self._id))
-        for doc in db_session.scalars(stmt).yield_per(1):
-            current_time = time.monotonic()
-            if current_time - last_lock_time >= (
-                CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
-            ):
-                lock.reacquire()
-                last_lock_time = current_time
-
-            # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            # the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            # we prefix the task id so it's easier to keep track of who created the task
-            # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            custom_task_id = f"{self.task_id_prefix}_{uuid4()}"
-
-            # add to the set BEFORE creating the task.
-            redis_client.sadd(self.taskset_key, custom_task_id)
-
-            result = celery_app.send_task(
-                "vespa_metadata_sync_task",
-                kwargs=dict(document_id=doc.id, tenant_id=tenant_id),
-                queue=DanswerCeleryQueues.VESPA_METADATA_SYNC,
-                task_id=custom_task_id,
-                priority=DanswerCeleryPriority.LOW,
-            )
-
-            async_results.append(result)
-
-        return len(async_results)
-
-
-class RedisConnectorCredentialPair(RedisObjectHelper):
-    """This class is used to scan documents by cc_pair in the db and collect them into
-    a unified set for syncing.
-
-    It differs from the other redis helpers in that the taskset used spans
-    all connectors and is not per connector."""
-
-    PREFIX = "connectorsync"
-    FENCE_PREFIX = PREFIX + "_fence"
-    TASKSET_PREFIX = PREFIX + "_taskset"
-
-    def __init__(self, id: int) -> None:
-        super().__init__(str(id))
-
-    @classmethod
-    def get_fence_key(cls) -> str:
-        return RedisConnectorCredentialPair.FENCE_PREFIX
-
-    @classmethod
-    def get_taskset_key(cls) -> str:
-        return RedisConnectorCredentialPair.TASKSET_PREFIX
-
-    @property
-    def taskset_key(self) -> str:
-        """Notice that this is intentionally reusing the same taskset for all
-        connector syncs"""
-        # example: connector_taskset
-        return f"{self.TASKSET_PREFIX}"
-
-    def generate_tasks(
-        self,
-        celery_app: Celery,
-        db_session: Session,
-        redis_client: Redis,
-        lock: redis.lock.Lock,
-        tenant_id: str | None,
-    ) -> int | None:
-        last_lock_time = time.monotonic()
-
-        async_results = []
-        cc_pair = get_connector_credential_pair_from_id(int(self._id), db_session)
-        if not cc_pair:
-            return None
-
-        stmt = construct_document_select_for_connector_credential_pair_by_needs_sync(
-            cc_pair.connector_id, cc_pair.credential_id
-        )
-        for doc in db_session.scalars(stmt).yield_per(1):
-            current_time = time.monotonic()
-            if current_time - last_lock_time >= (
-                CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
-            ):
-                lock.reacquire()
-                last_lock_time = current_time
-
-            # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            # the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            # we prefix the task id so it's easier to keep track of who created the task
-            # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            custom_task_id = f"{self.task_id_prefix}_{uuid4()}"
-
-            # add to the tracking taskset in redis BEFORE creating the celery task.
-            # note that for the moment we are using a single taskset key, not differentiated by cc_pair id
-            redis_client.sadd(
-                RedisConnectorCredentialPair.get_taskset_key(), custom_task_id
-            )
-
-            # Priority on sync's triggered by new indexing should be medium
-            result = celery_app.send_task(
-                "vespa_metadata_sync_task",
-                kwargs=dict(document_id=doc.id, tenant_id=tenant_id),
-                queue=DanswerCeleryQueues.VESPA_METADATA_SYNC,
-                task_id=custom_task_id,
-                priority=DanswerCeleryPriority.MEDIUM,
-            )
-
-            async_results.append(result)
-
-        return len(async_results)
-
-
-class RedisConnectorDeletion(RedisObjectHelper):
-    PREFIX = "connectordeletion"
-    FENCE_PREFIX = PREFIX + "_fence"
-    TASKSET_PREFIX = PREFIX + "_taskset"
-
-    def __init__(self, id: int) -> None:
-        super().__init__(str(id))
-
-    def generate_tasks(
-        self,
-        celery_app: Celery,
-        db_session: Session,
-        redis_client: Redis,
-        lock: redis.lock.Lock,
-        tenant_id: str | None,
-    ) -> int | None:
-        """Returns None if the cc_pair doesn't exist.
-        Otherwise, returns an int with the number of generated tasks."""
-        last_lock_time = time.monotonic()
-
-        async_results = []
-        cc_pair = get_connector_credential_pair_from_id(int(self._id), db_session)
-        if not cc_pair:
-            return None
-
-        stmt = construct_document_select_for_connector_credential_pair(
-            cc_pair.connector_id, cc_pair.credential_id
-        )
-        for doc in db_session.scalars(stmt).yield_per(1):
-            current_time = time.monotonic()
-            if current_time - last_lock_time >= (
-                CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
-            ):
-                lock.reacquire()
-                last_lock_time = current_time
-
-            # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            # the actual redis key is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            # we prefix the task id so it's easier to keep track of who created the task
-            # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            custom_task_id = f"{self.task_id_prefix}_{uuid4()}"
-
-            # add to the tracking taskset in redis BEFORE creating the celery task.
-            # note that for the moment we are using a single taskset key, not differentiated by cc_pair id
-            redis_client.sadd(self.taskset_key, custom_task_id)
-
-            # Priority on sync's triggered by new indexing should be medium
-            result = celery_app.send_task(
-                "document_by_cc_pair_cleanup_task",
-                kwargs=dict(
-                    document_id=doc.id,
-                    connector_id=cc_pair.connector_id,
-                    credential_id=cc_pair.credential_id,
-                    tenant_id=tenant_id,
-                ),
-                queue=DanswerCeleryQueues.CONNECTOR_DELETION,
-                task_id=custom_task_id,
-                priority=DanswerCeleryPriority.MEDIUM,
-            )
-
-            async_results.append(result)
-
-        return len(async_results)
-
-
-class RedisConnectorPruning(RedisObjectHelper):
-    """Celery will kick off a long running generator task to crawl the connector and
-    find any missing docs, which will each then get a new cleanup task. The progress of
-    those tasks will then be monitored to completion.
-
-    Example rough happy path order:
-    Check connectorpruning_fence_1
-    Send generator task with id connectorpruning+generator_1_{uuid}
-
-    generator runs connector with callbacks that increment connectorpruning_generator_progress_1
-    generator creates many subtasks with id connectorpruning+sub_1_{uuid}
-      in taskset connectorpruning_taskset_1
-    on completion, generator sets connectorpruning_generator_complete_1
-
-    celery postrun removes subtasks from taskset
-    monitor beat task cleans up when taskset reaches 0 items
-    """
-
-    PREFIX = "connectorpruning"
-    FENCE_PREFIX = PREFIX + "_fence"  # a fence for the entire pruning process
-    GENERATOR_TASK_PREFIX = PREFIX + "+generator"
-
-    TASKSET_PREFIX = PREFIX + "_taskset"  # stores a list of prune tasks id's
-    SUBTASK_PREFIX = PREFIX + "+sub"
-
-    GENERATOR_PROGRESS_PREFIX = (
-        PREFIX + "_generator_progress"
-    )  # a signal that contains generator progress
-    GENERATOR_COMPLETE_PREFIX = (
-        PREFIX + "_generator_complete"
-    )  # a signal that the generator has finished
-
-    def __init__(self, id: int) -> None:
-        super().__init__(str(id))
-        self.documents_to_prune: set[str] = set()
-
-    @property
-    def generator_task_id_prefix(self) -> str:
-        return f"{self.GENERATOR_TASK_PREFIX}_{self._id}"
-
-    @property
-    def generator_progress_key(self) -> str:
-        # example: connectorpruning_generator_progress_1
-        return f"{self.GENERATOR_PROGRESS_PREFIX}_{self._id}"
-
-    @property
-    def generator_complete_key(self) -> str:
-        # example: connectorpruning_generator_complete_1
-        return f"{self.GENERATOR_COMPLETE_PREFIX}_{self._id}"
-
-    @property
-    def subtask_id_prefix(self) -> str:
-        return f"{self.SUBTASK_PREFIX}_{self._id}"
-
-    def generate_tasks(
-        self,
-        celery_app: Celery,
-        db_session: Session,
-        redis_client: Redis,
-        lock: redis.lock.Lock | None,
-        tenant_id: str | None,
-    ) -> int | None:
-        last_lock_time = time.monotonic()
-
-        async_results = []
-        cc_pair = get_connector_credential_pair_from_id(int(self._id), db_session)
-        if not cc_pair:
-            return None
-
-        for doc_id in self.documents_to_prune:
-            current_time = time.monotonic()
-            if lock and current_time - last_lock_time >= (
-                CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
-            ):
-                lock.reacquire()
-                last_lock_time = current_time
-
-            # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            # the actual redis key is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            # we prefix the task id so it's easier to keep track of who created the task
-            # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac"
-            custom_task_id = f"{self.subtask_id_prefix}_{uuid4()}"
-
-            # add to the tracking taskset in redis BEFORE creating the celery task.
-            # note that for the moment we are using a single taskset key, not differentiated by cc_pair id
-            redis_client.sadd(self.taskset_key, custom_task_id)
-
-            # Priority on sync's triggered by new indexing should be medium
-            result = celery_app.send_task(
-                "document_by_cc_pair_cleanup_task",
-                kwargs=dict(
-                    document_id=doc_id,
-                    connector_id=cc_pair.connector_id,
-                    credential_id=cc_pair.credential_id,
-                    tenant_id=tenant_id,
-                ),
-                queue=DanswerCeleryQueues.CONNECTOR_DELETION,
-                task_id=custom_task_id,
-                priority=DanswerCeleryPriority.MEDIUM,
-            )
-
-            async_results.append(result)
-
-        return len(async_results)
-
-    def is_pruning(self, redis_client: Redis) -> bool:
-        """A single example of a helper method being refactored into the redis helper"""
-        if redis_client.exists(self.fence_key):
-            return True
-
-        return False
-
-
-class RedisConnectorIndexing(RedisObjectHelper):
-    """Celery will kick off a long running indexing task to crawl the connector and
-    find any new or updated docs docs, which will each then get a new sync task or be
-    indexed inline.
-
-    ID should be a concatenation of cc_pair_id and search_setting_id, delimited by "/".
-    e.g. "2/5"
-    """
-
-    PREFIX = "connectorindexing"
-    FENCE_PREFIX = PREFIX + "_fence"  # a fence for the entire indexing process
-    GENERATOR_TASK_PREFIX = PREFIX + "+generator"
-
-    TASKSET_PREFIX = PREFIX + "_taskset"  # stores a list of prune tasks id's
-    SUBTASK_PREFIX = PREFIX + "+sub"
-
-    GENERATOR_LOCK_PREFIX = "da_lock:indexing"
-    GENERATOR_PROGRESS_PREFIX = (
-        PREFIX + "_generator_progress"
-    )  # a signal that contains generator progress
-    GENERATOR_COMPLETE_PREFIX = (
-        PREFIX + "_generator_complete"
-    )  # a signal that the generator has finished
-
-    def __init__(self, cc_pair_id: int, search_settings_id: int) -> None:
-        super().__init__(f"{cc_pair_id}/{search_settings_id}")
-
-    @property
-    def generator_lock_key(self) -> str:
-        return f"{self.GENERATOR_LOCK_PREFIX}_{self._id}"
-
-    @property
-    def generator_task_id_prefix(self) -> str:
-        return f"{self.GENERATOR_TASK_PREFIX}_{self._id}"
-
-    @property
-    def generator_progress_key(self) -> str:
-        # example: connectorpruning_generator_progress_1
-        return f"{self.GENERATOR_PROGRESS_PREFIX}_{self._id}"
-
-    @property
-    def generator_complete_key(self) -> str:
-        # example: connectorpruning_generator_complete_1
-        return f"{self.GENERATOR_COMPLETE_PREFIX}_{self._id}"
-
-    @property
-    def subtask_id_prefix(self) -> str:
-        return f"{self.SUBTASK_PREFIX}_{self._id}"
-
-    def generate_tasks(
-        self,
-        celery_app: Celery,
-        db_session: Session,
-        redis_client: Redis,
-        lock: redis.lock.Lock | None,
-        tenant_id: str | None,
-    ) -> int | None:
-        return None
-
-    def is_indexing(self, redis_client: Redis) -> bool:
-        """A single example of a helper method being refactored into the redis helper"""
-        if redis_client.exists(self.fence_key):
-            return True
-
-        return False
-
-
-class RedisConnectorStop(RedisObjectHelper):
-    """Used to signal any running tasks for a connector to stop. We should refactor
-    connector related redis helpers into a single class.
-    """
-
-    PREFIX = "connectorstop"
-    FENCE_PREFIX = PREFIX + "_fence"  # a fence for the entire indexing process
-    TASKSET_PREFIX = PREFIX + "_taskset"  # stores a list of prune tasks id's
-
-    def __init__(self, id: int) -> None:
-        super().__init__(str(id))
-
-    def generate_tasks(
-        self,
-        celery_app: Celery,
-        db_session: Session,
-        redis_client: Redis,
-        lock: redis.lock.Lock | None,
-        tenant_id: str | None,
-    ) -> int | None:
-        return None


 def celery_get_queue_length(queue: str, r: Redis) -> int:
--- a/backend/danswer/background/celery/celery_utils.py
+++ b/backend/danswer/background/celery/celery_utils.py
@@ -4,8 +4,6 @@ from typing import Any

 from sqlalchemy.orm import Session

-from danswer.background.celery.celery_redis import RedisConnectorDeletion
-from danswer.background.indexing.run_indexing import RunIndexingCallbackInterface
 from danswer.configs.app_configs import MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE
 from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
    rate_limit_builder,
@@ -18,7 +16,8 @@ from danswer.connectors.models import Document
 from danswer.db.connector_credential_pair import get_connector_credential_pair
 from danswer.db.enums import TaskStatus
 from danswer.db.models import TaskQueueState
-from danswer.redis.redis_pool import get_redis_client
+from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
+from danswer.redis.redis_connector import RedisConnector
 from danswer.server.documents.models import DeletionAttemptSnapshot
 from danswer.utils.logger import setup_logger

@@ -41,14 +40,14 @@ def _get_deletion_status(
    if not cc_pair:
        return None

-    rcd = RedisConnectorDeletion(cc_pair.id)
-
-    r = get_redis_client(tenant_id=tenant_id)
-    if not r.exists(rcd.fence_key):
+    redis_connector = RedisConnector(tenant_id, cc_pair.id)
+    if not redis_connector.delete.fenced:
        return None

    return TaskQueueState(
-        task_id="", task_name=rcd.fence_key, status=TaskStatus.STARTED
+        task_id="",
+        task_name=redis_connector.delete.fence_key,
+        status=TaskStatus.STARTED,
    )


@@ -79,10 +78,10 @@ def document_batch_to_ids(

 def extract_ids_from_runnable_connector(
    runnable_connector: BaseConnector,
-    callback: RunIndexingCallbackInterface | None = None,
+    callback: IndexingHeartbeatInterface | None = None,
 ) -> set[str]:
    """
-    If the PruneConnector hasnt been implemented for the given connector, just pull
+    If the SlimConnector hasnt been implemented for the given connector, just pull
    all docs using the load_from_state and grab out the IDs.

    Optionally, a callback can be passed to handle the length of each document batch.
@@ -112,10 +111,15 @@ def extract_ids_from_runnable_connector(
    for doc_batch in doc_batch_generator:
        if callback:
            if callback.should_stop():
-                raise RuntimeError("Stop signal received")
-            callback.progress(len(doc_batch))
+                raise RuntimeError(
+                    "extract_ids_from_runnable_connector: Stop signal detected"
+                )
+
        all_connector_doc_ids.update(doc_batch_processing_func(doc_batch))

+        if callback:
+            callback.progress("extract_ids_from_runnable_connector", len(doc_batch))
+
    return all_connector_doc_ids


--- a/Show More
+++ b/Show More