Mirror of https://github.com/onyx-dot-app/onyx.git (synced 2026-02-16 23:35:46 +00:00)

Compare commits: v0.3.87...debug-test (736 Commits)
(Commit table: only bare SHA1 hashes were captured for the per-commit rows of this comparison; the Author, Date, and commit-message columns are empty in the mirror.)
.github/actions/custom-build-and-push/action.yml (vendored, new file, 76 lines)
@@ -0,0 +1,76 @@
name: 'Build and Push Docker Image with Retry'
description: 'Attempts to build and push a Docker image, with a retry on failure'
inputs:
  context:
    description: 'Build context'
    required: true
  file:
    description: 'Dockerfile location'
    required: true
  platforms:
    description: 'Target platforms'
    required: true
  pull:
    description: 'Always attempt to pull a newer version of the image'
    required: false
    default: 'true'
  push:
    description: 'Push the image to registry'
    required: false
    default: 'true'
  load:
    description: 'Load the image into Docker daemon'
    required: false
    default: 'true'
  tags:
    description: 'Image tags'
    required: true
  cache-from:
    description: 'Cache sources'
    required: false
  cache-to:
    description: 'Cache destinations'
    required: false
  retry-wait-time:
    description: 'Time to wait before retry in seconds'
    required: false
    default: '5'

runs:
  using: "composite"
  steps:
    - name: Build and push Docker image (First Attempt)
      id: buildx1
      uses: docker/build-push-action@v5
      continue-on-error: true
      with:
        context: ${{ inputs.context }}
        file: ${{ inputs.file }}
        platforms: ${{ inputs.platforms }}
        pull: ${{ inputs.pull }}
        push: ${{ inputs.push }}
        load: ${{ inputs.load }}
        tags: ${{ inputs.tags }}
        cache-from: ${{ inputs.cache-from }}
        cache-to: ${{ inputs.cache-to }}

    - name: Wait to retry
      if: steps.buildx1.outcome != 'success'
      run: |
        echo "First attempt failed. Waiting ${{ inputs.retry-wait-time }} seconds before retry..."
        sleep ${{ inputs.retry-wait-time }}
      shell: bash

    - name: Build and push Docker image (Retry Attempt)
      if: steps.buildx1.outcome != 'success'
      uses: docker/build-push-action@v5
      with:
        context: ${{ inputs.context }}
        file: ${{ inputs.file }}
        platforms: ${{ inputs.platforms }}
        pull: ${{ inputs.pull }}
        push: ${{ inputs.push }}
        load: ${{ inputs.load }}
        tags: ${{ inputs.tags }}
        cache-from: ${{ inputs.cache-from }}
        cache-to: ${{ inputs.cache-to }}
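The integration-test workflow added later in this comparison (`run-it.yml`) consumes this composite action as an ordinary step. Below is a minimal sketch of such a call site; the job name and runner label are placeholders, while the action path, inputs, and the `it` image tags mirror what `run-it.yml` actually passes.

```yaml
# Sketch of a caller workflow job for the retrying build action above.
# "build-backend" and "ubuntu-latest" are illustrative placeholders;
# the `with:` inputs correspond to the inputs declared in action.yml.
jobs:
  build-backend:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Buildx is required by docker/build-push-action, which the composite action wraps
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build Backend Docker image (with retry)
        uses: ./.github/actions/custom-build-and-push
        with:
          context: ./backend
          file: ./backend/Dockerfile
          platforms: linux/amd64
          tags: danswer/danswer-backend:it
          cache-from: type=registry,ref=danswer/danswer-backend:it
          cache-to: |
            type=registry,ref=danswer/danswer-backend:it,mode=max
            type=inline
```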
.github/pull_request_template.md (vendored, new file, 25 lines)
@@ -0,0 +1,25 @@
## Description
[Provide a brief description of the changes in this PR]


## How Has This Been Tested?
[Describe the tests you ran to verify your changes]


## Accepted Risk
[Any know risks or failure modes to point out to reviewers]


## Related Issue(s)
[If applicable, link to the issue(s) this PR addresses]


## Checklist:
- [ ] All of the automated tests pass
- [ ] All PR comments are addressed and marked resolved
- [ ] If there are migrations, they have been rebased to latest main
- [ ] If there are new dependencies, they are added to the requirements
- [ ] If there are new environment variables, they are added to all of the deployment methods
- [ ] If there are new APIs that don't require auth, they are added to PUBLIC_ENDPOINT_SPECS
- [ ] Docker images build and basic functionalities work
- [ ] Author has done a final read through of the PR right before merge
@@ -5,9 +5,14 @@ on:
    tags:
      - '*'

env:
  REGISTRY_IMAGE: danswer/danswer-backend

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    # TODO: make this a matrix build like the web containers
    runs-on:
      group: amd64-image-builders

    steps:
      - name: Checkout code
@@ -22,6 +27,11 @@ jobs:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Install build-essential
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential

      - name: Backend Image Docker Build and Push
        uses: docker/build-push-action@v5
        with:
@@ -30,8 +40,8 @@ jobs:
          platforms: linux/amd64,linux/arm64
          push: true
          tags: |
            danswer/danswer-backend:${{ github.ref_name }}
            danswer/danswer-backend:latest
            ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
            ${{ env.REGISTRY_IMAGE }}:latest
          build-args: |
            DANSWER_VERSION=${{ github.ref_name }}

@@ -39,6 +49,6 @@ jobs:
        uses: aquasecurity/trivy-action@master
        with:
          # To run locally: trivy image --severity HIGH,CRITICAL danswer/danswer-backend
          image-ref: docker.io/danswer/danswer-backend:${{ github.ref_name }}
          image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
          severity: 'CRITICAL,HIGH'
          trivyignores: ./backend/.trivyignore
@@ -7,7 +7,8 @@ on:

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    runs-on:
      group: amd64-image-builders

    steps:
      - name: Checkout code
@@ -10,7 +10,7 @@ env:

jobs:
  build:
    runs-on:
    runs-on:
      group: ${{ matrix.platform == 'linux/amd64' && 'amd64-image-builders' || 'arm64-image-builders' }}
    strategy:
      fail-fast: false
@@ -34,8 +34,8 @@ jobs:
        with:
          images: ${{ env.REGISTRY_IMAGE }}
          tags: |
            type=raw,value=danswer/danswer-web-server:${{ github.ref_name }}
            type=raw,value=danswer/danswer-web-server:latest
            type=raw,value=${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
            type=raw,value=${{ env.REGISTRY_IMAGE }}:latest

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
.github/workflows/pr-helm-chart-testing.yml.disabled.txt (vendored, new file, 67 lines)
@@ -0,0 +1,67 @@
# This workflow is intentionally disabled while we're still working on it
# It's close to ready, but a race condition needs to be fixed with
# API server and Vespa startup, and it needs to have a way to build/test against
# local containers

name: Helm - Lint and Test Charts

on:
  merge_group:
  pull_request:
    branches: [ main ]

jobs:
  lint-test:
    runs-on: Amd64

    # fetch-depth 0 is required for helm/chart-testing-action
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - name: Set up Helm
        uses: azure/setup-helm@v4.2.0
        with:
          version: v3.14.4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
          cache: 'pip'
          cache-dependency-path: |
            backend/requirements/default.txt
            backend/requirements/dev.txt
            backend/requirements/model_server.txt
      - run: |
          python -m pip install --upgrade pip
          pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
          pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
          pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt

      - name: Set up chart-testing
        uses: helm/chart-testing-action@v2.6.1

      - name: Run chart-testing (list-changed)
        id: list-changed
        run: |
          changed=$(ct list-changed --target-branch ${{ github.event.repository.default_branch }})
          if [[ -n "$changed" ]]; then
            echo "changed=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Run chart-testing (lint)
        # if: steps.list-changed.outputs.changed == 'true'
        run: ct lint --all --config ct.yaml --target-branch ${{ github.event.repository.default_branch }}

      - name: Create kind cluster
        # if: steps.list-changed.outputs.changed == 'true'
        uses: helm/kind-action@v1.10.0

      - name: Run chart-testing (install)
        # if: steps.list-changed.outputs.changed == 'true'
        run: ct install --all --config ct.yaml
        # run: ct install --target-branch ${{ github.event.repository.default_branch }}
.github/workflows/pr-python-checks.yml (vendored, 7 lines changed)
@@ -1,6 +1,7 @@
name: Python Checks

on:
  merge_group:
  pull_request:
    branches: [ main ]

@@ -23,9 +24,9 @@ jobs:
            backend/requirements/model_server.txt
      - run: |
          python -m pip install --upgrade pip
          pip install -r backend/requirements/default.txt
          pip install -r backend/requirements/dev.txt
          pip install -r backend/requirements/model_server.txt
          pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
          pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
          pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt

      - name: Run MyPy
        run: |
.github/workflows/pr-python-connector-tests.yml (vendored, new file, 57 lines)
@@ -0,0 +1,57 @@
name: Connector Tests

on:
  pull_request:
    branches: [main]
  schedule:
    # This cron expression runs the job daily at 16:00 UTC (9am PT)
    - cron: "0 16 * * *"

env:
  # Confluence
  CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
  CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
  CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
  CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
  CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
  CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}

jobs:
  connectors-check:
    runs-on: ubuntu-latest

    env:
      PYTHONPATH: ./backend

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"
          cache: "pip"
          cache-dependency-path: |
            backend/requirements/default.txt
            backend/requirements/dev.txt

      - name: Install Dependencies
        run: |
          python -m pip install --upgrade pip
          pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
          pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt

      - name: Run Tests
        shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
        run: py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/connectors

      - name: Alert on Failure
        if: failure() && github.event_name == 'schedule'
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
        run: |
          curl -X POST \
            -H 'Content-type: application/json' \
            --data '{"text":"Scheduled Connector Tests failed! Check the run at: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"}' \
            $SLACK_WEBHOOK
.github/workflows/pr-python-tests.yml (vendored, 8 lines changed)
@@ -1,6 +1,7 @@
name: Python Unit Tests

on:
  merge_group:
  pull_request:
    branches: [ main ]

@@ -10,7 +11,8 @@ jobs:

    env:
      PYTHONPATH: ./backend

      REDIS_CLOUD_PYTEST_PASSWORD: ${{ secrets.REDIS_CLOUD_PYTEST_PASSWORD }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
@@ -27,8 +29,8 @@ jobs:
      - name: Install Dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r backend/requirements/default.txt
          pip install -r backend/requirements/dev.txt
          pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
          pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt

      - name: Run Tests
        shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
.github/workflows/pr-quality-checks.yml (vendored, 19 lines changed)
@@ -4,18 +4,19 @@ concurrency:
  cancel-in-progress: true

on:
  merge_group:
  pull_request: null

jobs:
  quality-checks:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - uses: pre-commit/action@v3.0.0
        with:
          extra_args: --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }}
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - uses: pre-commit/action@v3.0.0
        with:
          extra_args: ${{ github.event_name == 'pull_request' && format('--from-ref {0} --to-ref {1}', github.event.pull_request.base.sha, github.event.pull_request.head.sha) || '' }}
.github/workflows/run-it.yml (vendored, new file, 161 lines)
@@ -0,0 +1,161 @@
name: Run Integration Tests
concurrency:
  group: Run-Integration-Tests-${{ github.head_ref }}
  cancel-in-progress: true

on:
  merge_group:
  pull_request:
    branches: [ main ]

env:
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

jobs:
  integration-tests:
    runs-on: Amd64
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_TOKEN }}

      # NOTE: we don't need to build the Web Docker image since it's not used
      # during the IT for now. We have a separate action to verify it builds
      # succesfully
      - name: Pull Web Docker image
        run: |
          docker pull danswer/danswer-web-server:latest
          docker tag danswer/danswer-web-server:latest danswer/danswer-web-server:it

      - name: Build Backend Docker image
        uses: ./.github/actions/custom-build-and-push
        with:
          context: ./backend
          file: ./backend/Dockerfile
          platforms: linux/amd64
          tags: danswer/danswer-backend:it
          cache-from: type=registry,ref=danswer/danswer-backend:it
          cache-to: |
            type=registry,ref=danswer/danswer-backend:it,mode=max
            type=inline

      - name: Build Model Server Docker image
        uses: ./.github/actions/custom-build-and-push
        with:
          context: ./backend
          file: ./backend/Dockerfile.model_server
          platforms: linux/amd64
          tags: danswer/danswer-model-server:it
          cache-from: type=registry,ref=danswer/danswer-model-server:it
          cache-to: |
            type=registry,ref=danswer/danswer-model-server:it,mode=max
            type=inline

      - name: Build integration test Docker image
        uses: ./.github/actions/custom-build-and-push
        with:
          context: ./backend
          file: ./backend/tests/integration/Dockerfile
          platforms: linux/amd64
          tags: danswer/integration-test-runner:it
          cache-from: type=registry,ref=danswer/integration-test-runner:it
          cache-to: |
            type=registry,ref=danswer/integration-test-runner:it,mode=max
            type=inline

      - name: Start Docker containers
        run: |
          cd deployment/docker_compose
          ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \
          AUTH_TYPE=basic \
          REQUIRE_EMAIL_VERIFICATION=false \
          DISABLE_TELEMETRY=true \
          IMAGE_TAG=it \
          docker compose -f docker-compose.dev.yml -p danswer-stack up -d
        id: start_docker

      - name: Wait for service to be ready
        run: |
          echo "Starting wait-for-service script..."

          docker logs -f danswer-stack-api_server-1 &

          start_time=$(date +%s)
          timeout=300  # 5 minutes in seconds

          while true; do
            current_time=$(date +%s)
            elapsed_time=$((current_time - start_time))

            if [ $elapsed_time -ge $timeout ]; then
              echo "Timeout reached. Service did not become ready in 5 minutes."
              exit 1
            fi

            # Use curl with error handling to ignore specific exit code 56
            response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/health || echo "curl_error")

            if [ "$response" = "200" ]; then
              echo "Service is ready!"
              break
            elif [ "$response" = "curl_error" ]; then
              echo "Curl encountered an error, possibly exit code 56. Continuing to retry..."
            else
              echo "Service not ready yet (HTTP status $response). Retrying in 5 seconds..."
            fi

            sleep 5
          done
          echo "Finished waiting for service."

      - name: Run integration tests
        run: |
          echo "Running integration tests..."
          docker run --rm --network danswer-stack_default \
            -e POSTGRES_HOST=relational_db \
            -e POSTGRES_USER=postgres \
            -e POSTGRES_PASSWORD=password \
            -e POSTGRES_DB=postgres \
            -e VESPA_HOST=index \
            -e REDIS_HOST=cache \
            -e API_SERVER_HOST=api_server \
            -e OPENAI_API_KEY=${OPENAI_API_KEY} \
            danswer/integration-test-runner:it
        continue-on-error: true
        id: run_tests

      - name: Check test results
        run: |
          if [ ${{ steps.run_tests.outcome }} == 'failure' ]; then
            echo "Integration tests failed. Exiting with error."
            exit 1
          else
            echo "All integration tests passed successfully."
          fi

      - name: Save Docker logs
        if: success() || failure()
        run: |
          cd deployment/docker_compose
          docker compose -f docker-compose.dev.yml -p danswer-stack logs > docker-compose.log
          mv docker-compose.log ${{ github.workspace }}/docker-compose.log

      - name: Upload logs
        if: success() || failure()
        uses: actions/upload-artifact@v3
        with:
          name: docker-logs
          path: ${{ github.workspace }}/docker-compose.log

      - name: Stop Docker containers
        run: |
          cd deployment/docker_compose
          docker compose -f docker-compose.dev.yml -p danswer-stack down -v
.gitignore (vendored, 4 lines changed)
@@ -4,4 +4,6 @@
.mypy_cache
.idea
/deployment/data/nginx/app.conf
.vscode/launch.json
.vscode/
*.sw?
/backend/tests/regression/answer_quality/search_test_config.yaml
.vscode/env_template.txt (vendored, 34 lines changed)
@@ -1,20 +1,23 @@
# Copy this file to .env at the base of the repo and fill in the <REPLACE THIS> values
# This will help with development iteration speed and reduce repeat tasks for dev
# Copy this file to .env in the .vscode folder
# Fill in the <REPLACE THIS> values as needed, it is recommended to set the GEN_AI_API_KEY value to avoid having to set up an LLM in the UI
# Also check out danswer/backend/scripts/restart_containers.sh for a script to restart the containers which Danswer relies on outside of VSCode/Cursor processes

# For local dev, often user Authentication is not needed
AUTH_TYPE=disabled
# This passes top N results to LLM an additional time for reranking prior to answer generation, quite token heavy so we disable it for dev generally
DISABLE_LLM_CHUNK_FILTER=True


# Always keep these on for Dev
# Logs all model prompts to stdout
LOG_ALL_MODEL_INTERACTIONS=True
LOG_DANSWER_MODEL_INTERACTIONS=True
# More verbose logging
LOG_LEVEL=debug


# This passes top N results to LLM an additional time for reranking prior to answer generation
# This step is quite heavy on token usage so we disable it for dev generally
DISABLE_LLM_DOC_RELEVANCE=False


# Useful if you want to toggle auth on/off (google_oauth/OIDC specifically)
OAUTH_CLIENT_ID=<REPLACE THIS>
OAUTH_CLIENT_SECRET=<REPLACE THIS>
@@ -22,15 +25,11 @@ OAUTH_CLIENT_SECRET=<REPLACE THIS>
REQUIRE_EMAIL_VERIFICATION=False


# Toggles on/off the EE Features
NEXT_PUBLIC_ENABLE_PAID_EE_FEATURES=False


# Set these so if you wipe the DB, you don't end up having to go through the UI every time
GEN_AI_API_KEY=<REPLACE THIS>
# If answer quality isn't important for dev, use 3.5 turbo due to it being cheaper
GEN_AI_MODEL_VERSION=gpt-3.5-turbo
FAST_GEN_AI_MODEL_VERSION=gpt-3.5-turbo
# If answer quality isn't important for dev, use gpt-4o-mini since it's cheaper
GEN_AI_MODEL_VERSION=gpt-4o
FAST_GEN_AI_MODEL_VERSION=gpt-4o

# For Danswer Slack Bot, overrides the UI values so no need to set this up via UI every time
# Only needed if using DanswerBot
@@ -39,5 +38,14 @@ FAST_GEN_AI_MODEL_VERSION=gpt-3.5-turbo


# Python stuff
PYTHONPATH=./backend
PYTHONPATH=../backend
PYTHONUNBUFFERED=1


# Internet Search
BING_API_KEY=<REPLACE THIS>


# Enable the full set of Danswer Enterprise Edition features
# NOTE: DO NOT ENABLE THIS UNLESS YOU HAVE A PAID ENTERPRISE LICENSE (or if you are using this for local testing/development)
ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=False
.vscode/launch.template.jsonc (vendored, 78 lines changed)
@@ -1,15 +1,23 @@
/*

Copy this file into '.vscode/launch.json' or merge its
contents into your existing configurations.

*/
/* Copy this file into '.vscode/launch.json' or merge its contents into your existing configurations. */

{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "compounds": [
        {
            "name": "Run All Danswer Services",
            "configurations": [
                "Web Server",
                "Model Server",
                "API Server",
                "Indexing",
                "Background Jobs",
                "Slack Bot"
            ]
        }
    ],
    "configurations": [
        {
            "name": "Web Server",
@@ -17,6 +25,7 @@
            "request": "launch",
            "cwd": "${workspaceRoot}/web",
            "runtimeExecutable": "npm",
            "envFile": "${workspaceFolder}/.vscode/.env",
            "runtimeArgs": [
                "run", "dev"
            ],
@@ -24,10 +33,12 @@
        },
        {
            "name": "Model Server",
            "type": "python",
            "consoleName": "Model Server",
            "type": "debugpy",
            "request": "launch",
            "module": "uvicorn",
            "cwd": "${workspaceFolder}/backend",
            "envFile": "${workspaceFolder}/.vscode/.env",
            "env": {
                "LOG_LEVEL": "DEBUG",
                "PYTHONUNBUFFERED": "1"
@@ -41,12 +52,14 @@
        },
        {
            "name": "API Server",
            "type": "python",
            "consoleName": "API Server",
            "type": "debugpy",
            "request": "launch",
            "module": "uvicorn",
            "cwd": "${workspaceFolder}/backend",
            "envFile": "${workspaceFolder}/.vscode/.env",
            "env": {
                "LOG_ALL_MODEL_INTERACTIONS": "True",
                "LOG_DANSWER_MODEL_INTERACTIONS": "True",
                "LOG_LEVEL": "DEBUG",
                "PYTHONUNBUFFERED": "1"
            },
@@ -59,12 +72,14 @@
        },
        {
            "name": "Indexing",
            "type": "python",
            "consoleName": "Indexing",
            "type": "debugpy",
            "request": "launch",
            "program": "danswer/background/update.py",
            "cwd": "${workspaceFolder}/backend",
            "envFile": "${workspaceFolder}/.vscode/.env",
            "env": {
                "ENABLE_MINI_CHUNK": "false",
                "ENABLE_MULTIPASS_INDEXING": "false",
                "LOG_LEVEL": "DEBUG",
                "PYTHONUNBUFFERED": "1",
                "PYTHONPATH": "."
@@ -73,11 +88,14 @@
        // Celery and all async jobs, usually would include indexing as well but this is handled separately above for dev
        {
            "name": "Background Jobs",
            "type": "python",
            "consoleName": "Background Jobs",
            "type": "debugpy",
            "request": "launch",
            "program": "scripts/dev_run_background_jobs.py",
            "cwd": "${workspaceFolder}/backend",
            "envFile": "${workspaceFolder}/.vscode/.env",
            "env": {
                "LOG_DANSWER_MODEL_INTERACTIONS": "True",
                "LOG_LEVEL": "DEBUG",
                "PYTHONUNBUFFERED": "1",
                "PYTHONPATH": "."
@@ -90,16 +108,46 @@
        // DANSWER_BOT_SLACK_APP_TOKEN & DANSWER_BOT_SLACK_BOT_TOKEN need to be set in .env file located in the root of the project
        {
            "name": "Slack Bot",
            "type": "python",
            "consoleName": "Slack Bot",
            "type": "debugpy",
            "request": "launch",
            "program": "danswer/danswerbot/slack/listener.py",
            "cwd": "${workspaceFolder}/backend",
            "envFile": "${workspaceFolder}/.env",
            "envFile": "${workspaceFolder}/.vscode/.env",
            "env": {
                "LOG_LEVEL": "DEBUG",
                "PYTHONUNBUFFERED": "1",
                "PYTHONPATH": "."
            }
        },
        {
            "name": "Pytest",
            "consoleName": "Pytest",
            "type": "debugpy",
            "request": "launch",
            "module": "pytest",
            "cwd": "${workspaceFolder}/backend",
            "envFile": "${workspaceFolder}/.vscode/.env",
            "env": {
                "LOG_LEVEL": "DEBUG",
                "PYTHONUNBUFFERED": "1",
                "PYTHONPATH": "."
            },
            "args": [
                "-v"
                // Specify a sepcific module/test to run or provide nothing to run all tests
                //"tests/unit/danswer/llm/answering/test_prune_and_merge.py"
            ]
        },
        {
            "name": "Clear and Restart External Volumes and Containers",
            "type": "node",
            "request": "launch",
            "runtimeExecutable": "bash",
            "runtimeArgs": ["${workspaceFolder}/backend/scripts/restart_containers.sh"],
            "cwd": "${workspaceFolder}",
            "console": "integratedTerminal",
            "stopOnEntry": true
        }
    ]
}
}
CONTRIBUTING.md (105 lines changed)
@@ -48,23 +48,26 @@ We would love to see you there!


## Get Started 🚀
Danswer being a fully functional app, relies on some external pieces of software, specifically:
Danswer being a fully functional app, relies on some external software, specifically:
- [Postgres](https://www.postgresql.org/) (Relational DB)
- [Vespa](https://vespa.ai/) (Vector DB/Search Engine)
- [Redis](https://redis.io/) (Cache)
- [Nginx](https://nginx.org/) (Not needed for development flows generally)

This guide provides instructions to set up the Danswer specific services outside of Docker because it's easier for
development purposes but also feel free to just use the containers and update with local changes by providing the
`--build` flag.

> **Note:**
> This guide provides instructions to build and run Danswer locally from source with Docker containers providing the above external software. We believe this combination is easier for
> development purposes. If you prefer to use pre-built container images, we provide instructions on running the full Danswer stack within Docker below.


### Local Set Up
It is recommended to use Python version 3.11
Be sure to use Python version 3.11. For instructions on installing Python 3.11 on macOS, refer to the [CONTRIBUTING_MACOS.md](./CONTRIBUTING_MACOS.md) readme.

If using a lower version, modifications will have to be made to the code.
If using a higher version, the version of Tensorflow we use may not be available for your platform.
If using a higher version, sometimes some libraries will not be available (i.e. we had problems with Tensorflow in the past with higher versions of python).


#### Installing Requirements
#### Backend: Python requirements
Currently, we use pip and recommend creating a virtual environment.

For convenience here's a command for it:
@@ -72,6 +75,11 @@ For convenience here's a command for it:
python -m venv .venv
source .venv/bin/activate
```

> **Note:**
> This virtual environment MUST NOT be set up WITHIN the danswer directory if you plan on using mypy within certain IDEs.
> For simplicity, we recommend setting up the virtual environment outside of the danswer directory.

_For Windows, activate the virtual environment using Command Prompt:_
```bash
.venv\Scripts\activate
@@ -85,34 +93,38 @@ Install the required python dependencies:
```bash
pip install -r danswer/backend/requirements/default.txt
pip install -r danswer/backend/requirements/dev.txt
pip install -r danswer/backend/requirements/ee.txt
pip install -r danswer/backend/requirements/model_server.txt
```

Install Playwright for Python (headless browser required by the Web Connector)

In the activated Python virtualenv, install Playwright for Python by running:
```bash
playwright install
```

You may have to deactivate and reactivate your virtualenv for `playwright` to appear on your path.

#### Frontend: Node dependencies

Install [Node.js and npm](https://docs.npmjs.com/downloading-and-installing-node-js-and-npm) for the frontend.
Once the above is done, navigate to `danswer/web` run:
```bash
npm i
```

Install Playwright (required by the Web Connector)
#### Docker containers for external software
You will need Docker installed to run these containers.

> Note: If you have just done the pip install, open a new terminal and source the python virtual-env again.
This will update the path to include playwright

Then install Playwright by running:
First navigate to `danswer/deployment/docker_compose`, then start up Postgres/Vespa/Redis with:
```bash
playwright install
docker compose -f docker-compose.dev.yml -p danswer-stack up -d index relational_db cache
```
(index refers to Vespa, relational_db refers to Postgres, and cache refers to Redis)


#### Dependent Docker Containers
First navigate to `danswer/deployment/docker_compose`, then start up Vespa and Postgres with:
```bash
docker compose -f docker-compose.dev.yml -p danswer-stack up -d index relational_db
```
(index refers to Vespa and relational_db refers to Postgres)

#### Running Danswer
#### Running Danswer locally
To start the frontend, navigate to `danswer/web` and run:
```bash
npm run dev
@@ -123,11 +135,10 @@ Navigate to `danswer/backend` and run:
```bash
uvicorn model_server.main:app --reload --port 9000
```

_For Windows (for compatibility with both PowerShell and Command Prompt):_
```bash
powershell -Command "
    uvicorn model_server.main:app --reload --port 9000
"
powershell -Command "uvicorn model_server.main:app --reload --port 9000"
```

The first time running Danswer, you will need to run the DB migrations for Postgres.
@@ -150,6 +161,7 @@ To run the backend API server, navigate back to `danswer/backend` and run:
```bash
AUTH_TYPE=disabled uvicorn danswer.main:app --reload --port 8080
```

_For Windows (for compatibility with both PowerShell and Command Prompt):_
```bash
powershell -Command "
@@ -158,20 +170,58 @@ powershell -Command "
"
```

Note: if you need finer logging, add the additional environment variable `LOG_LEVEL=DEBUG` to the relevant services.
> **Note:**
> If you need finer logging, add the additional environment variable `LOG_LEVEL=DEBUG` to the relevant services.

#### Wrapping up

You should now have 4 servers running:

- Web server
- Backend API
- Model server
- Background jobs

Now, visit `http://localhost:3000` in your browser. You should see the Danswer onboarding wizard where you can connect your external LLM provider to Danswer.

You've successfully set up a local Danswer instance! 🏁

#### Running the Danswer application in a container

You can run the full Danswer application stack from pre-built images including all external software dependencies.

Navigate to `danswer/deployment/docker_compose` and run:

```bash
docker compose -f docker-compose.dev.yml -p danswer-stack up -d
```

After Docker pulls and starts these containers, navigate to `http://localhost:3000` to use Danswer.

If you want to make changes to Danswer and run those changes in Docker, you can also build a local version of the Danswer container images that incorporates your changes like so:

```bash
docker compose -f docker-compose.dev.yml -p danswer-stack up -d --build
```

### Formatting and Linting
#### Backend
For the backend, you'll need to setup pre-commit hooks (black / reorder-python-imports).
First, install pre-commit (if you don't have it already) following the instructions
[here](https://pre-commit.com/#installation).

With the virtual environment active, install the pre-commit library with:
```bash
pip install pre-commit
```

Then, from the `danswer/backend` directory, run:
```bash
pre-commit install
```

Additionally, we use `mypy` for static type checking.
Danswer is fully type-annotated, and we would like to keep it that way!
Danswer is fully type-annotated, and we want to keep it that way!
To run the mypy checks manually, run `python -m mypy .` from the `danswer/backend` directory.


@@ -182,6 +232,7 @@ Please double check that prettier passes before creating a pull request.


### Release Process
Danswer follows the semver versioning standard.
Danswer loosely follows the SemVer versioning standard.
Major changes are released with a "minor" version bump. Currently we use patch release versions to indicate small feature changes.
A set of Docker containers will be pushed automatically to DockerHub with every tag.
You can see the containers [here](https://hub.docker.com/search?q=danswer%2F).
CONTRIBUTING_MACOS.md (new file, 31 lines)
@@ -0,0 +1,31 @@
## Some additional notes for Mac Users
The base instructions to set up the development environment are located in [CONTRIBUTING.md](https://github.com/danswer-ai/danswer/blob/main/CONTRIBUTING.md).

### Setting up Python
Ensure [Homebrew](https://brew.sh/) is already set up.

Then install python 3.11.
```bash
brew install python@3.11
```

Add python 3.11 to your path: add the following line to ~/.zshrc
```
export PATH="$(brew --prefix)/opt/python@3.11/libexec/bin:$PATH"
```

> **Note:**
> You will need to open a new terminal for the path change above to take effect.


### Setting up Docker
On macOS, you will need to install [Docker Desktop](https://www.docker.com/products/docker-desktop/) and
ensure it is running before continuing with the docker commands.


### Formatting and Linting
MacOS will likely require you to remove some quarantine attributes on some of the hooks for them to execute properly.
After installing pre-commit, run the following command:
```bash
sudo xattr -r -d com.apple.quarantine ~/.cache/pre-commit
```
LICENSE (8 lines changed)
@@ -1,6 +1,10 @@
MIT License
Copyright (c) 2023-present DanswerAI, Inc.

Copyright (c) 2023 Yuhong Sun, Chris Weaver
Portions of this software are licensed as follows:

* All content that resides under "ee" directories of this repository, if that directory exists, is licensed under the license defined in "backend/ee/LICENSE". Specifically all content under "backend/ee" and "web/src/app/ee" is licensed under the license defined in "backend/ee/LICENSE".
* All third party components incorporated into the Danswer Software are licensed under the original license provided by the owner of the applicable component.
* Content outside of the above mentioned directories or restrictions above is available under the "MIT Expat" license as defined below.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
README.md (22 lines changed)
@@ -11,7 +11,7 @@
  <a href="https://docs.danswer.dev/" target="_blank">
    <img src="https://img.shields.io/badge/docs-view-blue" alt="Documentation">
  </a>
  <a href="https://join.slack.com/t/danswer/shared_invite/zt-1w76msxmd-HJHLe3KNFIAIzk_0dSOKaQ" target="_blank">
  <a href="https://join.slack.com/t/danswer/shared_invite/zt-2lcmqw703-071hBuZBfNEOGUsLa5PXvQ" target="_blank">
    <img src="https://img.shields.io/badge/slack-join-blue.svg?logo=slack" alt="Slack">
  </a>
  <a href="https://discord.gg/TDJ59cGV2X" target="_blank">
@@ -105,5 +105,25 @@ Efficiently pulls the latest changes from:
* Websites
* And more ...

## 📚 Editions

There are two editions of Danswer:

* Danswer Community Edition (CE) is available freely under the MIT Expat license. This version has ALL the core features discussed above. This is the version of Danswer you will get if you follow the Deployment guide above.
* Danswer Enterprise Edition (EE) includes extra features that are primarily useful for larger organizations. Specifically, this includes:
  * Single Sign-On (SSO), with support for both SAML and OIDC
  * Role-based access control
  * Document permission inheritance from connected sources
  * Usage analytics and query history accessible to admins
  * Whitelabeling
  * API key authentication
  * Encryption of secrets
  * Any many more! Checkout [our website](https://www.danswer.ai/) for the latest.

To try the Danswer Enterprise Edition:

1. Checkout our [Cloud product](https://app.danswer.ai/signup).
2. For self-hosting, contact us at [founders@danswer.ai](mailto:founders@danswer.ai) or book a call with us on our [Cal](https://cal.com/team/danswer/founders).

## 💡 Contributing
Looking to contribute? Please check out the [Contribution Guide](CONTRIBUTING.md) for more details.
backend/.gitignore (vendored, 2 lines changed)
@@ -5,7 +5,7 @@ site_crawls/
.ipynb_checkpoints/
api_keys.py
*ipynb
.env
.env*
vespa-app.zip
dynamic_config_storage/
celerybeat-schedule*
@@ -1,15 +1,18 @@
|
||||
FROM python:3.11.7-slim-bookworm
|
||||
|
||||
LABEL com.danswer.maintainer="founders@danswer.ai"
|
||||
LABEL com.danswer.description="This image is for the backend of Danswer. It is MIT Licensed and \
|
||||
free for all to use. You can find it at https://hub.docker.com/r/danswer/danswer-backend. For \
|
||||
more details, visit https://github.com/danswer-ai/danswer."
|
||||
LABEL com.danswer.description="This image is the web/frontend container of Danswer which \
|
||||
contains code for both the Community and Enterprise editions of Danswer. If you do not \
|
||||
have a contract or agreement with DanswerAI, you are not permitted to use the Enterprise \
|
||||
Edition features outside of personal development or testing purposes. Please reach out to \
|
||||
founders@danswer.ai for more information. Please visit https://github.com/danswer-ai/danswer"
|
||||
|
||||
# Default DANSWER_VERSION, typically overriden during builds by GitHub Actions.
|
||||
ARG DANSWER_VERSION=0.3-dev
|
||||
ENV DANSWER_VERSION=${DANSWER_VERSION}
|
||||
RUN echo "DANSWER_VERSION: ${DANSWER_VERSION}"
|
||||
ENV DANSWER_VERSION=${DANSWER_VERSION} \
|
||||
DANSWER_RUNNING_IN_DOCKER="true"
|
||||
|
||||
RUN echo "DANSWER_VERSION: ${DANSWER_VERSION}"
|
||||
# Install system dependencies
|
||||
# cmake needed for psycopg (postgres)
|
||||
# libpq-dev needed for psycopg (postgres)
|
||||
@@ -17,18 +20,34 @@ RUN echo "DANSWER_VERSION: ${DANSWER_VERSION}"
|
||||
# zip for the Vespa step further down
|
||||
# ca-certificates for HTTPS
|
||||
RUN apt-get update && \
|
||||
apt-get install -y cmake curl zip ca-certificates libgnutls30=3.7.9-2+deb12u2 \
|
||||
libblkid1=2.38.1-5+deb12u1 libmount1=2.38.1-5+deb12u1 libsmartcols1=2.38.1-5+deb12u1 \
|
||||
libuuid1=2.38.1-5+deb12u1 && \
|
||||
apt-get install -y \
|
||||
cmake \
|
||||
curl \
|
||||
zip \
|
||||
ca-certificates \
|
||||
libgnutls30=3.7.9-2+deb12u3 \
|
||||
libblkid1=2.38.1-5+deb12u1 \
|
||||
libmount1=2.38.1-5+deb12u1 \
|
||||
libsmartcols1=2.38.1-5+deb12u1 \
|
||||
libuuid1=2.38.1-5+deb12u1 \
|
||||
libxmlsec1-dev \
|
||||
pkg-config \
|
||||
gcc && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
apt-get clean
|
||||
|
||||
# Install Python dependencies
|
||||
# Remove py, which is pulled in by retry; py is not needed and has a known CVE
|
||||
COPY ./requirements/default.txt /tmp/requirements.txt
|
||||
RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt && \
|
||||
COPY ./requirements/ee.txt /tmp/ee-requirements.txt
|
||||
RUN pip install --no-cache-dir --upgrade \
|
||||
--retries 5 \
|
||||
--timeout 30 \
|
||||
-r /tmp/requirements.txt \
|
||||
-r /tmp/ee-requirements.txt && \
|
||||
pip uninstall -y py && \
|
||||
playwright install chromium && playwright install-deps chromium && \
|
||||
playwright install chromium && \
|
||||
playwright install-deps chromium && \
|
||||
ln -s /usr/local/bin/supervisord /usr/bin/supervisord
|
||||
|
||||
# Cleanup for CVEs and size reduction
|
||||
@@ -36,29 +55,52 @@ RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt && \
|
||||
# xserver-common and xvfb included by playwright installation but not needed after
|
||||
# perl-base is part of the base Python Debian image but not needed for Danswer functionality
|
||||
# perl-base could only be removed with --allow-remove-essential
|
||||
RUN apt-get remove -y --allow-remove-essential perl-base xserver-common xvfb cmake \
|
||||
libldap-2.5-0 libldap-2.5-0 && \
|
||||
RUN apt-get update && \
|
||||
apt-get remove -y --allow-remove-essential \
|
||||
perl-base \
|
||||
xserver-common \
|
||||
xvfb \
|
||||
cmake \
|
||||
libldap-2.5-0 \
|
||||
libxmlsec1-dev \
|
||||
pkg-config \
|
||||
gcc && \
|
||||
apt-get install -y libxmlsec1-openssl && \
|
||||
apt-get autoremove -y && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
rm /usr/local/lib/python3.11/site-packages/tornado/test/test.key
|
||||
rm -f /usr/local/lib/python3.11/site-packages/tornado/test/test.key
|
||||
|
||||
# Pre-downloading models for setups with limited egress
|
||||
RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('intfloat/e5-base-v2')"
|
||||
RUN python -c "from tokenizers import Tokenizer; \
|
||||
Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"
|
||||
|
||||
|
||||
# Pre-downloading NLTK for setups with limited egress
|
||||
RUN python -c "import nltk; \
|
||||
nltk.download('stopwords', quiet=True); \
|
||||
nltk.download('wordnet', quiet=True); \
|
||||
nltk.download('punkt', quiet=True);"
|
||||
# nltk.download('wordnet', quiet=True); introduce this back if lemmatization is needed
|
||||
|
||||
# Set up application files
|
||||
WORKDIR /app
|
||||
|
||||
# Enterprise Version Files
|
||||
COPY ./ee /app/ee
|
||||
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
||||
|
||||
# Set up application files
|
||||
COPY ./danswer /app/danswer
|
||||
COPY ./shared_configs /app/shared_configs
|
||||
COPY ./alembic /app/alembic
|
||||
COPY ./alembic.ini /app/alembic.ini
|
||||
COPY supervisord.conf /usr/etc/supervisord.conf
|
||||
|
||||
# Escape hatch
|
||||
COPY ./scripts/force_delete_connector_by_id.py /app/scripts/force_delete_connector_by_id.py
|
||||
|
||||
# Put logo in assets
|
||||
COPY ./assets /app/assets
|
||||
|
||||
ENV PYTHONPATH /app
|
||||
|
||||
# Default command which does nothing
|
||||
|
||||
@@ -8,24 +8,38 @@ visit https://github.com/danswer-ai/danswer."
|
||||
|
||||
# Default DANSWER_VERSION, typically overridden during builds by GitHub Actions.
|
||||
ARG DANSWER_VERSION=0.3-dev
|
||||
ENV DANSWER_VERSION=${DANSWER_VERSION}
|
||||
ENV DANSWER_VERSION=${DANSWER_VERSION} \
|
||||
DANSWER_RUNNING_IN_DOCKER="true"
|
||||
|
||||
|
||||
RUN echo "DANSWER_VERSION: ${DANSWER_VERSION}"
|
||||
|
||||
COPY ./requirements/model_server.txt /tmp/requirements.txt
|
||||
RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt
|
||||
RUN pip install --no-cache-dir --upgrade \
|
||||
--retries 5 \
|
||||
--timeout 30 \
|
||||
-r /tmp/requirements.txt
|
||||
|
||||
RUN apt-get remove -y --allow-remove-essential perl-base && \
|
||||
apt-get autoremove -y
|
||||
|
||||
# Pre-downloading models for setups with limited egress
|
||||
RUN python -c "from transformers import AutoModel, AutoTokenizer, TFDistilBertForSequenceClassification; \
|
||||
from huggingface_hub import snapshot_download; \
|
||||
AutoTokenizer.from_pretrained('danswer/intent-model'); \
|
||||
AutoTokenizer.from_pretrained('intfloat/e5-base-v2'); \
|
||||
# Download tokenizers, distilbert for the Danswer model
|
||||
# Download model weights
|
||||
# Run Nomic to pull in the custom architecture and have it cached locally
|
||||
RUN python -c "from transformers import AutoTokenizer; \
|
||||
AutoTokenizer.from_pretrained('distilbert-base-uncased'); \
|
||||
AutoTokenizer.from_pretrained('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
|
||||
snapshot_download('danswer/intent-model'); \
|
||||
snapshot_download('intfloat/e5-base-v2'); \
|
||||
snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1')"
|
||||
from huggingface_hub import snapshot_download; \
|
||||
snapshot_download(repo_id='danswer/hybrid-intent-token-classifier', revision='v1.0.3'); \
|
||||
snapshot_download('nomic-ai/nomic-embed-text-v1'); \
|
||||
snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
|
||||
from sentence_transformers import SentenceTransformer; \
|
||||
SentenceTransformer(model_name_or_path='nomic-ai/nomic-embed-text-v1', trust_remote_code=True);"
|
||||
|
||||
# In case the user has volumes mounted to /root/.cache/huggingface that they've downloaded models into while
# running Danswer, don't overwrite them with the built-in cache folder
|
||||
RUN mv /root/.cache/huggingface /root/.cache/temp_huggingface
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ from sqlalchemy import pool
|
||||
from sqlalchemy.engine import Connection
|
||||
from sqlalchemy.ext.asyncio import create_async_engine
|
||||
from celery.backends.database.session import ResultModelBase # type: ignore
|
||||
from sqlalchemy.schema import SchemaItem
|
||||
|
||||
# this is the Alembic Config object, which provides
|
||||
# access to the values within the .ini file in use.
|
||||
@@ -15,7 +16,9 @@ config = context.config
|
||||
|
||||
# Interpret the config file for Python logging.
|
||||
# This line sets up loggers basically.
|
||||
if config.config_file_name is not None:
|
||||
if config.config_file_name is not None and config.attributes.get(
|
||||
"configure_logger", True
|
||||
):
|
||||
fileConfig(config.config_file_name)
|
||||
|
||||
# add your model's MetaData object here
|
||||
@@ -29,6 +32,20 @@ target_metadata = [Base.metadata, ResultModelBase.metadata]
|
||||
# my_important_option = config.get_main_option("my_important_option")
|
||||
# ... etc.
|
||||
|
||||
EXCLUDE_TABLES = {"kombu_queue", "kombu_message"}
|
||||
|
||||
|
||||
def include_object(
|
||||
object: SchemaItem,
|
||||
name: str,
|
||||
type_: str,
|
||||
reflected: bool,
|
||||
compare_to: SchemaItem | None,
|
||||
) -> bool:
|
||||
if type_ == "table" and name in EXCLUDE_TABLES:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def run_migrations_offline() -> None:
|
||||
"""Run migrations in 'offline' mode.
|
||||
@@ -55,7 +72,11 @@ def run_migrations_offline() -> None:
|
||||
|
||||
|
||||
def do_run_migrations(connection: Connection) -> None:
|
||||
context.configure(connection=connection, target_metadata=target_metadata) # type: ignore
|
||||
context.configure(
|
||||
connection=connection,
|
||||
target_metadata=target_metadata, # type: ignore
|
||||
include_object=include_object,
|
||||
) # type: ignore
|
||||
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
|
||||
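For reference, a minimal, illustrative sketch (not part of this diff) of how the include_object hook added above behaves: Alembic calls it once per schema object while comparing metadata, and returning False drops that object from consideration. The EXCLUDE_TABLES names mirror the snippet above; the simplified argument types are an assumption made for brevity.

# Illustrative only: same filtering logic as the env.py hook above, with the
# SQLAlchemy-specific typing dropped for brevity (an assumption, not the actual code).
EXCLUDE_TABLES = {"kombu_queue", "kombu_message"}


def include_object(obj, name, type_, reflected, compare_to) -> bool:
    # Skip Celery/kombu bookkeeping tables so Alembic never tries to manage them.
    if type_ == "table" and name in EXCLUDE_TABLES:
        return False
    return True


# Quick check of the filter in isolation:
assert include_object(None, "kombu_queue", "table", True, None) is False
assert include_object(None, "chat_session", "table", True, None) is True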
@@ -0,0 +1,27 @@
|
||||
"""Add thread specific model selection
|
||||
|
||||
Revision ID: 0568ccf46a6b
|
||||
Revises: e209dc5a8156
|
||||
Create Date: 2024-06-19 14:25:36.376046
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "0568ccf46a6b"
|
||||
down_revision = "e209dc5a8156"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"chat_session",
|
||||
sa.Column("current_alternate_model", sa.String(), nullable=True),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("chat_session", "current_alternate_model")
|
||||
@@ -0,0 +1,32 @@
|
||||
"""add search doc relevance details
|
||||
|
||||
Revision ID: 05c07bf07c00
|
||||
Revises: b896bbd0d5a7
|
||||
Create Date: 2024-07-10 17:48:15.886653
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "05c07bf07c00"
|
||||
down_revision = "b896bbd0d5a7"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"search_doc",
|
||||
sa.Column("is_relevant", sa.Boolean(), nullable=True),
|
||||
)
|
||||
op.add_column(
|
||||
"search_doc",
|
||||
sa.Column("relevance_explanation", sa.String(), nullable=True),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("search_doc", "relevance_explanation")
|
||||
op.drop_column("search_doc", "is_relevant")
|
||||
@@ -0,0 +1,26 @@
|
||||
"""add_indexing_start_to_connector
|
||||
|
||||
Revision ID: 08a1eda20fe1
|
||||
Revises: 8a87bd6ec550
|
||||
Create Date: 2024-07-23 11:12:39.462397
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "08a1eda20fe1"
|
||||
down_revision = "8a87bd6ec550"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"connector", sa.Column("indexing_start", sa.DateTime(), nullable=True)
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("connector", "indexing_start")
|
||||
@@ -0,0 +1,27 @@
|
||||
"""add ccpair deletion failure message
|
||||
|
||||
Revision ID: 0ebb1d516877
|
||||
Revises: 52a219fb5233
|
||||
Create Date: 2024-09-10 15:03:48.233926
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "0ebb1d516877"
|
||||
down_revision = "52a219fb5233"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"connector_credential_pair",
|
||||
sa.Column("deletion_failure_message", sa.String(), nullable=True),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("connector_credential_pair", "deletion_failure_message")
|
||||
@@ -0,0 +1,102 @@
|
||||
"""add_user_delete_cascades
|
||||
|
||||
Revision ID: 1b8206b29c5d
|
||||
Revises: 35e6853a51d5
|
||||
Create Date: 2024-09-18 11:48:59.418726
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "1b8206b29c5d"
|
||||
down_revision = "35e6853a51d5"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.drop_constraint("credential_user_id_fkey", "credential", type_="foreignkey")
|
||||
op.create_foreign_key(
|
||||
"credential_user_id_fkey",
|
||||
"credential",
|
||||
"user",
|
||||
["user_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
op.drop_constraint("chat_session_user_id_fkey", "chat_session", type_="foreignkey")
|
||||
op.create_foreign_key(
|
||||
"chat_session_user_id_fkey",
|
||||
"chat_session",
|
||||
"user",
|
||||
["user_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
op.drop_constraint("chat_folder_user_id_fkey", "chat_folder", type_="foreignkey")
|
||||
op.create_foreign_key(
|
||||
"chat_folder_user_id_fkey",
|
||||
"chat_folder",
|
||||
"user",
|
||||
["user_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
op.drop_constraint("prompt_user_id_fkey", "prompt", type_="foreignkey")
|
||||
op.create_foreign_key(
|
||||
"prompt_user_id_fkey", "prompt", "user", ["user_id"], ["id"], ondelete="CASCADE"
|
||||
)
|
||||
|
||||
op.drop_constraint("notification_user_id_fkey", "notification", type_="foreignkey")
|
||||
op.create_foreign_key(
|
||||
"notification_user_id_fkey",
|
||||
"notification",
|
||||
"user",
|
||||
["user_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
op.drop_constraint("inputprompt_user_id_fkey", "inputprompt", type_="foreignkey")
|
||||
op.create_foreign_key(
|
||||
"inputprompt_user_id_fkey",
|
||||
"inputprompt",
|
||||
"user",
|
||||
["user_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_constraint("credential_user_id_fkey", "credential", type_="foreignkey")
|
||||
op.create_foreign_key(
|
||||
"credential_user_id_fkey", "credential", "user", ["user_id"], ["id"]
|
||||
)
|
||||
|
||||
op.drop_constraint("chat_session_user_id_fkey", "chat_session", type_="foreignkey")
|
||||
op.create_foreign_key(
|
||||
"chat_session_user_id_fkey", "chat_session", "user", ["user_id"], ["id"]
|
||||
)
|
||||
|
||||
op.drop_constraint("chat_folder_user_id_fkey", "chat_folder", type_="foreignkey")
|
||||
op.create_foreign_key(
|
||||
"chat_folder_user_id_fkey", "chat_folder", "user", ["user_id"], ["id"]
|
||||
)
|
||||
|
||||
op.drop_constraint("prompt_user_id_fkey", "prompt", type_="foreignkey")
|
||||
op.create_foreign_key("prompt_user_id_fkey", "prompt", "user", ["user_id"], ["id"])
|
||||
|
||||
op.drop_constraint("notification_user_id_fkey", "notification", type_="foreignkey")
|
||||
op.create_foreign_key(
|
||||
"notification_user_id_fkey", "notification", "user", ["user_id"], ["id"]
|
||||
)
|
||||
|
||||
op.drop_constraint("inputprompt_user_id_fkey", "inputprompt", type_="foreignkey")
|
||||
op.create_foreign_key(
|
||||
"inputprompt_user_id_fkey", "inputprompt", "user", ["user_id"], ["id"]
|
||||
)
|
||||
@@ -0,0 +1,135 @@
|
||||
"""embedding model -> search settings
|
||||
|
||||
Revision ID: 1f60f60c3401
|
||||
Revises: f17bf3b0d9f1
|
||||
Create Date: 2024-08-25 12:39:51.731632
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
from danswer.configs.chat_configs import NUM_POSTPROCESSED_RESULTS
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "1f60f60c3401"
|
||||
down_revision = "f17bf3b0d9f1"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.drop_constraint(
|
||||
"index_attempt__embedding_model_fk", "index_attempt", type_="foreignkey"
|
||||
)
|
||||
# Rename the table
|
||||
op.rename_table("embedding_model", "search_settings")
|
||||
|
||||
# Add new columns
|
||||
op.add_column(
|
||||
"search_settings",
|
||||
sa.Column(
|
||||
"multipass_indexing", sa.Boolean(), nullable=False, server_default="false"
|
||||
),
|
||||
)
|
||||
op.add_column(
|
||||
"search_settings",
|
||||
sa.Column(
|
||||
"multilingual_expansion",
|
||||
postgresql.ARRAY(sa.String()),
|
||||
nullable=False,
|
||||
server_default="{}",
|
||||
),
|
||||
)
|
||||
op.add_column(
|
||||
"search_settings",
|
||||
sa.Column(
|
||||
"disable_rerank_for_streaming",
|
||||
sa.Boolean(),
|
||||
nullable=False,
|
||||
server_default="false",
|
||||
),
|
||||
)
|
||||
op.add_column(
|
||||
"search_settings", sa.Column("rerank_model_name", sa.String(), nullable=True)
|
||||
)
|
||||
op.add_column(
|
||||
"search_settings", sa.Column("rerank_provider_type", sa.String(), nullable=True)
|
||||
)
|
||||
op.add_column(
|
||||
"search_settings", sa.Column("rerank_api_key", sa.String(), nullable=True)
|
||||
)
|
||||
op.add_column(
|
||||
"search_settings",
|
||||
sa.Column(
|
||||
"num_rerank",
|
||||
sa.Integer(),
|
||||
nullable=False,
|
||||
server_default=str(NUM_POSTPROCESSED_RESULTS),
|
||||
),
|
||||
)
|
||||
|
||||
# Add the new column as nullable initially
|
||||
op.add_column(
|
||||
"index_attempt", sa.Column("search_settings_id", sa.Integer(), nullable=True)
|
||||
)
|
||||
|
||||
# Populate the new column with data from the existing embedding_model_id
|
||||
op.execute("UPDATE index_attempt SET search_settings_id = embedding_model_id")
|
||||
|
||||
# Create the foreign key constraint
|
||||
op.create_foreign_key(
|
||||
"fk_index_attempt_search_settings",
|
||||
"index_attempt",
|
||||
"search_settings",
|
||||
["search_settings_id"],
|
||||
["id"],
|
||||
)
|
||||
|
||||
# Make the new column non-nullable
|
||||
op.alter_column("index_attempt", "search_settings_id", nullable=False)
|
||||
|
||||
# Drop the old embedding_model_id column
|
||||
op.drop_column("index_attempt", "embedding_model_id")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Add back the embedding_model_id column
|
||||
op.add_column(
|
||||
"index_attempt", sa.Column("embedding_model_id", sa.Integer(), nullable=True)
|
||||
)
|
||||
|
||||
# Populate the old column with data from search_settings_id
|
||||
op.execute("UPDATE index_attempt SET embedding_model_id = search_settings_id")
|
||||
|
||||
# Make the old column non-nullable
|
||||
op.alter_column("index_attempt", "embedding_model_id", nullable=False)
|
||||
|
||||
# Drop the foreign key constraint
|
||||
op.drop_constraint(
|
||||
"fk_index_attempt_search_settings", "index_attempt", type_="foreignkey"
|
||||
)
|
||||
|
||||
# Drop the new search_settings_id column
|
||||
op.drop_column("index_attempt", "search_settings_id")
|
||||
|
||||
# Rename the table back
|
||||
op.rename_table("search_settings", "embedding_model")
|
||||
|
||||
# Remove added columns
|
||||
op.drop_column("embedding_model", "num_rerank")
|
||||
op.drop_column("embedding_model", "rerank_api_key")
|
||||
op.drop_column("embedding_model", "rerank_provider_type")
|
||||
op.drop_column("embedding_model", "rerank_model_name")
|
||||
op.drop_column("embedding_model", "disable_rerank_for_streaming")
|
||||
op.drop_column("embedding_model", "multilingual_expansion")
|
||||
op.drop_column("embedding_model", "multipass_indexing")
|
||||
|
||||
op.create_foreign_key(
|
||||
"index_attempt__embedding_model_fk",
|
||||
"index_attempt",
|
||||
"embedding_model",
|
||||
["embedding_model_id"],
|
||||
["id"],
|
||||
)
|
||||
44
backend/alembic/versions/213fd978c6d8_notifications.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""notifications
|
||||
|
||||
Revision ID: 213fd978c6d8
|
||||
Revises: 5fc1f54cc252
|
||||
Create Date: 2024-08-10 11:13:36.070790
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "213fd978c6d8"
|
||||
down_revision = "5fc1f54cc252"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"notification",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column(
|
||||
"notif_type",
|
||||
sa.String(),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"user_id",
|
||||
sa.UUID(),
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column("dismissed", sa.Boolean(), nullable=False),
|
||||
sa.Column("last_shown", sa.DateTime(timezone=True), nullable=False),
|
||||
sa.Column("first_shown", sa.DateTime(timezone=True), nullable=False),
|
||||
sa.ForeignKeyConstraint(
|
||||
["user_id"],
|
||||
["user.id"],
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_table("notification")
|
||||
@@ -0,0 +1,86 @@
|
||||
"""remove-feedback-foreignkey-constraint
|
||||
|
||||
Revision ID: 23957775e5f5
|
||||
Revises: bc9771dccadf
|
||||
Create Date: 2024-06-27 16:04:51.480437
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "23957775e5f5"
|
||||
down_revision = "bc9771dccadf"
|
||||
branch_labels = None # type: ignore
|
||||
depends_on = None # type: ignore
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.drop_constraint(
|
||||
"chat_feedback__chat_message_fk", "chat_feedback", type_="foreignkey"
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"chat_feedback__chat_message_fk",
|
||||
"chat_feedback",
|
||||
"chat_message",
|
||||
["chat_message_id"],
|
||||
["id"],
|
||||
ondelete="SET NULL",
|
||||
)
|
||||
op.alter_column(
|
||||
"chat_feedback", "chat_message_id", existing_type=sa.Integer(), nullable=True
|
||||
)
|
||||
op.drop_constraint(
|
||||
"document_retrieval_feedback__chat_message_fk",
|
||||
"document_retrieval_feedback",
|
||||
type_="foreignkey",
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"document_retrieval_feedback__chat_message_fk",
|
||||
"document_retrieval_feedback",
|
||||
"chat_message",
|
||||
["chat_message_id"],
|
||||
["id"],
|
||||
ondelete="SET NULL",
|
||||
)
|
||||
op.alter_column(
|
||||
"document_retrieval_feedback",
|
||||
"chat_message_id",
|
||||
existing_type=sa.Integer(),
|
||||
nullable=True,
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.alter_column(
|
||||
"chat_feedback", "chat_message_id", existing_type=sa.Integer(), nullable=False
|
||||
)
|
||||
op.drop_constraint(
|
||||
"chat_feedback__chat_message_fk", "chat_feedback", type_="foreignkey"
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"chat_feedback__chat_message_fk",
|
||||
"chat_feedback",
|
||||
"chat_message",
|
||||
["chat_message_id"],
|
||||
["id"],
|
||||
)
|
||||
|
||||
op.alter_column(
|
||||
"document_retrieval_feedback",
|
||||
"chat_message_id",
|
||||
existing_type=sa.Integer(),
|
||||
nullable=False,
|
||||
)
|
||||
op.drop_constraint(
|
||||
"document_retrieval_feedback__chat_message_fk",
|
||||
"document_retrieval_feedback",
|
||||
type_="foreignkey",
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"document_retrieval_feedback__chat_message_fk",
|
||||
"document_retrieval_feedback",
|
||||
"chat_message",
|
||||
["chat_message_id"],
|
||||
["id"],
|
||||
)
|
||||
@@ -160,12 +160,28 @@ def downgrade() -> None:
|
||||
nullable=False,
|
||||
),
|
||||
)
|
||||
op.drop_constraint(
|
||||
"fk_index_attempt_credential_id", "index_attempt", type_="foreignkey"
|
||||
)
|
||||
op.drop_constraint(
|
||||
"fk_index_attempt_connector_id", "index_attempt", type_="foreignkey"
|
||||
)
|
||||
|
||||
# Check if the constraint exists before dropping
|
||||
conn = op.get_bind()
|
||||
inspector = sa.inspect(conn)
|
||||
constraints = inspector.get_foreign_keys("index_attempt")
|
||||
|
||||
if any(
|
||||
constraint["name"] == "fk_index_attempt_credential_id"
|
||||
for constraint in constraints
|
||||
):
|
||||
op.drop_constraint(
|
||||
"fk_index_attempt_credential_id", "index_attempt", type_="foreignkey"
|
||||
)
|
||||
|
||||
if any(
|
||||
constraint["name"] == "fk_index_attempt_connector_id"
|
||||
for constraint in constraints
|
||||
):
|
||||
op.drop_constraint(
|
||||
"fk_index_attempt_connector_id", "index_attempt", type_="foreignkey"
|
||||
)
|
||||
|
||||
op.drop_column("index_attempt", "credential_id")
|
||||
op.drop_column("index_attempt", "connector_id")
|
||||
op.drop_table("connector_credential_pair")
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
"""Add Above Below to Persona
|
||||
|
||||
Revision ID: 2d2304e27d8c
|
||||
Revises: 4b08d97e175a
|
||||
Create Date: 2024-08-21 19:15:15.762948
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "2d2304e27d8c"
|
||||
down_revision = "4b08d97e175a"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column("persona", sa.Column("chunks_above", sa.Integer(), nullable=True))
|
||||
op.add_column("persona", sa.Column("chunks_below", sa.Integer(), nullable=True))
|
||||
|
||||
op.execute(
|
||||
"UPDATE persona SET chunks_above = 1, chunks_below = 1 WHERE chunks_above IS NULL AND chunks_below IS NULL"
|
||||
)
|
||||
|
||||
op.alter_column("persona", "chunks_above", nullable=False)
|
||||
op.alter_column("persona", "chunks_below", nullable=False)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("persona", "chunks_below")
|
||||
op.drop_column("persona", "chunks_above")
|
||||
@@ -0,0 +1,70 @@
|
||||
"""Add icon_color and icon_shape to Persona
|
||||
|
||||
Revision ID: 325975216eb3
|
||||
Revises: 91ffac7e65b3
|
||||
Create Date: 2024-07-24 21:29:31.784562
|
||||
|
||||
"""
|
||||
import random
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.sql import table, column, select
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "325975216eb3"
|
||||
down_revision = "91ffac7e65b3"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
colorOptions = [
|
||||
"#FF6FBF",
|
||||
"#6FB1FF",
|
||||
"#B76FFF",
|
||||
"#FFB56F",
|
||||
"#6FFF8D",
|
||||
"#FF6F6F",
|
||||
"#6FFFFF",
|
||||
]
|
||||
|
||||
|
||||
# Function to generate a random shape ensuring at least 3 of the middle 4 squares are filled
|
||||
def generate_random_shape() -> int:
|
||||
center_squares = [12, 10, 6, 14, 13, 11, 7, 15]
|
||||
center_fill = random.choice(center_squares)
|
||||
remaining_squares = [i for i in range(16) if not (center_fill & (1 << i))]
|
||||
random.shuffle(remaining_squares)
|
||||
for i in range(10 - bin(center_fill).count("1")):
|
||||
center_fill |= 1 << remaining_squares[i]
|
||||
return center_fill
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column("persona", sa.Column("icon_color", sa.String(), nullable=True))
|
||||
op.add_column("persona", sa.Column("icon_shape", sa.Integer(), nullable=True))
|
||||
op.add_column("persona", sa.Column("uploaded_image_id", sa.String(), nullable=True))
|
||||
|
||||
persona = table(
|
||||
"persona",
|
||||
column("id", sa.Integer),
|
||||
column("icon_color", sa.String),
|
||||
column("icon_shape", sa.Integer),
|
||||
)
|
||||
|
||||
conn = op.get_bind()
|
||||
personas = conn.execute(select(persona.c.id))
|
||||
|
||||
for persona_id in personas:
|
||||
random_color = random.choice(colorOptions)
|
||||
random_shape = generate_random_shape()
|
||||
conn.execute(
|
||||
persona.update()
|
||||
.where(persona.c.id == persona_id[0])
|
||||
.values(icon_color=random_color, icon_shape=random_shape)
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("persona", "icon_shape")
|
||||
op.drop_column("persona", "uploaded_image_id")
|
||||
op.drop_column("persona", "icon_color")
|
||||
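As a side note, here is a small, hedged sketch of how the integer produced by the generate_random_shape function above can be visualized. The assumption that bit i maps to square i of a 4x4 grid in row-major order is made purely for illustration and is not stated in the migration.

# Illustrative only: decode an icon_shape bitmask into a 4x4 grid of filled/empty squares.
# The row-major bit-to-square mapping is an assumption for visualization purposes.
def render_shape(mask: int) -> str:
    rows = []
    for r in range(4):
        rows.append("".join("#" if mask & (1 << (r * 4 + c)) else "." for c in range(4)))
    return "\n".join(rows)


# Example usage: render one generated shape (10 of 16 squares filled by construction).
# print(render_shape(generate_random_shape()))
print(render_shape(0b0000_0110_0110_0000))  # just the four center squares of the grid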
90
backend/alembic/versions/351faebd379d_add_curator_fields.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""Add curator fields
|
||||
|
||||
Revision ID: 351faebd379d
|
||||
Revises: ee3f4b47fad5
|
||||
Create Date: 2024-08-15 22:37:08.397052
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "351faebd379d"
|
||||
down_revision = "ee3f4b47fad5"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Add is_curator column to User__UserGroup table
|
||||
op.add_column(
|
||||
"user__user_group",
|
||||
sa.Column("is_curator", sa.Boolean(), nullable=False, server_default="false"),
|
||||
)
|
||||
|
||||
# Use batch mode to modify the enum type
|
||||
with op.batch_alter_table("user", schema=None) as batch_op:
|
||||
batch_op.alter_column( # type: ignore[attr-defined]
|
||||
"role",
|
||||
type_=sa.Enum(
|
||||
"BASIC",
|
||||
"ADMIN",
|
||||
"CURATOR",
|
||||
"GLOBAL_CURATOR",
|
||||
name="userrole",
|
||||
native_enum=False,
|
||||
),
|
||||
existing_type=sa.Enum("BASIC", "ADMIN", name="userrole", native_enum=False),
|
||||
existing_nullable=False,
|
||||
)
|
||||
# Create the association table
|
||||
op.create_table(
|
||||
"credential__user_group",
|
||||
sa.Column("credential_id", sa.Integer(), nullable=False),
|
||||
sa.Column("user_group_id", sa.Integer(), nullable=False),
|
||||
sa.ForeignKeyConstraint(
|
||||
["credential_id"],
|
||||
["credential.id"],
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["user_group_id"],
|
||||
["user_group.id"],
|
||||
),
|
||||
sa.PrimaryKeyConstraint("credential_id", "user_group_id"),
|
||||
)
|
||||
op.add_column(
|
||||
"credential",
|
||||
sa.Column(
|
||||
"curator_public", sa.Boolean(), nullable=False, server_default="false"
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Update existing records to ensure they fit within the BASIC/ADMIN roles
|
||||
op.execute(
|
||||
"UPDATE \"user\" SET role = 'ADMIN' WHERE role IN ('CURATOR', 'GLOBAL_CURATOR')"
|
||||
)
|
||||
|
||||
# Remove is_curator column from User__UserGroup table
|
||||
op.drop_column("user__user_group", "is_curator")
|
||||
|
||||
with op.batch_alter_table("user", schema=None) as batch_op:
|
||||
batch_op.alter_column( # type: ignore[attr-defined]
|
||||
"role",
|
||||
type_=sa.Enum(
|
||||
"BASIC", "ADMIN", name="userrole", native_enum=False, length=20
|
||||
),
|
||||
existing_type=sa.Enum(
|
||||
"BASIC",
|
||||
"ADMIN",
|
||||
"CURATOR",
|
||||
"GLOBAL_CURATOR",
|
||||
name="userrole",
|
||||
native_enum=False,
|
||||
),
|
||||
existing_nullable=False,
|
||||
)
|
||||
# Drop the association table
|
||||
op.drop_table("credential__user_group")
|
||||
op.drop_column("credential", "curator_public")
|
||||
@@ -0,0 +1,64 @@
|
||||
"""server default chosen assistants
|
||||
|
||||
Revision ID: 35e6853a51d5
|
||||
Revises: c99d76fcd298
|
||||
Create Date: 2024-09-13 13:20:32.885317
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "35e6853a51d5"
|
||||
down_revision = "c99d76fcd298"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
DEFAULT_ASSISTANTS = [-2, -1, 0]
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Step 1: Update any NULL values to the default value
|
||||
# This upgrades existing users who have no assistant ordering yet
# so that their default assistants are set to the visible assistants
# that are accessible to them.
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE "user" u
|
||||
SET chosen_assistants = (
|
||||
SELECT jsonb_agg(
|
||||
p.id ORDER BY
|
||||
COALESCE(p.display_priority, 2147483647) ASC,
|
||||
p.id ASC
|
||||
)
|
||||
FROM persona p
|
||||
LEFT JOIN persona__user pu ON p.id = pu.persona_id AND pu.user_id = u.id
|
||||
WHERE p.is_visible = true
|
||||
AND (p.is_public = true OR pu.user_id IS NOT NULL)
|
||||
)
|
||||
WHERE chosen_assistants IS NULL
|
||||
OR chosen_assistants = 'null'
|
||||
OR jsonb_typeof(chosen_assistants) = 'null'
|
||||
OR (jsonb_typeof(chosen_assistants) = 'string' AND chosen_assistants = '"null"')
|
||||
"""
|
||||
)
|
||||
|
||||
# Step 2: Alter the column to make it non-nullable
|
||||
op.alter_column(
|
||||
"user",
|
||||
"chosen_assistants",
|
||||
type_=postgresql.JSONB(astext_type=sa.Text()),
|
||||
nullable=False,
|
||||
server_default=sa.text(f"'{DEFAULT_ASSISTANTS}'::jsonb"),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.alter_column(
|
||||
"user",
|
||||
"chosen_assistants",
|
||||
type_=postgresql.JSONB(astext_type=sa.Text()),
|
||||
nullable=True,
|
||||
server_default=None,
|
||||
)
|
||||
@@ -0,0 +1,35 @@
|
||||
"""add alternate assistant to chat message
|
||||
|
||||
Revision ID: 3a7802814195
|
||||
Revises: 23957775e5f5
|
||||
Create Date: 2024-06-05 11:18:49.966333
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "3a7802814195"
|
||||
down_revision = "23957775e5f5"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"chat_message", sa.Column("alternate_assistant_id", sa.Integer(), nullable=True)
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"fk_chat_message_persona",
|
||||
"chat_message",
|
||||
"persona",
|
||||
["alternate_assistant_id"],
|
||||
["id"],
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_constraint("fk_chat_message_persona", "chat_message", type_="foreignkey")
|
||||
op.drop_column("chat_message", "alternate_assistant_id")
|
||||
@@ -0,0 +1,42 @@
|
||||
"""Rename index_origin to index_recursively
|
||||
|
||||
Revision ID: 1d6ad76d1f37
|
||||
Revises: e1392f05e840
|
||||
Create Date: 2024-08-01 12:38:54.466081
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "1d6ad76d1f37"
|
||||
down_revision = "e1392f05e840"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE connector
|
||||
SET connector_specific_config = jsonb_set(
|
||||
connector_specific_config,
|
||||
'{index_recursively}',
|
||||
'true'::jsonb
|
||||
) - 'index_origin'
|
||||
WHERE connector_specific_config ? 'index_origin'
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE connector
|
||||
SET connector_specific_config = jsonb_set(
|
||||
connector_specific_config,
|
||||
'{index_origin}',
|
||||
connector_specific_config->'index_recursively'
|
||||
) - 'index_recursively'
|
||||
WHERE connector_specific_config ? 'index_recursively'
|
||||
"""
|
||||
)
|
||||
@@ -0,0 +1,65 @@
|
||||
"""add cloud embedding model and update embedding_model
|
||||
|
||||
Revision ID: 44f856ae2a4a
|
||||
Revises: d716b0791ddd
|
||||
Create Date: 2024-06-28 20:01:05.927647
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "44f856ae2a4a"
|
||||
down_revision = "d716b0791ddd"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Create embedding_provider table
|
||||
op.create_table(
|
||||
"embedding_provider",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column("name", sa.String(), nullable=False),
|
||||
sa.Column("api_key", sa.LargeBinary(), nullable=True),
|
||||
sa.Column("default_model_id", sa.Integer(), nullable=True),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
sa.UniqueConstraint("name"),
|
||||
)
|
||||
|
||||
# Add cloud_provider_id to embedding_model table
|
||||
op.add_column(
|
||||
"embedding_model", sa.Column("cloud_provider_id", sa.Integer(), nullable=True)
|
||||
)
|
||||
|
||||
# Add foreign key constraints
|
||||
op.create_foreign_key(
|
||||
"fk_embedding_model_cloud_provider",
|
||||
"embedding_model",
|
||||
"embedding_provider",
|
||||
["cloud_provider_id"],
|
||||
["id"],
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"fk_embedding_provider_default_model",
|
||||
"embedding_provider",
|
||||
"embedding_model",
|
||||
["default_model_id"],
|
||||
["id"],
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Remove foreign key constraints
|
||||
op.drop_constraint(
|
||||
"fk_embedding_model_cloud_provider", "embedding_model", type_="foreignkey"
|
||||
)
|
||||
op.drop_constraint(
|
||||
"fk_embedding_provider_default_model", "embedding_provider", type_="foreignkey"
|
||||
)
|
||||
|
||||
# Remove cloud_provider_id column
|
||||
op.drop_column("embedding_model", "cloud_provider_id")
|
||||
|
||||
# Drop embedding_provider table
|
||||
op.drop_table("embedding_provider")
|
||||
@@ -0,0 +1,23 @@
|
||||
"""added is_internet to DBDoc
|
||||
|
||||
Revision ID: 4505fd7302e1
|
||||
Revises: c18cdf4b497e
|
||||
Create Date: 2024-06-18 20:46:09.095034
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "4505fd7302e1"
|
||||
down_revision = "c18cdf4b497e"
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column("search_doc", sa.Column("is_internet", sa.Boolean(), nullable=True))
|
||||
op.add_column("tool", sa.Column("display_name", sa.String(), nullable=True))
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("tool", "display_name")
|
||||
op.drop_column("search_doc", "is_internet")
|
||||
@@ -0,0 +1,49 @@
|
||||
"""Add display_model_names to llm_provider
|
||||
|
||||
Revision ID: 473a1a7ca408
|
||||
Revises: 325975216eb3
|
||||
Create Date: 2024-07-25 14:31:02.002917
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "473a1a7ca408"
|
||||
down_revision = "325975216eb3"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
default_models_by_provider = {
|
||||
"openai": ["gpt-4", "gpt-4o", "gpt-4o-mini"],
|
||||
"bedrock": [
|
||||
"meta.llama3-1-70b-instruct-v1:0",
|
||||
"meta.llama3-1-8b-instruct-v1:0",
|
||||
"anthropic.claude-3-opus-20240229-v1:0",
|
||||
"mistral.mistral-large-2402-v1:0",
|
||||
"anthropic.claude-3-5-sonnet-20240620-v1:0",
|
||||
],
|
||||
"anthropic": ["claude-3-opus-20240229", "claude-3-5-sonnet-20240620"],
|
||||
}
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"llm_provider",
|
||||
sa.Column("display_model_names", postgresql.ARRAY(sa.String()), nullable=True),
|
||||
)
|
||||
|
||||
connection = op.get_bind()
|
||||
for provider, models in default_models_by_provider.items():
|
||||
connection.execute(
|
||||
sa.text(
|
||||
"UPDATE llm_provider SET display_model_names = :models WHERE provider = :provider"
|
||||
),
|
||||
{"models": models, "provider": provider},
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("llm_provider", "display_model_names")
|
||||
@@ -13,8 +13,8 @@ from sqlalchemy.dialects import postgresql
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "48d14957fe80"
|
||||
down_revision = "b85f02ec1308"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
|
||||
@@ -0,0 +1,80 @@
|
||||
"""Moved status to connector credential pair
|
||||
|
||||
Revision ID: 4a951134c801
|
||||
Revises: 7477a5f5d728
|
||||
Create Date: 2024-08-10 19:20:34.527559
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "4a951134c801"
|
||||
down_revision = "7477a5f5d728"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"connector_credential_pair",
|
||||
sa.Column(
|
||||
"status",
|
||||
sa.Enum(
|
||||
"ACTIVE",
|
||||
"PAUSED",
|
||||
"DELETING",
|
||||
name="connectorcredentialpairstatus",
|
||||
native_enum=False,
|
||||
),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
|
||||
# Update status of connector_credential_pair based on connector's disabled status
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE connector_credential_pair
|
||||
SET status = CASE
|
||||
WHEN (
|
||||
SELECT disabled
|
||||
FROM connector
|
||||
WHERE connector.id = connector_credential_pair.connector_id
|
||||
) = FALSE THEN 'ACTIVE'
|
||||
ELSE 'PAUSED'
|
||||
END
|
||||
"""
|
||||
)
|
||||
|
||||
# Make the status column not nullable after setting values
|
||||
op.alter_column("connector_credential_pair", "status", nullable=False)
|
||||
|
||||
op.drop_column("connector", "disabled")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.add_column(
|
||||
"connector",
|
||||
sa.Column("disabled", sa.BOOLEAN(), autoincrement=False, nullable=True),
|
||||
)
|
||||
|
||||
# Update disabled status of connector based on connector_credential_pair's status
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE connector
|
||||
SET disabled = CASE
|
||||
WHEN EXISTS (
|
||||
SELECT 1
|
||||
FROM connector_credential_pair
|
||||
WHERE connector_credential_pair.connector_id = connector.id
|
||||
AND connector_credential_pair.status = 'ACTIVE'
|
||||
) THEN FALSE
|
||||
ELSE TRUE
|
||||
END
|
||||
"""
|
||||
)
|
||||
|
||||
# Make the disabled column not nullable after setting values
|
||||
op.alter_column("connector", "disabled", nullable=False)
|
||||
|
||||
op.drop_column("connector_credential_pair", "status")
|
||||
@@ -0,0 +1,34 @@
|
||||
"""change default prune_freq
|
||||
|
||||
Revision ID: 4b08d97e175a
|
||||
Revises: d9ec13955951
|
||||
Create Date: 2024-08-20 15:28:52.993827
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "4b08d97e175a"
|
||||
down_revision = "d9ec13955951"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE connector
|
||||
SET prune_freq = 2592000
|
||||
WHERE prune_freq = 86400
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE connector
|
||||
SET prune_freq = 86400
|
||||
WHERE prune_freq = 2592000
|
||||
"""
|
||||
)
|
||||
@@ -0,0 +1,72 @@
|
||||
"""Add type to credentials
|
||||
|
||||
Revision ID: 4ea2c93919c1
|
||||
Revises: 473a1a7ca408
|
||||
Create Date: 2024-07-18 13:07:13.655895
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "4ea2c93919c1"
|
||||
down_revision = "473a1a7ca408"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Add the new 'source' column to the 'credential' table
|
||||
op.add_column(
|
||||
"credential",
|
||||
sa.Column(
|
||||
"source",
|
||||
sa.String(length=100), # Use String instead of Enum
|
||||
nullable=True, # Initially allow NULL values
|
||||
),
|
||||
)
|
||||
op.add_column(
|
||||
"credential",
|
||||
sa.Column(
|
||||
"name",
|
||||
sa.String(),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
|
||||
# Create a temporary table that maps each credential to a single connector source.
|
||||
# This is needed because a credential can be associated with multiple connectors,
|
||||
# but we want to assign a single source to each credential.
|
||||
# We use DISTINCT ON to ensure we only get one row per credential_id.
|
||||
op.execute(
|
||||
"""
|
||||
CREATE TEMPORARY TABLE temp_connector_credential AS
|
||||
SELECT DISTINCT ON (cc.credential_id)
|
||||
cc.credential_id,
|
||||
c.source AS connector_source
|
||||
FROM connector_credential_pair cc
|
||||
JOIN connector c ON cc.connector_id = c.id
|
||||
"""
|
||||
)
|
||||
|
||||
# Update the 'source' column in the 'credential' table
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE credential cred
|
||||
SET source = COALESCE(
|
||||
(SELECT connector_source
|
||||
FROM temp_connector_credential temp
|
||||
WHERE cred.id = temp.credential_id),
|
||||
'NOT_APPLICABLE'
|
||||
)
|
||||
"""
|
||||
)
|
||||
# If no exception was raised, alter the column
|
||||
op.alter_column("credential", "source", nullable=True) # TODO modify
|
||||
# # ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("credential", "source")
|
||||
op.drop_column("credential", "name")
|
||||
@@ -0,0 +1,66 @@
|
||||
"""Add last synced and last modified to document table
|
||||
|
||||
Revision ID: 52a219fb5233
|
||||
Revises: f7e58d357687
|
||||
Create Date: 2024-08-28 17:40:46.077470
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.sql import func
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "52a219fb5233"
|
||||
down_revision = "f7e58d357687"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# last modified represents the last time anything needing syncing to vespa changed
|
||||
# including row metadata and the document itself. This obviously does not include
|
||||
# the last_synced column.
|
||||
op.add_column(
|
||||
"document",
|
||||
sa.Column(
|
||||
"last_modified",
|
||||
sa.DateTime(timezone=True),
|
||||
nullable=False,
|
||||
server_default=func.now(),
|
||||
),
|
||||
)
|
||||
|
||||
# last synced represents the last time this document was synced to Vespa
|
||||
op.add_column(
|
||||
"document",
|
||||
sa.Column("last_synced", sa.DateTime(timezone=True), nullable=True),
|
||||
)
|
||||
|
||||
# Set last_synced to the same value as last_modified for existing rows
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE document
|
||||
SET last_synced = last_modified
|
||||
"""
|
||||
)
|
||||
|
||||
op.create_index(
|
||||
op.f("ix_document_last_modified"),
|
||||
"document",
|
||||
["last_modified"],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
op.create_index(
|
||||
op.f("ix_document_last_synced"),
|
||||
"document",
|
||||
["last_synced"],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_index(op.f("ix_document_last_synced"), table_name="document")
|
||||
op.drop_index(op.f("ix_document_last_modified"), table_name="document")
|
||||
op.drop_column("document", "last_synced")
|
||||
op.drop_column("document", "last_modified")
|
||||
79
backend/alembic/versions/55546a7967ee_assistant_rework.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""assistant_rework
|
||||
|
||||
Revision ID: 55546a7967ee
|
||||
Revises: 61ff3651add4
|
||||
Create Date: 2024-09-18 17:00:23.755399
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "55546a7967ee"
|
||||
down_revision = "61ff3651add4"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Reworking persona and user tables for new assistant features
|
||||
# keep track of user's chosen assistants separate from their `ordering`
|
||||
op.add_column("persona", sa.Column("builtin_persona", sa.Boolean(), nullable=True))
|
||||
op.execute("UPDATE persona SET builtin_persona = default_persona")
|
||||
op.alter_column("persona", "builtin_persona", nullable=False)
|
||||
op.drop_index("_default_persona_name_idx", table_name="persona")
|
||||
op.create_index(
|
||||
"_builtin_persona_name_idx",
|
||||
"persona",
|
||||
["name"],
|
||||
unique=True,
|
||||
postgresql_where=sa.text("builtin_persona = true"),
|
||||
)
|
||||
|
||||
op.add_column(
|
||||
"user", sa.Column("visible_assistants", postgresql.JSONB(), nullable=True)
|
||||
)
|
||||
op.add_column(
|
||||
"user", sa.Column("hidden_assistants", postgresql.JSONB(), nullable=True)
|
||||
)
|
||||
op.execute(
|
||||
"UPDATE \"user\" SET visible_assistants = '[]'::jsonb, hidden_assistants = '[]'::jsonb"
|
||||
)
|
||||
op.alter_column(
|
||||
"user",
|
||||
"visible_assistants",
|
||||
nullable=False,
|
||||
server_default=sa.text("'[]'::jsonb"),
|
||||
)
|
||||
op.alter_column(
|
||||
"user",
|
||||
"hidden_assistants",
|
||||
nullable=False,
|
||||
server_default=sa.text("'[]'::jsonb"),
|
||||
)
|
||||
op.drop_column("persona", "default_persona")
|
||||
op.add_column(
|
||||
"persona", sa.Column("is_default_persona", sa.Boolean(), nullable=True)
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Reverting changes made in upgrade
|
||||
op.drop_column("user", "hidden_assistants")
|
||||
op.drop_column("user", "visible_assistants")
|
||||
op.drop_index("_builtin_persona_name_idx", table_name="persona")
|
||||
|
||||
op.drop_column("persona", "is_default_persona")
|
||||
op.add_column("persona", sa.Column("default_persona", sa.Boolean(), nullable=True))
|
||||
op.execute("UPDATE persona SET default_persona = builtin_persona")
|
||||
op.alter_column("persona", "default_persona", nullable=False)
|
||||
op.drop_column("persona", "builtin_persona")
|
||||
op.create_index(
|
||||
"_default_persona_name_idx",
|
||||
"persona",
|
||||
["name"],
|
||||
unique=True,
|
||||
postgresql_where=sa.text("default_persona = true"),
|
||||
)
|
||||
@@ -0,0 +1,35 @@
|
||||
"""match_any_keywords flag for standard answers
|
||||
|
||||
Revision ID: 5c7fdadae813
|
||||
Revises: efb35676026c
|
||||
Create Date: 2024-09-13 18:52:59.256478
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "5c7fdadae813"
|
||||
down_revision = "efb35676026c"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
op.add_column(
|
||||
"standard_answer",
|
||||
sa.Column(
|
||||
"match_any_keywords",
|
||||
sa.Boolean(),
|
||||
nullable=False,
|
||||
server_default=sa.false(),
|
||||
),
|
||||
)
|
||||
# ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
op.drop_column("standard_answer", "match_any_keywords")
|
||||
# ### end Alembic commands ###
|
||||
25
backend/alembic/versions/5fc1f54cc252_hybrid_enum.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""hybrid-enum
|
||||
|
||||
Revision ID: 5fc1f54cc252
|
||||
Revises: 1d6ad76d1f37
|
||||
Create Date: 2024-08-06 15:35:40.278485
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "5fc1f54cc252"
|
||||
down_revision = "1d6ad76d1f37"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.drop_column("persona", "search_type")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.add_column("persona", sa.Column("search_type", sa.String(), nullable=True))
|
||||
op.execute("UPDATE persona SET search_type = 'SEMANTIC'")
|
||||
op.alter_column("persona", "search_type", nullable=False)
|
||||
162
backend/alembic/versions/61ff3651add4_add_permission_syncing.py
Normal file
@@ -0,0 +1,162 @@
|
||||
"""Add Permission Syncing
|
||||
|
||||
Revision ID: 61ff3651add4
|
||||
Revises: 1b8206b29c5d
|
||||
Create Date: 2024-09-05 13:57:11.770413
|
||||
|
||||
"""
|
||||
import fastapi_users_db_sqlalchemy
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "61ff3651add4"
|
||||
down_revision = "1b8206b29c5d"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# The admin user who set up the connectors will temporarily lose access to the docs;
# currently the only way to restore access is to rerun the permission sync from the beginning.
|
||||
op.add_column(
|
||||
"connector_credential_pair",
|
||||
sa.Column(
|
||||
"access_type",
|
||||
sa.String(),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
op.execute(
|
||||
"UPDATE connector_credential_pair SET access_type = 'PUBLIC' WHERE is_public = true"
|
||||
)
|
||||
op.execute(
|
||||
"UPDATE connector_credential_pair SET access_type = 'PRIVATE' WHERE is_public = false"
|
||||
)
|
||||
op.alter_column("connector_credential_pair", "access_type", nullable=False)
|
||||
|
||||
op.add_column(
|
||||
"connector_credential_pair",
|
||||
sa.Column(
|
||||
"auto_sync_options",
|
||||
postgresql.JSONB(astext_type=sa.Text()),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
op.add_column(
|
||||
"connector_credential_pair",
|
||||
sa.Column("last_time_perm_sync", sa.DateTime(timezone=True), nullable=True),
|
||||
)
|
||||
op.drop_column("connector_credential_pair", "is_public")
|
||||
|
||||
op.add_column(
|
||||
"document",
|
||||
sa.Column("external_user_emails", postgresql.ARRAY(sa.String()), nullable=True),
|
||||
)
|
||||
op.add_column(
|
||||
"document",
|
||||
sa.Column(
|
||||
"external_user_group_ids", postgresql.ARRAY(sa.String()), nullable=True
|
||||
),
|
||||
)
|
||||
op.add_column(
|
||||
"document",
|
||||
sa.Column("is_public", sa.Boolean(), nullable=True),
|
||||
)
|
||||
|
||||
op.create_table(
|
||||
"user__external_user_group_id",
|
||||
sa.Column(
|
||||
"user_id", fastapi_users_db_sqlalchemy.generics.GUID(), nullable=False
|
||||
),
|
||||
sa.Column("external_user_group_id", sa.String(), nullable=False),
|
||||
sa.Column("cc_pair_id", sa.Integer(), nullable=False),
|
||||
sa.PrimaryKeyConstraint("user_id"),
|
||||
)
|
||||
|
||||
op.drop_column("external_permission", "user_id")
|
||||
op.drop_column("email_to_external_user_cache", "user_id")
|
||||
op.drop_table("permission_sync_run")
|
||||
op.drop_table("external_permission")
|
||||
op.drop_table("email_to_external_user_cache")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.add_column(
|
||||
"connector_credential_pair",
|
||||
sa.Column("is_public", sa.BOOLEAN(), nullable=True),
|
||||
)
|
||||
op.execute(
|
||||
"UPDATE connector_credential_pair SET is_public = (access_type = 'PUBLIC')"
|
||||
)
|
||||
op.alter_column("connector_credential_pair", "is_public", nullable=False)
|
||||
|
||||
op.drop_column("connector_credential_pair", "auto_sync_options")
|
||||
op.drop_column("connector_credential_pair", "access_type")
|
||||
op.drop_column("connector_credential_pair", "last_time_perm_sync")
|
||||
op.drop_column("document", "external_user_emails")
|
||||
op.drop_column("document", "external_user_group_ids")
|
||||
op.drop_column("document", "is_public")
|
||||
|
||||
op.drop_table("user__external_user_group_id")
|
||||
|
||||
# Drop the enum type at the end of the downgrade
|
||||
op.create_table(
|
||||
"permission_sync_run",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column(
|
||||
"source_type",
|
||||
sa.String(),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("update_type", sa.String(), nullable=False),
|
||||
sa.Column("cc_pair_id", sa.Integer(), nullable=True),
|
||||
sa.Column(
|
||||
"status",
|
||||
sa.String(),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("error_msg", sa.Text(), nullable=True),
|
||||
sa.Column(
|
||||
"updated_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["cc_pair_id"],
|
||||
["connector_credential_pair.id"],
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
op.create_table(
|
||||
"external_permission",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column("user_id", sa.UUID(), nullable=True),
|
||||
sa.Column("user_email", sa.String(), nullable=False),
|
||||
sa.Column(
|
||||
"source_type",
|
||||
sa.String(),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("external_permission_group", sa.String(), nullable=False),
|
||||
sa.ForeignKeyConstraint(
|
||||
["user_id"],
|
||||
["user.id"],
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
op.create_table(
|
||||
"email_to_external_user_cache",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column("external_user_id", sa.String(), nullable=False),
|
||||
sa.Column("user_id", sa.UUID(), nullable=True),
|
||||
sa.Column("user_email", sa.String(), nullable=False),
|
||||
sa.ForeignKeyConstraint(
|
||||
["user_id"],
|
||||
["user.id"],
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
@@ -0,0 +1,24 @@
"""Added model defaults for users

Revision ID: 7477a5f5d728
Revises: 213fd978c6d8
Create Date: 2024-08-04 19:00:04.512634

"""

from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "7477a5f5d728"
down_revision = "213fd978c6d8"
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
    op.add_column("user", sa.Column("default_model", sa.Text(), nullable=True))


def downgrade() -> None:
    op.drop_column("user", "default_model")
@@ -28,5 +28,9 @@ def upgrade() -> None:


def downgrade() -> None:
    # This wasn't really required by the code either, no good reason to make it unique again
    pass
    op.create_unique_constraint(
        "connector_credential_pair__name__key", "connector_credential_pair", ["name"]
    )
    op.alter_column(
        "connector_credential_pair", "name", existing_type=sa.String(), nullable=True
    )

@@ -10,7 +10,7 @@ import sqlalchemy as sa

from danswer.db.models import IndexModelStatus
from danswer.search.enums import RecencyBiasSetting
from danswer.search.models import SearchType
from danswer.search.enums import SearchType

# revision identifiers, used by Alembic.
revision = "776b3bbe9092"

@@ -0,0 +1,41 @@
|
||||
"""add_llm_group_permissions_control
|
||||
|
||||
Revision ID: 795b20b85b4b
|
||||
Revises: 05c07bf07c00
|
||||
Create Date: 2024-07-19 11:54:35.701558
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
revision = "795b20b85b4b"
|
||||
down_revision = "05c07bf07c00"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"llm_provider__user_group",
|
||||
sa.Column("llm_provider_id", sa.Integer(), nullable=False),
|
||||
sa.Column("user_group_id", sa.Integer(), nullable=False),
|
||||
sa.ForeignKeyConstraint(
|
||||
["llm_provider_id"],
|
||||
["llm_provider.id"],
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["user_group_id"],
|
||||
["user_group.id"],
|
||||
),
|
||||
sa.PrimaryKeyConstraint("llm_provider_id", "user_group_id"),
|
||||
)
|
||||
op.add_column(
|
||||
"llm_provider",
|
||||
sa.Column("is_public", sa.Boolean(), nullable=False, server_default="true"),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_table("llm_provider__user_group")
|
||||
op.drop_column("llm_provider", "is_public")
|
||||
backend/alembic/versions/797089dfb4d2_persona_start_date.py (new file, 27 lines)
@@ -0,0 +1,27 @@
"""persona_start_date

Revision ID: 797089dfb4d2
Revises: 55546a7967ee
Create Date: 2024-09-11 14:51:49.785835

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "797089dfb4d2"
down_revision = "55546a7967ee"
branch_labels = None
depends_on = None


def upgrade() -> None:
    op.add_column(
        "persona",
        sa.Column("search_start_date", sa.DateTime(timezone=True), nullable=True),
    )


def downgrade() -> None:
    op.drop_column("persona", "search_start_date")
@@ -0,0 +1,35 @@
"""added slack_auto_filter

Revision ID: 7aea705850d5
Revises: 4505fd7302e1
Create Date: 2024-07-10 11:01:23.581015

"""
from alembic import op
import sqlalchemy as sa

revision = "7aea705850d5"
down_revision = "4505fd7302e1"
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
    op.add_column(
        "slack_bot_config",
        sa.Column("enable_auto_filters", sa.Boolean(), nullable=True),
    )
    op.execute(
        "UPDATE slack_bot_config SET enable_auto_filters = FALSE WHERE enable_auto_filters IS NULL"
    )
    op.alter_column(
        "slack_bot_config",
        "enable_auto_filters",
        existing_type=sa.Boolean(),
        nullable=False,
        server_default=sa.false(),
    )


def downgrade() -> None:
    op.drop_column("slack_bot_config", "enable_auto_filters")
@@ -0,0 +1,107 @@
|
||||
"""associate index attempts with ccpair
|
||||
|
||||
Revision ID: 8a87bd6ec550
|
||||
Revises: 4ea2c93919c1
|
||||
Create Date: 2024-07-22 15:15:52.558451
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "8a87bd6ec550"
|
||||
down_revision = "4ea2c93919c1"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Add the new connector_credential_pair_id column
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column("connector_credential_pair_id", sa.Integer(), nullable=True),
|
||||
)
|
||||
|
||||
# Create a foreign key constraint to the connector_credential_pair table
|
||||
op.create_foreign_key(
|
||||
"fk_index_attempt_connector_credential_pair_id",
|
||||
"index_attempt",
|
||||
"connector_credential_pair",
|
||||
["connector_credential_pair_id"],
|
||||
["id"],
|
||||
)
|
||||
|
||||
# Populate the new connector_credential_pair_id column using existing connector_id and credential_id
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE index_attempt ia
|
||||
SET connector_credential_pair_id = (
|
||||
SELECT id FROM connector_credential_pair ccp
|
||||
WHERE
|
||||
(ia.connector_id IS NULL OR ccp.connector_id = ia.connector_id)
|
||||
AND (ia.credential_id IS NULL OR ccp.credential_id = ia.credential_id)
|
||||
LIMIT 1
|
||||
)
|
||||
WHERE ia.connector_id IS NOT NULL OR ia.credential_id IS NOT NULL
|
||||
"""
|
||||
)
|
||||
|
||||
# For good measure
|
||||
op.execute(
|
||||
"""
|
||||
DELETE FROM index_attempt
|
||||
WHERE connector_credential_pair_id IS NULL
|
||||
"""
|
||||
)
|
||||
|
||||
# Make the new connector_credential_pair_id column non-nullable
|
||||
op.alter_column("index_attempt", "connector_credential_pair_id", nullable=False)
|
||||
|
||||
# Drop the old connector_id and credential_id columns
|
||||
op.drop_column("index_attempt", "connector_id")
|
||||
op.drop_column("index_attempt", "credential_id")
|
||||
|
||||
# Update the index to use connector_credential_pair_id
|
||||
op.create_index(
|
||||
"ix_index_attempt_latest_for_connector_credential_pair",
|
||||
"index_attempt",
|
||||
["connector_credential_pair_id", "time_created"],
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Add back the old connector_id and credential_id columns
|
||||
op.add_column(
|
||||
"index_attempt", sa.Column("connector_id", sa.Integer(), nullable=True)
|
||||
)
|
||||
op.add_column(
|
||||
"index_attempt", sa.Column("credential_id", sa.Integer(), nullable=True)
|
||||
)
|
||||
|
||||
# Populate the old connector_id and credential_id columns using the connector_credential_pair_id
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE index_attempt ia
|
||||
SET connector_id = ccp.connector_id, credential_id = ccp.credential_id
|
||||
FROM connector_credential_pair ccp
|
||||
WHERE ia.connector_credential_pair_id = ccp.id
|
||||
"""
|
||||
)
|
||||
|
||||
# Make the old connector_id and credential_id columns non-nullable
|
||||
op.alter_column("index_attempt", "connector_id", nullable=False)
|
||||
op.alter_column("index_attempt", "credential_id", nullable=False)
|
||||
|
||||
# Drop the new connector_credential_pair_id column
|
||||
op.drop_constraint(
|
||||
"fk_index_attempt_connector_credential_pair_id",
|
||||
"index_attempt",
|
||||
type_="foreignkey",
|
||||
)
|
||||
op.drop_column("index_attempt", "connector_credential_pair_id")
|
||||
|
||||
op.create_index(
|
||||
"ix_index_attempt_latest_for_connector_credential_pair",
|
||||
"index_attempt",
|
||||
["connector_id", "credential_id", "time_created"],
|
||||
)
|
||||
backend/alembic/versions/91ffac7e65b3_add_expiry_time.py (new file, 26 lines)
@@ -0,0 +1,26 @@
"""add expiry time

Revision ID: 91ffac7e65b3
Revises: bc9771dccadf
Create Date: 2024-06-24 09:39:56.462242

"""

from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "91ffac7e65b3"
down_revision = "795b20b85b4b"
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
    op.add_column(
        "user", sa.Column("oidc_expiry", sa.DateTime(timezone=True), nullable=True)
    )


def downgrade() -> None:
    op.drop_column("user", "oidc_expiry")
@@ -0,0 +1,158 @@
|
||||
"""migration confluence to be explicit
|
||||
|
||||
Revision ID: a3795dce87be
|
||||
Revises: 1f60f60c3401
|
||||
Create Date: 2024-09-01 13:52:12.006740
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
from sqlalchemy.sql import table, column
|
||||
|
||||
revision = "a3795dce87be"
|
||||
down_revision = "1f60f60c3401"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
|
||||
from urllib.parse import urlparse
|
||||
|
||||
def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
|
||||
parsed_url = urlparse(wiki_url)
|
||||
wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split('/spaces')[0]}"
|
||||
path_parts = parsed_url.path.split("/")
|
||||
space = path_parts[3]
|
||||
page_id = path_parts[5] if len(path_parts) > 5 else ""
|
||||
return wiki_base, space, page_id
|
||||
|
||||
def _extract_confluence_keys_from_datacenter_url(
|
||||
wiki_url: str,
|
||||
) -> tuple[str, str, str]:
|
||||
DISPLAY = "/display/"
|
||||
PAGE = "/pages/"
|
||||
parsed_url = urlparse(wiki_url)
|
||||
wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split(DISPLAY)[0]}"
|
||||
space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
|
||||
page_id = ""
|
||||
if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
|
||||
page_id = content[1]
|
||||
return wiki_base, space, page_id
|
||||
|
||||
is_confluence_cloud = (
|
||||
".atlassian.net/wiki/spaces/" in wiki_url
|
||||
or ".jira.com/wiki/spaces/" in wiki_url
|
||||
)
|
||||
|
||||
if is_confluence_cloud:
|
||||
wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(wiki_url)
|
||||
else:
|
||||
wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
|
||||
wiki_url
|
||||
)
|
||||
|
||||
return wiki_base, space, page_id, is_confluence_cloud
|
||||
|
||||
|
||||
def reconstruct_confluence_url(
|
||||
wiki_base: str, space: str, page_id: str, is_cloud: bool
|
||||
) -> str:
|
||||
if is_cloud:
|
||||
url = f"{wiki_base}/spaces/{space}"
|
||||
if page_id:
|
||||
url += f"/pages/{page_id}"
|
||||
else:
|
||||
url = f"{wiki_base}/display/{space}"
|
||||
if page_id:
|
||||
url += f"/pages/{page_id}"
|
||||
return url
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
connector = table(
|
||||
"connector",
|
||||
column("id", sa.Integer),
|
||||
column("source", sa.String()),
|
||||
column("input_type", sa.String()),
|
||||
column("connector_specific_config", postgresql.JSONB),
|
||||
)
|
||||
|
||||
# Fetch all Confluence connectors
|
||||
connection = op.get_bind()
|
||||
confluence_connectors = connection.execute(
|
||||
sa.select(connector).where(
|
||||
sa.and_(
|
||||
connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
|
||||
)
|
||||
)
|
||||
).fetchall()
|
||||
|
||||
for row in confluence_connectors:
|
||||
config = row.connector_specific_config
|
||||
wiki_page_url = config["wiki_page_url"]
|
||||
wiki_base, space, page_id, is_cloud = extract_confluence_keys_from_url(
|
||||
wiki_page_url
|
||||
)
|
||||
|
||||
new_config = {
|
||||
"wiki_base": wiki_base,
|
||||
"space": space,
|
||||
"page_id": page_id,
|
||||
"is_cloud": is_cloud,
|
||||
}
|
||||
|
||||
for key, value in config.items():
|
||||
if key not in ["wiki_page_url"]:
|
||||
new_config[key] = value
|
||||
|
||||
op.execute(
|
||||
connector.update()
|
||||
.where(connector.c.id == row.id)
|
||||
.values(connector_specific_config=new_config)
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
connector = table(
|
||||
"connector",
|
||||
column("id", sa.Integer),
|
||||
column("source", sa.String()),
|
||||
column("input_type", sa.String()),
|
||||
column("connector_specific_config", postgresql.JSONB),
|
||||
)
|
||||
|
||||
confluence_connectors = (
|
||||
op.get_bind()
|
||||
.execute(
|
||||
sa.select(connector).where(
|
||||
connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
|
||||
)
|
||||
)
|
||||
.fetchall()
|
||||
)
|
||||
|
||||
for row in confluence_connectors:
|
||||
config = row.connector_specific_config
|
||||
if all(key in config for key in ["wiki_base", "space", "is_cloud"]):
|
||||
wiki_page_url = reconstruct_confluence_url(
|
||||
config["wiki_base"],
|
||||
config["space"],
|
||||
config.get("page_id", ""),
|
||||
config["is_cloud"],
|
||||
)
|
||||
|
||||
new_config = {"wiki_page_url": wiki_page_url}
|
||||
new_config.update(
|
||||
{
|
||||
k: v
|
||||
for k, v in config.items()
|
||||
if k not in ["wiki_base", "space", "page_id", "is_cloud"]
|
||||
}
|
||||
)
|
||||
|
||||
op.execute(
|
||||
connector.update()
|
||||
.where(connector.c.id == row.id)
|
||||
.values(connector_specific_config=new_config)
|
||||
)
|
||||
@@ -16,7 +16,6 @@ depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
op.alter_column(
|
||||
"connector_credential_pair",
|
||||
"last_attempt_status",
|
||||
@@ -29,11 +28,9 @@ def upgrade() -> None:
|
||||
),
|
||||
nullable=True,
|
||||
)
|
||||
# ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
op.alter_column(
|
||||
"connector_credential_pair",
|
||||
"last_attempt_status",
|
||||
@@ -46,4 +43,3 @@ def downgrade() -> None:
|
||||
),
|
||||
nullable=False,
|
||||
)
|
||||
# ### end Alembic commands ###
|
||||
|
||||
@@ -0,0 +1,23 @@
"""backfill is_internet data to False

Revision ID: b896bbd0d5a7
Revises: 44f856ae2a4a
Create Date: 2024-07-16 15:21:05.718571

"""
from alembic import op


# revision identifiers, used by Alembic.
revision = "b896bbd0d5a7"
down_revision = "44f856ae2a4a"
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
    op.execute("UPDATE search_doc SET is_internet = FALSE WHERE is_internet IS NULL")


def downgrade() -> None:
    pass
@@ -0,0 +1,26 @@
"""add support for litellm proxy in reranking

Revision ID: ba98eba0f66a
Revises: bceb1e139447
Create Date: 2024-09-06 10:36:04.507332

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "ba98eba0f66a"
down_revision = "bceb1e139447"
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
    op.add_column(
        "search_settings", sa.Column("rerank_api_url", sa.String(), nullable=True)
    )


def downgrade() -> None:
    op.drop_column("search_settings", "rerank_api_url")
@@ -0,0 +1,51 @@
|
||||
"""create usage reports table
|
||||
|
||||
Revision ID: bc9771dccadf
|
||||
Revises: 0568ccf46a6b
|
||||
Create Date: 2024-06-18 10:04:26.800282
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
import fastapi_users_db_sqlalchemy
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "bc9771dccadf"
|
||||
down_revision = "0568ccf46a6b"
|
||||
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"usage_reports",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column("report_name", sa.String(), nullable=False),
|
||||
sa.Column(
|
||||
"requestor_user_id",
|
||||
fastapi_users_db_sqlalchemy.generics.GUID(),
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column(
|
||||
"time_created",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("period_from", sa.DateTime(timezone=True), nullable=True),
|
||||
sa.Column("period_to", sa.DateTime(timezone=True), nullable=True),
|
||||
sa.ForeignKeyConstraint(
|
||||
["report_name"],
|
||||
["file_store.file_name"],
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["requestor_user_id"],
|
||||
["user.id"],
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_table("usage_reports")
|
||||
@@ -0,0 +1,26 @@
"""Add base_url to CloudEmbeddingProvider

Revision ID: bceb1e139447
Revises: a3795dce87be
Create Date: 2024-08-28 17:00:52.554580

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "bceb1e139447"
down_revision = "a3795dce87be"
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
    op.add_column(
        "embedding_provider", sa.Column("api_url", sa.String(), nullable=True)
    )


def downgrade() -> None:
    op.drop_column("embedding_provider", "api_url")
@@ -0,0 +1,43 @@
|
||||
"""non nullable default persona
|
||||
|
||||
Revision ID: bd2921608c3a
|
||||
Revises: 797089dfb4d2
|
||||
Create Date: 2024-09-20 10:28:37.992042
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "bd2921608c3a"
|
||||
down_revision = "797089dfb4d2"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Set existing NULL values to False
|
||||
op.execute(
|
||||
"UPDATE persona SET is_default_persona = FALSE WHERE is_default_persona IS NULL"
|
||||
)
|
||||
|
||||
# Alter the column to be not nullable with a default value of False
|
||||
op.alter_column(
|
||||
"persona",
|
||||
"is_default_persona",
|
||||
existing_type=sa.Boolean(),
|
||||
nullable=False,
|
||||
server_default=sa.text("false"),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Revert the changes
|
||||
op.alter_column(
|
||||
"persona",
|
||||
"is_default_persona",
|
||||
existing_type=sa.Boolean(),
|
||||
nullable=True,
|
||||
server_default=None,
|
||||
)
|
||||
@@ -0,0 +1,75 @@
|
||||
"""Add standard_answer tables
|
||||
|
||||
Revision ID: c18cdf4b497e
|
||||
Revises: 3a7802814195
|
||||
Create Date: 2024-06-06 15:15:02.000648
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "c18cdf4b497e"
|
||||
down_revision = "3a7802814195"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"standard_answer",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column("keyword", sa.String(), nullable=False),
|
||||
sa.Column("answer", sa.String(), nullable=False),
|
||||
sa.Column("active", sa.Boolean(), nullable=False),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
sa.UniqueConstraint("keyword"),
|
||||
)
|
||||
op.create_table(
|
||||
"standard_answer_category",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column("name", sa.String(), nullable=False),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
sa.UniqueConstraint("name"),
|
||||
)
|
||||
op.create_table(
|
||||
"standard_answer__standard_answer_category",
|
||||
sa.Column("standard_answer_id", sa.Integer(), nullable=False),
|
||||
sa.Column("standard_answer_category_id", sa.Integer(), nullable=False),
|
||||
sa.ForeignKeyConstraint(
|
||||
["standard_answer_category_id"],
|
||||
["standard_answer_category.id"],
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["standard_answer_id"],
|
||||
["standard_answer.id"],
|
||||
),
|
||||
sa.PrimaryKeyConstraint("standard_answer_id", "standard_answer_category_id"),
|
||||
)
|
||||
op.create_table(
|
||||
"slack_bot_config__standard_answer_category",
|
||||
sa.Column("slack_bot_config_id", sa.Integer(), nullable=False),
|
||||
sa.Column("standard_answer_category_id", sa.Integer(), nullable=False),
|
||||
sa.ForeignKeyConstraint(
|
||||
["slack_bot_config_id"],
|
||||
["slack_bot_config.id"],
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["standard_answer_category_id"],
|
||||
["standard_answer_category.id"],
|
||||
),
|
||||
sa.PrimaryKeyConstraint("slack_bot_config_id", "standard_answer_category_id"),
|
||||
)
|
||||
|
||||
op.add_column(
|
||||
"chat_session", sa.Column("slack_thread_id", sa.String(), nullable=True)
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("chat_session", "slack_thread_id")
|
||||
|
||||
op.drop_table("slack_bot_config__standard_answer_category")
|
||||
op.drop_table("standard_answer__standard_answer_category")
|
||||
op.drop_table("standard_answer_category")
|
||||
op.drop_table("standard_answer")
|
||||
@@ -0,0 +1,57 @@
|
||||
"""Add index_attempt_errors table
|
||||
|
||||
Revision ID: c5b692fa265c
|
||||
Revises: 4a951134c801
|
||||
Create Date: 2024-08-08 14:06:39.581972
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "c5b692fa265c"
|
||||
down_revision = "4a951134c801"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"index_attempt_errors",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column("index_attempt_id", sa.Integer(), nullable=True),
|
||||
sa.Column("batch", sa.Integer(), nullable=True),
|
||||
sa.Column(
|
||||
"doc_summaries",
|
||||
postgresql.JSONB(astext_type=sa.Text()),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("error_msg", sa.Text(), nullable=True),
|
||||
sa.Column("traceback", sa.Text(), nullable=True),
|
||||
sa.Column(
|
||||
"time_created",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["index_attempt_id"],
|
||||
["index_attempt.id"],
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
op.create_index(
|
||||
"index_attempt_id",
|
||||
"index_attempt_errors",
|
||||
["time_created"],
|
||||
unique=False,
|
||||
)
|
||||
# ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
op.drop_index("index_attempt_id", table_name="index_attempt_errors")
|
||||
op.drop_table("index_attempt_errors")
|
||||
# ### end Alembic commands ###
|
||||
@@ -0,0 +1,31 @@
|
||||
"""add nullable to persona id in Chat Session
|
||||
|
||||
Revision ID: c99d76fcd298
|
||||
Revises: 5c7fdadae813
|
||||
Create Date: 2024-07-09 19:27:01.579697
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "c99d76fcd298"
|
||||
down_revision = "5c7fdadae813"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.alter_column(
|
||||
"chat_session", "persona_id", existing_type=sa.INTEGER(), nullable=True
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.alter_column(
|
||||
"chat_session",
|
||||
"persona_id",
|
||||
existing_type=sa.INTEGER(),
|
||||
nullable=False,
|
||||
)
|
||||
@@ -19,6 +19,9 @@ depends_on: None = None
def upgrade() -> None:
    op.drop_table("deletion_attempt")

    # Remove the DeletionStatus enum
    op.execute("DROP TYPE IF EXISTS deletionstatus;")


def downgrade() -> None:
    op.create_table(

@@ -0,0 +1,45 @@
|
||||
"""combined slack id fields
|
||||
|
||||
Revision ID: d716b0791ddd
|
||||
Revises: 7aea705850d5
|
||||
Create Date: 2024-07-10 17:57:45.630550
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "d716b0791ddd"
|
||||
down_revision = "7aea705850d5"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE slack_bot_config
|
||||
SET channel_config = jsonb_set(
|
||||
channel_config,
|
||||
'{respond_member_group_list}',
|
||||
coalesce(channel_config->'respond_team_member_list', '[]'::jsonb) ||
|
||||
coalesce(channel_config->'respond_slack_group_list', '[]'::jsonb)
|
||||
) - 'respond_team_member_list' - 'respond_slack_group_list'
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE slack_bot_config
|
||||
SET channel_config = jsonb_set(
|
||||
jsonb_set(
|
||||
channel_config - 'respond_member_group_list',
|
||||
'{respond_team_member_list}',
|
||||
'[]'::jsonb
|
||||
),
|
||||
'{respond_slack_group_list}',
|
||||
'[]'::jsonb
|
||||
)
|
||||
"""
|
||||
)
|
||||
@@ -0,0 +1,31 @@
|
||||
"""Remove _alt suffix from model_name
|
||||
|
||||
Revision ID: d9ec13955951
|
||||
Revises: da4c21c69164
|
||||
Create Date: 2024-08-20 16:31:32.955686
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "d9ec13955951"
|
||||
down_revision = "da4c21c69164"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE embedding_model
|
||||
SET model_name = regexp_replace(model_name, '__danswer_alt_index$', '')
|
||||
WHERE model_name LIKE '%__danswer_alt_index'
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# We can't reliably add the __danswer_alt_index suffix back, so we'll leave this empty
|
||||
pass
|
||||
@@ -0,0 +1,65 @@
|
||||
"""chosen_assistants changed to jsonb
|
||||
|
||||
Revision ID: da4c21c69164
|
||||
Revises: c5b692fa265c
|
||||
Create Date: 2024-08-18 19:06:47.291491
|
||||
|
||||
"""
|
||||
import json
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "da4c21c69164"
|
||||
down_revision = "c5b692fa265c"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
conn = op.get_bind()
|
||||
existing_ids_and_chosen_assistants = conn.execute(
|
||||
sa.text("select id, chosen_assistants from public.user")
|
||||
)
|
||||
op.drop_column(
|
||||
"user",
|
||||
"chosen_assistants",
|
||||
)
|
||||
op.add_column(
|
||||
"user",
|
||||
sa.Column(
|
||||
"chosen_assistants",
|
||||
postgresql.JSONB(astext_type=sa.Text()),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
for id, chosen_assistants in existing_ids_and_chosen_assistants:
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"update public.user set chosen_assistants = :chosen_assistants where id = :id"
|
||||
),
|
||||
{"chosen_assistants": json.dumps(chosen_assistants), "id": id},
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
conn = op.get_bind()
|
||||
existing_ids_and_chosen_assistants = conn.execute(
|
||||
sa.text("select id, chosen_assistants from public.user")
|
||||
)
|
||||
op.drop_column(
|
||||
"user",
|
||||
"chosen_assistants",
|
||||
)
|
||||
op.add_column(
|
||||
"user",
|
||||
sa.Column("chosen_assistants", postgresql.ARRAY(sa.Integer()), nullable=True),
|
||||
)
|
||||
for id, chosen_assistants in existing_ids_and_chosen_assistants:
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"update public.user set chosen_assistants = :chosen_assistants where id = :id"
|
||||
),
|
||||
{"chosen_assistants": chosen_assistants, "id": id},
|
||||
)
|
||||
@@ -9,7 +9,7 @@ from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy import table, column, String, Integer, Boolean
|
||||
|
||||
from danswer.db.embedding_model import (
|
||||
from danswer.db.search_settings import (
|
||||
get_new_default_embedding_model,
|
||||
get_old_default_embedding_model,
|
||||
user_has_overridden_embedding_model,
|
||||
@@ -71,14 +71,14 @@ def upgrade() -> None:
|
||||
"query_prefix": old_embedding_model.query_prefix,
|
||||
"passage_prefix": old_embedding_model.passage_prefix,
|
||||
"index_name": old_embedding_model.index_name,
|
||||
"status": old_embedding_model.status,
|
||||
"status": IndexModelStatus.PRESENT,
|
||||
}
|
||||
],
|
||||
)
|
||||
# if the user has not overridden the default embedding model via env variables,
|
||||
# insert the new default model into the database to auto-upgrade them
|
||||
if not user_has_overridden_embedding_model():
|
||||
new_embedding_model = get_new_default_embedding_model(is_present=False)
|
||||
new_embedding_model = get_new_default_embedding_model()
|
||||
op.bulk_insert(
|
||||
EmbeddingModel,
|
||||
[
|
||||
@@ -136,4 +136,4 @@ def downgrade() -> None:
|
||||
)
|
||||
op.drop_column("index_attempt", "embedding_model_id")
|
||||
op.drop_table("embedding_model")
|
||||
op.execute("DROP TYPE indexmodelstatus;")
|
||||
op.execute("DROP TYPE IF EXISTS indexmodelstatus;")
|
||||
|
||||
backend/alembic/versions/e1392f05e840_added_input_prompts.py (new file, 58 lines)
@@ -0,0 +1,58 @@
|
||||
"""Added input prompts
|
||||
|
||||
Revision ID: e1392f05e840
|
||||
Revises: 08a1eda20fe1
|
||||
Create Date: 2024-07-13 19:09:22.556224
|
||||
|
||||
"""
|
||||
|
||||
import fastapi_users_db_sqlalchemy
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "e1392f05e840"
|
||||
down_revision = "08a1eda20fe1"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"inputprompt",
|
||||
sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
|
||||
sa.Column("prompt", sa.String(), nullable=False),
|
||||
sa.Column("content", sa.String(), nullable=False),
|
||||
sa.Column("active", sa.Boolean(), nullable=False),
|
||||
sa.Column("is_public", sa.Boolean(), nullable=False),
|
||||
sa.Column(
|
||||
"user_id",
|
||||
fastapi_users_db_sqlalchemy.generics.GUID(),
|
||||
nullable=True,
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["user_id"],
|
||||
["user.id"],
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
op.create_table(
|
||||
"inputprompt__user",
|
||||
sa.Column("input_prompt_id", sa.Integer(), nullable=False),
|
||||
sa.Column("user_id", sa.Integer(), nullable=False),
|
||||
sa.ForeignKeyConstraint(
|
||||
["input_prompt_id"],
|
||||
["inputprompt.id"],
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["user_id"],
|
||||
["inputprompt.id"],
|
||||
),
|
||||
sa.PrimaryKeyConstraint("input_prompt_id", "user_id"),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_table("inputprompt__user")
|
||||
op.drop_table("inputprompt")
|
||||
@@ -0,0 +1,22 @@
"""added-prune-frequency

Revision ID: e209dc5a8156
Revises: 48d14957fe80
Create Date: 2024-06-16 16:02:35.273231

"""
from alembic import op
import sqlalchemy as sa

revision = "e209dc5a8156"
down_revision = "48d14957fe80"
branch_labels = None  # type: ignore
depends_on = None  # type: ignore


def upgrade() -> None:
    op.add_column("connector", sa.Column("prune_freq", sa.Integer(), nullable=True))


def downgrade() -> None:
    op.drop_column("connector", "prune_freq")
@@ -0,0 +1,28 @@
"""Added alternate model to chat message

Revision ID: ee3f4b47fad5
Revises: 2d2304e27d8c
Create Date: 2024-08-12 00:11:50.915845

"""

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "ee3f4b47fad5"
down_revision = "2d2304e27d8c"
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
    op.add_column(
        "chat_message",
        sa.Column("overridden_model", sa.String(length=255), nullable=True),
    )


def downgrade() -> None:
    op.drop_column("chat_message", "overridden_model")
@@ -0,0 +1,32 @@
|
||||
"""standard answer match_regex flag
|
||||
|
||||
Revision ID: efb35676026c
|
||||
Revises: 0ebb1d516877
|
||||
Create Date: 2024-09-11 13:55:46.101149
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "efb35676026c"
|
||||
down_revision = "0ebb1d516877"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
op.add_column(
|
||||
"standard_answer",
|
||||
sa.Column(
|
||||
"match_regex", sa.Boolean(), nullable=False, server_default=sa.false()
|
||||
),
|
||||
)
|
||||
# ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
op.drop_column("standard_answer", "match_regex")
|
||||
# ### end Alembic commands ###
|
||||
@@ -0,0 +1,172 @@
|
||||
"""embedding provider by provider type
|
||||
|
||||
Revision ID: f17bf3b0d9f1
|
||||
Revises: 351faebd379d
|
||||
Create Date: 2024-08-21 13:13:31.120460
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "f17bf3b0d9f1"
|
||||
down_revision = "351faebd379d"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Add provider_type column to embedding_provider
|
||||
op.add_column(
|
||||
"embedding_provider",
|
||||
sa.Column("provider_type", sa.String(50), nullable=True),
|
||||
)
|
||||
|
||||
# Update provider_type with existing name values
|
||||
op.execute("UPDATE embedding_provider SET provider_type = UPPER(name)")
|
||||
|
||||
# Make provider_type not nullable
|
||||
op.alter_column("embedding_provider", "provider_type", nullable=False)
|
||||
|
||||
# Drop the foreign key constraint in embedding_model table
|
||||
op.drop_constraint(
|
||||
"fk_embedding_model_cloud_provider", "embedding_model", type_="foreignkey"
|
||||
)
|
||||
|
||||
# Drop the existing primary key constraint
|
||||
op.drop_constraint("embedding_provider_pkey", "embedding_provider", type_="primary")
|
||||
|
||||
# Create a new primary key constraint on provider_type
|
||||
op.create_primary_key(
|
||||
"embedding_provider_pkey", "embedding_provider", ["provider_type"]
|
||||
)
|
||||
|
||||
# Add provider_type column to embedding_model
|
||||
op.add_column(
|
||||
"embedding_model",
|
||||
sa.Column("provider_type", sa.String(50), nullable=True),
|
||||
)
|
||||
|
||||
# Update provider_type for existing embedding models
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE embedding_model
|
||||
SET provider_type = (
|
||||
SELECT provider_type
|
||||
FROM embedding_provider
|
||||
WHERE embedding_provider.id = embedding_model.cloud_provider_id
|
||||
)
|
||||
"""
|
||||
)
|
||||
|
||||
# Drop the old id column from embedding_provider
|
||||
op.drop_column("embedding_provider", "id")
|
||||
|
||||
# Drop the name column from embedding_provider
|
||||
op.drop_column("embedding_provider", "name")
|
||||
|
||||
# Drop the default_model_id column from embedding_provider
|
||||
op.drop_column("embedding_provider", "default_model_id")
|
||||
|
||||
# Drop the old cloud_provider_id column from embedding_model
|
||||
op.drop_column("embedding_model", "cloud_provider_id")
|
||||
|
||||
# Create the new foreign key constraint
|
||||
op.create_foreign_key(
|
||||
"fk_embedding_model_cloud_provider",
|
||||
"embedding_model",
|
||||
"embedding_provider",
|
||||
["provider_type"],
|
||||
["provider_type"],
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop the foreign key constraint in embedding_model table
|
||||
op.drop_constraint(
|
||||
"fk_embedding_model_cloud_provider", "embedding_model", type_="foreignkey"
|
||||
)
|
||||
|
||||
# Add back the cloud_provider_id column to embedding_model
|
||||
op.add_column(
|
||||
"embedding_model", sa.Column("cloud_provider_id", sa.Integer(), nullable=True)
|
||||
)
|
||||
op.add_column("embedding_provider", sa.Column("id", sa.Integer(), nullable=True))
|
||||
|
||||
# Assign incrementing IDs to embedding providers
|
||||
op.execute(
|
||||
"""
|
||||
CREATE SEQUENCE IF NOT EXISTS embedding_provider_id_seq;"""
|
||||
)
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE embedding_provider SET id = nextval('embedding_provider_id_seq');
|
||||
"""
|
||||
)
|
||||
|
||||
# Update cloud_provider_id based on provider_type
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE embedding_model
|
||||
SET cloud_provider_id = CASE
|
||||
WHEN provider_type IS NULL THEN NULL
|
||||
ELSE (
|
||||
SELECT id
|
||||
FROM embedding_provider
|
||||
WHERE embedding_provider.provider_type = embedding_model.provider_type
|
||||
)
|
||||
END
|
||||
"""
|
||||
)
|
||||
|
||||
# Drop the provider_type column from embedding_model
|
||||
op.drop_column("embedding_model", "provider_type")
|
||||
|
||||
# Add back the columns to embedding_provider
|
||||
op.add_column("embedding_provider", sa.Column("name", sa.String(50), nullable=True))
|
||||
op.add_column(
|
||||
"embedding_provider", sa.Column("default_model_id", sa.Integer(), nullable=True)
|
||||
)
|
||||
|
||||
# Drop the existing primary key constraint on provider_type
|
||||
op.drop_constraint("embedding_provider_pkey", "embedding_provider", type_="primary")
|
||||
|
||||
# Create the original primary key constraint on id
|
||||
op.create_primary_key("embedding_provider_pkey", "embedding_provider", ["id"])
|
||||
|
||||
# Update name with existing provider_type values
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE embedding_provider
|
||||
SET name = CASE
|
||||
WHEN provider_type = 'OPENAI' THEN 'OpenAI'
|
||||
WHEN provider_type = 'COHERE' THEN 'Cohere'
|
||||
WHEN provider_type = 'GOOGLE' THEN 'Google'
|
||||
WHEN provider_type = 'VOYAGE' THEN 'Voyage'
|
||||
ELSE provider_type
|
||||
END
|
||||
"""
|
||||
)
|
||||
|
||||
# Drop the provider_type column from embedding_provider
|
||||
op.drop_column("embedding_provider", "provider_type")
|
||||
|
||||
# Recreate the foreign key constraint in embedding_model table
|
||||
op.create_foreign_key(
|
||||
"fk_embedding_model_cloud_provider",
|
||||
"embedding_model",
|
||||
"embedding_provider",
|
||||
["cloud_provider_id"],
|
||||
["id"],
|
||||
)
|
||||
|
||||
# Recreate the foreign key constraint in embedding_model table
|
||||
op.create_foreign_key(
|
||||
"fk_embedding_provider_default_model",
|
||||
"embedding_provider",
|
||||
"embedding_model",
|
||||
["default_model_id"],
|
||||
["id"],
|
||||
)
|
||||
@@ -0,0 +1,26 @@
"""add custom headers to tools

Revision ID: f32615f71aeb
Revises: bd2921608c3a
Create Date: 2024-09-12 20:26:38.932377

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "f32615f71aeb"
down_revision = "bd2921608c3a"
branch_labels = None
depends_on = None


def upgrade() -> None:
    op.add_column(
        "tool", sa.Column("custom_headers", postgresql.JSONB(), nullable=True)
    )


def downgrade() -> None:
    op.drop_column("tool", "custom_headers")
@@ -0,0 +1,26 @@
"""add has_web_login column to user

Revision ID: f7e58d357687
Revises: ba98eba0f66a
Create Date: 2024-09-07 20:20:54.522620

"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "f7e58d357687"
down_revision = "ba98eba0f66a"
branch_labels: None = None
depends_on: None = None


def upgrade() -> None:
    op.add_column(
        "user",
        sa.Column("has_web_login", sa.Boolean(), nullable=False, server_default="true"),
    )


def downgrade() -> None:
    op.drop_column("user", "has_web_login")
backend/assets/.gitignore (new file, vendored, 2 lines)
@@ -0,0 +1,2 @@
*
!.gitignore
@@ -1,49 +1,95 @@
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.access.models import DocumentAccess
|
||||
from danswer.access.utils import prefix_user_email
|
||||
from danswer.configs.constants import PUBLIC_DOC_PAT
|
||||
from danswer.db.document import get_acccess_info_for_documents
|
||||
from danswer.db.document import get_access_info_for_document
|
||||
from danswer.db.document import get_access_info_for_documents
|
||||
from danswer.db.models import User
|
||||
from danswer.server.documents.models import ConnectorCredentialPairIdentifier
|
||||
from danswer.utils.variable_functionality import fetch_versioned_implementation
|
||||
|
||||
|
||||
def _get_access_for_document(
|
||||
document_id: str,
|
||||
db_session: Session,
|
||||
) -> DocumentAccess:
|
||||
info = get_access_info_for_document(
|
||||
db_session=db_session,
|
||||
document_id=document_id,
|
||||
)
|
||||
|
||||
return DocumentAccess.build(
|
||||
user_emails=info[1] if info and info[1] else [],
|
||||
user_groups=[],
|
||||
external_user_emails=[],
|
||||
external_user_group_ids=[],
|
||||
is_public=info[2] if info else False,
|
||||
)
|
||||
|
||||
|
||||
def get_access_for_document(
|
||||
document_id: str,
|
||||
db_session: Session,
|
||||
) -> DocumentAccess:
|
||||
versioned_get_access_for_document_fn = fetch_versioned_implementation(
|
||||
"danswer.access.access", "_get_access_for_document"
|
||||
)
|
||||
return versioned_get_access_for_document_fn(document_id, db_session) # type: ignore
|
||||
|
||||
|
||||
def get_null_document_access() -> DocumentAccess:
|
||||
return DocumentAccess(
|
||||
user_emails=set(),
|
||||
user_groups=set(),
|
||||
is_public=False,
|
||||
external_user_emails=set(),
|
||||
external_user_group_ids=set(),
|
||||
)
|
||||
|
||||
|
||||
def _get_access_for_documents(
|
||||
document_ids: list[str],
|
||||
db_session: Session,
|
||||
cc_pair_to_delete: ConnectorCredentialPairIdentifier | None = None,
|
||||
) -> dict[str, DocumentAccess]:
|
||||
document_access_info = get_acccess_info_for_documents(
|
||||
document_access_info = get_access_info_for_documents(
|
||||
db_session=db_session,
|
||||
document_ids=document_ids,
|
||||
cc_pair_to_delete=cc_pair_to_delete,
|
||||
)
|
||||
return {
|
||||
document_id: DocumentAccess.build(user_ids, is_public)
|
||||
for document_id, user_ids, is_public in document_access_info
|
||||
doc_access = {
|
||||
document_id: DocumentAccess(
|
||||
user_emails=set([email for email in user_emails if email]),
|
||||
# MIT version will wipe all groups and external groups on update
|
||||
user_groups=set(),
|
||||
is_public=is_public,
|
||||
external_user_emails=set(),
|
||||
external_user_group_ids=set(),
|
||||
)
|
||||
for document_id, user_emails, is_public in document_access_info
|
||||
}
|
||||
|
||||
# Sometimes the document has not be indexed by the indexing job yet, in those cases
|
||||
# the document does not exist and so we use least permissive. Specifically the EE version
|
||||
# checks the MIT version permissions and creates a superset. This ensures that this flow
|
||||
# does not fail even if the Document has not yet been indexed.
|
||||
for doc_id in document_ids:
|
||||
if doc_id not in doc_access:
|
||||
doc_access[doc_id] = get_null_document_access()
|
||||
return doc_access
|
||||
|
||||
|
||||
def get_access_for_documents(
|
||||
document_ids: list[str],
|
||||
db_session: Session,
|
||||
cc_pair_to_delete: ConnectorCredentialPairIdentifier | None = None,
|
||||
) -> dict[str, DocumentAccess]:
|
||||
"""Fetches all access information for the given documents."""
|
||||
versioned_get_access_for_documents_fn = fetch_versioned_implementation(
|
||||
"danswer.access.access", "_get_access_for_documents"
|
||||
)
|
||||
return versioned_get_access_for_documents_fn(
|
||||
document_ids, db_session, cc_pair_to_delete
|
||||
document_ids, db_session
|
||||
) # type: ignore
|
||||
|
||||
|
||||
def prefix_user(user_id: str) -> str:
|
||||
"""Prefixes a user ID to eliminate collision with group names.
|
||||
This assumes that groups are prefixed with a different prefix."""
|
||||
return f"user_id:{user_id}"
|
||||
|
||||
|
||||
def _get_acl_for_user(user: User | None, db_session: Session) -> set[str]:
|
||||
"""Returns a list of ACL entries that the user has access to. This is meant to be
|
||||
used downstream to filter out documents that the user does not have access to. The
|
||||
@@ -51,7 +97,7 @@ def _get_acl_for_user(user: User | None, db_session: Session) -> set[str]:
|
||||
matches one entry in the returned set.
|
||||
"""
|
||||
if user:
|
||||
return {prefix_user(str(user.id)), PUBLIC_DOC_PAT}
|
||||
return {prefix_user_email(user.email), PUBLIC_DOC_PAT}
|
||||
return {PUBLIC_DOC_PAT}
|
||||
|
||||
|
||||
|
||||
@@ -1,20 +1,72 @@
|
||||
from dataclasses import dataclass
|
||||
from uuid import UUID
|
||||
|
||||
from danswer.access.utils import prefix_external_group
|
||||
from danswer.access.utils import prefix_user_email
|
||||
from danswer.access.utils import prefix_user_group
|
||||
from danswer.configs.constants import PUBLIC_DOC_PAT
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class DocumentAccess:
|
||||
user_ids: set[str] # stringified UUIDs
|
||||
class ExternalAccess:
|
||||
# Emails of external users with access to the doc externally
|
||||
external_user_emails: set[str]
|
||||
# Names or external IDs of groups with access to the doc
|
||||
external_user_group_ids: set[str]
|
||||
# Whether the document is public in the external system or Danswer
|
||||
is_public: bool
|
||||
|
||||
def to_acl(self) -> list[str]:
|
||||
return list(self.user_ids) + ([PUBLIC_DOC_PAT] if self.is_public else [])
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class DocumentAccess(ExternalAccess):
|
||||
# User emails for Danswer users, None indicates admin
|
||||
user_emails: set[str | None]
|
||||
# Names of user groups associated with this document
|
||||
user_groups: set[str]
|
||||
|
||||
def to_acl(self) -> set[str]:
|
||||
return set(
|
||||
[
|
||||
prefix_user_email(user_email)
|
||||
for user_email in self.user_emails
|
||||
if user_email
|
||||
]
|
||||
+ [prefix_user_group(group_name) for group_name in self.user_groups]
|
||||
+ [
|
||||
prefix_user_email(user_email)
|
||||
for user_email in self.external_user_emails
|
||||
]
|
||||
+ [
|
||||
# The group names are already prefixed by the source type
|
||||
# This adds an additional prefix of "external_group:"
|
||||
prefix_external_group(group_name)
|
||||
for group_name in self.external_user_group_ids
|
||||
]
|
||||
+ ([PUBLIC_DOC_PAT] if self.is_public else [])
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def build(cls, user_ids: list[UUID | None], is_public: bool) -> "DocumentAccess":
|
||||
def build(
|
||||
cls,
|
||||
user_emails: list[str | None],
|
||||
user_groups: list[str],
|
||||
external_user_emails: list[str],
|
||||
external_user_group_ids: list[str],
|
||||
is_public: bool,
|
||||
) -> "DocumentAccess":
|
||||
return cls(
|
||||
user_ids={str(user_id) for user_id in user_ids if user_id},
|
||||
external_user_emails={
|
||||
prefix_user_email(external_email)
|
||||
for external_email in external_user_emails
|
||||
},
|
||||
external_user_group_ids={
|
||||
prefix_external_group(external_group_id)
|
||||
for external_group_id in external_user_group_ids
|
||||
},
|
||||
user_emails={
|
||||
prefix_user_email(user_email)
|
||||
for user_email in user_emails
|
||||
if user_email
|
||||
},
|
||||
user_groups=set(user_groups),
|
||||
is_public=is_public,
|
||||
)
|
||||
|
||||
backend/danswer/access/utils.py (new file, 24 lines)
@@ -0,0 +1,24 @@
from danswer.configs.constants import DocumentSource


def prefix_user_email(user_email: str) -> str:
    """Prefixes a user email to eliminate collision with group names.
    This applies to both a Danswer user and an External user, this is to make the query time
    more efficient"""
    return f"user_email:{user_email}"


def prefix_user_group(user_group_name: str) -> str:
    """Prefixes a user group name to eliminate collision with user emails.
    This assumes that user ids are prefixed with a different prefix."""
    return f"group:{user_group_name}"


def prefix_external_group(ext_group_name: str) -> str:
    """Prefixes an external group name to eliminate collision with user emails / Danswer groups."""
    return f"external_group:{ext_group_name}"


def prefix_group_w_source(ext_group_name: str, source: DocumentSource) -> str:
    """External groups may collide across sources, every source needs its own prefix."""
    return f"{source.value.upper()}_{ext_group_name}"
backend/danswer/auth/invited_users.py (new file, 20 lines)
@@ -0,0 +1,20 @@
from typing import cast

from danswer.configs.constants import KV_USER_STORE_KEY
from danswer.dynamic_configs.factory import get_dynamic_config_store
from danswer.dynamic_configs.interface import ConfigNotFoundError
from danswer.dynamic_configs.interface import JSON_ro


def get_invited_users() -> list[str]:
    try:
        store = get_dynamic_config_store()
        return cast(list, store.load(KV_USER_STORE_KEY))
    except ConfigNotFoundError:
        return list()


def write_invited_users(emails: list[str]) -> int:
    store = get_dynamic_config_store()
    store.store(KV_USER_STORE_KEY, cast(JSON_ro, emails))
    return len(emails)
@@ -3,29 +3,27 @@ from typing import Any
|
||||
from typing import cast
|
||||
|
||||
from danswer.auth.schemas import UserRole
|
||||
from danswer.configs.constants import KV_NO_AUTH_USER_PREFERENCES_KEY
|
||||
from danswer.dynamic_configs.store import ConfigNotFoundError
|
||||
from danswer.dynamic_configs.store import DynamicConfigStore
|
||||
from danswer.server.manage.models import UserInfo
|
||||
from danswer.server.manage.models import UserPreferences
|
||||
|
||||
|
||||
NO_AUTH_USER_PREFERENCES_KEY = "no_auth_user_preferences"
|
||||
|
||||
|
||||
def set_no_auth_user_preferences(
|
||||
store: DynamicConfigStore, preferences: UserPreferences
|
||||
) -> None:
|
||||
store.store(NO_AUTH_USER_PREFERENCES_KEY, preferences.dict())
|
||||
store.store(KV_NO_AUTH_USER_PREFERENCES_KEY, preferences.model_dump())
|
||||
|
||||
|
||||
def load_no_auth_user_preferences(store: DynamicConfigStore) -> UserPreferences:
|
||||
try:
|
||||
preferences_data = cast(
|
||||
Mapping[str, Any], store.load(NO_AUTH_USER_PREFERENCES_KEY)
|
||||
Mapping[str, Any], store.load(KV_NO_AUTH_USER_PREFERENCES_KEY)
|
||||
)
|
||||
return UserPreferences(**preferences_data)
|
||||
except ConfigNotFoundError:
|
||||
return UserPreferences(chosen_assistants=None)
|
||||
return UserPreferences(chosen_assistants=None, default_model=None)
|
||||
|
||||
|
||||
def fetch_no_auth_user(store: DynamicConfigStore) -> UserInfo:
|
||||
|
||||
@@ -5,8 +5,26 @@ from fastapi_users import schemas
|
||||
|
||||
|
||||
class UserRole(str, Enum):
|
||||
"""
|
||||
User roles
|
||||
- Basic can't perform any admin actions
|
||||
- Admin can perform all admin actions
|
||||
- Curator can perform admin actions for
|
||||
groups they are curators of
|
||||
- Global Curator can perform admin actions
|
||||
for all groups they are a member of
|
||||
"""
|
||||
|
||||
BASIC = "basic"
|
||||
ADMIN = "admin"
|
||||
CURATOR = "curator"
|
||||
GLOBAL_CURATOR = "global_curator"
|
||||
|
||||
|
||||
class UserStatus(str, Enum):
|
||||
LIVE = "live"
|
||||
INVITED = "invited"
|
||||
DEACTIVATED = "deactivated"
|
||||
|
||||
|
||||
class UserRead(schemas.BaseUser[uuid.UUID]):
|
||||
@@ -15,7 +33,9 @@ class UserRead(schemas.BaseUser[uuid.UUID]):
|
||||
|
||||
class UserCreate(schemas.BaseUserCreate):
|
||||
role: UserRole = UserRole.BASIC
|
||||
has_web_login: bool | None = True
|
||||
|
||||
|
||||
class UserUpdate(schemas.BaseUserUpdate):
|
||||
role: UserRole
|
||||
has_web_login: bool | None = True
|
||||
|
||||
@@ -1,19 +1,24 @@
import os
import smtplib
import uuid
from collections.abc import AsyncGenerator
from datetime import datetime
from datetime import timezone
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from typing import Optional
from typing import Tuple

from email_validator import EmailNotValidError
from email_validator import validate_email
from fastapi import APIRouter
from fastapi import Depends
from fastapi import HTTPException
from fastapi import Request
from fastapi import Response
from fastapi import status
from fastapi.security import OAuth2PasswordRequestForm
from fastapi_users import BaseUserManager
from fastapi_users import exceptions
from fastapi_users import FastAPIUsers
from fastapi_users import models
from fastapi_users import schemas
@@ -27,8 +32,10 @@ from fastapi_users.openapi import OpenAPIResponseType
from fastapi_users_db_sqlalchemy import SQLAlchemyUserDatabase
from sqlalchemy.orm import Session

from danswer.auth.invited_users import get_invited_users
from danswer.auth.schemas import UserCreate
from danswer.auth.schemas import UserRole
from danswer.auth.schemas import UserUpdate
from danswer.configs.app_configs import AUTH_TYPE
from danswer.configs.app_configs import DISABLE_AUTH
from danswer.configs.app_configs import EMAIL_FROM
@@ -38,6 +45,7 @@ from danswer.configs.app_configs import SMTP_PASS
from danswer.configs.app_configs import SMTP_PORT
from danswer.configs.app_configs import SMTP_SERVER
from danswer.configs.app_configs import SMTP_USER
from danswer.configs.app_configs import TRACK_EXTERNAL_IDP_EXPIRY
from danswer.configs.app_configs import USER_AUTH_SECRET
from danswer.configs.app_configs import VALID_EMAIL_DOMAINS
from danswer.configs.app_configs import WEB_DOMAIN
@@ -46,21 +54,28 @@ from danswer.configs.constants import DANSWER_API_KEY_DUMMY_EMAIL_DOMAIN
from danswer.configs.constants import DANSWER_API_KEY_PREFIX
from danswer.configs.constants import UNNAMED_KEY_PLACEHOLDER
from danswer.db.auth import get_access_token_db
from danswer.db.auth import get_default_admin_user_emails
from danswer.db.auth import get_user_count
from danswer.db.auth import get_user_db
from danswer.db.engine import get_session
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.models import AccessToken
from danswer.db.models import User
from danswer.db.users import get_user_by_email
from danswer.utils.logger import setup_logger
from danswer.utils.telemetry import optional_telemetry
from danswer.utils.telemetry import RecordType
from danswer.utils.variable_functionality import fetch_versioned_implementation


logger = setup_logger()

USER_WHITELIST_FILE = "/home/danswer_whitelist.txt"
_user_whitelist: list[str] | None = None

def is_user_admin(user: User | None) -> bool:
    if AUTH_TYPE == AuthType.DISABLED:
        return True
    if user and user.role == UserRole.ADMIN:
        return True
    return False


def verify_auth_setting() -> None:
@@ -69,7 +84,7 @@ def verify_auth_setting() -> None:
            "User must choose a valid user authentication method: "
            "disabled, basic, or google_oauth"
        )
    logger.info(f"Using Auth Type: {AUTH_TYPE.value}")
    logger.notice(f"Using Auth Type: {AUTH_TYPE.value}")


def get_display_email(email: str | None, space_less: bool = False) -> str:
@@ -92,22 +107,36 @@ def user_needs_to_be_verified() -> bool:
    return AUTH_TYPE != AuthType.BASIC or REQUIRE_EMAIL_VERIFICATION


def get_user_whitelist() -> list[str]:
    global _user_whitelist
    if _user_whitelist is None:
        if os.path.exists(USER_WHITELIST_FILE):
            with open(USER_WHITELIST_FILE, "r") as file:
                _user_whitelist = [line.strip() for line in file]
        else:
            _user_whitelist = []
def verify_email_is_invited(email: str) -> None:
    whitelist = get_invited_users()
    if not whitelist:
        return

    return _user_whitelist
    if not email:
        raise PermissionError("Email must be specified")

    email_info = validate_email(email)  # can raise EmailNotValidError

    for email_whitelist in whitelist:
        try:
            # normalized emails are now being inserted into the db
            # we can remove this normalization on read after some time has passed
            email_info_whitelist = validate_email(email_whitelist)
        except EmailNotValidError:
            continue

        # oddly, normalization does not include lowercasing the user part of the
        # email address ... which we want to allow
        if email_info.normalized.lower() == email_info_whitelist.normalized.lower():
            return

    raise PermissionError("User not on allowed user whitelist")


def verify_email_in_whitelist(email: str) -> None:
    whitelist = get_user_whitelist()
    if (whitelist and email not in whitelist) or not email:
        raise PermissionError("User not on allowed user whitelist")
    with Session(get_sqlalchemy_engine()) as db_session:
        if not get_user_by_email(email, db_session):
            verify_email_is_invited(email)


def verify_email_domain(email: str) -> None:
@@ -158,16 +187,36 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
        user_create: schemas.UC | UserCreate,
        safe: bool = False,
        request: Optional[Request] = None,
    ) -> models.UP:
        verify_email_in_whitelist(user_create.email)
    ) -> User:
        verify_email_is_invited(user_create.email)
        verify_email_domain(user_create.email)
        if hasattr(user_create, "role"):
            user_count = await get_user_count()
            if user_count == 0:
            if user_count == 0 or user_create.email in get_default_admin_user_emails():
                user_create.role = UserRole.ADMIN
            else:
                user_create.role = UserRole.BASIC
        return await super().create(user_create, safe=safe, request=request)  # type: ignore
        user = None
        try:
            user = await super().create(user_create, safe=safe, request=request)  # type: ignore
        except exceptions.UserAlreadyExists:
            user = await self.get_by_email(user_create.email)
            # Handle case where user has used product outside of web and is now creating an account through web
            if (
                not user.has_web_login
                and hasattr(user_create, "has_web_login")
                and user_create.has_web_login
            ):
                user_update = UserUpdate(
                    password=user_create.password,
                    has_web_login=True,
                    role=user_create.role,
                    is_verified=user_create.is_verified,
                )
                user = await self.update(user_update, user)
            else:
                raise exceptions.UserAlreadyExists()
        return user

    async def oauth_callback(
        self: "BaseUserManager[models.UOAP, models.ID]",
@@ -185,7 +234,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
        verify_email_in_whitelist(account_email)
        verify_email_domain(account_email)

        return await super().oauth_callback(  # type: ignore
        user = await super().oauth_callback(  # type: ignore
            oauth_name=oauth_name,
            access_token=access_token,
            account_id=account_id,
@@ -197,10 +246,35 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
            is_verified_by_default=is_verified_by_default,
        )

        # NOTE: Most IdPs have very short expiry times, and we don't want to force the user to
        # re-authenticate that frequently, so by default this is disabled
        if expires_at and TRACK_EXTERNAL_IDP_EXPIRY:
            oidc_expiry = datetime.fromtimestamp(expires_at, tz=timezone.utc)
            await self.user_db.update(user, update_dict={"oidc_expiry": oidc_expiry})

        # this is needed if an organization goes from `TRACK_EXTERNAL_IDP_EXPIRY=true` to `false`
        # otherwise, the oidc expiry will always be old, and the user will never be able to login
        if user.oidc_expiry and not TRACK_EXTERNAL_IDP_EXPIRY:
            await self.user_db.update(user, update_dict={"oidc_expiry": None})

        # Handle case where user has used product outside of web and is now creating an account through web
        if not user.has_web_login:
            await self.user_db.update(
                user,
                update_dict={
                    "is_verified": is_verified_by_default,
                    "has_web_login": True,
                },
            )
            user.is_verified = is_verified_by_default
            user.has_web_login = True

        return user

    async def on_after_register(
        self, user: User, request: Optional[Request] = None
    ) -> None:
        logger.info(f"User {user.id} has registered.")
        logger.notice(f"User {user.id} has registered.")
        optional_telemetry(
            record_type=RecordType.SIGN_UP,
            data={"action": "create"},
@@ -210,19 +284,45 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
    async def on_after_forgot_password(
        self, user: User, token: str, request: Optional[Request] = None
    ) -> None:
        logger.info(f"User {user.id} has forgot their password. Reset token: {token}")
        logger.notice(f"User {user.id} has forgot their password. Reset token: {token}")

    async def on_after_request_verify(
        self, user: User, token: str, request: Optional[Request] = None
    ) -> None:
        verify_email_domain(user.email)

        logger.info(
        logger.notice(
            f"Verification requested for user {user.id}. Verification token: {token}"
        )

        send_user_verification_email(user.email, token)

    async def authenticate(
        self, credentials: OAuth2PasswordRequestForm
    ) -> Optional[User]:
        try:
            user = await self.get_by_email(credentials.username)
        except exceptions.UserNotExists:
            self.password_helper.hash(credentials.password)
            return None

        if not user.has_web_login:
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="NO_WEB_LOGIN_AND_HAS_NO_PASSWORD",
            )

        verified, updated_password_hash = self.password_helper.verify_and_update(
            credentials.password, user.hashed_password
        )
        if not verified:
            return None

        if updated_password_hash is not None:
            await self.user_db.update(user, {"hashed_password": updated_password_hash})

        return user


async def get_user_manager(
    user_db: SQLAlchemyUserDatabase = Depends(get_user_db),
@@ -239,10 +339,12 @@ cookie_transport = CookieTransport(
def get_database_strategy(
    access_token_db: AccessTokenDatabase[AccessToken] = Depends(get_access_token_db),
) -> DatabaseStrategy:
    return DatabaseStrategy(
    strategy = DatabaseStrategy(
        access_token_db, lifetime_seconds=SESSION_EXPIRE_TIME_SECONDS  # type: ignore
    )

    return strategy


auth_backend = AuthenticationBackend(
    name="database",
@@ -323,6 +425,7 @@ async def optional_user(
async def double_check_user(
    user: User | None,
    optional: bool = DISABLE_AUTH,
    include_expired: bool = False,
) -> User | None:
    if optional:
        return None
@@ -339,15 +442,53 @@ async def double_check_user(
            detail="Access denied. User is not verified.",
        )

    if (
        user.oidc_expiry
        and user.oidc_expiry < datetime.now(timezone.utc)
        and not include_expired
    ):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Access denied. User's OIDC token has expired.",
        )

    return user


async def current_user_with_expired_token(
    user: User | None = Depends(optional_user),
) -> User | None:
    return await double_check_user(user, include_expired=True)


async def current_user(
    user: User | None = Depends(optional_user),
) -> User | None:
    return await double_check_user(user)


async def current_curator_or_admin_user(
    user: User | None = Depends(current_user),
) -> User | None:
    if DISABLE_AUTH:
        return None

    if not user or not hasattr(user, "role"):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Access denied. User is not authenticated or lacks role information.",
        )

    allowed_roles = {UserRole.GLOBAL_CURATOR, UserRole.CURATOR, UserRole.ADMIN}
    if user.role not in allowed_roles:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Access denied. User is not a curator or admin.",
        )

    return user


async def current_admin_user(user: User | None = Depends(current_user)) -> User | None:
    if DISABLE_AUTH:
        return None
@@ -355,6 +496,12 @@ async def current_admin_user(user: User | None = Depends(current_user)) -> User
    if not user or not hasattr(user, "role") or user.role != UserRole.ADMIN:
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Access denied. User is not an admin.",
            detail="Access denied. User must be an admin to perform this action.",
        )

    return user


def get_default_admin_user_emails_() -> list[str]:
    # No default seeding available for Danswer MIT
    return []

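The dependency helpers above (current_user, current_curator_or_admin_user, current_admin_user) are intended to be wired into FastAPI routes via Depends. A minimal illustrative sketch, not taken from this diff; the route path and handler name are hypothetical:

from fastapi import APIRouter
from fastapi import Depends

router = APIRouter()


@router.get("/manage/admin-health")  # hypothetical route, for illustration only
async def admin_health(
    user: User | None = Depends(current_admin_user),
) -> dict[str, str]:
    # current_admin_user raises HTTP 403 unless the caller has UserRole.ADMIN
    # (or returns None when auth is disabled).
    return {"status": "ok"}
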
@@ -1,214 +0,0 @@
|
||||
from datetime import timedelta
|
||||
from typing import cast
|
||||
|
||||
from celery import Celery # type: ignore
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.background.connector_deletion import delete_connector_credential_pair
|
||||
from danswer.background.task_utils import build_celery_task_wrapper
|
||||
from danswer.background.task_utils import name_cc_cleanup_task
|
||||
from danswer.background.task_utils import name_document_set_sync_task
|
||||
from danswer.configs.app_configs import JOB_TIMEOUT
|
||||
from danswer.db.connector_credential_pair import get_connector_credential_pair
|
||||
from danswer.db.deletion_attempt import check_deletion_attempt_is_allowed
|
||||
from danswer.db.document import prepare_to_modify_documents
|
||||
from danswer.db.document_set import delete_document_set
|
||||
from danswer.db.document_set import fetch_document_sets
|
||||
from danswer.db.document_set import fetch_document_sets_for_documents
|
||||
from danswer.db.document_set import fetch_documents_for_document_set_paginated
|
||||
from danswer.db.document_set import get_document_set_by_id
|
||||
from danswer.db.document_set import mark_document_set_as_synced
|
||||
from danswer.db.engine import build_connection_string
|
||||
from danswer.db.engine import get_sqlalchemy_engine
|
||||
from danswer.db.engine import SYNC_DB_API
|
||||
from danswer.db.models import DocumentSet
|
||||
from danswer.db.tasks import check_live_task_not_timed_out
|
||||
from danswer.db.tasks import get_latest_task
|
||||
from danswer.document_index.document_index_utils import get_both_index_names
|
||||
from danswer.document_index.factory import get_default_document_index
|
||||
from danswer.document_index.interfaces import UpdateRequest
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
connection_string = build_connection_string(db_api=SYNC_DB_API)
|
||||
celery_broker_url = f"sqla+{connection_string}"
|
||||
celery_backend_url = f"db+{connection_string}"
|
||||
celery_app = Celery(__name__, broker=celery_broker_url, backend=celery_backend_url)
|
||||
|
||||
|
||||
_SYNC_BATCH_SIZE = 100
|
||||
|
||||
|
||||
#####
|
||||
# Tasks that need to be run in job queue, registered via APIs
|
||||
#
|
||||
# If imports from this module are needed, use local imports to avoid circular importing
|
||||
#####
|
||||
@build_celery_task_wrapper(name_cc_cleanup_task)
|
||||
@celery_app.task(soft_time_limit=JOB_TIMEOUT)
|
||||
def cleanup_connector_credential_pair_task(
|
||||
connector_id: int,
|
||||
credential_id: int,
|
||||
) -> int:
|
||||
"""Connector deletion task. This is run as an async task because it is a somewhat slow job.
|
||||
Needs to potentially update a large number of Postgres and Vespa docs, including deleting them
|
||||
or updating the ACL"""
|
||||
engine = get_sqlalchemy_engine()
|
||||
with Session(engine) as db_session:
|
||||
# validate that the connector / credential pair is deletable
|
||||
cc_pair = get_connector_credential_pair(
|
||||
db_session=db_session,
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
)
|
||||
if not cc_pair:
|
||||
raise ValueError(
|
||||
f"Cannot run deletion attempt - connector_credential_pair with Connector ID: "
|
||||
f"{connector_id} and Credential ID: {credential_id} does not exist."
|
||||
)
|
||||
|
||||
deletion_attempt_disallowed_reason = check_deletion_attempt_is_allowed(
|
||||
connector_credential_pair=cc_pair, db_session=db_session
|
||||
)
|
||||
if deletion_attempt_disallowed_reason:
|
||||
raise ValueError(deletion_attempt_disallowed_reason)
|
||||
|
||||
try:
|
||||
# The bulk of the work is in here, updates Postgres and Vespa
|
||||
curr_ind_name, sec_ind_name = get_both_index_names(db_session)
|
||||
document_index = get_default_document_index(
|
||||
primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name
|
||||
)
|
||||
return delete_connector_credential_pair(
|
||||
db_session=db_session,
|
||||
document_index=document_index,
|
||||
cc_pair=cc_pair,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.exception(f"Failed to run connector_deletion due to {e}")
|
||||
raise e
|
||||
|
||||
|
||||
@build_celery_task_wrapper(name_document_set_sync_task)
|
||||
@celery_app.task(soft_time_limit=JOB_TIMEOUT)
|
||||
def sync_document_set_task(document_set_id: int) -> None:
|
||||
"""For document sets marked as not up to date, sync the state from postgres
|
||||
into the datastore. Also handles deletions."""
|
||||
|
||||
def _sync_document_batch(document_ids: list[str], db_session: Session) -> None:
|
||||
logger.debug(f"Syncing document sets for: {document_ids}")
|
||||
|
||||
# Acquires a lock on the documents so that no other process can modify them
|
||||
with prepare_to_modify_documents(
|
||||
db_session=db_session, document_ids=document_ids
|
||||
):
|
||||
# get current state of document sets for these documents
|
||||
document_set_map = {
|
||||
document_id: document_sets
|
||||
for document_id, document_sets in fetch_document_sets_for_documents(
|
||||
document_ids=document_ids, db_session=db_session
|
||||
)
|
||||
}
|
||||
|
||||
# update Vespa
|
||||
curr_ind_name, sec_ind_name = get_both_index_names(db_session)
|
||||
document_index = get_default_document_index(
|
||||
primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name
|
||||
)
|
||||
update_requests = [
|
||||
UpdateRequest(
|
||||
document_ids=[document_id],
|
||||
document_sets=set(document_set_map.get(document_id, [])),
|
||||
)
|
||||
for document_id in document_ids
|
||||
]
|
||||
document_index.update(update_requests=update_requests)
|
||||
|
||||
with Session(get_sqlalchemy_engine()) as db_session:
|
||||
try:
|
||||
cursor = None
|
||||
while True:
|
||||
document_batch, cursor = fetch_documents_for_document_set_paginated(
|
||||
document_set_id=document_set_id,
|
||||
db_session=db_session,
|
||||
current_only=False,
|
||||
last_document_id=cursor,
|
||||
limit=_SYNC_BATCH_SIZE,
|
||||
)
|
||||
_sync_document_batch(
|
||||
document_ids=[document.id for document in document_batch],
|
||||
db_session=db_session,
|
||||
)
|
||||
if cursor is None:
|
||||
break
|
||||
|
||||
# if there are no connectors, then delete the document set. Otherwise, just
|
||||
# mark it as successfully synced.
|
||||
document_set = cast(
|
||||
DocumentSet,
|
||||
get_document_set_by_id(
|
||||
db_session=db_session, document_set_id=document_set_id
|
||||
),
|
||||
) # casting since we "know" a document set with this ID exists
|
||||
if not document_set.connector_credential_pairs:
|
||||
delete_document_set(
|
||||
document_set_row=document_set, db_session=db_session
|
||||
)
|
||||
logger.info(
|
||||
f"Successfully deleted document set with ID: '{document_set_id}'!"
|
||||
)
|
||||
else:
|
||||
mark_document_set_as_synced(
|
||||
document_set_id=document_set_id, db_session=db_session
|
||||
)
|
||||
logger.info(f"Document set sync for '{document_set_id}' complete!")
|
||||
|
||||
except Exception:
|
||||
logger.exception("Failed to sync document set %s", document_set_id)
|
||||
raise
|
||||
|
||||
|
||||
#####
|
||||
# Periodic Tasks
|
||||
#####
|
||||
@celery_app.task(
|
||||
name="check_for_document_sets_sync_task",
|
||||
soft_time_limit=JOB_TIMEOUT,
|
||||
)
|
||||
def check_for_document_sets_sync_task() -> None:
|
||||
"""Runs periodically to check if any document sets are out of sync
|
||||
Creates a task to sync the set if needed"""
|
||||
with Session(get_sqlalchemy_engine()) as db_session:
|
||||
# check if any document sets are not synced
|
||||
document_set_info = fetch_document_sets(
|
||||
user_id=None, db_session=db_session, include_outdated=True
|
||||
)
|
||||
for document_set, _ in document_set_info:
|
||||
if not document_set.is_up_to_date:
|
||||
task_name = name_document_set_sync_task(document_set.id)
|
||||
latest_sync = get_latest_task(task_name, db_session)
|
||||
|
||||
if latest_sync and check_live_task_not_timed_out(
|
||||
latest_sync, db_session
|
||||
):
|
||||
logger.info(
|
||||
f"Document set '{document_set.id}' is already syncing. Skipping."
|
||||
)
|
||||
continue
|
||||
|
||||
logger.info(f"Document set {document_set.id} syncing now!")
|
||||
sync_document_set_task.apply_async(
|
||||
kwargs=dict(document_set_id=document_set.id),
|
||||
)
|
||||
|
||||
|
||||
#####
|
||||
# Celery Beat (Periodic Tasks) Settings
|
||||
#####
|
||||
celery_app.conf.beat_schedule = {
|
||||
"check-for-document-set-sync": {
|
||||
"task": "check_for_document_sets_sync_task",
|
||||
"schedule": timedelta(seconds=5),
|
||||
},
|
||||
}
backend/danswer/background/celery/celery_app.py (new file, 1219 lines; diff suppressed because it is too large)
backend/danswer/background/celery/celery_redis.py (new file, 361 lines)
@@ -0,0 +1,361 @@
|
||||
# These are helper objects for tracking the keys we need to write in redis
|
||||
import time
|
||||
from abc import ABC
|
||||
from abc import abstractmethod
|
||||
from typing import cast
|
||||
from uuid import uuid4
|
||||
|
||||
import redis
|
||||
from celery import Celery
|
||||
from redis import Redis
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.background.celery.celeryconfig import CELERY_SEPARATOR
|
||||
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
|
||||
from danswer.configs.constants import DanswerCeleryPriority
|
||||
from danswer.configs.constants import DanswerCeleryQueues
|
||||
from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id
|
||||
from danswer.db.document import construct_document_select_for_connector_credential_pair
|
||||
from danswer.db.document import (
|
||||
construct_document_select_for_connector_credential_pair_by_needs_sync,
|
||||
)
|
||||
from danswer.db.document_set import construct_document_select_by_docset
|
||||
from danswer.utils.variable_functionality import fetch_versioned_implementation
|
||||
|
||||
|
||||
class RedisObjectHelper(ABC):
|
||||
PREFIX = "base"
|
||||
FENCE_PREFIX = PREFIX + "_fence"
|
||||
TASKSET_PREFIX = PREFIX + "_taskset"
|
||||
|
||||
def __init__(self, id: int):
|
||||
self._id: int = id
|
||||
|
||||
@property
|
||||
def task_id_prefix(self) -> str:
|
||||
return f"{self.PREFIX}_{self._id}"
|
||||
|
||||
@property
|
||||
def fence_key(self) -> str:
|
||||
# example: documentset_fence_1
|
||||
return f"{self.FENCE_PREFIX}_{self._id}"
|
||||
|
||||
@property
|
||||
def taskset_key(self) -> str:
|
||||
# example: documentset_taskset_1
|
||||
return f"{self.TASKSET_PREFIX}_{self._id}"
|
||||
|
||||
@staticmethod
|
||||
def get_id_from_fence_key(key: str) -> int | None:
|
||||
"""
|
||||
Extracts the object ID from a fence key in the format `PREFIX_fence_X`.
|
||||
|
||||
Args:
|
||||
key (str): The fence key string.
|
||||
|
||||
Returns:
|
||||
Optional[int]: The extracted ID if the key is in the correct format, otherwise None.
|
||||
"""
|
||||
parts = key.split("_")
|
||||
if len(parts) != 3:
|
||||
return None
|
||||
|
||||
try:
|
||||
object_id = int(parts[2])
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
return object_id
|
||||
|
||||
@staticmethod
|
||||
def get_id_from_task_id(task_id: str) -> int | None:
|
||||
"""
|
||||
Extracts the object ID from a task ID string.
|
||||
|
||||
This method assumes the task ID is formatted as `prefix_objectid_suffix`, where:
|
||||
- `prefix` is an arbitrary string (e.g., the name of the task or entity),
|
||||
- `objectid` is the ID you want to extract,
|
||||
- `suffix` is another arbitrary string (e.g., a UUID).
|
||||
|
||||
Example:
|
||||
If the input `task_id` is `documentset_1_cbfdc96a-80ca-4312-a242-0bb68da3c1dc`,
|
||||
this method will return the string `"1"`.
|
||||
|
||||
Args:
|
||||
task_id (str): The task ID string from which to extract the object ID.
|
||||
|
||||
Returns:
|
||||
str | None: The extracted object ID if the task ID is in the correct format, otherwise None.
|
||||
"""
|
||||
# example: task_id=documentset_1_cbfdc96a-80ca-4312-a242-0bb68da3c1dc
|
||||
parts = task_id.split("_")
|
||||
if len(parts) != 3:
|
||||
return None
|
||||
|
||||
try:
|
||||
object_id = int(parts[1])
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
return object_id
|
||||
|
||||
@abstractmethod
|
||||
def generate_tasks(
|
||||
self,
|
||||
celery_app: Celery,
|
||||
db_session: Session,
|
||||
redis_client: Redis,
|
||||
lock: redis.lock.Lock,
|
||||
) -> int | None:
|
||||
pass
|
||||
|
||||
|
||||
class RedisDocumentSet(RedisObjectHelper):
|
||||
PREFIX = "documentset"
|
||||
FENCE_PREFIX = PREFIX + "_fence"
|
||||
TASKSET_PREFIX = PREFIX + "_taskset"
|
||||
|
||||
def generate_tasks(
|
||||
self,
|
||||
celery_app: Celery,
|
||||
db_session: Session,
|
||||
redis_client: Redis,
|
||||
lock: redis.lock.Lock,
|
||||
) -> int | None:
|
||||
last_lock_time = time.monotonic()
|
||||
|
||||
async_results = []
|
||||
stmt = construct_document_select_by_docset(self._id, current_only=False)
|
||||
for doc in db_session.scalars(stmt).yield_per(1):
|
||||
current_time = time.monotonic()
|
||||
if current_time - last_lock_time >= (
|
||||
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
|
||||
):
|
||||
lock.reacquire()
|
||||
last_lock_time = current_time
|
||||
|
||||
# celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
|
||||
# the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
|
||||
# we prefix the task id so it's easier to keep track of who created the task
|
||||
# aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac"
|
||||
custom_task_id = f"{self.task_id_prefix}_{uuid4()}"
|
||||
|
||||
# add to the set BEFORE creating the task.
|
||||
redis_client.sadd(self.taskset_key, custom_task_id)
|
||||
|
||||
result = celery_app.send_task(
|
||||
"vespa_metadata_sync_task",
|
||||
kwargs=dict(document_id=doc.id),
|
||||
queue=DanswerCeleryQueues.VESPA_METADATA_SYNC,
|
||||
task_id=custom_task_id,
|
||||
priority=DanswerCeleryPriority.LOW,
|
||||
)
|
||||
|
||||
async_results.append(result)
|
||||
|
||||
return len(async_results)
|
||||
|
||||
|
||||
class RedisUserGroup(RedisObjectHelper):
|
||||
PREFIX = "usergroup"
|
||||
FENCE_PREFIX = PREFIX + "_fence"
|
||||
TASKSET_PREFIX = PREFIX + "_taskset"
|
||||
|
||||
def generate_tasks(
|
||||
self,
|
||||
celery_app: Celery,
|
||||
db_session: Session,
|
||||
redis_client: Redis,
|
||||
lock: redis.lock.Lock,
|
||||
) -> int | None:
|
||||
last_lock_time = time.monotonic()
|
||||
|
||||
async_results = []
|
||||
|
||||
try:
|
||||
construct_document_select_by_usergroup = fetch_versioned_implementation(
|
||||
"danswer.db.user_group",
|
||||
"construct_document_select_by_usergroup",
|
||||
)
|
||||
except ModuleNotFoundError:
|
||||
return 0
|
||||
|
||||
stmt = construct_document_select_by_usergroup(self._id)
|
||||
for doc in db_session.scalars(stmt).yield_per(1):
|
||||
current_time = time.monotonic()
|
||||
if current_time - last_lock_time >= (
|
||||
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
|
||||
):
|
||||
lock.reacquire()
|
||||
last_lock_time = current_time
|
||||
|
||||
# celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
|
||||
# the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
|
||||
# we prefix the task id so it's easier to keep track of who created the task
|
||||
# aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac"
|
||||
custom_task_id = f"{self.task_id_prefix}_{uuid4()}"
|
||||
|
||||
# add to the set BEFORE creating the task.
|
||||
redis_client.sadd(self.taskset_key, custom_task_id)
|
||||
|
||||
result = celery_app.send_task(
|
||||
"vespa_metadata_sync_task",
|
||||
kwargs=dict(document_id=doc.id),
|
||||
queue=DanswerCeleryQueues.VESPA_METADATA_SYNC,
|
||||
task_id=custom_task_id,
|
||||
priority=DanswerCeleryPriority.LOW,
|
||||
)
|
||||
|
||||
async_results.append(result)
|
||||
|
||||
return len(async_results)
|
||||
|
||||
|
||||
class RedisConnectorCredentialPair(RedisObjectHelper):
|
||||
"""This class differs from the default in that the taskset used spans
|
||||
all connectors and is not per connector."""
|
||||
|
||||
PREFIX = "connectorsync"
|
||||
FENCE_PREFIX = PREFIX + "_fence"
|
||||
TASKSET_PREFIX = PREFIX + "_taskset"
|
||||
|
||||
@classmethod
|
||||
def get_fence_key(cls) -> str:
|
||||
return RedisConnectorCredentialPair.FENCE_PREFIX
|
||||
|
||||
@classmethod
|
||||
def get_taskset_key(cls) -> str:
|
||||
return RedisConnectorCredentialPair.TASKSET_PREFIX
|
||||
|
||||
@property
|
||||
def taskset_key(self) -> str:
|
||||
"""Notice that this is intentionally reusing the same taskset for all
|
||||
connector syncs"""
|
||||
# example: connector_taskset
|
||||
return f"{self.TASKSET_PREFIX}"
|
||||
|
||||
def generate_tasks(
|
||||
self,
|
||||
celery_app: Celery,
|
||||
db_session: Session,
|
||||
redis_client: Redis,
|
||||
lock: redis.lock.Lock,
|
||||
) -> int | None:
|
||||
last_lock_time = time.monotonic()
|
||||
|
||||
async_results = []
|
||||
cc_pair = get_connector_credential_pair_from_id(self._id, db_session)
|
||||
if not cc_pair:
|
||||
return None
|
||||
|
||||
stmt = construct_document_select_for_connector_credential_pair_by_needs_sync(
|
||||
cc_pair.connector_id, cc_pair.credential_id
|
||||
)
|
||||
for doc in db_session.scalars(stmt).yield_per(1):
|
||||
current_time = time.monotonic()
|
||||
if current_time - last_lock_time >= (
|
||||
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
|
||||
):
|
||||
lock.reacquire()
|
||||
last_lock_time = current_time
|
||||
|
||||
# celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
|
||||
# the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
|
||||
# we prefix the task id so it's easier to keep track of who created the task
|
||||
# aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac"
|
||||
custom_task_id = f"{self.task_id_prefix}_{uuid4()}"
|
||||
|
||||
# add to the tracking taskset in redis BEFORE creating the celery task.
|
||||
# note that for the moment we are using a single taskset key, not differentiated by cc_pair id
|
||||
redis_client.sadd(
|
||||
RedisConnectorCredentialPair.get_taskset_key(), custom_task_id
|
||||
)
|
||||
|
||||
# Priority on sync's triggered by new indexing should be medium
|
||||
result = celery_app.send_task(
|
||||
"vespa_metadata_sync_task",
|
||||
kwargs=dict(document_id=doc.id),
|
||||
queue=DanswerCeleryQueues.VESPA_METADATA_SYNC,
|
||||
task_id=custom_task_id,
|
||||
priority=DanswerCeleryPriority.MEDIUM,
|
||||
)
|
||||
|
||||
async_results.append(result)
|
||||
|
||||
return len(async_results)
|
||||
|
||||
|
||||
class RedisConnectorDeletion(RedisObjectHelper):
|
||||
PREFIX = "connectordeletion"
|
||||
FENCE_PREFIX = PREFIX + "_fence"
|
||||
TASKSET_PREFIX = PREFIX + "_taskset"
|
||||
|
||||
def generate_tasks(
|
||||
self,
|
||||
celery_app: Celery,
|
||||
db_session: Session,
|
||||
redis_client: Redis,
|
||||
lock: redis.lock.Lock,
|
||||
) -> int | None:
|
||||
last_lock_time = time.monotonic()
|
||||
|
||||
async_results = []
|
||||
cc_pair = get_connector_credential_pair_from_id(self._id, db_session)
|
||||
if not cc_pair:
|
||||
return None
|
||||
|
||||
stmt = construct_document_select_for_connector_credential_pair(
|
||||
cc_pair.connector_id, cc_pair.credential_id
|
||||
)
|
||||
for doc in db_session.scalars(stmt).yield_per(1):
|
||||
current_time = time.monotonic()
|
||||
if current_time - last_lock_time >= (
|
||||
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
|
||||
):
|
||||
lock.reacquire()
|
||||
last_lock_time = current_time
|
||||
|
||||
# celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
|
||||
# the actual redis key is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
|
||||
# we prefix the task id so it's easier to keep track of who created the task
|
||||
# aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac"
|
||||
custom_task_id = f"{self.task_id_prefix}_{uuid4()}"
|
||||
|
||||
# add to the tracking taskset in redis BEFORE creating the celery task.
|
||||
# note that for the moment we are using a single taskset key, not differentiated by cc_pair id
|
||||
redis_client.sadd(self.taskset_key, custom_task_id)
|
||||
|
||||
# Priority on sync's triggered by new indexing should be medium
|
||||
result = celery_app.send_task(
|
||||
"document_by_cc_pair_cleanup_task",
|
||||
kwargs=dict(
|
||||
document_id=doc.id,
|
||||
connector_id=cc_pair.connector_id,
|
||||
credential_id=cc_pair.credential_id,
|
||||
),
|
||||
queue=DanswerCeleryQueues.CONNECTOR_DELETION,
|
||||
task_id=custom_task_id,
|
||||
priority=DanswerCeleryPriority.MEDIUM,
|
||||
)
|
||||
|
||||
async_results.append(result)
|
||||
|
||||
return len(async_results)
|
||||
|
||||
|
||||
def celery_get_queue_length(queue: str, r: Redis) -> int:
|
||||
"""This is a redis specific way to get the length of a celery queue.
|
||||
It is priority aware and knows how to count across the multiple redis lists
|
||||
used to implement task prioritization.
|
||||
This operation is not atomic."""
|
||||
total_length = 0
|
||||
for i in range(len(DanswerCeleryPriority)):
|
||||
queue_name = queue
|
||||
if i > 0:
|
||||
queue_name += CELERY_SEPARATOR
|
||||
queue_name += str(i)
|
||||
|
||||
length = r.llen(queue_name)
|
||||
total_length += cast(int, length)
|
||||
|
||||
return total_length
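The custom task id scheme used above ("{PREFIX}_{id}_{uuid4()}") is what lets the monitoring side map a finished Celery task back to the document set, user group, or connector pair that spawned it. A small sketch of that round trip using only helpers defined in this file; the concrete id and printed values are just examples:

from uuid import uuid4

rds = RedisDocumentSet(id=1)

custom_task_id = f"{rds.task_id_prefix}_{uuid4()}"  # e.g. "documentset_1_<uuid>"
assert RedisDocumentSet.get_id_from_task_id(custom_task_id) == 1

# the matching fence / taskset keys for the same object
print(rds.fence_key)    # "documentset_fence_1"
print(rds.taskset_key)  # "documentset_taskset_1"
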
backend/danswer/background/celery/celery_run.py (new file, 9 lines)
@@ -0,0 +1,9 @@
"""Entry point for running celery worker / celery beat."""
from danswer.utils.variable_functionality import fetch_versioned_implementation
from danswer.utils.variable_functionality import set_is_ee_based_on_env_variable


set_is_ee_based_on_env_variable()
celery_app = fetch_versioned_implementation(
    "danswer.background.celery.celery_app", "celery_app"
)
@@ -1,23 +1,143 @@
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.background.task_utils import name_cc_cleanup_task
|
||||
from danswer.background.celery.celery_redis import RedisConnectorDeletion
|
||||
from danswer.background.task_utils import name_cc_prune_task
|
||||
from danswer.configs.app_configs import ALLOW_SIMULTANEOUS_PRUNING
|
||||
from danswer.configs.app_configs import MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE
|
||||
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
|
||||
rate_limit_builder,
|
||||
)
|
||||
from danswer.connectors.interfaces import BaseConnector
|
||||
from danswer.connectors.interfaces import IdConnector
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.interfaces import PollConnector
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.db.connector_credential_pair import get_connector_credential_pair
|
||||
from danswer.db.engine import get_db_current_time
|
||||
from danswer.db.enums import TaskStatus
|
||||
from danswer.db.models import Connector
|
||||
from danswer.db.models import Credential
|
||||
from danswer.db.models import TaskQueueState
|
||||
from danswer.db.tasks import check_task_is_live_and_not_timed_out
|
||||
from danswer.db.tasks import get_latest_task
|
||||
from danswer.db.tasks import get_latest_task_by_type
|
||||
from danswer.redis.redis_pool import RedisPool
|
||||
from danswer.server.documents.models import DeletionAttemptSnapshot
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
redis_pool = RedisPool()
|
||||
|
||||
|
||||
def get_deletion_status(
|
||||
def _get_deletion_status(
|
||||
connector_id: int, credential_id: int, db_session: Session
|
||||
) -> TaskQueueState | None:
|
||||
"""We no longer store TaskQueueState in the DB for a deletion attempt.
|
||||
This function populates TaskQueueState by just checking redis.
|
||||
"""
|
||||
cc_pair = get_connector_credential_pair(
|
||||
connector_id=connector_id, credential_id=credential_id, db_session=db_session
|
||||
)
|
||||
if not cc_pair:
|
||||
return None
|
||||
|
||||
rcd = RedisConnectorDeletion(cc_pair.id)
|
||||
|
||||
r = redis_pool.get_client()
|
||||
if not r.exists(rcd.fence_key):
|
||||
return None
|
||||
|
||||
return TaskQueueState(
|
||||
task_id="", task_name=rcd.fence_key, status=TaskStatus.STARTED
|
||||
)
|
||||
|
||||
|
||||
def get_deletion_attempt_snapshot(
|
||||
connector_id: int, credential_id: int, db_session: Session
|
||||
) -> DeletionAttemptSnapshot | None:
|
||||
cleanup_task_name = name_cc_cleanup_task(
|
||||
connector_id=connector_id, credential_id=credential_id
|
||||
)
|
||||
task_state = get_latest_task(task_name=cleanup_task_name, db_session=db_session)
|
||||
|
||||
if not task_state:
|
||||
deletion_task = _get_deletion_status(connector_id, credential_id, db_session)
|
||||
if not deletion_task:
|
||||
return None
|
||||
|
||||
return DeletionAttemptSnapshot(
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
status=task_state.status,
|
||||
status=deletion_task.status,
|
||||
)
|
||||
|
||||
|
||||
def should_prune_cc_pair(
|
||||
connector: Connector, credential: Credential, db_session: Session
|
||||
) -> bool:
|
||||
if not connector.prune_freq:
|
||||
return False
|
||||
|
||||
pruning_task_name = name_cc_prune_task(
|
||||
connector_id=connector.id, credential_id=credential.id
|
||||
)
|
||||
last_pruning_task = get_latest_task(pruning_task_name, db_session)
|
||||
current_db_time = get_db_current_time(db_session)
|
||||
|
||||
if not last_pruning_task:
|
||||
time_since_initialization = current_db_time - connector.time_created
|
||||
if time_since_initialization.total_seconds() >= connector.prune_freq:
|
||||
return True
|
||||
return False
|
||||
|
||||
if not ALLOW_SIMULTANEOUS_PRUNING:
|
||||
pruning_type_task_name = name_cc_prune_task()
|
||||
last_pruning_type_task = get_latest_task_by_type(
|
||||
pruning_type_task_name, db_session
|
||||
)
|
||||
|
||||
if last_pruning_type_task and check_task_is_live_and_not_timed_out(
|
||||
last_pruning_type_task, db_session
|
||||
):
|
||||
return False
|
||||
|
||||
if check_task_is_live_and_not_timed_out(last_pruning_task, db_session):
|
||||
return False
|
||||
|
||||
if not last_pruning_task.start_time:
|
||||
return False
|
||||
|
||||
time_since_last_pruning = current_db_time - last_pruning_task.start_time
|
||||
return time_since_last_pruning.total_seconds() >= connector.prune_freq
|
||||
|
||||
|
||||
def document_batch_to_ids(doc_batch: list[Document]) -> set[str]:
|
||||
return {doc.id for doc in doc_batch}
|
||||
|
||||
|
||||
def extract_ids_from_runnable_connector(runnable_connector: BaseConnector) -> set[str]:
|
||||
"""
|
||||
If the PruneConnector hasnt been implemented for the given connector, just pull
|
||||
all docs using the load_from_state and grab out the IDs
|
||||
"""
|
||||
all_connector_doc_ids: set[str] = set()
|
||||
|
||||
doc_batch_generator = None
|
||||
if isinstance(runnable_connector, IdConnector):
|
||||
all_connector_doc_ids = runnable_connector.retrieve_all_source_ids()
|
||||
elif isinstance(runnable_connector, LoadConnector):
|
||||
doc_batch_generator = runnable_connector.load_from_state()
|
||||
elif isinstance(runnable_connector, PollConnector):
|
||||
start = datetime(1970, 1, 1, tzinfo=timezone.utc).timestamp()
|
||||
end = datetime.now(timezone.utc).timestamp()
|
||||
doc_batch_generator = runnable_connector.poll_source(start=start, end=end)
|
||||
else:
|
||||
raise RuntimeError("Pruning job could not find a valid runnable_connector.")
|
||||
|
||||
if doc_batch_generator:
|
||||
doc_batch_processing_func = document_batch_to_ids
|
||||
if MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE:
|
||||
doc_batch_processing_func = rate_limit_builder(
|
||||
max_calls=MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE, period=60
|
||||
)(document_batch_to_ids)
|
||||
for doc_batch in doc_batch_generator:
|
||||
all_connector_doc_ids.update(doc_batch_processing_func(doc_batch))
|
||||
|
||||
return all_connector_doc_ids
|
||||
|
||||
backend/danswer/background/celery/celeryconfig.py (new file, 76 lines)
@@ -0,0 +1,76 @@
# docs: https://docs.celeryq.dev/en/stable/userguide/configuration.html
from danswer.configs.app_configs import CELERY_RESULT_EXPIRES
from danswer.configs.app_configs import REDIS_DB_NUMBER_CELERY
from danswer.configs.app_configs import REDIS_DB_NUMBER_CELERY_RESULT_BACKEND
from danswer.configs.app_configs import REDIS_HOST
from danswer.configs.app_configs import REDIS_PASSWORD
from danswer.configs.app_configs import REDIS_PORT
from danswer.configs.app_configs import REDIS_SSL
from danswer.configs.app_configs import REDIS_SSL_CA_CERTS
from danswer.configs.app_configs import REDIS_SSL_CERT_REQS
from danswer.configs.constants import DanswerCeleryPriority

CELERY_SEPARATOR = ":"

CELERY_PASSWORD_PART = ""
if REDIS_PASSWORD:
    CELERY_PASSWORD_PART = f":{REDIS_PASSWORD}@"

REDIS_SCHEME = "redis"

# SSL-specific query parameters for Redis URL
SSL_QUERY_PARAMS = ""
if REDIS_SSL:
    REDIS_SCHEME = "rediss"
    SSL_QUERY_PARAMS = f"?ssl_cert_reqs={REDIS_SSL_CERT_REQS}"
    if REDIS_SSL_CA_CERTS:
        SSL_QUERY_PARAMS += f"&ssl_ca_certs={REDIS_SSL_CA_CERTS}"

# example celery_broker_url: "redis://:password@localhost:6379/15"
broker_url = f"{REDIS_SCHEME}://{CELERY_PASSWORD_PART}{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB_NUMBER_CELERY}{SSL_QUERY_PARAMS}"

result_backend = f"{REDIS_SCHEME}://{CELERY_PASSWORD_PART}{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB_NUMBER_CELERY_RESULT_BACKEND}{SSL_QUERY_PARAMS}"

# NOTE: prefetch 4 is significantly faster than prefetch 1 for small tasks
# however, prefetching is bad when tasks are lengthy as those tasks
# can stall other tasks.
worker_prefetch_multiplier = 4

broker_transport_options = {
    "priority_steps": list(range(len(DanswerCeleryPriority))),
    "sep": CELERY_SEPARATOR,
    "queue_order_strategy": "priority",
}

task_default_priority = DanswerCeleryPriority.MEDIUM
task_acks_late = True

# It's possible we don't even need celery's result backend, in which case all of the optimization below
# might be irrelevant
result_expires = CELERY_RESULT_EXPIRES  # 86400 seconds is the default

# Option 0: Defaults (json serializer, no compression)
# about 1.5 KB per queued task. 1KB in queue, 400B for result, 100 as a child entry in generator result

# Option 1: Reduces generator task result sizes by roughly 20%
# task_compression = "bzip2"
# task_serializer = "pickle"
# result_compression = "bzip2"
# result_serializer = "pickle"
# accept_content=["pickle"]

# Option 2: this significantly reduces the size of the result for generator tasks since the list of children
# can be large. small tasks change very little
# def pickle_bz2_encoder(data):
#     return bz2.compress(pickle.dumps(data))

# def pickle_bz2_decoder(data):
#     return pickle.loads(bz2.decompress(data))

# from kombu import serialization  # To register custom serialization with Celery/Kombu

# serialization.register('pickle-bzip2', pickle_bz2_encoder, pickle_bz2_decoder, 'application/x-pickle-bz2', 'binary')

# task_serializer = "pickle-bzip2"
# result_serializer = "pickle-bzip2"
# accept_content=["pickle", "pickle-bzip2"]
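With example values such as REDIS_HOST="localhost", REDIS_PORT=6379, REDIS_PASSWORD="pw", REDIS_DB_NUMBER_CELERY=15 and SSL disabled, the settings above resolve to broker_url = "redis://:pw@localhost:6379/15", matching the inline comment. How the application loads this module is not visible here (the celery_app.py diff is suppressed); a typical hookup, shown only as an assumption, would use Celery's config_from_object:

from celery import Celery

celery_app = Celery(__name__)
# Assumed wiring, not confirmed by this diff: point Celery at the config module above.
celery_app.config_from_object("danswer.background.celery.celeryconfig")
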
@@ -10,27 +10,15 @@ are multiple connector / credential pairs that have indexed it
|
||||
connector / credential pair from the access list
|
||||
(6) delete all relevant entries from postgres
|
||||
"""
|
||||
import time
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.access.access import get_access_for_documents
|
||||
from danswer.db.connector import fetch_connector_by_id
|
||||
from danswer.db.connector_credential_pair import (
|
||||
delete_connector_credential_pair__no_commit,
|
||||
)
|
||||
from danswer.db.document import delete_document_by_connector_credential_pair__no_commit
|
||||
from danswer.db.document import delete_documents_by_connector_credential_pair__no_commit
|
||||
from danswer.db.document import delete_documents_complete__no_commit
|
||||
from danswer.db.document import get_document_connector_cnts
|
||||
from danswer.db.document import get_documents_for_connector_credential_pair
|
||||
from danswer.db.document import get_document_connector_counts
|
||||
from danswer.db.document import prepare_to_modify_documents
|
||||
from danswer.db.document_set import get_document_sets_by_ids
|
||||
from danswer.db.document_set import (
|
||||
mark_cc_pair__document_set_relationships_to_be_deleted__no_commit,
|
||||
)
|
||||
from danswer.db.document_set import fetch_document_sets_for_documents
|
||||
from danswer.db.engine import get_sqlalchemy_engine
|
||||
from danswer.db.index_attempt import delete_index_attempts
|
||||
from danswer.db.models import ConnectorCredentialPair
|
||||
from danswer.document_index.interfaces import DocumentIndex
|
||||
from danswer.document_index.interfaces import UpdateRequest
|
||||
from danswer.server.documents.models import ConnectorCredentialPairIdentifier
|
||||
@@ -41,7 +29,7 @@ logger = setup_logger()
|
||||
_DELETION_BATCH_SIZE = 1000
|
||||
|
||||
|
||||
def _delete_connector_credential_pair_batch(
|
||||
def delete_connector_credential_pair_batch(
|
||||
document_ids: list[str],
|
||||
connector_id: int,
|
||||
credential_id: int,
|
||||
@@ -57,13 +45,15 @@ def _delete_connector_credential_pair_batch(
|
||||
with prepare_to_modify_documents(
|
||||
db_session=db_session, document_ids=document_ids
|
||||
):
|
||||
document_connector_cnts = get_document_connector_cnts(
|
||||
document_connector_counts = get_document_connector_counts(
|
||||
db_session=db_session, document_ids=document_ids
|
||||
)
|
||||
|
||||
# figure out which docs need to be completely deleted
|
||||
document_ids_to_delete = [
|
||||
document_id for document_id, cnt in document_connector_cnts if cnt == 1
|
||||
document_id
|
||||
for document_id, cnt in document_connector_counts
|
||||
if cnt == 1
|
||||
]
|
||||
logger.debug(f"Deleting documents: {document_ids_to_delete}")
|
||||
|
||||
@@ -76,28 +66,40 @@ def _delete_connector_credential_pair_batch(
|
||||
|
||||
# figure out which docs need to be updated
|
||||
document_ids_to_update = [
|
||||
document_id for document_id, cnt in document_connector_cnts if cnt > 1
|
||||
document_id for document_id, cnt in document_connector_counts if cnt > 1
|
||||
]
|
||||
|
||||
# maps document id to list of document set names
|
||||
new_doc_sets_for_documents: dict[str, set[str]] = {
|
||||
document_id_and_document_set_names_tuple[0]: set(
|
||||
document_id_and_document_set_names_tuple[1]
|
||||
)
|
||||
for document_id_and_document_set_names_tuple in fetch_document_sets_for_documents(
|
||||
db_session=db_session,
|
||||
document_ids=document_ids_to_update,
|
||||
)
|
||||
}
|
||||
|
||||
# determine future ACLs for documents in batch
|
||||
access_for_documents = get_access_for_documents(
|
||||
document_ids=document_ids_to_update,
|
||||
db_session=db_session,
|
||||
cc_pair_to_delete=ConnectorCredentialPairIdentifier(
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
),
|
||||
)
|
||||
|
||||
# update Vespa
|
||||
logger.debug(f"Updating documents: {document_ids_to_update}")
|
||||
update_requests = [
|
||||
UpdateRequest(
|
||||
document_ids=[document_id],
|
||||
access=access,
|
||||
document_sets=new_doc_sets_for_documents[document_id],
|
||||
)
|
||||
for document_id, access in access_for_documents.items()
|
||||
]
|
||||
logger.debug(f"Updating documents: {document_ids_to_update}")
|
||||
|
||||
document_index.update(update_requests=update_requests)
|
||||
|
||||
delete_document_by_connector_credential_pair__no_commit(
|
||||
# clean up Postgres
|
||||
delete_documents_by_connector_credential_pair__no_commit(
|
||||
db_session=db_session,
|
||||
document_ids=document_ids_to_update,
|
||||
connector_credential_pair_identifier=ConnectorCredentialPairIdentifier(
|
||||
@@ -106,105 +108,3 @@ def _delete_connector_credential_pair_batch(
|
||||
),
|
||||
)
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def cleanup_synced_entities(
|
||||
cc_pair: ConnectorCredentialPair, db_session: Session
|
||||
) -> None:
|
||||
"""Updates the document sets associated with the connector / credential pair,
|
||||
then relies on the document set sync script to kick off Celery jobs which will
|
||||
sync these updates to Vespa.
|
||||
|
||||
Waits until the document sets are synced before returning."""
|
||||
logger.info(f"Cleaning up Document Sets for CC Pair with ID: '{cc_pair.id}'")
|
||||
document_sets_ids_to_sync = list(
|
||||
mark_cc_pair__document_set_relationships_to_be_deleted__no_commit(
|
||||
cc_pair_id=cc_pair.id,
|
||||
db_session=db_session,
|
||||
)
|
||||
)
|
||||
db_session.commit()
|
||||
|
||||
# wait till all document sets are synced before continuing
|
||||
while True:
|
||||
all_synced = True
|
||||
document_sets = get_document_sets_by_ids(
|
||||
db_session=db_session, document_set_ids=document_sets_ids_to_sync
|
||||
)
|
||||
for document_set in document_sets:
|
||||
if not document_set.is_up_to_date:
|
||||
all_synced = False
|
||||
|
||||
if all_synced:
|
||||
break
|
||||
|
||||
# wait for 30 seconds before checking again
|
||||
db_session.commit() # end transaction
|
||||
logger.info(
|
||||
f"Document sets '{document_sets_ids_to_sync}' not synced yet, waiting 30s"
|
||||
)
|
||||
time.sleep(30)
|
||||
|
||||
logger.info(
|
||||
f"Finished cleaning up Document Sets for CC Pair with ID: '{cc_pair.id}'"
|
||||
)
|
||||
|
||||
|
||||
def delete_connector_credential_pair(
|
||||
db_session: Session,
|
||||
document_index: DocumentIndex,
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
) -> int:
|
||||
connector_id = cc_pair.connector_id
|
||||
credential_id = cc_pair.credential_id
|
||||
|
||||
num_docs_deleted = 0
|
||||
while True:
|
||||
documents = get_documents_for_connector_credential_pair(
|
||||
db_session=db_session,
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
limit=_DELETION_BATCH_SIZE,
|
||||
)
|
||||
if not documents:
|
||||
break
|
||||
|
||||
_delete_connector_credential_pair_batch(
|
||||
document_ids=[document.id for document in documents],
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
document_index=document_index,
|
||||
)
|
||||
num_docs_deleted += len(documents)
|
||||
|
||||
# Clean up document sets / access information from Postgres
|
||||
# and sync these updates to Vespa
|
||||
# TODO: add user group cleanup with `fetch_versioned_implementation`
|
||||
cleanup_synced_entities(cc_pair, db_session)
|
||||
|
||||
# clean up the rest of the related Postgres entities
|
||||
delete_index_attempts(
|
||||
db_session=db_session,
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
)
|
||||
delete_connector_credential_pair__no_commit(
|
||||
db_session=db_session,
|
||||
connector_id=connector_id,
|
||||
credential_id=credential_id,
|
||||
)
|
||||
# if there are no credentials left, delete the connector
|
||||
connector = fetch_connector_by_id(
|
||||
db_session=db_session,
|
||||
connector_id=connector_id,
|
||||
)
|
||||
if not connector or not len(connector.credentials):
|
||||
logger.debug("Found no credentials left for connector, deleting connector")
|
||||
db_session.delete(connector)
|
||||
db_session.commit()
|
||||
|
||||
logger.info(
|
||||
"Successfully deleted connector_credential_pair with connector_id:"
|
||||
f" '{connector_id}' and credential_id: '{credential_id}'. Deleted {num_docs_deleted} docs."
|
||||
)
|
||||
return num_docs_deleted
|
||||
|
||||
@@ -41,6 +41,12 @@ def _initializer(
    return func(*args, **kwargs)


def _run_in_process(
    func: Callable, args: list | tuple, kwargs: dict[str, Any] | None = None
) -> None:
    _initializer(func, args, kwargs)


@dataclass
class SimpleJob:
    """Drop in replacement for `dask.distributed.Future`"""
@@ -105,13 +111,15 @@ class SimpleJobClient:
        """NOTE: `pure` arg is needed so this can be a drop in replacement for Dask"""
        self._cleanup_completed_jobs()
        if len(self.jobs) >= self.n_workers:
            logger.debug("No available workers to run job")
            logger.debug(
                f"No available workers to run job. Currently running '{len(self.jobs)}' jobs, with a limit of '{self.n_workers}'."
            )
            return None

        job_id = self.job_id_counter
        self.job_id_counter += 1

        process = Process(target=_initializer(func=func, args=args), daemon=True)
        process = Process(target=_run_in_process, args=(func, args), daemon=True)
        job = SimpleJob(id=job_id, process=process)
        process.start()


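The one-line change in SimpleJobClient.submit fixes a real bug: target=_initializer(func=func, args=args) invoked the job immediately in the parent process and handed its return value to Process, while target=_run_in_process, args=(func, args) defers the call so it runs in the child. A standalone sketch of the corrected pattern (example code, not from the repo):

from multiprocessing import Process


def run_in_child(func, args):
    # Executes only inside the spawned process.
    func(*args)


def work(x: int) -> None:
    print(f"working on {x}")


if __name__ == "__main__":
    # `work` and its arguments are passed unevaluated; work(42) runs in the child.
    p = Process(target=run_in_child, args=(work, (42,)), daemon=True)
    p.start()
    p.join()
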
@@ -6,27 +6,22 @@ from datetime import timezone
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.background.connector_deletion import (
|
||||
_delete_connector_credential_pair_batch,
|
||||
)
|
||||
from danswer.background.indexing.checkpointing import get_time_windows_for_index_attempt
|
||||
from danswer.configs.app_configs import DISABLE_DOCUMENT_CLEANUP
|
||||
from danswer.background.indexing.tracer import DanswerTracer
|
||||
from danswer.configs.app_configs import INDEXING_SIZE_WARNING_THRESHOLD
|
||||
from danswer.configs.app_configs import INDEXING_TRACER_INTERVAL
|
||||
from danswer.configs.app_configs import POLL_CONNECTOR_OFFSET
|
||||
from danswer.connectors.connector_runner import ConnectorRunner
|
||||
from danswer.connectors.factory import instantiate_connector
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.interfaces import PollConnector
|
||||
from danswer.connectors.models import IndexAttemptMetadata
|
||||
from danswer.connectors.models import InputType
|
||||
from danswer.db.connector import disable_connector
|
||||
from danswer.db.connector_credential_pair import get_last_successful_attempt_time
|
||||
from danswer.db.connector_credential_pair import update_connector_credential_pair
|
||||
from danswer.db.credentials import backend_update_credential_json
|
||||
from danswer.db.document import get_documents_for_connector_credential_pair
|
||||
from danswer.db.engine import get_sqlalchemy_engine
|
||||
from danswer.db.enums import ConnectorCredentialPairStatus
|
||||
from danswer.db.index_attempt import get_index_attempt
|
||||
from danswer.db.index_attempt import mark_attempt_failed
|
||||
from danswer.db.index_attempt import mark_attempt_in_progress__no_commit
|
||||
from danswer.db.index_attempt import mark_attempt_in_progress
|
||||
from danswer.db.index_attempt import mark_attempt_partially_succeeded
|
||||
from danswer.db.index_attempt import mark_attempt_succeeded
|
||||
from danswer.db.index_attempt import update_docs_indexed
|
||||
from danswer.db.models import IndexAttempt
|
||||
@@ -37,16 +32,19 @@ from danswer.indexing.embedder import DefaultIndexingEmbedder
|
||||
from danswer.indexing.indexing_pipeline import build_indexing_pipeline
|
||||
from danswer.utils.logger import IndexAttemptSingleton
|
||||
from danswer.utils.logger import setup_logger
|
||||
from danswer.utils.variable_functionality import global_version
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
INDEXING_TRACER_NUM_PRINT_ENTRIES = 5
|
||||
|
||||
def _get_document_generator(
|
||||
|
||||
def _get_connector_runner(
|
||||
db_session: Session,
|
||||
attempt: IndexAttempt,
|
||||
start_time: datetime,
|
||||
end_time: datetime,
|
||||
) -> tuple[GenerateDocumentsOutput, bool]:
|
||||
) -> ConnectorRunner:
|
||||
"""
|
||||
NOTE: `start_time` and `end_time` are only used for poll connectors
|
||||
|
||||
@@ -54,47 +52,31 @@ def _get_document_generator(
|
||||
are the complete list of existing documents of the connector. If the task
|
||||
of type LOAD_STATE, the list will be considered complete and otherwise incomplete.
|
||||
"""
|
||||
task = attempt.connector.input_type
|
||||
task = attempt.connector_credential_pair.connector.input_type
|
||||
|
||||
try:
|
||||
runnable_connector, new_credential_json = instantiate_connector(
|
||||
attempt.connector.source,
|
||||
task,
|
||||
attempt.connector.connector_specific_config,
|
||||
attempt.credential.credential_json,
|
||||
runnable_connector = instantiate_connector(
|
||||
db_session=db_session,
|
||||
source=attempt.connector_credential_pair.connector.source,
|
||||
input_type=task,
|
||||
connector_specific_config=attempt.connector_credential_pair.connector.connector_specific_config,
|
||||
credential=attempt.connector_credential_pair.credential,
|
||||
)
|
||||
if new_credential_json is not None:
|
||||
backend_update_credential_json(
|
||||
attempt.credential, new_credential_json, db_session
|
||||
)
|
||||
except Exception as e:
|
||||
logger.exception(f"Unable to instantiate connector due to {e}")
|
||||
disable_connector(attempt.connector.id, db_session)
|
||||
# since we failed to even instantiate the connector, we pause the CCPair since
|
||||
# it will never succeed
|
||||
update_connector_credential_pair(
|
||||
db_session=db_session,
|
||||
connector_id=attempt.connector_credential_pair.connector.id,
|
||||
credential_id=attempt.connector_credential_pair.credential.id,
|
||||
status=ConnectorCredentialPairStatus.PAUSED,
|
||||
)
|
||||
raise e
|
||||
|
||||
if task == InputType.LOAD_STATE:
|
||||
assert isinstance(runnable_connector, LoadConnector)
|
||||
doc_batch_generator = runnable_connector.load_from_state()
|
||||
is_listing_complete = True
|
||||
elif task == InputType.POLL:
|
||||
assert isinstance(runnable_connector, PollConnector)
|
||||
if attempt.connector_id is None or attempt.credential_id is None:
|
||||
raise ValueError(
|
||||
f"Polling attempt {attempt.id} is missing connector_id or credential_id, "
|
||||
f"can't fetch time range."
|
||||
)
|
||||
|
||||
logger.info(f"Polling for updates between {start_time} and {end_time}")
|
||||
doc_batch_generator = runnable_connector.poll_source(
|
||||
start=start_time.timestamp(), end=end_time.timestamp()
|
||||
)
|
||||
is_listing_complete = False
|
||||
|
||||
else:
|
||||
# Event types cannot be handled by a background type
|
||||
raise RuntimeError(f"Invalid task type: {task}")
|
||||
|
||||
return doc_batch_generator, is_listing_complete
|
||||
return ConnectorRunner(
|
||||
connector=runnable_connector, time_range=(start_time, end_time)
|
||||
)
|
||||
|
||||
|
||||
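For illustration, a minimal sketch of what a time-window connector runner like the `ConnectorRunner` returned above could look like. `SimpleConnectorRunner` and the trimmed-down `LoadConnector`/`PollConnector` protocols here are illustrative stand-ins, not the exact danswer interfaces.

from collections.abc import Iterator
from datetime import datetime
from typing import Protocol


class LoadConnector(Protocol):
    def load_from_state(self) -> Iterator[list[dict]]: ...


class PollConnector(Protocol):
    def poll_source(self, start: float, end: float) -> Iterator[list[dict]]: ...


class SimpleConnectorRunner:
    """Yields document batches for a (start, end) window.

    Load-style connectors ignore the window and return their full state;
    poll-style connectors only return documents changed inside the window.
    """

    def __init__(
        self,
        connector: LoadConnector | PollConnector,
        time_range: tuple[datetime, datetime],
    ) -> None:
        self.connector = connector
        self.start, self.end = time_range

    def run(self) -> Iterator[list[dict]]:
        if hasattr(self.connector, "poll_source"):
            # poll connectors take the window as epoch timestamps
            yield from self.connector.poll_source(
                start=self.start.timestamp(), end=self.end.timestamp()
            )
        else:
            # load connectors return everything they know about
            yield from self.connector.load_from_state()
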
def _run_indexing(
|
||||
@@ -108,46 +90,62 @@ def _run_indexing(
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
db_embedding_model = index_attempt.embedding_model
|
||||
index_name = db_embedding_model.index_name
|
||||
search_settings = index_attempt.search_settings
|
||||
index_name = search_settings.index_name
|
||||
|
||||
# Only update cc-pair status for primary index jobs
|
||||
# Secondary index syncs at the end when swapping
|
||||
is_primary = index_attempt.embedding_model.status == IndexModelStatus.PRESENT
|
||||
is_primary = search_settings.status == IndexModelStatus.PRESENT
|
||||
|
||||
# Indexing is only done into one index at a time
|
||||
document_index = get_default_document_index(
|
||||
primary_index_name=index_name, secondary_index_name=None
|
||||
)
|
||||
|
||||
embedding_model = DefaultIndexingEmbedder(
|
||||
model_name=db_embedding_model.model_name,
|
||||
normalize=db_embedding_model.normalize,
|
||||
query_prefix=db_embedding_model.query_prefix,
|
||||
passage_prefix=db_embedding_model.passage_prefix,
|
||||
embedding_model = DefaultIndexingEmbedder.from_db_search_settings(
|
||||
search_settings=search_settings
|
||||
)
|
||||
|
||||
indexing_pipeline = build_indexing_pipeline(
|
||||
attempt_id=index_attempt.id,
|
||||
embedder=embedding_model,
|
||||
document_index=document_index,
|
||||
ignore_time_skip=index_attempt.from_beginning
|
||||
or (db_embedding_model.status == IndexModelStatus.FUTURE),
|
||||
or (search_settings.status == IndexModelStatus.FUTURE),
|
||||
db_session=db_session,
|
||||
)
|
||||
|
||||
db_connector = index_attempt.connector
|
||||
db_credential = index_attempt.credential
|
||||
db_cc_pair = index_attempt.connector_credential_pair
|
||||
db_connector = index_attempt.connector_credential_pair.connector
|
||||
db_credential = index_attempt.connector_credential_pair.credential
|
||||
earliest_index_time = (
|
||||
db_connector.indexing_start.timestamp() if db_connector.indexing_start else 0
|
||||
)
|
||||
|
||||
last_successful_index_time = (
|
||||
0.0
|
||||
earliest_index_time
|
||||
if index_attempt.from_beginning
|
||||
else get_last_successful_attempt_time(
|
||||
connector_id=db_connector.id,
|
||||
credential_id=db_credential.id,
|
||||
embedding_model=index_attempt.embedding_model,
|
||||
earliest_index=earliest_index_time,
|
||||
search_settings=index_attempt.search_settings,
|
||||
db_session=db_session,
|
||||
)
|
||||
)
|
||||
|
||||
if INDEXING_TRACER_INTERVAL > 0:
|
||||
logger.debug(f"Memory tracer starting: interval={INDEXING_TRACER_INTERVAL}")
|
||||
tracer = DanswerTracer()
|
||||
tracer.start()
|
||||
tracer.snap()
|
||||
|
||||
index_attempt_md = IndexAttemptMetadata(
|
||||
connector_id=db_connector.id,
|
||||
credential_id=db_credential.id,
|
||||
)
|
||||
|
||||
batch_num = 0
|
||||
net_doc_change = 0
|
||||
document_count = 0
|
||||
chunk_count = 0
|
||||
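The window for each run follows from the connector's configured indexing_start and the last successful attempt. A rough, self-contained sketch of that calculation; compute_poll_window is a hypothetical helper, not a function in this diff.

from datetime import datetime, timezone


def compute_poll_window(
    from_beginning: bool,
    indexing_start: datetime | None,
    last_successful_attempt: datetime | None,
) -> tuple[datetime, datetime]:
    # fall back to the epoch if no explicit start is configured
    earliest = indexing_start or datetime(1970, 1, 1, tzinfo=timezone.utc)
    if from_beginning or last_successful_attempt is None:
        start = earliest
    else:
        # never poll earlier than the configured start, even if an older
        # successful attempt exists
        start = max(earliest, last_successful_attempt)
    return start, datetime.now(tz=timezone.utc)
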
@@ -166,7 +164,7 @@ def _run_indexing(
|
||||
datetime(1970, 1, 1, tzinfo=timezone.utc),
|
||||
)
|
||||
|
||||
doc_batch_generator, is_listing_complete = _get_document_generator(
|
||||
connector_runner = _get_connector_runner(
|
||||
db_session=db_session,
|
||||
attempt=index_attempt,
|
||||
start_time=window_start,
|
||||
@@ -174,15 +172,23 @@ def _run_indexing(
|
||||
)
|
||||
|
||||
all_connector_doc_ids: set[str] = set()
|
||||
for doc_batch in doc_batch_generator:
|
||||
|
||||
tracer_counter = 0
|
||||
if INDEXING_TRACER_INTERVAL > 0:
|
||||
tracer.snap()
|
||||
for doc_batch in connector_runner.run():
|
||||
# Check if connector is disabled mid run and stop if so unless it's the secondary
|
||||
# index being built. We want to populate it even for paused connectors.
|
||||
# Often paused connectors are sources that aren't updated frequently but the
|
||||
# contents still need to be initially pulled.
|
||||
db_session.refresh(db_connector)
|
||||
if (
|
||||
db_connector.disabled
|
||||
and db_embedding_model.status != IndexModelStatus.FUTURE
|
||||
(
|
||||
db_cc_pair.status == ConnectorCredentialPairStatus.PAUSED
|
||||
and search_settings.status != IndexModelStatus.FUTURE
|
||||
)
|
||||
# if it's deleting, we don't care if this is a secondary index
|
||||
or db_cc_pair.status == ConnectorCredentialPairStatus.DELETING
|
||||
):
|
||||
# let the `except` block handle this
|
||||
raise RuntimeError("Connector was disabled mid run")
|
||||
@@ -192,17 +198,30 @@ def _run_indexing(
|
||||
# Likely due to user manually disabling it or model swap
|
||||
raise RuntimeError("Index Attempt was canceled")
|
||||
|
||||
logger.debug(
|
||||
f"Indexing batch of documents: {[doc.to_short_descriptor() for doc in doc_batch]}"
|
||||
batch_description = []
|
||||
for doc in doc_batch:
|
||||
batch_description.append(doc.to_short_descriptor())
|
||||
|
||||
doc_size = 0
|
||||
for section in doc.sections:
|
||||
doc_size += len(section.text)
|
||||
|
||||
if doc_size > INDEXING_SIZE_WARNING_THRESHOLD:
|
||||
logger.warning(
|
||||
f"Document size: doc='{doc.to_short_descriptor()}' "
|
||||
f"size={doc_size} "
|
||||
f"threshold={INDEXING_SIZE_WARNING_THRESHOLD}"
|
||||
)
|
||||
|
||||
logger.debug(f"Indexing batch of documents: {batch_description}")
|
||||
|
||||
index_attempt_md.batch_num = batch_num + 1 # use 1-index for this
|
||||
new_docs, total_batch_chunks = indexing_pipeline(
|
||||
document_batch=doc_batch,
|
||||
index_attempt_metadata=index_attempt_md,
|
||||
)
|
||||
|
||||
new_docs, total_batch_chunks = indexing_pipeline(
|
||||
documents=doc_batch,
|
||||
index_attempt_metadata=IndexAttemptMetadata(
|
||||
connector_id=db_connector.id,
|
||||
credential_id=db_credential.id,
|
||||
),
|
||||
)
|
||||
batch_num += 1
|
||||
net_doc_change += new_docs
|
||||
chunk_count += total_batch_chunks
|
||||
document_count += len(doc_batch)
|
||||
@@ -224,38 +243,16 @@ def _run_indexing(
|
||||
docs_removed_from_index=0,
|
||||
)
|
||||
|
||||
if is_listing_complete and not DISABLE_DOCUMENT_CLEANUP:
# clean up all documents from the index that have not been returned from the connector
all_indexed_document_ids = {
d.id
for d in get_documents_for_connector_credential_pair(
db_session=db_session,
connector_id=db_connector.id,
credential_id=db_credential.id,
)
}
doc_ids_to_remove = list(
all_indexed_document_ids - all_connector_doc_ids
)
logger.debug(
f"Cleaning up {len(doc_ids_to_remove)} documents that are not contained in the newest connector state"
)

# delete docs from cc-pair and receive the number of completely deleted docs in return
_delete_connector_credential_pair_batch(
document_ids=doc_ids_to_remove,
connector_id=db_connector.id,
credential_id=db_credential.id,
document_index=document_index,
)

update_docs_indexed(
db_session=db_session,
index_attempt=index_attempt,
total_docs_indexed=document_count,
new_docs_indexed=net_doc_change,
docs_removed_from_index=len(doc_ids_to_remove),
)
tracer_counter += 1
if (
INDEXING_TRACER_INTERVAL > 0
and tracer_counter % INDEXING_TRACER_INTERVAL == 0
):
logger.debug(
f"Running trace comparison for batch {tracer_counter}. interval={INDEXING_TRACER_INTERVAL}"
)
tracer.snap()
tracer.log_previous_diff(INDEXING_TRACER_NUM_PRINT_ENTRIES)
|
||||
|
||||
run_end_dt = window_end
|
||||
if is_primary:
|
||||
@@ -267,7 +264,7 @@ def _run_indexing(
|
||||
run_dt=run_end_dt,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.info(
|
||||
logger.exception(
|
||||
f"Connector run ran into exception after elapsed time: {time.time() - start_time} seconds"
|
||||
)
|
||||
# Only mark the attempt as a complete failure if this is the first indexing window.
|
||||
@@ -279,7 +276,7 @@ def _run_indexing(
|
||||
# to give better clarity in the UI, as the next run will never happen.
|
||||
if (
|
||||
ind == 0
|
||||
or db_connector.disabled
|
||||
or not db_cc_pair.status.is_active()
|
||||
or index_attempt.status != IndexingStatus.IN_PROGRESS
|
||||
):
|
||||
mark_attempt_failed(
|
||||
@@ -291,17 +288,66 @@ def _run_indexing(
|
||||
if is_primary:
|
||||
update_connector_credential_pair(
|
||||
db_session=db_session,
|
||||
connector_id=index_attempt.connector.id,
|
||||
credential_id=index_attempt.credential.id,
|
||||
connector_id=db_connector.id,
|
||||
credential_id=db_credential.id,
|
||||
net_docs=net_doc_change,
|
||||
)
|
||||
|
||||
if INDEXING_TRACER_INTERVAL > 0:
|
||||
tracer.stop()
|
||||
raise e
|
||||
|
||||
# break => similar to success case. As mentioned above, if the next run fails for the same
|
||||
# reason it will then be marked as a failure
|
||||
break
|
||||
|
||||
mark_attempt_succeeded(index_attempt, db_session)
|
||||
if INDEXING_TRACER_INTERVAL > 0:
|
||||
logger.debug(
|
||||
f"Running trace comparison between start and end of indexing. {tracer_counter} batches processed."
|
||||
)
|
||||
tracer.snap()
|
||||
tracer.log_first_diff(INDEXING_TRACER_NUM_PRINT_ENTRIES)
|
||||
tracer.stop()
|
||||
logger.debug("Memory tracer stopped.")
|
||||
|
||||
if (
|
||||
index_attempt_md.num_exceptions > 0
|
||||
and index_attempt_md.num_exceptions >= batch_num
|
||||
):
|
||||
mark_attempt_failed(
|
||||
index_attempt,
|
||||
db_session,
|
||||
failure_reason="All batches exceptioned.",
|
||||
)
|
||||
if is_primary:
|
||||
update_connector_credential_pair(
|
||||
db_session=db_session,
|
||||
connector_id=index_attempt.connector_credential_pair.connector.id,
|
||||
credential_id=index_attempt.connector_credential_pair.credential.id,
|
||||
)
|
||||
raise Exception(
|
||||
f"Connector failed - All batches exceptioned: batches={batch_num}"
|
||||
)
|
||||
|
||||
elapsed_time = time.time() - start_time
|
||||
|
||||
if index_attempt_md.num_exceptions == 0:
|
||||
mark_attempt_succeeded(index_attempt, db_session)
|
||||
logger.info(
|
||||
f"Connector succeeded: "
|
||||
f"docs={document_count} chunks={chunk_count} elapsed={elapsed_time:.2f}s"
|
||||
)
|
||||
else:
|
||||
mark_attempt_partially_succeeded(index_attempt, db_session)
|
||||
logger.info(
|
||||
f"Connector completed with some errors: "
|
||||
f"exceptions={index_attempt_md.num_exceptions} "
|
||||
f"batches={batch_num} "
|
||||
f"docs={document_count} "
|
||||
f"chunks={chunk_count} "
|
||||
f"elapsed={elapsed_time:.2f}s"
|
||||
)
|
||||
|
||||
if is_primary:
|
||||
update_connector_credential_pair(
|
||||
db_session=db_session,
|
||||
@@ -310,13 +356,6 @@ def _run_indexing(
|
||||
run_dt=run_end_dt,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Indexed or refreshed {document_count} total documents for a total of {chunk_count} indexed chunks"
|
||||
)
|
||||
logger.info(
|
||||
f"Connector successfully finished, elapsed time: {time.time() - start_time} seconds"
|
||||
)
|
||||
|
||||
|
||||
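The end-of-run handling above reduces to a three-way decision on exception counts. A compact sketch of just that decision; AttemptStatus is an illustrative enum, not the danswer IndexingStatus.

from enum import Enum


class AttemptStatus(Enum):
    SUCCESS = "success"
    PARTIAL_SUCCESS = "partial_success"
    FAILED = "failed"


def final_status(num_exceptions: int, num_batches: int) -> AttemptStatus:
    # every batch raised -> treat the whole attempt as a failure
    if num_exceptions > 0 and num_exceptions >= num_batches:
        return AttemptStatus.FAILED
    if num_exceptions == 0:
        return AttemptStatus.SUCCESS
    return AttemptStatus.PARTIAL_SUCCESS


assert final_status(0, 5) is AttemptStatus.SUCCESS
assert final_status(2, 5) is AttemptStatus.PARTIAL_SUCCESS
assert final_status(5, 5) is AttemptStatus.FAILED
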
def _prepare_index_attempt(db_session: Session, index_attempt_id: int) -> IndexAttempt:
|
||||
# make sure that the index attempt can't change in between checking the
|
||||
@@ -329,6 +368,7 @@ def _prepare_index_attempt(db_session: Session, index_attempt_id: int) -> IndexA
|
||||
db_session=db_session,
|
||||
index_attempt_id=index_attempt_id,
|
||||
)
|
||||
|
||||
if attempt is None:
|
||||
raise RuntimeError(f"Unable to find IndexAttempt for ID '{index_attempt_id}'")
|
||||
|
||||
@@ -339,21 +379,27 @@ def _prepare_index_attempt(db_session: Session, index_attempt_id: int) -> IndexA
|
||||
)
|
||||
|
||||
# only commit once, to make sure this all happens in a single transaction
|
||||
mark_attempt_in_progress__no_commit(attempt)
|
||||
if attempt.embedding_model.status != IndexModelStatus.PRESENT:
|
||||
db_session.commit()
|
||||
mark_attempt_in_progress(attempt, db_session)
|
||||
|
||||
return attempt
|
||||
|
||||
|
||||
def run_indexing_entrypoint(index_attempt_id: int) -> None:
|
||||
def run_indexing_entrypoint(
|
||||
index_attempt_id: int, connector_credential_pair_id: int, is_ee: bool = False
|
||||
) -> None:
|
||||
"""Entrypoint for indexing run when using dask distributed.
|
||||
Wraps the actual logic in a `try` block so that we can catch any exceptions
|
||||
and mark the attempt as failed."""
|
||||
|
||||
try:
|
||||
if is_ee:
|
||||
global_version.set_ee()
|
||||
|
||||
# set the indexing attempt ID so that all log messages from this process
|
||||
# will have it added as a prefix
|
||||
IndexAttemptSingleton.set_index_attempt_id(index_attempt_id)
|
||||
IndexAttemptSingleton.set_cc_and_index_id(
|
||||
index_attempt_id, connector_credential_pair_id
|
||||
)
|
||||
|
||||
with Session(get_sqlalchemy_engine()) as db_session:
|
||||
# make sure that it is valid to run this indexing attempt + mark it
|
||||
@@ -361,17 +407,19 @@ def run_indexing_entrypoint(index_attempt_id: int) -> None:
|
||||
attempt = _prepare_index_attempt(db_session, index_attempt_id)
|
||||
|
||||
logger.info(
|
||||
f"Running indexing attempt for connector: '{attempt.connector.name}', "
|
||||
f"with config: '{attempt.connector.connector_specific_config}', and "
|
||||
f"with credentials: '{attempt.credential_id}'"
|
||||
f"Indexing starting: "
|
||||
f"connector='{attempt.connector_credential_pair.connector.name}' "
|
||||
f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' "
|
||||
f"credentials='{attempt.connector_credential_pair.connector_id}'"
|
||||
)
|
||||
|
||||
_run_indexing(db_session, attempt)
|
||||
|
||||
logger.info(
|
||||
f"Completed indexing attempt for connector: '{attempt.connector.name}', "
|
||||
f"with config: '{attempt.connector.connector_specific_config}', and "
|
||||
f"with credentials: '{attempt.credential_id}'"
|
||||
f"Indexing finished: "
|
||||
f"connector='{attempt.connector_credential_pair.connector.name}' "
|
||||
f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' "
|
||||
f"credentials='{attempt.connector_credential_pair.connector_id}'"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.exception(f"Indexing job with ID '{index_attempt_id}' failed due to {e}")
|
||||
|
||||
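set_cc_and_index_id above tags every log line from the worker process with the attempt and cc-pair ids. A rough sketch of how such a process-wide prefix can be wired with the standard logging module; CurrentAttempt and AttemptPrefixFilter are illustrative names, not the danswer implementation.

import logging


class CurrentAttempt:
    index_attempt_id = None
    cc_pair_id = None

    @classmethod
    def set(cls, index_attempt_id, cc_pair_id):
        cls.index_attempt_id = index_attempt_id
        cls.cc_pair_id = cc_pair_id


class AttemptPrefixFilter(logging.Filter):
    def filter(self, record):
        # prepend the current attempt/cc-pair ids to every message, if set
        if CurrentAttempt.index_attempt_id is not None:
            record.msg = (
                f"[attempt={CurrentAttempt.index_attempt_id} "
                f"cc_pair={CurrentAttempt.cc_pair_id}] {record.msg}"
            )
        return True


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("indexing")
logger.addFilter(AttemptPrefixFilter())

CurrentAttempt.set(42, 7)
logger.info("starting batch 1")  # [attempt=42 cc_pair=7] starting batch 1
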
backend/danswer/background/indexing/tracer.py (Normal file, +77 lines)
@@ -0,0 +1,77 @@
|
||||
import tracemalloc
|
||||
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
DANSWER_TRACEMALLOC_FRAMES = 10
|
||||
|
||||
|
||||
class DanswerTracer:
|
||||
def __init__(self) -> None:
|
||||
self.snapshot_first: tracemalloc.Snapshot | None = None
|
||||
self.snapshot_prev: tracemalloc.Snapshot | None = None
|
||||
self.snapshot: tracemalloc.Snapshot | None = None
|
||||
|
||||
def start(self) -> None:
|
||||
tracemalloc.start(DANSWER_TRACEMALLOC_FRAMES)
|
||||
|
||||
def stop(self) -> None:
|
||||
tracemalloc.stop()
|
||||
|
||||
def snap(self) -> None:
|
||||
snapshot = tracemalloc.take_snapshot()
|
||||
# Filter out irrelevant frames (e.g., from tracemalloc itself or importlib)
|
||||
snapshot = snapshot.filter_traces(
|
||||
(
|
||||
tracemalloc.Filter(False, tracemalloc.__file__), # Exclude tracemalloc
|
||||
tracemalloc.Filter(
|
||||
False, "<frozen importlib._bootstrap>"
|
||||
), # Exclude importlib
|
||||
tracemalloc.Filter(
|
||||
False, "<frozen importlib._bootstrap_external>"
|
||||
), # Exclude external importlib
|
||||
)
|
||||
)
|
||||
|
||||
if not self.snapshot_first:
|
||||
self.snapshot_first = snapshot
|
||||
|
||||
if self.snapshot:
|
||||
self.snapshot_prev = self.snapshot
|
||||
|
||||
self.snapshot = snapshot
|
||||
|
||||
def log_snapshot(self, numEntries: int) -> None:
|
||||
if not self.snapshot:
|
||||
return
|
||||
|
||||
stats = self.snapshot.statistics("traceback")
|
||||
for s in stats[:numEntries]:
|
||||
logger.debug(f"Tracer snap: {s}")
|
||||
for line in s.traceback:
|
||||
logger.debug(f"* {line}")
|
||||
|
||||
@staticmethod
|
||||
def log_diff(
|
||||
snap_current: tracemalloc.Snapshot,
|
||||
snap_previous: tracemalloc.Snapshot,
|
||||
numEntries: int,
|
||||
) -> None:
|
||||
stats = snap_current.compare_to(snap_previous, "traceback")
|
||||
for s in stats[:numEntries]:
|
||||
logger.debug(f"Tracer diff: {s}")
|
||||
for line in s.traceback.format():
|
||||
logger.debug(f"* {line}")
|
||||
|
||||
def log_previous_diff(self, numEntries: int) -> None:
|
||||
if not self.snapshot or not self.snapshot_prev:
|
||||
return
|
||||
|
||||
DanswerTracer.log_diff(self.snapshot, self.snapshot_prev, numEntries)
|
||||
|
||||
def log_first_diff(self, numEntries: int) -> None:
|
||||
if not self.snapshot or not self.snapshot_first:
|
||||
return
|
||||
|
||||
DanswerTracer.log_diff(self.snapshot, self.snapshot_first, numEntries)
|
||||
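The tracer above is a thin wrapper over the standard library. A short standalone example of the same tracemalloc pattern (snapshot, compare to a baseline, print the top offenders), independent of the DanswerTracer class:

import tracemalloc

tracemalloc.start(10)  # keep up to 10 frames per allocation traceback

baseline = tracemalloc.take_snapshot()

# ... the code being profiled, e.g. indexing a few batches ...
leaky = [bytes(1024) for _ in range(10_000)]

current = tracemalloc.take_snapshot()
for stat in current.compare_to(baseline, "traceback")[:5]:
    print(stat)  # size/count delta attributed to this traceback
    for line in stat.traceback.format():
        print("*", line)

tracemalloc.stop()
del leaky
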
@@ -14,12 +14,13 @@ from danswer.db.tasks import mark_task_start
|
||||
from danswer.db.tasks import register_task
|
||||
|
||||
|
||||
def name_cc_cleanup_task(connector_id: int, credential_id: int) -> str:
|
||||
return f"cleanup_connector_credential_pair_{connector_id}_{credential_id}"
|
||||
|
||||
|
||||
def name_document_set_sync_task(document_set_id: int) -> str:
|
||||
return f"sync_doc_set_{document_set_id}"
|
||||
def name_cc_prune_task(
|
||||
connector_id: int | None = None, credential_id: int | None = None
|
||||
) -> str:
|
||||
task_name = f"prune_connector_credential_pair_{connector_id}_{credential_id}"
|
||||
if not connector_id or not credential_id:
|
||||
task_name = "prune_connector_credential_pair"
|
||||
return task_name
|
||||
|
||||
|
||||
T = TypeVar("T", bound=Callable)
|
||||
@@ -84,9 +85,16 @@ def build_apply_async_wrapper(build_name_fn: Callable[..., str]) -> Callable[[AA
|
||||
kwargs_for_build_name = kwargs or {}
|
||||
task_name = build_name_fn(*args_for_build_name, **kwargs_for_build_name)
|
||||
with Session(get_sqlalchemy_engine()) as db_session:
|
||||
# mark the task as started
|
||||
# register_task must come before fn = apply_async or else the task
|
||||
# might run mark_task_start (and crash) before the task row exists
|
||||
db_task = register_task(task_name, db_session)
|
||||
|
||||
task = fn(args, kwargs, *other_args, **other_kwargs)
|
||||
register_task(task.id, task_name, db_session)
|
||||
|
||||
# we update the celery task id for diagnostic purposes
|
||||
# but it isn't currently used by any code
|
||||
db_task.task_id = task.id
|
||||
db_session.commit()
|
||||
|
||||
return task
|
||||
|
||||
|
||||
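The comment above describes an ordering constraint worth spelling out: the tracking row has to exist before the broker can hand the task to a worker. A minimal sketch of that flow, with `register_task` and `apply_async` passed in as stand-ins for the helpers used in the diff:

def dispatch_tracked_task(task_name, register_task, apply_async, db_session):
    # 1) create the tracking row first, so a fast worker that immediately
    #    calls mark_task_start() finds the row instead of crashing
    db_task = register_task(task_name, db_session)

    # 2) only then hand the task to the broker
    task = apply_async()

    # 3) store the broker-assigned id purely for diagnostics
    db_task.task_id = task.id
    db_session.commit()
    return task
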
@@ -16,25 +16,33 @@ from danswer.configs.app_configs import CLEANUP_INDEXING_JOBS_TIMEOUT
|
||||
from danswer.configs.app_configs import DASK_JOB_CLIENT_ENABLED
|
||||
from danswer.configs.app_configs import DISABLE_INDEX_UPDATE_ON_SWAP
|
||||
from danswer.configs.app_configs import NUM_INDEXING_WORKERS
|
||||
from danswer.configs.app_configs import NUM_SECONDARY_INDEXING_WORKERS
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.configs.constants import POSTGRES_INDEXER_APP_NAME
|
||||
from danswer.db.connector import fetch_connectors
|
||||
from danswer.db.embedding_model import get_current_db_embedding_model
|
||||
from danswer.db.embedding_model import get_secondary_db_embedding_model
|
||||
from danswer.db.connector_credential_pair import fetch_connector_credential_pairs
|
||||
from danswer.db.engine import get_db_current_time
|
||||
from danswer.db.engine import get_sqlalchemy_engine
|
||||
from danswer.db.engine import init_sqlalchemy_engine
|
||||
from danswer.db.index_attempt import create_index_attempt
|
||||
from danswer.db.index_attempt import get_index_attempt
|
||||
from danswer.db.index_attempt import get_inprogress_index_attempts
|
||||
from danswer.db.index_attempt import get_last_attempt
|
||||
from danswer.db.index_attempt import get_last_attempt_for_cc_pair
|
||||
from danswer.db.index_attempt import get_not_started_index_attempts
|
||||
from danswer.db.index_attempt import mark_attempt_failed
|
||||
from danswer.db.models import Connector
|
||||
from danswer.db.models import EmbeddingModel
|
||||
from danswer.db.models import ConnectorCredentialPair
|
||||
from danswer.db.models import IndexAttempt
|
||||
from danswer.db.models import IndexingStatus
|
||||
from danswer.db.models import IndexModelStatus
|
||||
from danswer.db.models import SearchSettings
|
||||
from danswer.db.search_settings import get_current_search_settings
|
||||
from danswer.db.search_settings import get_secondary_search_settings
|
||||
from danswer.db.swap_index import check_index_swap
|
||||
from danswer.search.search_nlp_models import warm_up_encoders
|
||||
from danswer.natural_language_processing.search_nlp_models import EmbeddingModel
|
||||
from danswer.natural_language_processing.search_nlp_models import warm_up_bi_encoder
|
||||
from danswer.utils.logger import setup_logger
|
||||
from danswer.utils.variable_functionality import global_version
|
||||
from danswer.utils.variable_functionality import set_is_ee_based_on_env_variable
|
||||
from shared_configs.configs import INDEXING_MODEL_SERVER_HOST
|
||||
from shared_configs.configs import LOG_LEVEL
|
||||
from shared_configs.configs import MODEL_SERVER_PORT
|
||||
@@ -51,41 +59,68 @@ _UNEXPECTED_STATE_FAILURE_REASON = (
|
||||
|
||||
|
||||
def _should_create_new_indexing(
|
||||
connector: Connector,
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
last_index: IndexAttempt | None,
|
||||
model: EmbeddingModel,
|
||||
search_settings_instance: SearchSettings,
|
||||
secondary_index_building: bool,
|
||||
db_session: Session,
|
||||
) -> bool:
|
||||
connector = cc_pair.connector
|
||||
|
||||
# don't kick off indexing for `NOT_APPLICABLE` sources
|
||||
if connector.source == DocumentSource.NOT_APPLICABLE:
|
||||
return False
|
||||
|
||||
# User can still manually create single indexing attempts via the UI for the
|
||||
# currently in use index
|
||||
if DISABLE_INDEX_UPDATE_ON_SWAP:
|
||||
if model.status == IndexModelStatus.PRESENT and secondary_index_building:
|
||||
if (
|
||||
search_settings_instance.status == IndexModelStatus.PRESENT
|
||||
and secondary_index_building
|
||||
):
|
||||
return False
|
||||
|
||||
# When switching over models, always index at least once
|
||||
if model.status == IndexModelStatus.FUTURE and not last_index:
|
||||
if connector.id == 0: # Ingestion API
|
||||
return False
|
||||
if search_settings_instance.status == IndexModelStatus.FUTURE:
|
||||
if last_index:
|
||||
# No new index if the last index attempt succeeded
|
||||
# Once is enough. The model will never be able to swap otherwise.
|
||||
if last_index.status == IndexingStatus.SUCCESS:
|
||||
return False
|
||||
|
||||
# No new index if the last index attempt is waiting to start
|
||||
if last_index.status == IndexingStatus.NOT_STARTED:
|
||||
return False
|
||||
|
||||
# No new index if the last index attempt is running
|
||||
if last_index.status == IndexingStatus.IN_PROGRESS:
|
||||
return False
|
||||
else:
|
||||
if connector.id == 0: # Ingestion API
|
||||
return False
|
||||
return True
|
||||
|
||||
# If the connector is disabled, don't index
|
||||
# NOTE: during an embedding model switch over, we ignore this
|
||||
# and index the disabled connectors as well (which is why this if
|
||||
# statement is below the first condition above)
|
||||
if connector.disabled:
|
||||
# If the connector is paused or is the ingestion API, don't index
|
||||
# NOTE: during an embedding model switch over, the following logic
|
||||
# is bypassed by the above check for a future model
|
||||
if not cc_pair.status.is_active() or connector.id == 0:
|
||||
return False
|
||||
|
||||
if connector.refresh_freq is None:
|
||||
return False
|
||||
if not last_index:
|
||||
return True
|
||||
|
||||
# Only one scheduled job per connector at a time
|
||||
# Can schedule another one if the current one is already running however
|
||||
# Because the currently running one will not be until the latest time
|
||||
# Note, this last index is for the given embedding model
|
||||
if last_index.status == IndexingStatus.NOT_STARTED:
|
||||
if connector.refresh_freq is None:
|
||||
return False
|
||||
|
||||
# Only one scheduled/ongoing job per connector at a time
|
||||
# this prevents cases where
|
||||
# (1) the "latest" index_attempt is scheduled so we show
|
||||
# that in the UI despite another index_attempt being in-progress
|
||||
# (2) multiple scheduled index_attempts at a time
|
||||
if (
|
||||
last_index.status == IndexingStatus.NOT_STARTED
|
||||
or last_index.status == IndexingStatus.IN_PROGRESS
|
||||
):
|
||||
return False
|
||||
|
||||
current_db_time = get_db_current_time(db_session)
|
||||
@@ -93,24 +128,14 @@ def _should_create_new_indexing(
|
||||
return time_since_index.total_seconds() >= connector.refresh_freq
|
||||
|
||||
|
||||
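Stripped of the model-swap special cases, the scheduling gate above boils down to: an active pair, a refresh frequency set, no attempt already queued or running, and enough time elapsed since the last one. A simplified sketch, with statuses as plain strings rather than the IndexingStatus enum:

from datetime import datetime, timedelta, timezone


def should_schedule(is_active, refresh_freq_seconds, last_attempt, now):
    """`last_attempt` is None or a (status, time_updated) tuple."""
    if not is_active or refresh_freq_seconds is None:
        return False
    if last_attempt is None:
        return True  # never indexed before
    status, time_updated = last_attempt
    # only one queued/ongoing attempt per cc-pair + search settings at a time
    if status in ("NOT_STARTED", "IN_PROGRESS"):
        return False
    return (now - time_updated).total_seconds() >= refresh_freq_seconds


now = datetime.now(tz=timezone.utc)
print(should_schedule(True, 3600, ("SUCCESS", now - timedelta(hours=2)), now))      # True
print(should_schedule(True, 3600, ("IN_PROGRESS", now - timedelta(hours=2)), now))  # False
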
def _is_indexing_job_marked_as_finished(index_attempt: IndexAttempt | None) -> bool:
|
||||
if index_attempt is None:
|
||||
return False
|
||||
|
||||
return (
|
||||
index_attempt.status == IndexingStatus.FAILED
|
||||
or index_attempt.status == IndexingStatus.SUCCESS
|
||||
)
|
||||
|
||||
|
||||
def _mark_run_failed(
|
||||
db_session: Session, index_attempt: IndexAttempt, failure_reason: str
|
||||
) -> None:
|
||||
"""Marks the `index_attempt` row as failed + updates the `
|
||||
connector_credential_pair` to reflect that the run failed"""
|
||||
logger.warning(
|
||||
f"Marking in-progress attempt 'connector: {index_attempt.connector_id}, "
|
||||
f"credential: {index_attempt.credential_id}' as failed due to {failure_reason}"
|
||||
f"Marking in-progress attempt 'connector: {index_attempt.connector_credential_pair.connector_id}, "
|
||||
f"credential: {index_attempt.connector_credential_pair.credential_id}' as failed due to {failure_reason}"
|
||||
)
|
||||
mark_attempt_failed(
|
||||
index_attempt=index_attempt,
|
||||
@@ -129,7 +154,7 @@ def create_indexing_jobs(existing_jobs: dict[int, Future | SimpleJob]) -> None:
|
||||
3. There is not already an ongoing indexing attempt for this pair
|
||||
"""
|
||||
with Session(get_sqlalchemy_engine()) as db_session:
|
||||
ongoing: set[tuple[int | None, int | None, int]] = set()
|
||||
ongoing: set[tuple[int | None, int]] = set()
|
||||
for attempt_id in existing_jobs:
|
||||
attempt = get_index_attempt(
|
||||
db_session=db_session, index_attempt_id=attempt_id
|
||||
@@ -142,42 +167,43 @@ def create_indexing_jobs(existing_jobs: dict[int, Future | SimpleJob]) -> None:
|
||||
continue
|
||||
ongoing.add(
|
||||
(
|
||||
attempt.connector_id,
|
||||
attempt.credential_id,
|
||||
attempt.embedding_model_id,
|
||||
attempt.connector_credential_pair_id,
|
||||
attempt.search_settings_id,
|
||||
)
|
||||
)
|
||||
|
||||
embedding_models = [get_current_db_embedding_model(db_session)]
|
||||
secondary_embedding_model = get_secondary_db_embedding_model(db_session)
|
||||
if secondary_embedding_model is not None:
|
||||
embedding_models.append(secondary_embedding_model)
|
||||
# Get the primary search settings
|
||||
primary_search_settings = get_current_search_settings(db_session)
|
||||
search_settings = [primary_search_settings]
|
||||
|
||||
all_connectors = fetch_connectors(db_session)
|
||||
for connector in all_connectors:
|
||||
for association in connector.credentials:
|
||||
for model in embedding_models:
|
||||
credential = association.credential
|
||||
# Check for secondary search settings
|
||||
secondary_search_settings = get_secondary_search_settings(db_session)
|
||||
if secondary_search_settings is not None:
|
||||
# If secondary settings exist, add them to the list
|
||||
search_settings.append(secondary_search_settings)
|
||||
|
||||
# Check if there is an ongoing indexing attempt for this connector + credential pair
|
||||
if (connector.id, credential.id, model.id) in ongoing:
|
||||
continue
|
||||
all_connector_credential_pairs = fetch_connector_credential_pairs(db_session)
|
||||
for cc_pair in all_connector_credential_pairs:
|
||||
for search_settings_instance in search_settings:
|
||||
# Check if there is an ongoing indexing attempt for this connector credential pair
|
||||
if (cc_pair.id, search_settings_instance.id) in ongoing:
|
||||
continue
|
||||
|
||||
last_attempt = get_last_attempt(
|
||||
connector.id, credential.id, model.id, db_session
|
||||
)
|
||||
if not _should_create_new_indexing(
|
||||
connector=connector,
|
||||
last_index=last_attempt,
|
||||
model=model,
|
||||
secondary_index_building=len(embedding_models) > 1,
|
||||
db_session=db_session,
|
||||
):
|
||||
continue
|
||||
last_attempt = get_last_attempt_for_cc_pair(
|
||||
cc_pair.id, search_settings_instance.id, db_session
|
||||
)
|
||||
if not _should_create_new_indexing(
|
||||
cc_pair=cc_pair,
|
||||
last_index=last_attempt,
|
||||
search_settings_instance=search_settings_instance,
|
||||
secondary_index_building=len(search_settings) > 1,
|
||||
db_session=db_session,
|
||||
):
|
||||
continue
|
||||
|
||||
create_index_attempt(
|
||||
connector.id, credential.id, model.id, db_session
|
||||
)
|
||||
create_index_attempt(
|
||||
cc_pair.id, search_settings_instance.id, db_session
|
||||
)
|
||||
|
||||
|
||||
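create_indexing_jobs above now keys both the ongoing set and the new attempts by (cc_pair_id, search_settings_id). A small self-contained sketch of that dedupe step, with plain dicts standing in for the ORM objects:

def pairs_needing_attempts(cc_pairs, settings_list, ongoing):
    """`ongoing` is a set of (cc_pair_id, search_settings_id) tuples."""
    to_create = []
    for cc_pair in cc_pairs:
        for settings in settings_list:
            key = (cc_pair["id"], settings["id"])
            if key in ongoing:
                continue  # one attempt per pair + settings at a time
            to_create.append(key)
    return to_create


ongoing = {(1, 10)}
print(pairs_needing_attempts([{"id": 1}, {"id": 2}], [{"id": 10}, {"id": 11}], ongoing))
# [(1, 11), (2, 10), (2, 11)]
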
def cleanup_indexing_jobs(
|
||||
@@ -185,7 +211,6 @@ def cleanup_indexing_jobs(
|
||||
timeout_hours: int = CLEANUP_INDEXING_JOBS_TIMEOUT,
|
||||
) -> dict[int, Future | SimpleJob]:
|
||||
existing_jobs_copy = existing_jobs.copy()
|
||||
|
||||
# clean up completed jobs
|
||||
with Session(get_sqlalchemy_engine()) as db_session:
|
||||
for attempt_id, job in existing_jobs.items():
|
||||
@@ -194,10 +219,12 @@ def cleanup_indexing_jobs(
|
||||
)
|
||||
|
||||
# do nothing for ongoing jobs that haven't been stopped
|
||||
if not job.done() and not _is_indexing_job_marked_as_finished(
|
||||
index_attempt
|
||||
):
|
||||
continue
|
||||
if not job.done():
|
||||
if not index_attempt:
|
||||
continue
|
||||
|
||||
if not index_attempt.is_finished():
|
||||
continue
|
||||
|
||||
if job.status == "error":
|
||||
logger.error(job.exception())
|
||||
@@ -269,24 +296,33 @@ def kickoff_indexing_jobs(
|
||||
# Don't include jobs waiting in the Dask queue that just haven't started running
|
||||
# Also (rarely) don't include jobs that started but haven't updated the indexing tables yet
|
||||
with Session(engine) as db_session:
|
||||
# get_not_started_index_attempts orders its returned results from oldest to newest
|
||||
# we must process attempts in a FIFO manner to prevent connector starvation
|
||||
new_indexing_attempts = [
|
||||
(attempt, attempt.embedding_model)
|
||||
(attempt, attempt.search_settings)
|
||||
for attempt in get_not_started_index_attempts(db_session)
|
||||
if attempt.id not in existing_jobs
|
||||
]
|
||||
|
||||
logger.info(f"Found {len(new_indexing_attempts)} new indexing tasks.")
|
||||
logger.debug(f"Found {len(new_indexing_attempts)} new indexing task(s).")
|
||||
|
||||
if not new_indexing_attempts:
|
||||
return existing_jobs
|
||||
|
||||
for attempt, embedding_model in new_indexing_attempts:
|
||||
indexing_attempt_count = 0
|
||||
|
||||
primary_client_full = False
|
||||
secondary_client_full = False
|
||||
for attempt, search_settings in new_indexing_attempts:
|
||||
if primary_client_full and secondary_client_full:
|
||||
break
|
||||
|
||||
use_secondary_index = (
|
||||
embedding_model.status == IndexModelStatus.FUTURE
|
||||
if embedding_model is not None
|
||||
search_settings.status == IndexModelStatus.FUTURE
|
||||
if search_settings is not None
|
||||
else False
|
||||
)
|
||||
if attempt.connector is None:
|
||||
if attempt.connector_credential_pair.connector is None:
|
||||
logger.warning(
|
||||
f"Skipping index attempt as Connector has been deleted: {attempt}"
|
||||
)
|
||||
@@ -295,7 +331,7 @@ def kickoff_indexing_jobs(
|
||||
attempt, db_session, failure_reason="Connector is null"
|
||||
)
|
||||
continue
|
||||
if attempt.credential is None:
|
||||
if attempt.connector_credential_pair.credential is None:
|
||||
logger.warning(
|
||||
f"Skipping index attempt as Credential has been deleted: {attempt}"
|
||||
)
|
||||
@@ -305,41 +341,81 @@ def kickoff_indexing_jobs(
|
||||
)
|
||||
continue
|
||||
|
||||
if use_secondary_index:
|
||||
run = secondary_client.submit(
|
||||
run_indexing_entrypoint, attempt.id, pure=False
|
||||
)
|
||||
if not use_secondary_index:
|
||||
if not primary_client_full:
|
||||
run = client.submit(
|
||||
run_indexing_entrypoint,
|
||||
attempt.id,
|
||||
attempt.connector_credential_pair_id,
|
||||
global_version.get_is_ee_version(),
|
||||
pure=False,
|
||||
)
|
||||
if not run:
|
||||
primary_client_full = True
|
||||
else:
|
||||
run = client.submit(run_indexing_entrypoint, attempt.id, pure=False)
|
||||
if not secondary_client_full:
|
||||
run = secondary_client.submit(
|
||||
run_indexing_entrypoint,
|
||||
attempt.id,
|
||||
attempt.connector_credential_pair_id,
|
||||
global_version.get_is_ee_version(),
|
||||
pure=False,
|
||||
)
|
||||
if not run:
|
||||
secondary_client_full = True
|
||||
|
||||
if run:
|
||||
secondary_str = "(secondary index) " if use_secondary_index else ""
|
||||
if indexing_attempt_count == 0:
|
||||
logger.info(
|
||||
f"Indexing dispatch starts: pending={len(new_indexing_attempts)}"
|
||||
)
|
||||
|
||||
indexing_attempt_count += 1
|
||||
secondary_str = " (secondary index)" if use_secondary_index else ""
|
||||
logger.info(
|
||||
f"Kicked off {secondary_str}"
|
||||
f"indexing attempt for connector: '{attempt.connector.name}', "
|
||||
f"with config: '{attempt.connector.connector_specific_config}', and "
|
||||
f"with credentials: '{attempt.credential_id}'"
|
||||
f"Indexing dispatched{secondary_str}: "
|
||||
f"attempt_id={attempt.id} "
|
||||
f"connector='{attempt.connector_credential_pair.connector.name}' "
|
||||
f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' "
|
||||
f"credentials='{attempt.connector_credential_pair.credential_id}'"
|
||||
)
|
||||
existing_jobs_copy[attempt.id] = run
|
||||
|
||||
if indexing_attempt_count > 0:
|
||||
logger.info(
|
||||
f"Indexing dispatch results: "
|
||||
f"initial_pending={len(new_indexing_attempts)} "
|
||||
f"started={indexing_attempt_count} "
|
||||
f"remaining={len(new_indexing_attempts) - indexing_attempt_count}"
|
||||
)
|
||||
|
||||
return existing_jobs_copy
|
||||
|
||||
|
||||
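The dispatch loop above routes each attempt to either the primary or the secondary worker pool and remembers when a pool rejects a submission so it can stop early once both are full. A toy, self-contained version of that pattern; CappedPool stands in for a job client whose submit() returns None when it has no capacity.

def dispatch_attempts(attempts, primary_pool, secondary_pool, run_fn):
    jobs = {}
    primary_full = secondary_full = False

    for attempt in attempts:
        if primary_full and secondary_full:
            break  # nothing left to dispatch to

        use_secondary = attempt.get("targets_future_index", False)
        pool = secondary_pool if use_secondary else primary_pool

        job = pool.submit(run_fn, attempt["id"])
        if job is None:  # pool is saturated; remember that and keep going
            if use_secondary:
                secondary_full = True
            else:
                primary_full = True
            continue

        jobs[attempt["id"]] = job
    return jobs


class CappedPool:
    """Toy stand-in for a job client whose submit() returns None when full."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.jobs = []

    def submit(self, fn, *args):
        if len(self.jobs) >= self.capacity:
            return None
        self.jobs.append((fn, args))
        return self.jobs[-1]


attempts = [{"id": 1}, {"id": 2, "targets_future_index": True}, {"id": 3}]
print(dispatch_attempts(attempts, CappedPool(1), CappedPool(1), print))
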
def update_loop(delay: int = 10, num_workers: int = NUM_INDEXING_WORKERS) -> None:
|
||||
def update_loop(
|
||||
delay: int = 10,
|
||||
num_workers: int = NUM_INDEXING_WORKERS,
|
||||
num_secondary_workers: int = NUM_SECONDARY_INDEXING_WORKERS,
|
||||
) -> None:
|
||||
engine = get_sqlalchemy_engine()
|
||||
with Session(engine) as db_session:
|
||||
check_index_swap(db_session=db_session)
|
||||
db_embedding_model = get_current_db_embedding_model(db_session)
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
|
||||
# So that the first time users aren't surprised by really slow speed of first
|
||||
# batch of documents indexed
|
||||
logger.info("Running a first inference to warm up embedding model")
|
||||
warm_up_encoders(
|
||||
model_name=db_embedding_model.model_name,
|
||||
normalize=db_embedding_model.normalize,
|
||||
model_server_host=INDEXING_MODEL_SERVER_HOST,
|
||||
model_server_port=MODEL_SERVER_PORT,
|
||||
)
|
||||
# So that the first time users aren't surprised by really slow speed of first
|
||||
# batch of documents indexed
|
||||
|
||||
if search_settings.provider_type is None:
|
||||
logger.notice("Running a first inference to warm up embedding model")
|
||||
embedding_model = EmbeddingModel.from_db_model(
|
||||
search_settings=search_settings,
|
||||
server_host=INDEXING_MODEL_SERVER_HOST,
|
||||
server_port=MODEL_SERVER_PORT,
|
||||
)
|
||||
|
||||
warm_up_bi_encoder(
|
||||
embedding_model=embedding_model,
|
||||
)
|
||||
|
||||
client_primary: Client | SimpleJobClient
|
||||
client_secondary: Client | SimpleJobClient
|
||||
@@ -354,7 +430,7 @@ def update_loop(delay: int = 10, num_workers: int = NUM_INDEXING_WORKERS) -> Non
|
||||
silence_logs=logging.ERROR,
|
||||
)
|
||||
cluster_secondary = LocalCluster(
|
||||
n_workers=num_workers,
|
||||
n_workers=num_secondary_workers,
|
||||
threads_per_worker=1,
|
||||
silence_logs=logging.ERROR,
|
||||
)
|
||||
@@ -364,18 +440,18 @@ def update_loop(delay: int = 10, num_workers: int = NUM_INDEXING_WORKERS) -> Non
|
||||
client_primary.register_worker_plugin(ResourceLogger())
|
||||
else:
|
||||
client_primary = SimpleJobClient(n_workers=num_workers)
|
||||
client_secondary = SimpleJobClient(n_workers=num_workers)
|
||||
client_secondary = SimpleJobClient(n_workers=num_secondary_workers)
|
||||
|
||||
existing_jobs: dict[int, Future | SimpleJob] = {}
|
||||
|
||||
while True:
|
||||
start = time.time()
|
||||
start_time_utc = datetime.utcfromtimestamp(start).strftime("%Y-%m-%d %H:%M:%S")
|
||||
logger.info(f"Running update, current UTC time: {start_time_utc}")
|
||||
logger.debug(f"Running update, current UTC time: {start_time_utc}")
|
||||
|
||||
if existing_jobs:
|
||||
# TODO: make this debug level once the "no jobs are being scheduled" issue is resolved
|
||||
logger.info(
|
||||
logger.debug(
|
||||
"Found existing indexing jobs: "
|
||||
f"{[(attempt_id, job.status) for attempt_id, job in existing_jobs.items()]}"
|
||||
)
|
||||
@@ -398,7 +474,10 @@ def update_loop(delay: int = 10, num_workers: int = NUM_INDEXING_WORKERS) -> Non
|
||||
|
||||
|
||||
def update__main() -> None:
|
||||
logger.info("Starting Indexing Loop")
|
||||
set_is_ee_based_on_env_variable()
|
||||
init_sqlalchemy_engine(POSTGRES_INDEXER_APP_NAME)
|
||||
|
||||
logger.notice("Starting indexing service")
|
||||
update_loop()
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import re
|
||||
from collections.abc import Sequence
|
||||
from typing import cast
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
@@ -9,53 +8,46 @@ from danswer.chat.models import LlmDoc
|
||||
from danswer.db.chat import get_chat_messages_by_session
|
||||
from danswer.db.models import ChatMessage
|
||||
from danswer.llm.answering.models import PreviousMessage
|
||||
from danswer.search.models import InferenceChunk
|
||||
from danswer.search.models import InferenceSection
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def llm_doc_from_inference_section(inf_chunk: InferenceSection) -> LlmDoc:
|
||||
def llm_doc_from_inference_section(inference_section: InferenceSection) -> LlmDoc:
|
||||
return LlmDoc(
|
||||
document_id=inf_chunk.document_id,
|
||||
document_id=inference_section.center_chunk.document_id,
|
||||
# This one is using the combined content of all the chunks of the section
|
||||
# In default settings, this is the same as just the content of base chunk
|
||||
content=inf_chunk.combined_content,
|
||||
blurb=inf_chunk.blurb,
|
||||
semantic_identifier=inf_chunk.semantic_identifier,
|
||||
source_type=inf_chunk.source_type,
|
||||
metadata=inf_chunk.metadata,
|
||||
updated_at=inf_chunk.updated_at,
|
||||
link=inf_chunk.source_links[0] if inf_chunk.source_links else None,
|
||||
source_links=inf_chunk.source_links,
|
||||
content=inference_section.combined_content,
|
||||
blurb=inference_section.center_chunk.blurb,
|
||||
semantic_identifier=inference_section.center_chunk.semantic_identifier,
|
||||
source_type=inference_section.center_chunk.source_type,
|
||||
metadata=inference_section.center_chunk.metadata,
|
||||
updated_at=inference_section.center_chunk.updated_at,
|
||||
link=inference_section.center_chunk.source_links[0]
|
||||
if inference_section.center_chunk.source_links
|
||||
else None,
|
||||
source_links=inference_section.center_chunk.source_links,
|
||||
)
|
||||
|
||||
|
||||
def map_document_id_order(
|
||||
chunks: Sequence[InferenceChunk | LlmDoc], one_indexed: bool = True
|
||||
) -> dict[str, int]:
|
||||
order_mapping = {}
|
||||
current = 1 if one_indexed else 0
|
||||
for chunk in chunks:
|
||||
if chunk.document_id not in order_mapping:
|
||||
order_mapping[chunk.document_id] = current
|
||||
current += 1
|
||||
|
||||
return order_mapping
|
||||
|
||||
|
||||
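map_document_id_order above assigns each document id its first-seen position; repeats keep their original slot. A quick illustration of that behavior:

from types import SimpleNamespace

chunks = [
    SimpleNamespace(document_id="doc-a"),
    SimpleNamespace(document_id="doc-b"),
    SimpleNamespace(document_id="doc-a"),  # repeat keeps its first position
]

order: dict[str, int] = {}
current = 1  # one_indexed=True
for chunk in chunks:
    if chunk.document_id not in order:
        order[chunk.document_id] = current
        current += 1

print(order)  # {'doc-a': 1, 'doc-b': 2}
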
def create_chat_chain(
|
||||
chat_session_id: int,
|
||||
db_session: Session,
|
||||
prefetch_tool_calls: bool = True,
|
||||
# Optional id at which we finish processing
|
||||
stop_at_message_id: int | None = None,
|
||||
) -> tuple[ChatMessage, list[ChatMessage]]:
|
||||
"""Build the linear chain of messages without including the root message"""
|
||||
mainline_messages: list[ChatMessage] = []
|
||||
|
||||
all_chat_messages = get_chat_messages_by_session(
|
||||
chat_session_id=chat_session_id,
|
||||
user_id=None,
|
||||
db_session=db_session,
|
||||
skip_permission_check=True,
|
||||
prefetch_tool_calls=prefetch_tool_calls,
|
||||
)
|
||||
id_to_msg = {msg.id: msg for msg in all_chat_messages}
|
||||
|
||||
@@ -71,7 +63,12 @@ def create_chat_chain(
|
||||
current_message: ChatMessage | None = root_message
|
||||
while current_message is not None:
|
||||
child_msg = current_message.latest_child_message
|
||||
if not child_msg:
|
||||
|
||||
# Break if at the end of the chain
|
||||
# or have reached the `final_id` of the submitted message
|
||||
if not child_msg or (
|
||||
stop_at_message_id and current_message.id == stop_at_message_id
|
||||
):
|
||||
break
|
||||
current_message = id_to_msg.get(child_msg)
|
||||
|
||||
|
||||
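The loop above walks parent-to-latest-child links and now also stops early when it reaches stop_at_message_id. A simplified, self-contained sketch of that walk; Msg is a stand-in for the ChatMessage model, and unlike create_chat_chain this version includes the root.

from dataclasses import dataclass


@dataclass
class Msg:
    id: int
    latest_child_message: int | None


def linear_chain(root, id_to_msg, stop_at_message_id=None):
    chain = []
    current = root
    while current is not None:
        chain.append(current)
        child_id = current.latest_child_message
        # stop at the end of the chain or at the requested message
        if not child_id or (stop_at_message_id and current.id == stop_at_message_id):
            break
        current = id_to_msg.get(child_id)
    return chain


msgs = {1: Msg(1, 2), 2: Msg(2, 3), 3: Msg(3, None)}
print([m.id for m in linear_chain(msgs[1], msgs)])                        # [1, 2, 3]
print([m.id for m in linear_chain(msgs[1], msgs, stop_at_message_id=2)])  # [1, 2]
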
Some files were not shown because too many files have changed in this diff.