Compare commits

..

50 Commits

Author SHA1 Message Date
pablodanswer  4eb53ce56f  rebase needs fixing  2024-08-19 07:40:53 -07:00
pablodanswer  2fc84ed63e  post rebase fix  2024-08-18 16:41:12 -07:00
pablodanswer  722d5e6e54  add sequential tool calls  2024-08-18 16:40:07 -07:00
pablodanswer  14c30d2e4d  add env variable  2024-08-18 15:05:44 -07:00
pablodanswer  6abad2fdd3  robust chat session state persistence  2024-08-18 15:05:44 -07:00
pablodanswer  4691e736f6  functional new message carry-over  2024-08-18 15:05:44 -07:00
pablodanswer  5a826a527f  properly reset blank screen  2024-08-18 15:05:44 -07:00
pablodanswer  f92d31df70  refactored for stop / regenerate  2024-08-18 15:05:44 -07:00
pablodanswer  1eb786897a  proper margin  2024-08-18 15:05:26 -07:00
pablodanswer  72471f9e1d  remove parameter  2024-08-18 15:05:26 -07:00
pablodanswer  49c335d06a  squash  2024-08-18 15:05:26 -07:00
pablodanswer  fda06b7739  more robust implementation for first messages  2024-08-18 15:05:26 -07:00
pablodanswer  00d44e31b3  validated + cleaner UI  2024-08-18 15:05:26 -07:00
pablodanswer  2a42c1dd18  functional once again post rebase but quite ugly  2024-08-18 15:05:26 -07:00
pablodanswer  05cd25043e  add regenerate  2024-08-18 15:05:26 -07:00
pablodanswer  abebff50bb  Enable seeding of analytics via file path (#2146)  2024-08-18 15:05:26 -07:00
    * enable seeding of analytics via file path
    * remove log
pablodanswer  0a7e672832  add handling for poorly formatting model names (#2143)  2024-08-18 15:05:26 -07:00
pablodanswer  221ab9134c  add critical error just in case  2024-08-18 15:03:04 -07:00
pablodanswer  f7134202b6  slightly more specific logs  2024-08-18 14:44:10 -07:00
pablodanswer  bea11dc3aa  include logs  2024-08-18 14:33:45 -07:00
pablodanswer  374b798071  update typing  2024-08-17 13:51:52 -07:00
pablodanswer  6a2e3edfcd  add synchronous wrapper to avoid hampering main event loop  2024-08-17 13:39:22 -07:00
pablodanswer  2ef1731e32  tiny formatting (remove newline)  2024-08-17 09:29:39 -07:00
pablodanswer  7d4d7a5f5d  clean final message handling  2024-08-17 01:14:31 -07:00
pablodanswer  ea2f9cf625  cleaner messages  2024-08-15 17:17:03 -07:00
pablodanswer  97dc9c5e31  add back stack trace detail  2024-08-15 16:46:32 -07:00
pablodanswer  249bcd46d9  clearer  2024-08-15 16:10:56 -07:00
pablodanswer  f29b727bc7  remove comments  2024-08-15 16:10:56 -07:00
pablodanswer  31fb6c0753  improve clarity + new SSE handling utility function  2024-08-15 16:10:56 -07:00
pablodanswer  a45e72c298  update utility + copy  2024-08-15 16:10:56 -07:00
pablodanswer  157548817c  slightly more robust chat state  2024-08-15 16:10:56 -07:00
pablodanswer  d9396f77d1  remove false comment  2024-08-15 16:10:56 -07:00
pablodanswer  7bae6bbf8f  remove log  2024-08-15 16:10:56 -07:00
pablodanswer  1d535769ed  robustify  2024-08-15 16:10:56 -07:00
pablodanswer  8584a81fe2  unnecessary list removed  2024-08-15 16:10:56 -07:00
pablodanswer  5f4ac19928  robustify typing  2024-08-15 16:10:56 -07:00
pablodanswer  d898e4f738  remove logs  2024-08-15 16:10:56 -07:00
pablodanswer  19412f0aa0  add ChatState for more robust handling  2024-08-15 16:10:56 -07:00
pablodanswer  c338de30fd  add new loading state to prevent collisions  2024-08-15 16:10:56 -07:00
pablodanswer  edfde621b9  formatting  2024-08-15 16:10:56 -07:00
pablodanswer  9306abf911  migrate to streaming response  2024-08-15 16:10:56 -07:00
pablodanswer  70d885b621  cleaner loop + data persistence  2024-08-15 16:10:56 -07:00
pablodanswer  53bea4f859  robustify frontend handling  2024-08-15 16:10:55 -07:00
pablodanswer  a79d734d96  typing  2024-08-15 16:10:28 -07:00
pablodanswer  25cd7de147  remove logs  2024-08-15 16:10:28 -07:00
pablodanswer  ab2916c807  robustify switching  2024-08-15 16:10:28 -07:00
pablodanswer  96112f1f95  functional rework of temporary user/assistant ID  2024-08-15 16:10:28 -07:00
pablodanswer  54502b32d3  remove logs  2024-08-15 16:10:28 -07:00
pablodanswer  9431e6c06c  remove commits  2024-08-15 16:10:28 -07:00
pablodanswer  f18571d580  functional types + sidebar  2024-08-15 16:10:28 -07:00
560 changed files with 10984 additions and 23261 deletions

View File

@@ -1,76 +0,0 @@
name: 'Build and Push Docker Image with Retry'
description: 'Attempts to build and push a Docker image, with a retry on failure'
inputs:
context:
description: 'Build context'
required: true
file:
description: 'Dockerfile location'
required: true
platforms:
description: 'Target platforms'
required: true
pull:
description: 'Always attempt to pull a newer version of the image'
required: false
default: 'true'
push:
description: 'Push the image to registry'
required: false
default: 'true'
load:
description: 'Load the image into Docker daemon'
required: false
default: 'true'
tags:
description: 'Image tags'
required: true
cache-from:
description: 'Cache sources'
required: false
cache-to:
description: 'Cache destinations'
required: false
retry-wait-time:
description: 'Time to wait before retry in seconds'
required: false
default: '5'
runs:
using: "composite"
steps:
- name: Build and push Docker image (First Attempt)
id: buildx1
uses: docker/build-push-action@v5
continue-on-error: true
with:
context: ${{ inputs.context }}
file: ${{ inputs.file }}
platforms: ${{ inputs.platforms }}
pull: ${{ inputs.pull }}
push: ${{ inputs.push }}
load: ${{ inputs.load }}
tags: ${{ inputs.tags }}
cache-from: ${{ inputs.cache-from }}
cache-to: ${{ inputs.cache-to }}
- name: Wait to retry
if: steps.buildx1.outcome != 'success'
run: |
echo "First attempt failed. Waiting ${{ inputs.retry-wait-time }} seconds before retry..."
sleep ${{ inputs.retry-wait-time }}
shell: bash
- name: Build and push Docker image (Retry Attempt)
if: steps.buildx1.outcome != 'success'
uses: docker/build-push-action@v5
with:
context: ${{ inputs.context }}
file: ${{ inputs.file }}
platforms: ${{ inputs.platforms }}
pull: ${{ inputs.pull }}
push: ${{ inputs.push }}
load: ${{ inputs.load }}
tags: ${{ inputs.tags }}
cache-from: ${{ inputs.cache-from }}
cache-to: ${{ inputs.cache-to }}

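For reference, the composite action deleted above is invoked elsewhere in this same comparison: the integration-test workflow further down builds its backend, model server, and test-runner images through it. A minimal sketch of such a call site follows; the job name, runner, image tag, and cache refs are illustrative placeholders, not values from this repository.

```yaml
# Sketch of a workflow step calling the custom-build-and-push composite action.
# Inputs mirror those declared above; image name and cache refs are placeholders.
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build image with retry
        uses: ./.github/actions/custom-build-and-push
        with:
          context: ./backend
          file: ./backend/Dockerfile
          platforms: linux/amd64
          tags: example/backend:latest                              # placeholder tag
          cache-from: type=registry,ref=example/backend:buildcache  # placeholder ref
          cache-to: type=registry,ref=example/backend:buildcache,mode=max
          retry-wait-time: '10'
```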
View File

@@ -1,23 +0,0 @@
name: Check Backend Changes
on:
workflow_call:
outputs:
run-tests:
description: "Whether to run tests based on backend changes"
value: ${{ jobs.check-run-needed.outputs.run-tests }}
jobs:
check-run-needed:
runs-on: ubuntu-latest
outputs:
run-tests: ${{ steps.check.outputs.run-tests }}
steps:
- uses: actions/checkout@v4
- id: check
run: |
if git diff --name-only ${{ github.event.before }} ${{ github.sha }} | grep -q '^backend/'; then
echo "run-tests=true" >> $GITHUB_OUTPUT
else
echo "run-tests=false" >> $GITHUB_OUTPUT
fi

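This reusable workflow exposes a single `run-tests` output; the Python Checks, Python Unit Tests, Connector Tests, and integration-test workflows later in this comparison gate their jobs on it. A minimal sketch of the caller-side wiring, with illustrative job and step contents:

```yaml
# Sketch of a caller consuming the check-backend-changes reusable workflow.
# The run-tests output gates the real job; a skip job reports when nothing ran.
jobs:
  check-changes:
    uses: ./.github/workflows/check-backend-changes.yml
  backend-tests:
    needs: check-changes
    if: needs.check-changes.outputs.run-tests == 'true'
    runs-on: ubuntu-latest
    steps:
      - run: echo "Backend changed, running tests."
  skip-tests:
    needs: check-changes
    if: needs.check-changes.outputs.run-tests == 'false'
    runs-on: ubuntu-latest
    steps:
      - run: echo "No changes in backend, skipping this test."
```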
View File

@@ -0,0 +1,33 @@
name: Build Backend Image on Merge Group
on:
merge_group:
types: [checks_requested]
env:
REGISTRY_IMAGE: danswer/danswer-backend
jobs:
build:
# TODO: make this a matrix build like the web containers
runs-on:
group: amd64-image-builders
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Backend Image Docker Build
uses: docker/build-push-action@v5
with:
context: ./backend
file: ./backend/Dockerfile
platforms: linux/amd64,linux/arm64
push: false
tags: |
${{ env.REGISTRY_IMAGE }}:latest
build-args: |
DANSWER_VERSION=v0.0.1

View File

@@ -0,0 +1,53 @@
name: Build Web Image on Merge Group
on:
merge_group:
types: [checks_requested]
env:
REGISTRY_IMAGE: danswer/danswer-web-server
jobs:
build:
runs-on:
group: ${{ matrix.platform == 'linux/amd64' && 'amd64-image-builders' || 'arm64-image-builders' }}
strategy:
fail-fast: false
matrix:
platform:
- linux/amd64
- linux/arm64
steps:
- name: Prepare
run: |
platform=${{ matrix.platform }}
echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
- name: Checkout
uses: actions/checkout@v4
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY_IMAGE }}
tags: |
type=raw,value=${{ env.REGISTRY_IMAGE }}:latest
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Build by digest
id: build
uses: docker/build-push-action@v5
with:
context: ./web
file: ./web/Dockerfile
platforms: ${{ matrix.platform }}
push: false
build-args: |
DANSWER_VERSION=v0.0.1
# needed due to weird interactions with the builds for different platforms
no-cache: true
labels: ${{ steps.meta.outputs.labels }}

View File

@@ -1,67 +0,0 @@
# This workflow is intentionally disabled while we're still working on it
# It's close to ready, but a race condition needs to be fixed with
# API server and Vespa startup, and it needs to have a way to build/test against
# local containers
name: Helm - Lint and Test Charts
on:
merge_group:
pull_request:
branches: [ main ]
jobs:
lint-test:
runs-on: Amd64
# fetch-depth 0 is required for helm/chart-testing-action
steps:
- name: Checkout code
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Set up Helm
uses: azure/setup-helm@v4.2.0
with:
version: v3.14.4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.11'
cache: 'pip'
cache-dependency-path: |
backend/requirements/default.txt
backend/requirements/dev.txt
backend/requirements/model_server.txt
- run: |
python -m pip install --upgrade pip
pip install -r backend/requirements/default.txt
pip install -r backend/requirements/dev.txt
pip install -r backend/requirements/model_server.txt
- name: Set up chart-testing
uses: helm/chart-testing-action@v2.6.1
- name: Run chart-testing (list-changed)
id: list-changed
run: |
changed=$(ct list-changed --target-branch ${{ github.event.repository.default_branch }})
if [[ -n "$changed" ]]; then
echo "changed=true" >> "$GITHUB_OUTPUT"
fi
- name: Run chart-testing (lint)
# if: steps.list-changed.outputs.changed == 'true'
run: ct lint --all --config ct.yaml --target-branch ${{ github.event.repository.default_branch }}
- name: Create kind cluster
# if: steps.list-changed.outputs.changed == 'true'
uses: helm/kind-action@v1.10.0
- name: Run chart-testing (install)
# if: steps.list-changed.outputs.changed == 'true'
run: ct install --all --config ct.yaml
# run: ct install --target-branch ${{ github.event.repository.default_branch }}

View File

@@ -1,17 +1,11 @@
name: Python Checks
on:
merge_group:
pull_request:
branches: [ main ]
jobs:
check-changes:
uses: ./.github/workflows/check-backend-changes.yml
mypy-check:
needs: check-changes
if: needs.check-changes.outputs.run-tests == 'true'
runs-on: ubuntu-latest
steps:
@@ -52,10 +46,3 @@ jobs:
run: |
cd backend
black --check .
skip-tests:
needs: check-changes
if: needs.check-changes.outputs.run-tests == 'false'
runs-on: ubuntu-latest
steps:
- run: echo "No changes in backend, skipping this test."

View File

@@ -1,69 +0,0 @@
name: Connector Tests
on:
pull_request:
branches: [main]
schedule:
# This cron expression runs the job daily at 16:00 UTC (9am PT)
- cron: "0 16 * * *"
env:
# Confluence
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
jobs:
check-changes:
uses: ./.github/workflows/check-backend-changes.yml
connectors-check:
needs: check-changes
if: needs.check-changes.outputs.run-tests == 'true'
runs-on: ubuntu-latest
env:
PYTHONPATH: ./backend
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: "3.11"
cache: "pip"
cache-dependency-path: |
backend/requirements/default.txt
backend/requirements/dev.txt
- name: Install Dependencies
run: |
python -m pip install --upgrade pip
pip install -r backend/requirements/default.txt
pip install -r backend/requirements/dev.txt
- name: Run Tests
shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
run: py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/connectors
- name: Alert on Failure
if: failure() && github.event_name == 'schedule'
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
run: |
curl -X POST \
-H 'Content-type: application/json' \
--data '{"text":"Scheduled Connector Tests failed! Check the run at: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"}' \
$SLACK_WEBHOOK
skip-tests:
needs: check-changes
if: needs.check-changes.outputs.run-tests == 'false'
runs-on: ubuntu-latest
steps:
- run: echo "No changes in backend, skipping this test."

View File

@@ -1,17 +1,11 @@
name: Python Unit Tests
on:
merge_group:
pull_request:
branches: [ main ]
jobs:
check-changes:
uses: ./.github/workflows/check-backend-changes.yml
backend-check:
needs: check-changes
if: needs.check-changes.outputs.run-tests == 'true'
runs-on: ubuntu-latest
env:
@@ -39,11 +33,3 @@ jobs:
- name: Run Tests
shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
run: py.test -o junit_family=xunit2 -xv --ff backend/tests/unit
skip-tests:
needs: check-changes
if: needs.check-changes.outputs.run-tests == 'false'
runs-on: ubuntu-latest
steps:
- run: echo "No changes in backend, skipping this test."

View File

@@ -4,19 +4,18 @@ concurrency:
cancel-in-progress: true
on:
merge_group:
pull_request: null
jobs:
quality-checks:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- uses: actions/setup-python@v5
with:
python-version: "3.11"
- uses: pre-commit/action@v3.0.0
with:
extra_args: ${{ github.event_name == 'pull_request' && format('--from-ref {0} --to-ref {1}', github.event.pull_request.base.sha, github.event.pull_request.head.sha) || '' }}
- uses: actions/checkout@v4
with:
fetch-depth: 0
- uses: actions/setup-python@v5
with:
python-version: '3.11'
- uses: pre-commit/action@v3.0.0
with:
extra_args: --from-ref ${{ github.event.pull_request.base.sha }} --to-ref ${{ github.event.pull_request.head.sha }}

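The quality-checks workflow above only runs whatever hooks the repository's pre-commit configuration defines; that file is not part of this diff. A hypothetical minimal `.pre-commit-config.yaml` consistent with the hooks named in CONTRIBUTING.md below (black and reorder-python-imports) might look roughly like the following; the revision pins are placeholders, not the repository's actual values.

```yaml
# Hypothetical minimal pre-commit config matching the hooks CONTRIBUTING.md mentions.
# Revisions are placeholders; pin them to whatever the project actually uses.
repos:
  - repo: https://github.com/psf/black
    rev: 23.3.0                     # placeholder revision
    hooks:
      - id: black
  - repo: https://github.com/asottile/reorder_python_imports
    rev: v3.9.0                     # placeholder revision
    hooks:
      - id: reorder-python-imports
```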
View File

@@ -1,172 +0,0 @@
name: Run Integration Tests
concurrency:
group: Run-Integration-Tests-${{ github.head_ref }}
cancel-in-progress: true
on:
merge_group:
pull_request:
branches: [ main ]
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
jobs:
check-changes:
uses: ./.github/workflows/check-backend-changes.yml
integration-tests:
needs: check-changes
if: needs.check-changes.outputs.run-tests == 'true'
runs-on:
group: 'arm64-image-builders'
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
# NOTE: we don't need to build the Web Docker image since it's not used
# during the IT for now. We have a separate action to verify it builds
# successfully
- name: Pull Web Docker image
run: |
docker pull danswer/danswer-web-server:latest
docker tag danswer/danswer-web-server:latest danswer/danswer-web-server:it
- name: Build Backend Docker image
uses: ./.github/actions/custom-build-and-push
with:
context: ./backend
file: ./backend/Dockerfile
platforms: linux/arm64
tags: danswer/danswer-backend:it
cache-from: type=registry,ref=danswer/danswer-backend:it
cache-to: |
type=registry,ref=danswer/danswer-backend:it,mode=max
type=inline
- name: Build Model Server Docker image
uses: ./.github/actions/custom-build-and-push
with:
context: ./backend
file: ./backend/Dockerfile.model_server
platforms: linux/arm64
tags: danswer/danswer-model-server:it
cache-from: type=registry,ref=danswer/danswer-model-server:it
cache-to: |
type=registry,ref=danswer/danswer-model-server:it,mode=max
type=inline
- name: Build integration test Docker image
uses: ./.github/actions/custom-build-and-push
with:
context: ./backend
file: ./backend/tests/integration/Dockerfile
platforms: linux/arm64
tags: danswer/integration-test-runner:it
cache-from: type=registry,ref=danswer/integration-test-runner:it
cache-to: |
type=registry,ref=danswer/integration-test-runner:it,mode=max
type=inline
- name: Start Docker containers
run: |
cd deployment/docker_compose
ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \
AUTH_TYPE=basic \
REQUIRE_EMAIL_VERIFICATION=false \
DISABLE_TELEMETRY=true \
IMAGE_TAG=it \
docker compose -f docker-compose.dev.yml -p danswer-stack up -d
id: start_docker
- name: Wait for service to be ready
run: |
echo "Starting wait-for-service script..."
start_time=$(date +%s)
timeout=300 # 5 minutes in seconds
while true; do
current_time=$(date +%s)
elapsed_time=$((current_time - start_time))
if [ $elapsed_time -ge $timeout ]; then
echo "Timeout reached. Service did not become ready in 5 minutes."
exit 1
fi
# Use curl with error handling to ignore specific exit code 56
response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/health || echo "curl_error")
if [ "$response" = "200" ]; then
echo "Service is ready!"
break
elif [ "$response" = "curl_error" ]; then
echo "Curl encountered an error, possibly exit code 56. Continuing to retry..."
else
echo "Service not ready yet (HTTP status $response). Retrying in 5 seconds..."
fi
sleep 5
done
echo "Finished waiting for service."
- name: Run integration tests
run: |
echo "Running integration tests..."
docker run --rm --network danswer-stack_default \
-e POSTGRES_HOST=relational_db \
-e POSTGRES_USER=postgres \
-e POSTGRES_PASSWORD=password \
-e POSTGRES_DB=postgres \
-e VESPA_HOST=index \
-e REDIS_HOST=cache \
-e API_SERVER_HOST=api_server \
-e OPENAI_API_KEY=${OPENAI_API_KEY} \
danswer/integration-test-runner:it
continue-on-error: true
id: run_tests
- name: Check test results
run: |
if [ ${{ steps.run_tests.outcome }} == 'failure' ]; then
echo "Integration tests failed. Exiting with error."
exit 1
else
echo "All integration tests passed successfully."
fi
- name: Save Docker logs
if: success() || failure()
run: |
cd deployment/docker_compose
docker compose -f docker-compose.dev.yml -p danswer-stack logs > docker-compose.log
mv docker-compose.log ${{ github.workspace }}/docker-compose.log
- name: Upload logs
if: success() || failure()
uses: actions/upload-artifact@v3
with:
name: docker-logs
path: ${{ github.workspace }}/docker-compose.log
- name: Stop Docker containers
run: |
cd deployment/docker_compose
docker compose -f docker-compose.dev.yml -p danswer-stack down -v
skip-tests:
needs: check-changes
if: needs.check-changes.outputs.run-tests == 'false'
runs-on: ubuntu-latest
steps:
- run: echo "No changes in backend, skipping this test."

.gitignore
View File

@@ -4,6 +4,6 @@
.mypy_cache
.idea
/deployment/data/nginx/app.conf
.vscode/
.vscode/launch.json
*.sw?
/backend/tests/regression/answer_quality/search_test_config.yaml

View File

@@ -1,5 +1,5 @@
# Copy this file to .env in the .vscode folder
# Fill in the <REPLACE THIS> values as needed, it is recommended to set the GEN_AI_API_KEY value to avoid having to set up an LLM in the UI
# Copy this file to .env at the base of the repo and fill in the <REPLACE THIS> values
# This will help with development iteration speed and reduce repeat tasks for dev
# Also check out danswer/backend/scripts/restart_containers.sh for a script to restart the containers which Danswer relies on outside of VSCode/Cursor processes
# For local dev, often user Authentication is not needed
@@ -15,7 +15,7 @@ LOG_LEVEL=debug
# This passes top N results to LLM an additional time for reranking prior to answer generation
# This step is quite heavy on token usage so we disable it for dev generally
DISABLE_LLM_DOC_RELEVANCE=False
DISABLE_LLM_DOC_RELEVANCE=True
# Useful if you want to toggle auth on/off (google_oauth/OIDC specifically)
@@ -27,9 +27,9 @@ REQUIRE_EMAIL_VERIFICATION=False
# Set these so if you wipe the DB, you don't end up having to go through the UI every time
GEN_AI_API_KEY=<REPLACE THIS>
# If answer quality isn't important for dev, use gpt-4o-mini since it's cheaper
GEN_AI_MODEL_VERSION=gpt-4o
FAST_GEN_AI_MODEL_VERSION=gpt-4o
# If answer quality isn't important for dev, use 3.5 turbo due to it being cheaper
GEN_AI_MODEL_VERSION=gpt-3.5-turbo
FAST_GEN_AI_MODEL_VERSION=gpt-3.5-turbo
# For Danswer Slack Bot, overrides the UI values so no need to set this up via UI every time
# Only needed if using DanswerBot
@@ -38,7 +38,7 @@ FAST_GEN_AI_MODEL_VERSION=gpt-4o
# Python stuff
PYTHONPATH=../backend
PYTHONPATH=./backend
PYTHONUNBUFFERED=1
@@ -49,3 +49,4 @@ BING_API_KEY=<REPLACE THIS>
# Enable the full set of Danswer Enterprise Edition features
# NOTE: DO NOT ENABLE THIS UNLESS YOU HAVE A PAID ENTERPRISE LICENSE (or if you are using this for local testing/development)
ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=False

View File

@@ -1,23 +1,15 @@
/* Copy this file into '.vscode/launch.json' or merge its contents into your existing configurations. */
/*
Copy this file into '.vscode/launch.json' or merge its
contents into your existing configurations.
*/
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"compounds": [
{
"name": "Run All Danswer Services",
"configurations": [
"Web Server",
"Model Server",
"API Server",
"Indexing",
"Background Jobs",
"Slack Bot"
]
}
],
"configurations": [
{
"name": "Web Server",
@@ -25,7 +17,7 @@
"request": "launch",
"cwd": "${workspaceRoot}/web",
"runtimeExecutable": "npm",
"envFile": "${workspaceFolder}/.vscode/.env",
"envFile": "${workspaceFolder}/.env",
"runtimeArgs": [
"run", "dev"
],
@@ -33,12 +25,11 @@
},
{
"name": "Model Server",
"consoleName": "Model Server",
"type": "debugpy",
"type": "python",
"request": "launch",
"module": "uvicorn",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"envFile": "${workspaceFolder}/.env",
"env": {
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1"
@@ -48,16 +39,16 @@
"--reload",
"--port",
"9000"
]
],
"consoleTitle": "Model Server"
},
{
"name": "API Server",
"consoleName": "API Server",
"type": "debugpy",
"type": "python",
"request": "launch",
"module": "uvicorn",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"envFile": "${workspaceFolder}/.env",
"env": {
"LOG_DANSWER_MODEL_INTERACTIONS": "True",
"LOG_LEVEL": "DEBUG",
@@ -68,32 +59,32 @@
"--reload",
"--port",
"8080"
]
],
"consoleTitle": "API Server"
},
{
"name": "Indexing",
"consoleName": "Indexing",
"type": "debugpy",
"type": "python",
"request": "launch",
"program": "danswer/background/update.py",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"envFile": "${workspaceFolder}/.env",
"env": {
"ENABLE_MULTIPASS_INDEXING": "false",
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
}
},
"consoleTitle": "Indexing"
},
// Celery and all async jobs, usually would include indexing as well but this is handled separately above for dev
{
"name": "Background Jobs",
"consoleName": "Background Jobs",
"type": "debugpy",
"type": "python",
"request": "launch",
"program": "scripts/dev_run_background_jobs.py",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"envFile": "${workspaceFolder}/.env",
"env": {
"LOG_DANSWER_MODEL_INTERACTIONS": "True",
"LOG_LEVEL": "DEBUG",
@@ -102,18 +93,18 @@
},
"args": [
"--no-indexing"
]
],
"consoleTitle": "Background Jobs"
},
// For the listener to access the Slack API,
// DANSWER_BOT_SLACK_APP_TOKEN & DANSWER_BOT_SLACK_BOT_TOKEN need to be set in .env file located in the root of the project
{
"name": "Slack Bot",
"consoleName": "Slack Bot",
"type": "debugpy",
"type": "python",
"request": "launch",
"program": "danswer/danswerbot/slack/listener.py",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"envFile": "${workspaceFolder}/.env",
"env": {
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
@@ -122,12 +113,11 @@
},
{
"name": "Pytest",
"consoleName": "Pytest",
"type": "debugpy",
"type": "python",
"request": "launch",
"module": "pytest",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"envFile": "${workspaceFolder}/.env",
"env": {
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
@@ -138,16 +128,18 @@
// Specify a specific module/test to run or provide nothing to run all tests
//"tests/unit/danswer/llm/answering/test_prune_and_merge.py"
]
},
}
],
"compounds": [
{
"name": "Clear and Restart External Volumes and Containers",
"type": "node",
"request": "launch",
"runtimeExecutable": "bash",
"runtimeArgs": ["${workspaceFolder}/backend/scripts/restart_containers.sh"],
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"stopOnEntry": true
"name": "Run Danswer",
"configurations": [
"Web Server",
"Model Server",
"API Server",
"Indexing",
"Background Jobs",
]
}
]
}

View File

@@ -48,24 +48,20 @@ We would love to see you there!
## Get Started 🚀
Danswer being a fully functional app, relies on some external software, specifically:
Danswer being a fully functional app, relies on some external pieces of software, specifically:
- [Postgres](https://www.postgresql.org/) (Relational DB)
- [Vespa](https://vespa.ai/) (Vector DB/Search Engine)
- [Redis](https://redis.io/) (Cache)
- [Nginx](https://nginx.org/) (Not needed for development flows generally)
> **Note:**
> This guide provides instructions to set up the Danswer specific services outside of Docker because it's easier for
> development purposes. However, you can also use the containers and update with local changes by providing the
> `--build` flag.
This guide provides instructions to set up the Danswer specific services outside of Docker because it's easier for
development purposes but also feel free to just use the containers and update with local changes by providing the
`--build` flag.
### Local Set Up
Be sure to use Python version 3.11.
It is recommended to use Python version 3.11
If using a lower version, modifications will have to be made to the code.
If using a higher version, sometimes some libraries will not be available (i.e. we had problems with Tensorflow in the past with higher versions of python).
If using a higher version, the version of Tensorflow we use may not be available for your platform.
#### Installing Requirements
@@ -77,9 +73,8 @@ python -m venv .venv
source .venv/bin/activate
```
> **Note:**
> This virtual environment MUST NOT be set up WITHIN the danswer directory if you plan on using mypy within certain IDEs.
> For simplicity, we recommend setting up the virtual environment outside of the danswer directory.
--> Note that this virtual environment MUST NOT be set up WITHIN the danswer
directory
_For Windows, activate the virtual environment using Command Prompt:_
```bash
@@ -94,22 +89,19 @@ Install the required python dependencies:
```bash
pip install -r danswer/backend/requirements/default.txt
pip install -r danswer/backend/requirements/dev.txt
pip install -r danswer/backend/requirements/ee.txt
pip install -r danswer/backend/requirements/model_server.txt
```
Install [Node.js and npm](https://docs.npmjs.com/downloading-and-installing-node-js-and-npm) for the frontend.
Once the above is done, navigate to `danswer/web` run:
```bash
npm i
```
Install Playwright (headless browser required by the Web Connector)
Install Playwright (required by the Web Connector)
> **Note:**
> If you have just run the pip install, open a new terminal and source the python virtual-env again.
> This will pull the updated PATH to include playwright
> Note: If you have just done the pip install, open a new terminal and source the python virtual-env again.
This will update the path to include playwright
Then install Playwright by running:
```bash
@@ -118,14 +110,11 @@ playwright install
#### Dependent Docker Containers
You will need Docker installed to run these containers.
First navigate to `danswer/deployment/docker_compose`, then start up Postgres/Vespa/Redis with:
First navigate to `danswer/deployment/docker_compose`, then start up Vespa and Postgres with:
```bash
docker compose -f docker-compose.dev.yml -p danswer-stack up -d index relational_db cache
docker compose -f docker-compose.dev.yml -p danswer-stack up -d index relational_db
```
(index refers to Vespa, relational_db refers to Postgres, and cache refers to Redis)
(index refers to Vespa and relational_db refers to Postgres)
#### Running Danswer
To start the frontend, navigate to `danswer/web` and run:
@@ -138,10 +127,11 @@ Navigate to `danswer/backend` and run:
```bash
uvicorn model_server.main:app --reload --port 9000
```
_For Windows (for compatibility with both PowerShell and Command Prompt):_
```bash
powershell -Command "uvicorn model_server.main:app --reload --port 9000"
powershell -Command "
uvicorn model_server.main:app --reload --port 9000
"
```
The first time running Danswer, you will need to run the DB migrations for Postgres.
@@ -164,7 +154,6 @@ To run the backend API server, navigate back to `danswer/backend` and run:
```bash
AUTH_TYPE=disabled uvicorn danswer.main:app --reload --port 8080
```
_For Windows (for compatibility with both PowerShell and Command Prompt):_
```bash
powershell -Command "
@@ -173,28 +162,20 @@ powershell -Command "
"
```
> **Note:**
> If you need finer logging, add the additional environment variable `LOG_LEVEL=DEBUG` to the relevant services.
Note: if you need finer logging, add the additional environment variable `LOG_LEVEL=DEBUG` to the relevant services.
### Formatting and Linting
#### Backend
For the backend, you'll need to setup pre-commit hooks (black / reorder-python-imports).
First, install pre-commit (if you don't have it already) following the instructions
[here](https://pre-commit.com/#installation).
With the virtual environment active, install the pre-commit library with:
```bash
pip install pre-commit
```
Then, from the `danswer/backend` directory, run:
```bash
pre-commit install
```
Additionally, we use `mypy` for static type checking.
Danswer is fully type-annotated, and we want to keep it that way!
Danswer is fully type-annotated, and we would like to keep it that way!
To run the mypy checks manually, run `python -m mypy .` from the `danswer/backend` directory.
@@ -205,7 +186,6 @@ Please double check that prettier passes before creating a pull request.
### Release Process
Danswer loosely follows the SemVer versioning standard.
Major changes are released with a "minor" version bump. Currently we use patch release versions to indicate small feature changes.
Danswer follows the semver versioning standard.
A set of Docker containers will be pushed automatically to DockerHub with every tag.
You can see the containers [here](https://hub.docker.com/search?q=danswer%2F).

View File

@@ -1,31 +0,0 @@
## Some additional notes for Mac Users
The base instructions to set up the development environment are located in [CONTRIBUTING.md](https://github.com/danswer-ai/danswer/blob/main/CONTRIBUTING.md).
### Setting up Python
Ensure [Homebrew](https://brew.sh/) is already set up.
Then install python 3.11.
```bash
brew install python@3.11
```
Add python 3.11 to your path: add the following line to ~/.zshrc
```
export PATH="$(brew --prefix)/opt/python@3.11/libexec/bin:$PATH"
```
> **Note:**
> You will need to open a new terminal for the path change above to take effect.
### Setting up Docker
On macOS, you will need to install [Docker Desktop](https://www.docker.com/products/docker-desktop/) and
ensure it is running before continuing with the docker commands.
### Formatting and Linting
MacOS will likely require you to remove some quarantine attributes on some of the hooks for them to execute properly.
After installing pre-commit, run the following command:
```bash
sudo xattr -r -d com.apple.quarantine ~/.cache/pre-commit
```

View File

@@ -75,8 +75,8 @@ Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"
# Pre-downloading NLTK for setups with limited egress
RUN python -c "import nltk; \
nltk.download('stopwords', quiet=True); \
nltk.download('wordnet', quiet=True); \
nltk.download('punkt', quiet=True);"
# nltk.download('wordnet', quiet=True); introduce this back if lemmatization is needed
# Set up application files
WORKDIR /app

View File

@@ -22,18 +22,14 @@ RUN apt-get remove -y --allow-remove-essential perl-base && \
# Download model weights
# Run Nomic to pull in the custom architecture and have it cached locally
RUN python -c "from transformers import AutoTokenizer; \
AutoTokenizer.from_pretrained('distilbert-base-uncased'); \
AutoTokenizer.from_pretrained('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
AutoTokenizer.from_pretrained('distilbert-base-uncased', cache_folder='/root/.cache/temp_huggingface/hub/'); \
AutoTokenizer.from_pretrained('mixedbread-ai/mxbai-rerank-xsmall-v1', cache_folder='/root/.cache/temp_huggingface/hub/'); \
from huggingface_hub import snapshot_download; \
snapshot_download(repo_id='danswer/hybrid-intent-token-classifier', revision='v1.0.3'); \
snapshot_download('nomic-ai/nomic-embed-text-v1'); \
snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
snapshot_download(repo_id='danswer/hybrid-intent-token-classifier', revision='v1.0.3', cache_dir='/root/.cache/temp_huggingface/hub/'); \
snapshot_download('nomic-ai/nomic-embed-text-v1', cache_dir='/root/.cache/temp_huggingface/hub/'); \
snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1', cache_dir='/root/.cache/temp_huggingface/hub/'); \
from sentence_transformers import SentenceTransformer; \
SentenceTransformer(model_name_or_path='nomic-ai/nomic-embed-text-v1', trust_remote_code=True);"
# In case the user has volumes mounted to /root/.cache/huggingface that they've downloaded while
# running Danswer, don't overwrite it with the built in cache folder
RUN mv /root/.cache/huggingface /root/.cache/temp_huggingface
SentenceTransformer(model_name_or_path='nomic-ai/nomic-embed-text-v1', trust_remote_code=True, cache_folder='/root/.cache/temp_huggingface/hub/');"
WORKDIR /app

View File

@@ -8,7 +8,6 @@ from sqlalchemy import pool
from sqlalchemy.engine import Connection
from sqlalchemy.ext.asyncio import create_async_engine
from celery.backends.database.session import ResultModelBase # type: ignore
from sqlalchemy.schema import SchemaItem
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
@@ -16,9 +15,7 @@ config = context.config
# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None and config.attributes.get(
"configure_logger", True
):
if config.config_file_name is not None:
fileConfig(config.config_file_name)
# add your model's MetaData object here
@@ -32,20 +29,6 @@ target_metadata = [Base.metadata, ResultModelBase.metadata]
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
EXCLUDE_TABLES = {"kombu_queue", "kombu_message"}
def include_object(
object: SchemaItem,
name: str,
type_: str,
reflected: bool,
compare_to: SchemaItem | None,
) -> bool:
if type_ == "table" and name in EXCLUDE_TABLES:
return False
return True
def run_migrations_offline() -> None:
"""Run migrations in 'offline' mode.
@@ -72,11 +55,7 @@ def run_migrations_offline() -> None:
def do_run_migrations(connection: Connection) -> None:
context.configure(
connection=connection,
target_metadata=target_metadata, # type: ignore
include_object=include_object,
) # type: ignore
context.configure(connection=connection, target_metadata=target_metadata) # type: ignore
with context.begin_transaction():
context.run_migrations()

View File

@@ -1,135 +0,0 @@
"""embedding model -> search settings
Revision ID: 1f60f60c3401
Revises: f17bf3b0d9f1
Create Date: 2024-08-25 12:39:51.731632
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
from danswer.configs.chat_configs import NUM_POSTPROCESSED_RESULTS
# revision identifiers, used by Alembic.
revision = "1f60f60c3401"
down_revision = "f17bf3b0d9f1"
branch_labels: None = None
depends_on: None = None
def upgrade() -> None:
op.drop_constraint(
"index_attempt__embedding_model_fk", "index_attempt", type_="foreignkey"
)
# Rename the table
op.rename_table("embedding_model", "search_settings")
# Add new columns
op.add_column(
"search_settings",
sa.Column(
"multipass_indexing", sa.Boolean(), nullable=False, server_default="true"
),
)
op.add_column(
"search_settings",
sa.Column(
"multilingual_expansion",
postgresql.ARRAY(sa.String()),
nullable=False,
server_default="{}",
),
)
op.add_column(
"search_settings",
sa.Column(
"disable_rerank_for_streaming",
sa.Boolean(),
nullable=False,
server_default="false",
),
)
op.add_column(
"search_settings", sa.Column("rerank_model_name", sa.String(), nullable=True)
)
op.add_column(
"search_settings", sa.Column("rerank_provider_type", sa.String(), nullable=True)
)
op.add_column(
"search_settings", sa.Column("rerank_api_key", sa.String(), nullable=True)
)
op.add_column(
"search_settings",
sa.Column(
"num_rerank",
sa.Integer(),
nullable=False,
server_default=str(NUM_POSTPROCESSED_RESULTS),
),
)
# Add the new column as nullable initially
op.add_column(
"index_attempt", sa.Column("search_settings_id", sa.Integer(), nullable=True)
)
# Populate the new column with data from the existing embedding_model_id
op.execute("UPDATE index_attempt SET search_settings_id = embedding_model_id")
# Create the foreign key constraint
op.create_foreign_key(
"fk_index_attempt_search_settings",
"index_attempt",
"search_settings",
["search_settings_id"],
["id"],
)
# Make the new column non-nullable
op.alter_column("index_attempt", "search_settings_id", nullable=False)
# Drop the old embedding_model_id column
op.drop_column("index_attempt", "embedding_model_id")
def downgrade() -> None:
# Add back the embedding_model_id column
op.add_column(
"index_attempt", sa.Column("embedding_model_id", sa.Integer(), nullable=True)
)
# Populate the old column with data from search_settings_id
op.execute("UPDATE index_attempt SET embedding_model_id = search_settings_id")
# Make the old column non-nullable
op.alter_column("index_attempt", "embedding_model_id", nullable=False)
# Drop the foreign key constraint
op.drop_constraint(
"fk_index_attempt_search_settings", "index_attempt", type_="foreignkey"
)
# Drop the new search_settings_id column
op.drop_column("index_attempt", "search_settings_id")
# Rename the table back
op.rename_table("search_settings", "embedding_model")
# Remove added columns
op.drop_column("embedding_model", "num_rerank")
op.drop_column("embedding_model", "rerank_api_key")
op.drop_column("embedding_model", "rerank_provider_type")
op.drop_column("embedding_model", "rerank_model_name")
op.drop_column("embedding_model", "disable_rerank_for_streaming")
op.drop_column("embedding_model", "multilingual_expansion")
op.drop_column("embedding_model", "multipass_indexing")
op.create_foreign_key(
"index_attempt__embedding_model_fk",
"index_attempt",
"embedding_model",
["embedding_model_id"],
["id"],
)

View File

@@ -1,32 +0,0 @@
"""Add Above Below to Persona
Revision ID: 2d2304e27d8c
Revises: 4b08d97e175a
Create Date: 2024-08-21 19:15:15.762948
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "2d2304e27d8c"
down_revision = "4b08d97e175a"
branch_labels: None = None
depends_on: None = None
def upgrade() -> None:
op.add_column("persona", sa.Column("chunks_above", sa.Integer(), nullable=True))
op.add_column("persona", sa.Column("chunks_below", sa.Integer(), nullable=True))
op.execute(
"UPDATE persona SET chunks_above = 1, chunks_below = 1 WHERE chunks_above IS NULL AND chunks_below IS NULL"
)
op.alter_column("persona", "chunks_above", nullable=False)
op.alter_column("persona", "chunks_below", nullable=False)
def downgrade() -> None:
op.drop_column("persona", "chunks_below")
op.drop_column("persona", "chunks_above")

View File

@@ -1,90 +0,0 @@
"""Add curator fields
Revision ID: 351faebd379d
Revises: ee3f4b47fad5
Create Date: 2024-08-15 22:37:08.397052
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "351faebd379d"
down_revision = "ee3f4b47fad5"
branch_labels: None = None
depends_on: None = None
def upgrade() -> None:
# Add is_curator column to User__UserGroup table
op.add_column(
"user__user_group",
sa.Column("is_curator", sa.Boolean(), nullable=False, server_default="false"),
)
# Use batch mode to modify the enum type
with op.batch_alter_table("user", schema=None) as batch_op:
batch_op.alter_column( # type: ignore[attr-defined]
"role",
type_=sa.Enum(
"BASIC",
"ADMIN",
"CURATOR",
"GLOBAL_CURATOR",
name="userrole",
native_enum=False,
),
existing_type=sa.Enum("BASIC", "ADMIN", name="userrole", native_enum=False),
existing_nullable=False,
)
# Create the association table
op.create_table(
"credential__user_group",
sa.Column("credential_id", sa.Integer(), nullable=False),
sa.Column("user_group_id", sa.Integer(), nullable=False),
sa.ForeignKeyConstraint(
["credential_id"],
["credential.id"],
),
sa.ForeignKeyConstraint(
["user_group_id"],
["user_group.id"],
),
sa.PrimaryKeyConstraint("credential_id", "user_group_id"),
)
op.add_column(
"credential",
sa.Column(
"curator_public", sa.Boolean(), nullable=False, server_default="false"
),
)
def downgrade() -> None:
# Update existing records to ensure they fit within the BASIC/ADMIN roles
op.execute(
"UPDATE \"user\" SET role = 'ADMIN' WHERE role IN ('CURATOR', 'GLOBAL_CURATOR')"
)
# Remove is_curator column from User__UserGroup table
op.drop_column("user__user_group", "is_curator")
with op.batch_alter_table("user", schema=None) as batch_op:
batch_op.alter_column( # type: ignore[attr-defined]
"role",
type_=sa.Enum(
"BASIC", "ADMIN", name="userrole", native_enum=False, length=20
),
existing_type=sa.Enum(
"BASIC",
"ADMIN",
"CURATOR",
"GLOBAL_CURATOR",
name="userrole",
native_enum=False,
),
existing_nullable=False,
)
# Drop the association table
op.drop_table("credential__user_group")
op.drop_column("credential", "curator_public")

View File

@@ -1,34 +0,0 @@
"""change default prune_freq
Revision ID: 4b08d97e175a
Revises: d9ec13955951
Create Date: 2024-08-20 15:28:52.993827
"""
from alembic import op
# revision identifiers, used by Alembic.
revision = "4b08d97e175a"
down_revision = "d9ec13955951"
branch_labels: None = None
depends_on: None = None
def upgrade() -> None:
op.execute(
"""
UPDATE connector
SET prune_freq = 2592000
WHERE prune_freq = 86400
"""
)
def downgrade() -> None:
op.execute(
"""
UPDATE connector
SET prune_freq = 86400
WHERE prune_freq = 2592000
"""
)

View File

@@ -1,66 +0,0 @@
"""Add last synced and last modified to document table
Revision ID: 52a219fb5233
Revises: f17bf3b0d9f1
Create Date: 2024-08-28 17:40:46.077470
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.sql import func
# revision identifiers, used by Alembic.
revision = "52a219fb5233"
down_revision = "f7e58d357687"
branch_labels = None
depends_on = None
def upgrade() -> None:
# last modified represents the last time anything needing syncing to vespa changed
# including row metadata and the document itself. This obviously does not include
# the last_synced column.
op.add_column(
"document",
sa.Column(
"last_modified",
sa.DateTime(timezone=True),
nullable=False,
server_default=func.now(),
),
)
# last synced represents the last time this document was synced to Vespa
op.add_column(
"document",
sa.Column("last_synced", sa.DateTime(timezone=True), nullable=True),
)
# Set last_synced to the same value as last_modified for existing rows
op.execute(
"""
UPDATE document
SET last_synced = last_modified
"""
)
op.create_index(
op.f("ix_document_last_modified"),
"document",
["last_modified"],
unique=False,
)
op.create_index(
op.f("ix_document_last_synced"),
"document",
["last_synced"],
unique=False,
)
def downgrade() -> None:
op.drop_index(op.f("ix_document_last_synced"), table_name="document")
op.drop_index(op.f("ix_document_last_modified"), table_name="document")
op.drop_column("document", "last_synced")
op.drop_column("document", "last_modified")

View File

@@ -10,7 +10,7 @@ import sqlalchemy as sa
from danswer.db.models import IndexModelStatus
from danswer.search.enums import RecencyBiasSetting
from danswer.search.enums import SearchType
from danswer.search.models import SearchType
# revision identifiers, used by Alembic.
revision = "776b3bbe9092"

View File

@@ -35,22 +35,18 @@ def upgrade() -> None:
op.execute(
"""
UPDATE index_attempt ia
SET connector_credential_pair_id = (
SELECT id FROM connector_credential_pair ccp
WHERE
(ia.connector_id IS NULL OR ccp.connector_id = ia.connector_id)
AND (ia.credential_id IS NULL OR ccp.credential_id = ia.credential_id)
LIMIT 1
)
WHERE ia.connector_id IS NOT NULL OR ia.credential_id IS NOT NULL
"""
)
# For good measure
op.execute(
"""
DELETE FROM index_attempt
WHERE connector_credential_pair_id IS NULL
SET connector_credential_pair_id =
CASE
WHEN ia.credential_id IS NULL THEN
(SELECT id FROM connector_credential_pair
WHERE connector_id = ia.connector_id
LIMIT 1)
ELSE
(SELECT id FROM connector_credential_pair
WHERE connector_id = ia.connector_id
AND credential_id = ia.credential_id)
END
WHERE ia.connector_id IS NOT NULL
"""
)

View File

@@ -1,158 +0,0 @@
"""migration confluence to be explicit
Revision ID: a3795dce87be
Revises: 1f60f60c3401
Create Date: 2024-09-01 13:52:12.006740
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
from sqlalchemy.sql import table, column
revision = "a3795dce87be"
down_revision = "1f60f60c3401"
branch_labels: None = None
depends_on: None = None
def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
from urllib.parse import urlparse
def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
parsed_url = urlparse(wiki_url)
wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split('/spaces')[0]}"
path_parts = parsed_url.path.split("/")
space = path_parts[3]
page_id = path_parts[5] if len(path_parts) > 5 else ""
return wiki_base, space, page_id
def _extract_confluence_keys_from_datacenter_url(
wiki_url: str,
) -> tuple[str, str, str]:
DISPLAY = "/display/"
PAGE = "/pages/"
parsed_url = urlparse(wiki_url)
wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.split(DISPLAY)[0]}"
space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
page_id = ""
if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
page_id = content[1]
return wiki_base, space, page_id
is_confluence_cloud = (
".atlassian.net/wiki/spaces/" in wiki_url
or ".jira.com/wiki/spaces/" in wiki_url
)
if is_confluence_cloud:
wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(wiki_url)
else:
wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
wiki_url
)
return wiki_base, space, page_id, is_confluence_cloud
def reconstruct_confluence_url(
wiki_base: str, space: str, page_id: str, is_cloud: bool
) -> str:
if is_cloud:
url = f"{wiki_base}/spaces/{space}"
if page_id:
url += f"/pages/{page_id}"
else:
url = f"{wiki_base}/display/{space}"
if page_id:
url += f"/pages/{page_id}"
return url
def upgrade() -> None:
connector = table(
"connector",
column("id", sa.Integer),
column("source", sa.String()),
column("input_type", sa.String()),
column("connector_specific_config", postgresql.JSONB),
)
# Fetch all Confluence connectors
connection = op.get_bind()
confluence_connectors = connection.execute(
sa.select(connector).where(
sa.and_(
connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
)
)
).fetchall()
for row in confluence_connectors:
config = row.connector_specific_config
wiki_page_url = config["wiki_page_url"]
wiki_base, space, page_id, is_cloud = extract_confluence_keys_from_url(
wiki_page_url
)
new_config = {
"wiki_base": wiki_base,
"space": space,
"page_id": page_id,
"is_cloud": is_cloud,
}
for key, value in config.items():
if key not in ["wiki_page_url"]:
new_config[key] = value
op.execute(
connector.update()
.where(connector.c.id == row.id)
.values(connector_specific_config=new_config)
)
def downgrade() -> None:
connector = table(
"connector",
column("id", sa.Integer),
column("source", sa.String()),
column("input_type", sa.String()),
column("connector_specific_config", postgresql.JSONB),
)
confluence_connectors = (
op.get_bind()
.execute(
sa.select(connector).where(
connector.c.source == "CONFLUENCE", connector.c.input_type == "POLL"
)
)
.fetchall()
)
for row in confluence_connectors:
config = row.connector_specific_config
if all(key in config for key in ["wiki_base", "space", "is_cloud"]):
wiki_page_url = reconstruct_confluence_url(
config["wiki_base"],
config["space"],
config.get("page_id", ""),
config["is_cloud"],
)
new_config = {"wiki_page_url": wiki_page_url}
new_config.update(
{
k: v
for k, v in config.items()
if k not in ["wiki_base", "space", "page_id", "is_cloud"]
}
)
op.execute(
connector.update()
.where(connector.c.id == row.id)
.values(connector_specific_config=new_config)
)

View File

@@ -1,26 +0,0 @@
"""add support for litellm proxy in reranking
Revision ID: ba98eba0f66a
Revises: bceb1e139447
Create Date: 2024-09-06 10:36:04.507332
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "ba98eba0f66a"
down_revision = "bceb1e139447"
branch_labels: None = None
depends_on: None = None
def upgrade() -> None:
op.add_column(
"search_settings", sa.Column("rerank_api_url", sa.String(), nullable=True)
)
def downgrade() -> None:
op.drop_column("search_settings", "rerank_api_url")

View File

@@ -1,26 +0,0 @@
"""Add base_url to CloudEmbeddingProvider
Revision ID: bceb1e139447
Revises: a3795dce87be
Create Date: 2024-08-28 17:00:52.554580
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "bceb1e139447"
down_revision = "a3795dce87be"
branch_labels: None = None
depends_on: None = None
def upgrade() -> None:
op.add_column(
"embedding_provider", sa.Column("api_url", sa.String(), nullable=True)
)
def downgrade() -> None:
op.drop_column("embedding_provider", "api_url")

View File

@@ -1,57 +0,0 @@
"""Add index_attempt_errors table
Revision ID: c5b692fa265c
Revises: 4a951134c801
Create Date: 2024-08-08 14:06:39.581972
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "c5b692fa265c"
down_revision = "4a951134c801"
branch_labels: None = None
depends_on: None = None
def upgrade() -> None:
op.create_table(
"index_attempt_errors",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("index_attempt_id", sa.Integer(), nullable=True),
sa.Column("batch", sa.Integer(), nullable=True),
sa.Column(
"doc_summaries",
postgresql.JSONB(astext_type=sa.Text()),
nullable=False,
),
sa.Column("error_msg", sa.Text(), nullable=True),
sa.Column("traceback", sa.Text(), nullable=True),
sa.Column(
"time_created",
sa.DateTime(timezone=True),
server_default=sa.text("now()"),
nullable=False,
),
sa.ForeignKeyConstraint(
["index_attempt_id"],
["index_attempt.id"],
),
sa.PrimaryKeyConstraint("id"),
)
op.create_index(
"index_attempt_id",
"index_attempt_errors",
["time_created"],
unique=False,
)
# ### end Alembic commands ###
def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_index("index_attempt_id", table_name="index_attempt_errors")
op.drop_table("index_attempt_errors")
# ### end Alembic commands ###

View File

@@ -1,31 +0,0 @@
"""Remove _alt suffix from model_name
Revision ID: d9ec13955951
Revises: da4c21c69164
Create Date: 2024-08-20 16:31:32.955686
"""
from alembic import op
# revision identifiers, used by Alembic.
revision = "d9ec13955951"
down_revision = "da4c21c69164"
branch_labels: None = None
depends_on: None = None
def upgrade() -> None:
op.execute(
"""
UPDATE embedding_model
SET model_name = regexp_replace(model_name, '__danswer_alt_index$', '')
WHERE model_name LIKE '%__danswer_alt_index'
"""
)
def downgrade() -> None:
# We can't reliably add the __danswer_alt_index suffix back, so we'll leave this empty
pass

View File

@@ -1,65 +0,0 @@
"""chosen_assistants changed to jsonb
Revision ID: da4c21c69164
Revises: c5b692fa265c
Create Date: 2024-08-18 19:06:47.291491
"""
import json
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "da4c21c69164"
down_revision = "c5b692fa265c"
branch_labels: None = None
depends_on: None = None
def upgrade() -> None:
conn = op.get_bind()
existing_ids_and_chosen_assistants = conn.execute(
sa.text("select id, chosen_assistants from public.user")
)
op.drop_column(
"user",
"chosen_assistants",
)
op.add_column(
"user",
sa.Column(
"chosen_assistants",
postgresql.JSONB(astext_type=sa.Text()),
nullable=True,
),
)
for id, chosen_assistants in existing_ids_and_chosen_assistants:
conn.execute(
sa.text(
"update public.user set chosen_assistants = :chosen_assistants where id = :id"
),
{"chosen_assistants": json.dumps(chosen_assistants), "id": id},
)
def downgrade() -> None:
conn = op.get_bind()
existing_ids_and_chosen_assistants = conn.execute(
sa.text("select id, chosen_assistants from public.user")
)
op.drop_column(
"user",
"chosen_assistants",
)
op.add_column(
"user",
sa.Column("chosen_assistants", postgresql.ARRAY(sa.Integer()), nullable=True),
)
for id, chosen_assistants in existing_ids_and_chosen_assistants:
conn.execute(
sa.text(
"update public.user set chosen_assistants = :chosen_assistants where id = :id"
),
{"chosen_assistants": chosen_assistants, "id": id},
)

View File

@@ -9,7 +9,7 @@ from alembic import op
import sqlalchemy as sa
from sqlalchemy import table, column, String, Integer, Boolean
from danswer.db.search_settings import (
from danswer.db.embedding_model import (
get_new_default_embedding_model,
get_old_default_embedding_model,
user_has_overridden_embedding_model,
@@ -71,14 +71,14 @@ def upgrade() -> None:
"query_prefix": old_embedding_model.query_prefix,
"passage_prefix": old_embedding_model.passage_prefix,
"index_name": old_embedding_model.index_name,
"status": IndexModelStatus.PRESENT,
"status": old_embedding_model.status,
}
],
)
# if the user has not overridden the default embedding model via env variables,
# insert the new default model into the database to auto-upgrade them
if not user_has_overridden_embedding_model():
new_embedding_model = get_new_default_embedding_model()
new_embedding_model = get_new_default_embedding_model(is_present=False)
op.bulk_insert(
EmbeddingModel,
[

View File

@@ -0,0 +1,59 @@
"""migrate tool calls
Revision ID: eb690a089310
Revises: ee3f4b47fad5
Create Date: 2024-08-04 17:07:47.533051
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "eb690a089310"
down_revision = "ee3f4b47fad5"
branch_labels = None
depends_on = None
def upgrade() -> None:
# Create the new column
op.add_column(
"chat_message", sa.Column("tool_call_id", sa.Integer(), nullable=True)
)
op.create_foreign_key(
"fk_chat_message_tool_call",
"chat_message",
"tool_call",
["tool_call_id"],
["id"],
)
# Migrate existing data
op.execute(
"UPDATE chat_message SET tool_call_id = (SELECT id FROM tool_call WHERE tool_call.message_id = chat_message.id LIMIT 1)"
)
# Drop the old relationship
op.drop_constraint("tool_call_message_id_fkey", "tool_call", type_="foreignkey")
op.drop_column("tool_call", "message_id")
def downgrade() -> None:
# Add back the old column
op.add_column(
"tool_call",
sa.Column("message_id", sa.INTEGER(), autoincrement=False, nullable=True),
)
op.create_foreign_key(
"tool_call_message_id_fkey", "tool_call", "chat_message", ["message_id"], ["id"]
)
# Migrate data back
op.execute(
"UPDATE tool_call SET message_id = (SELECT id FROM chat_message WHERE chat_message.tool_call_id = tool_call.id)"
)
# Drop the new column
op.drop_constraint("fk_chat_message_tool_call", "chat_message", type_="foreignkey")
op.drop_column("chat_message", "tool_call_id")

View File

@@ -1,7 +1,7 @@
"""Added alternate model to chat message
Revision ID: ee3f4b47fad5
Revises: 2d2304e27d8c
Revises: 4a951134c801
Create Date: 2024-08-12 00:11:50.915845
"""
@@ -12,17 +12,17 @@ import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "ee3f4b47fad5"
down_revision = "2d2304e27d8c"
branch_labels: None = None
depends_on: None = None
down_revision = "4a951134c801"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.add_column(
"chat_message",
sa.Column("overridden_model", sa.String(length=255), nullable=True),
sa.Column("alternate_model", sa.String(length=255), nullable=True),
)
def downgrade() -> None:
op.drop_column("chat_message", "overridden_model")
op.drop_column("chat_message", "alternate_model")

View File

@@ -1,172 +0,0 @@
"""embedding provider by provider type
Revision ID: f17bf3b0d9f1
Revises: 351faebd379d
Create Date: 2024-08-21 13:13:31.120460
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "f17bf3b0d9f1"
down_revision = "351faebd379d"
branch_labels: None = None
depends_on: None = None
def upgrade() -> None:
# Add provider_type column to embedding_provider
op.add_column(
"embedding_provider",
sa.Column("provider_type", sa.String(50), nullable=True),
)
# Update provider_type with existing name values
op.execute("UPDATE embedding_provider SET provider_type = UPPER(name)")
# Make provider_type not nullable
op.alter_column("embedding_provider", "provider_type", nullable=False)
# Drop the foreign key constraint in embedding_model table
op.drop_constraint(
"fk_embedding_model_cloud_provider", "embedding_model", type_="foreignkey"
)
# Drop the existing primary key constraint
op.drop_constraint("embedding_provider_pkey", "embedding_provider", type_="primary")
# Create a new primary key constraint on provider_type
op.create_primary_key(
"embedding_provider_pkey", "embedding_provider", ["provider_type"]
)
# Add provider_type column to embedding_model
op.add_column(
"embedding_model",
sa.Column("provider_type", sa.String(50), nullable=True),
)
# Update provider_type for existing embedding models
op.execute(
"""
UPDATE embedding_model
SET provider_type = (
SELECT provider_type
FROM embedding_provider
WHERE embedding_provider.id = embedding_model.cloud_provider_id
)
"""
)
# Drop the old id column from embedding_provider
op.drop_column("embedding_provider", "id")
# Drop the name column from embedding_provider
op.drop_column("embedding_provider", "name")
# Drop the default_model_id column from embedding_provider
op.drop_column("embedding_provider", "default_model_id")
# Drop the old cloud_provider_id column from embedding_model
op.drop_column("embedding_model", "cloud_provider_id")
# Create the new foreign key constraint
op.create_foreign_key(
"fk_embedding_model_cloud_provider",
"embedding_model",
"embedding_provider",
["provider_type"],
["provider_type"],
)
def downgrade() -> None:
# Drop the foreign key constraint in embedding_model table
op.drop_constraint(
"fk_embedding_model_cloud_provider", "embedding_model", type_="foreignkey"
)
# Add back the cloud_provider_id column to embedding_model
op.add_column(
"embedding_model", sa.Column("cloud_provider_id", sa.Integer(), nullable=True)
)
op.add_column("embedding_provider", sa.Column("id", sa.Integer(), nullable=True))
# Assign incrementing IDs to embedding providers
op.execute(
"""
CREATE SEQUENCE IF NOT EXISTS embedding_provider_id_seq;"""
)
op.execute(
"""
UPDATE embedding_provider SET id = nextval('embedding_provider_id_seq');
"""
)
# Update cloud_provider_id based on provider_type
op.execute(
"""
UPDATE embedding_model
SET cloud_provider_id = CASE
WHEN provider_type IS NULL THEN NULL
ELSE (
SELECT id
FROM embedding_provider
WHERE embedding_provider.provider_type = embedding_model.provider_type
)
END
"""
)
# Drop the provider_type column from embedding_model
op.drop_column("embedding_model", "provider_type")
# Add back the columns to embedding_provider
op.add_column("embedding_provider", sa.Column("name", sa.String(50), nullable=True))
op.add_column(
"embedding_provider", sa.Column("default_model_id", sa.Integer(), nullable=True)
)
# Drop the existing primary key constraint on provider_type
op.drop_constraint("embedding_provider_pkey", "embedding_provider", type_="primary")
# Create the original primary key constraint on id
op.create_primary_key("embedding_provider_pkey", "embedding_provider", ["id"])
# Update name with existing provider_type values
op.execute(
"""
UPDATE embedding_provider
SET name = CASE
WHEN provider_type = 'OPENAI' THEN 'OpenAI'
WHEN provider_type = 'COHERE' THEN 'Cohere'
WHEN provider_type = 'GOOGLE' THEN 'Google'
WHEN provider_type = 'VOYAGE' THEN 'Voyage'
ELSE provider_type
END
"""
)
# Drop the provider_type column from embedding_provider
op.drop_column("embedding_provider", "provider_type")
# Recreate the foreign key constraint in embedding_model table
op.create_foreign_key(
"fk_embedding_model_cloud_provider",
"embedding_model",
"embedding_provider",
["cloud_provider_id"],
["id"],
)
# Recreate the foreign key constraint in embedding_model table
op.create_foreign_key(
"fk_embedding_provider_default_model",
"embedding_provider",
"embedding_model",
["default_model_id"],
["id"],
)
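For context: after this migration the provider's natural key is the provider_type string, so joins that previously went through cloud_provider_id key on provider_type instead. A hedged sketch of checking the new relationship (placeholder DSN; only columns created by this migration are referenced):

# Sketch only: list providers and how many embedding models point at each via the string key.
from sqlalchemy import create_engine, text

engine = create_engine("postgresql://user:pass@localhost/danswer")  # placeholder
with engine.connect() as conn:
    rows = conn.execute(
        text(
            "SELECT ep.provider_type, count(em.provider_type) AS num_models "
            "FROM embedding_provider ep "
            "LEFT JOIN embedding_model em ON em.provider_type = ep.provider_type "
            "GROUP BY ep.provider_type"
        )
    ).fetchall()
    for provider_type, num_models in rows:
        print(provider_type, num_models)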

View File

@@ -1,26 +0,0 @@
"""add has_web_login column to user
Revision ID: f7e58d357687
Revises: bceb1e139447
Create Date: 2024-09-07 20:20:54.522620
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "f7e58d357687"
down_revision = "ba98eba0f66a"
branch_labels: None = None
depends_on: None = None
def upgrade() -> None:
op.add_column(
"user",
sa.Column("has_web_login", sa.Boolean(), nullable=False, server_default="true"),
)
def downgrade() -> None:
op.drop_column("user", "has_web_login")

View File

@@ -3,49 +3,21 @@ from sqlalchemy.orm import Session
from danswer.access.models import DocumentAccess
from danswer.access.utils import prefix_user
from danswer.configs.constants import PUBLIC_DOC_PAT
from danswer.db.document import get_access_info_for_document
from danswer.db.document import get_access_info_for_documents
from danswer.db.document import get_acccess_info_for_documents
from danswer.db.models import User
from danswer.utils.variable_functionality import fetch_versioned_implementation
def _get_access_for_document(
document_id: str,
db_session: Session,
) -> DocumentAccess:
info = get_access_info_for_document(
db_session=db_session,
document_id=document_id,
)
if not info:
return DocumentAccess.build(user_ids=[], user_groups=[], is_public=False)
return DocumentAccess.build(user_ids=info[1], user_groups=[], is_public=info[2])
def get_access_for_document(
document_id: str,
db_session: Session,
) -> DocumentAccess:
versioned_get_access_for_document_fn = fetch_versioned_implementation(
"danswer.access.access", "_get_access_for_document"
)
return versioned_get_access_for_document_fn(document_id, db_session) # type: ignore
def _get_access_for_documents(
document_ids: list[str],
db_session: Session,
) -> dict[str, DocumentAccess]:
document_access_info = get_access_info_for_documents(
document_access_info = get_acccess_info_for_documents(
db_session=db_session,
document_ids=document_ids,
)
return {
document_id: DocumentAccess.build(
user_ids=user_ids, user_groups=[], is_public=is_public
)
document_id: DocumentAccess.build(user_ids, [], is_public)
for document_id, user_ids, is_public in document_access_info
}
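Side note: get_access_for_document resolves its implementation through fetch_versioned_implementation so the enterprise build can swap in its own access logic. A stripped-down sketch of that dispatch idea (the "ee." module prefix and fallback behavior are assumptions; the real helper lives in danswer.utils.variable_functionality and its internals are not shown in this diff):

# Simplified illustration of versioned-implementation dispatch; details assumed.
import importlib
from typing import Any

def fetch_impl(module_path: str, attr_name: str) -> Any:
    try:
        # prefer an enterprise override if that module is importable (assumed "ee." prefix)
        module = importlib.import_module("ee." + module_path)
    except ModuleNotFoundError:
        module = importlib.import_module(module_path)
    return getattr(module, attr_name)

# usage mirroring the diff above:
# get_access_fn = fetch_impl("danswer.access.access", "_get_access_for_document")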

View File

@@ -13,7 +13,7 @@ from danswer.server.manage.models import UserPreferences
def set_no_auth_user_preferences(
store: DynamicConfigStore, preferences: UserPreferences
) -> None:
store.store(KV_NO_AUTH_USER_PREFERENCES_KEY, preferences.model_dump())
store.store(KV_NO_AUTH_USER_PREFERENCES_KEY, preferences.dict())
def load_no_auth_user_preferences(store: DynamicConfigStore) -> UserPreferences:
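Note: the only change here is the Pydantic serialization call; .dict() is the v1 spelling and .model_dump() the v2 one. A throwaway illustration (not the real UserPreferences model):

# Throwaway model, shown only to illustrate the two serialization spellings.
from pydantic import BaseModel

class ExamplePrefs(BaseModel):
    chosen_assistants: list[int] = []

prefs = ExamplePrefs(chosen_assistants=[1, 2])
as_dict = prefs.model_dump()  # Pydantic v2
# as_dict = prefs.dict()      # Pydantic v1 spelling, deprecated under v2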

View File

@@ -5,20 +5,8 @@ from fastapi_users import schemas
class UserRole(str, Enum):
"""
User roles
- Basic can't perform any admin actions
- Admin can perform all admin actions
- Curator can perform admin actions for
groups they are curators of
- Global Curator can perform admin actions
for all groups they are a member of
"""
BASIC = "basic"
ADMIN = "admin"
CURATOR = "curator"
GLOBAL_CURATOR = "global_curator"
class UserStatus(str, Enum):
@@ -33,9 +21,7 @@ class UserRead(schemas.BaseUser[uuid.UUID]):
class UserCreate(schemas.BaseUserCreate):
role: UserRole = UserRole.BASIC
has_web_login: bool | None = True
class UserUpdate(schemas.BaseUserUpdate):
role: UserRole
has_web_login: bool | None = True

View File

@@ -8,17 +8,13 @@ from email.mime.text import MIMEText
from typing import Optional
from typing import Tuple
from email_validator import EmailNotValidError
from email_validator import validate_email
from fastapi import APIRouter
from fastapi import Depends
from fastapi import HTTPException
from fastapi import Request
from fastapi import Response
from fastapi import status
from fastapi.security import OAuth2PasswordRequestForm
from fastapi_users import BaseUserManager
from fastapi_users import exceptions
from fastapi_users import FastAPIUsers
from fastapi_users import models
from fastapi_users import schemas
@@ -35,7 +31,6 @@ from sqlalchemy.orm import Session
from danswer.auth.invited_users import get_invited_users
from danswer.auth.schemas import UserCreate
from danswer.auth.schemas import UserRole
from danswer.auth.schemas import UserUpdate
from danswer.configs.app_configs import AUTH_TYPE
from danswer.configs.app_configs import DISABLE_AUTH
from danswer.configs.app_configs import EMAIL_FROM
@@ -45,7 +40,6 @@ from danswer.configs.app_configs import SMTP_PASS
from danswer.configs.app_configs import SMTP_PORT
from danswer.configs.app_configs import SMTP_SERVER
from danswer.configs.app_configs import SMTP_USER
from danswer.configs.app_configs import TRACK_EXTERNAL_IDP_EXPIRY
from danswer.configs.app_configs import USER_AUTH_SECRET
from danswer.configs.app_configs import VALID_EMAIL_DOMAINS
from danswer.configs.app_configs import WEB_DOMAIN
@@ -65,7 +59,10 @@ from danswer.db.users import get_user_by_email
from danswer.utils.logger import setup_logger
from danswer.utils.telemetry import optional_telemetry
from danswer.utils.telemetry import RecordType
from danswer.utils.variable_functionality import fetch_versioned_implementation
from danswer.utils.variable_functionality import (
fetch_versioned_implementation,
)
logger = setup_logger()
@@ -84,7 +81,7 @@ def verify_auth_setting() -> None:
"User must choose a valid user authentication method: "
"disabled, basic, or google_oauth"
)
logger.notice(f"Using Auth Type: {AUTH_TYPE.value}")
logger.info(f"Using Auth Type: {AUTH_TYPE.value}")
def get_display_email(email: str | None, space_less: bool = False) -> str:
@@ -109,28 +106,8 @@ def user_needs_to_be_verified() -> bool:
def verify_email_is_invited(email: str) -> None:
whitelist = get_invited_users()
if not whitelist:
return
if not email:
raise PermissionError("Email must be specified")
email_info = validate_email(email) # can raise EmailNotValidError
for email_whitelist in whitelist:
try:
# normalized emails are now being inserted into the db
# we can remove this normalization on read after some time has passed
email_info_whitelist = validate_email(email_whitelist)
except EmailNotValidError:
continue
# oddly, normalization does not include lowercasing the user part of the
# email address ... which we want to allow
if email_info.normalized.lower() == email_info_whitelist.normalized.lower():
return
raise PermissionError("User not on allowed user whitelist")
if (whitelist and email not in whitelist) or not email:
raise PermissionError("User not on allowed user whitelist")
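For context: the newer whitelist check leans on email_validator normalization plus an explicit .lower() of the local part, since normalization lowercases the domain but preserves the case of the user portion. A quick illustration with made-up addresses:

# Made-up addresses; check_deliverability=False avoids a DNS lookup.
from email_validator import validate_email

a = validate_email("John.Doe@Example.COM", check_deliverability=False)
b = validate_email("john.doe@example.com", check_deliverability=False)

print(a.normalized)                                   # John.Doe@example.com
print(a.normalized == b.normalized)                   # False
print(a.normalized.lower() == b.normalized.lower())   # True, matches the check above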
def verify_email_in_whitelist(email: str) -> None:
@@ -187,7 +164,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
user_create: schemas.UC | UserCreate,
safe: bool = False,
request: Optional[Request] = None,
) -> User:
) -> models.UP:
verify_email_is_invited(user_create.email)
verify_email_domain(user_create.email)
if hasattr(user_create, "role"):
@@ -196,27 +173,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
user_create.role = UserRole.ADMIN
else:
user_create.role = UserRole.BASIC
user = None
try:
user = await super().create(user_create, safe=safe, request=request) # type: ignore
except exceptions.UserAlreadyExists:
user = await self.get_by_email(user_create.email)
# Handle case where user has used product outside of web and is now creating an account through web
if (
not user.has_web_login
and hasattr(user_create, "has_web_login")
and user_create.has_web_login
):
user_update = UserUpdate(
password=user_create.password,
has_web_login=True,
role=user_create.role,
is_verified=user_create.is_verified,
)
user = await self.update(user_update, user)
else:
raise exceptions.UserAlreadyExists()
return user
return await super().create(user_create, safe=safe, request=request) # type: ignore
async def oauth_callback(
self: "BaseUserManager[models.UOAP, models.ID]",
@@ -246,34 +203,18 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
is_verified_by_default=is_verified_by_default,
)
# NOTE: Most IdPs have very short expiry times, and we don't want to force the user to
# re-authenticate that frequently, so by default this is disabled
if expires_at and TRACK_EXTERNAL_IDP_EXPIRY:
# NOTE: google oauth expires after 1hr. We don't want to force the user to
# re-authenticate that frequently, so for now we'll just ignore this for
# google oauth users
if expires_at and AUTH_TYPE != AuthType.GOOGLE_OAUTH:
oidc_expiry = datetime.fromtimestamp(expires_at, tz=timezone.utc)
await self.user_db.update(user, update_dict={"oidc_expiry": oidc_expiry})
# this is needed if an organization goes from `TRACK_EXTERNAL_IDP_EXPIRY=true` to `false`
# otherwise, the oidc expiry will always be old, and the user will never be able to login
if user.oidc_expiry and not TRACK_EXTERNAL_IDP_EXPIRY:
await self.user_db.update(user, update_dict={"oidc_expiry": None})
# Handle case where user has used product outside of web and is now creating an account through web
if not user.has_web_login:
await self.user_db.update(
user,
update_dict={
"is_verified": is_verified_by_default,
"has_web_login": True,
},
)
user.is_verified = is_verified_by_default
user.has_web_login = True
return user
async def on_after_register(
self, user: User, request: Optional[Request] = None
) -> None:
logger.notice(f"User {user.id} has registered.")
logger.info(f"User {user.id} has registered.")
optional_telemetry(
record_type=RecordType.SIGN_UP,
data={"action": "create"},
@@ -283,35 +224,19 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
async def on_after_forgot_password(
self, user: User, token: str, request: Optional[Request] = None
) -> None:
logger.notice(f"User {user.id} has forgot their password. Reset token: {token}")
logger.info(f"User {user.id} has forgot their password. Reset token: {token}")
async def on_after_request_verify(
self, user: User, token: str, request: Optional[Request] = None
) -> None:
verify_email_domain(user.email)
logger.notice(
logger.info(
f"Verification requested for user {user.id}. Verification token: {token}"
)
send_user_verification_email(user.email, token)
async def authenticate(
self, credentials: OAuth2PasswordRequestForm
) -> Optional[User]:
user = await super().authenticate(credentials)
if user is None:
try:
user = await self.get_by_email(credentials.username)
if not user.has_web_login:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="NO_WEB_LOGIN_AND_HAS_NO_PASSWORD",
)
except exceptions.UserNotExists:
pass
return user
async def get_user_manager(
user_db: SQLAlchemyUserDatabase = Depends(get_user_db),
@@ -445,28 +370,6 @@ async def current_user(
return await double_check_user(user)
async def current_curator_or_admin_user(
user: User | None = Depends(current_user),
) -> User | None:
if DISABLE_AUTH:
return None
if not user or not hasattr(user, "role"):
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Access denied. User is not authenticated or lacks role information.",
)
allowed_roles = {UserRole.GLOBAL_CURATOR, UserRole.CURATOR, UserRole.ADMIN}
if user.role not in allowed_roles:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Access denied. User is not a curator or admin.",
)
return user
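Reviewer note: current_curator_or_admin_user is intended as a FastAPI dependency on curator/admin routes. A hypothetical usage sketch (the router, path, and import paths are illustrative assumptions, not taken from this diff):

# Hypothetical endpoint gated by the dependency defined above.
from fastapi import APIRouter, Depends

from danswer.auth.users import current_curator_or_admin_user  # assumed import path
from danswer.db.models import User

router = APIRouter(prefix="/manage")

@router.get("/example-curator-only")  # made-up path
def example_endpoint(
    user: User | None = Depends(current_curator_or_admin_user),
) -> dict[str, str]:
    # only admins, curators, and global curators (or auth-disabled deployments) get here
    return {"caller": user.email if user else "auth disabled"}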
async def current_admin_user(user: User | None = Depends(current_user)) -> User | None:
if DISABLE_AUTH:
return None
@@ -474,12 +377,7 @@ async def current_admin_user(user: User | None = Depends(current_user)) -> User
if not user or not hasattr(user, "role") or user.role != UserRole.ADMIN:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="Access denied. User must be an admin to perform this action.",
detail="Access denied. User is not an admin.",
)
return user
def get_default_admin_user_emails_() -> list[str]:
# No default seeding available for Danswer MIT
return []

View File

@@ -1,85 +1,54 @@
import json
from datetime import timedelta
from typing import Any
from typing import cast
import redis
from celery import Celery
from celery import signals
from celery import Task
from celery.contrib.abortable import AbortableTask # type: ignore
from celery.exceptions import SoftTimeLimitExceeded
from celery.exceptions import TaskRevokedError
from celery.signals import beat_init
from celery.signals import worker_init
from celery.states import READY_STATES
from celery.utils.log import get_task_logger
from redis import Redis
from sqlalchemy import inspect
from sqlalchemy import text
from celery import Celery # type: ignore
from sqlalchemy.orm import Session
from danswer.access.access import get_access_for_document
from danswer.background.celery.celery_redis import RedisConnectorCredentialPair
from danswer.background.celery.celery_redis import RedisDocumentSet
from danswer.background.celery.celery_redis import RedisUserGroup
from danswer.background.celery.celery_utils import extract_ids_from_runnable_connector
from danswer.background.celery.celery_utils import should_kick_off_deletion_of_cc_pair
from danswer.background.celery.celery_utils import should_prune_cc_pair
from danswer.background.celery.celery_utils import should_sync_doc_set
from danswer.background.connector_deletion import delete_connector_credential_pair
from danswer.background.connector_deletion import delete_connector_credential_pair_batch
from danswer.background.task_utils import build_celery_task_wrapper
from danswer.background.task_utils import name_cc_cleanup_task
from danswer.background.task_utils import name_cc_prune_task
from danswer.background.task_utils import name_document_set_sync_task
from danswer.configs.app_configs import JOB_TIMEOUT
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
from danswer.configs.constants import DanswerCeleryPriority
from danswer.configs.constants import DanswerRedisLocks
from danswer.configs.constants import POSTGRES_CELERY_BEAT_APP_NAME
from danswer.configs.constants import POSTGRES_CELERY_WORKER_APP_NAME
from danswer.configs.constants import PostgresAdvisoryLocks
from danswer.configs.constants import POSTGRES_CELERY_APP_NAME
from danswer.connectors.factory import instantiate_connector
from danswer.connectors.models import InputType
from danswer.db.connector_credential_pair import (
get_connector_credential_pair,
)
from danswer.db.connector_credential_pair import get_connector_credential_pair
from danswer.db.connector_credential_pair import get_connector_credential_pairs
from danswer.db.deletion_attempt import check_deletion_attempt_is_allowed
from danswer.db.document import count_documents_by_needs_sync
from danswer.db.document import get_document
from danswer.db.document import get_documents_for_connector_credential_pair
from danswer.db.document import mark_document_as_synced
from danswer.db.document import prepare_to_modify_documents
from danswer.db.document_set import delete_document_set
from danswer.db.document_set import fetch_document_set_for_document
from danswer.db.document_set import fetch_document_sets
from danswer.db.document_set import fetch_document_sets_for_documents
from danswer.db.document_set import fetch_documents_for_document_set_paginated
from danswer.db.document_set import get_document_set_by_id
from danswer.db.document_set import mark_document_set_as_synced
from danswer.db.engine import build_connection_string
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.engine import init_sqlalchemy_engine
from danswer.db.engine import SYNC_DB_API
from danswer.db.models import DocumentSet
from danswer.db.models import UserGroup
from danswer.document_index.document_index_utils import get_both_index_names
from danswer.document_index.factory import get_default_document_index
from danswer.document_index.interfaces import UpdateRequest
from danswer.redis.redis_pool import RedisPool
from danswer.utils.logger import setup_logger
from danswer.utils.variable_functionality import fetch_versioned_implementation
from danswer.utils.variable_functionality import (
fetch_versioned_implementation_with_fallback,
)
from danswer.utils.variable_functionality import noop_fallback
logger = setup_logger()
# use this within celery tasks to get celery task specific logging
task_logger = get_task_logger(__name__)
connection_string = build_connection_string(
db_api=SYNC_DB_API, app_name=POSTGRES_CELERY_APP_NAME
)
celery_broker_url = f"sqla+{connection_string}"
celery_backend_url = f"db+{connection_string}"
celery_app = Celery(__name__, broker=celery_broker_url, backend=celery_backend_url)
redis_pool = RedisPool()
celery_app = Celery(__name__)
celery_app.config_from_object(
"danswer.background.celery.celeryconfig"
) # Load configuration from 'celeryconfig.py'
_SYNC_BATCH_SIZE = 100
#####
@@ -128,10 +97,7 @@ def cleanup_connector_credential_pair_task(
cc_pair=cc_pair,
)
except Exception as e:
task_logger.exception(
f"Failed to run connector_deletion. "
f"connector_id={connector_id} credential_id={credential_id}"
)
logger.exception(f"Failed to run connector_deletion due to {e}")
raise e
@@ -150,9 +116,7 @@ def prune_documents_task(connector_id: int, credential_id: int) -> None:
)
if not cc_pair:
task_logger.warning(
f"ccpair not found for {connector_id} {credential_id}"
)
logger.warning(f"ccpair not found for {connector_id} {credential_id}")
return
runnable_connector = instantiate_connector(
@@ -184,12 +148,12 @@ def prune_documents_task(connector_id: int, credential_id: int) -> None:
)
if len(doc_ids_to_remove) == 0:
task_logger.info(
logger.info(
f"No docs to prune from {cc_pair.connector.source} connector"
)
return
task_logger.info(
logger.info(
f"pruning {len(doc_ids_to_remove)} doc(s) from {cc_pair.connector.source} connector"
)
delete_connector_credential_pair_batch(
@@ -199,201 +163,112 @@ def prune_documents_task(connector_id: int, credential_id: int) -> None:
document_index=document_index,
)
except Exception as e:
task_logger.exception(
f"Failed to run pruning for connector id {connector_id}."
logger.exception(
f"Failed to run pruning for connector id {connector_id} due to {e}"
)
raise e
def try_generate_stale_document_sync_tasks(
db_session: Session, r: Redis, lock_beat: redis.lock.Lock
) -> int | None:
# the fence is up, do nothing
if r.exists(RedisConnectorCredentialPair.get_fence_key()):
return None
@build_celery_task_wrapper(name_document_set_sync_task)
@celery_app.task(soft_time_limit=JOB_TIMEOUT)
def sync_document_set_task(document_set_id: int) -> None:
"""For document sets marked as not up to date, sync the state from postgres
into the datastore. Also handles deletions."""
r.delete(RedisConnectorCredentialPair.get_taskset_key()) # delete the taskset
def _sync_document_batch(document_ids: list[str], db_session: Session) -> None:
logger.debug(f"Syncing document sets for: {document_ids}")
# add tasks to celery and build up the task set to monitor in redis
stale_doc_count = count_documents_by_needs_sync(db_session)
if stale_doc_count == 0:
return None
# Acquires a lock on the documents so that no other process can modify them
with prepare_to_modify_documents(
db_session=db_session, document_ids=document_ids
):
# get current state of document sets for these documents
document_set_map = {
document_id: document_sets
for document_id, document_sets in fetch_document_sets_for_documents(
document_ids=document_ids, db_session=db_session
)
}
task_logger.info(
f"Stale documents found (at least {stale_doc_count}). Generating sync tasks by cc pair."
)
# update Vespa
curr_ind_name, sec_ind_name = get_both_index_names(db_session)
document_index = get_default_document_index(
primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name
)
update_requests = [
UpdateRequest(
document_ids=[document_id],
document_sets=set(document_set_map.get(document_id, [])),
)
for document_id in document_ids
]
document_index.update(update_requests=update_requests)
# rkuo: we could technically sync all stale docs in one big pass.
# but I feel it's more understandable to group the docs by cc_pair
total_tasks_generated = 0
cc_pairs = get_connector_credential_pairs(db_session)
for cc_pair in cc_pairs:
rc = RedisConnectorCredentialPair(cc_pair.id)
tasks_generated = rc.generate_tasks(celery_app, db_session, r, lock_beat)
with Session(get_sqlalchemy_engine()) as db_session:
try:
cursor = None
while True:
document_batch, cursor = fetch_documents_for_document_set_paginated(
document_set_id=document_set_id,
db_session=db_session,
current_only=False,
last_document_id=cursor,
limit=_SYNC_BATCH_SIZE,
)
_sync_document_batch(
document_ids=[document.id for document in document_batch],
db_session=db_session,
)
if cursor is None:
break
if tasks_generated is None:
continue
# if there are no connectors, then delete the document set. Otherwise, just
# mark it as successfully synced.
document_set = cast(
DocumentSet,
get_document_set_by_id(
db_session=db_session, document_set_id=document_set_id
),
) # casting since we "know" a document set with this ID exists
if not document_set.connector_credential_pairs:
delete_document_set(
document_set_row=document_set, db_session=db_session
)
logger.info(
f"Successfully deleted document set with ID: '{document_set_id}'!"
)
else:
mark_document_set_as_synced(
document_set_id=document_set_id, db_session=db_session
)
logger.info(f"Document set sync for '{document_set_id}' complete!")
if tasks_generated == 0:
continue
task_logger.info(
f"RedisConnector.generate_tasks finished. "
f"cc_pair_id={cc_pair.id} tasks_generated={tasks_generated}"
)
total_tasks_generated += tasks_generated
task_logger.info(
f"All per connector generate_tasks finished. total_tasks_generated={total_tasks_generated}"
)
r.set(RedisConnectorCredentialPair.get_fence_key(), total_tasks_generated)
return total_tasks_generated
def try_generate_document_set_sync_tasks(
document_set: DocumentSet, db_session: Session, r: Redis, lock_beat: redis.lock.Lock
) -> int | None:
lock_beat.reacquire()
rds = RedisDocumentSet(document_set.id)
# don't generate document set sync tasks if tasks are still pending
if r.exists(rds.fence_key):
return None
# don't generate sync tasks if we're up to date
if document_set.is_up_to_date:
return None
# add tasks to celery and build up the task set to monitor in redis
r.delete(rds.taskset_key)
task_logger.info(
f"RedisDocumentSet.generate_tasks starting. document_set_id={document_set.id}"
)
# Add all documents that need to be updated into the queue
tasks_generated = rds.generate_tasks(celery_app, db_session, r, lock_beat)
if tasks_generated is None:
return None
# Currently we are allowing the sync to proceed with 0 tasks.
# It's possible for sets/groups to be generated initially with no entries
# and they still need to be marked as up to date.
# if tasks_generated == 0:
# return 0
task_logger.info(
f"RedisDocumentSet.generate_tasks finished. "
f"document_set_id={document_set.id} tasks_generated={tasks_generated}"
)
# set this only after all tasks have been added
r.set(rds.fence_key, tasks_generated)
return tasks_generated
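For context: the generators above all follow the same fence/taskset convention: reset the taskset, add one member per queued task before sending it, and only then write the fence with the initial count so the monitor knows when everything has drained. A stripped-down sketch of that pattern (key names and the enqueue call are placeholders):

# Stripped-down sketch of the fence/taskset pattern; keys and tasks are placeholders.
import uuid
import redis

r = redis.Redis()

def generate_example_tasks(payloads: list[str]) -> int:
    if r.exists("example_fence_1"):           # fence up means a previous batch is still pending
        return 0
    r.delete("example_taskset_1")             # start from an empty taskset
    for _ in payloads:
        task_id = f"example_1_{uuid.uuid4()}"
        r.sadd("example_taskset_1", task_id)  # track BEFORE enqueueing
        # celery_app.send_task("some_task", task_id=task_id) would go here
    r.set("example_fence_1", len(payloads))   # fence written only after all tasks are queued
    return len(payloads)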
def try_generate_user_group_sync_tasks(
usergroup: UserGroup, db_session: Session, r: Redis, lock_beat: redis.lock.Lock
) -> int | None:
lock_beat.reacquire()
rug = RedisUserGroup(usergroup.id)
# don't generate sync tasks if tasks are still pending
if r.exists(rug.fence_key):
return None
if usergroup.is_up_to_date:
return None
# add tasks to celery and build up the task set to monitor in redis
r.delete(rug.taskset_key)
# Add all documents that need to be updated into the queue
task_logger.info(f"generate_tasks starting. usergroup_id={usergroup.id}")
tasks_generated = rug.generate_tasks(celery_app, db_session, r, lock_beat)
if tasks_generated is None:
return None
# Currently we are allowing the sync to proceed with 0 tasks.
# It's possible for sets/groups to be generated initially with no entries
# and they still need to be marked as up to date.
# if tasks_generated == 0:
# return 0
task_logger.info(
f"generate_tasks finished. "
f"usergroup_id={usergroup.id} tasks_generated={tasks_generated}"
)
# set this only after all tasks have been added
r.set(rug.fence_key, tasks_generated)
return tasks_generated
except Exception:
logger.exception("Failed to sync document set %s", document_set_id)
raise
#####
# Periodic Tasks
#####
@celery_app.task(
name="check_for_vespa_sync_task",
name="check_for_document_sets_sync_task",
soft_time_limit=JOB_TIMEOUT,
)
def check_for_vespa_sync_task() -> None:
"""Runs periodically to check if any document needs syncing.
Generates sets of tasks for Celery if syncing is needed."""
r = redis_pool.get_client()
lock_beat = r.lock(
DanswerRedisLocks.CHECK_VESPA_SYNC_BEAT_LOCK,
timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
)
try:
# these tasks should never overlap
if not lock_beat.acquire(blocking=False):
return
with Session(get_sqlalchemy_engine()) as db_session:
try_generate_stale_document_sync_tasks(db_session, r, lock_beat)
# check if any document sets are not synced
document_set_info = fetch_document_sets(
user_id=None, db_session=db_session, include_outdated=True
)
for document_set, _ in document_set_info:
try_generate_document_set_sync_tasks(
document_set, db_session, r, lock_beat
)
# check if any user groups are not synced
try:
fetch_user_groups = fetch_versioned_implementation(
"danswer.db.user_group", "fetch_user_groups"
)
user_groups = fetch_user_groups(
db_session=db_session, only_up_to_date=False
)
for usergroup in user_groups:
try_generate_user_group_sync_tasks(
usergroup, db_session, r, lock_beat
)
except ModuleNotFoundError:
# Always raises an exception on the MIT version, which is expected
pass
except SoftTimeLimitExceeded:
task_logger.info(
"Soft time limit exceeded, task is being terminated gracefully."
def check_for_document_sets_sync_task() -> None:
"""Runs periodically to check if any sync tasks should be run and adds them
to the queue"""
with Session(get_sqlalchemy_engine()) as db_session:
# check if any document sets are not synced
document_set_info = fetch_document_sets(
user_id=None, db_session=db_session, include_outdated=True
)
except Exception:
task_logger.exception("Unexpected exception")
finally:
if lock_beat.owned():
lock_beat.release()
for document_set, _ in document_set_info:
if should_sync_doc_set(document_set, db_session):
logger.info(f"Syncing the {document_set.name} document set")
sync_document_set_task.apply_async(
kwargs=dict(document_set_id=document_set.id),
)
@celery_app.task(
@@ -403,13 +278,11 @@ def check_for_vespa_sync_task() -> None:
def check_for_cc_pair_deletion_task() -> None:
"""Runs periodically to check if any deletion tasks should be run"""
with Session(get_sqlalchemy_engine()) as db_session:
# check if any cc pairs are up for deletion
# check if any document sets are not synced
cc_pairs = get_connector_credential_pairs(db_session)
for cc_pair in cc_pairs:
if should_kick_off_deletion_of_cc_pair(cc_pair, db_session):
task_logger.info(
f"Deleting the {cc_pair.name} connector credential pair"
)
logger.info(f"Deleting the {cc_pair.name} connector credential pair")
cleanup_connector_credential_pair_task.apply_async(
kwargs=dict(
connector_id=cc_pair.connector.id,
@@ -418,126 +291,6 @@ def check_for_cc_pair_deletion_task() -> None:
)
@celery_app.task(
name="kombu_message_cleanup_task",
soft_time_limit=JOB_TIMEOUT,
bind=True,
base=AbortableTask,
)
def kombu_message_cleanup_task(self: Any) -> int:
"""Runs periodically to clean up the kombu_message table"""
# we will select messages older than this amount to clean up
KOMBU_MESSAGE_CLEANUP_AGE = 7 # days
KOMBU_MESSAGE_CLEANUP_PAGE_LIMIT = 1000
ctx = {}
ctx["last_processed_id"] = 0
ctx["deleted"] = 0
ctx["cleanup_age"] = KOMBU_MESSAGE_CLEANUP_AGE
ctx["page_limit"] = KOMBU_MESSAGE_CLEANUP_PAGE_LIMIT
with Session(get_sqlalchemy_engine()) as db_session:
# Exit the task if we can't take the advisory lock
result = db_session.execute(
text("SELECT pg_try_advisory_lock(:id)"),
{"id": PostgresAdvisoryLocks.KOMBU_MESSAGE_CLEANUP_LOCK_ID.value},
).scalar()
if not result:
return 0
while True:
if self.is_aborted():
raise TaskRevokedError("kombu_message_cleanup_task was aborted.")
b = kombu_message_cleanup_task_helper(ctx, db_session)
if not b:
break
db_session.commit()
if ctx["deleted"] > 0:
task_logger.info(
f"Deleted {ctx['deleted']} orphaned messages from kombu_message."
)
return ctx["deleted"]
def kombu_message_cleanup_task_helper(ctx: dict, db_session: Session) -> bool:
"""
Helper function to clean up old messages from the `kombu_message` table that are no longer relevant.
This function retrieves messages from the `kombu_message` table that are no longer visible and
older than a specified interval. It checks if the corresponding task_id exists in the
`celery_taskmeta` table. If the task_id does not exist, the message is deleted.
Args:
ctx (dict): A context dictionary containing configuration parameters such as:
- 'cleanup_age' (int): The age in days after which messages are considered old.
- 'page_limit' (int): The maximum number of messages to process in one batch.
- 'last_processed_id' (int): The ID of the last processed message to handle pagination.
- 'deleted' (int): A counter to track the number of deleted messages.
db_session (Session): The SQLAlchemy database session for executing queries.
Returns:
bool: Returns True if there are more rows to process, False if not.
"""
inspector = inspect(db_session.bind)
if not inspector:
return False
# With the move to redis as celery's broker and backend, kombu tables may not even exist.
# We can fail silently.
if not inspector.has_table("kombu_message"):
return False
query = text(
"""
SELECT id, timestamp, payload
FROM kombu_message WHERE visible = 'false'
AND timestamp < CURRENT_TIMESTAMP - INTERVAL :interval_days
AND id > :last_processed_id
ORDER BY id
LIMIT :page_limit
"""
)
kombu_messages = db_session.execute(
query,
{
"interval_days": f"{ctx['cleanup_age']} days",
"page_limit": ctx["page_limit"],
"last_processed_id": ctx["last_processed_id"],
},
).fetchall()
if len(kombu_messages) == 0:
return False
for msg in kombu_messages:
payload = json.loads(msg[2])
task_id = payload["headers"]["id"]
# Check if task_id exists in celery_taskmeta
task_exists = db_session.execute(
text("SELECT 1 FROM celery_taskmeta WHERE task_id = :task_id"),
{"task_id": task_id},
).fetchone()
# If task_id does not exist, delete the message
if not task_exists:
result = db_session.execute(
text("DELETE FROM kombu_message WHERE id = :message_id"),
{"message_id": msg[0]},
)
if result.rowcount > 0: # type: ignore
ctx["deleted"] += 1
ctx["last_processed_id"] = msg[0]
return True
@celery_app.task(
name="check_for_prune_task",
soft_time_limit=JOB_TIMEOUT,
@@ -555,7 +308,7 @@ def check_for_prune_task() -> None:
credential=cc_pair.credential,
db_session=db_session,
):
task_logger.info(f"Pruning the {cc_pair.connector.name} connector")
logger.info(f"Pruning the {cc_pair.connector.name} connector")
prune_documents_task.apply_async(
kwargs=dict(
@@ -565,331 +318,19 @@ def check_for_prune_task() -> None:
)
@celery_app.task(
name="vespa_metadata_sync_task",
bind=True,
soft_time_limit=45,
time_limit=60,
max_retries=3,
)
def vespa_metadata_sync_task(self: Task, document_id: str) -> bool:
task_logger.info(f"document_id={document_id}")
try:
with Session(get_sqlalchemy_engine()) as db_session:
curr_ind_name, sec_ind_name = get_both_index_names(db_session)
document_index = get_default_document_index(
primary_index_name=curr_ind_name, secondary_index_name=sec_ind_name
)
doc = get_document(document_id, db_session)
if not doc:
return False
# document set sync
doc_sets = fetch_document_set_for_document(document_id, db_session)
update_doc_sets: set[str] = set(doc_sets)
# User group sync
doc_access = get_access_for_document(
document_id=document_id, db_session=db_session
)
update_request = UpdateRequest(
document_ids=[document_id],
document_sets=update_doc_sets,
access=doc_access,
boost=doc.boost,
hidden=doc.hidden,
)
# update Vespa
document_index.update(update_requests=[update_request])
# update db last. Worst case = we crash right before this and
# the sync might repeat again later
mark_document_as_synced(document_id, db_session)
except SoftTimeLimitExceeded:
task_logger.info(f"SoftTimeLimitExceeded exception. doc_id={document_id}")
except Exception as e:
task_logger.exception("Unexpected exception")
# Exponential backoff from 2^4 to 2^6 ... i.e. 16, 32, 64
countdown = 2 ** (self.request.retries + 4)
self.retry(exc=e, countdown=countdown)
return True
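Note: the retry countdown above grows as 2 ** (retries + 4), so with max_retries=3 the waits come out to 16, 32, and 64 seconds:

# Worked example of the backoff schedule used by vespa_metadata_sync_task.
waits = [2 ** (retries + 4) for retries in range(3)]
print(waits)  # [16, 32, 64]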
@signals.task_postrun.connect
def celery_task_postrun(
sender: Any | None = None,
task_id: str | None = None,
task: Task | None = None,
args: tuple | None = None,
kwargs: dict | None = None,
retval: Any | None = None,
state: str | None = None,
**kwds: Any,
) -> None:
"""We handle this signal in order to remove completed tasks
from their respective tasksets. This allows us to track the progress of document set
and user group syncs.
This function runs after any task completes (both success and failure)
Note that this signal does not fire on a task that failed to complete and is going
to be retried.
"""
if not task:
return
task_logger.debug(f"Task {task.name} (ID: {task_id}) completed with state: {state}")
# logger.debug(f"Result: {retval}")
if state not in READY_STATES:
return
if not task_id:
return
if task_id.startswith(RedisConnectorCredentialPair.PREFIX):
r = redis_pool.get_client()
r.srem(RedisConnectorCredentialPair.get_taskset_key(), task_id)
return
if task_id.startswith(RedisDocumentSet.PREFIX):
r = redis_pool.get_client()
document_set_id = RedisDocumentSet.get_id_from_task_id(task_id)
if document_set_id is not None:
rds = RedisDocumentSet(document_set_id)
r.srem(rds.taskset_key, task_id)
return
if task_id.startswith(RedisUserGroup.PREFIX):
r = redis_pool.get_client()
usergroup_id = RedisUserGroup.get_id_from_task_id(task_id)
if usergroup_id is not None:
rug = RedisUserGroup(usergroup_id)
r.srem(rug.taskset_key, task_id)
return
def monitor_connector_taskset(r: Redis) -> None:
fence_value = r.get(RedisConnectorCredentialPair.get_fence_key())
if fence_value is None:
return
try:
initial_count = int(cast(int, fence_value))
except ValueError:
task_logger.error("The value is not an integer.")
return
count = r.scard(RedisConnectorCredentialPair.get_taskset_key())
task_logger.info(f"Stale documents: remaining={count} initial={initial_count}")
if count == 0:
r.delete(RedisConnectorCredentialPair.get_taskset_key())
r.delete(RedisConnectorCredentialPair.get_fence_key())
task_logger.info(f"Successfully synced stale documents. count={initial_count}")
def monitor_document_set_taskset(
key_bytes: bytes, r: Redis, db_session: Session
) -> None:
fence_key = key_bytes.decode("utf-8")
document_set_id = RedisDocumentSet.get_id_from_fence_key(fence_key)
if document_set_id is None:
task_logger.warning("could not parse document set id from {key}")
return
rds = RedisDocumentSet(document_set_id)
fence_value = r.get(rds.fence_key)
if fence_value is None:
return
try:
initial_count = int(cast(int, fence_value))
except ValueError:
task_logger.error("The value is not an integer.")
return
count = cast(int, r.scard(rds.taskset_key))
task_logger.info(
f"document_set_id={document_set_id} remaining={count} initial={initial_count}"
)
if count > 0:
return
document_set = cast(
DocumentSet,
get_document_set_by_id(db_session=db_session, document_set_id=document_set_id),
) # casting since we "know" a document set with this ID exists
if document_set:
if not document_set.connector_credential_pairs:
# if there are no connectors, then delete the document set.
delete_document_set(document_set_row=document_set, db_session=db_session)
task_logger.info(
f"Successfully deleted document set with ID: '{document_set_id}'!"
)
else:
mark_document_set_as_synced(document_set_id, db_session)
task_logger.info(
f"Successfully synced document set with ID: '{document_set_id}'!"
)
r.delete(rds.taskset_key)
r.delete(rds.fence_key)
def monitor_usergroup_taskset(key_bytes: bytes, r: Redis, db_session: Session) -> None:
key = key_bytes.decode("utf-8")
usergroup_id = RedisUserGroup.get_id_from_fence_key(key)
if not usergroup_id:
task_logger.warning("Could not parse usergroup id from {key}")
return
rug = RedisUserGroup(usergroup_id)
fence_value = r.get(rug.fence_key)
if fence_value is None:
return
try:
initial_count = int(cast(int, fence_value))
except ValueError:
task_logger.error("The value is not an integer.")
return
count = cast(int, r.scard(rug.taskset_key))
task_logger.info(
f"usergroup_id={usergroup_id} remaining={count} initial={initial_count}"
)
if count > 0:
return
try:
fetch_user_group = fetch_versioned_implementation(
"danswer.db.user_group", "fetch_user_group"
)
except ModuleNotFoundError:
task_logger.exception(
"fetch_versioned_implementation failed to look up fetch_user_group."
)
return
user_group: UserGroup | None = fetch_user_group(
db_session=db_session, user_group_id=usergroup_id
)
if user_group:
if user_group.is_up_for_deletion:
delete_user_group = fetch_versioned_implementation_with_fallback(
"danswer.db.user_group", "delete_user_group", noop_fallback
)
delete_user_group(db_session=db_session, user_group=user_group)
task_logger.info(f" Deleted usergroup. id='{usergroup_id}'")
else:
mark_user_group_as_synced = fetch_versioned_implementation_with_fallback(
"danswer.db.user_group", "mark_user_group_as_synced", noop_fallback
)
mark_user_group_as_synced(db_session=db_session, user_group=user_group)
task_logger.info(f"Synced usergroup. id='{usergroup_id}'")
r.delete(rug.taskset_key)
r.delete(rug.fence_key)
@celery_app.task(name="monitor_vespa_sync", soft_time_limit=300)
def monitor_vespa_sync() -> None:
"""This is a celery beat task that monitors and finalizes metadata sync tasksets.
It scans for fence values and then gets the counts of any associated tasksets.
If the count is 0, that means all tasks finished and we should clean up.
This task lock timeout is CELERY_METADATA_SYNC_BEAT_LOCK_TIMEOUT seconds, so don't
do anything too expensive in this function!
"""
r = redis_pool.get_client()
lock_beat = r.lock(
DanswerRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK,
timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
)
try:
# prevent overlapping tasks
if not lock_beat.acquire(blocking=False):
return
with Session(get_sqlalchemy_engine()) as db_session:
if r.exists(RedisConnectorCredentialPair.get_fence_key()):
monitor_connector_taskset(r)
for key_bytes in r.scan_iter(RedisDocumentSet.FENCE_PREFIX + "*"):
monitor_document_set_taskset(key_bytes, r, db_session)
for key_bytes in r.scan_iter(RedisUserGroup.FENCE_PREFIX + "*"):
monitor_usergroup_taskset(key_bytes, r, db_session)
#
# r_celery = celery_app.broker_connection().channel().client
# length = celery_get_queue_length(DanswerCeleryQueues.VESPA_METADATA_SYNC, r_celery)
# task_logger.warning(f"queue={DanswerCeleryQueues.VESPA_METADATA_SYNC} length={length}")
except SoftTimeLimitExceeded:
task_logger.info(
"Soft time limit exceeded, task is being terminated gracefully."
)
finally:
if lock_beat.owned():
lock_beat.release()
@beat_init.connect
def on_beat_init(sender: Any, **kwargs: Any) -> None:
init_sqlalchemy_engine(POSTGRES_CELERY_BEAT_APP_NAME)
@worker_init.connect
def on_worker_init(sender: Any, **kwargs: Any) -> None:
init_sqlalchemy_engine(POSTGRES_CELERY_WORKER_APP_NAME)
# TODO(rkuo): this is singleton work that should be done on startup exactly once
# if we run multiple workers, we'll need to centralize where this cleanup happens
r = redis_pool.get_client()
r.delete(DanswerRedisLocks.CHECK_VESPA_SYNC_BEAT_LOCK)
r.delete(DanswerRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK)
r.delete(RedisConnectorCredentialPair.get_taskset_key())
r.delete(RedisConnectorCredentialPair.get_fence_key())
for key in r.scan_iter(RedisDocumentSet.TASKSET_PREFIX + "*"):
r.delete(key)
for key in r.scan_iter(RedisDocumentSet.FENCE_PREFIX + "*"):
r.delete(key)
for key in r.scan_iter(RedisUserGroup.TASKSET_PREFIX + "*"):
r.delete(key)
for key in r.scan_iter(RedisUserGroup.FENCE_PREFIX + "*"):
r.delete(key)
#####
# Celery Beat (Periodic Tasks) Settings
#####
celery_app.conf.beat_schedule = {
"check-for-vespa-sync": {
"task": "check_for_vespa_sync_task",
"check-for-document-set-sync": {
"task": "check_for_document_sets_sync_task",
"schedule": timedelta(seconds=5),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
"check-for-cc-pair-deletion": {
"task": "check_for_cc_pair_deletion_task",
# don't need to check too often, since we kick off a deletion initially
# during the API call that actually marks the CC pair for deletion
"schedule": timedelta(minutes=1),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
}
celery_app.conf.beat_schedule.update(
@@ -897,25 +338,6 @@ celery_app.conf.beat_schedule.update(
"check-for-prune": {
"task": "check_for_prune_task",
"schedule": timedelta(seconds=5),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
}
)
celery_app.conf.beat_schedule.update(
{
"kombu-message-cleanup": {
"task": "kombu_message_cleanup_task",
"schedule": timedelta(seconds=3600),
"options": {"priority": DanswerCeleryPriority.LOWEST},
},
}
)
celery_app.conf.beat_schedule.update(
{
"monitor-vespa-sync": {
"task": "monitor_vespa_sync",
"schedule": timedelta(seconds=5),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
}
)

View File

@@ -1,299 +0,0 @@
# These are helper objects for tracking the keys we need to write in redis
import time
from abc import ABC
from abc import abstractmethod
from typing import cast
from uuid import uuid4
import redis
from celery import Celery
from redis import Redis
from sqlalchemy.orm import Session
from danswer.background.celery.celeryconfig import CELERY_SEPARATOR
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
from danswer.configs.constants import DanswerCeleryPriority
from danswer.configs.constants import DanswerCeleryQueues
from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id
from danswer.db.document import (
construct_document_select_for_connector_credential_pair_by_needs_sync,
)
from danswer.db.document_set import construct_document_select_by_docset
from danswer.utils.variable_functionality import fetch_versioned_implementation
class RedisObjectHelper(ABC):
PREFIX = "base"
FENCE_PREFIX = PREFIX + "_fence"
TASKSET_PREFIX = PREFIX + "_taskset"
def __init__(self, id: int):
self._id: int = id
@property
def task_id_prefix(self) -> str:
return f"{self.PREFIX}_{self._id}"
@property
def fence_key(self) -> str:
# example: documentset_fence_1
return f"{self.FENCE_PREFIX}_{self._id}"
@property
def taskset_key(self) -> str:
# example: documentset_taskset_1
return f"{self.TASKSET_PREFIX}_{self._id}"
@staticmethod
def get_id_from_fence_key(key: str) -> int | None:
"""
Extracts the object ID from a fence key in the format `PREFIX_fence_X`.
Args:
key (str): The fence key string.
Returns:
Optional[int]: The extracted ID if the key is in the correct format, otherwise None.
"""
parts = key.split("_")
if len(parts) != 3:
return None
try:
object_id = int(parts[2])
except ValueError:
return None
return object_id
@staticmethod
def get_id_from_task_id(task_id: str) -> int | None:
"""
Extracts the object ID from a task ID string.
This method assumes the task ID is formatted as `prefix_objectid_suffix`, where:
- `prefix` is an arbitrary string (e.g., the name of the task or entity),
- `objectid` is the ID you want to extract,
- `suffix` is another arbitrary string (e.g., a UUID).
Example:
If the input `task_id` is `documentset_1_cbfdc96a-80ca-4312-a242-0bb68da3c1dc`,
this method will return the integer 1.
Args:
task_id (str): The task ID string from which to extract the object ID.
Returns:
int | None: The extracted object ID if the task ID is in the correct format, otherwise None.
"""
# example: task_id=documentset_1_cbfdc96a-80ca-4312-a242-0bb68da3c1dc
parts = task_id.split("_")
if len(parts) != 3:
return None
try:
object_id = int(parts[1])
except ValueError:
return None
return object_id
@abstractmethod
def generate_tasks(
self,
celery_app: Celery,
db_session: Session,
redis_client: Redis,
lock: redis.lock.Lock,
) -> int | None:
pass
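Quick usage illustration: given the key formats documented in the docstrings above, the two parsers behave like this:

# Usage illustration for the parsers defined on RedisObjectHelper above.
print(RedisObjectHelper.get_id_from_fence_key("documentset_fence_1"))  # 1
print(RedisObjectHelper.get_id_from_task_id(
    "documentset_1_cbfdc96a-80ca-4312-a242-0bb68da3c1dc"
))  # 1
print(RedisObjectHelper.get_id_from_fence_key("malformed-key"))  # None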
class RedisDocumentSet(RedisObjectHelper):
PREFIX = "documentset"
FENCE_PREFIX = PREFIX + "_fence"
TASKSET_PREFIX = PREFIX + "_taskset"
def generate_tasks(
self,
celery_app: Celery,
db_session: Session,
redis_client: Redis,
lock: redis.lock.Lock,
) -> int | None:
last_lock_time = time.monotonic()
async_results = []
stmt = construct_document_select_by_docset(self._id)
for doc in db_session.scalars(stmt).yield_per(1):
current_time = time.monotonic()
if current_time - last_lock_time >= (
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
):
lock.reacquire()
last_lock_time = current_time
# celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
# the actual redis key is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
# we prefix the task id so it's easier to keep track of who created the task
# aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac"
custom_task_id = f"{self.task_id_prefix}_{uuid4()}"
# add to the set BEFORE creating the task.
redis_client.sadd(self.taskset_key, custom_task_id)
result = celery_app.send_task(
"vespa_metadata_sync_task",
kwargs=dict(document_id=doc.id),
queue=DanswerCeleryQueues.VESPA_METADATA_SYNC,
task_id=custom_task_id,
priority=DanswerCeleryPriority.LOW,
)
async_results.append(result)
return len(async_results)
class RedisUserGroup(RedisObjectHelper):
PREFIX = "usergroup"
FENCE_PREFIX = PREFIX + "_fence"
TASKSET_PREFIX = PREFIX + "_taskset"
def generate_tasks(
self,
celery_app: Celery,
db_session: Session,
redis_client: Redis,
lock: redis.lock.Lock,
) -> int | None:
last_lock_time = time.monotonic()
async_results = []
try:
construct_document_select_by_usergroup = fetch_versioned_implementation(
"danswer.db.user_group",
"construct_document_select_by_usergroup",
)
except ModuleNotFoundError:
return 0
stmt = construct_document_select_by_usergroup(self._id)
for doc in db_session.scalars(stmt).yield_per(1):
current_time = time.monotonic()
if current_time - last_lock_time >= (
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
):
lock.reacquire()
last_lock_time = current_time
# celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
# the actual redis key is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
# we prefix the task id so it's easier to keep track of who created the task
# aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac"
custom_task_id = f"{self.task_id_prefix}_{uuid4()}"
# add to the set BEFORE creating the task.
redis_client.sadd(self.taskset_key, custom_task_id)
result = celery_app.send_task(
"vespa_metadata_sync_task",
kwargs=dict(document_id=doc.id),
queue=DanswerCeleryQueues.VESPA_METADATA_SYNC,
task_id=custom_task_id,
priority=DanswerCeleryPriority.LOW,
)
async_results.append(result)
return len(async_results)
class RedisConnectorCredentialPair(RedisObjectHelper):
PREFIX = "connectorsync"
FENCE_PREFIX = PREFIX + "_fence"
TASKSET_PREFIX = PREFIX + "_taskset"
@classmethod
def get_fence_key(cls) -> str:
return RedisConnectorCredentialPair.FENCE_PREFIX
@classmethod
def get_taskset_key(cls) -> str:
return RedisConnectorCredentialPair.TASKSET_PREFIX
@property
def taskset_key(self) -> str:
"""Notice that this is intentionally reusing the same taskset for all
connector syncs"""
# example: connector_taskset
return f"{self.TASKSET_PREFIX}"
def generate_tasks(
self,
celery_app: Celery,
db_session: Session,
redis_client: Redis,
lock: redis.lock.Lock,
) -> int | None:
last_lock_time = time.monotonic()
async_results = []
cc_pair = get_connector_credential_pair_from_id(self._id, db_session)
if not cc_pair:
return None
stmt = construct_document_select_for_connector_credential_pair_by_needs_sync(
cc_pair.connector_id, cc_pair.credential_id
)
for doc in db_session.scalars(stmt).yield_per(1):
current_time = time.monotonic()
if current_time - last_lock_time >= (
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
):
lock.reacquire()
last_lock_time = current_time
# celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
# the actual redis key is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
# we prefix the task id so it's easier to keep track of who created the task
# aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac"
custom_task_id = f"{self.task_id_prefix}_{uuid4()}"
# add to the tracking taskset in redis BEFORE creating the celery task.
# note that for the moment we are using a single taskset key, not differentiated by cc_pair id
redis_client.sadd(
RedisConnectorCredentialPair.get_taskset_key(), custom_task_id
)
# Priority on sync's triggered by new indexing should be medium
result = celery_app.send_task(
"vespa_metadata_sync_task",
kwargs=dict(document_id=doc.id),
queue=DanswerCeleryQueues.VESPA_METADATA_SYNC,
task_id=custom_task_id,
priority=DanswerCeleryPriority.MEDIUM,
)
async_results.append(result)
return len(async_results)
def celery_get_queue_length(queue: str, r: Redis) -> int:
"""This is a redis specific way to get the length of a celery queue.
It is priority aware and knows how to count across the multiple redis lists
used to implement task prioritization.
This operation is not atomic."""
total_length = 0
for i in range(len(DanswerCeleryPriority)):
queue_name = queue
if i > 0:
queue_name += CELERY_SEPARATOR
queue_name += str(i)
length = r.llen(queue_name)
total_length += cast(int, length)
return total_length
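For context: with the separator and priority-step configuration from celeryconfig, the loop above inspects one redis list per priority level. An illustration of the sub-queue names it checks (queue name and level count are made up):

# Illustration only: how the per-priority sub-queue names are formed,
# assuming CELERY_SEPARATOR == ":" and an arbitrary number of priority steps.
CELERY_SEPARATOR = ":"

def sub_queue_names(queue: str, num_priorities: int) -> list[str]:
    names = []
    for i in range(num_priorities):
        names.append(queue if i == 0 else f"{queue}{CELERY_SEPARATOR}{i}")
    return names

print(sub_queue_names("example_queue", 3))  # ['example_queue', 'example_queue:1', 'example_queue:2']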

View File

@@ -5,6 +5,7 @@ from sqlalchemy.orm import Session
from danswer.background.task_utils import name_cc_cleanup_task
from danswer.background.task_utils import name_cc_prune_task
from danswer.background.task_utils import name_document_set_sync_task
from danswer.configs.app_configs import ALLOW_SIMULTANEOUS_PRUNING
from danswer.configs.app_configs import MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
@@ -21,6 +22,7 @@ from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.models import Connector
from danswer.db.models import ConnectorCredentialPair
from danswer.db.models import Credential
from danswer.db.models import DocumentSet
from danswer.db.models import TaskQueueState
from danswer.db.tasks import check_task_is_live_and_not_timed_out
from danswer.db.tasks import get_latest_task
@@ -31,7 +33,7 @@ from danswer.utils.logger import setup_logger
logger = setup_logger()
def _get_deletion_status(
def get_deletion_status(
connector_id: int, credential_id: int, db_session: Session
) -> TaskQueueState | None:
cleanup_task_name = name_cc_cleanup_task(
@@ -43,7 +45,7 @@ def _get_deletion_status(
def get_deletion_attempt_snapshot(
connector_id: int, credential_id: int, db_session: Session
) -> DeletionAttemptSnapshot | None:
deletion_task = _get_deletion_status(connector_id, credential_id, db_session)
deletion_task = get_deletion_status(connector_id, credential_id, db_session)
if not deletion_task:
return None
@@ -63,7 +65,7 @@ def should_kick_off_deletion_of_cc_pair(
if check_deletion_attempt_is_allowed(cc_pair, db_session):
return False
deletion_task = _get_deletion_status(
deletion_task = get_deletion_status(
connector_id=cc_pair.connector_id,
credential_id=cc_pair.credential_id,
db_session=db_session,
@@ -79,6 +81,21 @@ def should_kick_off_deletion_of_cc_pair(
return True
def should_sync_doc_set(document_set: DocumentSet, db_session: Session) -> bool:
if document_set.is_up_to_date:
return False
task_name = name_document_set_sync_task(document_set.id)
latest_sync = get_latest_task(task_name, db_session)
if latest_sync and check_task_is_live_and_not_timed_out(latest_sync, db_session):
logger.info(f"Document set '{document_set.id}' is already syncing. Skipping.")
return False
logger.info(f"Document set {document_set.id} syncing now!")
return True
def should_prune_cc_pair(
connector: Connector, credential: Credential, db_session: Session
) -> bool:

View File

@@ -1,35 +0,0 @@
# docs: https://docs.celeryq.dev/en/stable/userguide/configuration.html
from danswer.configs.app_configs import REDIS_DB_NUMBER_CELERY
from danswer.configs.app_configs import REDIS_HOST
from danswer.configs.app_configs import REDIS_PASSWORD
from danswer.configs.app_configs import REDIS_PORT
from danswer.configs.constants import DanswerCeleryPriority
CELERY_SEPARATOR = ":"
CELERY_PASSWORD_PART = ""
if REDIS_PASSWORD:
CELERY_PASSWORD_PART = f":{REDIS_PASSWORD}@"
# example celery_broker_url: "redis://:password@localhost:6379/15"
broker_url = (
f"redis://{CELERY_PASSWORD_PART}{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB_NUMBER_CELERY}"
)
result_backend = (
f"redis://{CELERY_PASSWORD_PART}{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB_NUMBER_CELERY}"
)
# NOTE: prefetch 4 is significantly faster than prefetch 1
# however, prefetching is bad when tasks are lengthy as those tasks
# can stall other tasks.
worker_prefetch_multiplier = 4
broker_transport_options = {
"priority_steps": list(range(len(DanswerCeleryPriority))),
"sep": CELERY_SEPARATOR,
"queue_order_strategy": "priority",
}
task_default_priority = DanswerCeleryPriority.MEDIUM
task_acks_late = True
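Quick check: with sample values the assembled broker URL matches the commented example above (all values below are placeholders):

# Placeholders only, mirroring how broker_url is assembled in this config.
REDIS_PASSWORD = "password"
REDIS_HOST = "localhost"
REDIS_PORT = 6379
REDIS_DB_NUMBER_CELERY = 15

password_part = f":{REDIS_PASSWORD}@" if REDIS_PASSWORD else ""
broker_url = f"redis://{password_part}{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB_NUMBER_CELERY}"
print(broker_url)  # redis://:password@localhost:6379/15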

View File

@@ -151,7 +151,8 @@ def delete_connector_credential_pair(
# index attempts
delete_index_attempts(
db_session=db_session,
cc_pair_id=cc_pair.id,
connector_id=connector_id,
credential_id=credential_id,
)
# document sets
@@ -184,11 +185,11 @@ def delete_connector_credential_pair(
connector_id=connector_id,
)
if not connector or not len(connector.credentials):
logger.info("Found no credentials left for connector, deleting connector")
logger.debug("Found no credentials left for connector, deleting connector")
db_session.delete(connector)
db_session.commit()
logger.notice(
logger.info(
"Successfully deleted connector_credential_pair with connector_id:"
f" '{connector_id}' and credential_id: '{credential_id}'. Deleted {num_docs_deleted} docs."
)

View File

@@ -11,9 +11,12 @@ from danswer.background.indexing.tracer import DanswerTracer
from danswer.configs.app_configs import INDEXING_SIZE_WARNING_THRESHOLD
from danswer.configs.app_configs import INDEXING_TRACER_INTERVAL
from danswer.configs.app_configs import POLL_CONNECTOR_OFFSET
from danswer.connectors.connector_runner import ConnectorRunner
from danswer.connectors.factory import instantiate_connector
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.models import IndexAttemptMetadata
from danswer.connectors.models import InputType
from danswer.db.connector_credential_pair import get_last_successful_attempt_time
from danswer.db.connector_credential_pair import update_connector_credential_pair
from danswer.db.engine import get_sqlalchemy_engine
@@ -21,7 +24,6 @@ from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.index_attempt import get_index_attempt
from danswer.db.index_attempt import mark_attempt_failed
from danswer.db.index_attempt import mark_attempt_in_progress
from danswer.db.index_attempt import mark_attempt_partially_succeeded
from danswer.db.index_attempt import mark_attempt_succeeded
from danswer.db.index_attempt import update_docs_indexed
from danswer.db.models import IndexAttempt
@@ -39,12 +41,12 @@ logger = setup_logger()
INDEXING_TRACER_NUM_PRINT_ENTRIES = 5
def _get_connector_runner(
def _get_document_generator(
db_session: Session,
attempt: IndexAttempt,
start_time: datetime,
end_time: datetime,
) -> ConnectorRunner:
) -> GenerateDocumentsOutput:
"""
NOTE: `start_time` and `end_time` are only used for poll connectors
@@ -74,9 +76,31 @@ def _get_connector_runner(
)
raise e
return ConnectorRunner(
connector=runnable_connector, time_range=(start_time, end_time)
)
if task == InputType.LOAD_STATE:
assert isinstance(runnable_connector, LoadConnector)
doc_batch_generator = runnable_connector.load_from_state()
elif task == InputType.POLL:
assert isinstance(runnable_connector, PollConnector)
if (
attempt.connector_credential_pair.connector_id is None
or attempt.connector_credential_pair.credential_id is None
):
raise ValueError(
f"Polling attempt {attempt.id} is missing connector_id or credential_id, "
f"can't fetch time range."
)
logger.info(f"Polling for updates between {start_time} and {end_time}")
doc_batch_generator = runnable_connector.poll_source(
start=start_time.timestamp(), end=end_time.timestamp()
)
else:
# Event types cannot be handled by a background type
raise RuntimeError(f"Invalid task type: {task}")
return doc_batch_generator
def _run_indexing(
@@ -90,62 +114,55 @@ def _run_indexing(
"""
start_time = time.time()
search_settings = index_attempt.search_settings
index_name = search_settings.index_name
db_embedding_model = index_attempt.embedding_model
index_name = db_embedding_model.index_name
# Only update cc-pair status for primary index jobs
# Secondary index syncs at the end when swapping
is_primary = search_settings.status == IndexModelStatus.PRESENT
is_primary = index_attempt.embedding_model.status == IndexModelStatus.PRESENT
# Indexing is only done into one index at a time
document_index = get_default_document_index(
primary_index_name=index_name, secondary_index_name=None
)
embedding_model = DefaultIndexingEmbedder.from_db_search_settings(
search_settings=search_settings
embedding_model = DefaultIndexingEmbedder.from_db_embedding_model(
db_embedding_model
)
indexing_pipeline = build_indexing_pipeline(
attempt_id=index_attempt.id,
embedder=embedding_model,
document_index=document_index,
ignore_time_skip=index_attempt.from_beginning
or (search_settings.status == IndexModelStatus.FUTURE),
or (db_embedding_model.status == IndexModelStatus.FUTURE),
db_session=db_session,
)
db_cc_pair = index_attempt.connector_credential_pair
db_connector = index_attempt.connector_credential_pair.connector
db_credential = index_attempt.connector_credential_pair.credential
earliest_index_time = (
db_connector.indexing_start.timestamp() if db_connector.indexing_start else 0
)
last_successful_index_time = (
earliest_index_time
if index_attempt.from_beginning
else get_last_successful_attempt_time(
connector_id=db_connector.id,
credential_id=db_credential.id,
earliest_index=earliest_index_time,
search_settings=index_attempt.search_settings,
db_session=db_session,
db_connector.indexing_start.timestamp()
if index_attempt.from_beginning and db_connector.indexing_start is not None
else (
0.0
if index_attempt.from_beginning
else get_last_successful_attempt_time(
connector_id=db_connector.id,
credential_id=db_credential.id,
embedding_model=index_attempt.embedding_model,
db_session=db_session,
)
)
)
if INDEXING_TRACER_INTERVAL > 0:
logger.debug(f"Memory tracer starting: interval={INDEXING_TRACER_INTERVAL}")
logger.info(f"Memory tracer starting: interval={INDEXING_TRACER_INTERVAL}")
tracer = DanswerTracer()
tracer.start()
tracer.snap()
index_attempt_md = IndexAttemptMetadata(
connector_id=db_connector.id,
credential_id=db_credential.id,
)
batch_num = 0
net_doc_change = 0
document_count = 0
chunk_count = 0
@@ -164,7 +181,7 @@ def _run_indexing(
datetime(1970, 1, 1, tzinfo=timezone.utc),
)
connector_runner = _get_connector_runner(
doc_batch_generator = _get_document_generator(
db_session=db_session,
attempt=index_attempt,
start_time=window_start,
@@ -176,19 +193,15 @@ def _run_indexing(
tracer_counter = 0
if INDEXING_TRACER_INTERVAL > 0:
tracer.snap()
for doc_batch in connector_runner.run():
for doc_batch in doc_batch_generator:
# Check if connector is disabled mid run and stop if so unless it's the secondary
# index being built. We want to populate it even for paused connectors.
# Often paused connectors are sources that aren't updated frequently but the
# contents still need to be initially pulled.
db_session.refresh(db_connector)
if (
(
db_cc_pair.status == ConnectorCredentialPairStatus.PAUSED
and search_settings.status != IndexModelStatus.FUTURE
)
# if it's deleting, we don't care if this is a secondary index
or db_cc_pair.status == ConnectorCredentialPairStatus.DELETING
db_cc_pair.status == ConnectorCredentialPairStatus.PAUSED
and db_embedding_model.status != IndexModelStatus.FUTURE
):
# let the `except` block handle this
raise RuntimeError("Connector was disabled mid run")
@@ -215,13 +228,13 @@ def _run_indexing(
logger.debug(f"Indexing batch of documents: {batch_description}")
index_attempt_md.batch_num = batch_num + 1 # use 1-index for this
new_docs, total_batch_chunks = indexing_pipeline(
document_batch=doc_batch,
index_attempt_metadata=index_attempt_md,
index_attempt_metadata=IndexAttemptMetadata(
connector_id=db_connector.id,
credential_id=db_credential.id,
),
)
batch_num += 1
net_doc_change += new_docs
chunk_count += total_batch_chunks
document_count += len(doc_batch)
@@ -248,7 +261,7 @@ def _run_indexing(
INDEXING_TRACER_INTERVAL > 0
and tracer_counter % INDEXING_TRACER_INTERVAL == 0
):
logger.debug(
logger.info(
f"Running trace comparison for batch {tracer_counter}. interval={INDEXING_TRACER_INTERVAL}"
)
tracer.snap()
@@ -264,7 +277,7 @@ def _run_indexing(
run_dt=run_end_dt,
)
except Exception as e:
logger.exception(
logger.info(
f"Connector run ran into exception after elapsed time: {time.time() - start_time} seconds"
)
# Only mark the attempt as a complete failure if this is the first indexing window.
@@ -276,7 +289,7 @@ def _run_indexing(
# to give better clarity in the UI, as the next run will never happen.
if (
ind == 0
or not db_cc_pair.status.is_active()
or db_cc_pair.status == ConnectorCredentialPairStatus.PAUSED
or index_attempt.status != IndexingStatus.IN_PROGRESS
):
mark_attempt_failed(
@@ -302,52 +315,15 @@ def _run_indexing(
break
if INDEXING_TRACER_INTERVAL > 0:
logger.debug(
logger.info(
f"Running trace comparison between start and end of indexing. {tracer_counter} batches processed."
)
tracer.snap()
tracer.log_first_diff(INDEXING_TRACER_NUM_PRINT_ENTRIES)
tracer.stop()
logger.debug("Memory tracer stopped.")
if (
index_attempt_md.num_exceptions > 0
and index_attempt_md.num_exceptions >= batch_num
):
mark_attempt_failed(
index_attempt,
db_session,
failure_reason="All batches exceptioned.",
)
if is_primary:
update_connector_credential_pair(
db_session=db_session,
connector_id=index_attempt.connector_credential_pair.connector.id,
credential_id=index_attempt.connector_credential_pair.credential.id,
)
raise Exception(
f"Connector failed - All batches exceptioned: batches={batch_num}"
)
elapsed_time = time.time() - start_time
if index_attempt_md.num_exceptions == 0:
mark_attempt_succeeded(index_attempt, db_session)
logger.info(
f"Connector succeeded: "
f"docs={document_count} chunks={chunk_count} elapsed={elapsed_time:.2f}s"
)
else:
mark_attempt_partially_succeeded(index_attempt, db_session)
logger.info(
f"Connector completed with some errors: "
f"exceptions={index_attempt_md.num_exceptions} "
f"batches={batch_num} "
f"docs={document_count} "
f"chunks={chunk_count} "
f"elapsed={elapsed_time:.2f}s"
)
logger.info("Memory tracer stopped.")
mark_attempt_succeeded(index_attempt, db_session)
if is_primary:
update_connector_credential_pair(
db_session=db_session,
@@ -356,6 +332,11 @@ def _run_indexing(
run_dt=run_end_dt,
)
elapsed_time = time.time() - start_time
logger.info(
f"Connector succeeded: docs={document_count} chunks={chunk_count} elapsed={elapsed_time:.2f}s"
)
def _prepare_index_attempt(db_session: Session, index_attempt_id: int) -> IndexAttempt:
# make sure that the index attempt can't change in between checking the
@@ -384,22 +365,17 @@ def _prepare_index_attempt(db_session: Session, index_attempt_id: int) -> IndexA
return attempt
def run_indexing_entrypoint(
index_attempt_id: int, connector_credential_pair_id: int, is_ee: bool = False
) -> None:
def run_indexing_entrypoint(index_attempt_id: int, is_ee: bool = False) -> None:
"""Entrypoint for indexing run when using dask distributed.
Wraps the actual logic in a `try` block so that we can catch any exceptions
and mark the attempt as failed."""
try:
if is_ee:
global_version.set_ee()
# set the indexing attempt ID so that all log messages from this process
# will have it added as a prefix
IndexAttemptSingleton.set_cc_and_index_id(
index_attempt_id, connector_credential_pair_id
)
IndexAttemptSingleton.set_index_attempt_id(index_attempt_id)
with Session(get_sqlalchemy_engine()) as db_session:
# make sure that it is valid to run this indexing attempt + mark it

View File

@@ -48,9 +48,9 @@ class DanswerTracer:
stats = self.snapshot.statistics("traceback")
for s in stats[:numEntries]:
logger.debug(f"Tracer snap: {s}")
logger.info(f"Tracer snap: {s}")
for line in s.traceback:
logger.debug(f"* {line}")
logger.info(f"* {line}")
@staticmethod
def log_diff(
@@ -60,9 +60,9 @@ class DanswerTracer:
) -> None:
stats = snap_current.compare_to(snap_previous, "traceback")
for s in stats[:numEntries]:
logger.debug(f"Tracer diff: {s}")
logger.info(f"Tracer diff: {s}")
for line in s.traceback.format():
logger.debug(f"* {line}")
logger.info(f"* {line}")
def log_previous_diff(self, numEntries: int) -> None:
if not self.snapshot or not self.snapshot_prev:

View File

@@ -93,16 +93,9 @@ def build_apply_async_wrapper(build_name_fn: Callable[..., str]) -> Callable[[AA
kwargs_for_build_name = kwargs or {}
task_name = build_name_fn(*args_for_build_name, **kwargs_for_build_name)
with Session(get_sqlalchemy_engine()) as db_session:
# register_task must come before fn = apply_async or else the task
# might run mark_task_start (and crash) before the task row exists
db_task = register_task(task_name, db_session)
# mark the task as started
task = fn(args, kwargs, *other_args, **other_kwargs)
# we update the celery task id for diagnostic purposes
# but it isn't currently used by any code
db_task.task_id = task.id
db_session.commit()
register_task(task.id, task_name, db_session)
return task

View File

@@ -17,13 +17,15 @@ from danswer.configs.app_configs import DASK_JOB_CLIENT_ENABLED
from danswer.configs.app_configs import DISABLE_INDEX_UPDATE_ON_SWAP
from danswer.configs.app_configs import NUM_INDEXING_WORKERS
from danswer.configs.app_configs import NUM_SECONDARY_INDEXING_WORKERS
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import POSTGRES_INDEXER_APP_NAME
from danswer.db.connector import fetch_connectors
from danswer.db.connector_credential_pair import fetch_connector_credential_pairs
from danswer.db.embedding_model import get_current_db_embedding_model
from danswer.db.embedding_model import get_secondary_db_embedding_model
from danswer.db.engine import get_db_current_time
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.engine import init_sqlalchemy_engine
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.index_attempt import create_index_attempt
from danswer.db.index_attempt import get_index_attempt
from danswer.db.index_attempt import get_inprogress_index_attempts
@@ -31,14 +33,11 @@ from danswer.db.index_attempt import get_last_attempt_for_cc_pair
from danswer.db.index_attempt import get_not_started_index_attempts
from danswer.db.index_attempt import mark_attempt_failed
from danswer.db.models import ConnectorCredentialPair
from danswer.db.models import EmbeddingModel
from danswer.db.models import IndexAttempt
from danswer.db.models import IndexingStatus
from danswer.db.models import IndexModelStatus
from danswer.db.models import SearchSettings
from danswer.db.search_settings import get_current_search_settings
from danswer.db.search_settings import get_secondary_search_settings
from danswer.db.swap_index import check_index_swap
from danswer.natural_language_processing.search_nlp_models import EmbeddingModel
from danswer.natural_language_processing.search_nlp_models import warm_up_bi_encoder
from danswer.utils.logger import setup_logger
from danswer.utils.variable_functionality import global_version
@@ -61,27 +60,20 @@ _UNEXPECTED_STATE_FAILURE_REASON = (
def _should_create_new_indexing(
cc_pair: ConnectorCredentialPair,
last_index: IndexAttempt | None,
search_settings_instance: SearchSettings,
model: EmbeddingModel,
secondary_index_building: bool,
db_session: Session,
) -> bool:
connector = cc_pair.connector
# don't kick off indexing for `NOT_APPLICABLE` sources
if connector.source == DocumentSource.NOT_APPLICABLE:
return False
# User can still manually create single indexing attempts via the UI for the
# currently in use index
if DISABLE_INDEX_UPDATE_ON_SWAP:
if (
search_settings_instance.status == IndexModelStatus.PRESENT
and secondary_index_building
):
if model.status == IndexModelStatus.PRESENT and secondary_index_building:
return False
# When switching over models, always index at least once
if search_settings_instance.status == IndexModelStatus.FUTURE:
if model.status == IndexModelStatus.FUTURE:
if last_index:
# No new index if the last index attempt succeeded
# Once is enough. The model will never be able to swap otherwise.
@@ -103,7 +95,7 @@ def _should_create_new_indexing(
# If the connector is paused or is the ingestion API, don't index
# NOTE: during an embedding model switch over, the following logic
# is bypassed by the above check for a future model
if not cc_pair.status.is_active() or connector.id == 0:
if cc_pair.status == ConnectorCredentialPairStatus.PAUSED or connector.id == 0:
return False
if not last_index:
@@ -128,6 +120,16 @@ def _should_create_new_indexing(
return time_since_index.total_seconds() >= connector.refresh_freq
def _is_indexing_job_marked_as_finished(index_attempt: IndexAttempt | None) -> bool:
if index_attempt is None:
return False
return (
index_attempt.status == IndexingStatus.FAILED
or index_attempt.status == IndexingStatus.SUCCESS
)
def _mark_run_failed(
db_session: Session, index_attempt: IndexAttempt, failure_reason: str
) -> None:
@@ -168,42 +170,35 @@ def create_indexing_jobs(existing_jobs: dict[int, Future | SimpleJob]) -> None:
ongoing.add(
(
attempt.connector_credential_pair_id,
attempt.search_settings_id,
attempt.embedding_model_id,
)
)
# Get the primary search settings
primary_search_settings = get_current_search_settings(db_session)
search_settings = [primary_search_settings]
# Check for secondary search settings
secondary_search_settings = get_secondary_search_settings(db_session)
if secondary_search_settings is not None:
# If secondary settings exist, add them to the list
search_settings.append(secondary_search_settings)
embedding_models = [get_current_db_embedding_model(db_session)]
secondary_embedding_model = get_secondary_db_embedding_model(db_session)
if secondary_embedding_model is not None:
embedding_models.append(secondary_embedding_model)
all_connector_credential_pairs = fetch_connector_credential_pairs(db_session)
for cc_pair in all_connector_credential_pairs:
for search_settings_instance in search_settings:
for model in embedding_models:
# Check if there is an ongoing indexing attempt for this connector credential pair
if (cc_pair.id, search_settings_instance.id) in ongoing:
if (cc_pair.id, model.id) in ongoing:
continue
last_attempt = get_last_attempt_for_cc_pair(
cc_pair.id, search_settings_instance.id, db_session
cc_pair.id, model.id, db_session
)
if not _should_create_new_indexing(
cc_pair=cc_pair,
last_index=last_attempt,
search_settings_instance=search_settings_instance,
secondary_index_building=len(search_settings) > 1,
model=model,
secondary_index_building=len(embedding_models) > 1,
db_session=db_session,
):
continue
create_index_attempt(
cc_pair.id, search_settings_instance.id, db_session
)
create_index_attempt(cc_pair.id, model.id, db_session)
def cleanup_indexing_jobs(
@@ -220,12 +215,10 @@ def cleanup_indexing_jobs(
)
# do nothing for ongoing jobs that haven't been stopped
if not job.done():
if not index_attempt:
continue
if not index_attempt.is_finished():
continue
if not job.done() and not _is_indexing_job_marked_as_finished(
index_attempt
):
continue
if job.status == "error":
logger.error(job.exception())
@@ -300,7 +293,7 @@ def kickoff_indexing_jobs(
# get_not_started_index_attempts orders its returned results from oldest to newest
# we must process attempts in a FIFO manner to prevent connector starvation
new_indexing_attempts = [
(attempt, attempt.search_settings)
(attempt, attempt.embedding_model)
for attempt in get_not_started_index_attempts(db_session)
if attempt.id not in existing_jobs
]
@@ -312,10 +305,10 @@ def kickoff_indexing_jobs(
indexing_attempt_count = 0
for attempt, search_settings in new_indexing_attempts:
for attempt, embedding_model in new_indexing_attempts:
use_secondary_index = (
search_settings.status == IndexModelStatus.FUTURE
if search_settings is not None
embedding_model.status == IndexModelStatus.FUTURE
if embedding_model is not None
else False
)
if attempt.connector_credential_pair.connector is None:
@@ -341,7 +334,6 @@ def kickoff_indexing_jobs(
run = secondary_client.submit(
run_indexing_entrypoint,
attempt.id,
attempt.connector_credential_pair_id,
global_version.get_is_ee_version(),
pure=False,
)
@@ -349,7 +341,6 @@ def kickoff_indexing_jobs(
run = client.submit(
run_indexing_entrypoint,
attempt.id,
attempt.connector_credential_pair_id,
global_version.get_is_ee_version(),
pure=False,
)
@@ -390,21 +381,17 @@ def update_loop(
engine = get_sqlalchemy_engine()
with Session(engine) as db_session:
check_index_swap(db_session=db_session)
search_settings = get_current_search_settings(db_session)
db_embedding_model = get_current_db_embedding_model(db_session)
# So that first-time users aren't surprised by the really slow speed of the first
# batch of documents indexed
if search_settings.provider_type is None:
logger.notice("Running a first inference to warm up embedding model")
embedding_model = EmbeddingModel.from_db_model(
search_settings=search_settings,
server_host=INDEXING_MODEL_SERVER_HOST,
server_port=MODEL_SERVER_PORT,
)
if db_embedding_model.cloud_provider_id is None:
logger.debug("Running a first inference to warm up embedding model")
warm_up_bi_encoder(
embedding_model=embedding_model,
embedding_model=db_embedding_model,
model_server_host=INDEXING_MODEL_SERVER_HOST,
model_server_port=MODEL_SERVER_PORT,
)
client_primary: Client | SimpleJobClient
@@ -467,7 +454,7 @@ def update__main() -> None:
set_is_ee_based_on_env_variable()
init_sqlalchemy_engine(POSTGRES_INDEXER_APP_NAME)
logger.notice("Starting indexing service")
logger.info("Starting indexing service")
update_loop()

View File

@@ -36,8 +36,7 @@ def create_chat_chain(
chat_session_id: int,
db_session: Session,
prefetch_tool_calls: bool = True,
# Optional id at which we finish processing
stop_at_message_id: int | None = None,
parent_id: int | None = None,
) -> tuple[ChatMessage, list[ChatMessage]]:
"""Build the linear chain of messages without including the root message"""
mainline_messages: list[ChatMessage] = []
@@ -63,12 +62,7 @@ def create_chat_chain(
current_message: ChatMessage | None = root_message
while current_message is not None:
child_msg = current_message.latest_child_message
# Break if at the end of the chain
# or have reached the `final_id` of the submitted message
if not child_msg or (
stop_at_message_id and current_message.id == stop_at_message_id
):
if not child_msg or (parent_id and current_message.id == parent_id):
break
current_message = id_to_msg.get(child_msg)

View File

@@ -1,6 +1,5 @@
from collections.abc import Iterator
from datetime import datetime
from enum import Enum
from typing import Any
from pydantic import BaseModel
@@ -10,7 +9,6 @@ from danswer.search.enums import QueryFlow
from danswer.search.enums import SearchType
from danswer.search.models import RetrievalDocs
from danswer.search.models import SearchResponse
from danswer.tools.custom.base_tool_types import ToolResultType
class LlmDoc(BaseModel):
@@ -36,29 +34,14 @@ class QADocsResponse(RetrievalDocs):
applied_time_cutoff: datetime | None
recency_bias_multiplier: float
def model_dump(self, *args: list, **kwargs: dict[str, Any]) -> dict[str, Any]: # type: ignore
initial_dict = super().model_dump(mode="json", *args, **kwargs) # type: ignore
def dict(self, *args: list, **kwargs: dict[str, Any]) -> dict[str, Any]: # type: ignore
initial_dict = super().dict(*args, **kwargs) # type: ignore
initial_dict["applied_time_cutoff"] = (
self.applied_time_cutoff.isoformat() if self.applied_time_cutoff else None
)
return initial_dict
class StreamStopReason(Enum):
CONTEXT_LENGTH = "context_length"
CANCELLED = "cancelled"
class StreamStopInfo(BaseModel):
stop_reason: StreamStopReason
def model_dump(self, *args: list, **kwargs: dict[str, Any]) -> dict[str, Any]: # type: ignore
data = super().model_dump(mode="json", *args, **kwargs) # type: ignore
data["stop_reason"] = self.stop_reason.name
return data
class LLMRelevanceFilterResponse(BaseModel):
relevant_chunk_indices: list[int]
@@ -81,6 +64,10 @@ class DocumentRelevance(BaseModel):
relevance_summaries: dict[str, RelevanceAnalysis]
class Delimiter(BaseModel):
delimiter: bool
class DanswerAnswerPiece(BaseModel):
# A small piece of a complete answer. Used for streaming back answers.
answer_piece: str | None # if None, specifies the end of an Answer
@@ -147,7 +134,7 @@ class ImageGenerationDisplay(BaseModel):
class CustomToolResponse(BaseModel):
response: ToolResultType
response: dict
tool_name: str
@@ -159,7 +146,7 @@ AnswerQuestionPossibleReturn = (
| ImageGenerationDisplay
| CustomToolResponse
| StreamingError
| StreamStopInfo
| Delimiter
)

View File

@@ -1,4 +1,3 @@
import traceback
from collections.abc import Callable
from collections.abc import Iterator
from functools import partial
@@ -10,6 +9,7 @@ from danswer.chat.chat_utils import create_chat_chain
from danswer.chat.models import CitationInfo
from danswer.chat.models import CustomToolResponse
from danswer.chat.models import DanswerAnswerPiece
from danswer.chat.models import Delimiter
from danswer.chat.models import ImageGenerationDisplay
from danswer.chat.models import LLMRelevanceFilterResponse
from danswer.chat.models import MessageResponseIDInfo
@@ -32,13 +32,13 @@ from danswer.db.chat import get_or_create_root_message
from danswer.db.chat import reserve_message_id
from danswer.db.chat import translate_db_message_to_chat_message_detail
from danswer.db.chat import translate_db_search_doc_to_server_search_doc
from danswer.db.embedding_model import get_current_db_embedding_model
from danswer.db.engine import get_session_context_manager
from danswer.db.llm import fetch_existing_llm_providers
from danswer.db.models import SearchDoc as DbSearchDoc
from danswer.db.models import ToolCall
from danswer.db.models import User
from danswer.db.persona import get_persona_by_id
from danswer.db.search_settings import get_current_search_settings
from danswer.document_index.factory import get_default_document_index
from danswer.file_store.models import ChatFileType
from danswer.file_store.models import FileDescriptor
@@ -91,7 +91,7 @@ from danswer.tools.search.search_tool import SearchTool
from danswer.tools.search.search_tool import SECTION_RELEVANCE_LIST_ID
from danswer.tools.tool import Tool
from danswer.tools.tool import ToolResponse
from danswer.tools.tool_runner import ToolCallFinalResult
from danswer.tools.tool_runner import ToolCallMetadata
from danswer.tools.utils import compute_all_tool_tokens
from danswer.tools.utils import explicit_tool_calling_supported
from danswer.utils.logger import setup_logger
@@ -245,6 +245,7 @@ ChatPacket = (
| ImageGenerationDisplay
| CustomToolResponse
| MessageResponseIDInfo
| Delimiter
)
ChatPacketStream = Iterator[ChatPacket]
@@ -270,11 +271,6 @@ def stream_chat_message_objects(
3. [always] A set of streamed LLM tokens or an error anywhere along the line if something fails
4. [always] Details on the final AI response message that is created
"""
# Currently surrounding context is not supported for chat
# Chat is already token heavy and harder for the model to process, plus it would roll history over much faster
new_msg_req.chunks_above = 0
new_msg_req.chunks_below = 0
try:
user_id = user.id if user is not None else None
@@ -331,9 +327,9 @@ def stream_chat_message_objects(
Callable[[str], list[int]], llm_tokenizer.encode
)
search_settings = get_current_search_settings(db_session)
embedding_model = get_current_db_embedding_model(db_session)
document_index = get_default_document_index(
primary_index_name=search_settings.index_name, secondary_index_name=None
primary_index_name=embedding_model.index_name, secondary_index_name=None
)
# Every chat Session begins with an empty root message
@@ -354,7 +350,7 @@ def stream_chat_message_objects(
if new_msg_req.regenerate:
final_msg, history_msgs = create_chat_chain(
stop_at_message_id=parent_id,
parent_id=parent_id,
chat_session_id=chat_session_id,
db_session=db_session,
)
@@ -465,6 +461,8 @@ def stream_chat_message_objects(
else default_num_chunks
),
max_window_percentage=max_document_percentage,
use_sections=new_msg_req.chunks_above > 0
or new_msg_req.chunks_below > 0,
)
reserved_message_id = reserve_message_id(
db_session=db_session,
@@ -479,17 +477,16 @@ def stream_chat_message_objects(
reserved_assistant_message_id=reserved_message_id,
)
overridden_model = (
alternate_model = (
new_msg_req.llm_override.model_version if new_msg_req.llm_override else None
)
# Cannot determine these without the LLM step or breaking out early
partial_response = partial(
create_new_chat_message,
chat_session_id=chat_session_id,
parent_message=final_msg,
prompt_id=prompt_id,
overridden_model=overridden_model,
alternate_model=alternate_model,
# message=,
# rephrased_query=,
# token_count=,
@@ -613,6 +610,7 @@ def stream_chat_message_objects(
document_pruning_config.using_tool_message = explicit_tool_calling_supported(
llm_provider, llm_model_name
)
tool_has_been_called = False # TODO remove
# LLM prompt building, response capturing, etc.
answer = Answer(
@@ -653,6 +651,8 @@ def stream_chat_message_objects(
for packet in answer.processed_streamed_output:
if isinstance(packet, ToolResponse):
tool_has_been_called = True
if packet.id == SEARCH_RESPONSE_SUMMARY_ID:
(
qa_docs_response,
@@ -723,76 +723,137 @@ def stream_chat_message_objects(
)
else:
if isinstance(packet, ToolCallFinalResult):
tool_result = packet
yield cast(ChatPacket, packet)
if isinstance(packet, Delimiter):
db_citations = None
if reference_db_search_docs:
db_citations = translate_citations(
citations_list=answer.citations,
db_docs=reference_db_search_docs,
)
# Saving Gen AI answer and responding with message info
tool_name_to_tool_id: dict[str, int] = {}
for tool_id, tool_list in tool_dict.items():
for tool in tool_list:
tool_name_to_tool_id[tool.name] = tool_id
if tool_result is None:
tool_call = None
else:
tool_call = ToolCall(
tool_id=tool_name_to_tool_id[tool_result.tool_name],
tool_name=tool_result.tool_name,
tool_arguments=tool_result.tool_args,
tool_result=tool_result.tool_result,
)
gen_ai_response_message = partial_response(
message=answer.llm_answer,
rephrased_query=(
qa_docs_response.rephrased_query
if qa_docs_response
else None
),
reference_docs=reference_db_search_docs,
files=ai_message_files,
token_count=len(llm_tokenizer_encode_func(answer.llm_answer)),
citations=db_citations,
error=None,
tool_call=tool_call,
)
db_session.commit() # actually save user / assistant message
msg_detail_response = translate_db_message_to_chat_message_detail(
gen_ai_response_message
)
yield msg_detail_response
yield Delimiter(delimiter=True)
partial_response = partial(
create_new_chat_message,
chat_session_id=chat_session_id,
parent_message=gen_ai_response_message,
prompt_id=prompt_id,
# message=,
# rephrased_query=,
# token_count=,
message_type=MessageType.ASSISTANT,
alternate_assistant_id=new_msg_req.alternate_assistant_id,
# error=,
# reference_docs=,
db_session=db_session,
commit=False,
)
else:
if isinstance(packet, ToolCallMetadata):
tool_result = packet
yield cast(ChatPacket, packet)
logger.debug("Reached end of stream")
except Exception as e:
error_msg = str(e)
logger.exception(f"Failed to process chat message: {error_msg}")
stack_trace = traceback.format_exc()
client_error_msg = litellm_exception_to_error_msg(e, llm)
if llm.config.api_key and len(llm.config.api_key) > 2:
error_msg = error_msg.replace(llm.config.api_key, "[REDACTED_API_KEY]")
stack_trace = stack_trace.replace(llm.config.api_key, "[REDACTED_API_KEY]")
yield StreamingError(error=client_error_msg, stack_trace=stack_trace)
yield StreamingError(error=client_error_msg, stack_trace=error_msg)
db_session.rollback()
return
# Post-LLM answer processing
try:
db_citations = None
if reference_db_search_docs:
db_citations = translate_citations(
citations_list=answer.citations,
db_docs=reference_db_search_docs,
)
if not tool_has_been_called:
try:
db_citations = None
if reference_db_search_docs:
db_citations = translate_citations(
citations_list=answer.citations,
db_docs=reference_db_search_docs,
)
# Saving Gen AI answer and responding with message info
tool_name_to_tool_id: dict[str, int] = {}
for tool_id, tool_list in tool_dict.items():
for tool in tool_list:
tool_name_to_tool_id[tool.name] = tool_id
# Saving Gen AI answer and responding with message info
tool_name_to_tool_id = {}
for tool_id, tool_list in tool_dict.items():
for tool in tool_list:
tool_name_to_tool_id[tool.name] = tool_id
gen_ai_response_message = partial_response(
reserved_message_id=reserved_message_id,
message=answer.llm_answer,
rephrased_query=(
qa_docs_response.rephrased_query if qa_docs_response else None
),
reference_docs=reference_db_search_docs,
files=ai_message_files,
token_count=len(llm_tokenizer_encode_func(answer.llm_answer)),
citations=db_citations,
error=None,
tool_calls=[
ToolCall(
gen_ai_response_message = partial_response(
reserved_message_id=reserved_message_id,
message=answer.llm_answer,
rephrased_query=(
qa_docs_response.rephrased_query if qa_docs_response else None
),
reference_docs=reference_db_search_docs,
files=ai_message_files,
token_count=len(llm_tokenizer_encode_func(answer.llm_answer)),
citations=db_citations,
error=None,
tool_call=ToolCall(
tool_id=tool_name_to_tool_id[tool_result.tool_name],
tool_name=tool_result.tool_name,
tool_arguments=tool_result.tool_args,
tool_result=tool_result.tool_result,
)
]
if tool_result
else [],
)
if tool_result
else None,
)
logger.debug("Committing messages")
db_session.commit() # actually save user / assistant message
logger.debug("Committing messages")
db_session.commit() # actually save user / assistant message
msg_detail_response = translate_db_message_to_chat_message_detail(
gen_ai_response_message
)
msg_detail_response = translate_db_message_to_chat_message_detail(
gen_ai_response_message
)
yield msg_detail_response
except Exception as e:
error_msg = str(e)
logger.exception(error_msg)
yield msg_detail_response
except Exception as e:
error_msg = str(e)
logger.exception(error_msg)
# Frontend will erase whatever answer and show this instead
yield StreamingError(error="Failed to parse LLM output")
# Frontend will erase whatever answer and show this instead
yield StreamingError(error="Failed to parse LLM output")
@log_generator_function_time()
@@ -813,4 +874,4 @@ def stream_chat_message(
is_connected=is_connected,
)
for obj in objects:
yield get_json_line(obj.model_dump())
yield get_json_line(obj.dict())

View File

@@ -42,7 +42,8 @@ prompts:
task: >
Generate an image based on the user's description.
Provide a detailed description of the generated image, including key elements, colors, and composition.
Provide a detailed description of the generated image, including key elements, colors, and composition.
If the request is not possible or appropriate, explain why and suggest alternatives.
datetime_aware: true

View File

@@ -1,4 +1,4 @@
from typing_extensions import TypedDict # noreorder
from typing import TypedDict
from pydantic import BaseModel

View File

@@ -93,14 +93,6 @@ SMTP_USER = os.environ.get("SMTP_USER", "your-email@gmail.com")
SMTP_PASS = os.environ.get("SMTP_PASS", "your-gmail-password")
EMAIL_FROM = os.environ.get("EMAIL_FROM") or SMTP_USER
# If set, Danswer will listen to the `expires_at` returned by the identity
# provider (e.g. Okta, Google, etc.) and force the user to re-authenticate
# after this time has elapsed. Disabled since by default many auth providers
# have very short expiry times (e.g. 1 hour) which provide a poor user experience
TRACK_EXTERNAL_IDP_EXPIRY = (
os.environ.get("TRACK_EXTERNAL_IDP_EXPIRY", "").lower() == "true"
)
#####
# DB Configs
@@ -149,16 +141,6 @@ try:
except ValueError:
POSTGRES_POOL_RECYCLE = POSTGRES_POOL_RECYCLE_DEFAULT
REDIS_HOST = os.environ.get("REDIS_HOST") or "localhost"
REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379))
REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD") or ""
# Used for general redis things
REDIS_DB_NUMBER = int(os.environ.get("REDIS_DB_NUMBER", 0))
# Used by celery as broker and backend
REDIS_DB_NUMBER_CELERY = int(os.environ.get("REDIS_DB_NUMBER_CELERY", 15))
#####
# Connector Configs
#####
@@ -210,8 +192,8 @@ CONFLUENCE_CONNECTOR_LABELS_TO_SKIP = [
]
# Avoid getting archived pages
CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES = (
os.environ.get("CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES", "").lower() == "true"
CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES = (
os.environ.get("CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES", "").lower() == "true"
)
# Save page labels as Danswer metadata tags
@@ -222,12 +204,7 @@ CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING = (
# Attachments exceeding this size will not be retrieved (in bytes)
CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD = int(
os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD", 10 * 1024 * 1024)
)
# Attachments with more chars than this will not be indexed. This is to prevent extremely
# large files from freezing indexing. 200,000 is ~100 google doc pages.
CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD = int(
os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD", 200_000)
os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD", 50 * 1024 * 1024)
)
JIRA_CONNECTOR_LABELS_TO_SKIP = [
@@ -318,10 +295,6 @@ INDEXING_SIZE_WARNING_THRESHOLD = int(
# 0 disables this behavior and is the default.
INDEXING_TRACER_INTERVAL = int(os.environ.get("INDEXING_TRACER_INTERVAL", 0))
# During an indexing attempt, specifies the number of batches that are allowed to
# raise exceptions without aborting the attempt.
INDEXING_EXCEPTION_LIMIT = int(os.environ.get("INDEXING_EXCEPTION_LIMIT", 0))
#####
# Miscellaneous
#####
@@ -348,10 +321,6 @@ LOG_VESPA_TIMING_INFORMATION = (
os.environ.get("LOG_VESPA_TIMING_INFORMATION", "").lower() == "true"
)
LOG_ENDPOINT_LATENCY = os.environ.get("LOG_ENDPOINT_LATENCY", "").lower() == "true"
LOG_POSTGRES_LATENCY = os.environ.get("LOG_POSTGRES_LATENCY", "").lower() == "true"
LOG_POSTGRES_CONN_COUNTS = (
os.environ.get("LOG_POSTGRES_CONN_COUNTS", "").lower() == "true"
)
# Anonymous usage telemetry
DISABLE_TELEMETRY = os.environ.get("DISABLE_TELEMETRY", "").lower() == "true"

View File

@@ -31,9 +31,8 @@ FAVOR_RECENT_DECAY_MULTIPLIER = 2.0
DISABLE_LLM_QUERY_ANSWERABILITY = QA_PROMPT_OVERRIDE == "weak"
# For the highest matching base size chunk, how many chunks above and below do we pull in by default
# Note this is not in any of the deployment configs yet
# Currently only applies to the search flow, not chat
CONTEXT_CHUNKS_ABOVE = int(os.environ.get("CONTEXT_CHUNKS_ABOVE") or 1)
CONTEXT_CHUNKS_BELOW = int(os.environ.get("CONTEXT_CHUNKS_BELOW") or 1)
CONTEXT_CHUNKS_ABOVE = int(os.environ.get("CONTEXT_CHUNKS_ABOVE") or 0)
CONTEXT_CHUNKS_BELOW = int(os.environ.get("CONTEXT_CHUNKS_BELOW") or 0)
# Whether the LLM should be used to decide if a search would help given the chat history
DISABLE_LLM_CHOOSE_SEARCH = (
os.environ.get("DISABLE_LLM_CHOOSE_SEARCH", "").lower() == "true"
@@ -45,7 +44,7 @@ DISABLE_LLM_QUERY_REPHRASE = (
QUOTE_ALLOWED_ERROR_PERCENT = 0.05
QA_TIMEOUT = int(os.environ.get("QA_TIMEOUT") or "60") # 60 seconds
# Weighting factor between Vector and Keyword Search, 1 for completely vector search
HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.5)))
HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.62)))
HYBRID_ALPHA_KEYWORD = max(
0, min(1, float(os.environ.get("HYBRID_ALPHA_KEYWORD") or 0.4))
)
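For intuition only — a minimal sketch (not code from this repo, where the actual combination presumably happens inside the document index's ranking) of how a weighting factor like `HYBRID_ALPHA` is conventionally applied:

```python
# Sketch of a conventional hybrid-search combination: alpha = 1.0 means pure
# vector similarity, alpha = 0.0 means pure keyword (BM25-style) scoring.
def hybrid_score(vector_score: float, keyword_score: float, alpha: float = 0.62) -> float:
    alpha = max(0.0, min(1.0, alpha))  # clamp to [0, 1], mirroring the config above
    return alpha * vector_score + (1.0 - alpha) * keyword_score
```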
@@ -54,7 +53,7 @@ HYBRID_ALPHA_KEYWORD = max(
# Content. This is to avoid cases where the Content is very relevant but it may not be clear
# if the title is separated out. Title is more of a "boost" than a separate field.
TITLE_CONTENT_RATIO = max(
0, min(1, float(os.environ.get("TITLE_CONTENT_RATIO") or 0.10))
0, min(1, float(os.environ.get("TITLE_CONTENT_RATIO") or 0.20))
)
# A list of languages passed to the LLM to rephrase the query
@@ -88,6 +87,3 @@ HARD_DELETE_CHATS = False
# Internet Search
BING_API_KEY = os.environ.get("BING_API_KEY") or None
# Enable in-house model for detecting connector-based filtering in queries
ENABLE_CONNECTOR_CLASSIFIER = os.environ.get("ENABLE_CONNECTOR_CLASSIFIER", False)

View File

@@ -1,4 +1,3 @@
from enum import auto
from enum import Enum
SOURCE_TYPE = "source_type"
@@ -13,6 +12,10 @@ ID_SEPARATOR = ":;:"
DEFAULT_BOOST = 0
SESSION_KEY = "session"
# For tool calling
MAXIMUM_TOOL_CALL_SEQUENCE = 5
# For chunking/processing chunks
RETURN_SEPARATOR = "\n\r\n"
SECTION_SEPARATOR = "\n\n"
@@ -57,12 +60,9 @@ KV_SLACK_BOT_TOKENS_CONFIG_KEY = "slack_bot_tokens_config_key"
KV_GEN_AI_KEY_CHECK_TIME = "genai_api_key_last_check_time"
KV_SETTINGS_KEY = "danswer_settings"
KV_CUSTOMER_UUID_KEY = "customer_uuid"
KV_INSTANCE_DOMAIN_KEY = "instance_domain"
KV_ENTERPRISE_SETTINGS_KEY = "danswer_enterprise_settings"
KV_CUSTOM_ANALYTICS_SCRIPT_KEY = "__custom_analytics_script__"
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT = 60
class DocumentSource(str, Enum):
# Special case, document passed in via Danswer APIs without specifying a source type
@@ -165,27 +165,3 @@ class FileOrigin(str, Enum):
CONNECTOR = "connector"
GENERATED_REPORT = "generated_report"
OTHER = "other"
class PostgresAdvisoryLocks(Enum):
KOMBU_MESSAGE_CLEANUP_LOCK_ID = auto()
class DanswerCeleryQueues:
VESPA_DOCSET_SYNC_GENERATOR = "vespa_docset_sync_generator"
VESPA_USERGROUP_SYNC_GENERATOR = "vespa_usergroup_sync_generator"
VESPA_METADATA_SYNC = "vespa_metadata_sync"
CONNECTOR_DELETION = "connector_deletion"
class DanswerRedisLocks:
CHECK_VESPA_SYNC_BEAT_LOCK = "da_lock:check_vespa_sync_beat"
MONITOR_VESPA_SYNC_BEAT_LOCK = "da_lock:monitor_vespa_sync_beat"
class DanswerCeleryPriority(int, Enum):
HIGHEST = 0
HIGH = auto()
MEDIUM = auto()
LOW = auto()
LOWEST = auto()

View File

@@ -73,15 +73,3 @@ DANSWER_BOT_FEEDBACK_REMINDER = int(
DANSWER_BOT_REPHRASE_MESSAGE = (
os.environ.get("DANSWER_BOT_REPHRASE_MESSAGE", "").lower() == "true"
)
# DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD is the number of
# responses DanswerBot can send in a given time period.
# Set to 0 to disable the limit.
DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD = int(
os.environ.get("DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD", "5000")
)
# DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS is the number
# of seconds until the response limit is reset.
DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS = int(
os.environ.get("DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS", "86400")
)

View File

@@ -51,23 +51,37 @@ CROSS_ENCODER_RANGE_MIN = 0
# Generative AI Model Configs
#####
# NOTE: the 3 below should only be used for dev.
GEN_AI_API_KEY = os.environ.get("GEN_AI_API_KEY")
# If changing GEN_AI_MODEL_PROVIDER or GEN_AI_MODEL_VERSION from the default,
# be sure to use one that is LiteLLM compatible:
# https://litellm.vercel.app/docs/providers/azure#completion---using-env-variables
# The provider is the prefix before / in the model argument
# Additionally Danswer supports GPT4All and custom request library based models
# Set GEN_AI_MODEL_PROVIDER to "custom" to use the custom requests approach
# Set GEN_AI_MODEL_PROVIDER to "gpt4all" to use gpt4all models running locally
GEN_AI_MODEL_PROVIDER = os.environ.get("GEN_AI_MODEL_PROVIDER") or "openai"
# If using Azure, it's the engine name, for example: Danswer
GEN_AI_MODEL_VERSION = os.environ.get("GEN_AI_MODEL_VERSION")
# For secondary flows like extracting filters or deciding if a chunk is useful, we don't need
# as powerful a model as, say, GPT-4, so we can use an alternative that is faster and cheaper
FAST_GEN_AI_MODEL_VERSION = os.environ.get("FAST_GEN_AI_MODEL_VERSION")
# Override the auto-detection of LLM max context length
GEN_AI_MAX_TOKENS = int(os.environ.get("GEN_AI_MAX_TOKENS") or 0) or None
# Set this to be enough for an answer + quotes. Also used for Chat
# This is the minimum token context we will leave for the LLM to generate an answer
GEN_AI_NUM_RESERVED_OUTPUT_TOKENS = int(
os.environ.get("GEN_AI_NUM_RESERVED_OUTPUT_TOKENS") or 1024
# If the Generative AI model requires an API key for access; otherwise this can be left blank
GEN_AI_API_KEY = (
os.environ.get("GEN_AI_API_KEY", os.environ.get("OPENAI_API_KEY")) or None
)
# Typically, GenAI models nowadays are at least 4K tokens
GEN_AI_MODEL_FALLBACK_MAX_TOKENS = 4096
# API Base, such as (for Azure): https://danswer.openai.azure.com/
GEN_AI_API_ENDPOINT = os.environ.get("GEN_AI_API_ENDPOINT") or None
# API Version, such as (for Azure): 2023-09-15-preview
GEN_AI_API_VERSION = os.environ.get("GEN_AI_API_VERSION") or None
# LiteLLM custom_llm_provider
GEN_AI_LLM_PROVIDER_TYPE = os.environ.get("GEN_AI_LLM_PROVIDER_TYPE") or None
# Override the auto-detection of LLM max context length
GEN_AI_MAX_TOKENS = int(os.environ.get("GEN_AI_MAX_TOKENS") or 0) or None
# Set this to be enough for an answer + quotes. Also used for Chat
GEN_AI_MAX_OUTPUT_TOKENS = int(os.environ.get("GEN_AI_MAX_OUTPUT_TOKENS") or 1024)
# Number of tokens from chat history to include at maximum
# 3000 should be enough context regardless of use, no need to include as much as possible
# as this drives up the cost unnecessarily

View File

@@ -59,8 +59,6 @@ if __name__ == "__main__":
latest_docs = test_connector.poll_source(one_day_ago, current)
```
> Note: Be sure to set PYTHONPATH to danswer/backend before running the above main.
### Additional Required Changes:
#### Backend Changes
@@ -70,16 +68,17 @@ if __name__ == "__main__":
[here](https://github.com/danswer-ai/danswer/blob/main/backend/danswer/connectors/factory.py#L33)
#### Frontend Changes
- Add the new Connector definition to the `SOURCE_METADATA_MAP` [here](https://github.com/danswer-ai/danswer/blob/main/web/src/lib/sources.ts#L59).
- Add the definition for the new Form to the `connectorConfigs` object [here](https://github.com/danswer-ai/danswer/blob/main/web/src/lib/connectors/connectors.ts#L79).
- Create the new connector directory and admin page under `danswer/web/src/app/admin/connectors/`
- Create the new icon, type, source, and filter changes
(refer to existing [PR](https://github.com/danswer-ai/danswer/pull/139))
#### Docs Changes
Create the new connector page (with guiding images!) with how to get the connector credentials and how to set up the
connector in Danswer. Then create a Pull Request in https://github.com/danswer-ai/danswer-docs.
connector in Danswer. Then create a Pull Request in https://github.com/danswer-ai/danswer-docs
### Before opening PR
1. Be sure to fully test changes end to end with setting up the connector and updating the index with new docs from the
new connector. To make it easier to review, please attach a video showing the successful creation of the connector via the UI (starting from the `Add Connector` page).
2. Add a folder + tests under the `backend/tests/daily/connectors` directory. For an example, check out the [test for Confluence](https://github.com/danswer-ai/danswer/blob/main/backend/tests/daily/connectors/confluence/test_confluence_basic.py). In the PR description, include a guide on how to set up the new source to pass the test. Before merging, we will re-create the environment and make sure the test(s) pass.
3. Be sure to run the linting/formatting, refer to the formatting and linting section in
new connector.
2. Be sure to run the linting/formatting, refer to the formatting and linting section in
[CONTRIBUTING.md](https://github.com/danswer-ai/danswer/blob/main/CONTRIBUTING.md#formatting-and-linting)

View File

@@ -56,7 +56,7 @@ class BlobStorageConnector(LoadConnector, PollConnector):
Raises ValueError for unsupported bucket types.
"""
logger.debug(
logger.info(
f"Loading credentials for {self.bucket_name} or type {self.bucket_type}"
)
@@ -220,7 +220,7 @@ class BlobStorageConnector(LoadConnector, PollConnector):
yield batch
def load_from_state(self) -> GenerateDocumentsOutput:
logger.debug("Loading blob objects")
logger.info("Loading blob objects")
return self._yield_blob_objects(
start=datetime(1970, 1, 1, tzinfo=timezone.utc),
end=datetime.now(timezone.utc),

View File

@@ -7,16 +7,14 @@ from datetime import timezone
from functools import lru_cache
from typing import Any
from typing import cast
from urllib.parse import urlparse
import bs4
from atlassian import Confluence # type:ignore
from requests import HTTPError
from danswer.configs.app_configs import (
CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD,
)
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
@@ -44,12 +42,77 @@ logger = setup_logger()
# 2. Segment into Sections for more accurate linking, can split by headers but make sure no text/ordering is lost
NO_PERMISSIONS_TO_VIEW_ATTACHMENTS_ERROR_STR = (
"User not permitted to view attachments on content"
)
NO_PARENT_OR_NO_PERMISSIONS_ERROR_STR = (
"No parent or not permitted to view content with id"
)
def _extract_confluence_keys_from_cloud_url(wiki_url: str) -> tuple[str, str, str]:
"""Sample
URL w/ page: https://danswer.atlassian.net/wiki/spaces/1234abcd/pages/5678efgh/overview
URL w/o page: https://danswer.atlassian.net/wiki/spaces/ASAM/overview
wiki_base is https://danswer.atlassian.net/wiki
space is 1234abcd
page_id is 5678efgh
"""
parsed_url = urlparse(wiki_url)
wiki_base = (
parsed_url.scheme
+ "://"
+ parsed_url.netloc
+ parsed_url.path.split("/spaces")[0]
)
path_parts = parsed_url.path.split("/")
space = path_parts[3]
page_id = path_parts[5] if len(path_parts) > 5 else ""
return wiki_base, space, page_id
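Running the helper above on the docstring's sample cloud URL would give the following (a usage sketch derived from the code, not part of the diff):

```python
# Usage sketch based on the docstring sample above.
wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(
    "https://danswer.atlassian.net/wiki/spaces/1234abcd/pages/5678efgh/overview"
)
assert wiki_base == "https://danswer.atlassian.net/wiki"
assert space == "1234abcd"
assert page_id == "5678efgh"
```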
def _extract_confluence_keys_from_datacenter_url(wiki_url: str) -> tuple[str, str, str]:
"""Sample
URL w/ page https://danswer.ai/confluence/display/1234abcd/pages/5678efgh/overview
URL w/o page https://danswer.ai/confluence/display/1234abcd/overview
wiki_base is https://danswer.ai/confluence
space is 1234abcd
page_id is 5678efgh
"""
# /display/ is always right before the space and at the end of the base URL
DISPLAY = "/display/"
PAGE = "/pages/"
parsed_url = urlparse(wiki_url)
wiki_base = (
parsed_url.scheme
+ "://"
+ parsed_url.netloc
+ parsed_url.path.split(DISPLAY)[0]
)
space = DISPLAY.join(parsed_url.path.split(DISPLAY)[1:]).split("/")[0]
page_id = ""
if (content := parsed_url.path.split(PAGE)) and len(content) > 1:
page_id = content[1]
return wiki_base, space, page_id
def extract_confluence_keys_from_url(wiki_url: str) -> tuple[str, str, str, bool]:
is_confluence_cloud = (
".atlassian.net/wiki/spaces/" in wiki_url
or ".jira.com/wiki/spaces/" in wiki_url
)
try:
if is_confluence_cloud:
wiki_base, space, page_id = _extract_confluence_keys_from_cloud_url(
wiki_url
)
else:
wiki_base, space, page_id = _extract_confluence_keys_from_datacenter_url(
wiki_url
)
except Exception as e:
error_msg = f"Not a valid Confluence Wiki Link, unable to extract wiki base, space, and page id. Exception: {e}"
logger.error(error_msg)
raise ValueError(error_msg)
return wiki_base, space, page_id, is_confluence_cloud
@lru_cache()
@@ -137,38 +200,19 @@ def _comment_dfs(
comments_str += "\nComment:\n" + parse_html_page(
comment_html, confluence_client
)
try:
child_comment_pages = get_page_child_by_type(
comment_page["id"],
type="comment",
start=None,
limit=None,
expand="body.storage.value",
)
comments_str = _comment_dfs(
comments_str, child_comment_pages, confluence_client
)
except HTTPError as e:
# not the cleanest, but I'm not aware of a nicer way to check the error
if NO_PARENT_OR_NO_PERMISSIONS_ERROR_STR not in str(e):
raise
child_comment_pages = get_page_child_by_type(
comment_page["id"],
type="comment",
start=None,
limit=None,
expand="body.storage.value",
)
comments_str = _comment_dfs(
comments_str, child_comment_pages, confluence_client
)
return comments_str
def _datetime_from_string(datetime_string: str) -> datetime:
datetime_object = datetime.fromisoformat(datetime_string)
if datetime_object.tzinfo is None:
# If no timezone info, assume it is UTC
datetime_object = datetime_object.replace(tzinfo=timezone.utc)
else:
# If not in UTC, translate it
datetime_object = datetime_object.astimezone(timezone.utc)
return datetime_object
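As a quick illustration (not part of the diff), both naive and offset-aware ISO strings normalize to the same UTC instant:

```python
# Usage sketch for _datetime_from_string: naive strings are assumed UTC,
# offset-aware strings are converted to UTC.
from datetime import datetime, timezone

assert _datetime_from_string("2024-08-18T15:05:44") == datetime(
    2024, 8, 18, 15, 5, 44, tzinfo=timezone.utc
)
assert _datetime_from_string("2024-08-18T08:05:44-07:00") == datetime(
    2024, 8, 18, 15, 5, 44, tzinfo=timezone.utc
)
```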
class RecursiveIndexer:
def __init__(
self,
@@ -298,10 +342,7 @@ class RecursiveIndexer:
class ConfluenceConnector(LoadConnector, PollConnector):
def __init__(
self,
wiki_base: str,
space: str,
is_cloud: bool,
page_id: str = "",
wiki_page_url: str,
index_recursively: bool = True,
batch_size: int = INDEX_BATCH_SIZE,
continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
@@ -315,15 +356,15 @@ class ConfluenceConnector(LoadConnector, PollConnector):
self.labels_to_skip = set(labels_to_skip)
self.recursive_indexer: RecursiveIndexer | None = None
self.index_recursively = index_recursively
# Remove trailing slash from wiki_base if present
self.wiki_base = wiki_base.rstrip("/")
self.space = space
self.page_id = page_id
self.is_cloud = is_cloud
(
self.wiki_base,
self.space,
self.page_id,
self.is_cloud,
) = extract_confluence_keys_from_url(wiki_page_url)
self.space_level_scan = False
self.confluence_client: Confluence | None = None
if self.page_id is None or self.page_id == "":
@@ -343,6 +384,7 @@ class ConfluenceConnector(LoadConnector, PollConnector):
username=username if self.is_cloud else None,
password=access_token if self.is_cloud else None,
token=access_token if not self.is_cloud else None,
cloud=self.is_cloud,
)
return None
@@ -361,7 +403,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
start=start_ind,
limit=batch_size,
status=(
None if CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES else "current"
"current"
if CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES
else None
),
expand="body.storage.value,version",
)
@@ -382,9 +426,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
start=start_ind + i,
limit=1,
status=(
None
if CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES
else "current"
"current"
if CONFLUENCE_CONNECTOR_INDEX_ONLY_ACTIVE_PAGES
else None
),
expand="body.storage.value,version",
)
@@ -491,249 +535,145 @@ class ConfluenceConnector(LoadConnector, PollConnector):
logger.exception("Ran into exception when fetching labels from Confluence")
return []
@classmethod
def _attachment_to_download_link(
cls, confluence_client: Confluence, attachment: dict[str, Any]
) -> str:
return confluence_client.url + attachment["_links"]["download"]
@classmethod
def _attachment_to_content(
cls,
confluence_client: Confluence,
attachment: dict[str, Any],
) -> str | None:
"""If it returns None, assume that we should skip this attachment."""
if attachment["metadata"]["mediaType"] in [
"image/jpeg",
"image/png",
"image/gif",
"image/svg+xml",
"video/mp4",
"video/quicktime",
]:
return None
download_link = cls._attachment_to_download_link(confluence_client, attachment)
attachment_size = attachment["extensions"]["fileSize"]
if attachment_size > CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD:
logger.warning(
f"Skipping {download_link} due to size. "
f"size={attachment_size} "
f"threshold={CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD}"
)
return None
response = confluence_client._session.get(download_link)
if response.status_code != 200:
logger.warning(
f"Failed to fetch {download_link} with invalid status code {response.status_code}"
)
return None
extracted_text = extract_file_text(
attachment["title"], io.BytesIO(response.content), False
)
if len(extracted_text) > CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD:
logger.warning(
f"Skipping {download_link} due to char count. "
f"char count={len(extracted_text)} "
f"threshold={CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD}"
)
return None
return extracted_text
def _fetch_attachments(
self, confluence_client: Confluence, page_id: str, files_in_used: list[str]
) -> tuple[str, list[dict[str, Any]]]:
unused_attachments: list = []
) -> str:
get_attachments_from_content = make_confluence_call_handle_rate_limit(
confluence_client.get_attachments_from_content
)
files_attachment_content: list = []
try:
expand = "history.lastUpdated,metadata.labels"
attachments_container = get_attachments_from_content(
page_id, start=0, limit=500, expand=expand
page_id, start=0, limit=500
)
for attachment in attachments_container["results"]:
if attachment["title"] not in files_in_used:
unused_attachments.append(attachment)
if attachment["metadata"]["mediaType"] in [
"image/jpeg",
"image/png",
"image/gif",
"image/svg+xml",
"video/mp4",
"video/quicktime",
]:
continue
attachment_content = self._attachment_to_content(
confluence_client, attachment
)
if attachment_content:
files_attachment_content.append(attachment_content)
if attachment["title"] not in files_in_used:
continue
download_link = confluence_client.url + attachment["_links"]["download"]
attachment_size = attachment["extensions"]["fileSize"]
if attachment_size > CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD:
logger.warning(
f"Skipping {download_link} due to size. "
f"size={attachment_size} "
f"threshold={CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD}"
)
continue
download_link = confluence_client.url + attachment["_links"]["download"]
response = confluence_client._session.get(download_link)
if response.status_code == 200:
extract = extract_file_text(
attachment["title"], io.BytesIO(response.content), False
)
files_attachment_content.append(extract)
except Exception as e:
if isinstance(
e, HTTPError
) and NO_PERMISSIONS_TO_VIEW_ATTACHMENTS_ERROR_STR in str(e):
logger.warning(
f"User does not have access to attachments on page '{page_id}'"
)
return "", []
if not self.continue_on_failure:
raise e
logger.exception(
f"Ran into exception when fetching attachments from Confluence: {e}"
)
return "\n".join(files_attachment_content), unused_attachments
return "\n".join(files_attachment_content)
def _get_doc_batch(
self, start_ind: int, time_filter: Callable[[datetime], bool] | None = None
) -> tuple[list[Document], list[dict[str, Any]], int]:
) -> tuple[list[Document], int]:
doc_batch: list[Document] = []
unused_attachments: list[dict[str, Any]] = []
if self.confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
batch = self._fetch_pages(self.confluence_client, start_ind)
for page in batch:
last_modified = _datetime_from_string(page["version"]["when"])
last_modified_str = page["version"]["when"]
author = cast(str | None, page["version"].get("by", {}).get("email"))
last_modified = datetime.fromisoformat(last_modified_str)
if time_filter and not time_filter(last_modified):
continue
if last_modified.tzinfo is None:
# If no timezone info, assume it is UTC
last_modified = last_modified.replace(tzinfo=timezone.utc)
else:
# If not in UTC, translate it
last_modified = last_modified.astimezone(timezone.utc)
page_id = page["id"]
if time_filter is None or time_filter(last_modified):
page_id = page["id"]
if self.labels_to_skip or not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING:
page_labels = self._fetch_labels(self.confluence_client, page_id)
if self.labels_to_skip or not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING:
page_labels = self._fetch_labels(self.confluence_client, page_id)
# check disallowed labels
if self.labels_to_skip:
label_intersection = self.labels_to_skip.intersection(page_labels)
if label_intersection:
logger.info(
f"Page with ID '{page_id}' has a label which has been "
f"designated as disallowed: {label_intersection}. Skipping."
)
# check disallowed labels
if self.labels_to_skip:
label_intersection = self.labels_to_skip.intersection(page_labels)
if label_intersection:
logger.info(
f"Page with ID '{page_id}' has a label which has been "
f"designated as disallowed: {label_intersection}. Skipping."
)
continue
page_html = (
page["body"]
.get("storage", page["body"].get("view", {}))
.get("value")
)
page_url = self.wiki_base + page["_links"]["webui"]
if not page_html:
logger.debug("Page is empty, skipping: %s", page_url)
continue
page_text = parse_html_page(page_html, self.confluence_client)
page_html = (
page["body"].get("storage", page["body"].get("view", {})).get("value")
)
page_url = self.wiki_base + page["_links"]["webui"]
if not page_html:
logger.debug("Page is empty, skipping: %s", page_url)
continue
page_text = parse_html_page(page_html, self.confluence_client)
files_in_used = get_used_attachments(page_html, self.confluence_client)
attachment_text, unused_page_attachments = self._fetch_attachments(
self.confluence_client, page_id, files_in_used
)
unused_attachments.extend(unused_page_attachments)
page_text += attachment_text
comments_text = self._fetch_comments(self.confluence_client, page_id)
page_text += comments_text
doc_metadata: dict[str, str | list[str]] = {"Wiki Space Name": self.space}
if not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING and page_labels:
doc_metadata["labels"] = page_labels
doc_batch.append(
Document(
id=page_url,
sections=[Section(link=page_url, text=page_text)],
source=DocumentSource.CONFLUENCE,
semantic_identifier=page["title"],
doc_updated_at=last_modified,
primary_owners=(
[BasicExpertInfo(email=author)] if author else None
),
metadata=doc_metadata,
files_in_used = get_used_attachments(page_html, self.confluence_client)
attachment_text = self._fetch_attachments(
self.confluence_client, page_id, files_in_used
)
)
return (
doc_batch,
unused_attachments,
len(batch),
)
page_text += attachment_text
comments_text = self._fetch_comments(self.confluence_client, page_id)
page_text += comments_text
doc_metadata: dict[str, str | list[str]] = {
"Wiki Space Name": self.space
}
if not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING and page_labels:
doc_metadata["labels"] = page_labels
def _get_attachment_batch(
self,
start_ind: int,
attachments: list[dict[str, Any]],
time_filter: Callable[[datetime], bool] | None = None,
) -> tuple[list[Document], int]:
doc_batch: list[Document] = []
if self.confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
end_ind = min(start_ind + self.batch_size, len(attachments))
for attachment in attachments[start_ind:end_ind]:
last_updated = _datetime_from_string(
attachment["history"]["lastUpdated"]["when"]
)
if time_filter and not time_filter(last_updated):
continue
attachment_url = self._attachment_to_download_link(
self.confluence_client, attachment
)
attachment_content = self._attachment_to_content(
self.confluence_client, attachment
)
if attachment_content is None:
continue
creator_email = attachment["history"]["createdBy"].get("email")
comment = attachment["metadata"].get("comment", "")
doc_metadata: dict[str, str | list[str]] = {"comment": comment}
attachment_labels: list[str] = []
if not CONFLUENCE_CONNECTOR_SKIP_LABEL_INDEXING:
for label in attachment["metadata"]["labels"]["results"]:
attachment_labels.append(label["name"])
doc_metadata["labels"] = attachment_labels
doc_batch.append(
Document(
id=attachment_url,
sections=[Section(link=attachment_url, text=attachment_content)],
source=DocumentSource.CONFLUENCE,
semantic_identifier=attachment["title"],
doc_updated_at=last_updated,
primary_owners=(
[BasicExpertInfo(email=creator_email)]
if creator_email
else None
),
metadata=doc_metadata,
doc_batch.append(
Document(
id=page_url,
sections=[Section(link=page_url, text=page_text)],
source=DocumentSource.CONFLUENCE,
semantic_identifier=page["title"],
doc_updated_at=last_modified,
primary_owners=(
[BasicExpertInfo(email=author)] if author else None
),
metadata=doc_metadata,
)
)
)
return doc_batch, end_ind - start_ind
return doc_batch, len(batch)
def load_from_state(self) -> GenerateDocumentsOutput:
unused_attachments = []
if self.confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
start_ind = 0
while True:
doc_batch, unused_attachments_batch, num_pages = self._get_doc_batch(
start_ind
)
unused_attachments.extend(unused_attachments_batch)
doc_batch, num_pages = self._get_doc_batch(start_ind)
start_ind += num_pages
if doc_batch:
yield doc_batch
@@ -741,23 +681,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
if num_pages < self.batch_size:
break
start_ind = 0
while True:
attachment_batch, num_attachments = self._get_attachment_batch(
start_ind, unused_attachments
)
start_ind += num_attachments
if attachment_batch:
yield attachment_batch
if num_attachments < self.batch_size:
break
def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
unused_attachments = []
if self.confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
@@ -766,11 +692,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
start_ind = 0
while True:
doc_batch, unused_attachments_batch, num_pages = self._get_doc_batch(
doc_batch, num_pages = self._get_doc_batch(
start_ind, time_filter=lambda t: start_time <= t <= end_time
)
unused_attachments.extend(unused_attachments_batch)
start_ind += num_pages
if doc_batch:
yield doc_batch
@@ -778,29 +702,9 @@ class ConfluenceConnector(LoadConnector, PollConnector):
if num_pages < self.batch_size:
break
start_ind = 0
while True:
attachment_batch, num_attachments = self._get_attachment_batch(
start_ind,
unused_attachments,
time_filter=lambda t: start_time <= t <= end_time,
)
start_ind += num_attachments
if attachment_batch:
yield attachment_batch
if num_attachments < self.batch_size:
break
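Both polling loops above share the same pagination shape: fetch a batch at an increasing offset, yield it if non-empty, and stop once a short batch signals the source is exhausted. A minimal sketch of that pattern, with illustrative names rather than the connector's own:

from collections.abc import Callable, Iterator

def paginate_batches(
    fetch_batch: Callable[[int], list[dict]], batch_size: int
) -> Iterator[list[dict]]:
    """Yield batches starting at increasing offsets until a short batch appears."""
    start_ind = 0
    while True:
        batch = fetch_batch(start_ind)
        start_ind += len(batch)
        if batch:
            yield batch
        # A batch smaller than the requested size means there is nothing left to fetch.
        if len(batch) < batch_size:
            break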
if __name__ == "__main__":
connector = ConfluenceConnector(
wiki_base=os.environ["CONFLUENCE_TEST_SPACE_URL"],
space=os.environ["CONFLUENCE_TEST_SPACE"],
is_cloud=os.environ.get("CONFLUENCE_IS_CLOUD", "true").lower() == "true",
page_id=os.environ.get("CONFLUENCE_TEST_PAGE_ID", ""),
index_recursively=True,
)
connector = ConfluenceConnector(os.environ["CONFLUENCE_TEST_SPACE_URL"])
connector.load_credentials(
{
"confluence_username": os.environ["CONFLUENCE_USER_NAME"],

View File

@@ -23,33 +23,25 @@ class ConfluenceRateLimitError(Exception):
def make_confluence_call_handle_rate_limit(confluence_call: F) -> F:
def wrapped_call(*args: list[Any], **kwargs: Any) -> Any:
max_retries = 5
starting_delay = 5
backoff = 2
max_delay = 600
for attempt in range(max_retries):
for attempt in range(10):
try:
return confluence_call(*args, **kwargs)
except HTTPError as e:
# Check if the response or headers are None to avoid potential AttributeError
if e.response is None or e.response.headers is None:
logger.warning("HTTPError with `None` as response or as headers")
raise e
retry_after_header = e.response.headers.get("Retry-After")
if (
e.response.status_code == 429
or RATE_LIMIT_MESSAGE_LOWERCASE in e.response.text.lower()
):
retry_after = None
if retry_after_header is not None:
try:
retry_after = int(retry_after_header)
except ValueError:
pass
try:
retry_after = int(e.response.headers.get("Retry-After"))
except (ValueError, TypeError):
pass
if retry_after is not None:
if retry_after:
logger.warning(
f"Rate limit hit. Retrying after {retry_after} seconds..."
)
@@ -63,14 +55,5 @@ def make_confluence_call_handle_rate_limit(confluence_call: F) -> F:
else:
# re-raise, let caller handle
raise
except AttributeError as e:
# Some error within the Confluence library, unclear why it fails.
# Users reported it to be intermittent, so just retry
logger.warning(f"Confluence Internal Error, retrying... {e}")
delay = min(starting_delay * (backoff**attempt), max_delay)
time.sleep(delay)
if attempt == max_retries - 1:
raise e
return cast(F, wrapped_call)
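Both versions of this wrapper implement the same strategy: honor a Retry-After header when the server provides one, otherwise back off exponentially up to a cap. A standalone sketch of that strategy, with illustrative constants and without the Confluence-specific error matching:

import time
from requests import HTTPError

def call_with_rate_limit_retry(
    fn, *args, max_retries=5, base_delay=5, backoff=2, max_delay=600, **kwargs
):
    """Illustrative retry loop; not the exact wrapper used by the connector."""
    for attempt in range(max_retries):
        try:
            return fn(*args, **kwargs)
        except HTTPError as e:
            if e.response is None or e.response.status_code != 429:
                raise  # this sketch only retries explicit rate-limit responses
            retry_after = e.response.headers.get("Retry-After")
            if retry_after and retry_after.isdigit():
                delay = int(retry_after)  # server told us how long to wait
            else:
                delay = min(base_delay * (backoff ** attempt), max_delay)
            time.sleep(delay)
    raise RuntimeError("Rate limit retries exhausted")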

View File

@@ -1,70 +0,0 @@
import sys
from datetime import datetime
from danswer.connectors.interfaces import BaseConnector
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.utils.logger import setup_logger
logger = setup_logger()
TimeRange = tuple[datetime, datetime]
class ConnectorRunner:
def __init__(
self,
connector: BaseConnector,
time_range: TimeRange | None = None,
fail_loudly: bool = False,
):
self.connector = connector
if isinstance(self.connector, PollConnector):
if time_range is None:
raise ValueError("time_range is required for PollConnector")
self.doc_batch_generator = self.connector.poll_source(
time_range[0].timestamp(), time_range[1].timestamp()
)
elif isinstance(self.connector, LoadConnector):
if time_range and fail_loudly:
raise ValueError(
"time_range specified, but passed in connector is not a PollConnector"
)
self.doc_batch_generator = self.connector.load_from_state()
else:
raise ValueError(f"Invalid connector. type: {type(self.connector)}")
def run(self) -> GenerateDocumentsOutput:
"""Adds additional exception logging to the connector."""
try:
yield from self.doc_batch_generator
except Exception:
exc_type, _, exc_traceback = sys.exc_info()
# Traverse the traceback to find the last frame where the exception was raised
tb = exc_traceback
if tb is None:
logger.error("No traceback found for exception")
raise
while tb.tb_next:
tb = tb.tb_next # Move to the next frame in the traceback
# Get the local variables from the frame where the exception occurred
local_vars = tb.tb_frame.f_locals
local_vars_str = "\n".join(
f"{key}: {value}" for key, value in local_vars.items()
)
logger.error(
f"Error in connector. type: {exc_type};\n"
f"local_vars below -> \n{local_vars_str}"
)
raise

View File

@@ -56,7 +56,7 @@ class _RateLimitDecorator:
sleep_cnt = 0
while len(self.call_history) == self.max_calls:
sleep_time = self.sleep_time * (self.sleep_backoff**sleep_cnt)
logger.notice(
logger.info(
f"Rate limit exceeded for function {func.__name__}. "
f"Waiting {sleep_time} seconds before retrying."
)

View File

@@ -45,15 +45,10 @@ def extract_jira_project(url: str) -> tuple[str, str]:
return jira_base, jira_project
def extract_text_from_adf(adf: dict | None) -> str:
"""Extracts plain text from Atlassian Document Format:
https://developer.atlassian.com/cloud/jira/platform/apis/document/structure/
WARNING: This function is incomplete and will e.g. skip lists!
"""
def extract_text_from_content(content: dict) -> str:
texts = []
if adf is not None and "content" in adf:
for block in adf["content"]:
if "content" in content:
for block in content["content"]:
if "content" in block:
for item in block["content"]:
if item["type"] == "text":
@@ -77,15 +72,18 @@ def _get_comment_strs(
comment_strs = []
for comment in jira.fields.comment.comments:
try:
body_text = (
comment.body
if JIRA_API_VERSION == "2"
else extract_text_from_adf(comment.raw["body"])
)
if hasattr(comment, "body"):
body_text = extract_text_from_content(comment.raw["body"])
elif hasattr(comment, "raw"):
body = comment.raw.get("body", "No body content available")
body_text = (
extract_text_from_content(body) if isinstance(body, dict) else body
)
else:
body_text = "No body attribute found"
if (
hasattr(comment, "author")
and hasattr(comment.author, "emailAddress")
and comment.author.emailAddress in comment_email_blacklist
):
continue # Skip adding comment if author's email is in blacklist
@@ -128,14 +126,11 @@ def fetch_jira_issues_batch(
)
continue
description = (
jira.fields.description
if JIRA_API_VERSION == "2"
else extract_text_from_adf(jira.raw["fields"]["description"])
)
comments = _get_comment_strs(jira, comment_email_blacklist)
semantic_rep = f"{description}\n" + "\n".join(
[f"Comment: {comment}" for comment in comments if comment]
semantic_rep = (
f"{jira.fields.description}\n"
if jira.fields.description
else "" + "\n".join([f"Comment: {comment}" for comment in comments])
)
page_url = f"{jira_client.client_info()}/browse/{jira.key}"

View File

@@ -23,7 +23,7 @@ from danswer.file_processing.extract_file_text import extract_file_text
from danswer.file_processing.extract_file_text import get_file_ext
from danswer.file_processing.extract_file_text import is_text_file_extension
from danswer.file_processing.extract_file_text import load_files_from_zip
from danswer.file_processing.extract_file_text import read_pdf_file
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.extract_file_text import read_text_file
from danswer.file_store.file_store import get_default_file_store
from danswer.utils.logger import setup_logger
@@ -75,7 +75,7 @@ def _process_file(
# Using the PDF reader function directly to pass in password cleanly
elif extension == ".pdf":
file_content_raw, file_metadata = read_pdf_file(file=file, pdf_pass=pdf_pass)
file_content_raw = pdf_to_text(file=file, pdf_pass=pdf_pass)
else:
file_content_raw = extract_file_text(

View File

@@ -38,7 +38,7 @@ def _sleep_after_rate_limit_exception(github_client: Github) -> None:
tzinfo=timezone.utc
) - datetime.now(tz=timezone.utc)
sleep_time += timedelta(minutes=1) # add an extra minute just to be safe
logger.notice(f"Ran into Github rate-limit. Sleeping {sleep_time.seconds} seconds.")
logger.info(f"Ran into Github rate-limit. Sleeping {sleep_time.seconds} seconds.")
time.sleep(sleep_time.seconds)

View File

@@ -50,7 +50,7 @@ def get_gmail_creds_for_authorized_user(
try:
creds.refresh(Request())
if creds.valid:
logger.notice("Refreshed Gmail tokens.")
logger.info("Refreshed Gmail tokens.")
return creds
except Exception as e:
logger.exception(f"Failed to refresh gmail access token due to: {e}")
@@ -125,7 +125,7 @@ def update_gmail_credential_access_tokens(
) -> OAuthCredentials | None:
app_credentials = get_google_app_gmail_cred()
flow = InstalledAppFlow.from_client_config(
app_credentials.model_dump(),
app_credentials.dict(),
scopes=SCOPES,
redirect_uri=_build_frontend_gmail_redirect(),
)

View File

@@ -81,10 +81,10 @@ class GongConnector(LoadConnector, PollConnector):
for workspace in workspace_list:
if workspace:
logger.info(f"Updating Gong workspace: {workspace}")
logger.info(f"Updating workspace: {workspace}")
workspace_id = workspace_map.get(workspace)
if not workspace_id:
logger.error(f"Invalid Gong workspace: {workspace}")
logger.error(f"Invalid workspace: {workspace}")
if not self.continue_on_fail:
raise ValueError(f"Invalid workspace: {workspace}")
continue

View File

@@ -41,8 +41,8 @@ from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import docx_to_text
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.extract_file_text import pptx_to_text
from danswer.file_processing.extract_file_text import read_pdf_file
from danswer.utils.batching import batch_generator
from danswer.utils.logger import setup_logger
@@ -62,8 +62,6 @@ class GDriveMimeType(str, Enum):
POWERPOINT = (
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
)
PLAIN_TEXT = "text/plain"
MARKDOWN = "text/markdown"
GoogleDriveFileType = dict[str, Any]
@@ -269,7 +267,7 @@ def get_all_files_batched(
yield from batch_generator(
items=found_files,
batch_size=batch_size,
pre_batch_yield=lambda batch_files: logger.debug(
pre_batch_yield=lambda batch_files: logger.info(
f"Parseable Documents in batch: {[file['name'] for file in batch_files]}"
),
)
@@ -318,29 +316,25 @@ def extract_text(file: dict[str, str], service: discovery.Resource) -> str:
GDriveMimeType.PPT.value,
GDriveMimeType.SPREADSHEET.value,
]:
export_mime_type = (
"text/plain"
if mime_type != GDriveMimeType.SPREADSHEET.value
else "text/csv"
)
return (
export_mime_type = "text/plain"
if mime_type == GDriveMimeType.SPREADSHEET.value:
export_mime_type = "text/csv"
elif mime_type == GDriveMimeType.PPT.value:
export_mime_type = "text/plain"
response = (
service.files()
.export(fileId=file["id"], mimeType=export_mime_type)
.execute()
.decode("utf-8")
)
elif mime_type in [
GDriveMimeType.PLAIN_TEXT.value,
GDriveMimeType.MARKDOWN.value,
]:
return service.files().get_media(fileId=file["id"]).execute().decode("utf-8")
return response.decode("utf-8")
elif mime_type == GDriveMimeType.WORD_DOC.value:
response = service.files().get_media(fileId=file["id"]).execute()
return docx_to_text(file=io.BytesIO(response))
elif mime_type == GDriveMimeType.PDF.value:
response = service.files().get_media(fileId=file["id"]).execute()
text, _ = read_pdf_file(file=io.BytesIO(response))
return text
return pdf_to_text(file=io.BytesIO(response))
elif mime_type == GDriveMimeType.POWERPOINT.value:
response = service.files().get_media(fileId=file["id"]).execute()
return pptx_to_text(file=io.BytesIO(response))

View File

@@ -50,7 +50,7 @@ def get_google_drive_creds_for_authorized_user(
try:
creds.refresh(Request())
if creds.valid:
logger.notice("Refreshed Google Drive tokens.")
logger.info("Refreshed Google Drive tokens.")
return creds
except Exception as e:
logger.exception(f"Failed to refresh google drive access token due to: {e}")
@@ -106,7 +106,7 @@ def update_credential_access_tokens(
) -> OAuthCredentials | None:
app_credentials = get_google_app_cred()
flow = InstalledAppFlow.from_client_config(
app_credentials.model_dump(),
app_credentials.dict(),
scopes=SCOPES,
redirect_uri=_build_frontend_google_drive_redirect(),
)

View File

@@ -103,10 +103,6 @@ class GuruConnector(LoadConnector, PollConnector):
# In UI it's called Folders
metadata_dict["folders"] = boards
collection = card.get("collection", {})
if collection:
metadata_dict["collection_name"] = collection.get("name", "")
owner = card.get("owner", {})
author = None
if owner:

View File

@@ -166,36 +166,6 @@ class Document(DocumentBase):
)
class DocumentErrorSummary(BaseModel):
id: str
semantic_id: str
section_link: str | None
@classmethod
def from_document(cls, doc: Document) -> "DocumentErrorSummary":
section_link = doc.sections[0].link if len(doc.sections) > 0 else None
return cls(
id=doc.id, semantic_id=doc.semantic_identifier, section_link=section_link
)
@classmethod
def from_dict(cls, data: dict) -> "DocumentErrorSummary":
return cls(
id=str(data.get("id")),
semantic_id=str(data.get("semantic_id")),
section_link=str(data.get("section_link")),
)
def to_dict(self) -> dict[str, str | None]:
return {
"id": self.id,
"semantic_id": self.semantic_id,
"section_link": self.section_link,
}
class IndexAttemptMetadata(BaseModel):
batch_num: int | None = None
num_exceptions: int = 0
connector_id: int
credential_id: int

View File

@@ -237,14 +237,6 @@ class NotionConnector(LoadConnector, PollConnector):
)
continue
if result_type == "external_object_instance_page":
logger.warning(
f"Skipping 'external_object_instance_page' ('{result_block_id}') for base block '{base_block_id}': "
f"Notion API does not currently support reading external blocks (as of 24/07/03) "
f"(discussion: https://github.com/danswer-ai/danswer/issues/1761)"
)
continue
cur_result_text_arr = []
if "rich_text" in result_obj:
for rich_text in result_obj["rich_text"]:

View File

@@ -98,15 +98,6 @@ class ProductboardConnector(PollConnector):
owner = self._get_owner_email(feature)
experts = [BasicExpertInfo(email=owner)] if owner else None
metadata: dict[str, str | list[str]] = {}
entity_type = feature.get("type", "feature")
if entity_type:
metadata["entity_type"] = str(entity_type)
status = feature.get("status", {}).get("name")
if status:
metadata["status"] = str(status)
yield Document(
id=feature["id"],
sections=[
@@ -119,7 +110,10 @@ class ProductboardConnector(PollConnector):
source=DocumentSource.PRODUCTBOARD,
doc_updated_at=time_str_to_utc(feature["updatedAt"]),
primary_owners=experts,
metadata=metadata,
metadata={
"entity_type": feature["type"],
"status": feature["status"]["name"],
},
)
def _get_components(self) -> Generator[Document, None, None]:
@@ -180,12 +174,6 @@ class ProductboardConnector(PollConnector):
owner = self._get_owner_email(objective)
experts = [BasicExpertInfo(email=owner)] if owner else None
metadata: dict[str, str | list[str]] = {
"entity_type": "objective",
}
if objective.get("state"):
metadata["state"] = str(objective["state"])
yield Document(
id=objective["id"],
sections=[
@@ -198,7 +186,10 @@ class ProductboardConnector(PollConnector):
source=DocumentSource.PRODUCTBOARD,
doc_updated_at=time_str_to_utc(objective["updatedAt"]),
primary_owners=experts,
metadata=metadata,
metadata={
"entity_type": "release",
"state": objective["state"],
},
)
def _is_updated_at_out_of_time_range(

View File

@@ -25,6 +25,7 @@ from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import extract_file_text
from danswer.utils.logger import setup_logger
logger = setup_logger()
@@ -136,7 +137,7 @@ class SharepointConnector(LoadConnector, PollConnector):
.execute_query()
]
else:
sites = self.graph_client.sites.get_all().execute_query()
sites = self.graph_client.sites.get().execute_query()
self.site_data = [
SiteData(url=None, folder=None, sites=sites, driveitems=[])
]

View File

@@ -29,7 +29,6 @@ from danswer.connectors.slack.utils import make_slack_api_rate_limited
from danswer.connectors.slack.utils import SlackTextCleaner
from danswer.utils.logger import setup_logger
logger = setup_logger()

View File

@@ -1,8 +1,6 @@
import io
import ipaddress
import socket
from datetime import datetime
from datetime import timezone
from enum import Enum
from typing import Any
from typing import cast
@@ -29,7 +27,7 @@ from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.file_processing.extract_file_text import read_pdf_file
from danswer.file_processing.extract_file_text import pdf_to_text
from danswer.file_processing.html_utils import web_html_cleanup
from danswer.utils.logger import setup_logger
from danswer.utils.sitemap import list_pages_for_site
@@ -86,20 +84,6 @@ def check_internet_connection(url: str) -> None:
try:
response = requests.get(url, timeout=3)
response.raise_for_status()
except requests.exceptions.HTTPError as e:
# Extract status code from the response, defaulting to -1 if response is None
status_code = e.response.status_code if e.response is not None else -1
error_msg = {
400: "Bad Request",
401: "Unauthorized",
403: "Forbidden",
404: "Not Found",
500: "Internal Server Error",
502: "Bad Gateway",
503: "Service Unavailable",
504: "Gateway Timeout",
}.get(status_code, "HTTP Error")
raise Exception(f"{error_msg} ({status_code}) for {url} - {e}")
except requests.exceptions.SSLError as e:
cause = (
e.args[0].reason
@@ -107,8 +91,8 @@ def check_internet_connection(url: str) -> None:
else e.args
)
raise Exception(f"SSL error {str(cause)}")
except (requests.RequestException, ValueError) as e:
raise Exception(f"Unable to reach {url} - check your internet connection: {e}")
except (requests.RequestException, ValueError):
raise Exception(f"Unable to reach {url} - check your internet connection")
def is_valid_url(url: str) -> bool:
@@ -205,15 +189,6 @@ def _read_urls_file(location: str) -> list[str]:
return urls
def _get_datetime_from_last_modified_header(last_modified: str) -> datetime | None:
try:
return datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z").replace(
tzinfo=timezone.utc
)
except (ValueError, TypeError):
return None
class WebConnector(LoadConnector):
def __init__(
self,
@@ -296,10 +271,7 @@ class WebConnector(LoadConnector):
if current_url.split(".")[-1] == "pdf":
# PDF files are not checked for links
response = requests.get(current_url)
page_text, metadata = read_pdf_file(
file=io.BytesIO(response.content)
)
last_modified = response.headers.get("Last-Modified")
page_text = pdf_to_text(file=io.BytesIO(response.content))
doc_batch.append(
Document(
@@ -307,23 +279,13 @@ class WebConnector(LoadConnector):
sections=[Section(link=current_url, text=page_text)],
source=DocumentSource.WEB,
semantic_identifier=current_url.split("/")[-1],
metadata=metadata,
doc_updated_at=_get_datetime_from_last_modified_header(
last_modified
)
if last_modified
else None,
metadata={},
)
)
continue
page = context.new_page()
page_response = page.goto(current_url)
last_modified = (
page_response.header_value("Last-Modified")
if page_response
else None
)
final_page = page.url
if final_page != current_url:
logger.info(f"Redirected to {final_page}")
@@ -359,11 +321,6 @@ class WebConnector(LoadConnector):
source=DocumentSource.WEB,
semantic_identifier=parsed_html.title or current_url,
metadata={},
doc_updated_at=_get_datetime_from_last_modified_header(
last_modified
)
if last_modified
else None,
)
)

View File

@@ -3,7 +3,6 @@ from typing import Any
import requests
from retry import retry
from zenpy import Zenpy # type: ignore
from zenpy.lib.api_objects import Ticket # type: ignore
from zenpy.lib.api_objects.help_centre_objects import Article # type: ignore
from danswer.configs.app_configs import INDEX_BATCH_SIZE
@@ -60,15 +59,10 @@ class ZendeskClientNotSetUpError(PermissionError):
class ZendeskConnector(LoadConnector, PollConnector):
def __init__(
self,
batch_size: int = INDEX_BATCH_SIZE,
content_type: str = "articles",
) -> None:
def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None:
self.batch_size = batch_size
self.zendesk_client: Zenpy | None = None
self.content_tags: dict[str, str] = {}
self.content_type = content_type
@retry(tries=3, delay=2, backoff=2)
def _set_content_tags(
@@ -128,86 +122,16 @@ class ZendeskConnector(LoadConnector, PollConnector):
def load_from_state(self) -> GenerateDocumentsOutput:
return self.poll_source(None, None)
def _ticket_to_document(self, ticket: Ticket) -> Document:
if self.zendesk_client is None:
raise ZendeskClientNotSetUpError()
owner = None
if ticket.requester and ticket.requester.name and ticket.requester.email:
owner = [
BasicExpertInfo(
display_name=ticket.requester.name, email=ticket.requester.email
)
]
update_time = time_str_to_utc(ticket.updated_at) if ticket.updated_at else None
metadata: dict[str, str | list[str]] = {}
if ticket.status is not None:
metadata["status"] = ticket.status
if ticket.priority is not None:
metadata["priority"] = ticket.priority
if ticket.tags:
metadata["tags"] = ticket.tags
if ticket.type is not None:
metadata["ticket_type"] = ticket.type
# Fetch comments for the ticket
comments = self.zendesk_client.tickets.comments(ticket=ticket)
# Combine all comments into a single text
comments_text = "\n\n".join(
[
f"Comment{f' by {comment.author.name}' if comment.author and comment.author.name else ''}"
f"{f' at {comment.created_at}' if comment.created_at else ''}:\n{comment.body}"
for comment in comments
if comment.body
]
)
# Combine ticket description and comments
description = (
ticket.description
if hasattr(ticket, "description") and ticket.description
else ""
)
full_text = f"Ticket Description:\n{description}\n\nComments:\n{comments_text}"
# Extract subdomain from ticket.url
subdomain = ticket.url.split("//")[1].split(".zendesk.com")[0]
# Build the html url for the ticket
ticket_url = f"https://{subdomain}.zendesk.com/agent/tickets/{ticket.id}"
return Document(
id=f"zendesk_ticket_{ticket.id}",
sections=[Section(link=ticket_url, text=full_text)],
source=DocumentSource.ZENDESK,
semantic_identifier=f"Ticket #{ticket.id}: {ticket.subject or 'No Subject'}",
doc_updated_at=update_time,
primary_owners=owner,
metadata=metadata,
)
def poll_source(
self, start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
) -> GenerateDocumentsOutput:
if self.zendesk_client is None:
raise ZendeskClientNotSetUpError()
if self.content_type == "articles":
yield from self._poll_articles(start)
elif self.content_type == "tickets":
yield from self._poll_tickets(start)
else:
raise ValueError(f"Unsupported content_type: {self.content_type}")
def _poll_articles(
self, start: SecondsSinceUnixEpoch | None
) -> GenerateDocumentsOutput:
articles = (
self.zendesk_client.help_center.articles(cursor_pagination=True) # type: ignore
self.zendesk_client.help_center.articles(cursor_pagination=True)
if start is None
else self.zendesk_client.help_center.articles.incremental( # type: ignore
else self.zendesk_client.help_center.articles.incremental(
start_time=int(start)
)
)
@@ -231,43 +155,9 @@ class ZendeskConnector(LoadConnector, PollConnector):
if doc_batch:
yield doc_batch
def _poll_tickets(
self, start: SecondsSinceUnixEpoch | None
) -> GenerateDocumentsOutput:
if self.zendesk_client is None:
raise ZendeskClientNotSetUpError()
ticket_generator = self.zendesk_client.tickets.incremental(start_time=start)
while True:
doc_batch = []
for _ in range(self.batch_size):
try:
ticket = next(ticket_generator)
# Check if the ticket status is deleted and skip it if so
if ticket.status == "deleted":
continue
doc_batch.append(self._ticket_to_document(ticket))
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch.clear()
except StopIteration:
# No more tickets to process
if doc_batch:
yield doc_batch
return
if doc_batch:
yield doc_batch
if __name__ == "__main__":
import os
import time
connector = ZendeskConnector()

View File

@@ -3,7 +3,6 @@ from typing import List
from typing import Optional
from pydantic import BaseModel
from pydantic import Field
class Message(BaseModel):
@@ -19,11 +18,11 @@ class Message(BaseModel):
sender_realm_str: str
subject: str
topic_links: Optional[List[Any]] = None
last_edit_timestamp: Optional[int]
edit_history: Any = None
last_edit_timestamp: Optional[int] = None
edit_history: Any
reactions: List[Any]
submessages: List[Any]
flags: List[str] = Field(default_factory=list)
flags: List[str] = []
display_recipient: Optional[str] = None
type: Optional[str] = None
stream_id: int
@@ -40,4 +39,4 @@ class GetMessagesResponse(BaseModel):
found_newest: Optional[bool] = None
history_limited: Optional[bool] = None
anchor: Optional[str] = None
messages: List[Message] = Field(default_factory=list)
messages: List[Message] = []
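A small aside on the two default spellings shown above: in current Pydantic versions mutable defaults are copied per instance, so `= []` and `Field(default_factory=list)` behave the same; the factory form just makes the intent explicit. A hedged sketch (model name is made up):

from typing import List
from pydantic import BaseModel, Field

class MessageSketch(BaseModel):
    # Both forms give each instance its own fresh list in Pydantic.
    flags: List[str] = Field(default_factory=list)
    reactions: List[str] = []

a, b = MessageSketch(), MessageSketch()
a.flags.append("starred")
assert b.flags == []  # defaults are not shared between instances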

View File

@@ -6,6 +6,7 @@ FEEDBACK_DOC_BUTTON_BLOCK_ACTION_ID = "feedback-doc-button"
IMMEDIATE_RESOLVED_BUTTON_ACTION_ID = "immediate-resolved-button"
FOLLOWUP_BUTTON_ACTION_ID = "followup-button"
FOLLOWUP_BUTTON_RESOLVED_ACTION_ID = "followup-resolved-button"
SLACK_CHANNEL_ID = "channel_id"
VIEW_DOC_FEEDBACK_ID = "view-doc-feedback"
GENERATE_ANSWER_BUTTON_ACTION_ID = "generate-answer-button"

View File

@@ -1,3 +1,4 @@
import logging
from typing import Any
from typing import cast
@@ -11,7 +12,6 @@ from sqlalchemy.orm import Session
from danswer.configs.constants import MessageType
from danswer.configs.constants import SearchFeedbackType
from danswer.configs.danswerbot_configs import DANSWER_FOLLOWUP_EMOJI
from danswer.connectors.slack.utils import expert_info_from_slack_id
from danswer.connectors.slack.utils import make_slack_api_rate_limited
from danswer.danswerbot.slack.blocks import build_follow_up_resolved_blocks
from danswer.danswerbot.slack.blocks import get_document_feedback_blocks
@@ -88,8 +88,6 @@ def handle_generate_answer_button(
message_ts = req.payload["message"]["ts"]
thread_ts = req.payload["container"]["thread_ts"]
user_id = req.payload["user"]["id"]
expert_info = expert_info_from_slack_id(user_id, client.web_client, user_cache={})
email = expert_info.email if expert_info else None
if not thread_ts:
raise ValueError("Missing thread_ts in the payload")
@@ -128,7 +126,6 @@ def handle_generate_answer_button(
msg_to_respond=cast(str, message_ts or thread_ts),
thread_to_respond=cast(str, thread_ts or message_ts),
sender=user_id or None,
email=email or None,
bypass_filters=True,
is_bot_msg=False,
is_bot_dm=False,
@@ -137,7 +134,7 @@ def handle_generate_answer_button(
receiver_ids=None,
client=client.web_client,
channel=channel_id,
logger=logger,
logger=cast(logging.Logger, logger),
feedback_reminder_id=None,
)

View File

@@ -1,4 +1,6 @@
import datetime
import logging
from typing import cast
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
@@ -7,6 +9,7 @@ from sqlalchemy.orm import Session
from danswer.configs.danswerbot_configs import DANSWER_BOT_FEEDBACK_REMINDER
from danswer.configs.danswerbot_configs import DANSWER_REACT_EMOJI
from danswer.danswerbot.slack.blocks import get_feedback_reminder_blocks
from danswer.danswerbot.slack.constants import SLACK_CHANNEL_ID
from danswer.danswerbot.slack.handlers.handle_regular_answer import (
handle_regular_answer,
)
@@ -14,6 +17,7 @@ from danswer.danswerbot.slack.handlers.handle_standard_answers import (
handle_standard_answers,
)
from danswer.danswerbot.slack.models import SlackMessageInfo
from danswer.danswerbot.slack.utils import ChannelIdAdapter
from danswer.danswerbot.slack.utils import fetch_user_ids_from_emails
from danswer.danswerbot.slack.utils import fetch_user_ids_from_groups
from danswer.danswerbot.slack.utils import respond_in_thread
@@ -21,9 +25,7 @@ from danswer.danswerbot.slack.utils import slack_usage_report
from danswer.danswerbot.slack.utils import update_emote_react
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.models import SlackBotConfig
from danswer.db.users import add_non_web_user_if_not_exists
from danswer.utils.logger import setup_logger
from shared_configs.configs import SLACK_CHANNEL_ID
logger_base = setup_logger()
@@ -51,8 +53,12 @@ def send_msg_ack_to_user(details: SlackMessageInfo, client: WebClient) -> None:
def schedule_feedback_reminder(
details: SlackMessageInfo, include_followup: bool, client: WebClient
) -> str | None:
logger = setup_logger(extra={SLACK_CHANNEL_ID: details.channel_to_respond})
logger = cast(
logging.Logger,
ChannelIdAdapter(
logger_base, extra={SLACK_CHANNEL_ID: details.channel_to_respond}
),
)
if not DANSWER_BOT_FEEDBACK_REMINDER:
logger.info("Scheduled feedback reminder disabled...")
return None
@@ -91,7 +97,10 @@ def schedule_feedback_reminder(
def remove_scheduled_feedback_reminder(
client: WebClient, channel: str | None, msg_id: str
) -> None:
logger = setup_logger(extra={SLACK_CHANNEL_ID: channel})
logger = cast(
logging.Logger,
ChannelIdAdapter(logger_base, extra={SLACK_CHANNEL_ID: channel}),
)
try:
client.chat_deleteScheduledMessage(
@@ -120,7 +129,10 @@ def handle_message(
"""
channel = message_info.channel_to_respond
logger = setup_logger(extra={SLACK_CHANNEL_ID: channel})
logger = cast(
logging.Logger,
ChannelIdAdapter(logger_base, extra={SLACK_CHANNEL_ID: channel}),
)
messages = message_info.thread_messages
sender_id = message_info.sender
@@ -210,9 +222,6 @@ def handle_message(
logger.error(f"Was not able to react to user message due to: {e}")
with Session(get_sqlalchemy_engine()) as db_session:
if message_info.email:
add_non_web_user_if_not_exists(message_info.email, db_session)
# first check if we need to respond with a standard answer
used_standard_answer = handle_standard_answers(
message_info=message_info,

View File

@@ -1,4 +1,5 @@
import functools
import logging
from collections.abc import Callable
from typing import Any
from typing import cast
@@ -37,8 +38,6 @@ from danswer.db.models import Persona
from danswer.db.models import SlackBotConfig
from danswer.db.models import SlackBotResponseType
from danswer.db.persona import fetch_persona_by_id
from danswer.db.search_settings import get_current_search_settings
from danswer.db.users import get_user_by_email
from danswer.llm.answering.prompts.citations_prompt import (
compute_max_document_tokens_for_persona,
)
@@ -50,9 +49,8 @@ from danswer.one_shot_answer.models import DirectQARequest
from danswer.one_shot_answer.models import OneShotQAResponse
from danswer.search.enums import OptionalSearchSetting
from danswer.search.models import BaseFilters
from danswer.search.models import RerankingDetails
from danswer.search.models import RetrievalDetails
from danswer.utils.logger import DanswerLoggingAdapter
from danswer.search.search_settings import get_search_settings
srl = SlackRateLimiter()
@@ -85,7 +83,7 @@ def handle_regular_answer(
receiver_ids: list[str] | None,
client: WebClient,
channel: str,
logger: DanswerLoggingAdapter,
logger: logging.Logger,
feedback_reminder_id: str | None,
num_retries: int = DANSWER_BOT_NUM_RETRIES,
answer_generation_timeout: int = DANSWER_BOT_ANSWER_GENERATION_TIMEOUT,
@@ -100,12 +98,6 @@ def handle_regular_answer(
messages = message_info.thread_messages
message_ts_to_respond_to = message_info.msg_to_respond
is_bot_msg = message_info.is_bot_msg
user = None
if message_info.is_bot_dm:
if message_info.email:
engine = get_sqlalchemy_engine()
with Session(engine) as db_session:
user = get_user_by_email(message_info.email, db_session)
document_set_names: list[str] | None = None
persona = slack_bot_config.persona if slack_bot_config else None
@@ -144,6 +136,7 @@ def handle_regular_answer(
tries=num_retries,
delay=0.25,
backoff=2,
logger=logger,
)
@rate_limits(client=client, channel=channel, thread_ts=message_ts_to_respond_to)
def _get_answer(new_message_request: DirectQARequest) -> OneShotQAResponse | None:
@@ -154,12 +147,7 @@ def handle_regular_answer(
if len(new_message_request.messages) > 1:
persona = cast(
Persona,
fetch_persona_by_id(
db_session,
new_message_request.persona_id,
user=None,
get_editable=False,
),
fetch_persona_by_id(db_session, new_message_request.persona_id),
)
llm, _ = get_llms_for_persona(persona)
@@ -192,7 +180,7 @@ def handle_regular_answer(
# This also handles creating the query event in postgres
answer = get_search_answer(
query_req=new_message_request,
user=user,
user=None,
max_document_tokens=max_document_tokens,
max_history_tokens=max_history_tokens,
db_session=db_session,
@@ -236,8 +224,7 @@ def handle_regular_answer(
)
# Always apply reranking settings if it exists, this is the non-streaming flow
with Session(get_sqlalchemy_engine()) as db_session:
saved_search_settings = get_current_search_settings(db_session)
saved_search_settings = get_search_settings()
# This includes throwing out answer via reflexion
answer = _get_answer(
@@ -250,7 +237,7 @@ def handle_regular_answer(
persona_id=persona.id if persona is not None else 0,
retrieval_options=retrieval_details,
chain_of_thought=not disable_cot,
rerank_settings=RerankingDetails.from_db_model(saved_search_settings)
rerank_settings=saved_search_settings.to_reranking_detail()
if saved_search_settings
else None,
)
@@ -332,7 +319,7 @@ def handle_regular_answer(
)
if answer.answer_valid is False:
logger.notice(
logger.info(
"Answer was evaluated to be invalid, throwing it away without responding."
)
update_emote_react(
@@ -370,7 +357,7 @@ def handle_regular_answer(
return True
if not answer.answer and disable_docs_only_answer:
logger.notice(
logger.info(
"Unable to find answer - not responding since the "
"`DANSWER_BOT_DISABLE_DOCS_ONLY_ANSWER` env variable is set"
)

View File

@@ -1,3 +1,5 @@
import logging
from slack_sdk import WebClient
from sqlalchemy.orm import Session
@@ -19,7 +21,6 @@ from danswer.db.models import SlackBotConfig
from danswer.db.standard_answer import fetch_standard_answer_categories_by_names
from danswer.db.standard_answer import find_matching_standard_answers
from danswer.server.manage.models import StandardAnswer
from danswer.utils.logger import DanswerLoggingAdapter
from danswer.utils.logger import setup_logger
logger = setup_logger()
@@ -60,7 +61,7 @@ def handle_standard_answers(
receiver_ids: list[str] | None,
slack_bot_config: SlackBotConfig | None,
prompt: Prompt | None,
logger: DanswerLoggingAdapter,
logger: logging.Logger,
client: WebClient,
db_session: Session,
) -> bool:
@@ -142,7 +143,7 @@ def handle_standard_answers(
parent_message=root_message,
prompt_id=prompt.id if prompt else None,
message=query_msg.message,
token_count=0,
token_count=10,
message_type=MessageType.USER,
db_session=db_session,
commit=True,

View File

@@ -13,7 +13,6 @@ from danswer.configs.constants import MessageType
from danswer.configs.danswerbot_configs import DANSWER_BOT_REPHRASE_MESSAGE
from danswer.configs.danswerbot_configs import DANSWER_BOT_RESPOND_EVERY_CHANNEL
from danswer.configs.danswerbot_configs import NOTIFY_SLACKBOT_NO_ANSWER
from danswer.connectors.slack.utils import expert_info_from_slack_id
from danswer.danswerbot.slack.config import get_slack_bot_config_for_channel
from danswer.danswerbot.slack.constants import DISLIKE_BLOCK_ACTION_ID
from danswer.danswerbot.slack.constants import FEEDBACK_DOC_BUTTON_BLOCK_ACTION_ID
@@ -22,6 +21,7 @@ from danswer.danswerbot.slack.constants import FOLLOWUP_BUTTON_RESOLVED_ACTION_I
from danswer.danswerbot.slack.constants import GENERATE_ANSWER_BUTTON_ACTION_ID
from danswer.danswerbot.slack.constants import IMMEDIATE_RESOLVED_BUTTON_ACTION_ID
from danswer.danswerbot.slack.constants import LIKE_BLOCK_ACTION_ID
from danswer.danswerbot.slack.constants import SLACK_CHANNEL_ID
from danswer.danswerbot.slack.constants import VIEW_DOC_FEEDBACK_ID
from danswer.danswerbot.slack.handlers.handle_buttons import handle_doc_feedback_button
from danswer.danswerbot.slack.handlers.handle_buttons import handle_followup_button
@@ -39,7 +39,7 @@ from danswer.danswerbot.slack.handlers.handle_message import (
from danswer.danswerbot.slack.handlers.handle_message import schedule_feedback_reminder
from danswer.danswerbot.slack.models import SlackMessageInfo
from danswer.danswerbot.slack.tokens import fetch_tokens
from danswer.danswerbot.slack.utils import check_message_limit
from danswer.danswerbot.slack.utils import ChannelIdAdapter
from danswer.danswerbot.slack.utils import decompose_action_id
from danswer.danswerbot.slack.utils import get_channel_name_from_id
from danswer.danswerbot.slack.utils import get_danswer_bot_app_id
@@ -47,10 +47,9 @@ from danswer.danswerbot.slack.utils import read_slack_thread
from danswer.danswerbot.slack.utils import remove_danswer_bot_tag
from danswer.danswerbot.slack.utils import rephrase_slack_message
from danswer.danswerbot.slack.utils import respond_in_thread
from danswer.db.embedding_model import get_current_db_embedding_model
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.search_settings import get_current_search_settings
from danswer.dynamic_configs.interface import ConfigNotFoundError
from danswer.natural_language_processing.search_nlp_models import EmbeddingModel
from danswer.natural_language_processing.search_nlp_models import warm_up_bi_encoder
from danswer.one_shot_answer.models import ThreadMessage
from danswer.search.retrieval.search_runner import download_nltk_data
@@ -58,7 +57,6 @@ from danswer.server.manage.models import SlackBotTokens
from danswer.utils.logger import setup_logger
from shared_configs.configs import MODEL_SERVER_HOST
from shared_configs.configs import MODEL_SERVER_PORT
from shared_configs.configs import SLACK_CHANNEL_ID
logger = setup_logger()
@@ -86,18 +84,18 @@ def prefilter_requests(req: SocketModeRequest, client: SocketModeClient) -> bool
event = cast(dict[str, Any], req.payload.get("event", {}))
msg = cast(str | None, event.get("text"))
channel = cast(str | None, event.get("channel"))
channel_specific_logger = setup_logger(extra={SLACK_CHANNEL_ID: channel})
channel_specific_logger = ChannelIdAdapter(
logger, extra={SLACK_CHANNEL_ID: channel}
)
# This should never happen, but we can't continue without a channel since
# we can't send a response without it
if not channel:
channel_specific_logger.warning("Found message without channel - skipping")
channel_specific_logger.error("Found message without channel - skipping")
return False
if not msg:
channel_specific_logger.warning(
"Cannot respond to empty message - skipping"
)
channel_specific_logger.error("Cannot respond to empty message - skipping")
return False
if (
@@ -132,19 +130,9 @@ def prefilter_requests(req: SocketModeRequest, client: SocketModeClient) -> bool
if event_type == "message":
bot_tag_id = get_danswer_bot_app_id(client.web_client)
is_dm = event.get("channel_type") == "im"
is_tagged = bot_tag_id and bot_tag_id in msg
is_danswer_bot_msg = bot_tag_id and bot_tag_id in event.get("user", "")
# DanswerBot should never respond to itself
if is_danswer_bot_msg:
logger.info("Ignoring message from DanswerBot")
return False
# DMs with the bot don't pick up the @DanswerBot so we have to keep the
# caught events_api
if is_tagged and not is_dm:
if bot_tag_id and bot_tag_id in msg and event.get("channel_type") != "im":
# Let the tag flow handle this case, don't reply twice
return False
@@ -197,8 +185,9 @@ def prefilter_requests(req: SocketModeRequest, client: SocketModeClient) -> bool
if req.type == "slash_commands":
# Verify that there's an associated channel
channel = req.payload.get("channel_id")
channel_specific_logger = setup_logger(extra={SLACK_CHANNEL_ID: channel})
channel_specific_logger = ChannelIdAdapter(
logger, extra={SLACK_CHANNEL_ID: channel}
)
if not channel:
channel_specific_logger.error(
"Received DanswerBot command without channel - skipping"
@@ -212,9 +201,6 @@ def prefilter_requests(req: SocketModeRequest, client: SocketModeClient) -> bool
)
return False
if not check_message_limit():
return False
logger.debug(f"Handling Slack request with Payload: '{req.payload}'")
return True
@@ -244,7 +230,7 @@ def process_feedback(req: SocketModeRequest, client: SocketModeClient) -> None:
)
query_event_id, _, _ = decompose_action_id(feedback_id)
logger.notice(f"Successfully handled QA feedback for event: {query_event_id}")
logger.info(f"Successfully handled QA feedback for event: {query_event_id}")
def build_request_details(
@@ -257,26 +243,19 @@ def build_request_details(
tagged = event.get("type") == "app_mention"
message_ts = event.get("ts")
thread_ts = event.get("thread_ts")
sender = event.get("user") or None
expert_info = expert_info_from_slack_id(
sender, client.web_client, user_cache={}
)
email = expert_info.email if expert_info else None
msg = remove_danswer_bot_tag(msg, client=client.web_client)
if DANSWER_BOT_REPHRASE_MESSAGE:
logger.notice(f"Rephrasing Slack message. Original message: {msg}")
logger.info(f"Rephrasing Slack message. Original message: {msg}")
try:
msg = rephrase_slack_message(msg)
logger.notice(f"Rephrased message: {msg}")
logger.info(f"Rephrased message: {msg}")
except Exception as e:
logger.error(f"Error while trying to rephrase the Slack message: {e}")
else:
logger.notice(f"Received Slack message: {msg}")
if tagged:
logger.debug("User tagged DanswerBot")
logger.info("User tagged DanswerBot")
if thread_ts != message_ts and thread_ts is not None:
thread_messages = read_slack_thread(
@@ -292,8 +271,7 @@ def build_request_details(
channel_to_respond=channel,
msg_to_respond=cast(str, message_ts or thread_ts),
thread_to_respond=cast(str, thread_ts or message_ts),
sender=sender,
email=email,
sender=event.get("user") or None,
bypass_filters=tagged,
is_bot_msg=False,
is_bot_dm=event.get("channel_type") == "im",
@@ -303,10 +281,6 @@ def build_request_details(
channel = req.payload["channel_id"]
msg = req.payload["text"]
sender = req.payload["user_id"]
expert_info = expert_info_from_slack_id(
sender, client.web_client, user_cache={}
)
email = expert_info.email if expert_info else None
single_msg = ThreadMessage(message=msg, sender=None, role=MessageType.USER)
@@ -316,7 +290,6 @@ def build_request_details(
msg_to_respond=None,
thread_to_respond=None,
sender=sender,
email=email,
bypass_filters=True,
is_bot_msg=True,
is_bot_dm=False,
@@ -464,7 +437,7 @@ def _initialize_socket_client(socket_client: SocketModeClient) -> None:
socket_client.socket_mode_request_listeners.append(process_slack_event) # type: ignore
# Establish a WebSocket connection to the Socket Mode servers
logger.notice("Listening for messages from Slack...")
logger.info("Listening for messages from Slack...")
socket_client.connect()
@@ -481,7 +454,7 @@ if __name__ == "__main__":
slack_bot_tokens: SlackBotTokens | None = None
socket_client: SocketModeClient | None = None
logger.notice("Verifying query preprocessing (NLTK) data is downloaded")
logger.info("Verifying query preprocessing (NLTK) data is downloaded")
download_nltk_data()
while True:
@@ -490,21 +463,18 @@ if __name__ == "__main__":
if latest_slack_bot_tokens != slack_bot_tokens:
if slack_bot_tokens is not None:
logger.notice("Slack Bot tokens have changed - reconnecting")
logger.info("Slack Bot tokens have changed - reconnecting")
else:
# This happens on the very first time the listener process comes up
# or the tokens have updated (set up for the first time)
with Session(get_sqlalchemy_engine()) as db_session:
search_settings = get_current_search_settings(db_session)
embedding_model = EmbeddingModel.from_db_model(
search_settings=search_settings,
server_host=MODEL_SERVER_HOST,
server_port=MODEL_SERVER_PORT,
)
warm_up_bi_encoder(
embedding_model=embedding_model,
)
embedding_model = get_current_db_embedding_model(db_session)
if embedding_model.cloud_provider_id is None:
warm_up_bi_encoder(
embedding_model=embedding_model,
model_server_host=MODEL_SERVER_HOST,
model_server_port=MODEL_SERVER_PORT,
)
slack_bot_tokens = latest_slack_bot_tokens
# may cause a message to be dropped, but it is complicated

View File

@@ -9,7 +9,6 @@ class SlackMessageInfo(BaseModel):
msg_to_respond: str | None
thread_to_respond: str | None
sender: str | None
email: str | None
bypass_filters: bool # User has tagged @DanswerBot
is_bot_msg: bool # User is using /DanswerBot
is_bot_dm: bool # User is direct messaging to DanswerBot

View File

@@ -3,6 +3,7 @@ import random
import re
import string
import time
from collections.abc import MutableMapping
from typing import Any
from typing import cast
from typing import Optional
@@ -21,15 +22,10 @@ from danswer.configs.danswerbot_configs import DANSWER_BOT_FEEDBACK_VISIBILITY
from danswer.configs.danswerbot_configs import DANSWER_BOT_MAX_QPM
from danswer.configs.danswerbot_configs import DANSWER_BOT_MAX_WAIT_TIME
from danswer.configs.danswerbot_configs import DANSWER_BOT_NUM_RETRIES
from danswer.configs.danswerbot_configs import (
DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD,
)
from danswer.configs.danswerbot_configs import (
DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS,
)
from danswer.connectors.slack.utils import make_slack_api_rate_limited
from danswer.connectors.slack.utils import SlackTextCleaner
from danswer.danswerbot.slack.constants import FeedbackVisibility
from danswer.danswerbot.slack.constants import SLACK_CHANNEL_ID
from danswer.danswerbot.slack.tokens import fetch_tokens
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.users import get_user_by_email
@@ -47,41 +43,7 @@ from danswer.utils.text_processing import replace_whitespaces_w_space
logger = setup_logger()
_DANSWER_BOT_APP_ID: str | None = None
_DANSWER_BOT_MESSAGE_COUNT: int = 0
_DANSWER_BOT_COUNT_START_TIME: float = time.time()
def get_danswer_bot_app_id(web_client: WebClient) -> Any:
global _DANSWER_BOT_APP_ID
if _DANSWER_BOT_APP_ID is None:
_DANSWER_BOT_APP_ID = web_client.auth_test().get("user_id")
return _DANSWER_BOT_APP_ID
def check_message_limit() -> bool:
"""
This isn't a perfect solution.
High traffic at the end of one period and start of another could cause
the limit to be exceeded.
"""
if DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD == 0:
return True
global _DANSWER_BOT_MESSAGE_COUNT
global _DANSWER_BOT_COUNT_START_TIME
time_since_start = time.time() - _DANSWER_BOT_COUNT_START_TIME
if time_since_start > DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS:
_DANSWER_BOT_MESSAGE_COUNT = 0
_DANSWER_BOT_COUNT_START_TIME = time.time()
if (_DANSWER_BOT_MESSAGE_COUNT + 1) > DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD:
logger.error(
f"DanswerBot has reached the message limit {DANSWER_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD}"
f" for the time period {DANSWER_BOT_RESPONSE_LIMIT_TIME_PERIOD_SECONDS} seconds."
" These limits are configurable in backend/danswer/configs/danswerbot_configs.py"
)
return False
_DANSWER_BOT_MESSAGE_COUNT += 1
return True
DANSWER_BOT_APP_ID: str | None = None
def rephrase_slack_message(msg: str) -> str:
@@ -136,11 +98,32 @@ def update_emote_react(
logger.error(f"Was not able to react to user message due to: {e}")
def get_danswer_bot_app_id(web_client: WebClient) -> Any:
global DANSWER_BOT_APP_ID
if DANSWER_BOT_APP_ID is None:
DANSWER_BOT_APP_ID = web_client.auth_test().get("user_id")
return DANSWER_BOT_APP_ID
def remove_danswer_bot_tag(message_str: str, client: WebClient) -> str:
bot_tag_id = get_danswer_bot_app_id(web_client=client)
return re.sub(rf"<@{bot_tag_id}>\s", "", message_str)
class ChannelIdAdapter(logging.LoggerAdapter):
"""This is used to add the channel ID to all log messages
emitted in this file"""
def process(
self, msg: str, kwargs: MutableMapping[str, Any]
) -> tuple[str, MutableMapping[str, Any]]:
channel_id = self.extra.get(SLACK_CHANNEL_ID) if self.extra else None
if channel_id:
return f"[Channel ID: {channel_id}] {msg}", kwargs
else:
return msg, kwargs
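A quick usage sketch of the adapter above, showing how the channel id gets prefixed onto every record. It assumes ChannelIdAdapter and SLACK_CHANNEL_ID from this module; the logger name and channel id are arbitrary.

import logging

logging.basicConfig(level=logging.INFO)
base_logger = logging.getLogger("danswerbot.sketch")

# Wrap the base logger so every message carries the channel context.
channel_logger = ChannelIdAdapter(base_logger, extra={SLACK_CHANNEL_ID: "C0123456"})
channel_logger.info("Scheduling feedback reminder")
# -> "[Channel ID: C0123456] Scheduling feedback reminder"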
def get_web_client() -> WebClient:
slack_tokens = fetch_tokens()
return WebClient(token=slack_tokens.bot_token)

View File

@@ -28,7 +28,7 @@ def get_default_admin_user_emails() -> list[str]:
get_default_admin_user_emails_fn: Callable[
[], list[str]
] = fetch_versioned_implementation_with_fallback(
"danswer.auth.users", "get_default_admin_user_emails_", lambda: list[str]()
"danswer.auth.users", "get_default_admin_user_emails_", lambda: []
)
return get_default_admin_user_emails_fn()

View File

@@ -3,6 +3,7 @@ from datetime import datetime
from datetime import timedelta
from uuid import UUID
from sqlalchemy import and_
from sqlalchemy import delete
from sqlalchemy import desc
from sqlalchemy import func
@@ -35,7 +36,7 @@ from danswer.search.models import RetrievalDocs
from danswer.search.models import SavedSearchDoc
from danswer.search.models import SearchDoc as ServerSearchDoc
from danswer.server.query_and_chat.models import ChatMessageDetail
from danswer.tools.tool_runner import ToolCallFinalResult
from danswer.tools.tool_runner import ToolCallMetadata
from danswer.utils.logger import setup_logger
@@ -86,57 +87,29 @@ def get_chat_sessions_by_slack_thread_id(
return db_session.scalars(stmt).all()
def get_valid_messages_from_query_sessions(
chat_session_ids: list[int],
db_session: Session,
def get_first_messages_for_chat_sessions(
chat_session_ids: list[int], db_session: Session
) -> dict[int, str]:
user_message_subquery = (
select(
ChatMessage.chat_session_id, func.min(ChatMessage.id).label("user_msg_id")
)
subquery = (
select(ChatMessage.chat_session_id, func.min(ChatMessage.id).label("min_id"))
.where(
ChatMessage.chat_session_id.in_(chat_session_ids),
ChatMessage.message_type == MessageType.USER,
and_(
ChatMessage.chat_session_id.in_(chat_session_ids),
ChatMessage.message_type == MessageType.USER, # Select USER messages
)
)
.group_by(ChatMessage.chat_session_id)
.subquery()
)
assistant_message_subquery = (
select(
ChatMessage.chat_session_id,
func.min(ChatMessage.id).label("assistant_msg_id"),
)
.where(
ChatMessage.chat_session_id.in_(chat_session_ids),
ChatMessage.message_type == MessageType.ASSISTANT,
)
.group_by(ChatMessage.chat_session_id)
.subquery()
)
query = (
select(ChatMessage.chat_session_id, ChatMessage.message)
.join(
user_message_subquery,
ChatMessage.chat_session_id == user_message_subquery.c.chat_session_id,
)
.join(
assistant_message_subquery,
ChatMessage.chat_session_id == assistant_message_subquery.c.chat_session_id,
)
.join(
ChatMessage__SearchDoc,
ChatMessage__SearchDoc.chat_message_id
== assistant_message_subquery.c.assistant_msg_id,
)
.where(ChatMessage.id == user_message_subquery.c.user_msg_id)
query = select(ChatMessage.chat_session_id, ChatMessage.message).join(
subquery,
(ChatMessage.chat_session_id == subquery.c.chat_session_id)
& (ChatMessage.id == subquery.c.min_id),
)
first_messages = db_session.execute(query).all()
logger.info(f"Retrieved {len(first_messages)} first messages with documents")
return {row.chat_session_id: row.message for row in first_messages}
return dict([(row.chat_session_id, row.message) for row in first_messages])
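
Both versions of this helper lean on the same idea: compute min(ChatMessage.id) per chat session in a grouped subquery, then join back to the messages table to fetch that message's text. A self-contained sketch of the pattern on a simplified schema (in-memory SQLite; table and column names are illustrative, not Danswer's):

from sqlalchemy import Column, Integer, String, create_engine, func, select
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Message(Base):
    """Simplified stand-in for ChatMessage: one row per message in a session."""
    __tablename__ = "message"
    id = Column(Integer, primary_key=True)
    session_id = Column(Integer, nullable=False)
    text = Column(String, nullable=False)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as db:
    db.add_all(
        [
            Message(session_id=1, text="first question"),
            Message(session_id=1, text="follow up"),
            Message(session_id=2, text="another first question"),
        ]
    )
    db.commit()

    # 1) Per session, find the smallest message id (the earliest message).
    subquery = (
        select(Message.session_id, func.min(Message.id).label("min_id"))
        .group_by(Message.session_id)
        .subquery()
    )
    # 2) Join back to the messages table to pull the text of that first message.
    query = select(Message.session_id, Message.text).join(
        subquery,
        (Message.session_id == subquery.c.session_id)
        & (Message.id == subquery.c.min_id),
    )
    first_messages = {row.session_id: row.text for row in db.execute(query)}
    print(first_messages)  # {1: 'first question', 2: 'another first question'}
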
def get_chat_sessions_by_user(
@@ -144,7 +117,6 @@ def get_chat_sessions_by_user(
deleted: bool | None,
db_session: Session,
only_one_shot: bool = False,
limit: int = 50,
) -> list[ChatSession]:
stmt = select(ChatSession).where(ChatSession.user_id == user_id)
@@ -158,9 +130,6 @@ def get_chat_sessions_by_user(
if deleted is not None:
stmt = stmt.where(ChatSession.deleted == deleted)
if limit:
stmt = stmt.limit(limit)
result = db_session.execute(stmt)
chat_sessions = result.scalars().all()
@@ -178,8 +147,14 @@ def delete_search_doc_message_relationship(
def delete_tool_call_for_message_id(message_id: int, db_session: Session) -> None:
stmt = delete(ToolCall).where(ToolCall.message_id == message_id)
db_session.execute(stmt)
chat_message = (
db_session.query(ChatMessage).filter(ChatMessage.id == message_id).first()
)
if chat_message and chat_message.tool_call_id:
stmt = delete(ToolCall).where(ToolCall.id == chat_message.tool_call_id)
db_session.execute(stmt)
chat_message.tool_call_id = None
db_session.commit()
@@ -280,13 +255,6 @@ def delete_chat_session(
db_session: Session,
hard_delete: bool = HARD_DELETE_CHATS,
) -> None:
chat_session = get_chat_session_by_id(
chat_session_id=chat_session_id, user_id=user_id, db_session=db_session
)
if chat_session.deleted:
raise ValueError("Cannot delete an already deleted chat session")
if hard_delete:
delete_messages_and_files_from_chat_session(chat_session_id, db_session)
db_session.execute(delete(ChatSession).where(ChatSession.id == chat_session_id))
@@ -388,7 +356,7 @@ def get_chat_messages_by_session(
)
if prefetch_tool_calls:
stmt = stmt.options(joinedload(ChatMessage.tool_calls))
stmt = stmt.options(joinedload(ChatMessage.tool_call))
result = db_session.scalars(stmt).unique().all()
else:
result = db_session.scalars(stmt).all()
@@ -474,10 +442,10 @@ def create_new_chat_message(
alternate_assistant_id: int | None = None,
# Maps the citation number [n] to the DB SearchDoc
citations: dict[int, int] | None = None,
tool_calls: list[ToolCall] | None = None,
tool_call: ToolCall | None = None,
commit: bool = True,
reserved_message_id: int | None = None,
overridden_model: str | None = None,
alternate_model: str | None = None,
) -> ChatMessage:
if reserved_message_id is not None:
# Edit existing message
@@ -494,10 +462,10 @@ def create_new_chat_message(
existing_message.message_type = message_type
existing_message.citations = citations
existing_message.files = files
existing_message.tool_calls = tool_calls if tool_calls else []
existing_message.tool_call = tool_call
existing_message.error = error
existing_message.alternate_assistant_id = alternate_assistant_id
existing_message.overridden_model = overridden_model
existing_message.alternate_model = alternate_model
new_chat_message = existing_message
else:
@@ -513,10 +481,10 @@ def create_new_chat_message(
message_type=message_type,
citations=citations,
files=files,
tool_calls=tool_calls if tool_calls else [],
tool_call=tool_call,
error=error,
alternate_assistant_id=alternate_assistant_id,
overridden_model=overridden_model,
alternate_model=alternate_model,
)
db_session.add(new_chat_message)
@@ -530,7 +498,6 @@ def create_new_chat_message(
parent_message.latest_child_message = new_chat_message.id
if commit:
db_session.commit()
return new_chat_message
@@ -747,16 +714,15 @@ def translate_db_message_to_chat_message_detail(
time_sent=chat_message.time_sent,
citations=chat_message.citations,
files=chat_message.files or [],
tool_calls=[
ToolCallFinalResult(
tool_name=tool_call.tool_name,
tool_args=tool_call.tool_arguments,
tool_result=tool_call.tool_result,
)
for tool_call in chat_message.tool_calls
],
tool_call=ToolCallMetadata(
tool_name=chat_message.tool_call.tool_name,
tool_args=chat_message.tool_call.tool_arguments,
tool_result=chat_message.tool_call.tool_result,
)
if chat_message.tool_call
else None,
alternate_assistant_id=chat_message.alternate_assistant_id,
overridden_model=chat_message.overridden_model,
alternate_model=chat_message.alternate_model,
)
return chat_msg_detail


@@ -75,8 +75,8 @@ def fetch_ingestion_connector_by_name(
def create_connector(
db_session: Session,
connector_data: ConnectorBase,
db_session: Session,
) -> ObjectCreationIdResponse:
if connector_by_name_source_exists(
connector_data.name, connector_data.source, db_session
@@ -132,8 +132,8 @@ def update_connector(
def delete_connector(
db_session: Session,
connector_id: int,
db_session: Session,
) -> StatusResponse[int]:
"""Only used in special cases (e.g. a connector is in a bad state and we need to delete it).
Be VERY careful using this, as it could lead to a bad state if not used correctly.


@@ -3,10 +3,7 @@ from datetime import datetime
from fastapi import HTTPException
from sqlalchemy import delete
from sqlalchemy import desc
from sqlalchemy import exists
from sqlalchemy import Select
from sqlalchemy import select
from sqlalchemy.orm import aliased
from sqlalchemy.orm import Session
from danswer.configs.constants import DocumentSource
@@ -14,115 +11,35 @@ from danswer.db.connector import fetch_connector_by_id
from danswer.db.credentials import fetch_credential_by_id
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.models import ConnectorCredentialPair
from danswer.db.models import EmbeddingModel
from danswer.db.models import IndexAttempt
from danswer.db.models import IndexingStatus
from danswer.db.models import IndexModelStatus
from danswer.db.models import SearchSettings
from danswer.db.models import User
from danswer.db.models import User__UserGroup
from danswer.db.models import UserGroup__ConnectorCredentialPair
from danswer.db.models import UserRole
from danswer.server.models import StatusResponse
from danswer.utils.logger import setup_logger
logger = setup_logger()
def _add_user_filters(
stmt: Select, user: User | None, get_editable: bool = True
) -> Select:
# If user is None, assume the user is an admin or auth is disabled
if user is None or user.role == UserRole.ADMIN:
return stmt
UG__CCpair = aliased(UserGroup__ConnectorCredentialPair)
User__UG = aliased(User__UserGroup)
"""
Here we select cc_pairs by relation:
User -> User__UserGroup -> UserGroup__ConnectorCredentialPair ->
ConnectorCredentialPair
"""
stmt = stmt.outerjoin(UG__CCpair).outerjoin(
User__UG,
User__UG.user_group_id == UG__CCpair.user_group_id,
)
"""
Filter cc_pairs by:
- if the user is in the user_group that owns the cc_pair
- if the user is not a global_curator, they must also have a curator relationship
to the user_group
- if editing is being done, we also filter out cc_pairs that are owned by groups
that the user isn't a curator for
- if we are not editing, we show all cc_pairs in the groups the user is a curator
for (as well as public cc_pairs)
"""
where_clause = User__UG.user_id == user.id
if user.role == UserRole.CURATOR and get_editable:
where_clause &= User__UG.is_curator == True # noqa: E712
if get_editable:
user_groups = select(User__UG.user_group_id).where(User__UG.user_id == user.id)
if user.role == UserRole.CURATOR:
user_groups = user_groups.where(
User__UserGroup.is_curator == True # noqa: E712
)
where_clause &= (
~exists()
.where(UG__CCpair.cc_pair_id == ConnectorCredentialPair.id)
.where(~UG__CCpair.user_group_id.in_(user_groups))
.correlate(ConnectorCredentialPair)
)
else:
where_clause |= ConnectorCredentialPair.is_public == True # noqa: E712
return stmt.where(where_clause)
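
The _add_user_filters helpers that appear throughout this diff share one core trick: an anti-join built from ~exists(...).correlate(...) that discards rows owned by any group outside the set the user may edit. A self-contained sketch of just that predicate on toy tables (the schema, data, and group IDs are illustrative):

from sqlalchemy import Column, Integer, create_engine, exists, select
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Item(Base):
    """Stand-in for ConnectorCredentialPair / Credential / DocumentSet."""
    __tablename__ = "item"
    id = Column(Integer, primary_key=True)


class ItemGroup(Base):
    """Stand-in for the *__UserGroup association tables."""
    __tablename__ = "item_group"
    id = Column(Integer, primary_key=True)
    item_id = Column(Integer, nullable=False)
    group_id = Column(Integer, nullable=False)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as db:
    db.add_all(
        [
            Item(id=1), Item(id=2), Item(id=3),
            ItemGroup(item_id=1, group_id=10),  # owned only by group 10
            ItemGroup(item_id=2, group_id=10),
            ItemGroup(item_id=2, group_id=99),  # also owned by group 99
        ]
    )
    db.commit()

    editable_groups = [10]  # groups the user curates (illustrative)

    # Keep items with NO owning group outside the user's editable groups.
    # Item 3 has no owning group at all, so the anti-join keeps it as well.
    stmt = select(Item.id).where(
        ~exists()
        .where(ItemGroup.item_id == Item.id)
        .where(~ItemGroup.group_id.in_(editable_groups))
        .correlate(Item)
    )
    print(db.scalars(stmt).all())  # [1, 3]
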
def get_connector_credential_pairs(
db_session: Session,
include_disabled: bool = True,
user: User | None = None,
get_editable: bool = True,
ids: list[int] | None = None,
db_session: Session, include_disabled: bool = True
) -> list[ConnectorCredentialPair]:
stmt = select(ConnectorCredentialPair).distinct()
stmt = _add_user_filters(stmt, user, get_editable)
stmt = select(ConnectorCredentialPair)
if not include_disabled:
stmt = stmt.where(
ConnectorCredentialPair.status == ConnectorCredentialPairStatus.ACTIVE
) # noqa
if ids:
stmt = stmt.where(ConnectorCredentialPair.id.in_(ids))
results = db_session.scalars(stmt)
return list(results.all())
def get_cc_pair_groups_for_ids(
db_session: Session,
cc_pair_ids: list[int],
user: User | None = None,
get_editable: bool = True,
) -> list[UserGroup__ConnectorCredentialPair]:
stmt = select(UserGroup__ConnectorCredentialPair).distinct()
stmt = stmt.outerjoin(
ConnectorCredentialPair,
UserGroup__ConnectorCredentialPair.cc_pair_id == ConnectorCredentialPair.id,
)
stmt = _add_user_filters(stmt, user, get_editable)
stmt = stmt.where(UserGroup__ConnectorCredentialPair.cc_pair_id.in_(cc_pair_ids))
return list(db_session.scalars(stmt).all())
def get_connector_credential_pair(
connector_id: int,
credential_id: int,
db_session: Session,
user: User | None = None,
get_editable: bool = True,
) -> ConnectorCredentialPair | None:
stmt = select(ConnectorCredentialPair)
stmt = _add_user_filters(stmt, user, get_editable)
stmt = stmt.where(ConnectorCredentialPair.connector_id == connector_id)
stmt = stmt.where(ConnectorCredentialPair.credential_id == credential_id)
result = db_session.execute(stmt)
@@ -132,11 +49,8 @@ def get_connector_credential_pair(
def get_connector_credential_source_from_id(
cc_pair_id: int,
db_session: Session,
user: User | None = None,
get_editable: bool = True,
) -> DocumentSource | None:
stmt = select(ConnectorCredentialPair)
stmt = _add_user_filters(stmt, user, get_editable)
stmt = stmt.where(ConnectorCredentialPair.id == cc_pair_id)
result = db_session.execute(stmt)
cc_pair = result.scalar_one_or_none()
@@ -146,11 +60,8 @@ def get_connector_credential_source_from_id(
def get_connector_credential_pair_from_id(
cc_pair_id: int,
db_session: Session,
user: User | None = None,
get_editable: bool = True,
) -> ConnectorCredentialPair | None:
stmt = select(ConnectorCredentialPair).distinct()
stmt = _add_user_filters(stmt, user, get_editable)
stmt = select(ConnectorCredentialPair)
stmt = stmt.where(ConnectorCredentialPair.id == cc_pair_id)
result = db_session.execute(stmt)
return result.scalar_one_or_none()
@@ -159,13 +70,12 @@ def get_connector_credential_pair_from_id(
def get_last_successful_attempt_time(
connector_id: int,
credential_id: int,
earliest_index: float,
search_settings: SearchSettings,
embedding_model: EmbeddingModel,
db_session: Session,
) -> float:
"""Gets the timestamp of the last successful index run stored in
the CC Pair row in the database"""
if search_settings.status == IndexModelStatus.PRESENT:
if embedding_model.status == IndexModelStatus.PRESENT:
connector_credential_pair = get_connector_credential_pair(
connector_id, credential_id, db_session
)
@@ -173,7 +83,7 @@ def get_last_successful_attempt_time(
connector_credential_pair is None
or connector_credential_pair.last_successful_index_time is None
):
return earliest_index
return 0.0
return connector_credential_pair.last_successful_index_time.timestamp()
@@ -187,15 +97,17 @@ def get_last_successful_attempt_time(
.filter(
ConnectorCredentialPair.connector_id == connector_id,
ConnectorCredentialPair.credential_id == credential_id,
IndexAttempt.search_settings_id == search_settings.id,
IndexAttempt.embedding_model_id == embedding_model.id,
IndexAttempt.status == IndexingStatus.SUCCESS,
)
.order_by(IndexAttempt.time_started.desc())
.first()
)
if not attempt or not attempt.time_started:
return earliest_index
connector = fetch_connector_by_id(connector_id, db_session)
if connector and connector.indexing_start:
return connector.indexing_start.timestamp()
return 0.0
return attempt.time_started.timestamp()
@@ -305,28 +217,14 @@ def associate_default_cc_pair(db_session: Session) -> None:
db_session.commit()
def _relate_groups_to_cc_pair__no_commit(
db_session: Session,
cc_pair_id: int,
user_group_ids: list[int],
) -> None:
for group_id in user_group_ids:
db_session.add(
UserGroup__ConnectorCredentialPair(
user_group_id=group_id, cc_pair_id=cc_pair_id
)
)
def add_credential_to_connector(
db_session: Session,
user: User | None,
connector_id: int,
credential_id: int,
cc_pair_name: str | None,
is_public: bool,
groups: list[int] | None,
) -> StatusResponse:
user: User | None,
db_session: Session,
) -> StatusResponse[int]:
connector = fetch_connector_by_id(connector_id, db_session)
credential = fetch_credential_by_id(credential_id, user, db_session)
@@ -334,13 +232,9 @@ def add_credential_to_connector(
raise HTTPException(status_code=404, detail="Connector does not exist")
if credential is None:
error_msg = (
f"Credential {credential_id} does not exist or does not belong to user"
)
logger.error(error_msg)
raise HTTPException(
status_code=401,
detail=error_msg,
detail="Credential does not exist or does not belong to user",
)
existing_association = (
@@ -354,7 +248,7 @@ def add_credential_to_connector(
if existing_association is not None:
return StatusResponse(
success=False,
message=f"Connector {connector_id} already has Credential {credential_id}",
message=f"Connector already has Credential {credential_id}",
data=connector_id,
)
@@ -366,21 +260,12 @@ def add_credential_to_connector(
is_public=is_public,
)
db_session.add(association)
db_session.flush() # make sure the association has an id
if groups:
_relate_groups_to_cc_pair__no_commit(
db_session=db_session,
cc_pair_id=association.id,
user_group_ids=groups,
)
db_session.commit()
return StatusResponse(
success=True,
message=f"Creating new association between Connector {connector_id} and Credential {credential_id}",
data=association.id,
message=f"New Credential {credential_id} added to Connector",
data=connector_id,
)
@@ -402,12 +287,13 @@ def remove_credential_from_connector(
detail="Credential does not exist or does not belong to user",
)
association = get_connector_credential_pair(
connector_id=connector_id,
credential_id=credential_id,
db_session=db_session,
user=user,
get_editable=True,
association = (
db_session.query(ConnectorCredentialPair)
.filter(
ConnectorCredentialPair.connector_id == connector_id,
ConnectorCredentialPair.credential_id == credential_id,
)
.one_or_none()
)
if association is not None:
@@ -448,11 +334,11 @@ def resync_cc_pair(
ConnectorCredentialPair,
IndexAttempt.connector_credential_pair_id == ConnectorCredentialPair.id,
)
.join(SearchSettings, IndexAttempt.search_settings_id == SearchSettings.id)
.join(EmbeddingModel, IndexAttempt.embedding_model_id == EmbeddingModel.id)
.filter(
ConnectorCredentialPair.connector_id == connector_id,
ConnectorCredentialPair.credential_id == credential_id,
SearchSettings.status == IndexModelStatus.PRESENT,
EmbeddingModel.status == IndexModelStatus.PRESENT,
)
)


@@ -1,6 +1,5 @@
from typing import Any
from sqlalchemy import exists
from sqlalchemy import Select
from sqlalchemy import select
from sqlalchemy import update
@@ -18,10 +17,8 @@ from danswer.connectors.google_drive.constants import (
)
from danswer.db.models import ConnectorCredentialPair
from danswer.db.models import Credential
from danswer.db.models import Credential__UserGroup
from danswer.db.models import DocumentByConnectorCredentialPair
from danswer.db.models import User
from danswer.db.models import User__UserGroup
from danswer.server.documents.models import CredentialBase
from danswer.server.documents.models import CredentialDataUpdateRequest
from danswer.utils.logger import setup_logger
@@ -29,122 +26,42 @@ from danswer.utils.logger import setup_logger
logger = setup_logger()
# The credentials for these sources are not real so
# permissions are not enforced for them
CREDENTIAL_PERMISSIONS_TO_IGNORE = {
DocumentSource.FILE,
DocumentSource.WEB,
DocumentSource.NOT_APPLICABLE,
DocumentSource.GOOGLE_SITES,
DocumentSource.WIKIPEDIA,
DocumentSource.MEDIAWIKI,
}
def _add_user_filters(
stmt: Select,
def _attach_user_filters(
stmt: Select[tuple[Credential]],
user: User | None,
assume_admin: bool = False, # Used with API key
get_editable: bool = True,
) -> Select:
"""Attaches filters to the statement to ensure that the user can only
access the appropriate credentials"""
if not user:
if assume_admin:
# apply admin filters minus the user_id check
if user:
if user.role == UserRole.ADMIN:
stmt = stmt.where(
or_(
Credential.user_id == user.id,
Credential.user_id.is_(None),
Credential.admin_public == True, # noqa: E712
Credential.source.in_(CREDENTIAL_PERMISSIONS_TO_IGNORE),
)
)
return stmt
if user.role == UserRole.ADMIN:
# Admins can access all credentials that are public or owned by them
# or are not associated with any user
return stmt.where(
else:
stmt = stmt.where(Credential.user_id == user.id)
elif assume_admin:
stmt = stmt.where(
or_(
Credential.user_id == user.id,
Credential.user_id.is_(None),
Credential.admin_public == True, # noqa: E712
Credential.source.in_(CREDENTIAL_PERMISSIONS_TO_IGNORE),
)
)
if user.role == UserRole.BASIC:
# Basic users can only access credentials that are owned by them
return stmt.where(Credential.user_id == user.id)
"""
THIS PART IS FOR CURATORS AND GLOBAL CURATORS
Here we select cc_pairs by relation:
User -> User__UserGroup -> Credential__UserGroup -> Credential
"""
stmt = stmt.outerjoin(Credential__UserGroup).outerjoin(
User__UserGroup,
User__UserGroup.user_group_id == Credential__UserGroup.user_group_id,
)
"""
Filter Credentials by:
- if the user is in the user_group that owns the Credential
- if the user is not a global_curator, they must also have a curator relationship
to the user_group
- if editing is being done, we also filter out Credentials that are owned by groups
that the user isn't a curator for
- if we are not editing, we show all Credentials in the groups the user is a curator
for (as well as public Credentials)
- if we are not editing, we return all Credentials directly connected to the user
"""
where_clause = User__UserGroup.user_id == user.id
if user.role == UserRole.CURATOR:
where_clause &= User__UserGroup.is_curator == True # noqa: E712
if get_editable:
user_groups = select(User__UserGroup.user_group_id).where(
User__UserGroup.user_id == user.id
)
if user.role == UserRole.CURATOR:
user_groups = user_groups.where(
User__UserGroup.is_curator == True # noqa: E712
)
where_clause &= (
~exists()
.where(Credential__UserGroup.credential_id == Credential.id)
.where(~Credential__UserGroup.user_group_id.in_(user_groups))
.correlate(Credential)
)
else:
where_clause |= Credential.curator_public == True # noqa: E712
where_clause |= Credential.user_id == user.id # noqa: E712
where_clause |= Credential.source.in_(CREDENTIAL_PERMISSIONS_TO_IGNORE)
return stmt.where(where_clause)
def _relate_credential_to_user_groups__no_commit(
db_session: Session,
credential_id: int,
user_group_ids: list[int],
) -> None:
credential_user_groups = []
for group_id in user_group_ids:
credential_user_groups.append(
Credential__UserGroup(
credential_id=credential_id,
user_group_id=group_id,
)
)
db_session.add_all(credential_user_groups)
return stmt
def fetch_credentials(
db_session: Session,
user: User | None = None,
get_editable: bool = True,
) -> list[Credential]:
stmt = select(Credential)
stmt = _add_user_filters(stmt, user, get_editable=get_editable)
stmt = _attach_user_filters(stmt, user)
results = db_session.scalars(stmt)
return list(results.all())
@@ -155,9 +72,8 @@ def fetch_credential_by_id(
db_session: Session,
assume_admin: bool = False,
) -> Credential | None:
stmt = select(Credential).distinct()
stmt = stmt.where(Credential.id == credential_id)
stmt = _add_user_filters(stmt, user, assume_admin=assume_admin)
stmt = select(Credential).where(Credential.id == credential_id)
stmt = _attach_user_filters(stmt, user, assume_admin=assume_admin)
result = db_session.execute(stmt)
credential = result.scalar_one_or_none()
return credential
@@ -167,10 +83,9 @@ def fetch_credentials_by_source(
db_session: Session,
user: User | None,
document_source: DocumentSource | None = None,
get_editable: bool = True,
) -> list[Credential]:
base_query = select(Credential).where(Credential.source == document_source)
base_query = _add_user_filters(base_query, user, get_editable=get_editable)
base_query = _attach_user_filters(base_query, user)
credentials = db_session.execute(base_query).scalars().all()
return list(credentials)
@@ -238,38 +153,19 @@ def create_credential(
admin_public=credential_data.admin_public,
source=credential_data.source,
name=credential_data.name,
curator_public=credential_data.curator_public,
)
db_session.add(credential)
db_session.flush() # This ensures the credential gets an ID
_relate_credential_to_user_groups__no_commit(
db_session=db_session,
credential_id=credential.id,
user_group_ids=credential_data.groups,
)
db_session.commit()
return credential
def _cleanup_credential__user_group_relationships__no_commit(
db_session: Session, credential_id: int
) -> None:
"""NOTE: does not commit the transaction."""
db_session.query(Credential__UserGroup).filter(
Credential__UserGroup.credential_id == credential_id
).delete(synchronize_session=False)
def alter_credential(
credential_id: int,
credential_data: CredentialDataUpdateRequest,
user: User,
db_session: Session,
) -> Credential | None:
# TODO: add user group relationship update
credential = fetch_credential_by_id(credential_id, user, db_session)
if credential is None:
@@ -375,11 +271,10 @@ def delete_credential(
)
if force:
logger.warning(f"Force deleting credential {credential_id}")
logger.info(f"Force deleting credential {credential_id}")
else:
logger.notice(f"Deleting credential {credential_id}")
logger.info(f"Deleting credential {credential_id}")
_cleanup_credential__user_group_relationships__no_commit(db_session, credential_id)
db_session.delete(credential)
db_session.commit()


@@ -1,9 +1,10 @@
from sqlalchemy.orm import Session
from danswer.db.embedding_model import get_current_db_embedding_model
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.index_attempt import get_last_attempt
from danswer.db.models import ConnectorCredentialPair
from danswer.db.models import IndexingStatus
from danswer.db.search_settings import get_current_search_settings
def check_deletion_attempt_is_allowed(
@@ -23,17 +24,20 @@ def check_deletion_attempt_is_allowed(
f"'{connector_credential_pair.credential_id}' is not deletable."
)
if connector_credential_pair.status.is_active():
if (
connector_credential_pair.status != ConnectorCredentialPairStatus.PAUSED
and connector_credential_pair.status != ConnectorCredentialPairStatus.DELETING
):
return base_error_msg + " Connector must be paused."
connector_id = connector_credential_pair.connector_id
credential_id = connector_credential_pair.credential_id
search_settings = get_current_search_settings(db_session)
current_embedding_model = get_current_db_embedding_model(db_session)
last_indexing = get_last_attempt(
connector_id=connector_id,
credential_id=credential_id,
search_settings_id=search_settings.id,
embedding_model_id=current_embedding_model.id,
db_session=db_session,
)


@@ -3,7 +3,6 @@ import time
from collections.abc import Generator
from collections.abc import Sequence
from datetime import datetime
from datetime import timezone
from uuid import UUID
from sqlalchemy import and_
@@ -11,7 +10,6 @@ from sqlalchemy import delete
from sqlalchemy import exists
from sqlalchemy import func
from sqlalchemy import or_
from sqlalchemy import Select
from sqlalchemy import select
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.engine.util import TransactionalContext
@@ -40,68 +38,6 @@ def check_docs_exist(db_session: Session) -> bool:
return result.scalar() or False
def count_documents_by_needs_sync(session: Session) -> int:
"""Get the count of all documents where:
1. last_modified is newer than last_synced
2. last_synced is null (meaning we've never synced)
This function executes the query and returns the count of
documents matching the criteria."""
count = (
session.query(func.count())
.select_from(DbDocument)
.filter(
or_(
DbDocument.last_modified > DbDocument.last_synced,
DbDocument.last_synced.is_(None),
)
)
.scalar()
)
return count
def construct_document_select_for_connector_credential_pair_by_needs_sync(
connector_id: int, credential_id: int
) -> Select:
initial_doc_ids_stmt = select(DocumentByConnectorCredentialPair.id).where(
and_(
DocumentByConnectorCredentialPair.connector_id == connector_id,
DocumentByConnectorCredentialPair.credential_id == credential_id,
)
)
stmt = (
select(DbDocument)
.where(
DbDocument.id.in_(initial_doc_ids_stmt),
or_(
DbDocument.last_modified
> DbDocument.last_synced, # last_modified is newer than last_synced
DbDocument.last_synced.is_(None), # never synced
),
)
.distinct()
)
return stmt
def construct_document_select_for_connector_credential_pair(
connector_id: int, credential_id: int | None = None
) -> Select:
initial_doc_ids_stmt = select(DocumentByConnectorCredentialPair.id).where(
and_(
DocumentByConnectorCredentialPair.connector_id == connector_id,
DocumentByConnectorCredentialPair.credential_id == credential_id,
)
)
stmt = select(DbDocument).where(DbDocument.id.in_(initial_doc_ids_stmt)).distinct()
return stmt
def get_documents_for_connector_credential_pair(
db_session: Session, connector_id: int, credential_id: int, limit: int | None = None
) -> Sequence[DbDocument]:
@@ -172,29 +108,7 @@ def get_document_cnts_for_cc_pairs(
return db_session.execute(stmt).all() # type: ignore
def get_access_info_for_document(
db_session: Session,
document_id: str,
) -> tuple[str, list[UUID | None], bool] | None:
"""Gets access info for a single document by calling the get_access_info_for_documents function
and passing a list with a single document ID.
Args:
db_session (Session): The database session to use.
document_id (str): The document ID to fetch access info for.
Returns:
Optional[Tuple[str, List[UUID | None], bool]]: A tuple containing the document ID, a list of user IDs,
and a boolean indicating if the document is globally public, or None if no results are found.
"""
results = get_access_info_for_documents(db_session, [document_id])
if not results:
return None
return results[0]
def get_access_info_for_documents(
def get_acccess_info_for_documents(
db_session: Session,
document_ids: list[str],
) -> Sequence[tuple[str, list[UUID | None], bool]]:
@@ -259,7 +173,6 @@ def upsert_documents(
semantic_id=doc.semantic_identifier,
link=doc.first_link,
doc_updated_at=None, # this is intentional
last_modified=datetime.now(timezone.utc),
primary_owners=doc.primary_owners,
secondary_owners=doc.secondary_owners,
)
@@ -301,7 +214,7 @@ def upsert_document_by_connector_credential_pair(
db_session.commit()
def update_docs_updated_at__no_commit(
def update_docs_updated_at(
ids_to_new_updated_at: dict[str, datetime],
db_session: Session,
) -> None:
@@ -313,28 +226,6 @@ def update_docs_updated_at__no_commit(
for document in documents_to_update:
document.doc_updated_at = ids_to_new_updated_at[document.id]
def update_docs_last_modified__no_commit(
document_ids: list[str],
db_session: Session,
) -> None:
documents_to_update = (
db_session.query(DbDocument).filter(DbDocument.id.in_(document_ids)).all()
)
now = datetime.now(timezone.utc)
for doc in documents_to_update:
doc.last_modified = now
def mark_document_as_synced(document_id: str, db_session: Session) -> None:
stmt = select(DbDocument).where(DbDocument.id == document_id)
doc = db_session.scalar(stmt)
if doc is None:
raise ValueError(f"No document with ID: {document_id}")
# update last_synced
doc.last_synced = datetime.now(timezone.utc)
db_session.commit()
@@ -426,7 +317,7 @@ def prepare_to_modify_documents(
called ahead of any modification to Vespa. Locks should be released by the
caller as soon as updates are complete by finishing the transaction.
NOTE: only one commit is allowed within the context manager returned by this function.
NOTE: only one commit is allowed within the context manager returned by this funtion.
Multiple commits will result in a sqlalchemy.exc.InvalidRequestError.
NOTE: this function will commit any existing transaction.
"""
@@ -444,9 +335,7 @@ def prepare_to_modify_documents(
yield transaction
break
except OperationalError as e:
logger.warning(
f"Failed to acquire locks for documents, retrying. Error: {e}"
)
logger.info(f"Failed to acquire locks for documents, retrying. Error: {e}")
time.sleep(retry_delay)
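
prepare_to_modify_documents wraps document updates in a transaction that first takes row locks and retries with a short sleep when another writer holds them, which surfaces as OperationalError on Postgres. A simplified, self-contained sketch of that shape (the model, retry count, and lock options are illustrative, and the real helper yields the transaction object rather than None):

import time
from collections.abc import Generator
from contextlib import contextmanager

from sqlalchemy import Column, String, select
from sqlalchemy.exc import OperationalError
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class DbDocument(Base):
    """Minimal stand-in for the document model referenced above."""
    __tablename__ = "document"
    id = Column(String, primary_key=True)


@contextmanager
def lock_documents(
    db_session: Session,
    document_ids: list[str],
    retries: int = 3,
    retry_delay: float = 0.5,
) -> Generator[None, None, None]:
    """Lock the given documents (SELECT ... FOR UPDATE) and retry on contention."""
    for attempt in range(retries):
        try:
            # NOWAIT makes lock contention fail fast instead of blocking.
            db_session.execute(
                select(DbDocument)
                .where(DbDocument.id.in_(document_ids))
                .with_for_update(nowait=True)
            )
        except OperationalError:
            db_session.rollback()
            if attempt == retries - 1:
                raise
            time.sleep(retry_delay)
            continue
        yield
        return
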
@@ -488,12 +377,3 @@ def get_documents_by_cc_pair(
.filter(ConnectorCredentialPair.id == cc_pair_id)
.all()
)
def get_document(
document_id: str,
db_session: Session,
) -> DbDocument | None:
stmt = select(DbDocument).where(DbDocument.id == document_id)
doc: DbDocument | None = db_session.execute(stmt).scalar_one_or_none()
return doc


@@ -4,79 +4,21 @@ from uuid import UUID
from sqlalchemy import and_
from sqlalchemy import delete
from sqlalchemy import exists
from sqlalchemy import func
from sqlalchemy import or_
from sqlalchemy import Select
from sqlalchemy import select
from sqlalchemy.orm import aliased
from sqlalchemy.orm import Session
from danswer.db.connector_credential_pair import get_cc_pair_groups_for_ids
from danswer.db.connector_credential_pair import get_connector_credential_pairs
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.models import ConnectorCredentialPair
from danswer.db.models import Document
from danswer.db.models import DocumentByConnectorCredentialPair
from danswer.db.models import DocumentSet as DocumentSetDBModel
from danswer.db.models import DocumentSet__ConnectorCredentialPair
from danswer.db.models import DocumentSet__UserGroup
from danswer.db.models import User
from danswer.db.models import User__UserGroup
from danswer.db.models import UserRole
from danswer.server.features.document_set.models import DocumentSetCreationRequest
from danswer.server.features.document_set.models import DocumentSetUpdateRequest
from danswer.utils.logger import setup_logger
from danswer.utils.variable_functionality import fetch_versioned_implementation
logger = setup_logger()
def _add_user_filters(
stmt: Select, user: User | None, get_editable: bool = True
) -> Select:
# If user is None, assume the user is an admin or auth is disabled
if user is None or user.role == UserRole.ADMIN:
return stmt
DocumentSet__UG = aliased(DocumentSet__UserGroup)
User__UG = aliased(User__UserGroup)
"""
Here we select cc_pairs by relation:
User -> User__UserGroup -> DocumentSet__UserGroup -> DocumentSet
"""
stmt = stmt.outerjoin(DocumentSet__UG).outerjoin(
User__UserGroup,
User__UserGroup.user_group_id == DocumentSet__UG.user_group_id,
)
"""
Filter DocumentSets by:
- if the user is in the user_group that owns the DocumentSet
- if the user is not a global_curator, they must also have a curator relationship
to the user_group
- if editing is being done, we also filter out DocumentSets that are owned by groups
that the user isn't a curator for
- if we are not editing, we show all DocumentSets in the groups the user is a curator
for (as well as public DocumentSets)
"""
where_clause = User__UserGroup.user_id == user.id
if user.role == UserRole.CURATOR and get_editable:
where_clause &= User__UserGroup.is_curator == True # noqa: E712
if get_editable:
user_groups = select(User__UG.user_group_id).where(User__UG.user_id == user.id)
if user.role == UserRole.CURATOR:
user_groups = user_groups.where(User__UG.is_curator == True) # noqa: E712
where_clause &= (
~exists()
.where(DocumentSet__UG.document_set_id == DocumentSetDBModel.id)
.where(~DocumentSet__UG.user_group_id.in_(user_groups))
.correlate(DocumentSetDBModel)
)
else:
where_clause |= DocumentSetDBModel.is_public == True # noqa: E712
return stmt.where(where_clause)
def _delete_document_set_cc_pairs__no_commit(
db_session: Session, document_set_id: int, is_current: bool | None = None
@@ -108,15 +50,11 @@ def delete_document_set_privacy__no_commit(
def get_document_set_by_id(
db_session: Session,
document_set_id: int,
user: User | None = None,
get_editable: bool = True,
db_session: Session, document_set_id: int
) -> DocumentSetDBModel | None:
stmt = select(DocumentSetDBModel).distinct()
stmt = stmt.where(DocumentSetDBModel.id == document_set_id)
stmt = _add_user_filters(stmt=stmt, user=user, get_editable=get_editable)
return db_session.scalar(stmt)
return db_session.scalar(
select(DocumentSetDBModel).where(DocumentSetDBModel.id == document_set_id)
)
def get_document_set_by_name(
@@ -148,45 +86,6 @@ def make_doc_set_private(
raise NotImplementedError("Danswer MIT does not support private Document Sets")
def _check_if_cc_pairs_are_owned_by_groups(
db_session: Session,
cc_pair_ids: list[int],
group_ids: list[int],
) -> None:
"""
This function checks if the CC pairs are owned by the specified groups or public.
If not, it raises a ValueError.
"""
group_cc_pair_relationships = get_cc_pair_groups_for_ids(
db_session=db_session,
cc_pair_ids=cc_pair_ids,
)
group_cc_pair_relationships_set = {
(relationship.cc_pair_id, relationship.user_group_id)
for relationship in group_cc_pair_relationships
}
missing_cc_pair_ids = []
for cc_pair_id in cc_pair_ids:
for group_id in group_ids:
if (cc_pair_id, group_id) not in group_cc_pair_relationships_set:
missing_cc_pair_ids.append(cc_pair_id)
break
if missing_cc_pair_ids:
cc_pairs = get_connector_credential_pairs(
db_session=db_session,
ids=missing_cc_pair_ids,
)
for cc_pair in cc_pairs:
if not cc_pair.is_public:
raise ValueError(
f"Connector Credential Pair with ID: '{cc_pair.id}'"
" is not owned by the specified groups"
)
def insert_document_set(
document_set_creation_request: DocumentSetCreationRequest,
user_id: UUID | None,
@@ -196,12 +95,8 @@ def insert_document_set(
# It's cc-pairs in actuality but the UI displays this error
raise ValueError("Cannot create a document set with no Connectors")
if not document_set_creation_request.is_public:
_check_if_cc_pairs_are_owned_by_groups(
db_session=db_session,
cc_pair_ids=document_set_creation_request.cc_pair_ids,
group_ids=document_set_creation_request.groups or [],
)
# start a transaction
db_session.begin()
try:
new_document_set_row = DocumentSetDBModel(
@@ -236,40 +131,27 @@ def insert_document_set(
)
db_session.commit()
except Exception as e:
except:
db_session.rollback()
logger.error(f"Error creating document set: {e}")
raise
return new_document_set_row, ds_cc_pairs
def update_document_set(
db_session: Session,
document_set_update_request: DocumentSetUpdateRequest,
user: User | None = None,
document_set_update_request: DocumentSetUpdateRequest, db_session: Session
) -> tuple[DocumentSetDBModel, list[DocumentSet__ConnectorCredentialPair]]:
"""If successful, this sets document_set_row.is_up_to_date = False.
That will be processed via Celery in check_for_vespa_sync_task
and trigger a long running background sync to Vespa.
"""
if not document_set_update_request.cc_pair_ids:
# It's cc-pairs in actuality but the UI displays this error
raise ValueError("Cannot create a document set with no Connectors")
if not document_set_update_request.is_public:
_check_if_cc_pairs_are_owned_by_groups(
db_session=db_session,
cc_pair_ids=document_set_update_request.cc_pair_ids,
group_ids=document_set_update_request.groups,
)
# start a transaction
db_session.begin()
try:
# update the description
document_set_row = get_document_set_by_id(
db_session=db_session,
document_set_id=document_set_update_request.id,
user=user,
get_editable=True,
db_session=db_session, document_set_id=document_set_update_request.id
)
if document_set_row is None:
raise ValueError(
@@ -347,26 +229,20 @@ def delete_document_set(
def mark_document_set_as_to_be_deleted(
db_session: Session,
document_set_id: int,
user: User | None = None,
document_set_id: int, db_session: Session
) -> None:
"""Cleans up all document_set -> cc_pair relationships and marks the document set
as needing an update. The actual document set row will be deleted by the background
job which syncs these changes to Vespa."""
# start a transaction
db_session.begin()
try:
document_set_row = get_document_set_by_id(
db_session=db_session,
document_set_id=document_set_id,
user=user,
get_editable=True,
db_session=db_session, document_set_id=document_set_id
)
if document_set_row is None:
error_msg = f"Document set with ID: '{document_set_id}' does not exist "
if user is not None:
error_msg += f"or is not editable by user with email: '{user.email}'"
raise ValueError(error_msg)
raise ValueError(f"No document set with ID: '{document_set_id}'")
if not document_set_row.is_up_to_date:
raise ValueError(
"Cannot delete document set while it is syncing. Please wait "
@@ -465,14 +341,29 @@ def fetch_document_sets(
]
def fetch_all_document_sets_for_user(
db_session: Session,
user: User | None = None,
get_editable: bool = True,
) -> Sequence[DocumentSetDBModel]:
stmt = select(DocumentSetDBModel).distinct()
stmt = _add_user_filters(stmt, user, get_editable=get_editable)
return db_session.scalars(stmt).all()
def fetch_all_document_sets(db_session: Session) -> Sequence[DocumentSetDBModel]:
"""Used for Admin UI where they should have visibility into all document sets"""
return db_session.scalars(select(DocumentSetDBModel)).all()
def fetch_user_document_sets(
user_id: UUID | None, db_session: Session
) -> list[tuple[DocumentSetDBModel, list[ConnectorCredentialPair]]]:
# If Auth is turned off, all document sets become visible
# document sets are not permission enforced, only for organizational purposes
# the documents themselves are permission enforced
if user_id is None:
return fetch_document_sets(
user_id=user_id, db_session=db_session, include_outdated=True
)
versioned_fetch_doc_sets_fn = fetch_versioned_implementation(
"danswer.db.document_set", "fetch_document_sets"
)
return versioned_fetch_doc_sets_fn(
user_id=user_id, db_session=db_session, include_outdated=True
)
def fetch_documents_for_document_set_paginated(
@@ -523,135 +414,42 @@ def fetch_documents_for_document_set_paginated(
return documents, documents[-1].id if documents else None
def construct_document_select_by_docset(
document_set_id: int,
current_only: bool = True,
) -> Select:
"""This returns a statement that should be executed using
.yield_per() to minimize overhead. The primary consumers of this function
are background processing task generators."""
stmt = (
select(Document)
.join(
DocumentByConnectorCredentialPair,
DocumentByConnectorCredentialPair.id == Document.id,
)
.join(
ConnectorCredentialPair,
and_(
ConnectorCredentialPair.connector_id
== DocumentByConnectorCredentialPair.connector_id,
ConnectorCredentialPair.credential_id
== DocumentByConnectorCredentialPair.credential_id,
),
)
.join(
DocumentSet__ConnectorCredentialPair,
DocumentSet__ConnectorCredentialPair.connector_credential_pair_id
== ConnectorCredentialPair.id,
)
.join(
DocumentSetDBModel,
DocumentSetDBModel.id
== DocumentSet__ConnectorCredentialPair.document_set_id,
)
.where(DocumentSetDBModel.id == document_set_id)
.order_by(Document.id)
)
if current_only:
stmt = stmt.where(
DocumentSet__ConnectorCredentialPair.is_current == True # noqa: E712
)
stmt = stmt.distinct()
return stmt
def fetch_document_set_for_document(
document_id: str,
db_session: Session,
) -> list[str]:
"""
Fetches the document set names for a single document ID.
:param document_id: The ID of the document to fetch sets for.
:param db_session: The SQLAlchemy session to use for the query.
:return: A list of document set names, or None if no result is found.
"""
result = fetch_document_sets_for_documents([document_id], db_session)
if not result:
return []
return result[0][1]
def fetch_document_sets_for_documents(
document_ids: list[str],
db_session: Session,
) -> Sequence[tuple[str, list[str]]]:
"""Gives back a list of (document_id, list[document_set_names]) tuples"""
"""Building subqueries"""
# NOTE: have to build these subqueries first in order to guarantee that we get one
# returned row for each specified document_id. Basically, we want to do the filters first,
# then the outer joins.
# don't include CC pairs that are being deleted
# NOTE: CC pairs can never go from DELETING to any other state -> it's safe to ignore them
# as we can assume their document sets are no longer relevant
valid_cc_pairs_subquery = aliased(
ConnectorCredentialPair,
select(ConnectorCredentialPair)
.where(
ConnectorCredentialPair.status != ConnectorCredentialPairStatus.DELETING
) # noqa: E712
.subquery(),
)
valid_document_set__cc_pairs_subquery = aliased(
DocumentSet__ConnectorCredentialPair,
select(DocumentSet__ConnectorCredentialPair)
.where(DocumentSet__ConnectorCredentialPair.is_current == True) # noqa: E712
.subquery(),
)
"""End building subqueries"""
stmt = (
select(
Document.id,
func.coalesce(
func.array_remove(func.array_agg(DocumentSetDBModel.name), None), []
).label("document_set_names"),
select(Document.id, func.array_agg(DocumentSetDBModel.name))
.join(
DocumentSet__ConnectorCredentialPair,
DocumentSetDBModel.id
== DocumentSet__ConnectorCredentialPair.document_set_id,
)
# Here we select document sets by relation:
# Document -> DocumentByConnectorCredentialPair -> ConnectorCredentialPair ->
# DocumentSet__ConnectorCredentialPair -> DocumentSet
.outerjoin(
.join(
ConnectorCredentialPair,
ConnectorCredentialPair.id
== DocumentSet__ConnectorCredentialPair.connector_credential_pair_id,
)
.join(
DocumentByConnectorCredentialPair,
Document.id == DocumentByConnectorCredentialPair.id,
)
.outerjoin(
valid_cc_pairs_subquery,
and_(
DocumentByConnectorCredentialPair.connector_id
== valid_cc_pairs_subquery.connector_id,
== ConnectorCredentialPair.connector_id,
DocumentByConnectorCredentialPair.credential_id
== valid_cc_pairs_subquery.credential_id,
== ConnectorCredentialPair.credential_id,
),
)
.outerjoin(
valid_document_set__cc_pairs_subquery,
valid_cc_pairs_subquery.id
== valid_document_set__cc_pairs_subquery.connector_credential_pair_id,
)
.outerjoin(
DocumentSetDBModel,
DocumentSetDBModel.id
== valid_document_set__cc_pairs_subquery.document_set_id,
.join(
Document,
Document.id == DocumentByConnectorCredentialPair.id,
)
.where(Document.id.in_(document_ids))
# don't include CC pairs that are being deleted
# NOTE: CC pairs can never go from DELETING to any other state -> it's safe to ignore them
# as we can assume their document sets are no longer relevant
.where(ConnectorCredentialPair.status != ConnectorCredentialPairStatus.DELETING)
.where(DocumentSet__ConnectorCredentialPair.is_current == True) # noqa: E712
.group_by(Document.id)
)
return db_session.execute(stmt).all() # type: ignore
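
The per-document aggregation in fetch_document_sets_for_documents relies on Postgres array functions: array_agg collects the set names contributed by the outer joins, array_remove drops the NULLs those joins introduce, and coalesce guards against a NULL aggregate. A small sketch that only builds and prints the SQL for that expression (simplified two-table schema, no database connection required):

from sqlalchemy import Column, Integer, String, func, select
from sqlalchemy.dialects import postgresql
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Doc(Base):
    """Simplified stand-in for Document."""
    __tablename__ = "doc"
    id = Column(String, primary_key=True)


class DocSet(Base):
    """Simplified stand-in for DocumentSet, flattened onto the document for brevity."""
    __tablename__ = "doc_set"
    id = Column(Integer, primary_key=True)
    name = Column(String)
    doc_id = Column(String)


# One row per document with its (possibly empty) list of set names.
stmt = (
    select(
        Doc.id,
        func.coalesce(
            func.array_remove(func.array_agg(DocSet.name), None), []
        ).label("document_set_names"),
    )
    .outerjoin(DocSet, DocSet.doc_id == Doc.id)
    .group_by(Doc.id)
)
print(stmt.compile(dialect=postgresql.dialect()))
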


@@ -0,0 +1,157 @@
from sqlalchemy import select
from sqlalchemy.orm import Session
from danswer.configs.model_configs import ASYM_PASSAGE_PREFIX
from danswer.configs.model_configs import ASYM_QUERY_PREFIX
from danswer.configs.model_configs import DEFAULT_DOCUMENT_ENCODER_MODEL
from danswer.configs.model_configs import DOC_EMBEDDING_DIM
from danswer.configs.model_configs import DOCUMENT_ENCODER_MODEL
from danswer.configs.model_configs import NORMALIZE_EMBEDDINGS
from danswer.configs.model_configs import OLD_DEFAULT_DOCUMENT_ENCODER_MODEL
from danswer.configs.model_configs import OLD_DEFAULT_MODEL_DOC_EMBEDDING_DIM
from danswer.configs.model_configs import OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS
from danswer.db.llm import fetch_embedding_provider
from danswer.db.models import CloudEmbeddingProvider
from danswer.db.models import EmbeddingModel
from danswer.db.models import IndexModelStatus
from danswer.indexing.models import EmbeddingModelDetail
from danswer.natural_language_processing.search_nlp_models import clean_model_name
from danswer.server.manage.embedding.models import (
CloudEmbeddingProvider as ServerCloudEmbeddingProvider,
)
from danswer.utils.logger import setup_logger
logger = setup_logger()
def create_embedding_model(
model_details: EmbeddingModelDetail,
db_session: Session,
status: IndexModelStatus = IndexModelStatus.FUTURE,
) -> EmbeddingModel:
embedding_model = EmbeddingModel(
model_name=model_details.model_name,
model_dim=model_details.model_dim,
normalize=model_details.normalize,
query_prefix=model_details.query_prefix,
passage_prefix=model_details.passage_prefix,
status=status,
cloud_provider_id=model_details.cloud_provider_id,
# Every single embedding model except the initial one from migrations has this name
# The initial one from migration is called "danswer_chunk"
index_name=f"danswer_chunk_{clean_model_name(model_details.model_name)}",
)
db_session.add(embedding_model)
db_session.commit()
return embedding_model
def get_model_id_from_name(
db_session: Session, embedding_provider_name: str
) -> int | None:
query = select(CloudEmbeddingProvider).where(
CloudEmbeddingProvider.name == embedding_provider_name
)
provider = db_session.execute(query).scalars().first()
return provider.id if provider else None
def get_current_db_embedding_provider(
db_session: Session,
) -> ServerCloudEmbeddingProvider | None:
current_embedding_model = EmbeddingModelDetail.from_model(
get_current_db_embedding_model(db_session=db_session)
)
if (
current_embedding_model is None
or current_embedding_model.cloud_provider_id is None
):
return None
embedding_provider = fetch_embedding_provider(
db_session=db_session, provider_id=current_embedding_model.cloud_provider_id
)
if embedding_provider is None:
raise RuntimeError("No embedding provider exists for this model.")
current_embedding_provider = ServerCloudEmbeddingProvider.from_request(
cloud_provider_model=embedding_provider
)
return current_embedding_provider
def get_current_db_embedding_model(db_session: Session) -> EmbeddingModel:
query = (
select(EmbeddingModel)
.where(EmbeddingModel.status == IndexModelStatus.PRESENT)
.order_by(EmbeddingModel.id.desc())
)
result = db_session.execute(query)
latest_model = result.scalars().first()
if not latest_model:
raise RuntimeError("No embedding model selected, DB is not in a valid state")
return latest_model
def get_secondary_db_embedding_model(db_session: Session) -> EmbeddingModel | None:
query = (
select(EmbeddingModel)
.where(EmbeddingModel.status == IndexModelStatus.FUTURE)
.order_by(EmbeddingModel.id.desc())
)
result = db_session.execute(query)
latest_model = result.scalars().first()
return latest_model
def update_embedding_model_status(
embedding_model: EmbeddingModel, new_status: IndexModelStatus, db_session: Session
) -> None:
embedding_model.status = new_status
db_session.commit()
def user_has_overridden_embedding_model() -> bool:
return DOCUMENT_ENCODER_MODEL != DEFAULT_DOCUMENT_ENCODER_MODEL
def get_old_default_embedding_model() -> EmbeddingModel:
is_overridden = user_has_overridden_embedding_model()
return EmbeddingModel(
model_name=(
DOCUMENT_ENCODER_MODEL
if is_overridden
else OLD_DEFAULT_DOCUMENT_ENCODER_MODEL
),
model_dim=(
DOC_EMBEDDING_DIM if is_overridden else OLD_DEFAULT_MODEL_DOC_EMBEDDING_DIM
),
normalize=(
NORMALIZE_EMBEDDINGS
if is_overridden
else OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS
),
query_prefix=(ASYM_QUERY_PREFIX if is_overridden else ""),
passage_prefix=(ASYM_PASSAGE_PREFIX if is_overridden else ""),
status=IndexModelStatus.PRESENT,
index_name="danswer_chunk",
)
def get_new_default_embedding_model(is_present: bool) -> EmbeddingModel:
return EmbeddingModel(
model_name=DOCUMENT_ENCODER_MODEL,
model_dim=DOC_EMBEDDING_DIM,
normalize=NORMALIZE_EMBEDDINGS,
query_prefix=ASYM_QUERY_PREFIX,
passage_prefix=ASYM_PASSAGE_PREFIX,
status=IndexModelStatus.PRESENT if is_present else IndexModelStatus.FUTURE,
index_name=f"danswer_chunk_{clean_model_name(DOCUMENT_ENCODER_MODEL)}",
)


@@ -1,11 +1,9 @@
import contextlib
import time
from collections.abc import AsyncGenerator
from collections.abc import Generator
from datetime import datetime
from typing import ContextManager
from sqlalchemy import event
from sqlalchemy import text
from sqlalchemy.engine import create_engine
from sqlalchemy.engine import Engine
@@ -15,8 +13,6 @@ from sqlalchemy.ext.asyncio import create_async_engine
from sqlalchemy.orm import Session
from sqlalchemy.orm import sessionmaker
from danswer.configs.app_configs import LOG_POSTGRES_CONN_COUNTS
from danswer.configs.app_configs import LOG_POSTGRES_LATENCY
from danswer.configs.app_configs import POSTGRES_DB
from danswer.configs.app_configs import POSTGRES_HOST
from danswer.configs.app_configs import POSTGRES_PASSWORD
@@ -45,58 +41,6 @@ _ASYNC_ENGINE: AsyncEngine | None = None
SessionFactory: sessionmaker[Session] | None = None
if LOG_POSTGRES_LATENCY:
# Function to log before query execution
@event.listens_for(Engine, "before_cursor_execute")
def before_cursor_execute( # type: ignore
conn, cursor, statement, parameters, context, executemany
):
conn.info["query_start_time"] = time.time()
# Function to log after query execution
@event.listens_for(Engine, "after_cursor_execute")
def after_cursor_execute( # type: ignore
conn, cursor, statement, parameters, context, executemany
):
total_time = time.time() - conn.info["query_start_time"]
# don't spam TOO hard
if total_time > 0.1:
logger.debug(
f"Query Complete: {statement}\n\nTotal Time: {total_time:.4f} seconds"
)
if LOG_POSTGRES_CONN_COUNTS:
# Global counter for connection checkouts and checkins
checkout_count = 0
checkin_count = 0
@event.listens_for(Engine, "checkout")
def log_checkout(dbapi_connection, connection_record, connection_proxy): # type: ignore
global checkout_count
checkout_count += 1
active_connections = connection_proxy._pool.checkedout()
idle_connections = connection_proxy._pool.checkedin()
pool_size = connection_proxy._pool.size()
logger.debug(
"Connection Checkout\n"
f"Active Connections: {active_connections};\n"
f"Idle: {idle_connections};\n"
f"Pool Size: {pool_size};\n"
f"Total connection checkouts: {checkout_count}"
)
@event.listens_for(Engine, "checkin")
def log_checkin(dbapi_connection, connection_record): # type: ignore
global checkin_count
checkin_count += 1
logger.debug(f"Total connection checkins: {checkin_count}")
"""END DEBUGGING LOGGING"""
def get_db_current_time(db_session: Session) -> datetime:
"""Get the current time from Postgres representing the start of the transaction
Within the same transaction this value will not update
@@ -184,7 +128,7 @@ async def get_async_session() -> AsyncGenerator[AsyncSession, None]:
async def warm_up_connections(
sync_connections_to_warm_up: int = 20, async_connections_to_warm_up: int = 20
sync_connections_to_warm_up: int = 10, async_connections_to_warm_up: int = 10
) -> None:
sync_postgres_engine = get_sqlalchemy_engine()
connections = [


@@ -6,15 +6,6 @@ class IndexingStatus(str, PyEnum):
IN_PROGRESS = "in_progress"
SUCCESS = "success"
FAILED = "failed"
COMPLETED_WITH_ERRORS = "completed_with_errors"
def is_terminal(self) -> bool:
terminal_states = {
IndexingStatus.SUCCESS,
IndexingStatus.COMPLETED_WITH_ERRORS,
IndexingStatus.FAILED,
}
return self in terminal_states
# these may differ in the future, which is why we're okay with this duplication
@@ -48,6 +39,3 @@ class ConnectorCredentialPairStatus(str, PyEnum):
ACTIVE = "ACTIVE"
PAUSED = "PAUSED"
DELETING = "DELETING"
def is_active(self) -> bool:
return self == ConnectorCredentialPairStatus.ACTIVE

Some files were not shown because too many files have changed in this diff.