Compare commits

164 Commits

Author SHA1 Message Date
hagen-danswer
0c13c9a106 updates 2024-12-15 15:12:55 -08:00
hagen-danswer
24525ca3d8 all done 2024-12-13 12:18:16 -08:00
hagen-danswer
8b8030a61f Merge pull request #3379 from onyx-dot-app/hagen-initial-branch
refactor
2024-12-12 15:55:22 -08:00
hagen-danswer
f4e8ac1dde 2 graphs down 2024-12-12 15:54:13 -08:00
hagen-danswer
ad4df04159 done with expanded retrieval 2024-12-12 15:20:39 -08:00
hagen-danswer
f5e28e0f54 minor refactor 2024-12-09 11:34:11 -08:00
hagen-danswer
091cb136c4 got core qa graph working 2024-12-07 12:25:54 -08:00
hagen-danswer
56052c5b4b imports 2024-12-07 06:09:57 -08:00
hagen-danswer
617726207b all 3 graphs r done 2024-12-07 06:06:22 -08:00
hagen-danswer
1be58e74b3 Finished primary graph 2024-12-06 11:01:03 -08:00
hagen-danswer
a693c991d7 Merge remote-tracking branch 'origin/agent-search-a' into initial-implementation 2024-12-04 15:42:58 -08:00
hagen-danswer
ef9942b751 Related permission docs to cc_pair to prevent orphan docs (#3336)
* Related permission docs to cc_pair to prevent orphan docs

* added script

* group sync deduping

* logging
2024-12-04 21:00:54 +00:00
pablodanswer
993acec5e9 Update memoization + silence unnecessary errors (#3337)
* update memoization + silence unnecessary errors

* proper org
2024-12-04 20:08:15 +00:00
Weves
b01a1b509a Add basic loadtest script 2024-12-04 10:53:48 -08:00
pablodanswer
4f994124ef remove now unnecessary user loading indicator log (#3333) 2024-12-04 00:09:22 +00:00
rkuo-danswer
14863bd457 try single threaded playwright testing (#3322) 2024-12-03 23:21:46 +00:00
Yuhong Sun
aa1c4c635a Combining Search and Chat Backend (#3273)
* k

* k

* fix slack issues

* rebase

* k
2024-12-03 22:37:14 +00:00
rkuo-danswer
13f6e8a6b4 disable thread local locking in callbacks (#3319) 2024-12-03 22:32:56 +00:00
pablodanswer
66f47d294c Shared filter utility for clarity (#3270)
* shared filter util

* clearer comment
2024-12-03 19:30:42 +00:00
pablodanswer
0a685bda7d add comments for clarity (#3249) 2024-12-03 19:27:28 +00:00
pablodanswer
23dc8b5dad Search flow improvements (#3314)
* untoggle if no docs

* update

* nits

* nit

* typing

* nit
2024-12-03 18:56:27 +00:00
pablodanswer
cd5f2293ad Temperature (#3310)
* fix temperatures for default llm

* ensure anthropic models don't overflow

* minor cleanup

* k

* k

* k

* fix typing
2024-12-03 17:22:22 +00:00
rkuo-danswer
6c2269e565 refactor celery task names to constants (#3296) 2024-12-03 16:02:17 +00:00
Weves
46315cddf1 Adjust default Confluence timezone 2024-12-02 22:25:29 -08:00
rkuo-danswer
5f28a1b0e4 Bugfix/confluence time zone (#3265)
* RedisLock typing

* checkpoint

* put in debug logging

* improve comments

* mypy fixes
2024-12-02 22:23:23 -08:00
rkuo-danswer
9e9b7ed61d Bugfix/connector aborted logging (#3309)
* improve error logging on task failure.

* add db exception hardening to the indexing watchdog

* log on db exception
2024-12-03 02:34:40 +00:00
pablodanswer
3fb2bfefec Update Chromatic Tests (#3300)
* remove / update search tests

* minor update
2024-12-02 23:08:54 +00:00
pablodanswer
7c618c9d17 Unified UI (#3308)
* fix typing

* add filters display
2024-12-02 15:12:13 -08:00
pablodanswer
03e2789392 Text embedding (PDF, TXT) (#3113)
* add text embedding

* post rebase cleanup

* fully functional post rebase

* rm logs

* rm '

* quick clean up

* k
2024-12-02 22:43:53 +00:00
Chris Weaver
2783fa08a3 Update openai version in model server (#3306) 2024-12-02 21:39:10 +00:00
pablodanswer
edeaee93a2 hard refresh on auth (#3305)
* hard refresh on auth

* k

* k

* comment for clarity
2024-12-02 20:12:12 +00:00
hagen-danswer
5385bae100 Add slim connector description (#3303)
* added docs example and test

* updated docs

* needed to make the tests run

* updated docs
2024-12-02 19:52:13 +00:00
pablodanswer
813445ab59 Minor JWT Feature (#3290)
* first pass

* k

* k

* finalize

* minor cleanup

* k

* address

* minor typing updates
2024-12-02 19:14:31 +00:00
pablodanswer
af814823c8 display name + model truncation (#3304) 2024-12-02 18:54:08 +00:00
pablodanswer
607f61eaeb Reusable function for search settings spread operation (#3301)
* combine for clarity once and for all

* remove logs

* k
2024-12-02 17:23:01 +00:00
hagen-danswer
4b28686721 Added Initial Implementation of the Agent Search Graph 2024-12-02 07:16:08 -08:00
pablodanswer
de66f7adb2 Updated chat flow (#3244)
* proper no assistant typing + no assistant modal

* updated chat flow

* k

* updates

* update

* k

* clean up

* fix mystery reorg

* cleanup

* update scroll

* default

* update logs

* push fade

* scroll nit

* finalize tags

* updates

* k

* various updates

* viewport height update

* source types update

* clean up unused components

* minor cleanup

* cleanup complete

* finalize changes

* badge up

* update filters

* small nit

* k

* k

* address comments

* quick unification of icons

* minor date range clarity

* minor nit

* k

* update sidebar line

* update for all screen sizes

* k

* k

* k

* k

* rm shs

* fix memoization

* fix memoization

* slack chat

* k

* k

* build org
2024-12-02 01:58:28 +00:00
Yuhong Sun
3432d932d1 Citation code comments 2024-12-01 14:10:11 -08:00
Yuhong Sun
9bd0cb9eb5 Fix Citation Minor Bugs (#3294) 2024-12-01 13:55:24 -08:00
Chris Weaver
f12eb4a5cf Fix assistant prompt zero-ing (#3293) 2024-11-30 04:45:40 +00:00
Chris Weaver
16863de0aa Improve model token limit detection (#3292)
* Properly find context window for ollama llama

* Better ollama support + upgrade litellm

* Upgrade OpenAI as well

* Fix mypy
2024-11-30 04:42:56 +00:00
Weves
63d1eefee5 Add read_only=True for xlsx parsing 2024-11-28 16:02:02 -08:00
pablodanswer
e338677896 order seeding 2024-11-28 15:41:10 -08:00
hagen-danswer
7be80c4af9 increased the pagination limit for confluence spaces (#3288) 2024-11-28 19:04:38 +00:00
rkuo-danswer
7f1e4a02bf Feature/kill indexing (#3213)
* checkpoint

* add celery termination of the task

* rename to RedisConnectorPermissionSyncPayload, add RedisLock to more places, add get_active_search_settings

* rename payload

* pretty sure these weren't named correctly

* testing in progress

* cleanup

* remove space

* merge fix

* three dots animation on Pausing

* improve messaging when connector is stopped or killed and animate buttons

---------

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
2024-11-28 05:32:45 +00:00
rkuo-danswer
5be7d27285 use indexing flag in db for manually triggering indexing (#3264)
* use indexing flag in db for manually trigger indexing

* add comment.

* only try to release the lock if we actually succeeded with the lock

* ensure we don't trigger manual indexing on anything but the primary search settings

* comment usage of primary search settings

* run check for indexing immediately after indexing triggers are set

* reorder fix
2024-11-28 01:34:34 +00:00
Weves
fd84b7a768 Remove duplicate API key router 2024-11-27 16:30:59 -08:00
Subash-Mohan
36941ae663 fix: Cannot configure API keys #3191 2024-11-27 16:25:00 -08:00
Matthew Holland
212353ed4a Fixed default feedback options 2024-11-27 16:23:52 -08:00
Richard Kuo (Danswer)
eb8708f770 the word "error" might be throwing off sentry 2024-11-27 14:31:21 -08:00
Chris Weaver
ac448956e9 Add handling for rate limiting (#3280) 2024-11-27 14:22:15 -08:00
pablodanswer
634a0b9398 no stack by default (#3278) 2024-11-27 20:58:21 +00:00
hagen-danswer
09d3e47c03 Perm sync behavior change (#3262)
* Change external permissions behavior

* fixed behavior

* added error handling

* LLM the goat

* comment

* simplify

* fixed

* done

* limits increased

* added a ton of logging

* uhhhh
2024-11-27 20:04:15 +00:00
pablodanswer
9c0cc94f15 refresh router -> refresh assistants (#3271) 2024-11-27 19:11:58 +00:00
hagen-danswer
07dfde2209 add continue in danswer button to slack bot responses (#3239)
* all done except routing

* fixed initial changes

* added backend endpoint for duplicating a chat session from Slack

* got chat duplication routing done

* got login routing working

* improved answer handling

* finished all checks

* finished all!

* made sure it works with google oauth

* dont remove that lol

* fixed weird thing

* bad comments
2024-11-27 18:25:38 +00:00
pablodanswer
28e2b78b2e Fix search dropdown (#3269)
* validate dropdown

* validate

* update organization

* move to utils
2024-11-27 16:10:07 +00:00
Emerson Gomes
0553062ac6 Adds icons for Google Gemini models and custom model icons for L… (#3218)
* Add description for Google Gemini models and custom model icons for LiteLLM (OpenAI) proxied models

* Adds Vertex AI aliases for Claude

---------

Co-authored-by: Emerson Gomes <emerson.gomes@thalesgroup.com>
2024-11-26 10:13:21 -08:00
hagen-danswer
284e375ba3 Merge pull request #3257 from danswer-ai/minor-perm-sync
Improved logging for confluence doc sync and robust user creation
2024-11-26 09:59:38 -08:00
hagen-danswer
1f2f7d0ac2 Improved logging for confluence doc sync and robust user creation 2024-11-26 08:51:15 -08:00
pablodanswer
2ecc28b57d remove unused stripe promise (#3248) 2024-11-26 01:50:39 +00:00
rkuo-danswer
77cf9b3539 improve messaging and UI around cleanup of leftover index attempts (#3247)
* improve messaging and UI around cleanup of leftover index attempts

* add tag on init
2024-11-25 22:27:14 +00:00
Weves
076ce2ebd0 Saml fix 2024-11-25 09:12:43 -08:00
pablodanswer
b625ee32a7 File handling cleanup (#3240)
* fix google sites connector

* minor cleanup

* rm comments
2024-11-25 04:06:47 +00:00
Richard Kuo (Danswer)
c32b93fcc3 increase indexing worker concurrency to 3 2024-11-24 18:11:58 -08:00
pablodanswer
1c8476072e Assistant cleanup (#3236)
* minor cleanup

* ensure users don't modify built-in attributes of assistants

* update sidebar

* k

* update update flow + assistant creation
2024-11-25 00:13:34 +00:00
Chris Weaver
7573416ca1 Fix API keys for MIT users (#3237) 2024-11-24 16:55:19 -08:00
Yuhong Sun
86d8666481 Add Test Case 2024-11-24 15:42:14 -08:00
Yuhong Sun
8abcde91d4 Fix Test (#3242) 2024-11-24 14:31:28 -08:00
Yuhong Sun
3466451d51 Fix Prompt for Non Function Calling LLMs (#3241) 2024-11-24 14:16:57 -08:00
Yuhong Sun
413891f143 Token Level Log (#3238) 2024-11-23 18:41:50 -08:00
Yuhong Sun
7a0a4d4b79 Remove Deprecated Endpoints (#3235) 2024-11-23 14:44:23 -08:00
Yuhong Sun
a3439605a5 Remove Dead Code (#3234) 2024-11-23 14:31:59 -08:00
pablodanswer
694e79f5e1 minor enforcement of CSV length for internal processing (#3109) 2024-11-23 21:05:30 +00:00
pablodanswer
5dfafc8612 minor calendar cleanup (#3219) 2024-11-23 21:01:05 +00:00
Yuhong Sun
62a4aa10db Refactor Search (#3233) 2024-11-23 13:42:54 -08:00
Yuhong Sun
a357cdc4c9 Remove Dead Code (#3232) 2024-11-23 13:21:27 -08:00
Yuhong Sun
84615abfdd Seeding (#3231) 2024-11-23 13:12:42 -08:00
pablodanswer
8ae6b1960b Bugfix/usage report (#3075)
* fix pagination

* update side

* fixed query history

* minor update

* minor update

* typing
2024-11-23 20:11:39 +00:00
James Jordan
d9b87bbbc2 Fixed 400 error when author of ticket is no longer an active user in a Zendesk account. (#3168) 2024-11-23 12:15:38 -08:00
Sanju Lokuhitige
a0065b01af Update CONTRIBUTING.md (#3112)
fix Formatting and Linting hyperlink
2024-11-23 12:13:23 -08:00
pablodanswer
c5306148a3 Ensure daterange not consistently re rendered (#3229)
* ensure daterange not consistently re rendered

* minor clean up
2024-11-23 19:35:00 +00:00
hagen-danswer
1e17934de4 Merge pull request #3214 from danswer-ai/fix-slack-ui
cleaned up new slack bot creation
2024-11-23 10:53:47 -08:00
pablodanswer
93add96ccc Various Nits (#3228) 2024-11-23 10:53:24 -08:00
rkuo-danswer
3a466a4b08 add minimal retries to confluence probe (#3222)
* add minimal retries to confluence probe

* name variable correctly
2024-11-23 17:11:15 +00:00
hagen-danswer
85cbd9caed Increased slim doc batch size for confluence connector (#3221) 2024-11-23 00:42:15 +00:00
pablodanswer
9dc23bf3e7 revert to previous doc select logic (#3217)
* revert to previous doc select logic

* k
2024-11-22 23:26:53 +00:00
hagen-danswer
e32809f7ca moved it outside 2024-11-22 14:59:58 -08:00
hagen-danswer
3e58f9f8ab fixed ugly stuff 2024-11-22 14:39:55 -08:00
pablodanswer
2381c8d498 Refresh all assistants on assistant refresh (#3216)
* k

* k
2024-11-22 22:38:23 +00:00
hagen-danswer
c6dadb24dc cleaned up new slack bot creation 2024-11-22 11:53:51 -08:00
hagen-danswer
5dc07d4178 Each section is now cleaned before being chunked (#3210)
* Each section is now cleaned before being chunked

* k

---------

Co-authored-by: Yuhong Sun <yuhongsun96@gmail.com>
2024-11-22 19:06:19 +00:00
Chris Weaver
129c8f8faf Add start/end date ability for query history as CSV endpoint (#3211) 2024-11-22 18:29:13 +00:00
pablodanswer
67bfcabbc5 llm provider causing re render in effect (#3205)
* llm provider causing re render in effect

* clean

* unused

* k
2024-11-22 16:53:24 +00:00
rkuo-danswer
9819aa977a implement double check pattern for error conditions (#3201)
* Move unfenced check to check_for_indexing. implement a double check pattern for all indexing error checks

* improved commenting

* exclusions
2024-11-22 04:21:02 +00:00
hagen-danswer
8d5b8a4028 Merge pull request #3202 from danswer-ai/toggled_chat_default
Update default sidebar toggle
2024-11-21 19:53:05 -08:00
pablodanswer
682319d2e9 Bugfix/curator interface (#3198)
* mystery solved

* update config

* update

* update

* update user role

* remove values
2024-11-22 02:33:09 +00:00
hagen-danswer
fe1400aa36 replace deprecated confluence group api endpoint (#3197)
* replace deprecated confluence group api endpoint

* reworked it

* properly escaped the user query

* less passing around is_cloud

* done
2024-11-22 01:51:29 +00:00
pablodanswer
e3573b2bc1 add comment 2024-11-21 17:11:11 -08:00
pablodanswer
35b5c44cc7 update default sidebar toggle 2024-11-21 17:09:56 -08:00
rkuo-danswer
5eddc89b5a merge indexing and heartbeat callbacks (and associated lock reacquisi… (#3178)
* merge indexing and heartbeat callbacks (and associated lock reacquisition). no db updates

* review fixes
2024-11-21 23:48:58 +00:00
hagen-danswer
9a492ceb6d admins cant be set as curator on backend (#3194)
* set-curator

* updated error
2024-11-21 23:33:29 +00:00
rkuo-danswer
3c54ae9de9 Bugfix/redis wait (#3169)
* rename to payload

* log redis info replication on primary worker startup

* fix mypy

---------

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
2024-11-21 23:11:00 +00:00
pablodanswer
13f08f3ebb Horizontal scrollbar (#3195)
* clean horizontal scrollbar

* account for additional edge case
2024-11-21 22:08:21 +00:00
pablodanswer
bd9f15854f provider fix (#3187)
* clean horizontal scrollbar

* provider fix

* ensure proper migration

* k

* update migration

* Revert "clean horizontal scrollbar"

This reverts commit fa592a1b7a.
2024-11-21 22:08:16 +00:00
pablodanswer
366aa2a8ea quick fix (#3200) 2024-11-21 14:07:55 -08:00
pablodanswer
deee237c7e Sheet update (#3189)
* quick pass

* k

* update sheet

* add multiple sheet stuff

* k

* finalized

* update configuration
2024-11-21 18:07:00 +00:00
hagen-danswer
100b4a0d16 Added Slim connector for Jira (#3181)
* Added Slim connector for Jira

* fixed testing

* more cleanup of Jira connector

* cleanup
2024-11-21 17:00:20 +00:00
rkuo-danswer
70207b4b39 improve web testing (#3162)
* shared admin level test dependency

* change to on - push (recommended by chromatic)

* change playwright reporter to list, name test jobs

* use test tags ... much cleaner

* test vs prod

* try copying templates

* run with localhost?

* revert to dev

* new tests and a bit of refactoring

* add additional checks so that page snapshots reflect loaded state

* more admin tests

* User Management tests

* remaining admin pages

* test search and chat

* await fix and exclude UI that changes with dates.
2024-11-21 04:01:15 +00:00
pablodanswer
50826b6bef Formatting Niceties (#3183)
* search bar formatting

* update styling
2024-11-21 03:11:26 +00:00
pablodanswer
3f648cbc31 Folder clarity (#3180)
* folder clarity

* k
2024-11-21 03:11:17 +00:00
pablodanswer
c875a4774f valid props (#3186) 2024-11-21 01:13:54 +00:00
hagen-danswer
049091eb01 decreased confluence retry times and added more logging (#3184)
* decreased confluence retry times and added more logging

* added check on connector startup

* no retries!

* fr no retries
2024-11-21 00:00:14 +00:00
pablodanswer
3dac24542b silence small error (#3182) 2024-11-20 22:46:38 +00:00
pablodanswer
194dcb593d update slack redirect + token missing check (#3179)
* update slack redirect + token missing check

* reset time
2024-11-20 21:42:54 +00:00
pablodanswer
bf291d0c0a Fix missing json (#3177)
* initial steps

* k

* remove logs

* k

* k
2024-11-20 21:24:43 +00:00
rkuo-danswer
8309f4a802 test overlapping connectors (but using a source that is way too big a… (#3152)
* test overlapping connectors (but using a source that is way too big and slow, fix that next)

* pass thru secrets

* rename

* rename again

* now we are fixing it

---------

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
2024-11-20 21:12:01 +00:00
pablodanswer
0ff2565125 ensure margin properly applied (#3176)
* ensure margin properly applied

* formatting
2024-11-20 20:04:45 +00:00
hagen-danswer
e89dcd7f84 added logging and bugfixing to conf (#3167)
* standardized escaping of CQL strings

* think i found it

* fix

* should be fixed

* added handling for special linking behavior in confluence

* Update onyx_confluence.py

* Update onyx_confluence.py

---------

Co-authored-by: rkuo-danswer <rkuo@danswer.ai>
2024-11-20 18:40:21 +00:00
pablodanswer
645e7e828e Add Google Tag Manager for Web Cloud Build (#3173)
* add gtm for cloud build

* update github workflow
2024-11-20 17:38:33 +00:00
pablodanswer
2a54f14195 ensure everything has a default max height in selectorformfield (#3174) 2024-11-20 17:26:22 +00:00
hagen-danswer
9209fc804b multiple slackbot support (#3077)
* multiple slackbot support

* app_id + tenant_id key

* removed kv store stuff

* fixed up mypy and migration

* got frontend working for multiple slack bots

* some frontend stuff

* alembic fix

* might be valid

* refactor dun

* alembic stuff

* temp frontend stuff

* alembic stuff

* maybe fixed alembic

* maybe dis fix

* im getting mad

* api names changed

* tested

* almost done

* done

* routing nonsense

* done!

* done!!

* fr done

* doneski

* fix alembic migration

* getting mad again

* PLEASE IM BEGGING YOU
2024-11-20 01:49:43 +00:00
rkuo-danswer
b712877701 Merge pull request #3165 from danswer-ai/bugfix/pruning_logs
improve logging around pruning
2024-11-19 13:19:31 -08:00
Richard Kuo (Danswer)
e6df32dcc3 improve logging around pruning 2024-11-19 12:41:21 -08:00
Chris Weaver
eb81258a23 Update README.md
Fix slack link
2024-11-19 08:02:35 -08:00
hagen-danswer
487ef4acc0 Merge pull request #3160 from danswer-ai/add-to-admin-chat-sessions-api
Extend query history API
2024-11-19 07:28:12 -08:00
pablodanswer
9b7cc83eae add new date search filter (#3065)
* add new complicated filters

* clarity updates

* update date range filter
2024-11-19 03:42:42 +00:00
Weves
ce3124f9e4 Extend query history API 2024-11-18 17:50:21 -08:00
rkuo-danswer
e69303e309 add helpful hint on 507 (#3157)
* add helpful hint on 507

* add helpful hint to the direct exception in _index_vespa_chunk
2024-11-19 01:08:32 +00:00
rkuo-danswer
6e698ac84a Hardening deletion when cc pair relationships are left over (#3154)
* more logs

* this fence should be set to None

* type hinting

* reset deletion attempt if conditions are inconsistent

* always clean up in db if we reach reconciliation

* add reset method

* more logging

* harden up error checking
2024-11-19 01:07:59 +00:00
pablodanswer
d69180aeb8 add additional theming options (#3155)
* add additional theming options

* nit

* Update Filters.tsx
2024-11-18 22:56:48 +00:00
rkuo-danswer
aa37051be9 Bugfix/indexing redux (#3151)
* raise indexing lock timeout

* refactor unknown index attempts and redis lock
2024-11-18 22:47:31 +00:00
pablodanswer
a7d95661b3 Add assistant categories (#3064)
* add assistant categories v1

* functionality finalized

* finalize

* update assistant category display

* nit

* add tests

* post rebase update

* minor update to tests

* update typing

* finalize

* typing

* nit

* alembic

* alembic (once again)
2024-11-18 20:33:48 +00:00
Chris Weaver
33ee899408 Long term logs (#3150) 2024-11-18 10:48:03 -08:00
hagen-danswer
954b5b2a56 Made external permissioned users and slack users show diff (#3147)
* Made external permissioned users and slack users show diff

* finished

* Fix typing

* k

* Fix

* k

---------

Co-authored-by: Weves <chrisweaver101@gmail.com>
2024-11-17 01:13:47 +00:00
pablodanswer
521425a4f2 nits + pricing 2024-11-16 16:28:37 -08:00
hagen-danswer
618bc02d54 Fixed int test (#3148) 2024-11-16 18:13:06 +00:00
rkuo-danswer
b7de74fdf8 Feature/playwright tests (#3129)
* initial PoC

* preliminary working config

* first cut at chromatic tests

* first cut at chromatic tests

* fix yaml

* fix yaml again

* use workingDir

* adapt playwright example

* remove env

* fix working directory

* fix more paths

* fix dir

* add playwright setup

* accidentally deleted a step

* update test

* think we don't need home.png right now

* remove unused home.png

---------

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
2024-11-16 04:26:17 +00:00
hagen-danswer
6e83fe3a39 reworked drive+confluence frontend and implied backend changes (#3143)
* reworked drive+confluence frontend and implied backend changes

* fixed oauth admin tests

* fixed service account tests

* frontend cleanup

* copy change

* details!

* added key

* so good

* whoops!

* fixed mnore treljsertjoslijt

* has issue with boolean form

* should be done
2024-11-16 03:38:30 +00:00
Weves
259fc049b7 Add error message on JSON decode error in CustomTool 2024-11-15 20:00:12 -08:00
rkuo-danswer
7015e6f2ab Bugfix/overlapping connectors (#3138)
* fix tenant logging

* upsert only new/updated docs, but always upsert document to cc pair relationship

* better logging and rough cut at testing
2024-11-16 00:47:52 +00:00
pablodanswer
24be13c015 Improved tokenizer fallback (#3132)
* silence warning

* improved fallback logic

* k

* minor cosmetic update

* minor logic update

* nit
2024-11-14 20:13:29 -08:00
pablodanswer
ddff7ecc3f minor configuration updates (#3134) 2024-11-14 18:09:30 -08:00
Yuhong Sun
97932dc44b Fix Quotes Prompting (#3137) 2024-11-14 17:28:03 -08:00
rkuo-danswer
637b6d9e75 Merge pull request #3135 from danswer-ai/bugfix/helm_ct_python_setup
unnecessary python setup
2024-11-14 14:57:12 -08:00
Richard Kuo (Danswer)
54dc1ac917 unnecessary python setup 2024-11-14 11:14:12 -08:00
rkuo-danswer
21d5cc43f8 Merge pull request #3131 from danswer-ai/bugfix/session_text
use text()
2024-11-13 20:24:14 -08:00
pablodanswer
7c841051ed Cohere (#3111)
* add cohere default

* finalize

* minor improvement

* update

* update

* update configs

* ensure we properly expose name(space) for slackbot

* update config

* config
2024-11-14 01:58:54 +00:00
pablodanswer
6e91964924 minor clarity (#3116) 2024-11-14 01:42:21 +00:00
pablodanswer
facf1d55a0 Cloud improvements (#3099)
* add improved cloud configuration

* fix typing

* finalize slackbot improvements

* minor update

* finalized keda

* moderate slackbot switch

* update some configs

* revert

* include reset engine!
2024-11-13 23:52:52 +00:00
rkuo-danswer
d68f8d6fbc scale indexing sql pool based on concurrency (#3130) 2024-11-13 23:26:13 +00:00
Richard Kuo (Danswer)
65a205d488 use text() 2024-11-13 15:03:21 -08:00
hagen-danswer
485f3f72fa Updated google copy and added non admin oauth support (#3120)
* Updated google copy and added non admin oauth support

* backend update

* accounted for oauth

* further removed class variables

* updated sets
2024-11-13 20:07:10 +00:00
rkuo-danswer
dcbea883ae add creator id to cc pair (#3121)
* add creator id to cc pair

* fix alembic head

* show email instead of UUID

* safer check on email

* make foreign key relationships optional

* always allow creator to edit (per hagen)

* use primary join

* no index_doc_batch spam

* try this again

---------

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
2024-11-13 19:35:08 +00:00
hagen-danswer
a50a3944b3 Make curators able to create permission synced connectors (#3126)
* Make curators able to create permission synced connectors

* removed editing permission synced connectors for curators

* updated tests to use access type instead of is_public

* update copy
2024-11-13 18:58:23 +00:00
hagen-danswer
60471b6a73 Added support for page within a page in Confluence (#3125) 2024-11-13 16:39:00 +00:00
rkuo-danswer
d703e694ce limited role api keys (#3115)
* in progress PoC

* working limited user, needs routes to be marked next

* make selected endpoint available to limited user role

* xfail on test_slack_prune

* add comment to sync function

---------

Co-authored-by: Richard Kuo <rkuo@rkuo.com>
2024-11-13 16:15:43 +00:00
hagen-danswer
6066042fef Merge pull request #3124 from danswer-ai/fix-doc-sync
quick fix for google doc sync
2024-11-13 07:30:52 -08:00
hagen-danswer
eb0e20b9e4 quick fix for google doc sync 2024-11-13 07:24:29 -08:00
pablodanswer
490a68773b update organization (#3118)
* update organization

* minor clean up

* add minor clarity

* k

* slight rejigger

* alembic fix

* update paradigm

* delete code!

* delete code

* minor update
2024-11-13 06:45:32 +00:00
rkuo-danswer
227aff1e47 clean up logging in light worker (#3072) 2024-11-13 03:42:02 +00:00
Weves
6e29d1944c Fix widget example 2024-11-12 18:48:44 -08:00
pablodanswer
22189f02c6 Add referral source to cloud on data plane (#3096)
* cloud auth referral source

* minor clarity

* k

* minor modification to be best practice

* typing

* Update ReferralSourceSelector.tsx

* Update ReferralSourceSelector.tsx

---------

Co-authored-by: hagen-danswer <hagen@danswer.ai>
2024-11-13 00:42:25 +00:00
hagen-danswer
fdc4811fce doc sync celery refactor (#3084)
* doc_sync is refactored

* maybe this works

* tested to work!

* mypy fixes

* enabled integration tests

* fixed the test

* added external group sync

* testing should work now

* mypy

* confluence doc id fix

* got group sync working

* addressed feedback

* renamed some vars and fixed mypy

* conf fix?

* added wiki handling to confluence connector

* test fixes

* revert google drive connector

* fixed groups

* hotfix
2024-11-12 23:57:14 +00:00
Chris Weaver
021d0cf314 Support LITELLM_EXTRA_BODY env variable (#3119)
* Support LITELLM_EXTRA_BODY env variable

* Remove unused param

* Add comment
2024-11-12 23:17:44 +00:00
575 changed files with 26953 additions and 20942 deletions

View File

@@ -65,6 +65,7 @@ jobs:
NEXT_PUBLIC_POSTHOG_KEY=${{ secrets.POSTHOG_KEY }}
NEXT_PUBLIC_POSTHOG_HOST=${{ secrets.POSTHOG_HOST }}
NEXT_PUBLIC_SENTRY_DSN=${{ secrets.SENTRY_DSN }}
NEXT_PUBLIC_GTM_ENABLED=true
# needed due to weird interactions with the builds for different platforms
no-cache: true
labels: ${{ steps.meta.outputs.labels }}

.github/workflows/pr-chromatic-tests.yml (new file, 225 lines)

@@ -0,0 +1,225 @@
name: Run Chromatic Tests
concurrency:
group: Run-Chromatic-Tests-${{ github.workflow }}-${{ github.head_ref || github.event.workflow_run.head_branch || github.run_id }}
cancel-in-progress: true
on: push
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
jobs:
playwright-tests:
name: Playwright Tests
# See https://runs-on.com/runners/linux/
runs-on: [runs-on,runner=8cpu-linux-x64,ram=16,"run-id=${{ github.run_id }}"]
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'
cache-dependency-path: |
backend/requirements/default.txt
backend/requirements/dev.txt
backend/requirements/model_server.txt
- run: |
python -m pip install --upgrade pip
pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt
- name: Setup node
uses: actions/setup-node@v4
with:
node-version: 22
- name: Install node dependencies
working-directory: ./web
run: npm ci
- name: Install playwright browsers
working-directory: ./web
run: npx playwright install --with-deps
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
# tag every docker image with "test" so that we can spin up the correct set
# of images during testing
# we use the runs-on cache for docker builds
# in conjunction with runs-on runners, it has better speed and unlimited caching
# https://runs-on.com/caching/s3-cache-for-github-actions/
# https://runs-on.com/caching/docker/
# https://github.com/moby/buildkit#s3-cache-experimental
# images are built and run locally for testing purposes. Not pushed.
- name: Build Web Docker image
uses: ./.github/actions/custom-build-and-push
with:
context: ./web
file: ./web/Dockerfile
platforms: linux/amd64
tags: danswer/danswer-web-server:test
push: false
load: true
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/web-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/web-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
- name: Build Backend Docker image
uses: ./.github/actions/custom-build-and-push
with:
context: ./backend
file: ./backend/Dockerfile
platforms: linux/amd64
tags: danswer/danswer-backend:test
push: false
load: true
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
- name: Build Model Server Docker image
uses: ./.github/actions/custom-build-and-push
with:
context: ./backend
file: ./backend/Dockerfile.model_server
platforms: linux/amd64
tags: danswer/danswer-model-server:test
push: false
load: true
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
- name: Start Docker containers
run: |
cd deployment/docker_compose
ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \
AUTH_TYPE=basic \
REQUIRE_EMAIL_VERIFICATION=false \
DISABLE_TELEMETRY=true \
IMAGE_TAG=test \
docker compose -f docker-compose.dev.yml -p danswer-stack up -d
id: start_docker
- name: Wait for service to be ready
run: |
echo "Starting wait-for-service script..."
docker logs -f danswer-stack-api_server-1 &
start_time=$(date +%s)
timeout=300 # 5 minutes in seconds
while true; do
current_time=$(date +%s)
elapsed_time=$((current_time - start_time))
if [ $elapsed_time -ge $timeout ]; then
echo "Timeout reached. Service did not become ready in 5 minutes."
exit 1
fi
# Use curl with error handling to ignore specific exit code 56
response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/health || echo "curl_error")
if [ "$response" = "200" ]; then
echo "Service is ready!"
break
elif [ "$response" = "curl_error" ]; then
echo "Curl encountered an error, possibly exit code 56. Continuing to retry..."
else
echo "Service not ready yet (HTTP status $response). Retrying in 5 seconds..."
fi
sleep 5
done
echo "Finished waiting for service."
- name: Run pytest playwright test init
working-directory: ./backend
env:
PYTEST_IGNORE_SKIP: true
run: pytest -s tests/integration/tests/playwright/test_playwright.py
- name: Run Playwright tests
working-directory: ./web
run: npx playwright test
- uses: actions/upload-artifact@v4
if: always()
with:
# Chromatic automatically defaults to the test-results directory.
# Replace with the path to your custom directory and adjust the CHROMATIC_ARCHIVE_LOCATION environment variable accordingly.
name: test-results
path: ./web/test-results
retention-days: 30
# save before stopping the containers so the logs can be captured
- name: Save Docker logs
if: success() || failure()
run: |
cd deployment/docker_compose
docker compose -f docker-compose.dev.yml -p danswer-stack logs > docker-compose.log
mv docker-compose.log ${{ github.workspace }}/docker-compose.log
- name: Upload logs
if: success() || failure()
uses: actions/upload-artifact@v4
with:
name: docker-logs
path: ${{ github.workspace }}/docker-compose.log
- name: Stop Docker containers
run: |
cd deployment/docker_compose
docker compose -f docker-compose.dev.yml -p danswer-stack down -v
chromatic-tests:
name: Chromatic Tests
needs: playwright-tests
runs-on: [runs-on,runner=8cpu-linux-x64,ram=16,"run-id=${{ github.run_id }}"]
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup node
uses: actions/setup-node@v4
with:
node-version: 22
- name: Install node dependencies
working-directory: ./web
run: npm ci
- name: Download Playwright test results
uses: actions/download-artifact@v4
with:
name: test-results
path: ./web/test-results
- name: Run Chromatic
uses: chromaui/action@latest
with:
playwright: true
projectToken: ${{ secrets.CHROMATIC_PROJECT_TOKEN }}
workingDir: ./web
env:
CHROMATIC_ARCHIVE_LOCATION: ./test-results

View File

@@ -23,21 +23,6 @@ jobs:
with:
version: v3.14.4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'
cache-dependency-path: |
backend/requirements/default.txt
backend/requirements/dev.txt
backend/requirements/model_server.txt
- run: |
python -m pip install --upgrade pip
pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt
- name: Set up chart-testing
uses: helm/chart-testing-action@v2.6.1
@@ -52,6 +37,22 @@ jobs:
echo "changed=true" >> "$GITHUB_OUTPUT"
fi
# rkuo: I don't think we need python?
# - name: Set up Python
# uses: actions/setup-python@v5
# with:
# python-version: '3.11'
# cache: 'pip'
# cache-dependency-path: |
# backend/requirements/default.txt
# backend/requirements/dev.txt
# backend/requirements/model_server.txt
# - run: |
# python -m pip install --upgrade pip
# pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
# pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
# pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt
# lint all charts if any changes were detected
- name: Run chart-testing (lint)
if: steps.list-changed.outputs.changed == 'true'

View File

@@ -13,7 +13,10 @@ on:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
jobs:
integration-tests:
# See https://runs-on.com/runners/linux/
@@ -195,9 +198,13 @@ jobs:
-e API_SERVER_HOST=api_server \
-e OPENAI_API_KEY=${OPENAI_API_KEY} \
-e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN} \
-e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
-e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
-e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
-e TEST_WEB_HOSTNAME=test-runner \
danswer/danswer-integration:test \
/app/tests/integration/tests
/app/tests/integration/tests \
/app/tests/integration/connector_job_tests
continue-on-error: true
id: run_tests

View File

@@ -20,9 +20,12 @@ env:
JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
# Google
GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR }}
GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1 }}
GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR }}
GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR }}
GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR }}
# Slab
SLAB_BOT_TOKEN: ${{ secrets.SLAB_BOT_TOKEN }}
jobs:
connectors-check:

.gitignore (1 line changed)

@@ -7,3 +7,4 @@
.vscode/
*.sw?
/backend/tests/regression/answer_quality/search_test_config.yaml
/web/test-results/

View File

@@ -203,7 +203,7 @@
"--loglevel=INFO",
"--hostname=light@%n",
"-Q",
"vespa_metadata_sync,connector_deletion",
"vespa_metadata_sync,connector_deletion,doc_permissions_upsert",
],
"presentation": {
"group": "2",
@@ -232,7 +232,7 @@
"--loglevel=INFO",
"--hostname=heavy@%n",
"-Q",
"connector_pruning",
"connector_pruning,connector_doc_permissions_sync,connector_external_group_sync",
],
"presentation": {
"group": "2",

View File

@@ -32,7 +32,7 @@ To contribute to this project, please follow the
When opening a pull request, mention related issues and feel free to tag relevant maintainers.
Before creating a pull request please make sure that the new changes conform to the formatting and linting requirements.
See the [Formatting and Linting](#-formatting-and-linting) section for how to run these checks locally.
See the [Formatting and Linting](#formatting-and-linting) section for how to run these checks locally.
### Getting Help 🙋

View File

@@ -12,7 +12,7 @@
<a href="https://docs.danswer.dev/" target="_blank">
<img src="https://img.shields.io/badge/docs-view-blue" alt="Documentation">
</a>
<a href="https://join.slack.com/t/danswer/shared_invite/zt-2lcmqw703-071hBuZBfNEOGUsLa5PXvQ" target="_blank">
<a href="https://join.slack.com/t/danswer/shared_invite/zt-2twesxdr6-5iQitKZQpgq~hYIZ~dv3KA" target="_blank">
<img src="https://img.shields.io/badge/slack-join-blue.svg?logo=slack" alt="Slack">
</a>
<a href="https://discord.gg/TDJ59cGV2X" target="_blank">
@@ -135,7 +135,7 @@ Looking to contribute? Please check out the [Contribution Guide](CONTRIBUTING.md
## ✨Contributors
<a href="https://github.com/aryn-ai/sycamore/graphs/contributors">
<a href="https://github.com/danswer-ai/danswer/graphs/contributors">
<img alt="contributors" src="https://contrib.rocks/image?repo=danswer-ai/danswer"/>
</a>

View File

@@ -73,6 +73,7 @@ RUN apt-get update && \
rm -rf /var/lib/apt/lists/* && \
rm -f /usr/local/lib/python3.11/site-packages/tornado/test/test.key
# Pre-downloading models for setups with limited egress
RUN python -c "from tokenizers import Tokenizer; \
Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"

View File

@@ -1,5 +1,5 @@
from sqlalchemy.engine.base import Connection
from typing import Any
from typing import Literal
import asyncio
from logging.config import fileConfig
import logging
@@ -8,6 +8,7 @@ from alembic import context
from sqlalchemy import pool
from sqlalchemy.ext.asyncio import create_async_engine
from sqlalchemy.sql import text
from sqlalchemy.sql.schema import SchemaItem
from shared_configs.configs import MULTI_TENANT
from danswer.db.engine import build_connection_string
@@ -35,7 +36,18 @@ logger = logging.getLogger(__name__)
def include_object(
object: Any, name: str, type_: str, reflected: bool, compare_to: Any
object: SchemaItem,
name: str | None,
type_: Literal[
"schema",
"table",
"column",
"index",
"unique_constraint",
"foreign_key_constraint",
],
reflected: bool,
compare_to: SchemaItem | None,
) -> bool:
"""
Determines whether a database object should be included in migrations.

View File

@@ -0,0 +1,59 @@
"""display custom llm models
Revision ID: 177de57c21c9
Revises: 4ee1287bd26a
Create Date: 2024-11-21 11:49:04.488677
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
from sqlalchemy import and_
revision = "177de57c21c9"
down_revision = "4ee1287bd26a"
branch_labels = None
depends_on = None
def upgrade() -> None:
conn = op.get_bind()
llm_provider = sa.table(
"llm_provider",
sa.column("id", sa.Integer),
sa.column("provider", sa.String),
sa.column("model_names", postgresql.ARRAY(sa.String)),
sa.column("display_model_names", postgresql.ARRAY(sa.String)),
)
excluded_providers = ["openai", "bedrock", "anthropic", "azure"]
providers_to_update = sa.select(
llm_provider.c.id,
llm_provider.c.model_names,
llm_provider.c.display_model_names,
).where(
and_(
~llm_provider.c.provider.in_(excluded_providers),
llm_provider.c.model_names.isnot(None),
)
)
results = conn.execute(providers_to_update).fetchall()
for provider_id, model_names, display_model_names in results:
if display_model_names is None:
display_model_names = []
combined_model_names = list(set(display_model_names + model_names))
update_stmt = (
llm_provider.update()
.where(llm_provider.c.id == provider_id)
.values(display_model_names=combined_model_names)
)
conn.execute(update_stmt)
def downgrade() -> None:
pass

View File

@@ -0,0 +1,68 @@
"""default chosen assistants to none
Revision ID: 26b931506ecb
Revises: 2daa494a0851
Create Date: 2024-11-12 13:23:29.858995
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "26b931506ecb"
down_revision = "2daa494a0851"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.add_column(
"user", sa.Column("chosen_assistants_new", postgresql.JSONB(), nullable=True)
)
op.execute(
"""
UPDATE "user"
SET chosen_assistants_new =
CASE
WHEN chosen_assistants = '[-2, -1, 0]' THEN NULL
ELSE chosen_assistants
END
"""
)
op.drop_column("user", "chosen_assistants")
op.alter_column(
"user", "chosen_assistants_new", new_column_name="chosen_assistants"
)
def downgrade() -> None:
op.add_column(
"user",
sa.Column(
"chosen_assistants_old",
postgresql.JSONB(),
nullable=False,
server_default="[-2, -1, 0]",
),
)
op.execute(
"""
UPDATE "user"
SET chosen_assistants_old =
CASE
WHEN chosen_assistants IS NULL THEN '[-2, -1, 0]'::jsonb
ELSE chosen_assistants
END
"""
)
op.drop_column("user", "chosen_assistants")
op.alter_column(
"user", "chosen_assistants_old", new_column_name="chosen_assistants"
)

View File

@@ -0,0 +1,30 @@
"""add-group-sync-time
Revision ID: 2daa494a0851
Revises: c0fd6e4da83a
Create Date: 2024-11-11 10:57:22.991157
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "2daa494a0851"
down_revision = "c0fd6e4da83a"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.add_column(
"connector_credential_pair",
sa.Column(
"last_time_external_group_sync",
sa.DateTime(timezone=True),
nullable=True,
),
)
def downgrade() -> None:
op.drop_column("connector_credential_pair", "last_time_external_group_sync")

View File

@@ -0,0 +1,45 @@
"""add persona categories
Revision ID: 47e5bef3a1d7
Revises: dfbe9e93d3c7
Create Date: 2024-11-05 18:55:02.221064
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "47e5bef3a1d7"
down_revision = "dfbe9e93d3c7"
branch_labels = None
depends_on = None
def upgrade() -> None:
# Create the persona_category table
op.create_table(
"persona_category",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("name", sa.String(), nullable=False),
sa.Column("description", sa.String(), nullable=True),
sa.PrimaryKeyConstraint("id"),
sa.UniqueConstraint("name"),
)
# Add category_id to persona table
op.add_column("persona", sa.Column("category_id", sa.Integer(), nullable=True))
op.create_foreign_key(
"fk_persona_category",
"persona",
"persona_category",
["category_id"],
["id"],
ondelete="SET NULL",
)
def downgrade() -> None:
op.drop_constraint("fk_persona_category", "persona", type_="foreignkey")
op.drop_column("persona", "category_id")
op.drop_table("persona_category")

View File

@@ -0,0 +1,280 @@
"""add_multiple_slack_bot_support
Revision ID: 4ee1287bd26a
Revises: 47e5bef3a1d7
Create Date: 2024-11-06 13:15:53.302644
"""
import logging
from typing import cast
from alembic import op
import sqlalchemy as sa
from sqlalchemy.orm import Session
from danswer.key_value_store.factory import get_kv_store
from danswer.db.models import SlackBot
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "4ee1287bd26a"
down_revision = "47e5bef3a1d7"
branch_labels: None = None
depends_on: None = None
# Configure logging
logger = logging.getLogger("alembic.runtime.migration")
logger.setLevel(logging.INFO)
def upgrade() -> None:
logger.info(f"{revision}: create_table: slack_bot")
# Create new slack_bot table
op.create_table(
"slack_bot",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("name", sa.String(), nullable=False),
sa.Column("enabled", sa.Boolean(), nullable=False, server_default="true"),
sa.Column("bot_token", sa.LargeBinary(), nullable=False),
sa.Column("app_token", sa.LargeBinary(), nullable=False),
sa.PrimaryKeyConstraint("id"),
sa.UniqueConstraint("bot_token"),
sa.UniqueConstraint("app_token"),
)
# # Create new slack_channel_config table
op.create_table(
"slack_channel_config",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("slack_bot_id", sa.Integer(), nullable=True),
sa.Column("persona_id", sa.Integer(), nullable=True),
sa.Column("channel_config", postgresql.JSONB(), nullable=False),
sa.Column("response_type", sa.String(), nullable=False),
sa.Column(
"enable_auto_filters", sa.Boolean(), nullable=False, server_default="false"
),
sa.ForeignKeyConstraint(
["slack_bot_id"],
["slack_bot.id"],
),
sa.ForeignKeyConstraint(
["persona_id"],
["persona.id"],
),
sa.PrimaryKeyConstraint("id"),
)
# Handle existing Slack bot tokens first
logger.info(f"{revision}: Checking for existing Slack bot.")
bot_token = None
app_token = None
first_row_id = None
try:
tokens = cast(dict, get_kv_store().load("slack_bot_tokens_config_key"))
except Exception:
logger.warning("No existing Slack bot tokens found.")
tokens = {}
bot_token = tokens.get("bot_token")
app_token = tokens.get("app_token")
if bot_token and app_token:
logger.info(f"{revision}: Found bot and app tokens.")
session = Session(bind=op.get_bind())
new_slack_bot = SlackBot(
name="Slack Bot (Migrated)",
enabled=True,
bot_token=bot_token,
app_token=app_token,
)
session.add(new_slack_bot)
session.commit()
first_row_id = new_slack_bot.id
# Create a default bot if none exists
# This is in case there are no slack tokens but there are channels configured
op.execute(
sa.text(
"""
INSERT INTO slack_bot (name, enabled, bot_token, app_token)
SELECT 'Default Bot', true, '', ''
WHERE NOT EXISTS (SELECT 1 FROM slack_bot)
RETURNING id;
"""
)
)
# Get the bot ID to use (either from existing migration or newly created)
bot_id_query = sa.text(
"""
SELECT COALESCE(
:first_row_id,
(SELECT id FROM slack_bot ORDER BY id ASC LIMIT 1)
) as bot_id;
"""
)
result = op.get_bind().execute(bot_id_query, {"first_row_id": first_row_id})
bot_id = result.scalar()
# CTE (Common Table Expression) that transforms the old slack_bot_config table data
# This splits up the channel_names into their own rows
channel_names_cte = """
WITH channel_names AS (
SELECT
sbc.id as config_id,
sbc.persona_id,
sbc.response_type,
sbc.enable_auto_filters,
jsonb_array_elements_text(sbc.channel_config->'channel_names') as channel_name,
sbc.channel_config->>'respond_tag_only' as respond_tag_only,
sbc.channel_config->>'respond_to_bots' as respond_to_bots,
sbc.channel_config->'respond_member_group_list' as respond_member_group_list,
sbc.channel_config->'answer_filters' as answer_filters,
sbc.channel_config->'follow_up_tags' as follow_up_tags
FROM slack_bot_config sbc
)
"""
# Insert the channel names into the new slack_channel_config table
insert_statement = """
INSERT INTO slack_channel_config (
slack_bot_id,
persona_id,
channel_config,
response_type,
enable_auto_filters
)
SELECT
:bot_id,
channel_name.persona_id,
jsonb_build_object(
'channel_name', channel_name.channel_name,
'respond_tag_only',
COALESCE((channel_name.respond_tag_only)::boolean, false),
'respond_to_bots',
COALESCE((channel_name.respond_to_bots)::boolean, false),
'respond_member_group_list',
COALESCE(channel_name.respond_member_group_list, '[]'::jsonb),
'answer_filters',
COALESCE(channel_name.answer_filters, '[]'::jsonb),
'follow_up_tags',
COALESCE(channel_name.follow_up_tags, '[]'::jsonb)
),
channel_name.response_type,
channel_name.enable_auto_filters
FROM channel_names channel_name;
"""
op.execute(sa.text(channel_names_cte + insert_statement).bindparams(bot_id=bot_id))
# Clean up old tokens if they existed
try:
if bot_token and app_token:
logger.info(f"{revision}: Removing old bot and app tokens.")
get_kv_store().delete("slack_bot_tokens_config_key")
except Exception:
logger.warning("tried to delete tokens in dynamic config but failed")
# Rename the table
op.rename_table(
"slack_bot_config__standard_answer_category",
"slack_channel_config__standard_answer_category",
)
# Rename the column
op.alter_column(
"slack_channel_config__standard_answer_category",
"slack_bot_config_id",
new_column_name="slack_channel_config_id",
)
# Drop the table with CASCADE to handle dependent objects
op.execute("DROP TABLE slack_bot_config CASCADE")
logger.info(f"{revision}: Migration complete.")
def downgrade() -> None:
# Recreate the old slack_bot_config table
op.create_table(
"slack_bot_config",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("persona_id", sa.Integer(), nullable=True),
sa.Column("channel_config", postgresql.JSONB(), nullable=False),
sa.Column("response_type", sa.String(), nullable=False),
sa.Column("enable_auto_filters", sa.Boolean(), nullable=False),
sa.ForeignKeyConstraint(
["persona_id"],
["persona.id"],
),
sa.PrimaryKeyConstraint("id"),
)
# Migrate data back to the old format
# Group by persona_id to combine channel names back into arrays
op.execute(
sa.text(
"""
INSERT INTO slack_bot_config (
persona_id,
channel_config,
response_type,
enable_auto_filters
)
SELECT DISTINCT ON (persona_id)
persona_id,
jsonb_build_object(
'channel_names', (
SELECT jsonb_agg(c.channel_config->>'channel_name')
FROM slack_channel_config c
WHERE c.persona_id = scc.persona_id
),
'respond_tag_only', (channel_config->>'respond_tag_only')::boolean,
'respond_to_bots', (channel_config->>'respond_to_bots')::boolean,
'respond_member_group_list', channel_config->'respond_member_group_list',
'answer_filters', channel_config->'answer_filters',
'follow_up_tags', channel_config->'follow_up_tags'
),
response_type,
enable_auto_filters
FROM slack_channel_config scc
WHERE persona_id IS NOT NULL;
"""
)
)
# Rename the table back
op.rename_table(
"slack_channel_config__standard_answer_category",
"slack_bot_config__standard_answer_category",
)
# Rename the column back
op.alter_column(
"slack_bot_config__standard_answer_category",
"slack_channel_config_id",
new_column_name="slack_bot_config_id",
)
# Try to save the first bot's tokens back to KV store
try:
first_bot = (
op.get_bind()
.execute(
sa.text(
"SELECT bot_token, app_token FROM slack_bot ORDER BY id LIMIT 1"
)
)
.first()
)
if first_bot and first_bot.bot_token and first_bot.app_token:
tokens = {
"bot_token": first_bot.bot_token,
"app_token": first_bot.app_token,
}
get_kv_store().store("slack_bot_tokens_config_key", tokens)
except Exception:
logger.warning("Failed to save tokens back to KV store")
# Drop the new tables in reverse order
op.drop_table("slack_channel_config")
op.drop_table("slack_bot")

View File

@@ -0,0 +1,45 @@
"""remove default bot
Revision ID: 6d562f86c78b
Revises: 177de57c21c9
Create Date: 2024-11-22 11:51:29.331336
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "6d562f86c78b"
down_revision = "177de57c21c9"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.execute(
sa.text(
"""
DELETE FROM slack_bot
WHERE name = 'Default Bot'
AND bot_token = ''
AND app_token = ''
AND NOT EXISTS (
SELECT 1 FROM slack_channel_config
WHERE slack_channel_config.slack_bot_id = slack_bot.id
)
"""
)
)
def downgrade() -> None:
op.execute(
sa.text(
"""
INSERT INTO slack_bot (name, enabled, bot_token, app_token)
SELECT 'Default Bot', true, '', ''
WHERE NOT EXISTS (SELECT 1 FROM slack_bot)
RETURNING id;
"""
)
)

View File

@@ -9,8 +9,8 @@ from alembic import op
import sqlalchemy as sa
from danswer.db.models import IndexModelStatus
from danswer.search.enums import RecencyBiasSetting
from danswer.search.enums import SearchType
from danswer.context.search.enums import RecencyBiasSetting
from danswer.context.search.enums import SearchType
# revision identifiers, used by Alembic.
revision = "776b3bbe9092"

View File

@@ -0,0 +1,35 @@
"""add web ui option to slack config
Revision ID: 93560ba1b118
Revises: 6d562f86c78b
Create Date: 2024-11-24 06:36:17.490612
"""
from alembic import op
# revision identifiers, used by Alembic.
revision = "93560ba1b118"
down_revision = "6d562f86c78b"
branch_labels = None
depends_on = None
def upgrade() -> None:
# Add show_continue_in_web_ui with default False to all existing channel_configs
op.execute(
"""
UPDATE slack_channel_config
SET channel_config = channel_config || '{"show_continue_in_web_ui": false}'::jsonb
WHERE NOT channel_config ? 'show_continue_in_web_ui'
"""
)
def downgrade() -> None:
# Remove show_continue_in_web_ui from all channel_configs
op.execute(
"""
UPDATE slack_channel_config
SET channel_config = channel_config - 'show_continue_in_web_ui'
"""
)

View File

@@ -7,6 +7,7 @@ Create Date: 2024-10-26 13:06:06.937969
"""
from alembic import op
from sqlalchemy.orm import Session
from sqlalchemy import text
# Import your models and constants
from danswer.db.models import (
@@ -15,7 +16,6 @@ from danswer.db.models import (
Credential,
IndexAttempt,
)
from danswer.configs.constants import DocumentSource
# revision identifiers, used by Alembic.
@@ -30,13 +30,11 @@ def upgrade() -> None:
bind = op.get_bind()
session = Session(bind=bind)
connectors_to_delete = (
session.query(Connector)
.filter(Connector.source == DocumentSource.REQUESTTRACKER)
.all()
# Get connectors using raw SQL
result = bind.execute(
text("SELECT id FROM connector WHERE source = 'requesttracker'")
)
connector_ids = [connector.id for connector in connectors_to_delete]
connector_ids = [row[0] for row in result]
if connector_ids:
cc_pairs_to_delete = (

View File

@@ -0,0 +1,30 @@
"""add creator to cc pair
Revision ID: 9cf5c00f72fe
Revises: 26b931506ecb
Create Date: 2024-11-12 15:16:42.682902
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "9cf5c00f72fe"
down_revision = "26b931506ecb"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.add_column(
"connector_credential_pair",
sa.Column(
"creator_id",
sa.UUID(as_uuid=True),
nullable=True,
),
)
def downgrade() -> None:
op.drop_column("connector_credential_pair", "creator_id")

View File

@@ -0,0 +1,36 @@
"""Combine Search and Chat
Revision ID: 9f696734098f
Revises: a8c2065484e6
Create Date: 2024-11-27 15:32:19.694972
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "9f696734098f"
down_revision = "a8c2065484e6"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.alter_column("chat_session", "description", nullable=True)
op.drop_column("chat_session", "one_shot")
op.drop_column("slack_channel_config", "response_type")
def downgrade() -> None:
op.execute("UPDATE chat_session SET description = '' WHERE description IS NULL")
op.alter_column("chat_session", "description", nullable=False)
op.add_column(
"chat_session",
sa.Column("one_shot", sa.Boolean(), nullable=False, server_default=sa.false()),
)
op.add_column(
"slack_channel_config",
sa.Column(
"response_type", sa.String(), nullable=False, server_default="citations"
),
)

View File

@@ -0,0 +1,27 @@
"""add auto scroll to user model
Revision ID: a8c2065484e6
Revises: abe7378b8217
Create Date: 2024-11-22 17:34:09.690295
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "a8c2065484e6"
down_revision = "abe7378b8217"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.add_column(
"user",
sa.Column("auto_scroll", sa.Boolean(), nullable=True, server_default=None),
)
def downgrade() -> None:
op.drop_column("user", "auto_scroll")

View File

@@ -0,0 +1,30 @@
"""add indexing trigger to cc_pair
Revision ID: abe7378b8217
Revises: 6d562f86c78b
Create Date: 2024-11-26 19:09:53.481171
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "abe7378b8217"
down_revision = "93560ba1b118"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.add_column(
"connector_credential_pair",
sa.Column(
"indexing_trigger",
sa.Enum("UPDATE", "REINDEX", name="indexingmode", native_enum=False),
nullable=True,
),
)
def downgrade() -> None:
op.drop_column("connector_credential_pair", "indexing_trigger")

View File

@@ -0,0 +1,42 @@
"""extended_role_for_non_web
Revision ID: dfbe9e93d3c7
Revises: 9cf5c00f72fe
Create Date: 2024-11-16 07:54:18.727906
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "dfbe9e93d3c7"
down_revision = "9cf5c00f72fe"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.execute(
"""
UPDATE "user"
SET role = 'EXT_PERM_USER'
WHERE has_web_login = false
"""
)
op.drop_column("user", "has_web_login")
def downgrade() -> None:
op.add_column(
"user",
sa.Column("has_web_login", sa.Boolean(), nullable=False, server_default="true"),
)
op.execute(
"""
UPDATE "user"
SET has_web_login = false,
role = 'BASIC'
WHERE role IN ('SLACK_USER', 'EXT_PERM_USER')
"""
)

View File

@@ -1,5 +1,6 @@
import asyncio
from logging.config import fileConfig
from typing import Literal
from sqlalchemy import pool
from sqlalchemy.engine import Connection
@@ -37,8 +38,15 @@ EXCLUDE_TABLES = {"kombu_queue", "kombu_message"}
def include_object(
object: SchemaItem,
name: str,
type_: str,
name: str | None,
type_: Literal[
"schema",
"table",
"column",
"index",
"unique_constraint",
"foreign_key_constraint",
],
reflected: bool,
compare_to: SchemaItem | None,
) -> bool:

View File

@@ -16,6 +16,46 @@ class ExternalAccess:
is_public: bool
@dataclass(frozen=True)
class DocExternalAccess:
"""
This is just a class to wrap the external access and the document ID
together. It's used for syncing document permissions to Redis.
"""
external_access: ExternalAccess
# The document ID
doc_id: str
def to_dict(self) -> dict:
return {
"external_access": {
"external_user_emails": list(self.external_access.external_user_emails),
"external_user_group_ids": list(
self.external_access.external_user_group_ids
),
"is_public": self.external_access.is_public,
},
"doc_id": self.doc_id,
}
@classmethod
def from_dict(cls, data: dict) -> "DocExternalAccess":
external_access = ExternalAccess(
external_user_emails=set(
data["external_access"].get("external_user_emails", [])
),
external_user_group_ids=set(
data["external_access"].get("external_user_group_ids", [])
),
is_public=data["external_access"]["is_public"],
)
return cls(
external_access=external_access,
doc_id=data["doc_id"],
)
@dataclass(frozen=True)
class DocumentAccess(ExternalAccess):
# User emails for Danswer users, None indicates admin
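
A note on the serialization pair above: to_dict and from_dict are symmetric, which is what lets a DocExternalAccess record round-trip through Redis as JSON during the permission sync. A minimal sketch of that round trip (the import path is an assumption for illustration; adjust it to wherever these dataclasses actually live):

import json

# Assumed import path for illustration only.
from danswer.access.models import DocExternalAccess, ExternalAccess

access = ExternalAccess(
    external_user_emails={"alice@example.com"},
    external_user_group_ids={"eng-team"},
    is_public=False,
)
record = DocExternalAccess(external_access=access, doc_id="doc-123")

# Serialize for Redis, then rebuild the record on the consumer side.
payload = json.dumps(record.to_dict())
restored = DocExternalAccess.from_dict(json.loads(payload))
assert restored == record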

View File

@@ -0,0 +1,100 @@
from langgraph.graph import END
from langgraph.graph import START
from langgraph.graph import StateGraph
from danswer.agent_search.answer_query.nodes.answer_check import answer_check
from danswer.agent_search.answer_query.nodes.answer_generation import answer_generation
from danswer.agent_search.answer_query.nodes.format_answer import format_answer
from danswer.agent_search.answer_query.states import AnswerQueryInput
from danswer.agent_search.answer_query.states import AnswerQueryOutput
from danswer.agent_search.answer_query.states import AnswerQueryState
from danswer.agent_search.expanded_retrieval.graph_builder import (
expanded_retrieval_graph_builder,
)
def answer_query_graph_builder() -> StateGraph:
graph = StateGraph(
state_schema=AnswerQueryState,
input=AnswerQueryInput,
output=AnswerQueryOutput,
)
### Add nodes ###
expanded_retrieval = expanded_retrieval_graph_builder().compile()
graph.add_node(
node="expanded_retrieval_for_initial_decomp",
action=expanded_retrieval,
)
graph.add_node(
node="answer_check",
action=answer_check,
)
graph.add_node(
node="answer_generation",
action=answer_generation,
)
graph.add_node(
node="format_answer",
action=format_answer,
)
### Add edges ###
graph.add_edge(
start_key=START,
end_key="expanded_retrieval_for_initial_decomp",
)
graph.add_edge(
start_key="expanded_retrieval_for_initial_decomp",
end_key="answer_generation",
)
graph.add_edge(
start_key="answer_generation",
end_key="answer_check",
)
graph.add_edge(
start_key="answer_check",
end_key="format_answer",
)
graph.add_edge(
start_key="format_answer",
end_key=END,
)
return graph
if __name__ == "__main__":
from danswer.db.engine import get_session_context_manager
from danswer.llm.factory import get_default_llms
from danswer.context.search.models import SearchRequest
graph = answer_query_graph_builder()
compiled_graph = graph.compile()
primary_llm, fast_llm = get_default_llms()
search_request = SearchRequest(
query="Who made Excel and what other products did they make?",
)
with get_session_context_manager() as db_session:
inputs = AnswerQueryInput(
search_request=search_request,
primary_llm=primary_llm,
fast_llm=fast_llm,
db_session=db_session,
query_to_answer="Who made Excel?",
)
output = compiled_graph.invoke(
input=inputs,
# debug=True,
# subgraphs=True,
)
print(output)
# for namespace, chunk in compiled_graph.stream(
# input=inputs,
# # debug=True,
# subgraphs=True,
# ):
# print(namespace)
# print(chunk)

View File

@@ -0,0 +1,30 @@
from langchain_core.messages import HumanMessage
from langchain_core.messages import merge_message_runs
from danswer.agent_search.answer_query.states import AnswerQueryState
from danswer.agent_search.answer_query.states import QACheckOutput
from danswer.agent_search.shared_graph_utils.prompts import BASE_CHECK_PROMPT
def answer_check(state: AnswerQueryState) -> QACheckOutput:
msg = [
HumanMessage(
content=BASE_CHECK_PROMPT.format(
question=state["search_request"].query,
base_answer=state["answer"],
)
)
]
fast_llm = state["fast_llm"]
response = list(
fast_llm.stream(
prompt=msg,
)
)
response_str = merge_message_runs(response, chunk_separator="")[0].content
return QACheckOutput(
answer_quality=response_str,
)

View File

@@ -0,0 +1,32 @@
from langchain_core.messages import HumanMessage
from langchain_core.messages import merge_message_runs
from danswer.agent_search.answer_query.states import AnswerQueryState
from danswer.agent_search.answer_query.states import QAGenerationOutput
from danswer.agent_search.shared_graph_utils.prompts import BASE_RAG_PROMPT
from danswer.agent_search.shared_graph_utils.utils import format_docs
def answer_generation(state: AnswerQueryState) -> QAGenerationOutput:
query = state["query_to_answer"]
docs = state["reordered_documents"]
print(f"Number of verified retrieval docs: {len(docs)}")
msg = [
HumanMessage(
content=BASE_RAG_PROMPT.format(question=query, context=format_docs(docs))
)
]
fast_llm = state["fast_llm"]
response = list(
fast_llm.stream(
prompt=msg,
)
)
answer_str = merge_message_runs(response, chunk_separator="")[0].content
return QAGenerationOutput(
answer=answer_str,
)

View File

@@ -0,0 +1,16 @@
from danswer.agent_search.answer_query.states import AnswerQueryOutput
from danswer.agent_search.answer_query.states import AnswerQueryState
from danswer.agent_search.answer_query.states import SearchAnswerResults
def format_answer(state: AnswerQueryState) -> AnswerQueryOutput:
return AnswerQueryOutput(
decomp_answer_results=[
SearchAnswerResults(
query=state["query_to_answer"],
quality=state["answer_quality"],
answer=state["answer"],
documents=state["reordered_documents"],
)
],
)

View File

@@ -0,0 +1,45 @@
from typing import Annotated
from typing import TypedDict
from pydantic import BaseModel
from danswer.agent_search.core_state import PrimaryState
from danswer.agent_search.shared_graph_utils.operators import dedup_inference_sections
from danswer.context.search.models import InferenceSection
class SearchAnswerResults(BaseModel):
query: str
answer: str
quality: str
documents: Annotated[list[InferenceSection], dedup_inference_sections]
class QACheckOutput(TypedDict, total=False):
answer_quality: str
class QAGenerationOutput(TypedDict, total=False):
answer: str
class ExpandedRetrievalOutput(TypedDict):
reordered_documents: Annotated[list[InferenceSection], dedup_inference_sections]
class AnswerQueryState(
PrimaryState,
QACheckOutput,
QAGenerationOutput,
ExpandedRetrievalOutput,
total=True,
):
query_to_answer: str
class AnswerQueryInput(PrimaryState, total=True):
query_to_answer: str
class AnswerQueryOutput(TypedDict):
decomp_answer_results: list[SearchAnswerResults]

View File

@@ -0,0 +1,15 @@
from typing import TypedDict
from sqlalchemy.orm import Session
from danswer.context.search.models import SearchRequest
from danswer.llm.interfaces import LLM
class PrimaryState(TypedDict, total=False):
search_request: SearchRequest
primary_llm: LLM
fast_llm: LLM
# a single session for the entire agent search
# is fine if we are only reading
db_session: Session

View File

@@ -0,0 +1,114 @@
from typing import Any
from langchain_core.messages import HumanMessage
from danswer.agent_search.main.states import MainState
from danswer.agent_search.shared_graph_utils.prompts import COMBINED_CONTEXT
from danswer.agent_search.shared_graph_utils.prompts import MODIFIED_RAG_PROMPT
from danswer.agent_search.shared_graph_utils.utils import format_docs
from danswer.agent_search.shared_graph_utils.utils import normalize_whitespace
# aggregate sub questions and answers
def deep_answer_generation(state: MainState) -> dict[str, Any]:
"""
Generate answer
Args:
state (messages): The current state
Returns:
dict: The updated state with the generated deep answer
"""
print("---DEEP GENERATE---")
question = state["original_question"]
docs = state["deduped_retrieval_docs"]
deep_answer_context = state["core_answer_dynamic_context"]
print(f"Number of verified retrieval docs - deep: {len(docs)}")
combined_context = normalize_whitespace(
COMBINED_CONTEXT.format(
deep_answer_context=deep_answer_context, formated_docs=format_docs(docs)
)
)
msg = [
HumanMessage(
content=MODIFIED_RAG_PROMPT.format(
question=question, combined_context=combined_context
)
)
]
# Grader
model = state["fast_llm"]
response = model.invoke(msg)
return {
"deep_answer": response.content,
}
def final_stuff(state: MainState) -> dict[str, Any]:
"""
Prints the final base answer, the answered sub-questions, and, if present, the deep
answer together with its sub-question context.
Args:
state (messages): The current state
Returns:
dict: An empty state update (this node only logs output)
"""
print("---FINAL---")
messages = state["log_messages"]
time_ordered_messages = [x.pretty_repr() for x in messages]
time_ordered_messages.sort()
print("Message Log:")
print("\n".join(time_ordered_messages))
initial_sub_qas = state["initial_sub_qas"]
initial_sub_qa_list = []
for initial_sub_qa in initial_sub_qas:
if initial_sub_qa["sub_answer_check"] == "yes":
initial_sub_qa_list.append(
f' Question:\n {initial_sub_qa["sub_question"]}\n --\n Answer:\n {initial_sub_qa["sub_answer"]}\n -----'
)
initial_sub_qa_context = "\n".join(initial_sub_qa_list)
base_answer = state["base_answer"]
print(f"Final Base Answer:\n{base_answer}")
print("--------------------------------")
print(f"Initial Answered Sub Questions:\n{initial_sub_qa_context}")
print("--------------------------------")
if not state.get("deep_answer"):
print("No Deep Answer was required")
return {}
deep_answer = state["deep_answer"]
sub_qas = state["sub_qas"]
sub_qa_list = []
for sub_qa in sub_qas:
if sub_qa["sub_answer_check"] == "yes":
sub_qa_list.append(
f' Question:\n {sub_qa["sub_question"]}\n --\n Answer:\n {sub_qa["sub_answer"]}\n -----'
)
sub_qa_context = "\n".join(sub_qa_list)
print(f"Final Base Answer:\n{base_answer}")
print("--------------------------------")
print(f"Final Deep Answer:\n{deep_answer}")
print("--------------------------------")
print("Sub Questions and Answers:")
print(sub_qa_context)
return {}

View File

@@ -0,0 +1,78 @@
import json
import re
from datetime import datetime
from typing import Any
from langchain_core.messages import HumanMessage
from danswer.agent_search.main.states import MainState
from danswer.agent_search.shared_graph_utils.prompts import DEEP_DECOMPOSE_PROMPT
from danswer.agent_search.shared_graph_utils.utils import format_entity_term_extraction
from danswer.agent_search.shared_graph_utils.utils import generate_log_message
def decompose(state: MainState) -> dict[str, Any]:
"""Generate new sub-questions based on the unsatisfactory initial answer, the prior sub-questions, and the extracted entities and terms."""
node_start_time = datetime.now()
question = state["original_question"]
base_answer = state["base_answer"]
# get the entity term extraction dict and properly format it
entity_term_extraction_dict = state["retrieved_entities_relationships"][
"retrieved_entities_relationships"
]
entity_term_extraction_str = format_entity_term_extraction(
entity_term_extraction_dict
)
initial_question_answers = state["initial_sub_qas"]
addressed_question_list = [
x["sub_question"]
for x in initial_question_answers
if x["sub_answer_check"] == "yes"
]
failed_question_list = [
x["sub_question"]
for x in initial_question_answers
if x["sub_answer_check"] == "no"
]
msg = [
HumanMessage(
content=DEEP_DECOMPOSE_PROMPT.format(
question=question,
entity_term_extraction_str=entity_term_extraction_str,
base_answer=base_answer,
answered_sub_questions="\n - ".join(addressed_question_list),
failed_sub_questions="\n - ".join(failed_question_list),
),
)
]
# Grader
model = state["fast_llm"]
response = model.invoke(msg)
cleaned_response = re.sub(r"```json\n|\n```", "", response.pretty_repr())
parsed_response = json.loads(cleaned_response)
sub_questions_dict = {}
for sub_question_nr, sub_question_dict in enumerate(
parsed_response["sub_questions"]
):
sub_question_dict["answered"] = False
sub_question_dict["verified"] = False
sub_questions_dict[sub_question_nr] = sub_question_dict
return {
"decomposed_sub_questions_dict": sub_questions_dict,
"log_messages": generate_log_message(
message="deep - decompose",
node_start_time=node_start_time,
graph_start_time=state["graph_start_time"],
),
}

View File

@@ -0,0 +1,40 @@
import json
import re
from typing import Any
from langchain_core.messages import HumanMessage
from langchain_core.messages import merge_message_runs
from danswer.agent_search.main.states import MainState
from danswer.agent_search.shared_graph_utils.prompts import ENTITY_TERM_PROMPT
from danswer.agent_search.shared_graph_utils.utils import format_docs
def entity_term_extraction(state: MainState) -> dict[str, Any]:
"""Extract entities and terms from the question and context"""
question = state["original_question"]
docs = state["deduped_retrieval_docs"]
doc_context = format_docs(docs)
msg = [
HumanMessage(
content=ENTITY_TERM_PROMPT.format(question=question, context=doc_context),
)
]
fast_llm = state["fast_llm"]
# Grader
llm_response_list = list(
fast_llm.stream(
prompt=msg,
)
)
llm_response = merge_message_runs(llm_response_list, chunk_separator="")[0].content
cleaned_response = re.sub(r"```json\n|\n```", "", llm_response)
parsed_response = json.loads(cleaned_response)
return {
"retrieved_entities_relationships": parsed_response,
}

View File

@@ -0,0 +1,30 @@
from typing import Any
from danswer.agent_search.main.states import MainState
# aggregate sub questions and answers
def sub_qa_level_aggregator(state: MainState) -> dict[str, Any]:
sub_qas = state["sub_qas"]
dynamic_context_list = [
"Below you will find useful information to answer the original question:"
]
checked_sub_qas = []
for core_answer_sub_qa in sub_qas:
question = core_answer_sub_qa["sub_question"]
answer = core_answer_sub_qa["sub_answer"]
verified = core_answer_sub_qa["sub_answer_check"]
if verified == "yes":
dynamic_context_list.append(
f"Question:\n{question}\n\nAnswer:\n{answer}\n\n---\n\n"
)
checked_sub_qas.append({"sub_question": question, "sub_answer": answer})
dynamic_context = "\n".join(dynamic_context_list)
return {
"core_answer_dynamic_context": dynamic_context,
"checked_sub_qas": checked_sub_qas,
}

View File

@@ -0,0 +1,19 @@
from typing import Any
from danswer.agent_search.main.states import MainState
def sub_qa_manager(state: MainState) -> dict[str, Any]:
"""Pull the individual sub-questions out of the decomposition dict for downstream answering."""
sub_questions_dict = state["decomposed_sub_questions_dict"]
sub_questions = {}
for sub_question_nr, sub_question_dict in sub_questions_dict.items():
sub_questions[sub_question_nr] = sub_question_dict["sub_question"]
return {
"sub_questions": sub_questions,
"num_new_question_iterations": 0,
}

View File

@@ -0,0 +1,44 @@
from collections.abc import Hashable
from langchain_core.messages import HumanMessage
from langchain_core.messages import merge_message_runs
from langgraph.types import Send
from danswer.agent_search.expanded_retrieval.nodes.doc_retrieval import RetrieveInput
from danswer.agent_search.expanded_retrieval.states import ExpandedRetrievalInput
from danswer.agent_search.shared_graph_utils.prompts import REWRITE_PROMPT_MULTI
from danswer.llm.interfaces import LLM
def parallel_retrieval_edge(state: ExpandedRetrievalInput) -> list[Send | Hashable]:
print(f"parallel_retrieval_edge state: {state.keys()}")
# This should be better...
question = state.get("query_to_answer") or state["search_request"].query
llm: LLM = state["fast_llm"]
msg = [
HumanMessage(
content=REWRITE_PROMPT_MULTI.format(question=question),
)
]
llm_response_list = list(
llm.stream(
prompt=msg,
)
)
llm_response = merge_message_runs(llm_response_list, chunk_separator="")[0].content
print(f"llm_response: {llm_response}")
rewritten_queries = llm_response.split("\n")
print(f"rewritten_queries: {rewritten_queries}")
return [
Send(
"doc_retrieval",
RetrieveInput(query_to_retrieve=query, **state),
)
for query in rewritten_queries
]
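
The Send objects returned above are consumed by add_conditional_edges in the graph builder that follows: each Send kicks off one doc_retrieval run with its own RetrieveInput. A stripped-down sketch of the same fan-out-plus-reducer pattern with toy state (illustrative only, not this repo's types):

from operator import add
from typing import Annotated, TypedDict

from langgraph.graph import END, START, StateGraph
from langgraph.types import Send


class FanOutState(TypedDict):
    queries: list[str]
    # `add` acts as a reducer so the parallel branches' results are combined
    # instead of overwriting each other (the real graph uses
    # dedup_inference_sections the same way).
    results: Annotated[list[str], add]


def fan_out(state: FanOutState) -> list[Send]:
    # One Send per query; each carries its own payload to the worker node.
    return [Send("worker", {"queries": [q], "results": []}) for q in state["queries"]]


def worker(state: FanOutState) -> dict:
    return {"results": [f"retrieved docs for: {state['queries'][0]}"]}


graph = StateGraph(FanOutState)
graph.add_node("worker", worker)
graph.add_conditional_edges(START, fan_out, ["worker"])
graph.add_edge("worker", END)

print(graph.compile().invoke({"queries": ["q1", "q2"], "results": []})["results"])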

View File

@@ -0,0 +1,88 @@
from langgraph.graph import END
from langgraph.graph import START
from langgraph.graph import StateGraph
from danswer.agent_search.expanded_retrieval.edges import parallel_retrieval_edge
from danswer.agent_search.expanded_retrieval.nodes.doc_reranking import doc_reranking
from danswer.agent_search.expanded_retrieval.nodes.doc_retrieval import doc_retrieval
from danswer.agent_search.expanded_retrieval.nodes.doc_verification import (
doc_verification,
)
from danswer.agent_search.expanded_retrieval.nodes.verification_kickoff import (
verification_kickoff,
)
from danswer.agent_search.expanded_retrieval.states import ExpandedRetrievalInput
from danswer.agent_search.expanded_retrieval.states import ExpandedRetrievalOutput
from danswer.agent_search.expanded_retrieval.states import ExpandedRetrievalState
def expanded_retrieval_graph_builder() -> StateGraph:
graph = StateGraph(
state_schema=ExpandedRetrievalState,
input=ExpandedRetrievalInput,
output=ExpandedRetrievalOutput,
)
### Add nodes ###
graph.add_node(
node="doc_retrieval",
action=doc_retrieval,
)
graph.add_node(
node="verification_kickoff",
action=verification_kickoff,
)
graph.add_node(
node="doc_verification",
action=doc_verification,
)
graph.add_node(
node="doc_reranking",
action=doc_reranking,
)
### Add edges ###
graph.add_conditional_edges(
source=START,
path=parallel_retrieval_edge,
path_map=["doc_retrieval"],
)
graph.add_edge(
start_key="doc_retrieval",
end_key="verification_kickoff",
)
graph.add_edge(
start_key="doc_verification",
end_key="doc_reranking",
)
graph.add_edge(
start_key="doc_reranking",
end_key=END,
)
return graph
if __name__ == "__main__":
from danswer.db.engine import get_session_context_manager
from danswer.llm.factory import get_default_llms
from danswer.context.search.models import SearchRequest
graph = expanded_retrieval_graph_builder()
compiled_graph = graph.compile()
primary_llm, fast_llm = get_default_llms()
search_request = SearchRequest(
query="Who made Excel and what other products did they make?",
)
with get_session_context_manager() as db_session:
inputs = ExpandedRetrievalInput(
search_request=search_request,
primary_llm=primary_llm,
fast_llm=fast_llm,
db_session=db_session,
query_to_answer="Who made Excel?",
)
for thing in compiled_graph.stream(inputs, debug=True):
print(thing)

View File

@@ -0,0 +1,11 @@
from danswer.agent_search.expanded_retrieval.states import DocRerankingOutput
from danswer.agent_search.expanded_retrieval.states import ExpandedRetrievalState
def doc_reranking(state: ExpandedRetrievalState) -> DocRerankingOutput:
print(f"doc_reranking state: {state.keys()}")
verified_documents = state["verified_documents"]
reranked_documents = verified_documents
return DocRerankingOutput(reranked_documents=reranked_documents)

View File

@@ -0,0 +1,47 @@
from danswer.agent_search.expanded_retrieval.states import DocRetrievalOutput
from danswer.agent_search.expanded_retrieval.states import ExpandedRetrievalState
from danswer.context.search.models import InferenceSection
from danswer.context.search.models import SearchRequest
from danswer.context.search.pipeline import SearchPipeline
from danswer.db.engine import get_session_context_manager
class RetrieveInput(ExpandedRetrievalState):
query_to_retrieve: str
def doc_retrieval(state: RetrieveInput) -> DocRetrievalOutput:
# def doc_retrieval(state: RetrieveInput) -> Command[Literal["doc_verification"]]:
"""
Retrieve documents
Args:
state (dict): The current graph state
Returns:
state (dict): New key added to state, documents, that contains retrieved documents
"""
print(f"doc_retrieval state: {state.keys()}")
query_to_retrieve = state["query_to_retrieve"]
documents: list[InferenceSection] = []
llm = state["primary_llm"]
fast_llm = state["fast_llm"]
# db_session = state["db_session"]
with get_session_context_manager() as db_session1:
documents = SearchPipeline(
search_request=SearchRequest(
query=query_to_retrieve,
),
user=None,
llm=llm,
fast_llm=fast_llm,
db_session=db_session1,
).reranked_sections
print(f"retrieved documents: {len(documents)}")
return DocRetrievalOutput(
retrieved_documents=documents,
)

View File

@@ -0,0 +1,60 @@
from langchain_core.messages import HumanMessage
from langchain_core.messages import merge_message_runs
from danswer.agent_search.expanded_retrieval.states import DocVerificationOutput
from danswer.agent_search.expanded_retrieval.states import ExpandedRetrievalState
from danswer.agent_search.shared_graph_utils.models import BinaryDecision
from danswer.agent_search.shared_graph_utils.prompts import VERIFIER_PROMPT
from danswer.context.search.models import InferenceSection
class DocVerificationInput(ExpandedRetrievalState, total=True):
doc_to_verify: InferenceSection
def doc_verification(state: DocVerificationInput) -> DocVerificationOutput:
"""
Check whether the document is relevant for the original user question
Args:
state (DocVerificationInput): The current state
Returns:
dict: The updated state with the verification decision
"""
print(f"doc_verification state: {state.keys()}")
original_query = state["search_request"].query
doc_to_verify = state["doc_to_verify"]
document_content = doc_to_verify.combined_content
msg = [
HumanMessage(
content=VERIFIER_PROMPT.format(
question=original_query, document_content=document_content
)
)
]
fast_llm = state["fast_llm"]
response = list(
fast_llm.stream(
prompt=msg,
)
)
response_string = merge_message_runs(response, chunk_separator="")[0].content
# Convert string response to proper dictionary format
decision_dict = {"decision": response_string.lower()}
formatted_response = BinaryDecision.model_validate(decision_dict)
print(f"Verdict: {formatted_response.decision}")
verified_documents = []
if formatted_response.decision == "yes":
verified_documents.append(doc_to_verify)
return DocVerificationOutput(
verified_documents=verified_documents,
)

View File

@@ -0,0 +1,27 @@
from typing import Literal
from langgraph.types import Command
from langgraph.types import Send
from danswer.agent_search.expanded_retrieval.nodes.doc_verification import (
DocVerificationInput,
)
from danswer.agent_search.expanded_retrieval.states import ExpandedRetrievalState
def verification_kickoff(
state: ExpandedRetrievalState,
) -> Command[Literal["doc_verification"]]:
print(f"verification_kickoff state: {state.keys()}")
documents = state["retrieved_documents"]
return Command(
update={},
goto=[
Send(
node="doc_verification",
arg=DocVerificationInput(doc_to_verify=doc, **state),
)
for doc in documents
],
)

View File

@@ -0,0 +1,36 @@
from typing import Annotated
from typing import TypedDict
from danswer.agent_search.core_state import PrimaryState
from danswer.agent_search.shared_graph_utils.operators import dedup_inference_sections
from danswer.context.search.models import InferenceSection
class DocRetrievalOutput(TypedDict, total=False):
retrieved_documents: Annotated[list[InferenceSection], dedup_inference_sections]
class DocVerificationOutput(TypedDict, total=False):
verified_documents: Annotated[list[InferenceSection], dedup_inference_sections]
class DocRerankingOutput(TypedDict, total=False):
reranked_documents: Annotated[list[InferenceSection], dedup_inference_sections]
class ExpandedRetrievalState(
PrimaryState,
DocRetrievalOutput,
DocVerificationOutput,
DocRerankingOutput,
total=True,
):
query_to_answer: str
class ExpandedRetrievalInput(PrimaryState, total=True):
query_to_answer: str
class ExpandedRetrievalOutput(TypedDict):
reordered_documents: Annotated[list[InferenceSection], dedup_inference_sections]

View File

@@ -0,0 +1,61 @@
from collections.abc import Hashable
from langgraph.types import Send
from danswer.agent_search.answer_query.states import AnswerQueryInput
from danswer.agent_search.main.states import MainState
def parallelize_decompozed_answer_queries(state: MainState) -> list[Send | Hashable]:
return [
Send(
"answer_query",
AnswerQueryInput(
**state,
query_to_answer=query,
),
)
for query in state["initial_decomp_queries"]
]
# def continue_to_answer_sub_questions(state: QAState) -> Union[Hashable, list[Hashable]]:
# # Routes re-written queries to the (parallel) retrieval steps
# # Notice the 'Send()' API that takes care of the parallelization
# return [
# Send(
# "sub_answers_graph",
# ResearchQAState(
# sub_question=sub_question["sub_question_str"],
# sub_question_nr=sub_question["sub_question_nr"],
# graph_start_time=state["graph_start_time"],
# primary_llm=state["primary_llm"],
# fast_llm=state["fast_llm"],
# ),
# )
# for sub_question in state["sub_questions"]
# ]
# def continue_to_deep_answer(state: QAState) -> Union[Hashable, list[Hashable]]:
# print("---GO TO DEEP ANSWER OR END---")
# base_answer = state["base_answer"]
# question = state["original_question"]
# BASE_CHECK_MESSAGE = [
# HumanMessage(
# content=BASE_CHECK_PROMPT.format(question=question, base_answer=base_answer)
# )
# ]
# model = state["fast_llm"]
# response = model.invoke(BASE_CHECK_MESSAGE)
# print(f"CAN WE CONTINUE W/O GENERATING A DEEP ANSWER? - {response.pretty_repr()}")
# if response.pretty_repr() == "no":
# return "decompose"
# else:
# return "end"

View File

@@ -0,0 +1,98 @@
from langgraph.graph import END
from langgraph.graph import START
from langgraph.graph import StateGraph
from danswer.agent_search.answer_query.graph_builder import answer_query_graph_builder
from danswer.agent_search.expanded_retrieval.graph_builder import (
expanded_retrieval_graph_builder,
)
from danswer.agent_search.main.edges import parallelize_decompozed_answer_queries
from danswer.agent_search.main.nodes.base_decomp import main_decomp_base
from danswer.agent_search.main.nodes.generate_initial_answer import (
generate_initial_answer,
)
from danswer.agent_search.main.states import MainInput
from danswer.agent_search.main.states import MainState
def main_graph_builder() -> StateGraph:
graph = StateGraph(
state_schema=MainState,
input=MainInput,
)
### Add nodes ###
graph.add_node(
node="base_decomp",
action=main_decomp_base,
)
answer_query_subgraph = answer_query_graph_builder().compile()
graph.add_node(
node="answer_query",
action=answer_query_subgraph,
)
expanded_retrieval_subgraph = expanded_retrieval_graph_builder().compile()
graph.add_node(
node="expanded_retrieval",
action=expanded_retrieval_subgraph,
)
graph.add_node(
node="generate_initial_answer",
action=generate_initial_answer,
)
### Add edges ###
graph.add_edge(
start_key=START,
end_key="expanded_retrieval",
)
graph.add_edge(
start_key=START,
end_key="base_decomp",
)
graph.add_conditional_edges(
source="base_decomp",
path=parallelize_decompozed_answer_queries,
path_map=["answer_query"],
)
graph.add_edge(
start_key=["answer_query", "expanded_retrieval"],
end_key="generate_initial_answer",
)
graph.add_edge(
start_key="generate_initial_answer",
end_key=END,
)
return graph
if __name__ == "__main__":
from danswer.db.engine import get_session_context_manager
from danswer.llm.factory import get_default_llms
from danswer.context.search.models import SearchRequest
graph = main_graph_builder()
compiled_graph = graph.compile()
primary_llm, fast_llm = get_default_llms()
search_request = SearchRequest(
query="If i am familiar with the function that I need, how can I type it into a cell?",
)
with get_session_context_manager() as db_session:
inputs = MainInput(
search_request=search_request,
primary_llm=primary_llm,
fast_llm=fast_llm,
db_session=db_session,
)
for thing in compiled_graph.stream(
input=inputs,
# stream_mode="debug",
# debug=True,
subgraphs=True,
):
# print(thing)
print()
print()

View File

@@ -0,0 +1,31 @@
from langchain_core.messages import HumanMessage
from danswer.agent_search.main.states import BaseDecompOutput
from danswer.agent_search.main.states import MainState
from danswer.agent_search.shared_graph_utils.prompts import INITIAL_DECOMPOSITION_PROMPT
from danswer.agent_search.shared_graph_utils.utils import clean_and_parse_list_string
def main_decomp_base(state: MainState) -> BaseDecompOutput:
question = state["search_request"].query
msg = [
HumanMessage(
content=INITIAL_DECOMPOSITION_PROMPT.format(question=question),
)
]
# Get the rewritten queries in a defined format
model = state["fast_llm"]
response = model.invoke(msg)
content = response.pretty_repr()
list_of_subquestions = clean_and_parse_list_string(content)
decomp_list: list[str] = [
sub_question["sub_question"].strip() for sub_question in list_of_subquestions
]
return BaseDecompOutput(
initial_decomp_queries=decomp_list,
)

View File

@@ -0,0 +1,53 @@
from langchain_core.messages import HumanMessage
from danswer.agent_search.main.states import InitialAnswerOutput
from danswer.agent_search.main.states import MainState
from danswer.agent_search.shared_graph_utils.prompts import INITIAL_RAG_PROMPT
from danswer.agent_search.shared_graph_utils.utils import format_docs
def generate_initial_answer(state: MainState) -> InitialAnswerOutput:
print("---GENERATE INITIAL---")
question = state["search_request"].query
docs = state["documents"]
decomp_answer_results = state["decomp_answer_results"]
good_qa_list: list[str] = []
_SUB_QUESTION_ANSWER_TEMPLATE = """
Sub-Question:\n - {sub_question}\n --\nAnswer:\n - {sub_answer}\n\n
"""
for decomp_answer_result in decomp_answer_results:
if (
decomp_answer_result.quality.lower() == "yes"
and len(decomp_answer_result.answer) > 0
and decomp_answer_result.answer != "I don't know"
):
good_qa_list.append(
_SUB_QUESTION_ANSWER_TEMPLATE.format(
sub_question=decomp_answer_result.query,
sub_answer=decomp_answer_result.answer,
)
)
sub_question_answer_str = "\n\n------\n\n".join(good_qa_list)
msg = [
HumanMessage(
content=INITIAL_RAG_PROMPT.format(
question=question,
context=format_docs(docs),
answered_sub_questions=sub_question_answer_str,
)
)
]
# Grader
model = state["fast_llm"]
response = model.invoke(msg)
answer = response.pretty_repr()
print(answer)
return InitialAnswerOutput(initial_answer=answer)

View File

@@ -0,0 +1,37 @@
from operator import add
from typing import Annotated
from typing import TypedDict
from danswer.agent_search.answer_query.states import SearchAnswerResults
from danswer.agent_search.core_state import PrimaryState
from danswer.agent_search.shared_graph_utils.operators import dedup_inference_sections
from danswer.context.search.models import InferenceSection
class BaseDecompOutput(TypedDict, total=False):
initial_decomp_queries: list[str]
class InitialAnswerOutput(TypedDict, total=False):
initial_answer: str
class MainState(
PrimaryState,
BaseDecompOutput,
InitialAnswerOutput,
total=True,
):
documents: Annotated[list[InferenceSection], dedup_inference_sections]
decomp_answer_results: Annotated[list[SearchAnswerResults], add]
class MainInput(PrimaryState, total=True):
pass
class MainOutput(TypedDict):
"""
This is not used: defining the output only matters for filtering the result of
a .invoke() call, and since we are streaming we just yield the entire state.
"""

View File

@@ -0,0 +1,27 @@
from danswer.agent_search.primary_graph.graph_builder import build_core_graph
from danswer.llm.answering.answer import AnswerStream
from danswer.llm.interfaces import LLM
from danswer.tools.tool import Tool
def run_graph(
query: str,
llm: LLM,
tools: list[Tool],
) -> AnswerStream:
graph = build_core_graph()
inputs = {
"original_query": query,
"messages": [],
"tools": tools,
"llm": llm,
}
compiled_graph = graph.compile()
output = compiled_graph.invoke(input=inputs)
yield from output
if __name__ == "__main__":
pass
# run_graph("What is the capital of France?", llm, [])

View File

@@ -0,0 +1,12 @@
from typing import Literal
from pydantic import BaseModel
# Pydantic models for structured outputs
class RewrittenQueries(BaseModel):
rewritten_queries: list[str]
class BinaryDecision(BaseModel):
decision: Literal["yes", "no"]
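
BinaryDecision is the model that the doc_verification node above coerces the raw verifier reply into via model_validate. A minimal sketch of that step, with a made-up LLM reply:

from danswer.agent_search.shared_graph_utils.models import BinaryDecision

raw_llm_reply = "Yes"  # made-up value standing in for the verifier LLM's output
decision = BinaryDecision.model_validate({"decision": raw_llm_reply.strip().lower()})
assert decision.decision == "yes"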

View File

@@ -0,0 +1,9 @@
from danswer.context.search.models import InferenceSection
from danswer.llm.answering.prune_and_merge import _merge_sections
def dedup_inference_sections(
list1: list[InferenceSection], list2: list[InferenceSection]
) -> list[InferenceSection]:
deduped = _merge_sections(list1 + list2)
return deduped

View File

@@ -0,0 +1,427 @@
REWRITE_PROMPT_MULTI_ORIGINAL = """ \n
Please convert an initial user question into 2-3 more appropriate, short and pointed search queries for retrieval from a
document store. In particular, try to resolve ambiguities and make the search queries more specific,
enabling the system to search more broadly.
Also, try to make the search queries not redundant, i.e. not too similar! \n\n
Here is the initial question:
\n ------- \n
{question}
\n ------- \n
Formulate the queries separated by '--' (Do not say 'Query 1: ...', just write the query text): """
REWRITE_PROMPT_MULTI = """ \n
Please create a list of 2-3 sample documents that could answer an original question. Each document
should be about as long as the original question. \n
Here is the initial question:
\n ------- \n
{question}
\n ------- \n
Formulate the sample documents separated by '--' (Do not say 'Document 1: ...', just write the text): """
BASE_RAG_PROMPT = """ \n
You are an assistant for question-answering tasks. Use the context provided below - and only the
provided context - to answer the question. If you don't know the answer or if the provided context is
empty, just say "I don't know". Do not use your internal knowledge!
Again, only use the provided context and do not use your internal knowledge! If you cannot answer the
question based on the context, say "I don't know". It is a matter of life and death that you do NOT
use your internal knowledge, just the provided information!
Use three sentences maximum and keep the answer concise.
\nQuestion:\n {question} \nContext:\n {context} \n\n
\n\n
Answer:"""
BASE_CHECK_PROMPT = """ \n
Please check whether 1) the suggested answer seems to fully address the original question AND 2) the
original question requests a simple, factual answer, and there are no ambiguities, judgements,
aggregations, or any other complications that may require extra context. (I.e., if the question is
somewhat addressed, but the answer would benefit from more context, then answer with 'no'.)
Please only answer with 'yes' or 'no' \n
Here is the initial question:
\n ------- \n
{question}
\n ------- \n
Here is the proposed answer:
\n ------- \n
{base_answer}
\n ------- \n
Please answer with yes or no:"""
VERIFIER_PROMPT = """ \n
Please check whether the document seems to be relevant for the answer of the question. Please
only answer with 'yes' or 'no' \n
Here is the initial question:
\n ------- \n
{question}
\n ------- \n
Here is the document text:
\n ------- \n
{document_content}
\n ------- \n
Please answer with yes or no:"""
INITIAL_DECOMPOSITION_PROMPT_BASIC = """ \n
Please decompose an initial user question into not more than 4 appropriate sub-questions that help to
answer the original question. The purpose for this decomposition is to isolate individual entities
(i.e., 'compare sales of company A and company B' -> 'what are sales for company A' + 'what are sales
for company B'), split ambiguous terms (i.e., 'what is our success with company A' -> 'what are our
sales with company A' + 'what is our market share with company A' + 'is company A a reference customer
for us'), etc. Each sub-question should realistically be answerable by a good RAG system. \n
Here is the initial question:
\n ------- \n
{question}
\n ------- \n
Please formulate your answer as a list of subquestions:
Answer:
"""
REWRITE_PROMPT_SINGLE = """ \n
Please convert an initial user question into a more appropriate search query for retrieval from a
document store. \n
Here is the initial question:
\n ------- \n
{question}
\n ------- \n
Formulate the query: """
MODIFIED_RAG_PROMPT = """You are an assistant for question-answering tasks. Use the context provided below
- and only this context - to answer the question. If you don't know the answer, just say "I don't know".
Use three sentences maximum and keep the answer concise.
Also pay particular attention to the sub-questions and their answers, as they may enrich the answer.
Again, only use the provided context and do not use your internal knowledge! If you cannot answer the
question based on the context, say "I don't know". It is a matter of life and death that you do NOT
use your internal knowledge, just the provided information!
\nQuestion: {question}
\nContext: {combined_context} \n
Answer:"""
ORIG_DEEP_DECOMPOSE_PROMPT = """ \n
An initial user question needs to be answered. An initial answer has been provided but it wasn't quite
good enough. Also, some sub-questions had been answered and this information has been used to provide
the initial answer. Some other sub-questions may have been suggested based on little knowledge, but they
were not directly answerable. Also, some entities, relationships and terms are given to you so that
you have an idea of what the available data looks like.
Your role is to generate 3-5 new sub-questions that would help to answer the initial question,
considering:
1) The initial question
2) The initial answer that was found to be unsatisfactory
3) The sub-questions that were answered
4) The sub-questions that were suggested but not answered
5) The entities, relationships and terms that were extracted from the context
The individual questions should be answerable by a good RAG system.
So a good idea would be to use the sub-questions to resolve ambiguities and/or to separate the
question for different entities that may be involved in the original question, but in a way that does
not duplicate questions that were already tried.
Additional Guidelines:
- The sub-questions should be specific to the question and provide richer context for the question,
resolve ambiguities, or address shortcomings of the initial answer
- Each sub-question - when answered - should be relevant for the answer to the original question
- The sub-questions should be free from comparisons, ambiguities, judgements, aggregations, or any
other complications that may require extra context.
- The sub-questions MUST have the full context of the original question so that each can be executed by
a RAG system independently without the original question available
(Example:
- initial question: "What is the capital of France?"
- bad sub-question: "What is the name of the river there?"
- good sub-question: "What is the name of the river that flows through Paris?"
- For each sub-question, please provide a short explanation for why it is a good sub-question. So
generate a list of dictionaries with the following format:
[{{"sub_question": <sub-question>, "explanation": <explanation>, "search_term": <rewrite the
sub-question using as a search phrase for the document store>}}, ...]
\n\n
Here is the initial question:
\n ------- \n
{question}
\n ------- \n
Here is the initial sub-optimal answer:
\n ------- \n
{base_answer}
\n ------- \n
Here are the sub-questions that were answered:
\n ------- \n
{answered_sub_questions}
\n ------- \n
Here are the sub-questions that were suggested but not answered:
\n ------- \n
{failed_sub_questions}
\n ------- \n
And here are the entities, relationships and terms extracted from the context:
\n ------- \n
{entity_term_extraction_str}
\n ------- \n
Please generate the list of good, fully contextualized sub-questions that would help to address the
main question. Again, please find questions that are NOT overlapping too much with the already answered
sub-questions or those that already were suggested and failed.
In other words - what can we try in addition to what has been tried so far?
Please think through it step by step and then generate the list of json dictionaries with the following
format:
{{"sub_questions": [{{"sub_question": <sub-question>,
"explanation": <explanation>,
"search_term": <rewrite the sub-question using as a search phrase for the document store>}},
...]}} """
DEEP_DECOMPOSE_PROMPT = """ \n
An initial user question needs to be answered. An initial answer has been provided but it wasn't quite
good enough. Also, some sub-questions had been answered and this information has been used to provide
the initial answer. Some other sub-questions may have been suggested based on little knowledge, but they
were not directly answerable. Also, some entities, relationships and terms are given to you so that
you have an idea of what the available data looks like.
Your role is to generate 4-6 new sub-questions that would help to answer the initial question,
considering:
1) The initial question
2) The initial answer that was found to be unsatisfactory
3) The sub-questions that were answered
4) The sub-questions that were suggested but not answered
5) The entities, relationships and terms that were extracted from the context
The individual questions should be answerable by a good RAG system.
So a good idea would be to use the sub-questions to resolve ambiguities and/or to separate the
question for different entities that may be involved in the original question, but in a way that does
not duplicate questions that were already tried.
Additional Guidelines:
- The sub-questions should be specific to the question and provide richer context for the question,
resolve ambiguities, or address shortcomings of the initial answer
- Each sub-question - when answered - should be relevant for the answer to the original question
- The sub-questions should be free from comparisons, ambiguities, judgements, aggregations, or any
other complications that may require extra context.
- The sub-questions MUST have the full context of the original question so that each can be executed by
a RAG system independently without the original question available
(Example:
- initial question: "What is the capital of France?"
- bad sub-question: "What is the name of the river there?"
- good sub-question: "What is the name of the river that flows through Paris?"
- For each sub-question, please also provide a search term that can be used to retrieve relevant
documents from a document store.
\n\n
Here is the initial question:
\n ------- \n
{question}
\n ------- \n
Here is the initial sub-optimal answer:
\n ------- \n
{base_answer}
\n ------- \n
Here are the sub-questions that were answered:
\n ------- \n
{answered_sub_questions}
\n ------- \n
Here are the sub-questions that were suggested but not answered:
\n ------- \n
{failed_sub_questions}
\n ------- \n
And here are the entities, relationships and terms extracted from the context:
\n ------- \n
{entity_term_extraction_str}
\n ------- \n
Please generate the list of good, fully contextualized sub-questions that would help to address the
main question. Again, please find questions that are NOT overlapping too much with the already answered
sub-questions or those that already were suggested and failed.
In other words - what can we try in addition to what has been tried so far?
Generate the list of json dictionaries with the following format:
{{"sub_questions": [{{"sub_question": <sub-question>,
"search_term": <rewrite the sub-question using as a search phrase for the document store>}},
...]}} """
DECOMPOSE_PROMPT = """ \n
For an initial user question, please generate 5-10 individual sub-questions whose answers would help
\n to answer the initial question. The individual questions should be answerable by a good RAG system.
So a good idea would be to \n use the sub-questions to resolve ambiguities and/or to separate the
question for different entities that may be involved in the original question.
In order to arrive at meaningful sub-questions, please also consider the context retrieved from the
document store, expressed as entities, relationships and terms. You can also think about the types
mentioned in brackets
Guidelines:
- The sub-questions should be specific to the question and provide richer context for the question,
and/or resolve ambiguities
- Each sub-question - when answered - should be relevant for the answer to the original question
- The sub-questions should be free from comparisons, ambiguities, judgements, aggregations, or any
other complications that may require extra context.
- The sub-questions MUST have the full context of the original question so that each can be executed by
a RAG system independently without the original question available
(Example:
- initial question: "What is the capital of France?"
- bad sub-question: "What is the name of the river there?"
- good sub-question: "What is the name of the river that flows through Paris?"
- For each sub-question, please provide a short explanation for why it is a good sub-question. So
generate a list of dictionaries with the following format:
[{{"sub_question": <sub-question>, "explanation": <explanation>, "search_term": <rewrite the
sub-question using as a search phrase for the document store>}}, ...]
\n\n
Here is the initial question:
\n ------- \n
{question}
\n ------- \n
And here are the entities, relationships and terms extracted from the context:
\n ------- \n
{entity_term_extraction_str}
\n ------- \n
Please generate the list of good, fully contextualized sub-questions that would help to address the
main question. Don't be too specific unless the original question is specific.
Please think through it step by step and then generate the list of json dictionaries with the following
format:
{{"sub_questions": [{{"sub_question": <sub-question>,
"explanation": <explanation>,
"search_term": <rewrite the sub-question using as a search phrase for the document store>}},
...]}} """
#### Consolidations
COMBINED_CONTEXT = """-------
Below you will find useful information to answer the original question. First, you see a number of
sub-questions with their answers. This information should be considered to be more focussed and
somewhat more specific to the original question as it tries to contextualize facts.
After that you will see the documents that were considered to be relevant to answer the original question.
Here are the sub-questions and their answers:
\n\n {deep_answer_context} \n\n
\n\n Here are the documents that were considered to be relevant to answer the original question:
\n\n {formated_docs} \n\n
----------------
"""
SUB_QUESTION_EXPLANATION_RANKER_PROMPT = """-------
Below you will find a question that we ultimately want to answer (the original question) and a list of
motivations in arbitrary order for generated sub-questions that are supposed to help us answer the
original question. The motivations are formatted as <motivation number>: <motivation explanation>.
(Again, the numbering is arbitrary and does not necessarily mean that 1 is the most relevant
motivation and 2 is less relevant.)
Please rank the motivations in order of relevance for answering the original question. Also, try to
ensure that the top questions do not duplicate too much, i.e. that they are not too similar.
Ultimately, create a list of the motivation numbers in which the number of the most relevant
motivation comes first.
Here is the original question:
\n\n {original_question} \n\n
\n\n Here is the list of sub-question motivations:
\n\n {sub_question_explanations} \n\n
----------------
Please think step by step and then generate the ranked list of motivations.
Please format your answer as a json object in the following format:
{{"reasonning": <explain your reasoning for the ranking>,
"ranked_motivations": <ranked list of motivation numbers>}}
"""
INITIAL_DECOMPOSITION_PROMPT = """ \n
Please decompose an initial user question into 2 or 3 appropriate sub-questions that help to
answer the original question. The purpose for this decomposition is to isolate individual entities
(i.e., 'compare sales of company A and company B' -> 'what are sales for company A' + 'what are sales
for company B'), split ambiguous terms (i.e., 'what is our success with company A' -> 'what are our
sales with company A' + 'what is our market share with company A' + 'is company A a reference customer
for us'), etc. Each sub-question should realistically be answerable by a good RAG system. \n
For each sub-question, please also create one search term that can be used to retrieve relevant
documents from a document store.
Here is the initial question:
\n ------- \n
{question}
\n ------- \n
Please formulate your answer as a list of json objects with the following format:
[{{"sub_question": <sub-question>, "search_term": <search term>}}, ...]
Answer:
"""
INITIAL_RAG_PROMPT = """ \n
You are an assistant for question-answering tasks. Use the information provided below - and only the
provided information - to answer the provided question.
The information provided below consists of:
1) a number of answered sub-questions - these are very important(!) and definitely should be
considered to answer the question.
2) a number of documents that were also deemed relevant for the question.
If you don't know the answer or if the provided information is empty or insufficient, just say
"I don't know". Do not use your internal knowledge!
Again, only use the provided information and do not use your internal knowledge! It is a matter of life
and death that you do NOT use your internal knowledge, just the provided information!
Try to keep your answer concise.
And here is the question and the provided information:
\n
\nQuestion:\n {question}
\nAnswered Sub-questions:\n {answered_sub_questions}
\nContext:\n {context} \n\n
\n\n
Answer:"""
ENTITY_TERM_PROMPT = """ \n
Based on the original question and the context retrieved from a dataset, please generate a list of
entities (e.g. companies, organizations, industries, products, locations, etc.), terms and concepts
(e.g. sales, revenue, etc.) that are relevant for the question, plus their relations to each other.
\n\n
Here is the original question:
\n ------- \n
{question}
\n ------- \n
And here is the context retrieved:
\n ------- \n
{context}
\n ------- \n
Please format your answer as a json object in the following format:
{{"retrieved_entities_relationships": {{
"entities": [{{
"entity_name": <assign a name for the entity>,
"entity_type": <specify a short type name for the entity, such as 'company', 'location',...>
}}],
"relationships": [{{
"name": <assign a name for the relationship>,
"type": <specify a short type name for the relationship, such as 'sales_to', 'is_location_of',...>,
"entities": [<related entity name 1>, <related entity name 2>]
}}],
"terms": [{{
"term_name": <assign a name for the term>,
"term_type": <specify a short type name for the term, such as 'revenue', 'market_share',...>,
"similar_to": <list terms that are similar to this term>
}}]
}}
}}
"""

View File

@@ -0,0 +1,101 @@
import ast
import json
import re
from collections.abc import Sequence
from datetime import datetime
from datetime import timedelta
from typing import Any
from danswer.context.search.models import InferenceSection
def normalize_whitespace(text: str) -> str:
"""Normalize whitespace in text to single spaces and strip leading/trailing whitespace."""
return re.sub(r"\s+", " ", text.strip())
# Post-processing
def format_docs(docs: Sequence[InferenceSection]) -> str:
return "\n\n".join(doc.combined_content for doc in docs)
def clean_and_parse_list_string(json_string: str) -> list[dict]:
# Remove any prefixes/labels before the actual JSON content
json_string = re.sub(r"^.*?(?=\[)", "", json_string, flags=re.DOTALL)
# Remove markdown code block markers and any newline prefixes
cleaned_string = re.sub(r"```json\n|\n```", "", json_string)
cleaned_string = cleaned_string.replace("\\n", " ").replace("\n", " ")
cleaned_string = " ".join(cleaned_string.split())
# Try parsing with json.loads first, fall back to ast.literal_eval
try:
return json.loads(cleaned_string)
except json.JSONDecodeError:
try:
return ast.literal_eval(cleaned_string)
except (ValueError, SyntaxError) as e:
raise ValueError(f"Failed to parse JSON string: {cleaned_string}") from e
def clean_and_parse_json_string(json_string: str) -> dict[str, Any]:
# Remove markdown code block markers and any newline prefixes
cleaned_string = re.sub(r"```json\n|\n```", "", json_string)
cleaned_string = cleaned_string.replace("\\n", " ").replace("\n", " ")
cleaned_string = " ".join(cleaned_string.split())
# Parse the cleaned string into a Python dictionary
return json.loads(cleaned_string)
def format_entity_term_extraction(entity_term_extraction_dict: dict[str, Any]) -> str:
entities = entity_term_extraction_dict["entities"]
terms = entity_term_extraction_dict["terms"]
relationships = entity_term_extraction_dict["relationships"]
entity_strs = ["\nEntities:\n"]
for entity in entities:
entity_str = f"{entity['entity_name']} ({entity['entity_type']})"
entity_strs.append(entity_str)
entity_str = "\n - ".join(entity_strs)
relationship_strs = ["\n\nRelationships:\n"]
for relationship in relationships:
relationship_str = f"{relationship['name']} ({relationship['type']}): {relationship['entities']}"
relationship_strs.append(relationship_str)
relationship_str = "\n - ".join(relationship_strs)
term_strs = ["\n\nTerms:\n"]
for term in terms:
term_str = f"{term['term_name']} ({term['term_type']}): similar to {term['similar_to']}"
term_strs.append(term_str)
term_str = "\n - ".join(term_strs)
return "\n".join([entity_str, relationship_str, term_str])
def _format_time_delta(time: timedelta) -> str:
seconds_from_start = f"{((time).seconds):03d}"
microseconds_from_start = f"{((time).microseconds):06d}"
return f"{seconds_from_start}.{microseconds_from_start}"
def generate_log_message(
message: str,
node_start_time: datetime,
graph_start_time: datetime | None = None,
) -> str:
current_time = datetime.now()
if graph_start_time is not None:
graph_time_str = _format_time_delta(current_time - graph_start_time)
else:
graph_time_str = "N/A"
node_time_str = _format_time_delta(current_time - node_start_time)
return f"{graph_time_str} ({node_time_str} s): {message}"
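
For reference, format_entity_term_extraction above expects the inner structure that ENTITY_TERM_PROMPT asks the model to emit (the value stored under "retrieved_entities_relationships"). A small hand-written example, with invented data purely for illustration:

from danswer.agent_search.shared_graph_utils.utils import format_entity_term_extraction

sample_extraction = {
    "entities": [
        {"entity_name": "Microsoft", "entity_type": "company"},
        {"entity_name": "Excel", "entity_type": "product"},
    ],
    "relationships": [
        {"name": "makes", "type": "is_maker_of", "entities": ["Microsoft", "Excel"]},
    ],
    "terms": [
        {
            "term_name": "spreadsheet",
            "term_type": "product_category",
            "similar_to": ["worksheet"],
        },
    ],
}

# Produces the "Entities / Relationships / Terms" text block that the
# deep decomposition prompt consumes.
print(format_entity_term_extraction(sample_extraction))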

View File

@@ -2,8 +2,8 @@ from typing import cast
from danswer.configs.constants import KV_USER_STORE_KEY
from danswer.key_value_store.factory import get_kv_store
from danswer.key_value_store.interface import JSON_ro
from danswer.key_value_store.interface import KvKeyNotFoundError
from danswer.utils.special_types import JSON_ro
def get_invited_users() -> list[str]:

View File

@@ -23,7 +23,9 @@ def load_no_auth_user_preferences(store: KeyValueStore) -> UserPreferences:
)
return UserPreferences(**preferences_data)
except KvKeyNotFoundError:
return UserPreferences(chosen_assistants=None, default_model=None)
return UserPreferences(
chosen_assistants=None, default_model=None, auto_scroll=True
)
def fetch_no_auth_user(store: KeyValueStore) -> UserInfo:

View File

@@ -13,12 +13,24 @@ class UserRole(str, Enum):
groups they are curators of
- Global Curator can perform admin actions
for all groups they are a member of
- Limited can access a limited set of basic api endpoints
- Slack users are users who have used danswer via Slack but don't have a web login
- External permissioned users are users who were picked up during the external permissions sync process but don't have a web login
"""
LIMITED = "limited"
BASIC = "basic"
ADMIN = "admin"
CURATOR = "curator"
GLOBAL_CURATOR = "global_curator"
SLACK_USER = "slack_user"
EXT_PERM_USER = "ext_perm_user"
def is_web_login(self) -> bool:
return self not in [
UserRole.SLACK_USER,
UserRole.EXT_PERM_USER,
]
class UserStatus(str, Enum):
@@ -33,10 +45,8 @@ class UserRead(schemas.BaseUser[uuid.UUID]):
class UserCreate(schemas.BaseUserCreate):
role: UserRole = UserRole.BASIC
has_web_login: bool | None = True
tenant_id: str | None = None
class UserUpdate(schemas.BaseUserUpdate):
role: UserRole
has_web_login: bool | None = True

View File

@@ -49,8 +49,7 @@ from httpx_oauth.oauth2 import BaseOAuth2
from httpx_oauth.oauth2 import OAuth2Token
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.orm import attributes
from sqlalchemy.orm import Session
from sqlalchemy.ext.asyncio import AsyncSession
from danswer.auth.api_key import get_hashed_api_key_from_request
from danswer.auth.invited_users import get_invited_users
@@ -81,13 +80,14 @@ from danswer.db.auth import get_default_admin_user_emails
from danswer.db.auth import get_user_count
from danswer.db.auth import get_user_db
from danswer.db.auth import SQLAlchemyUserAdminDB
from danswer.db.engine import get_async_session
from danswer.db.engine import get_async_session_with_tenant
from danswer.db.engine import get_session
from danswer.db.engine import get_session_with_tenant
from danswer.db.models import AccessToken
from danswer.db.models import OAuthAccount
from danswer.db.models import User
from danswer.db.users import get_user_by_email
from danswer.server.utils import BasicAuthenticationError
from danswer.utils.logger import setup_logger
from danswer.utils.telemetry import optional_telemetry
from danswer.utils.telemetry import RecordType
@@ -100,11 +100,6 @@ from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
logger = setup_logger()
class BasicAuthenticationError(HTTPException):
def __init__(self, detail: str):
super().__init__(status_code=status.HTTP_403_FORBIDDEN, detail=detail)
def is_user_admin(user: User | None) -> bool:
if AUTH_TYPE == AuthType.DISABLED:
return True
@@ -222,18 +217,25 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
reset_password_token_secret = USER_AUTH_SECRET
verification_token_secret = USER_AUTH_SECRET
user_db: SQLAlchemyUserDatabase[User, uuid.UUID]
async def create(
self,
user_create: schemas.UC | UserCreate,
safe: bool = False,
request: Optional[Request] = None,
) -> User:
referral_source = None
if request is not None:
referral_source = request.cookies.get("referral_source", None)
tenant_id = await fetch_ee_implementation_or_noop(
"danswer.server.tenants.provisioning",
"get_or_create_tenant_id",
async_return_default_schema,
)(
email=user_create.email,
referral_source=referral_source,
)
async with get_async_session_with_tenant(tenant_id) as db_session:
@@ -242,7 +244,9 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
verify_email_is_invited(user_create.email)
verify_email_domain(user_create.email)
if MULTI_TENANT:
tenant_user_db = SQLAlchemyUserAdminDB(db_session, User, OAuthAccount)
tenant_user_db = SQLAlchemyUserAdminDB[User, uuid.UUID](
db_session, User, OAuthAccount
)
self.user_db = tenant_user_db
self.database = tenant_user_db
@@ -261,14 +265,9 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
except exceptions.UserAlreadyExists:
user = await self.get_by_email(user_create.email)
# Handle case where user has used product outside of web and is now creating an account through web
if (
not user.has_web_login
and hasattr(user_create, "has_web_login")
and user_create.has_web_login
):
if not user.role.is_web_login() and user_create.role.is_web_login():
user_update = UserUpdate(
password=user_create.password,
has_web_login=True,
role=user_create.role,
is_verified=user_create.is_verified,
)
@@ -282,7 +281,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
return user
async def oauth_callback(
self: "BaseUserManager[models.UOAP, models.ID]",
self,
oauth_name: str,
access_token: str,
account_id: str,
@@ -293,13 +292,18 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
*,
associate_by_email: bool = False,
is_verified_by_default: bool = False,
) -> models.UOAP:
) -> User:
referral_source = None
if request:
referral_source = getattr(request.state, "referral_source", None)
tenant_id = await fetch_ee_implementation_or_noop(
"danswer.server.tenants.provisioning",
"get_or_create_tenant_id",
async_return_default_schema,
)(
email=account_email,
referral_source=referral_source,
)
if not tenant_id:
@@ -314,9 +318,11 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
verify_email_domain(account_email)
if MULTI_TENANT:
tenant_user_db = SQLAlchemyUserAdminDB(db_session, User, OAuthAccount)
tenant_user_db = SQLAlchemyUserAdminDB[User, uuid.UUID](
db_session, User, OAuthAccount
)
self.user_db = tenant_user_db
self.database = tenant_user_db # type: ignore
self.database = tenant_user_db
oauth_account_dict = {
"oauth_name": oauth_name,
@@ -368,7 +374,11 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
and existing_oauth_account.oauth_name == oauth_name
):
user = await self.user_db.update_oauth_account(
user, existing_oauth_account, oauth_account_dict
user,
# NOTE: OAuthAccount DOES implement the OAuthAccountProtocol
# but the type checker doesn't know that :(
existing_oauth_account, # type: ignore
oauth_account_dict,
)
# NOTE: Most IdPs have very short expiry times, and we don't want to force the user to
@@ -381,16 +391,15 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
)
# Handle case where user has used product outside of web and is now creating an account through web
if not user.has_web_login: # type: ignore
if not user.role.is_web_login():
await self.user_db.update(
user,
{
"is_verified": is_verified_by_default,
"has_web_login": True,
"role": UserRole.BASIC,
},
)
user.is_verified = is_verified_by_default
user.has_web_login = True # type: ignore
# this is needed if an organization goes from `TRACK_EXTERNAL_IDP_EXPIRY=true` to `false`
# otherwise, the oidc expiry will always be old, and the user will never be able to log in
@@ -465,9 +474,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
self.password_helper.hash(credentials.password)
return None
has_web_login = attributes.get_attribute(user, "has_web_login")
if not has_web_login:
if not user.role.is_web_login():
raise BasicAuthenticationError(
detail="NO_WEB_LOGIN_AND_HAS_NO_PASSWORD",
)
@@ -598,7 +605,7 @@ optional_fastapi_current_user = fastapi_users.current_user(active=True, optional
async def optional_user_(
request: Request,
user: User | None,
db_session: Session,
async_db_session: AsyncSession,
) -> User | None:
"""NOTE: `request` and `db_session` are not used here, but are included
for the EE version of this function."""
@@ -607,13 +614,21 @@ async def optional_user_(
async def optional_user(
request: Request,
db_session: Session = Depends(get_session),
async_db_session: AsyncSession = Depends(get_async_session),
user: User | None = Depends(optional_fastapi_current_user),
) -> User | None:
versioned_fetch_user = fetch_versioned_implementation(
"danswer.auth.users", "optional_user_"
)
return await versioned_fetch_user(request, user, db_session)
user = await versioned_fetch_user(request, user, async_db_session)
# check if an API key is present
if user is None:
hashed_api_key = get_hashed_api_key_from_request(request)
if hashed_api_key:
user = await fetch_user_for_api_key(hashed_api_key, async_db_session)
return user
async def double_check_user(
@@ -652,12 +667,26 @@ async def current_user_with_expired_token(
return await double_check_user(user, include_expired=True)
async def current_user(
async def current_limited_user(
user: User | None = Depends(optional_user),
) -> User | None:
return await double_check_user(user)
async def current_user(
user: User | None = Depends(optional_user),
) -> User | None:
user = await double_check_user(user)
if not user:
return None
if user.role == UserRole.LIMITED:
raise BasicAuthenticationError(
detail="Access denied. User role is LIMITED. BASIC or higher permissions are required.",
)
return user
async def current_curator_or_admin_user(
user: User | None = Depends(current_user),
) -> User | None:
@@ -711,8 +740,6 @@ def generate_state_token(
# refer to https://github.com/fastapi-users/fastapi-users/blob/42ddc241b965475390e2bce887b084152ae1a2cd/fastapi_users/fastapi_users.py#L91
def create_danswer_oauth_router(
oauth_client: BaseOAuth2,
backend: AuthenticationBackend,
@@ -762,15 +789,22 @@ def get_oauth_router(
response_model=OAuth2AuthorizeResponse,
)
async def authorize(
request: Request, scopes: List[str] = Query(None)
request: Request,
scopes: List[str] = Query(None),
) -> OAuth2AuthorizeResponse:
referral_source = request.cookies.get("referral_source", None)
if redirect_url is not None:
authorize_redirect_url = redirect_url
else:
authorize_redirect_url = str(request.url_for(callback_route_name))
next_url = request.query_params.get("next", "/")
state_data: Dict[str, str] = {"next_url": next_url}
state_data: Dict[str, str] = {
"next_url": next_url,
"referral_source": referral_source or "default_referral",
}
state = generate_state_token(state_data, state_secret)
authorization_url = await oauth_client.get_authorization_url(
authorize_redirect_url,
@@ -829,8 +863,11 @@ def get_oauth_router(
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST)
next_url = state_data.get("next_url", "/")
referral_source = state_data.get("referral_source", None)
# Authenticate user
request.state.referral_source = referral_source
# Proceed to authenticate or create the user
try:
user = await user_manager.oauth_callback(
oauth_client.name,
@@ -872,14 +909,13 @@ def get_oauth_router(
redirect_response.status_code = response.status_code
if hasattr(response, "media_type"):
redirect_response.media_type = response.media_type
return redirect_response
return router
def api_key_dep(
request: Request, db_session: Session = Depends(get_session)
async def api_key_dep(
request: Request, async_db_session: AsyncSession = Depends(get_async_session)
) -> User | None:
if AUTH_TYPE == AuthType.DISABLED:
return None
@@ -889,7 +925,7 @@ def api_key_dep(
raise HTTPException(status_code=401, detail="Missing API key")
if hashed_api_key:
user = fetch_user_for_api_key(hashed_api_key, db_session)
user = await fetch_user_for_api_key(hashed_api_key, async_db_session)
if user is None:
raise HTTPException(status_code=401, detail="Invalid API key")

View File

@@ -11,6 +11,7 @@ from celery.exceptions import WorkerShutdown
from celery.states import READY_STATES
from celery.utils.log import get_task_logger
from celery.worker import strategy # type: ignore
from redis.lock import Lock as RedisLock
from sentry_sdk.integrations.celery import CeleryIntegration
from sqlalchemy import text
from sqlalchemy.orm import Session
@@ -24,6 +25,8 @@ from danswer.document_index.vespa_constants import VESPA_CONFIG_SERVER_URL
from danswer.redis.redis_connector import RedisConnector
from danswer.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
from danswer.redis.redis_connector_delete import RedisConnectorDelete
from danswer.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync
from danswer.redis.redis_connector_ext_group_sync import RedisConnectorExternalGroupSync
from danswer.redis.redis_connector_prune import RedisConnectorPrune
from danswer.redis.redis_document_set import RedisDocumentSet
from danswer.redis.redis_pool import get_redis_client
@@ -136,6 +139,22 @@ def on_task_postrun(
RedisConnectorPrune.remove_from_taskset(int(cc_pair_id), task_id, r)
return
if task_id.startswith(RedisConnectorPermissionSync.SUBTASK_PREFIX):
cc_pair_id = RedisConnector.get_id_from_task_id(task_id)
if cc_pair_id is not None:
RedisConnectorPermissionSync.remove_from_taskset(
int(cc_pair_id), task_id, r
)
return
if task_id.startswith(RedisConnectorExternalGroupSync.SUBTASK_PREFIX):
cc_pair_id = RedisConnector.get_id_from_task_id(task_id)
if cc_pair_id is not None:
RedisConnectorExternalGroupSync.remove_from_taskset(
int(cc_pair_id), task_id, r
)
return
def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None:
"""The first signal sent on celery worker startup"""
@@ -314,16 +333,16 @@ def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
return
logger.info("Releasing primary worker lock.")
lock = sender.primary_worker_lock
lock: RedisLock = sender.primary_worker_lock
try:
if lock.owned():
try:
lock.release()
sender.primary_worker_lock = None
except Exception as e:
logger.error(f"Failed to release primary worker lock: {e}")
except Exception as e:
logger.error(f"Failed to check if primary worker lock is owned: {e}")
except Exception:
logger.exception("Failed to release primary worker lock")
except Exception:
logger.exception("Failed to check if primary worker lock is owned")
def on_setup_logging(

View File

@@ -12,6 +12,7 @@ from danswer.db.engine import get_all_tenant_ids
from danswer.db.engine import SqlEngine
from danswer.utils.logger import setup_logger
from danswer.utils.variable_functionality import fetch_versioned_implementation
from shared_configs.configs import IGNORED_SYNCING_TENANT_LIST
from shared_configs.configs import MULTI_TENANT
logger = setup_logger(__name__)
@@ -72,6 +73,15 @@ class DynamicTenantScheduler(PersistentScheduler):
logger.info(f"Found {len(existing_tenants)} existing tenants in schedule")
for tenant_id in tenant_ids:
if (
IGNORED_SYNCING_TENANT_LIST
and tenant_id in IGNORED_SYNCING_TENANT_LIST
):
logger.info(
f"Skipping tenant {tenant_id} as it is in the ignored syncing list"
)
continue
if tenant_id not in existing_tenants:
logger.info(f"Processing new tenant: {tenant_id}")

View File

@@ -91,5 +91,7 @@ def on_setup_logging(
celery_app.autodiscover_tasks(
[
"danswer.background.celery.tasks.pruning",
"danswer.background.celery.tasks.doc_permission_syncing",
"danswer.background.celery.tasks.external_group_syncing",
]
)

View File

@@ -6,6 +6,7 @@ from celery import signals
from celery import Task
from celery.signals import celeryd_init
from celery.signals import worker_init
from celery.signals import worker_process_init
from celery.signals import worker_ready
from celery.signals import worker_shutdown
@@ -59,7 +60,7 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")
SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME)
SqlEngine.init_engine(pool_size=8, max_overflow=0)
SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=sender.concurrency)
# Startup checks are not needed in multi-tenant case
if MULTI_TENANT:
@@ -81,6 +82,11 @@ def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
app_base.on_worker_shutdown(sender, **kwargs)
@worker_process_init.connect
def init_worker(**kwargs: Any) -> None:
SqlEngine.reset_engine()
@signals.setup_logging.connect
def on_setup_logging(
loglevel: Any, logfile: Any, format: Any, colorize: Any, **kwargs: Any

View File

@@ -92,5 +92,6 @@ celery_app.autodiscover_tasks(
"danswer.background.celery.tasks.shared",
"danswer.background.celery.tasks.vespa",
"danswer.background.celery.tasks.connector_deletion",
"danswer.background.celery.tasks.doc_permission_syncing",
]
)

View File

@@ -1,5 +1,6 @@
import multiprocessing
from typing import Any
from typing import cast
from celery import bootsteps # type: ignore
from celery import Celery
@@ -10,16 +11,25 @@ from celery.signals import celeryd_init
from celery.signals import worker_init
from celery.signals import worker_ready
from celery.signals import worker_shutdown
from redis.lock import Lock as RedisLock
import danswer.background.celery.apps.app_base as app_base
from danswer.background.celery.apps.app_base import task_logger
from danswer.background.celery.celery_utils import celery_is_worker_primary
from danswer.background.celery.tasks.indexing.tasks import (
get_unfenced_index_attempt_ids,
)
from danswer.configs.constants import CELERY_PRIMARY_WORKER_LOCK_TIMEOUT
from danswer.configs.constants import DanswerRedisLocks
from danswer.configs.constants import POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME
from danswer.db.engine import get_session_with_default_tenant
from danswer.db.engine import SqlEngine
from danswer.db.index_attempt import get_index_attempt
from danswer.db.index_attempt import mark_attempt_canceled
from danswer.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
from danswer.redis.redis_connector_delete import RedisConnectorDelete
from danswer.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync
from danswer.redis.redis_connector_ext_group_sync import RedisConnectorExternalGroupSync
from danswer.redis.redis_connector_index import RedisConnectorIndex
from danswer.redis.redis_connector_prune import RedisConnectorPrune
from danswer.redis.redis_connector_stop import RedisConnectorStop
@@ -29,7 +39,6 @@ from danswer.redis.redis_usergroup import RedisUserGroup
from danswer.utils.logger import setup_logger
from shared_configs.configs import MULTI_TENANT
logger = setup_logger()
celery_app = Celery(__name__)
@@ -89,6 +98,15 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
# by the primary worker. This is unnecessary in the multi tenant scenario
r = get_redis_client(tenant_id=None)
# Log the role and slave count - being connected to a slave or slave count > 0 could be problematic
info: dict[str, Any] = cast(dict, r.info("replication"))
role: str = cast(str, info.get("role"))
connected_slaves: int = info.get("connected_slaves", 0)
logger.info(
f"Redis INFO REPLICATION: role={role} connected_slaves={connected_slaves}"
)
# For the moment, we're assuming that we are the only primary worker
# that should be running.
# TODO: maybe check for or clean up another zombie primary worker if we detect it
@@ -98,9 +116,13 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
# it is planned to use this lock to enforce singleton behavior on the primary
# worker, since the primary worker does redis cleanup on startup, but this isn't
# implemented yet.
lock = r.lock(
# set thread_local=False since we don't control what thread the periodic task might
# reacquire the lock with
lock: RedisLock = r.lock(
DanswerRedisLocks.PRIMARY_WORKER,
timeout=CELERY_PRIMARY_WORKER_LOCK_TIMEOUT,
thread_local=False,
)
logger.info("Primary worker lock: Acquire starting.")
@@ -134,6 +156,27 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
RedisConnectorStop.reset_all(r)
RedisConnectorPermissionSync.reset_all(r)
RedisConnectorExternalGroupSync.reset_all(r)
# mark orphaned index attempts as failed
with get_session_with_default_tenant() as db_session:
unfenced_attempt_ids = get_unfenced_index_attempt_ids(db_session, r)
for attempt_id in unfenced_attempt_ids:
attempt = get_index_attempt(db_session, attempt_id)
if not attempt:
continue
failure_reason = (
f"Canceling leftover index attempt found on startup: "
f"index_attempt={attempt.id} "
f"cc_pair={attempt.connector_credential_pair_id} "
f"search_settings={attempt.search_settings_id}"
)
logger.warning(failure_reason)
mark_attempt_canceled(attempt.id, db_session, failure_reason)
@worker_ready.connect
def on_worker_ready(sender: Any, **kwargs: Any) -> None:
@@ -188,7 +231,7 @@ class HubPeriodicTask(bootsteps.StartStopStep):
if not hasattr(worker, "primary_worker_lock"):
return
lock = worker.primary_worker_lock
lock: RedisLock = worker.primary_worker_lock
r = get_redis_client(tenant_id=None)
@@ -233,6 +276,8 @@ celery_app.autodiscover_tasks(
"danswer.background.celery.tasks.connector_deletion",
"danswer.background.celery.tasks.indexing",
"danswer.background.celery.tasks.periodic",
"danswer.background.celery.tasks.doc_permission_syncing",
"danswer.background.celery.tasks.external_group_syncing",
"danswer.background.celery.tasks.pruning",
"danswer.background.celery.tasks.shared",
"danswer.background.celery.tasks.vespa",

View File

@@ -1,96 +0,0 @@
from datetime import timedelta
from typing import Any
from celery.beat import PersistentScheduler # type: ignore
from celery.utils.log import get_task_logger
from danswer.db.engine import get_all_tenant_ids
from danswer.utils.variable_functionality import fetch_versioned_implementation
logger = get_task_logger(__name__)
class DynamicTenantScheduler(PersistentScheduler):
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self._reload_interval = timedelta(minutes=1)
self._last_reload = self.app.now() - self._reload_interval
def setup_schedule(self) -> None:
super().setup_schedule()
def tick(self) -> float:
retval = super().tick()
now = self.app.now()
if (
self._last_reload is None
or (now - self._last_reload) > self._reload_interval
):
logger.info("Reloading schedule to check for new tenants...")
self._update_tenant_tasks()
self._last_reload = now
return retval
def _update_tenant_tasks(self) -> None:
logger.info("Checking for tenant task updates...")
try:
tenant_ids = get_all_tenant_ids()
tasks_to_schedule = fetch_versioned_implementation(
"danswer.background.celery.tasks.beat_schedule", "get_tasks_to_schedule"
)
new_beat_schedule: dict[str, dict[str, Any]] = {}
current_schedule = getattr(self, "_store", {"entries": {}}).get(
"entries", {}
)
existing_tenants = set()
for task_name in current_schedule.keys():
if "-" in task_name:
existing_tenants.add(task_name.split("-")[-1])
for tenant_id in tenant_ids:
if tenant_id not in existing_tenants:
logger.info(f"Found new tenant: {tenant_id}")
for task in tasks_to_schedule():
task_name = f"{task['name']}-{tenant_id}"
new_task = {
"task": task["task"],
"schedule": task["schedule"],
"kwargs": {"tenant_id": tenant_id},
}
if options := task.get("options"):
new_task["options"] = options
new_beat_schedule[task_name] = new_task
if self._should_update_schedule(current_schedule, new_beat_schedule):
logger.info(
"Updating schedule",
extra={
"new_tasks": len(new_beat_schedule),
"current_tasks": len(current_schedule),
},
)
if not hasattr(self, "_store"):
self._store: dict[str, dict] = {"entries": {}}
self.update_from_dict(new_beat_schedule)
logger.info(f"New schedule: {new_beat_schedule}")
logger.info("Tenant tasks updated successfully")
else:
logger.debug("No schedule updates needed")
except (AttributeError, KeyError):
logger.exception("Failed to process task configuration")
except Exception:
logger.exception("Unexpected error updating tenant tasks")
def _should_update_schedule(
self, current_schedule: dict, new_schedule: dict
) -> bool:
"""Compare schedules to determine if an update is needed."""
current_tasks = set(current_schedule.keys())
new_tasks = set(new_schedule.keys())
return current_tasks != new_tasks

View File

@@ -4,7 +4,6 @@ from typing import Any
from sqlalchemy.orm import Session
from danswer.background.indexing.run_indexing import RunIndexingCallbackInterface
from danswer.configs.app_configs import MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
rate_limit_builder,
@@ -17,6 +16,7 @@ from danswer.connectors.models import Document
from danswer.db.connector_credential_pair import get_connector_credential_pair
from danswer.db.enums import TaskStatus
from danswer.db.models import TaskQueueState
from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from danswer.redis.redis_connector import RedisConnector
from danswer.server.documents.models import DeletionAttemptSnapshot
from danswer.utils.logger import setup_logger
@@ -78,10 +78,10 @@ def document_batch_to_ids(
def extract_ids_from_runnable_connector(
runnable_connector: BaseConnector,
callback: RunIndexingCallbackInterface | None = None,
callback: IndexingHeartbeatInterface | None = None,
) -> set[str]:
"""
If the PruneConnector hasn't been implemented for the given connector, just pull
If the SlimConnector hasn't been implemented for the given connector, just pull
all docs using load_from_state and grab out the IDs.
Optionally, a callback can be passed to handle the length of each document batch.
@@ -111,10 +111,15 @@ def extract_ids_from_runnable_connector(
for doc_batch in doc_batch_generator:
if callback:
if callback.should_stop():
raise RuntimeError("Stop signal received")
callback.progress(len(doc_batch))
raise RuntimeError(
"extract_ids_from_runnable_connector: Stop signal detected"
)
all_connector_doc_ids.update(doc_batch_processing_func(doc_batch))
if callback:
callback.progress("extract_ids_from_runnable_connector", len(doc_batch))
return all_connector_doc_ids
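A minimal sketch of a heartbeat callback that could be passed in, matching the should_stop() and progress(tag, amount) calls above (the class and connector variable below are illustrative, not part of the diff):
from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface

class CountingHeartbeat(IndexingHeartbeatInterface):
    def __init__(self) -> None:
        self.total = 0

    def should_stop(self) -> bool:
        # never request cancellation in this sketch
        return False

    def progress(self, tag: str, amount: int) -> None:
        # accumulate the batch sizes reported by extract_ids_from_runnable_connector
        self.total += amount

# doc_ids = extract_ids_from_runnable_connector(some_connector, CountingHeartbeat())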

View File

@@ -2,45 +2,58 @@ from datetime import timedelta
from typing import Any
from danswer.configs.constants import DanswerCeleryPriority
from danswer.configs.constants import DanswerCeleryTask
tasks_to_schedule = [
{
"name": "check-for-vespa-sync",
"task": "check_for_vespa_sync_task",
"schedule": timedelta(seconds=5),
"task": DanswerCeleryTask.CHECK_FOR_VESPA_SYNC_TASK,
"schedule": timedelta(seconds=20),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
{
"name": "check-for-connector-deletion",
"task": "check_for_connector_deletion_task",
"task": DanswerCeleryTask.CHECK_FOR_CONNECTOR_DELETION,
"schedule": timedelta(seconds=20),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
{
"name": "check-for-indexing",
"task": "check_for_indexing",
"schedule": timedelta(seconds=10),
"task": DanswerCeleryTask.CHECK_FOR_INDEXING,
"schedule": timedelta(seconds=15),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
{
"name": "check-for-prune",
"task": "check_for_pruning",
"schedule": timedelta(seconds=10),
"task": DanswerCeleryTask.CHECK_FOR_PRUNING,
"schedule": timedelta(seconds=15),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
{
"name": "kombu-message-cleanup",
"task": "kombu_message_cleanup_task",
"task": DanswerCeleryTask.KOMBU_MESSAGE_CLEANUP_TASK,
"schedule": timedelta(seconds=3600),
"options": {"priority": DanswerCeleryPriority.LOWEST},
},
{
"name": "monitor-vespa-sync",
"task": "monitor_vespa_sync",
"task": DanswerCeleryTask.MONITOR_VESPA_SYNC,
"schedule": timedelta(seconds=5),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
{
"name": "check-for-doc-permissions-sync",
"task": DanswerCeleryTask.CHECK_FOR_DOC_PERMISSIONS_SYNC,
"schedule": timedelta(seconds=30),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
{
"name": "check-for-external-group-sync",
"task": DanswerCeleryTask.CHECK_FOR_EXTERNAL_GROUP_SYNC,
"schedule": timedelta(seconds=20),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
]
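A small sketch of how one of these entries becomes a per-tenant beat entry, mirroring the DynamicTenantScheduler logic shown elsewhere in this diff (the tenant id is illustrative):
from datetime import timedelta
from danswer.configs.constants import DanswerCeleryPriority
from danswer.configs.constants import DanswerCeleryTask

task = {
    "name": "check-for-indexing",
    "task": DanswerCeleryTask.CHECK_FOR_INDEXING,
    "schedule": timedelta(seconds=15),
    "options": {"priority": DanswerCeleryPriority.HIGH},
}
tenant_id = "tenant_abc"  # hypothetical
new_task = {
    "task": task["task"],
    "schedule": task["schedule"],
    "kwargs": {"tenant_id": tenant_id},
}
if options := task.get("options"):
    new_task["options"] = options
# stored in the beat schedule under f"{task['name']}-{tenant_id}"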

View File

@@ -1,17 +1,17 @@
from datetime import datetime
from datetime import timezone
import redis
from celery import Celery
from celery import shared_task
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from redis import Redis
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.background.celery.apps.app_base import task_logger
from danswer.configs.app_configs import JOB_TIMEOUT
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
from danswer.configs.constants import DanswerCeleryTask
from danswer.configs.constants import DanswerRedisLocks
from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id
from danswer.db.connector_credential_pair import get_connector_credential_pairs
@@ -19,7 +19,7 @@ from danswer.db.engine import get_session_with_tenant
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.search_settings import get_all_search_settings
from danswer.redis.redis_connector import RedisConnector
from danswer.redis.redis_connector_delete import RedisConnectorDeletionFenceData
from danswer.redis.redis_connector_delete import RedisConnectorDeletePayload
from danswer.redis.redis_pool import get_redis_client
@@ -29,7 +29,7 @@ class TaskDependencyError(RuntimeError):
@shared_task(
name="check_for_connector_deletion_task",
name=DanswerCeleryTask.CHECK_FOR_CONNECTOR_DELETION,
soft_time_limit=JOB_TIMEOUT,
trail=False,
bind=True,
@@ -37,7 +37,7 @@ class TaskDependencyError(RuntimeError):
def check_for_connector_deletion_task(self: Task, *, tenant_id: str | None) -> None:
r = get_redis_client(tenant_id=tenant_id)
lock_beat = r.lock(
lock_beat: RedisLock = r.lock(
DanswerRedisLocks.CHECK_CONNECTOR_DELETION_BEAT_LOCK,
timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
)
@@ -60,7 +60,7 @@ def check_for_connector_deletion_task(self: Task, *, tenant_id: str | None) -> N
redis_connector = RedisConnector(tenant_id, cc_pair_id)
try:
try_generate_document_cc_pair_cleanup_tasks(
self.app, cc_pair_id, db_session, r, lock_beat, tenant_id
self.app, cc_pair_id, db_session, lock_beat, tenant_id
)
except TaskDependencyError as e:
# this means we wanted to start deleting but dependent tasks were running
@@ -86,8 +86,7 @@ def try_generate_document_cc_pair_cleanup_tasks(
app: Celery,
cc_pair_id: int,
db_session: Session,
r: Redis,
lock_beat: redis.lock.Lock,
lock_beat: RedisLock,
tenant_id: str | None,
) -> int | None:
"""Returns an int if syncing is needed. The int represents the number of sync tasks generated.
@@ -118,7 +117,7 @@ def try_generate_document_cc_pair_cleanup_tasks(
return None
# set a basic fence to start
fence_payload = RedisConnectorDeletionFenceData(
fence_payload = RedisConnectorDeletePayload(
num_tasks=None,
submitted=datetime.now(timezone.utc),
)
@@ -143,6 +142,12 @@ def try_generate_document_cc_pair_cleanup_tasks(
f"cc_pair={cc_pair_id}"
)
if redis_connector.permissions.fenced:
raise TaskDependencyError(
f"Connector deletion - Delayed (permissions in progress): "
f"cc_pair={cc_pair_id}"
)
# add tasks to celery and build up the task set to monitor in redis
redis_connector.delete.taskset_clear()

View File

@@ -0,0 +1,345 @@
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from uuid import uuid4
from celery import Celery
from celery import shared_task
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from redis import Redis
from redis.lock import Lock as RedisLock
from danswer.access.models import DocExternalAccess
from danswer.background.celery.apps.app_base import task_logger
from danswer.configs.app_configs import JOB_TIMEOUT
from danswer.configs.constants import CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
from danswer.configs.constants import DANSWER_REDIS_FUNCTION_LOCK_PREFIX
from danswer.configs.constants import DanswerCeleryPriority
from danswer.configs.constants import DanswerCeleryQueues
from danswer.configs.constants import DanswerCeleryTask
from danswer.configs.constants import DanswerRedisLocks
from danswer.configs.constants import DocumentSource
from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id
from danswer.db.document import upsert_document_by_connector_credential_pair
from danswer.db.engine import get_session_with_tenant
from danswer.db.enums import AccessType
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.models import ConnectorCredentialPair
from danswer.db.users import batch_add_ext_perm_user_if_not_exists
from danswer.redis.redis_connector import RedisConnector
from danswer.redis.redis_connector_doc_perm_sync import (
RedisConnectorPermissionSyncPayload,
)
from danswer.redis.redis_pool import get_redis_client
from danswer.utils.logger import doc_permission_sync_ctx
from danswer.utils.logger import setup_logger
from ee.danswer.db.connector_credential_pair import get_all_auto_sync_cc_pairs
from ee.danswer.db.document import upsert_document_external_perms
from ee.danswer.external_permissions.sync_params import DOC_PERMISSION_SYNC_PERIODS
from ee.danswer.external_permissions.sync_params import DOC_PERMISSIONS_FUNC_MAP
logger = setup_logger()
DOCUMENT_PERMISSIONS_UPDATE_MAX_RETRIES = 3
# 5 seconds more than RetryDocumentIndex STOP_AFTER+MAX_WAIT
LIGHT_SOFT_TIME_LIMIT = 105
LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15
def _is_external_doc_permissions_sync_due(cc_pair: ConnectorCredentialPair) -> bool:
"""Returns boolean indicating if external doc permissions sync is due."""
if cc_pair.access_type != AccessType.SYNC:
return False
# skip doc permissions sync if not active
if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE:
return False
if cc_pair.status == ConnectorCredentialPairStatus.DELETING:
return False
# If the last sync is None, it has never been run so we run the sync
last_perm_sync = cc_pair.last_time_perm_sync
if last_perm_sync is None:
return True
source_sync_period = DOC_PERMISSION_SYNC_PERIODS.get(cc_pair.connector.source)
# If DOC_PERMISSION_SYNC_PERIODS has no entry for this source, we always run the sync.
if not source_sync_period:
return True
# If the time since the last sync exceeds the full fetch period, we run the sync
next_sync = last_perm_sync + timedelta(seconds=source_sync_period)
if datetime.now(timezone.utc) >= next_sync:
return True
return False
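A short worked example of the due check above (timestamps and period are illustrative):
from datetime import datetime, timedelta, timezone

last_perm_sync = datetime(2024, 12, 1, 12, 0, tzinfo=timezone.utc)
source_sync_period = 3600  # hypothetical one-hour period for this source
next_sync = last_perm_sync + timedelta(seconds=source_sync_period)
now = datetime(2024, 12, 1, 13, 30, tzinfo=timezone.utc)
assert now >= next_sync  # so the permissions sync would be considered due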
@shared_task(
name=DanswerCeleryTask.CHECK_FOR_DOC_PERMISSIONS_SYNC,
soft_time_limit=JOB_TIMEOUT,
bind=True,
)
def check_for_doc_permissions_sync(self: Task, *, tenant_id: str | None) -> None:
r = get_redis_client(tenant_id=tenant_id)
lock_beat = r.lock(
DanswerRedisLocks.CHECK_CONNECTOR_DOC_PERMISSIONS_SYNC_BEAT_LOCK,
timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
)
try:
# these tasks should never overlap
if not lock_beat.acquire(blocking=False):
return
# get all cc pairs that need to be synced
cc_pair_ids_to_sync: list[int] = []
with get_session_with_tenant(tenant_id) as db_session:
cc_pairs = get_all_auto_sync_cc_pairs(db_session)
for cc_pair in cc_pairs:
if _is_external_doc_permissions_sync_due(cc_pair):
cc_pair_ids_to_sync.append(cc_pair.id)
for cc_pair_id in cc_pair_ids_to_sync:
tasks_created = try_creating_permissions_sync_task(
self.app, cc_pair_id, r, tenant_id
)
if not tasks_created:
continue
task_logger.info(f"Doc permissions sync queued: cc_pair={cc_pair_id}")
except SoftTimeLimitExceeded:
task_logger.info(
"Soft time limit exceeded, task is being terminated gracefully."
)
except Exception:
task_logger.exception(f"Unexpected exception: tenant={tenant_id}")
finally:
if lock_beat.owned():
lock_beat.release()
def try_creating_permissions_sync_task(
app: Celery,
cc_pair_id: int,
r: Redis,
tenant_id: str | None,
) -> int | None:
"""Returns an int if syncing is needed. The int represents the number of sync tasks generated.
Returns None if no syncing is required."""
redis_connector = RedisConnector(tenant_id, cc_pair_id)
LOCK_TIMEOUT = 30
lock: RedisLock = r.lock(
DANSWER_REDIS_FUNCTION_LOCK_PREFIX + "try_generate_permissions_sync_tasks",
timeout=LOCK_TIMEOUT,
)
acquired = lock.acquire(blocking_timeout=LOCK_TIMEOUT / 2)
if not acquired:
return None
try:
if redis_connector.permissions.fenced:
return None
if redis_connector.delete.fenced:
return None
if redis_connector.prune.fenced:
return None
redis_connector.permissions.generator_clear()
redis_connector.permissions.taskset_clear()
custom_task_id = f"{redis_connector.permissions.generator_task_key}_{uuid4()}"
result = app.send_task(
DanswerCeleryTask.CONNECTOR_PERMISSION_SYNC_GENERATOR_TASK,
kwargs=dict(
cc_pair_id=cc_pair_id,
tenant_id=tenant_id,
),
queue=DanswerCeleryQueues.CONNECTOR_DOC_PERMISSIONS_SYNC,
task_id=custom_task_id,
priority=DanswerCeleryPriority.HIGH,
)
# set a basic fence to start
payload = RedisConnectorPermissionSyncPayload(
started=None, celery_task_id=result.id
)
redis_connector.permissions.set_fence(payload)
except Exception:
task_logger.exception(f"Unexpected exception: cc_pair={cc_pair_id}")
return None
finally:
if lock.owned():
lock.release()
return 1
@shared_task(
name=DanswerCeleryTask.CONNECTOR_PERMISSION_SYNC_GENERATOR_TASK,
acks_late=False,
soft_time_limit=JOB_TIMEOUT,
track_started=True,
trail=False,
bind=True,
)
def connector_permission_sync_generator_task(
self: Task,
cc_pair_id: int,
tenant_id: str | None,
) -> None:
"""
Permission sync task that handles document permission syncing for a given connector credential pair
This task assumes that the task has already been properly fenced
"""
doc_permission_sync_ctx_dict = doc_permission_sync_ctx.get()
doc_permission_sync_ctx_dict["cc_pair_id"] = cc_pair_id
doc_permission_sync_ctx_dict["request_id"] = self.request.id
doc_permission_sync_ctx.set(doc_permission_sync_ctx_dict)
redis_connector = RedisConnector(tenant_id, cc_pair_id)
r = get_redis_client(tenant_id=tenant_id)
lock = r.lock(
DanswerRedisLocks.CONNECTOR_DOC_PERMISSIONS_SYNC_LOCK_PREFIX
+ f"_{redis_connector.id}",
timeout=CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT,
)
acquired = lock.acquire(blocking=False)
if not acquired:
task_logger.warning(
f"Permission sync task already running, exiting...: cc_pair={cc_pair_id}"
)
return None
try:
with get_session_with_tenant(tenant_id) as db_session:
cc_pair = get_connector_credential_pair_from_id(cc_pair_id, db_session)
if cc_pair is None:
raise ValueError(
f"No connector credential pair found for id: {cc_pair_id}"
)
source_type = cc_pair.connector.source
doc_sync_func = DOC_PERMISSIONS_FUNC_MAP.get(source_type)
if doc_sync_func is None:
raise ValueError(
f"No doc sync func found for {source_type} with cc_pair={cc_pair_id}"
)
logger.info(f"Syncing docs for {source_type} with cc_pair={cc_pair_id}")
payload = redis_connector.permissions.payload
if not payload:
raise ValueError(f"No fence payload found: cc_pair={cc_pair_id}")
payload.started = datetime.now(timezone.utc)
redis_connector.permissions.set_fence(payload)
document_external_accesses: list[DocExternalAccess] = doc_sync_func(cc_pair)
task_logger.info(
f"RedisConnector.permissions.generate_tasks starting. cc_pair={cc_pair_id}"
)
tasks_generated = redis_connector.permissions.generate_tasks(
celery_app=self.app,
lock=lock,
new_permissions=document_external_accesses,
source_string=source_type,
connector_id=cc_pair.connector.id,
credential_id=cc_pair.credential.id,
)
if tasks_generated is None:
return None
task_logger.info(
f"RedisConnector.permissions.generate_tasks finished. "
f"cc_pair={cc_pair_id} tasks_generated={tasks_generated}"
)
redis_connector.permissions.generator_complete = tasks_generated
except Exception as e:
task_logger.exception(f"Failed to run permission sync: cc_pair={cc_pair_id}")
redis_connector.permissions.generator_clear()
redis_connector.permissions.taskset_clear()
redis_connector.permissions.set_fence(None)
raise e
finally:
if lock.owned():
lock.release()
@shared_task(
name=DanswerCeleryTask.UPDATE_EXTERNAL_DOCUMENT_PERMISSIONS_TASK,
soft_time_limit=LIGHT_SOFT_TIME_LIMIT,
time_limit=LIGHT_TIME_LIMIT,
max_retries=DOCUMENT_PERMISSIONS_UPDATE_MAX_RETRIES,
bind=True,
)
def update_external_document_permissions_task(
self: Task,
tenant_id: str | None,
serialized_doc_external_access: dict,
source_string: str,
connector_id: int,
credential_id: int,
) -> bool:
document_external_access = DocExternalAccess.from_dict(
serialized_doc_external_access
)
doc_id = document_external_access.doc_id
external_access = document_external_access.external_access
try:
with get_session_with_tenant(tenant_id) as db_session:
# Add the users to the DB if they don't exist
batch_add_ext_perm_user_if_not_exists(
db_session=db_session,
emails=list(external_access.external_user_emails),
)
# Then we upsert the document's external permissions in postgres
created_new_doc = upsert_document_external_perms(
db_session=db_session,
doc_id=doc_id,
external_access=external_access,
source_type=DocumentSource(source_string),
)
if created_new_doc:
# If a new document was created, we associate it with the cc_pair
upsert_document_by_connector_credential_pair(
db_session=db_session,
connector_id=connector_id,
credential_id=credential_id,
document_ids=[doc_id],
)
logger.debug(
f"Successfully synced postgres document permissions for {doc_id}"
)
return True
except Exception:
logger.exception("Error Syncing Document Permissions")
return False

View File

@@ -0,0 +1,298 @@
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from uuid import uuid4
from celery import Celery
from celery import shared_task
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from redis import Redis
from redis.lock import Lock as RedisLock
from danswer.background.celery.apps.app_base import task_logger
from danswer.configs.app_configs import JOB_TIMEOUT
from danswer.configs.constants import CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
from danswer.configs.constants import DANSWER_REDIS_FUNCTION_LOCK_PREFIX
from danswer.configs.constants import DanswerCeleryPriority
from danswer.configs.constants import DanswerCeleryQueues
from danswer.configs.constants import DanswerCeleryTask
from danswer.configs.constants import DanswerRedisLocks
from danswer.db.connector import mark_cc_pair_as_external_group_synced
from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id
from danswer.db.engine import get_session_with_tenant
from danswer.db.enums import AccessType
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.models import ConnectorCredentialPair
from danswer.redis.redis_connector import RedisConnector
from danswer.redis.redis_connector_ext_group_sync import (
RedisConnectorExternalGroupSyncPayload,
)
from danswer.redis.redis_pool import get_redis_client
from danswer.utils.logger import setup_logger
from ee.danswer.db.connector_credential_pair import get_all_auto_sync_cc_pairs
from ee.danswer.db.connector_credential_pair import get_cc_pairs_by_source
from ee.danswer.db.external_perm import ExternalUserGroup
from ee.danswer.db.external_perm import replace_user__ext_group_for_cc_pair
from ee.danswer.external_permissions.sync_params import EXTERNAL_GROUP_SYNC_PERIODS
from ee.danswer.external_permissions.sync_params import GROUP_PERMISSIONS_FUNC_MAP
from ee.danswer.external_permissions.sync_params import (
GROUP_PERMISSIONS_IS_CC_PAIR_AGNOSTIC,
)
logger = setup_logger()
EXTERNAL_GROUPS_UPDATE_MAX_RETRIES = 3
# 5 seconds more than RetryDocumentIndex STOP_AFTER+MAX_WAIT
LIGHT_SOFT_TIME_LIMIT = 105
LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15
def _is_external_group_sync_due(cc_pair: ConnectorCredentialPair) -> bool:
"""Returns boolean indicating if external group sync is due."""
if cc_pair.access_type != AccessType.SYNC:
return False
# skip external group sync if not active
if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE:
return False
if cc_pair.status == ConnectorCredentialPairStatus.DELETING:
return False
# If there is no group sync function for the connector, we don't run the sync
# This is fine because not all sources have a concept of groups
if not GROUP_PERMISSIONS_FUNC_MAP.get(cc_pair.connector.source):
return False
# If the last sync is None, it has never been run so we run the sync
last_ext_group_sync = cc_pair.last_time_external_group_sync
if last_ext_group_sync is None:
return True
source_sync_period = EXTERNAL_GROUP_SYNC_PERIODS.get(cc_pair.connector.source)
# If EXTERNAL_GROUP_SYNC_PERIODS has no entry for this source, we always run the sync.
if not source_sync_period:
return True
# If the time since the last sync exceeds the full fetch period, we run the sync
next_sync = last_ext_group_sync + timedelta(seconds=source_sync_period)
if datetime.now(timezone.utc) >= next_sync:
return True
return False
@shared_task(
name=DanswerCeleryTask.CHECK_FOR_EXTERNAL_GROUP_SYNC,
soft_time_limit=JOB_TIMEOUT,
bind=True,
)
def check_for_external_group_sync(self: Task, *, tenant_id: str | None) -> None:
r = get_redis_client(tenant_id=tenant_id)
lock_beat = r.lock(
DanswerRedisLocks.CHECK_CONNECTOR_EXTERNAL_GROUP_SYNC_BEAT_LOCK,
timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
)
try:
# these tasks should never overlap
if not lock_beat.acquire(blocking=False):
return
cc_pair_ids_to_sync: list[int] = []
with get_session_with_tenant(tenant_id) as db_session:
cc_pairs = get_all_auto_sync_cc_pairs(db_session)
# We only want to sync one cc_pair per source type in
# GROUP_PERMISSIONS_IS_CC_PAIR_AGNOSTIC
for source in GROUP_PERMISSIONS_IS_CC_PAIR_AGNOSTIC:
# These are ordered by cc_pair id so the first one is the one we want
cc_pairs_to_dedupe = get_cc_pairs_by_source(
db_session, source, only_sync=True
)
# We only want to sync one cc_pair per source type
# in GROUP_PERMISSIONS_IS_CC_PAIR_AGNOSTIC so we dedupe here
for cc_pair_to_remove in cc_pairs_to_dedupe[1:]:
cc_pairs = [
cc_pair
for cc_pair in cc_pairs
if cc_pair.id != cc_pair_to_remove.id
]
for cc_pair in cc_pairs:
if _is_external_group_sync_due(cc_pair):
cc_pair_ids_to_sync.append(cc_pair.id)
for cc_pair_id in cc_pair_ids_to_sync:
tasks_created = try_creating_external_group_sync_task(
self.app, cc_pair_id, r, tenant_id
)
if not tasks_created:
continue
task_logger.info(f"External group sync queued: cc_pair={cc_pair_id}")
except SoftTimeLimitExceeded:
task_logger.info(
"Soft time limit exceeded, task is being terminated gracefully."
)
except Exception:
task_logger.exception(f"Unexpected exception: tenant={tenant_id}")
finally:
if lock_beat.owned():
lock_beat.release()
def try_creating_external_group_sync_task(
app: Celery,
cc_pair_id: int,
r: Redis,
tenant_id: str | None,
) -> int | None:
"""Returns an int if syncing is needed. The int represents the number of sync tasks generated.
Returns None if no syncing is required."""
redis_connector = RedisConnector(tenant_id, cc_pair_id)
LOCK_TIMEOUT = 30
lock = r.lock(
DANSWER_REDIS_FUNCTION_LOCK_PREFIX + "try_generate_external_group_sync_tasks",
timeout=LOCK_TIMEOUT,
)
acquired = lock.acquire(blocking_timeout=LOCK_TIMEOUT / 2)
if not acquired:
return None
try:
# Don't kick off a new sync if the previous one is still running
if redis_connector.external_group_sync.fenced:
return None
redis_connector.external_group_sync.generator_clear()
redis_connector.external_group_sync.taskset_clear()
custom_task_id = f"{redis_connector.external_group_sync.taskset_key}_{uuid4()}"
result = app.send_task(
DanswerCeleryTask.CONNECTOR_EXTERNAL_GROUP_SYNC_GENERATOR_TASK,
kwargs=dict(
cc_pair_id=cc_pair_id,
tenant_id=tenant_id,
),
queue=DanswerCeleryQueues.CONNECTOR_EXTERNAL_GROUP_SYNC,
task_id=custom_task_id,
priority=DanswerCeleryPriority.HIGH,
)
payload = RedisConnectorExternalGroupSyncPayload(
started=datetime.now(timezone.utc),
celery_task_id=result.id,
)
redis_connector.external_group_sync.set_fence(payload)
except Exception:
task_logger.exception(
f"Unexpected exception while trying to create external group sync task: cc_pair={cc_pair_id}"
)
return None
finally:
if lock.owned():
lock.release()
return 1
@shared_task(
name=DanswerCeleryTask.CONNECTOR_EXTERNAL_GROUP_SYNC_GENERATOR_TASK,
acks_late=False,
soft_time_limit=JOB_TIMEOUT,
track_started=True,
trail=False,
bind=True,
)
def connector_external_group_sync_generator_task(
self: Task,
cc_pair_id: int,
tenant_id: str | None,
) -> None:
"""
Permission sync task that handles external group syncing for a given connector credential pair
This task assumes that the task has already been properly fenced
"""
redis_connector = RedisConnector(tenant_id, cc_pair_id)
r = get_redis_client(tenant_id=tenant_id)
lock: RedisLock = r.lock(
DanswerRedisLocks.CONNECTOR_EXTERNAL_GROUP_SYNC_LOCK_PREFIX
+ f"_{redis_connector.id}",
timeout=CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT,
)
try:
acquired = lock.acquire(blocking=False)
if not acquired:
task_logger.warning(
f"External group sync task already running, exiting...: cc_pair={cc_pair_id}"
)
return None
with get_session_with_tenant(tenant_id) as db_session:
cc_pair = get_connector_credential_pair_from_id(cc_pair_id, db_session)
if cc_pair is None:
raise ValueError(
f"No connector credential pair found for id: {cc_pair_id}"
)
source_type = cc_pair.connector.source
ext_group_sync_func = GROUP_PERMISSIONS_FUNC_MAP.get(source_type)
if ext_group_sync_func is None:
raise ValueError(
f"No external group sync func found for {source_type} for cc_pair: {cc_pair_id}"
)
logger.info(
f"Syncing external groups for {source_type} for cc_pair: {cc_pair_id}"
)
external_user_groups: list[ExternalUserGroup] = ext_group_sync_func(cc_pair)
logger.info(
f"Syncing {len(external_user_groups)} external user groups for {source_type}"
)
replace_user__ext_group_for_cc_pair(
db_session=db_session,
cc_pair_id=cc_pair.id,
group_defs=external_user_groups,
source=cc_pair.connector.source,
)
logger.info(
f"Synced {len(external_user_groups)} external user groups for {source_type}"
)
mark_cc_pair_as_external_group_synced(db_session, cc_pair.id)
except Exception as e:
task_logger.exception(
f"Failed to run external group sync: cc_pair={cc_pair_id}"
)
redis_connector.external_group_sync.generator_clear()
redis_connector.external_group_sync.taskset_clear()
raise e
finally:
# we always want to clear the fence after the task is done or failed so it doesn't get stuck
redis_connector.external_group_sync.set_fence(None)
if lock.owned():
lock.release()

View File

@@ -10,41 +10,50 @@ from celery import shared_task
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from redis import Redis
from redis.exceptions import LockError
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.background.celery.apps.app_base import task_logger
from danswer.background.indexing.job_client import SimpleJobClient
from danswer.background.indexing.run_indexing import run_indexing_entrypoint
from danswer.background.indexing.run_indexing import RunIndexingCallbackInterface
from danswer.configs.app_configs import DISABLE_INDEX_UPDATE_ON_SWAP
from danswer.configs.constants import CELERY_INDEXING_LOCK_TIMEOUT
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
from danswer.configs.constants import DANSWER_REDIS_FUNCTION_LOCK_PREFIX
from danswer.configs.constants import DanswerCeleryPriority
from danswer.configs.constants import DanswerCeleryQueues
from danswer.configs.constants import DanswerCeleryTask
from danswer.configs.constants import DanswerRedisLocks
from danswer.configs.constants import DocumentSource
from danswer.db.connector import mark_ccpair_with_indexing_trigger
from danswer.db.connector_credential_pair import fetch_connector_credential_pairs
from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id
from danswer.db.engine import get_db_current_time
from danswer.db.engine import get_session_with_tenant
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.enums import IndexingMode
from danswer.db.enums import IndexingStatus
from danswer.db.enums import IndexModelStatus
from danswer.db.index_attempt import create_index_attempt
from danswer.db.index_attempt import delete_index_attempt
from danswer.db.index_attempt import get_all_index_attempts_by_status
from danswer.db.index_attempt import get_index_attempt
from danswer.db.index_attempt import get_last_attempt_for_cc_pair
from danswer.db.index_attempt import mark_attempt_canceled
from danswer.db.index_attempt import mark_attempt_failed
from danswer.db.models import ConnectorCredentialPair
from danswer.db.models import IndexAttempt
from danswer.db.models import SearchSettings
from danswer.db.search_settings import get_active_search_settings
from danswer.db.search_settings import get_current_search_settings
from danswer.db.search_settings import get_secondary_search_settings
from danswer.db.swap_index import check_index_swap
from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from danswer.natural_language_processing.search_nlp_models import EmbeddingModel
from danswer.natural_language_processing.search_nlp_models import warm_up_bi_encoder
from danswer.redis.redis_connector import RedisConnector
from danswer.redis.redis_connector_index import RedisConnectorIndexingFenceData
from danswer.redis.redis_connector_index import RedisConnectorIndex
from danswer.redis.redis_connector_index import RedisConnectorIndexPayload
from danswer.redis.redis_pool import get_redis_client
from danswer.utils.logger import setup_logger
from danswer.utils.variable_functionality import global_version
@@ -56,41 +65,108 @@ from shared_configs.configs import SENTRY_DSN
logger = setup_logger()
class RunIndexingCallback(RunIndexingCallbackInterface):
class IndexingCallback(IndexingHeartbeatInterface):
def __init__(
self,
stop_key: str,
generator_progress_key: str,
redis_lock: redis.lock.Lock,
redis_lock: RedisLock,
redis_client: Redis,
):
super().__init__()
self.redis_lock: redis.lock.Lock = redis_lock
self.redis_lock: RedisLock = redis_lock
self.stop_key: str = stop_key
self.generator_progress_key: str = generator_progress_key
self.redis_client = redis_client
self.started: datetime = datetime.now(timezone.utc)
self.redis_lock.reacquire()
self.last_tag: str = "IndexingCallback.__init__"
self.last_lock_reacquire: datetime = datetime.now(timezone.utc)
def should_stop(self) -> bool:
if self.redis_client.exists(self.stop_key):
return True
return False
def progress(self, amount: int) -> None:
self.redis_lock.reacquire()
def progress(self, tag: str, amount: int) -> None:
try:
self.redis_lock.reacquire()
self.last_tag = tag
self.last_lock_reacquire = datetime.now(timezone.utc)
except LockError:
logger.exception(
f"IndexingCallback - lock.reacquire exceptioned. "
f"lock_timeout={self.redis_lock.timeout} "
f"start={self.started} "
f"last_tag={self.last_tag} "
f"last_reacquired={self.last_lock_reacquire} "
f"now={datetime.now(timezone.utc)}"
)
raise
self.redis_client.incrby(self.generator_progress_key, amount)
def get_unfenced_index_attempt_ids(db_session: Session, r: redis.Redis) -> list[int]:
"""Gets a list of unfenced index attempts. Should not be possible, so we'd typically
want to clean them up.
Unfenced = attempt not in terminal state and fence does not exist.
"""
unfenced_attempts: list[int] = []
# inner/outer/inner double check pattern to avoid race conditions when checking for
# bad state
# inner = index_attempt in non terminal state
# outer = r.fence_key down
# check the db for index attempts in a non terminal state
attempts: list[IndexAttempt] = []
attempts.extend(
get_all_index_attempts_by_status(IndexingStatus.NOT_STARTED, db_session)
)
attempts.extend(
get_all_index_attempts_by_status(IndexingStatus.IN_PROGRESS, db_session)
)
for attempt in attempts:
fence_key = RedisConnectorIndex.fence_key_with_ids(
attempt.connector_credential_pair_id, attempt.search_settings_id
)
# if the fence is down / doesn't exist, possible error but not confirmed
if r.exists(fence_key):
continue
# Between the time the attempts are first looked up and the time we see the fence down,
# the attempt may have completed and taken down the fence normally.
# We need to double check that the index attempt is still in a non terminal state
# and matches the original state, which confirms we are really in a bad state.
attempt_2 = get_index_attempt(db_session, attempt.id)
if not attempt_2:
continue
if attempt.status != attempt_2.status:
continue
unfenced_attempts.append(attempt.id)
return unfenced_attempts
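The inner/outer/inner check above is a general pattern for any "row in Postgres, fence in Redis" pair. A stripped-down sketch of the same idea with generic placeholders (none of the names below come from the diff):
def find_orphans(non_terminal_rows, fence_exists, reread_row):
    orphans = []
    for row in non_terminal_rows:        # inner: row looked non-terminal
        if fence_exists(row.id):         # outer: fence is still up, nothing to do
            continue
        row_again = reread_row(row.id)   # inner again: re-check after seeing the fence down
        if row_again and row_again.status == row.status:
            orphans.append(row.id)       # confirmed: non-terminal row with no fence
    return orphans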
@shared_task(
name="check_for_indexing",
name=DanswerCeleryTask.CHECK_FOR_INDEXING,
soft_time_limit=300,
bind=True,
)
def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
tasks_created = 0
locked = False
r = get_redis_client(tenant_id=tenant_id)
lock_beat = r.lock(
lock_beat: RedisLock = r.lock(
DanswerRedisLocks.CHECK_INDEXING_BEAT_LOCK,
timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
)
@@ -100,6 +176,9 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
if not lock_beat.acquire(blocking=False):
return None
locked = True
# check for search settings swap
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
old_search_settings = check_index_swap(db_session=db_session)
current_search_settings = get_current_search_settings(db_session)
@@ -118,26 +197,24 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
embedding_model=embedding_model,
)
# gather cc_pair_ids
cc_pair_ids: list[int] = []
with get_session_with_tenant(tenant_id) as db_session:
lock_beat.reacquire()
cc_pairs = fetch_connector_credential_pairs(db_session)
for cc_pair_entry in cc_pairs:
cc_pair_ids.append(cc_pair_entry.id)
# kick off index attempts
for cc_pair_id in cc_pair_ids:
lock_beat.reacquire()
redis_connector = RedisConnector(tenant_id, cc_pair_id)
with get_session_with_tenant(tenant_id) as db_session:
# Get the primary search settings
primary_search_settings = get_current_search_settings(db_session)
search_settings = [primary_search_settings]
# Check for secondary search settings
secondary_search_settings = get_secondary_search_settings(db_session)
if secondary_search_settings is not None:
# If secondary settings exist, add them to the list
search_settings.append(secondary_search_settings)
for search_settings_instance in search_settings:
search_settings_list: list[SearchSettings] = get_active_search_settings(
db_session
)
for search_settings_instance in search_settings_list:
redis_connector_index = redis_connector.new_index(
search_settings_instance.id
)
@@ -153,33 +230,80 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
last_attempt = get_last_attempt_for_cc_pair(
cc_pair.id, search_settings_instance.id, db_session
)
search_settings_primary = False
if search_settings_instance.id == search_settings_list[0].id:
search_settings_primary = True
if not _should_index(
cc_pair=cc_pair,
last_index=last_attempt,
search_settings_instance=search_settings_instance,
secondary_index_building=len(search_settings) > 1,
search_settings_primary=search_settings_primary,
secondary_index_building=len(search_settings_list) > 1,
db_session=db_session,
):
continue
reindex = False
if search_settings_instance.id == search_settings_list[0].id:
# the indexing trigger is only checked and cleared with the primary search settings
if cc_pair.indexing_trigger is not None:
if cc_pair.indexing_trigger == IndexingMode.REINDEX:
reindex = True
task_logger.info(
f"Connector indexing manual trigger detected: "
f"cc_pair={cc_pair.id} "
f"search_settings={search_settings_instance.id} "
f"indexing_mode={cc_pair.indexing_trigger}"
)
mark_ccpair_with_indexing_trigger(
cc_pair.id, None, db_session
)
# using a task queue and only allowing one task per cc_pair/search_setting
# prevents us from starving out certain attempts
attempt_id = try_creating_indexing_task(
self.app,
cc_pair,
search_settings_instance,
False,
reindex,
db_session,
r,
tenant_id,
)
if attempt_id:
task_logger.info(
f"Indexing queued: index_attempt={attempt_id} "
f"Connector indexing queued: "
f"index_attempt={attempt_id} "
f"cc_pair={cc_pair.id} "
f"search_settings={search_settings_instance.id} "
f"search_settings={search_settings_instance.id}"
)
tasks_created += 1
# Fail any index attempts in the DB that don't have fences
# This shouldn't ever happen!
with get_session_with_tenant(tenant_id) as db_session:
unfenced_attempt_ids = get_unfenced_index_attempt_ids(db_session, r)
for attempt_id in unfenced_attempt_ids:
lock_beat.reacquire()
attempt = get_index_attempt(db_session, attempt_id)
if not attempt:
continue
failure_reason = (
f"Unfenced index attempt found in DB: "
f"index_attempt={attempt.id} "
f"cc_pair={attempt.connector_credential_pair_id} "
f"search_settings={attempt.search_settings_id}"
)
task_logger.error(failure_reason)
mark_attempt_failed(
attempt.id, db_session, failure_reason=failure_reason
)
except SoftTimeLimitExceeded:
task_logger.info(
"Soft time limit exceeded, task is being terminated gracefully."
@@ -187,8 +311,14 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
except Exception:
task_logger.exception(f"Unexpected exception: tenant={tenant_id}")
finally:
if lock_beat.owned():
lock_beat.release()
if locked:
if lock_beat.owned():
lock_beat.release()
else:
task_logger.error(
"check_for_indexing - Lock not owned on completion: "
f"tenant={tenant_id}"
)
return tasks_created
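check_for_indexing and the other beat tasks in this change share one lock discipline: acquire the beat lock without blocking, reacquire it inside long loops so the TTL does not lapse, and release it only if it is still owned. A minimal sketch of that lifecycle with the redis-py Lock API; LOCK_KEY and do_unit_of_work are illustrative placeholders, not names from this codebase.

from redis import Redis
from redis.lock import Lock as RedisLock

LOCK_KEY = "da_lock:example_beat"  # illustrative name
LOCK_TIMEOUT = 120  # seconds; should exceed the longest single unit of work


def do_unit_of_work(item: int) -> None:
    pass  # placeholder for the real per-item work


def beat_task(r: Redis, work_items: list[int]) -> None:
    lock: RedisLock = r.lock(LOCK_KEY, timeout=LOCK_TIMEOUT)
    # non-blocking acquire: if another run still holds the lock, skip this cycle
    if not lock.acquire(blocking=False):
        return
    try:
        for item in work_items:
            # reacquire resets the TTL so the lock cannot expire mid-loop
            lock.reacquire()
            do_unit_of_work(item)
    finally:
        # only release a lock we still own; it may have expired and been
        # taken over by another worker in the meantime
        if lock.owned():
            lock.release()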
@@ -197,6 +327,7 @@ def _should_index(
cc_pair: ConnectorCredentialPair,
last_index: IndexAttempt | None,
search_settings_instance: SearchSettings,
search_settings_primary: bool,
secondary_index_building: bool,
db_session: Session,
) -> bool:
@@ -261,6 +392,11 @@ def _should_index(
):
return False
if search_settings_primary:
if cc_pair.indexing_trigger is not None:
# if a manual indexing trigger is on the cc pair, honor it for primary search settings
return True
# if no attempt has ever occurred, we should index regardless of refresh_freq
if not last_index:
return True
@@ -293,10 +429,11 @@ def try_creating_indexing_task(
"""
LOCK_TIMEOUT = 30
index_attempt_id: int | None = None
# we need to serialize any attempt to trigger indexing since it can be triggered
# either via celery beat or manually (API call)
lock = r.lock(
lock: RedisLock = r.lock(
DANSWER_REDIS_FUNCTION_LOCK_PREFIX + "try_creating_indexing_task",
timeout=LOCK_TIMEOUT,
)
@@ -325,7 +462,7 @@ def try_creating_indexing_task(
redis_connector_index.generator_clear()
# set a basic fence to start
payload = RedisConnectorIndexingFenceData(
payload = RedisConnectorIndexPayload(
index_attempt_id=None,
started=None,
submitted=datetime.now(timezone.utc),
@@ -347,8 +484,10 @@ def try_creating_indexing_task(
custom_task_id = redis_connector_index.generate_generator_task_id()
# when the task is sent, we have yet to finish setting up the fence
# therefore, the task must contain code that blocks until the fence is ready
result = celery_app.send_task(
"connector_indexing_proxy_task",
DanswerCeleryTask.CONNECTOR_INDEXING_PROXY_TASK,
kwargs=dict(
index_attempt_id=index_attempt_id,
cc_pair_id=cc_pair.id,
@@ -366,15 +505,17 @@ def try_creating_indexing_task(
payload.index_attempt_id = index_attempt_id
payload.celery_task_id = result.id
redis_connector_index.set_fence(payload)
except Exception:
redis_connector_index.set_fence(payload)
task_logger.exception(
f"Unexpected exception: "
f"try_creating_indexing_task - Unexpected exception: "
f"tenant={tenant_id} "
f"cc_pair={cc_pair.id} "
f"search_settings={search_settings.id}"
)
if index_attempt_id is not None:
delete_index_attempt(db_session, index_attempt_id)
redis_connector_index.set_fence(None)
return None
finally:
if lock.owned():
@@ -383,8 +524,14 @@ def try_creating_indexing_task(
return index_attempt_id
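try_creating_indexing_task and the spawned indexing task coordinate through a fence: the dispatcher writes a placeholder payload before sending the task and completes it once the attempt and celery task ids are known, while the spawned side blocks until the payload is ready. A rough standalone sketch of that handshake follows; the key name and payload shape are illustrative, not the real RedisConnectorIndex schema.

import json
import time
from typing import Callable

from redis import Redis

FENCE_KEY = "example_fence"  # illustrative; not the real fence key format


def dispatch(r: Redis, send_task: Callable[[], str]) -> None:
    # the fence goes up first with an incomplete payload ...
    r.set(FENCE_KEY, json.dumps({"task_id": None}))
    task_id = send_task()  # enqueue the real work
    # ... and is completed once the task id is known
    r.set(FENCE_KEY, json.dumps({"task_id": task_id}))


def wait_for_fence(r: Redis, timeout: float = 60.0) -> str:
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        raw = r.get(FENCE_KEY)
        if raw is None:
            # mirrors the "fence not found" check in the spawned task
            raise ValueError("fence not found")
        payload = json.loads(raw)
        if payload["task_id"] is not None:
            return payload["task_id"]
        time.sleep(1)
    raise TimeoutError("fence never became ready")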
@shared_task(name="connector_indexing_proxy_task", acks_late=False, track_started=True)
@shared_task(
name=DanswerCeleryTask.CONNECTOR_INDEXING_PROXY_TASK,
bind=True,
acks_late=False,
track_started=True,
)
def connector_indexing_proxy_task(
self: Task,
index_attempt_id: int,
cc_pair_id: int,
search_settings_id: int,
@@ -392,15 +539,19 @@ def connector_indexing_proxy_task(
) -> None:
"""celery tasks are forked, but forking is unstable. This proxies work to a spawned task."""
task_logger.info(
f"Indexing proxy - starting: attempt={index_attempt_id} "
f"Indexing watchdog - starting: attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}"
)
if not self.request.id:
task_logger.error("self.request.id is None!")
client = SimpleJobClient()
job = client.submit(
connector_indexing_task,
connector_indexing_task_wrapper,
index_attempt_id,
cc_pair_id,
search_settings_id,
@@ -411,7 +562,7 @@ def connector_indexing_proxy_task(
if not job:
task_logger.info(
f"Indexing proxy - spawn failed: attempt={index_attempt_id} "
f"Indexing watchdog - spawn failed: attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}"
@@ -419,31 +570,78 @@ def connector_indexing_proxy_task(
return
task_logger.info(
f"Indexing proxy - spawn succeeded: attempt={index_attempt_id} "
f"Indexing watchdog - spawn succeeded: attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}"
)
while True:
sleep(10)
redis_connector = RedisConnector(tenant_id, cc_pair_id)
redis_connector_index = redis_connector.new_index(search_settings_id)
# do nothing for ongoing jobs that haven't been stopped
if not job.done():
with get_session_with_tenant(tenant_id) as db_session:
index_attempt = get_index_attempt(
db_session=db_session, index_attempt_id=index_attempt_id
while True:
sleep(5)
if self.request.id and redis_connector_index.terminating(self.request.id):
task_logger.warning(
"Indexing watchdog - termination signal detected: "
f"attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}"
)
try:
with get_session_with_tenant(tenant_id) as db_session:
mark_attempt_canceled(
index_attempt_id,
db_session,
"Connector termination signal detected",
)
except Exception:
# if the DB exceptions, we'll just get an unfriendly failure message
# in the UI instead of the cancellation message
logger.exception(
"Indexing watchdog - transient exception marking index attempt as canceled: "
f"attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}"
)
if not index_attempt:
continue
job.cancel()
if not index_attempt.is_finished():
continue
break
if not job.done():
# if the spawned task is still running, restart the check once again
# if the index attempt is not in a finished status
try:
with get_session_with_tenant(tenant_id) as db_session:
index_attempt = get_index_attempt(
db_session=db_session, index_attempt_id=index_attempt_id
)
if not index_attempt:
continue
if not index_attempt.is_finished():
continue
except Exception:
# if the DB exceptioned, just restart the check.
# polling the index attempt status doesn't need to be strongly consistent
logger.exception(
"Indexing watchdog - transient exception looking up index attempt: "
f"attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}"
)
continue
if job.status == "error":
task_logger.error(
f"Indexing proxy - spawned task exceptioned: "
"Indexing watchdog - spawned task exceptioned: "
f"attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} "
@@ -455,7 +653,7 @@ def connector_indexing_proxy_task(
break
task_logger.info(
f"Indexing proxy - finished: attempt={index_attempt_id} "
f"Indexing watchdog - finished: attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}"
@@ -463,6 +661,38 @@ def connector_indexing_proxy_task(
return
def connector_indexing_task_wrapper(
index_attempt_id: int,
cc_pair_id: int,
search_settings_id: int,
tenant_id: str | None,
is_ee: bool,
) -> int | None:
"""Just wraps connector_indexing_task so we can log any exceptions before
re-raising it."""
result: int | None = None
try:
result = connector_indexing_task(
index_attempt_id,
cc_pair_id,
search_settings_id,
tenant_id,
is_ee,
)
except:
logger.exception(
f"connector_indexing_task exceptioned: "
f"tenant={tenant_id} "
f"index_attempt={index_attempt_id} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}"
)
raise
return result
def connector_indexing_task(
index_attempt_id: int,
cc_pair_id: int,
@@ -499,7 +729,8 @@ def connector_indexing_task(
logger.debug("Sentry DSN not provided, skipping Sentry initialization")
logger.info(
f"Indexing spawned task starting: attempt={index_attempt_id} "
f"Indexing spawned task starting: "
f"attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}"
@@ -516,6 +747,7 @@ def connector_indexing_task(
if redis_connector.delete.fenced:
raise RuntimeError(
f"Indexing will not start because connector deletion is in progress: "
f"attempt={index_attempt_id} "
f"cc_pair={cc_pair_id} "
f"fence={redis_connector.delete.fence_key}"
)
@@ -523,18 +755,18 @@ def connector_indexing_task(
if redis_connector.stop.fenced:
raise RuntimeError(
f"Indexing will not start because a connector stop signal was detected: "
f"attempt={index_attempt_id} "
f"cc_pair={cc_pair_id} "
f"fence={redis_connector.stop.fence_key}"
)
while True:
# wait for the fence to come up
if not redis_connector_index.fenced:
if not redis_connector_index.fenced: # The fence must exist
raise ValueError(
f"connector_indexing_task - fence not found: fence={redis_connector_index.fence_key}"
)
payload = redis_connector_index.payload
payload = redis_connector_index.payload # The payload must exist
if not payload:
raise ValueError("connector_indexing_task: payload invalid or not found")
@@ -557,16 +789,19 @@ def connector_indexing_task(
)
break
lock = r.lock(
# set thread_local=False since we don't control which thread the indexing/pruning
# code will invoke our callback from
lock: RedisLock = r.lock(
redis_connector_index.generator_lock_key,
timeout=CELERY_INDEXING_LOCK_TIMEOUT,
thread_local=False,
)
acquired = lock.acquire(blocking=False)
if not acquired:
logger.warning(
f"Indexing task already running, exiting...: "
f"cc_pair={cc_pair_id} search_settings={search_settings_id}"
f"index_attempt={index_attempt_id} cc_pair={cc_pair_id} search_settings={search_settings_id}"
)
return None
@@ -601,7 +836,7 @@ def connector_indexing_task(
)
# define a callback class
callback = RunIndexingCallback(
callback = IndexingCallback(
redis_connector.stop.fence_key,
redis_connector_index.generator_progress_key,
lock,

View File

@@ -13,12 +13,13 @@ from sqlalchemy.orm import Session
from danswer.background.celery.apps.app_base import task_logger
from danswer.configs.app_configs import JOB_TIMEOUT
from danswer.configs.constants import DanswerCeleryTask
from danswer.configs.constants import PostgresAdvisoryLocks
from danswer.db.engine import get_session_with_tenant
@shared_task(
name="kombu_message_cleanup_task",
name=DanswerCeleryTask.KOMBU_MESSAGE_CLEANUP_TASK,
soft_time_limit=JOB_TIMEOUT,
bind=True,
base=AbortableTask,

View File

@@ -8,11 +8,12 @@ from celery import shared_task
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from redis import Redis
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.background.celery.apps.app_base import task_logger
from danswer.background.celery.celery_utils import extract_ids_from_runnable_connector
from danswer.background.celery.tasks.indexing.tasks import RunIndexingCallback
from danswer.background.celery.tasks.indexing.tasks import IndexingCallback
from danswer.configs.app_configs import ALLOW_SIMULTANEOUS_PRUNING
from danswer.configs.app_configs import JOB_TIMEOUT
from danswer.configs.constants import CELERY_PRUNING_LOCK_TIMEOUT
@@ -20,6 +21,7 @@ from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
from danswer.configs.constants import DANSWER_REDIS_FUNCTION_LOCK_PREFIX
from danswer.configs.constants import DanswerCeleryPriority
from danswer.configs.constants import DanswerCeleryQueues
from danswer.configs.constants import DanswerCeleryTask
from danswer.configs.constants import DanswerRedisLocks
from danswer.connectors.factory import instantiate_connector
from danswer.connectors.models import InputType
@@ -38,8 +40,44 @@ from danswer.utils.logger import setup_logger
logger = setup_logger()
def _is_pruning_due(cc_pair: ConnectorCredentialPair) -> bool:
"""Returns boolean indicating if pruning is due.
Next pruning time is calculated as a delta from the last successful prune, or the
last successful indexing if pruning has never succeeded.
TODO(rkuo): consider whether we should allow pruning to be immediately rescheduled
if pruning fails (which is what it does now). A backoff could be reasonable.
"""
# skip pruning if no prune frequency is set
# pruning can still be forced via the API which will run a pruning task directly
if not cc_pair.connector.prune_freq:
return False
# skip pruning if not active
if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE:
return False
# skip pruning if the next scheduled prune time hasn't been reached yet
last_pruned = cc_pair.last_pruned
if not last_pruned:
if not cc_pair.last_successful_index_time:
# if we've never indexed, we can't prune
return False
# if never pruned, use the last time the connector indexed successfully
last_pruned = cc_pair.last_successful_index_time
next_prune = last_pruned + timedelta(seconds=cc_pair.connector.prune_freq)
if datetime.now(timezone.utc) < next_prune:
return False
return True
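The schedule check comes down to one timedelta comparison: the next prune time is the last successful prune (or, failing that, the last successful index) plus prune_freq seconds. A tiny standalone sketch with made-up values:

from datetime import datetime, timedelta, timezone

prune_freq = 7 * 24 * 3600  # connector.prune_freq in seconds (a weekly cadence, made up)
last_pruned = datetime(2024, 12, 1, tzinfo=timezone.utc)  # or last_successful_index_time

next_prune = last_pruned + timedelta(seconds=prune_freq)
is_due = datetime.now(timezone.utc) >= next_prune
print(f"next_prune={next_prune.isoformat()} due={is_due}")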
@shared_task(
name="check_for_pruning",
name=DanswerCeleryTask.CHECK_FOR_PRUNING,
soft_time_limit=JOB_TIMEOUT,
bind=True,
)
@@ -69,7 +107,7 @@ def check_for_pruning(self: Task, *, tenant_id: str | None) -> None:
if not cc_pair:
continue
if not is_pruning_due(cc_pair, db_session, r):
if not _is_pruning_due(cc_pair):
continue
tasks_created = try_creating_prune_generator_task(
@@ -90,47 +128,6 @@ def check_for_pruning(self: Task, *, tenant_id: str | None) -> None:
lock_beat.release()
def is_pruning_due(
cc_pair: ConnectorCredentialPair,
db_session: Session,
r: Redis,
) -> bool:
"""Returns an int if pruning is triggered.
The int represents the number of prune tasks generated (in this case, only one
because the task is a long running generator task.)
Returns None if no pruning is triggered (due to not being needed or
other reasons such as simultaneous pruning restrictions.
Checks for scheduling related conditions, then delegates the rest of the checks to
try_creating_prune_generator_task.
"""
# skip pruning if no prune frequency is set
# pruning can still be forced via the API which will run a pruning task directly
if not cc_pair.connector.prune_freq:
return False
# skip pruning if not active
if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE:
return False
# skip pruning if the next scheduled prune time hasn't been reached yet
last_pruned = cc_pair.last_pruned
if not last_pruned:
if not cc_pair.last_successful_index_time:
# if we've never indexed, we can't prune
return False
# if never pruned, use the last time the connector indexed successfully
last_pruned = cc_pair.last_successful_index_time
next_prune = last_pruned + timedelta(seconds=cc_pair.connector.prune_freq)
if datetime.now(timezone.utc) < next_prune:
return False
return True
def try_creating_prune_generator_task(
celery_app: Celery,
cc_pair: ConnectorCredentialPair,
@@ -166,10 +163,16 @@ def try_creating_prune_generator_task(
return None
try:
if redis_connector.prune.fenced: # skip pruning if already pruning
# skip pruning if already pruning
if redis_connector.prune.fenced:
return None
if redis_connector.delete.fenced: # skip pruning if the cc_pair is deleting
# skip pruning if the cc_pair is deleting
if redis_connector.delete.fenced:
return None
# skip pruning if doc permissions sync is running
if redis_connector.permissions.fenced:
return None
db_session.refresh(cc_pair)
@@ -183,7 +186,7 @@ def try_creating_prune_generator_task(
custom_task_id = f"{redis_connector.prune.generator_task_key}_{uuid4()}"
celery_app.send_task(
"connector_pruning_generator_task",
DanswerCeleryTask.CONNECTOR_PRUNING_GENERATOR_TASK,
kwargs=dict(
cc_pair_id=cc_pair.id,
connector_id=cc_pair.connector_id,
@@ -208,7 +211,7 @@ def try_creating_prune_generator_task(
@shared_task(
name="connector_pruning_generator_task",
name=DanswerCeleryTask.CONNECTOR_PRUNING_GENERATOR_TASK,
acks_late=False,
soft_time_limit=JOB_TIMEOUT,
track_started=True,
@@ -231,13 +234,18 @@ def connector_pruning_generator_task(
pruning_ctx_dict["request_id"] = self.request.id
pruning_ctx.set(pruning_ctx_dict)
task_logger.info(f"Pruning generator starting: cc_pair={cc_pair_id}")
redis_connector = RedisConnector(tenant_id, cc_pair_id)
r = get_redis_client(tenant_id=tenant_id)
lock = r.lock(
# set thread_local=False since we don't control which thread the indexing/pruning
# code will invoke our callback from
lock: RedisLock = r.lock(
DanswerRedisLocks.PRUNING_LOCK_PREFIX + f"_{redis_connector.id}",
timeout=CELERY_PRUNING_LOCK_TIMEOUT,
thread_local=False,
)
acquired = lock.acquire(blocking=False)
@@ -261,6 +269,11 @@ def connector_pruning_generator_task(
)
return
task_logger.info(
f"Pruning generator running connector: "
f"cc_pair={cc_pair_id} "
f"connector_source={cc_pair.connector.source}"
)
runnable_connector = instantiate_connector(
db_session,
cc_pair.connector.source,
@@ -269,12 +282,13 @@ def connector_pruning_generator_task(
cc_pair.credential,
)
callback = RunIndexingCallback(
callback = IndexingCallback(
redis_connector.stop.fence_key,
redis_connector.prune.generator_progress_key,
lock,
r,
)
# a list of docs in the source
all_connector_doc_ids: set[str] = extract_ids_from_runnable_connector(
runnable_connector, callback
@@ -296,8 +310,8 @@ def connector_pruning_generator_task(
task_logger.info(
f"Pruning set collected: "
f"cc_pair={cc_pair_id} "
f"docs_to_remove={len(doc_ids_to_remove)} "
f"doc_source={cc_pair.connector.source}"
f"connector_source={cc_pair.connector.source} "
f"docs_to_remove={len(doc_ids_to_remove)}"
)
task_logger.info(
@@ -320,10 +334,10 @@ def connector_pruning_generator_task(
f"Failed to run pruning: cc_pair={cc_pair_id} connector={connector_id}"
)
redis_connector.prune.generator_clear()
redis_connector.prune.taskset_clear()
redis_connector.prune.set_fence(False)
redis_connector.prune.reset()
raise e
finally:
if lock.owned():
lock.release()
task_logger.info(f"Pruning generator finished: cc_pair={cc_pair_id}")

View File

@@ -9,6 +9,7 @@ from tenacity import RetryError
from danswer.access.access import get_access_for_document
from danswer.background.celery.apps.app_base import task_logger
from danswer.background.celery.tasks.shared.RetryDocumentIndex import RetryDocumentIndex
from danswer.configs.constants import DanswerCeleryTask
from danswer.db.document import delete_document_by_connector_credential_pair__no_commit
from danswer.db.document import delete_documents_complete__no_commit
from danswer.db.document import get_document
@@ -31,7 +32,7 @@ LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15
@shared_task(
name="document_by_cc_pair_cleanup_task",
name=DanswerCeleryTask.DOCUMENT_BY_CC_PAIR_CLEANUP_TASK,
soft_time_limit=LIGHT_SOFT_TIME_LIMIT,
time_limit=LIGHT_TIME_LIMIT,
max_retries=DOCUMENT_BY_CC_PAIR_CLEANUP_MAX_RETRIES,
@@ -59,7 +60,7 @@ def document_by_cc_pair_cleanup_task(
connector / credential pair from the access list
(6) delete all relevant entries from postgres
"""
task_logger.info(f"tenant={tenant_id} doc={document_id}")
task_logger.debug(f"Task start: tenant={tenant_id} doc={document_id}")
try:
with get_session_with_tenant(tenant_id) as db_session:
@@ -141,7 +142,9 @@ def document_by_cc_pair_cleanup_task(
return False
except Exception as ex:
if isinstance(ex, RetryError):
task_logger.info(f"Retry failed: {ex.last_attempt.attempt_number}")
task_logger.warning(
f"Tenacity retry failed: num_attempts={ex.last_attempt.attempt_number}"
)
# only set the inner exception if it is of type Exception
e_temp = ex.last_attempt.exception()
@@ -171,11 +174,21 @@ def document_by_cc_pair_cleanup_task(
else:
# This is the last attempt! mark the document as dirty in the db so that it
# eventually gets fixed out of band via stale document reconciliation
task_logger.info(
f"Max retries reached. Marking doc as dirty for reconciliation: "
task_logger.warning(
f"Max celery task retries reached. Marking doc as dirty for reconciliation: "
f"tenant={tenant_id} doc={document_id}"
)
with get_session_with_tenant(tenant_id):
with get_session_with_tenant(tenant_id) as db_session:
# delete the cc pair relationship now and let reconciliation clean it up
# in vespa
delete_document_by_connector_credential_pair__no_commit(
db_session=db_session,
document_id=document_id,
connector_credential_pair_identifier=ConnectorCredentialPairIdentifier(
connector_id=connector_id,
credential_id=credential_id,
),
)
mark_document_as_modified(document_id, db_session)
return False

View File

@@ -5,7 +5,6 @@ from http import HTTPStatus
from typing import cast
import httpx
import redis
from celery import Celery
from celery import shared_task
from celery import Task
@@ -13,6 +12,7 @@ from celery.exceptions import SoftTimeLimitExceeded
from celery.result import AsyncResult
from celery.states import READY_STATES
from redis import Redis
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from tenacity import RetryError
@@ -25,8 +25,10 @@ from danswer.background.celery.tasks.shared.tasks import LIGHT_TIME_LIMIT
from danswer.configs.app_configs import JOB_TIMEOUT
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
from danswer.configs.constants import DanswerCeleryQueues
from danswer.configs.constants import DanswerCeleryTask
from danswer.configs.constants import DanswerRedisLocks
from danswer.db.connector import fetch_connector_by_id
from danswer.db.connector import mark_cc_pair_as_permissions_synced
from danswer.db.connector import mark_ccpair_as_pruned
from danswer.db.connector_credential_pair import add_deletion_failure_message
from danswer.db.connector_credential_pair import (
@@ -47,17 +49,19 @@ from danswer.db.document_set import mark_document_set_as_synced
from danswer.db.engine import get_session_with_tenant
from danswer.db.enums import IndexingStatus
from danswer.db.index_attempt import delete_index_attempts
from danswer.db.index_attempt import get_all_index_attempts_by_status
from danswer.db.index_attempt import get_index_attempt
from danswer.db.index_attempt import mark_attempt_failed
from danswer.db.models import DocumentSet
from danswer.db.models import IndexAttempt
from danswer.document_index.document_index_utils import get_both_index_names
from danswer.document_index.factory import get_default_document_index
from danswer.document_index.interfaces import VespaDocumentFields
from danswer.redis.redis_connector import RedisConnector
from danswer.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
from danswer.redis.redis_connector_delete import RedisConnectorDelete
from danswer.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync
from danswer.redis.redis_connector_doc_perm_sync import (
RedisConnectorPermissionSyncPayload,
)
from danswer.redis.redis_connector_index import RedisConnectorIndex
from danswer.redis.redis_connector_prune import RedisConnectorPrune
from danswer.redis.redis_document_set import RedisDocumentSet
@@ -77,7 +81,7 @@ logger = setup_logger()
# celery auto associates tasks created inside another task,
# which bloats the result metadata considerably. trail=False prevents this.
@shared_task(
name="check_for_vespa_sync_task",
name=DanswerCeleryTask.CHECK_FOR_VESPA_SYNC_TASK,
soft_time_limit=JOB_TIMEOUT,
trail=False,
bind=True,
@@ -162,7 +166,7 @@ def try_generate_stale_document_sync_tasks(
celery_app: Celery,
db_session: Session,
r: Redis,
lock_beat: redis.lock.Lock,
lock_beat: RedisLock,
tenant_id: str | None,
) -> int | None:
# the fence is up, do nothing
@@ -180,7 +184,12 @@ def try_generate_stale_document_sync_tasks(
f"Stale documents found (at least {stale_doc_count}). Generating sync tasks by cc pair."
)
task_logger.info("RedisConnector.generate_tasks starting by cc_pair.")
task_logger.info(
"RedisConnector.generate_tasks starting by cc_pair. "
"Documents spanning multiple cc_pairs will only be synced once."
)
docs_to_skip: set[str] = set()
# rkuo: we could technically sync all stale docs in one big pass.
# but I feel it's more understandable to group the docs by cc_pair
@@ -188,22 +197,21 @@ def try_generate_stale_document_sync_tasks(
cc_pairs = get_connector_credential_pairs(db_session)
for cc_pair in cc_pairs:
rc = RedisConnectorCredentialPair(tenant_id, cc_pair.id)
tasks_generated = rc.generate_tasks(
celery_app, db_session, r, lock_beat, tenant_id
)
rc.set_skip_docs(docs_to_skip)
result = rc.generate_tasks(celery_app, db_session, r, lock_beat, tenant_id)
if tasks_generated is None:
if result is None:
continue
if tasks_generated == 0:
if result[1] == 0:
continue
task_logger.info(
f"RedisConnector.generate_tasks finished for single cc_pair. "
f"cc_pair_id={cc_pair.id} tasks_generated={tasks_generated}"
f"cc_pair={cc_pair.id} tasks_generated={result[0]} tasks_possible={result[1]}"
)
total_tasks_generated += tasks_generated
total_tasks_generated += result[0]
task_logger.info(
f"RedisConnector.generate_tasks finished for all cc_pairs. total_tasks_generated={total_tasks_generated}"
@@ -218,7 +226,7 @@ def try_generate_document_set_sync_tasks(
document_set_id: int,
db_session: Session,
r: Redis,
lock_beat: redis.lock.Lock,
lock_beat: RedisLock,
tenant_id: str | None,
) -> int | None:
lock_beat.reacquire()
@@ -246,12 +254,11 @@ def try_generate_document_set_sync_tasks(
)
# Add all documents that need to be updated into the queue
tasks_generated = rds.generate_tasks(
celery_app, db_session, r, lock_beat, tenant_id
)
if tasks_generated is None:
result = rds.generate_tasks(celery_app, db_session, r, lock_beat, tenant_id)
if result is None:
return None
tasks_generated = result[0]
# Currently we are allowing the sync to proceed with 0 tasks.
# It's possible for sets/groups to be generated initially with no entries
# and they still need to be marked as up to date.
@@ -260,7 +267,7 @@ def try_generate_document_set_sync_tasks(
task_logger.info(
f"RedisDocumentSet.generate_tasks finished. "
f"document_set_id={document_set.id} tasks_generated={tasks_generated}"
f"document_set={document_set.id} tasks_generated={tasks_generated}"
)
# set this only after all tasks have been added
@@ -273,7 +280,7 @@ def try_generate_user_group_sync_tasks(
usergroup_id: int,
db_session: Session,
r: Redis,
lock_beat: redis.lock.Lock,
lock_beat: RedisLock,
tenant_id: str | None,
) -> int | None:
lock_beat.reacquire()
@@ -302,12 +309,11 @@ def try_generate_user_group_sync_tasks(
task_logger.info(
f"RedisUserGroup.generate_tasks starting. usergroup_id={usergroup.id}"
)
tasks_generated = rug.generate_tasks(
celery_app, db_session, r, lock_beat, tenant_id
)
if tasks_generated is None:
result = rug.generate_tasks(celery_app, db_session, r, lock_beat, tenant_id)
if result is None:
return None
tasks_generated = result[0]
# Currently we are allowing the sync to proceed with 0 tasks.
# It's possible for sets/groups to be generated initially with no entries
# and they still need to be marked as up to date.
@@ -316,7 +322,7 @@ def try_generate_user_group_sync_tasks(
task_logger.info(
f"RedisUserGroup.generate_tasks finished. "
f"usergroup_id={usergroup.id} tasks_generated={tasks_generated}"
f"usergroup={usergroup.id} tasks_generated={tasks_generated}"
)
# set this only after all tasks have been added
@@ -436,11 +442,22 @@ def monitor_connector_deletion_taskset(
db_session, cc_pair.connector_id, cc_pair.credential_id
)
if len(doc_ids) > 0:
# if this happens, documents somehow got added while deletion was in progress. Likely a bug
# gating off pruning and indexing work before deletion starts
# NOTE(rkuo): if this happens, documents somehow got added while
# deletion was in progress. Likely a bug gating off pruning and indexing
# work before deletion starts.
task_logger.warning(
f"Connector deletion - documents still found after taskset completion: "
f"cc_pair={cc_pair_id} num={len(doc_ids)}"
"Connector deletion - documents still found after taskset completion. "
"Clearing the current deletion attempt and allowing deletion to restart: "
f"cc_pair={cc_pair_id} "
f"docs_deleted={fence_data.num_tasks} "
f"docs_remaining={len(doc_ids)}"
)
# We don't want to wave off how we got into this state, but resetting
# our attempt and letting the deletion restart is a good way to recover
redis_connector.delete.reset()
raise RuntimeError(
"Connector deletion - documents still found after taskset completion"
)
# clean up the rest of the related Postgres entities
@@ -504,8 +521,7 @@ def monitor_connector_deletion_taskset(
f"docs_deleted={fence_data.num_tasks}"
)
redis_connector.delete.taskset_clear()
redis_connector.delete.set_fence(None)
redis_connector.delete.reset()
def monitor_ccpair_pruning_taskset(
@@ -546,6 +562,45 @@ def monitor_ccpair_pruning_taskset(
redis_connector.prune.set_fence(False)
def monitor_ccpair_permissions_taskset(
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
) -> None:
fence_key = key_bytes.decode("utf-8")
cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key)
if cc_pair_id_str is None:
task_logger.warning(
f"monitor_ccpair_permissions_taskset: could not parse cc_pair_id from {fence_key}"
)
return
cc_pair_id = int(cc_pair_id_str)
redis_connector = RedisConnector(tenant_id, cc_pair_id)
if not redis_connector.permissions.fenced:
return
initial = redis_connector.permissions.generator_complete
if initial is None:
return
remaining = redis_connector.permissions.get_remaining()
task_logger.info(
f"Permissions sync progress: cc_pair={cc_pair_id} remaining={remaining} initial={initial}"
)
if remaining > 0:
return
payload: RedisConnectorPermissionSyncPayload | None = (
redis_connector.permissions.payload
)
start_time: datetime | None = payload.started if payload else None
mark_cc_pair_as_permissions_synced(db_session, int(cc_pair_id), start_time)
task_logger.info(f"Successfully synced permissions for cc_pair={cc_pair_id}")
redis_connector.permissions.reset()
def monitor_ccpair_indexing_taskset(
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
) -> None:
@@ -580,8 +635,8 @@ def monitor_ccpair_indexing_taskset(
progress = redis_connector_index.get_progress()
if progress is not None:
task_logger.info(
f"Connector indexing progress: cc_pair_id={cc_pair_id} "
f"search_settings_id={search_settings_id} "
f"Connector indexing progress: cc_pair={cc_pair_id} "
f"search_settings={search_settings_id} "
f"progress={progress} "
f"elapsed_submitted={elapsed_submitted.total_seconds():.2f}"
)
@@ -590,39 +645,62 @@ def monitor_ccpair_indexing_taskset(
# the task is still setting up
return
# Read result state BEFORE generator_complete_key to avoid a race condition
# never use any blocking methods on the result from inside a task!
result: AsyncResult = AsyncResult(payload.celery_task_id)
result_state = result.state
# inner/outer/inner double-check pattern to avoid race conditions when checking for
# bad state
# inner = get_completion / generator_complete not signaled
# outer = result.state is in READY_STATES
status_int = redis_connector_index.get_completion()
if status_int is None:
if result_state in READY_STATES:
# IF the task state is READY, THEN generator_complete should be set
# if it isn't, then the worker crashed
task_logger.info(
f"Connector indexing aborted: "
f"cc_pair_id={cc_pair_id} "
f"search_settings_id={search_settings_id} "
f"elapsed_submitted={elapsed_submitted.total_seconds():.2f}"
)
if status_int is None: # inner signal not set ... possible error
task_state = result.state
if (
task_state in READY_STATES
): # outer signal in terminal state ... possible error
# Now double check!
if redis_connector_index.get_completion() is None:
# inner signal still not set (and cannot change when outer result_state is READY)
# Task is finished but generator complete isn't set.
# We have a problem! Worker may have crashed.
task_result = str(result.result)
task_traceback = str(result.traceback)
index_attempt = get_index_attempt(db_session, payload.index_attempt_id)
if index_attempt:
mark_attempt_failed(
index_attempt_id=payload.index_attempt_id,
db_session=db_session,
failure_reason="Connector indexing aborted or exceptioned.",
msg = (
f"Connector indexing aborted or exceptioned: "
f"attempt={payload.index_attempt_id} "
f"celery_task={payload.celery_task_id} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id} "
f"elapsed_submitted={elapsed_submitted.total_seconds():.2f} "
f"result.state={task_state} "
f"result.result={task_result} "
f"result.traceback={task_traceback}"
)
task_logger.warning(msg)
redis_connector_index.reset()
index_attempt = get_index_attempt(db_session, payload.index_attempt_id)
if index_attempt:
if (
index_attempt.status != IndexingStatus.CANCELED
and index_attempt.status != IndexingStatus.FAILED
):
mark_attempt_failed(
index_attempt_id=payload.index_attempt_id,
db_session=db_session,
failure_reason=msg,
)
redis_connector_index.reset()
return
status_enum = HTTPStatus(status_int)
task_logger.info(
f"Connector indexing finished: cc_pair_id={cc_pair_id} "
f"search_settings_id={search_settings_id} "
f"Connector indexing finished: cc_pair={cc_pair_id} "
f"search_settings={search_settings_id} "
f"progress={progress} "
f"status={status_enum.name} "
f"elapsed_submitted={elapsed_submitted.total_seconds():.2f}"
)
@@ -630,7 +708,7 @@ def monitor_ccpair_indexing_taskset(
redis_connector_index.reset()
@shared_task(name="monitor_vespa_sync", soft_time_limit=300, bind=True)
@shared_task(name=DanswerCeleryTask.MONITOR_VESPA_SYNC, soft_time_limit=300, bind=True)
def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool:
"""This is a celery beat task that monitors and finalizes metadata sync tasksets.
It scans for fence values and then gets the counts of any associated tasksets.
@@ -643,7 +721,7 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool:
"""
r = get_redis_client(tenant_id=tenant_id)
lock_beat: redis.lock.Lock = r.lock(
lock_beat: RedisLock = r.lock(
DanswerRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK,
timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
)
@@ -655,7 +733,7 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool:
# print current queue lengths
r_celery = self.app.broker_connection().channel().client # type: ignore
n_celery = celery_get_queue_length("celery", r)
n_celery = celery_get_queue_length("celery", r_celery)
n_indexing = celery_get_queue_length(
DanswerCeleryQueues.CONNECTOR_INDEXING, r_celery
)
@@ -668,41 +746,19 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool:
n_pruning = celery_get_queue_length(
DanswerCeleryQueues.CONNECTOR_PRUNING, r_celery
)
n_permissions_sync = celery_get_queue_length(
DanswerCeleryQueues.CONNECTOR_DOC_PERMISSIONS_SYNC, r_celery
)
task_logger.info(
f"Queue lengths: celery={n_celery} "
f"indexing={n_indexing} "
f"sync={n_sync} "
f"deletion={n_deletion} "
f"pruning={n_pruning}"
f"pruning={n_pruning} "
f"permissions_sync={n_permissions_sync} "
)
# do some cleanup before clearing fences
# check the db for any outstanding index attempts
with get_session_with_tenant(tenant_id) as db_session:
attempts: list[IndexAttempt] = []
attempts.extend(
get_all_index_attempts_by_status(IndexingStatus.NOT_STARTED, db_session)
)
attempts.extend(
get_all_index_attempts_by_status(IndexingStatus.IN_PROGRESS, db_session)
)
for a in attempts:
# if attempts exist in the db but we don't detect them in redis, mark them as failed
fence_key = RedisConnectorIndex.fence_key_with_ids(
a.connector_credential_pair_id, a.search_settings_id
)
if not r.exists(fence_key):
failure_reason = (
f"Unknown index attempt. Might be left over from a process restart: "
f"index_attempt={a.id} "
f"cc_pair={a.connector_credential_pair_id} "
f"search_settings={a.search_settings_id}"
)
task_logger.warning(failure_reason)
mark_attempt_failed(a.id, db_session, failure_reason=failure_reason)
lock_beat.reacquire()
if r.exists(RedisConnectorCredentialPair.get_fence_key()):
monitor_connector_taskset(r)
@@ -741,6 +797,12 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool:
with get_session_with_tenant(tenant_id) as db_session:
monitor_ccpair_indexing_taskset(tenant_id, key_bytes, r, db_session)
lock_beat.reacquire()
for key_bytes in r.scan_iter(RedisConnectorPermissionSync.FENCE_PREFIX + "*"):
lock_beat.reacquire()
with get_session_with_tenant(tenant_id) as db_session:
monitor_ccpair_permissions_taskset(tenant_id, key_bytes, r, db_session)
# uncomment for debugging if needed
# r_celery = celery_app.broker_connection().channel().client
# length = celery_get_queue_length(DanswerCeleryQueues.VESPA_METADATA_SYNC, r_celery)
@@ -757,7 +819,7 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool:
@shared_task(
name="vespa_metadata_sync_task",
name=DanswerCeleryTask.VESPA_METADATA_SYNC_TASK,
bind=True,
soft_time_limit=LIGHT_SOFT_TIME_LIMIT,
time_limit=LIGHT_TIME_LIMIT,
@@ -811,7 +873,9 @@ def vespa_metadata_sync_task(
)
except Exception as ex:
if isinstance(ex, RetryError):
task_logger.warning(f"Retry failed: {ex.last_attempt.attempt_number}")
task_logger.warning(
f"Tenacity retry failed: num_attempts={ex.last_attempt.attempt_number}"
)
# only set the inner exception if it is of type Exception
e_temp = ex.last_attempt.exception()

View File

@@ -1,6 +1,8 @@
"""Factory stub for running celery worker / celery beat."""
from celery import Celery
from danswer.background.celery.apps.beat import celery_app
from danswer.utils.variable_functionality import set_is_ee_based_on_env_variable
set_is_ee_based_on_env_variable()
app = celery_app
app: Celery = celery_app

View File

@@ -1,8 +1,10 @@
"""Factory stub for running celery worker / celery beat."""
from celery import Celery
from danswer.utils.variable_functionality import fetch_versioned_implementation
from danswer.utils.variable_functionality import set_is_ee_based_on_env_variable
set_is_ee_based_on_env_variable()
app = fetch_versioned_implementation(
app: Celery = fetch_versioned_implementation(
"danswer.background.celery.apps.primary", "celery_app"
)

View File

@@ -29,18 +29,26 @@ JobStatusType = (
def _initializer(
func: Callable, args: list | tuple, kwargs: dict[str, Any] | None = None
) -> Any:
"""Ensure the parent proc's database connections are not touched
in the new connection pool
"""Initialize the child process with a fresh SQLAlchemy Engine.
Based on the recommended approach in the SQLAlchemy docs found:
Based on SQLAlchemy's recommendations to handle multiprocessing:
https://docs.sqlalchemy.org/en/20/core/pooling.html#using-connection-pools-with-multiprocessing-or-os-fork
"""
if kwargs is None:
kwargs = {}
logger.info("Initializing spawned worker child process.")
# Reset the engine in the child process
SqlEngine.reset_engine()
# Optionally set a custom app name for database logging purposes
SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_CHILD_APP_NAME)
# Initialize a new engine with desired parameters
SqlEngine.init_engine(pool_size=4, max_overflow=12, pool_recycle=60)
# Proceed with executing the target function
return func(*args, **kwargs)
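The point of resetting the engine here is that a child process must never reuse connections pooled by the parent. A rough sketch of how a wrapper like _initializer can sit between process creation and the real work; reset_engine and run_indexing below are placeholders rather than the real SqlEngine / indexing entrypoints.

import multiprocessing as mp
from typing import Any, Callable


def reset_engine() -> None:
    """Placeholder: dispose of any inherited pool and build a fresh Engine."""


def child_entry(func: Callable[..., Any], args: tuple, kwargs: dict | None = None) -> Any:
    if kwargs is None:
        kwargs = {}
    reset_engine()  # fresh Engine/pool in the child, per SQLAlchemy's fork guidance
    return func(*args, **kwargs)


def run_indexing(attempt_id: int) -> None:
    print(f"indexing attempt {attempt_id}")  # placeholder workload


if __name__ == "__main__":
    ctx = mp.get_context("spawn")  # spawn avoids inheriting fork-unsafe state
    p = ctx.Process(target=child_entry, args=(run_indexing, (42,)))
    p.start()
    p.join()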

View File

@@ -1,7 +1,5 @@
import time
import traceback
from abc import ABC
from abc import abstractmethod
from datetime import datetime
from datetime import timedelta
from datetime import timezone
@@ -21,6 +19,7 @@ from danswer.db.connector_credential_pair import get_last_successful_attempt_tim
from danswer.db.connector_credential_pair import update_connector_credential_pair
from danswer.db.engine import get_session_with_tenant
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.index_attempt import mark_attempt_canceled
from danswer.db.index_attempt import mark_attempt_failed
from danswer.db.index_attempt import mark_attempt_partially_succeeded
from danswer.db.index_attempt import mark_attempt_succeeded
@@ -31,10 +30,10 @@ from danswer.db.models import IndexingStatus
from danswer.db.models import IndexModelStatus
from danswer.document_index.factory import get_default_document_index
from danswer.indexing.embedder import DefaultIndexingEmbedder
from danswer.indexing.indexing_heartbeat import IndexingHeartbeat
from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from danswer.indexing.indexing_pipeline import build_indexing_pipeline
from danswer.utils.logger import IndexAttemptSingleton
from danswer.utils.logger import setup_logger
from danswer.utils.logger import TaskAttemptSingleton
from danswer.utils.variable_functionality import global_version
logger = setup_logger()
@@ -42,19 +41,6 @@ logger = setup_logger()
INDEXING_TRACER_NUM_PRINT_ENTRIES = 5
class RunIndexingCallbackInterface(ABC):
"""Defines a callback interface to be passed to
to run_indexing_entrypoint."""
@abstractmethod
def should_stop(self) -> bool:
"""Signal to stop the looping function in flight."""
@abstractmethod
def progress(self, amount: int) -> None:
"""Send progress updates to the caller."""
def _get_connector_runner(
db_session: Session,
attempt: IndexAttempt,
@@ -102,11 +88,15 @@ def _get_connector_runner(
)
class ConnectorStopSignal(Exception):
"""A custom exception used to signal a stop in processing."""
def _run_indexing(
db_session: Session,
index_attempt: IndexAttempt,
tenant_id: str | None,
callback: RunIndexingCallbackInterface | None = None,
callback: IndexingHeartbeatInterface | None = None,
) -> None:
"""
1. Get documents which are either new or updated from specified application
@@ -138,13 +128,7 @@ def _run_indexing(
embedding_model = DefaultIndexingEmbedder.from_db_search_settings(
search_settings=search_settings,
heartbeat=IndexingHeartbeat(
index_attempt_id=index_attempt.id,
db_session=db_session,
# let the world know we're still making progress after
# every 10 batches
freq=10,
),
callback=callback,
)
indexing_pipeline = build_indexing_pipeline(
@@ -157,6 +141,7 @@ def _run_indexing(
),
db_session=db_session,
tenant_id=tenant_id,
callback=callback,
)
db_cc_pair = index_attempt.connector_credential_pair
@@ -228,7 +213,7 @@ def _run_indexing(
# contents still need to be initially pulled.
if callback:
if callback.should_stop():
raise RuntimeError("Connector stop signal detected")
raise ConnectorStopSignal("Connector stop signal detected")
# TODO: should we move this into the above callback instead?
db_session.refresh(db_cc_pair)
@@ -289,7 +274,7 @@ def _run_indexing(
db_session.commit()
if callback:
callback.progress(len(doc_batch))
callback.progress("_run_indexing", len(doc_batch))
# This new value is updated every batch, so UI can refresh per batch update
update_docs_indexed(
@@ -322,26 +307,16 @@ def _run_indexing(
)
except Exception as e:
logger.exception(
f"Connector run ran into exception after elapsed time: {time.time() - start_time} seconds"
f"Connector run exceptioned after elapsed time: {time.time() - start_time} seconds"
)
# Only mark the attempt as a complete failure if this is the first indexing window.
# Otherwise, some progress was made - the next run will not start from the beginning.
# In this case, it is not accurate to mark it as a failure. When the next run begins,
# if that fails immediately, it will be marked as a failure.
#
# NOTE: if the connector is manually disabled, we should mark it as a failure regardless
# to give better clarity in the UI, as the next run will never happen.
if (
ind == 0
or not db_cc_pair.status.is_active()
or index_attempt.status != IndexingStatus.IN_PROGRESS
):
mark_attempt_failed(
if isinstance(e, ConnectorStopSignal):
mark_attempt_canceled(
index_attempt.id,
db_session,
failure_reason=str(e),
full_exception_trace=traceback.format_exc(),
reason=str(e),
)
if is_primary:
update_connector_credential_pair(
db_session=db_session,
@@ -353,6 +328,37 @@ def _run_indexing(
if INDEXING_TRACER_INTERVAL > 0:
tracer.stop()
raise e
else:
# Only mark the attempt as a complete failure if this is the first indexing window.
# Otherwise, some progress was made - the next run will not start from the beginning.
# In this case, it is not accurate to mark it as a failure. When the next run begins,
# if that fails immediately, it will be marked as a failure.
#
# NOTE: if the connector is manually disabled, we should mark it as a failure regardless
# to give better clarity in the UI, as the next run will never happen.
if (
ind == 0
or not db_cc_pair.status.is_active()
or index_attempt.status != IndexingStatus.IN_PROGRESS
):
mark_attempt_failed(
index_attempt.id,
db_session,
failure_reason=str(e),
full_exception_trace=traceback.format_exc(),
)
if is_primary:
update_connector_credential_pair(
db_session=db_session,
connector_id=db_connector.id,
credential_id=db_credential.id,
net_docs=net_doc_change,
)
if INDEXING_TRACER_INTERVAL > 0:
tracer.stop()
raise e
# break => similar to success case. As mentioned above, if the next run fails for the same
# reason it will then be marked as a failure
@@ -419,7 +425,7 @@ def run_indexing_entrypoint(
tenant_id: str | None,
connector_credential_pair_id: int,
is_ee: bool = False,
callback: RunIndexingCallbackInterface | None = None,
callback: IndexingHeartbeatInterface | None = None,
) -> None:
try:
if is_ee:
@@ -427,17 +433,19 @@ def run_indexing_entrypoint(
# set the indexing attempt ID so that all log messages from this process
# will have it added as a prefix
IndexAttemptSingleton.set_cc_and_index_id(
TaskAttemptSingleton.set_cc_and_index_id(
index_attempt_id, connector_credential_pair_id
)
with get_session_with_tenant(tenant_id) as db_session:
attempt = transition_attempt_to_in_progress(index_attempt_id, db_session)
tenant_str = ""
if tenant_id is not None:
tenant_str = f" for tenant {tenant_id}"
logger.info(
f"Indexing starting for tenant {tenant_id}: "
if tenant_id is not None
else ""
+ f"connector='{attempt.connector_credential_pair.connector.name}' "
f"Indexing starting{tenant_str}: "
f"connector='{attempt.connector_credential_pair.connector.name}' "
f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' "
f"credentials='{attempt.connector_credential_pair.connector_id}'"
)
@@ -445,10 +453,8 @@ def run_indexing_entrypoint(
_run_indexing(db_session, attempt, tenant_id, callback)
logger.info(
f"Indexing finished for tenant {tenant_id}: "
if tenant_id is not None
else ""
+ f"connector='{attempt.connector_credential_pair.connector.name}' "
f"Indexing finished{tenant_str}: "
f"connector='{attempt.connector_credential_pair.connector.name}' "
f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' "
f"credentials='{attempt.connector_credential_pair.connector_id}'"
)

View File

@@ -1,4 +0,0 @@
def name_sync_external_doc_permissions_task(
cc_pair_id: int, tenant_id: str | None = None
) -> str:
return f"sync_external_doc_permissions_task__{cc_pair_id}"

View File

@@ -14,15 +14,6 @@ from danswer.db.tasks import mark_task_start
from danswer.db.tasks import register_task
def name_cc_prune_task(
connector_id: int | None = None, credential_id: int | None = None
) -> str:
task_name = f"prune_connector_credential_pair_{connector_id}_{credential_id}"
if not connector_id or not credential_id:
task_name = "prune_connector_credential_pair"
return task_name
T = TypeVar("T", bound=Callable)

View File

@@ -2,20 +2,79 @@ import re
from typing import cast
from uuid import UUID
from fastapi import HTTPException
from fastapi.datastructures import Headers
from sqlalchemy.orm import Session
from danswer.auth.users import is_user_admin
from danswer.chat.models import CitationInfo
from danswer.chat.models import LlmDoc
from danswer.chat.models import PersonaOverrideConfig
from danswer.chat.models import ThreadMessage
from danswer.configs.constants import DEFAULT_PERSONA_ID
from danswer.configs.constants import MessageType
from danswer.context.search.models import InferenceSection
from danswer.context.search.models import RerankingDetails
from danswer.context.search.models import RetrievalDetails
from danswer.db.chat import create_chat_session
from danswer.db.chat import get_chat_messages_by_session
from danswer.db.llm import fetch_existing_doc_sets
from danswer.db.llm import fetch_existing_tools
from danswer.db.models import ChatMessage
from danswer.db.models import Persona
from danswer.db.models import Prompt
from danswer.db.models import Tool
from danswer.db.models import User
from danswer.db.persona import get_prompts_by_ids
from danswer.llm.answering.models import PreviousMessage
from danswer.search.models import InferenceSection
from danswer.natural_language_processing.utils import BaseTokenizer
from danswer.server.query_and_chat.models import CreateChatMessageRequest
from danswer.tools.tool_implementations.custom.custom_tool import (
build_custom_tools_from_openapi_schema_and_headers,
)
from danswer.utils.logger import setup_logger
logger = setup_logger()
def prepare_chat_message_request(
message_text: str,
user: User | None,
persona_id: int | None,
# Does the question need to have a persona override
persona_override_config: PersonaOverrideConfig | None,
prompt: Prompt | None,
message_ts_to_respond_to: str | None,
retrieval_details: RetrievalDetails | None,
rerank_settings: RerankingDetails | None,
db_session: Session,
) -> CreateChatMessageRequest:
# Typically used for one shot flows like SlackBot or non-chat API endpoint use cases
new_chat_session = create_chat_session(
db_session=db_session,
description=None,
user_id=user.id if user else None,
# If using an override, this id will be ignored later on
persona_id=persona_id or DEFAULT_PERSONA_ID,
danswerbot_flow=True,
slack_thread_id=message_ts_to_respond_to,
)
return CreateChatMessageRequest(
chat_session_id=new_chat_session.id,
parent_message_id=None, # It's a standalone chat session each time
message=message_text,
file_descriptors=[], # Currently SlackBot/answer api do not support files in the context
prompt_id=prompt.id if prompt else None,
# Can always override the persona for the single query, if it's a normal persona
# then it will be treated the same
persona_override_config=persona_override_config,
search_doc_ids=None,
retrieval_options=retrieval_details,
rerank_settings=rerank_settings,
)
def llm_doc_from_inference_section(inference_section: InferenceSection) -> LlmDoc:
return LlmDoc(
document_id=inference_section.center_chunk.document_id,
@@ -31,9 +90,49 @@ def llm_doc_from_inference_section(inference_section: InferenceSection) -> LlmDo
if inference_section.center_chunk.source_links
else None,
source_links=inference_section.center_chunk.source_links,
match_highlights=inference_section.center_chunk.match_highlights,
)
def combine_message_thread(
messages: list[ThreadMessage],
max_tokens: int | None,
llm_tokenizer: BaseTokenizer,
) -> str:
"""Used to create a single combined message context from threads"""
if not messages:
return ""
message_strs: list[str] = []
total_token_count = 0
for message in reversed(messages):
if message.role == MessageType.USER:
role_str = message.role.value.upper()
if message.sender:
role_str += " " + message.sender
else:
# Since other messages might have the user identifying information
# better to use Unknown for symmetry
role_str += " Unknown"
else:
role_str = message.role.value.upper()
msg_str = f"{role_str}:\n{message.message}"
message_token_count = len(llm_tokenizer.encode(msg_str))
if (
max_tokens is not None
and total_token_count + message_token_count > max_tokens
):
break
message_strs.insert(0, msg_str)
total_token_count += message_token_count
return "\n\n".join(message_strs)
def create_chat_chain(
chat_session_id: UUID,
db_session: Session,
@@ -196,3 +295,71 @@ def extract_headers(
if lowercase_key in headers:
extracted_headers[lowercase_key] = headers[lowercase_key]
return extracted_headers
def create_temporary_persona(
persona_config: PersonaOverrideConfig, db_session: Session, user: User | None = None
) -> Persona:
"""Create a temporary Persona object from the provided configuration."""
if not is_user_admin(user):
raise HTTPException(
status_code=403,
detail="User is not authorized to create a persona in one shot queries",
)
persona = Persona(
name=persona_config.name,
description=persona_config.description,
num_chunks=persona_config.num_chunks,
llm_relevance_filter=persona_config.llm_relevance_filter,
llm_filter_extraction=persona_config.llm_filter_extraction,
recency_bias=persona_config.recency_bias,
llm_model_provider_override=persona_config.llm_model_provider_override,
llm_model_version_override=persona_config.llm_model_version_override,
)
if persona_config.prompts:
persona.prompts = [
Prompt(
name=p.name,
description=p.description,
system_prompt=p.system_prompt,
task_prompt=p.task_prompt,
include_citations=p.include_citations,
datetime_aware=p.datetime_aware,
)
for p in persona_config.prompts
]
elif persona_config.prompt_ids:
persona.prompts = get_prompts_by_ids(
db_session=db_session, prompt_ids=persona_config.prompt_ids
)
persona.tools = []
if persona_config.custom_tools_openapi:
for schema in persona_config.custom_tools_openapi:
tools = cast(
list[Tool],
build_custom_tools_from_openapi_schema_and_headers(schema),
)
persona.tools.extend(tools)
if persona_config.tools:
tool_ids = [tool.id for tool in persona_config.tools]
persona.tools.extend(
fetch_existing_tools(db_session=db_session, tool_ids=tool_ids)
)
if persona_config.tool_ids:
persona.tools.extend(
fetch_existing_tools(
db_session=db_session, tool_ids=persona_config.tool_ids
)
)
fetched_docs = fetch_existing_doc_sets(
db_session=db_session, doc_ids=persona_config.document_set_ids
)
persona.document_sets = fetched_docs
return persona

View File

@@ -4,12 +4,14 @@ from enum import Enum
from typing import Any
from pydantic import BaseModel
from pydantic import Field
from danswer.configs.constants import DocumentSource
from danswer.search.enums import QueryFlow
from danswer.search.enums import SearchType
from danswer.search.models import RetrievalDocs
from danswer.search.models import SearchResponse
from danswer.configs.constants import MessageType
from danswer.context.search.enums import QueryFlow
from danswer.context.search.enums import RecencyBiasSetting
from danswer.context.search.enums import SearchType
from danswer.context.search.models import RetrievalDocs
from danswer.tools.tool_implementations.custom.base_tool_types import ToolResultType
@@ -25,6 +27,7 @@ class LlmDoc(BaseModel):
updated_at: datetime | None
link: str | None
source_links: dict[int, str] | None
match_highlights: list[str] | None
# First chunk of info for streaming QA
@@ -117,20 +120,6 @@ class StreamingError(BaseModel):
stack_trace: str | None = None
class DanswerQuote(BaseModel):
# This is during inference so everything is a string by this point
quote: str
document_id: str
link: str | None
source_type: str
semantic_identifier: str
blurb: str
class DanswerQuotes(BaseModel):
quotes: list[DanswerQuote]
class DanswerContext(BaseModel):
content: str
document_id: str
@@ -146,14 +135,20 @@ class DanswerAnswer(BaseModel):
answer: str | None
class QAResponse(SearchResponse, DanswerAnswer):
quotes: list[DanswerQuote] | None
contexts: list[DanswerContexts] | None
predicted_flow: QueryFlow
predicted_search: SearchType
eval_res_valid: bool | None = None
class ThreadMessage(BaseModel):
message: str
sender: str | None = None
role: MessageType = MessageType.USER
class ChatDanswerBotResponse(BaseModel):
answer: str | None = None
citations: list[CitationInfo] | None = None
docs: QADocsResponse | None = None
llm_selected_doc_indices: list[int] | None = None
error_msg: str | None = None
chat_message_id: int | None = None
answer_valid: bool = True # Reflexion result, default True if Reflexion not run
class FileChatDisplay(BaseModel):
@@ -165,9 +160,41 @@ class CustomToolResponse(BaseModel):
tool_name: str
class ToolConfig(BaseModel):
id: int
class PromptOverrideConfig(BaseModel):
name: str
description: str = ""
system_prompt: str
task_prompt: str = ""
include_citations: bool = True
datetime_aware: bool = True
class PersonaOverrideConfig(BaseModel):
name: str
description: str
search_type: SearchType = SearchType.SEMANTIC
num_chunks: float | None = None
llm_relevance_filter: bool = False
llm_filter_extraction: bool = False
recency_bias: RecencyBiasSetting = RecencyBiasSetting.AUTO
llm_model_provider_override: str | None = None
llm_model_version_override: str | None = None
prompts: list[PromptOverrideConfig] = Field(default_factory=list)
prompt_ids: list[int] = Field(default_factory=list)
document_set_ids: list[int] = Field(default_factory=list)
tools: list[ToolConfig] = Field(default_factory=list)
tool_ids: list[int] = Field(default_factory=list)
custom_tools_openapi: list[dict[str, Any]] = Field(default_factory=list)
AnswerQuestionPossibleReturn = (
DanswerAnswerPiece
| DanswerQuotes
| CitationInfo
| DanswerContexts
| FileChatDisplay
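For reference, here is a hedged sketch of an API payload built from the override models above (field values are invented; the field names come from PersonaOverrideConfig and PromptOverrideConfig):

# Illustrative payload: a PersonaOverrideConfig with one inline PromptOverrideConfig,
# serialized the way a client might send it (Pydantic v2 model_dump assumed, matching
# the model_dump usage elsewhere in this change set).
override = PersonaOverrideConfig(
    name="support-triage",
    description="Ad-hoc assistant for triaging support threads",
    num_chunks=10,
    prompts=[
        PromptOverrideConfig(
            name="triage-prompt",
            system_prompt="You triage incoming support questions.",
            task_prompt="Answer concisely and cite sources.",
        )
    ],
    document_set_ids=[2],  # invented id
)
payload = override.model_dump()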

View File

@@ -7,10 +7,13 @@ from typing import cast
from sqlalchemy.orm import Session
from danswer.chat.chat_utils import create_chat_chain
from danswer.chat.chat_utils import create_temporary_persona
from danswer.chat.models import AllCitations
from danswer.chat.models import ChatDanswerBotResponse
from danswer.chat.models import CitationInfo
from danswer.chat.models import CustomToolResponse
from danswer.chat.models import DanswerAnswerPiece
from danswer.chat.models import DanswerContexts
from danswer.chat.models import FileChatDisplay
from danswer.chat.models import FinalUsedContextDocsResponse
from danswer.chat.models import LLMRelevanceFilterResponse
@@ -23,6 +26,16 @@ from danswer.configs.chat_configs import CHAT_TARGET_CHUNK_PERCENTAGE
from danswer.configs.chat_configs import DISABLE_LLM_CHOOSE_SEARCH
from danswer.configs.chat_configs import MAX_CHUNKS_FED_TO_CHAT
from danswer.configs.constants import MessageType
from danswer.context.search.enums import OptionalSearchSetting
from danswer.context.search.enums import QueryFlow
from danswer.context.search.enums import SearchType
from danswer.context.search.models import InferenceSection
from danswer.context.search.models import RetrievalDetails
from danswer.context.search.retrieval.search_runner import inference_sections_from_ids
from danswer.context.search.utils import chunks_or_sections_to_search_docs
from danswer.context.search.utils import dedupe_documents
from danswer.context.search.utils import drop_llm_indices
from danswer.context.search.utils import relevant_sections_to_indices
from danswer.db.chat import attach_files_to_chat_message
from danswer.db.chat import create_db_search_doc
from danswer.db.chat import create_new_chat_message
@@ -56,16 +69,6 @@ from danswer.llm.factory import get_llms_for_persona
from danswer.llm.factory import get_main_llm_from_tuple
from danswer.llm.utils import litellm_exception_to_error_msg
from danswer.natural_language_processing.utils import get_tokenizer
from danswer.search.enums import OptionalSearchSetting
from danswer.search.enums import QueryFlow
from danswer.search.enums import SearchType
from danswer.search.models import InferenceSection
from danswer.search.models import RetrievalDetails
from danswer.search.retrieval.search_runner import inference_sections_from_ids
from danswer.search.utils import chunks_or_sections_to_search_docs
from danswer.search.utils import dedupe_documents
from danswer.search.utils import drop_llm_indices
from danswer.search.utils import relevant_sections_to_indices
from danswer.server.query_and_chat.models import ChatMessageDetail
from danswer.server.query_and_chat.models import CreateChatMessageRequest
from danswer.server.utils import get_json_line
@@ -102,6 +105,7 @@ from danswer.tools.tool_implementations.internet_search.internet_search_tool imp
from danswer.tools.tool_implementations.search.search_tool import (
FINAL_CONTEXT_DOCUMENTS_ID,
)
from danswer.tools.tool_implementations.search.search_tool import SEARCH_DOC_CONTENT_ID
from danswer.tools.tool_implementations.search.search_tool import (
SEARCH_RESPONSE_SUMMARY_ID,
)
@@ -112,8 +116,11 @@ from danswer.tools.tool_implementations.search.search_tool import (
)
from danswer.tools.tool_runner import ToolCallFinalResult
from danswer.utils.logger import setup_logger
from danswer.utils.long_term_log import LongTermLogger
from danswer.utils.timing import log_function_time
from danswer.utils.timing import log_generator_function_time
logger = setup_logger()
@@ -255,6 +262,7 @@ def _get_force_search_settings(
ChatPacket = (
StreamingError
| QADocsResponse
| DanswerContexts
| LLMRelevanceFilterResponse
| FinalUsedContextDocsResponse
| ChatMessageDetail
@@ -285,6 +293,8 @@ def stream_chat_message_objects(
custom_tool_additional_headers: dict[str, str] | None = None,
is_connected: Callable[[], bool] | None = None,
enforce_chat_session_id_for_search_docs: bool = True,
bypass_acl: bool = False,
include_contexts: bool = False,
) -> ChatPacketStream:
"""Streams in order:
1. [conditional] Retrieved documents if a search needs to be run
@@ -316,17 +326,36 @@ def stream_chat_message_objects(
retrieval_options = new_msg_req.retrieval_options
alternate_assistant_id = new_msg_req.alternate_assistant_id
# use alternate persona if alternative assistant id is passed in
# permanent "log" store, used primarily for debugging
long_term_logger = LongTermLogger(
metadata={"user_id": str(user_id), "chat_session_id": str(chat_session_id)}
)
if alternate_assistant_id is not None:
# Allows users to specify a temporary persona (assistant) in the chat session
# this takes highest priority since it's user specified
persona = get_persona_by_id(
alternate_assistant_id,
user=user,
db_session=db_session,
is_for_edit=False,
)
elif new_msg_req.persona_override_config:
# Certain endpoints allow users to specify arbitrary persona settings
# this should never conflict with the alternate_assistant_id
persona = create_temporary_persona(
db_session=db_session,
persona_config=new_msg_req.persona_override_config,
user=user,
)
else:
persona = chat_session.persona
if not persona:
raise RuntimeError("No persona specified or found for chat session")
# If a prompt override is specified via the API, use that with highest priority
# but for saving it, we are just mapping it to an existing prompt
prompt_id = new_msg_req.prompt_id
if prompt_id is None and persona.prompts:
prompt_id = sorted(persona.prompts, key=lambda x: x.id)[-1].id
@@ -341,6 +370,7 @@ def stream_chat_message_objects(
persona=persona,
llm_override=new_msg_req.llm_override or chat_session.llm_override,
additional_headers=litellm_additional_headers,
long_term_logger=long_term_logger,
)
except GenAIDisabledException:
raise RuntimeError("LLM is disabled. Can't use chat flow without LLM.")
@@ -548,19 +578,34 @@ def stream_chat_message_objects(
reserved_message_id=reserved_message_id,
)
if not final_msg.prompt:
raise RuntimeError("No Prompt found")
prompt_config = (
PromptConfig.from_model(
final_msg.prompt,
prompt_override=(
new_msg_req.prompt_override or chat_session.prompt_override
),
prompt_override = new_msg_req.prompt_override or chat_session.prompt_override
if new_msg_req.persona_override_config:
prompt_config = PromptConfig(
system_prompt=new_msg_req.persona_override_config.prompts[
0
].system_prompt,
task_prompt=new_msg_req.persona_override_config.prompts[0].task_prompt,
datetime_aware=new_msg_req.persona_override_config.prompts[
0
].datetime_aware,
include_citations=new_msg_req.persona_override_config.prompts[
0
].include_citations,
)
if not persona
else PromptConfig.from_model(persona.prompts[0])
)
elif prompt_override:
if not final_msg.prompt:
raise ValueError(
"Prompt override cannot be applied, no base prompt found."
)
prompt_config = PromptConfig.from_model(
final_msg.prompt,
prompt_override=prompt_override,
)
elif final_msg.prompt:
prompt_config = PromptConfig.from_model(final_msg.prompt)
else:
prompt_config = PromptConfig.from_model(persona.prompts[0])
answer_style_config = AnswerStyleConfig(
citation_config=CitationConfig(
all_docs_useful=selected_db_search_docs is not None
@@ -580,11 +625,13 @@ def stream_chat_message_objects(
answer_style_config=answer_style_config,
document_pruning_config=document_pruning_config,
retrieval_options=retrieval_options or RetrievalDetails(),
rerank_settings=new_msg_req.rerank_settings,
selected_sections=selected_sections,
chunks_above=new_msg_req.chunks_above,
chunks_below=new_msg_req.chunks_below,
full_doc=new_msg_req.full_doc,
latest_query_files=latest_query_files,
bypass_acl=bypass_acl,
),
internet_search_tool_config=InternetSearchToolConfig(
answer_style_config=answer_style_config,
@@ -598,6 +645,7 @@ def stream_chat_message_objects(
additional_headers=custom_tool_additional_headers,
),
)
tools: list[Tool] = []
for tool_list in tool_dict.values():
tools.extend(tool_list)
@@ -729,6 +777,8 @@ def stream_chat_message_objects(
response=custom_tool_response.tool_result,
tool_name=custom_tool_response.tool_name,
)
elif packet.id == SEARCH_DOC_CONTENT_ID and include_contexts:
yield cast(DanswerContexts, packet.response)
elif isinstance(packet, StreamStopInfo):
pass
@@ -837,3 +887,30 @@ def stream_chat_message(
)
for obj in objects:
yield get_json_line(obj.model_dump())
@log_function_time()
def gather_stream_for_slack(
packets: ChatPacketStream,
) -> ChatDanswerBotResponse:
response = ChatDanswerBotResponse()
answer = ""
for packet in packets:
if isinstance(packet, DanswerAnswerPiece) and packet.answer_piece:
answer += packet.answer_piece
elif isinstance(packet, QADocsResponse):
response.docs = packet
elif isinstance(packet, StreamingError):
response.error_msg = packet.error
elif isinstance(packet, ChatMessageDetail):
response.chat_message_id = packet.message_id
elif isinstance(packet, LLMRelevanceFilterResponse):
response.llm_selected_doc_indices = packet.llm_selected_doc_indices
elif isinstance(packet, AllCitations):
response.citations = packet.citations
if answer:
response.answer = answer
return response
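A minimal sketch of how the Slack flow might consume this helper (the request construction and the send step are illustrative; only stream_chat_message_objects and gather_stream_for_slack are defined in this diff):

# Hypothetical caller: run the chat pipeline and collapse the stream for Slack.
packets = stream_chat_message_objects(
    new_msg_req=new_msg_req,   # a CreateChatMessageRequest built from the Slack thread (assumed)
    user=None,                 # DanswerBot is assumed to act without an end-user session
    db_session=db_session,
)
slack_response = gather_stream_for_slack(packets)
if slack_response.error_msg:
    logger.error(f"Chat flow failed: {slack_response.error_msg}")
else:
    post_answer_to_slack(slack_response.answer, slack_response.citations)  # hypothetical helper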

View File

@@ -1,115 +0,0 @@
from typing_extensions import TypedDict # noreorder
from pydantic import BaseModel
from danswer.prompts.chat_tools import DANSWER_TOOL_DESCRIPTION
from danswer.prompts.chat_tools import DANSWER_TOOL_NAME
from danswer.prompts.chat_tools import TOOL_FOLLOWUP
from danswer.prompts.chat_tools import TOOL_LESS_FOLLOWUP
from danswer.prompts.chat_tools import TOOL_LESS_PROMPT
from danswer.prompts.chat_tools import TOOL_TEMPLATE
from danswer.prompts.chat_tools import USER_INPUT
class ToolInfo(TypedDict):
name: str
description: str
class DanswerChatModelOut(BaseModel):
model_raw: str
action: str
action_input: str
def call_tool(
model_actions: DanswerChatModelOut,
) -> str:
raise NotImplementedError("There are no additional tool integrations right now")
def form_user_prompt_text(
query: str,
tool_text: str | None,
hint_text: str | None,
user_input_prompt: str = USER_INPUT,
tool_less_prompt: str = TOOL_LESS_PROMPT,
) -> str:
user_prompt = tool_text or tool_less_prompt
user_prompt += user_input_prompt.format(user_input=query)
if hint_text:
if user_prompt[-1] != "\n":
user_prompt += "\n"
user_prompt += "\nHint: " + hint_text
return user_prompt.strip()
def form_tool_section_text(
tools: list[ToolInfo] | None, retrieval_enabled: bool, template: str = TOOL_TEMPLATE
) -> str | None:
if not tools and not retrieval_enabled:
return None
if retrieval_enabled and tools:
tools.append(
{"name": DANSWER_TOOL_NAME, "description": DANSWER_TOOL_DESCRIPTION}
)
tools_intro = []
if tools:
num_tools = len(tools)
for tool in tools:
description_formatted = tool["description"].replace("\n", " ")
tools_intro.append(f"> {tool['name']}: {description_formatted}")
prefix = "Must be one of " if num_tools > 1 else "Must be "
tools_intro_text = "\n".join(tools_intro)
tool_names_text = prefix + ", ".join([tool["name"] for tool in tools])
else:
return None
return template.format(
tool_overviews=tools_intro_text, tool_names=tool_names_text
).strip()
def form_tool_followup_text(
tool_output: str,
query: str,
hint_text: str | None,
tool_followup_prompt: str = TOOL_FOLLOWUP,
ignore_hint: bool = False,
) -> str:
# If multi-line query, it likely confuses the model more than helps
if "\n" not in query:
optional_reminder = f"\nAs a reminder, my query was: {query}\n"
else:
optional_reminder = ""
if not ignore_hint and hint_text:
hint_text_spaced = f"\nHint: {hint_text}\n"
else:
hint_text_spaced = ""
return tool_followup_prompt.format(
tool_output=tool_output,
optional_reminder=optional_reminder,
hint=hint_text_spaced,
).strip()
def form_tool_less_followup_text(
tool_output: str,
query: str,
hint_text: str | None,
tool_followup_prompt: str = TOOL_LESS_FOLLOWUP,
) -> str:
hint = f"Hint: {hint_text}" if hint_text else ""
return tool_followup_prompt.format(
context_str=tool_output, user_query=query, hint_text=hint
).strip()

View File

@@ -234,7 +234,7 @@ except ValueError:
CELERY_WORKER_LIGHT_PREFETCH_MULTIPLIER_DEFAULT
)
CELERY_WORKER_INDEXING_CONCURRENCY_DEFAULT = 1
CELERY_WORKER_INDEXING_CONCURRENCY_DEFAULT = 3
try:
env_value = os.environ.get("CELERY_WORKER_INDEXING_CONCURRENCY")
if not env_value:
@@ -308,6 +308,22 @@ CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD = int(
os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD", 200_000)
)
# Due to breakages in the confluence API, the timezone offset must be specified client side
# to match the user's specified timezone.
# The current state of affairs:
# CQL queries are parsed in the user's timezone and cannot be specified in UTC
# no API retrieves the user's timezone
# All data is returned in UTC, so we can't derive the user's timezone from that
# https://community.developer.atlassian.com/t/confluence-cloud-time-zone-get-via-rest-api/35954/16
# https://jira.atlassian.com/browse/CONFCLOUD-69670
# enter as a floating point offset from UTC in hours (-24 < val < 24)
# this will be applied globally, so it probably makes sense to transition this to per
# connector at some point.
CONFLUENCE_TIMEZONE_OFFSET = float(os.environ.get("CONFLUENCE_TIMEZONE_OFFSET", 0.0))
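For illustration, this is roughly how such an hour offset becomes the tzinfo used when formatting CQL time filters (a sketch; the actual wiring is in the Confluence connector changes further down):

# Sketch: turn a configured UTC offset (e.g. -8.0) into a tzinfo and format an
# epoch timestamp the way the connector's CQL "lastmodified" filter expects.
from datetime import datetime, timedelta, timezone

offset_hours = -8.0  # would come from CONFLUENCE_TIMEZONE_OFFSET
user_tz = timezone(timedelta(hours=offset_hours))
formatted = datetime.fromtimestamp(1_700_000_000, tz=user_tz).strftime("%Y-%m-%d %H:%M")
# `formatted` is now expressed in the user's local time, matching how Confluence parses CQL.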
JIRA_CONNECTOR_LABELS_TO_SKIP = [
ignored_tag
for ignored_tag in os.environ.get("JIRA_CONNECTOR_LABELS_TO_SKIP", "").split(",")
@@ -422,6 +438,9 @@ LOG_ALL_MODEL_INTERACTIONS = (
LOG_DANSWER_MODEL_INTERACTIONS = (
os.environ.get("LOG_DANSWER_MODEL_INTERACTIONS", "").lower() == "true"
)
LOG_INDIVIDUAL_MODEL_TOKENS = (
os.environ.get("LOG_INDIVIDUAL_MODEL_TOKENS", "").lower() == "true"
)
# If set to `true` will enable additional logs about Vespa query performance
# (time spent on finding the right docs + time spent fetching summaries from disk)
LOG_VESPA_TIMING_INFORMATION = (
@@ -490,10 +509,6 @@ CONTROL_PLANE_API_BASE_URL = os.environ.get(
# JWT configuration
JWT_ALGORITHM = "HS256"
# Super Users
SUPER_USERS = json.loads(os.environ.get("SUPER_USERS", '["pablo@danswer.ai"]'))
SUPER_CLOUD_API_KEY = os.environ.get("SUPER_CLOUD_API_KEY", "api_key")
#####
# API Key Configs
@@ -503,3 +518,10 @@ _API_KEY_HASH_ROUNDS_RAW = os.environ.get("API_KEY_HASH_ROUNDS")
API_KEY_HASH_ROUNDS = (
int(_API_KEY_HASH_ROUNDS_RAW) if _API_KEY_HASH_ROUNDS_RAW else None
)
POD_NAME = os.environ.get("POD_NAME")
POD_NAMESPACE = os.environ.get("POD_NAMESPACE")
DEV_MODE = os.environ.get("DEV_MODE", "").lower() == "true"

View File

@@ -1,9 +1,9 @@
import os
PROMPTS_YAML = "./danswer/chat/prompts.yaml"
PERSONAS_YAML = "./danswer/chat/personas.yaml"
INPUT_PROMPT_YAML = "./danswer/chat/input_prompts.yaml"
PROMPTS_YAML = "./danswer/seeding/prompts.yaml"
PERSONAS_YAML = "./danswer/seeding/personas.yaml"
INPUT_PROMPT_YAML = "./danswer/seeding/input_prompts.yaml"
NUM_RETURNED_HITS = 50
# Used for LLM filtering and reranking
@@ -17,9 +17,6 @@ MAX_CHUNKS_FED_TO_CHAT = float(os.environ.get("MAX_CHUNKS_FED_TO_CHAT") or 10.0)
# ~3k input, half for docs, half for chat history + prompts
CHAT_TARGET_CHUNK_PERCENTAGE = 512 * 3 / 3072
# For selecting a different LLM question-answering prompt format
# Valid values: default, cot, weak
QA_PROMPT_OVERRIDE = os.environ.get("QA_PROMPT_OVERRIDE") or None
# 1 / (1 + DOC_TIME_DECAY * doc-age-in-years), set to 0 to have no decay
# Capped in Vespa at 0.5
DOC_TIME_DECAY = float(
@@ -27,8 +24,6 @@ DOC_TIME_DECAY = float(
)
BASE_RECENCY_DECAY = 0.5
FAVOR_RECENT_DECAY_MULTIPLIER = 2.0
# Currently this next one is not configurable via env
DISABLE_LLM_QUERY_ANSWERABILITY = QA_PROMPT_OVERRIDE == "weak"
# For the highest matching base size chunk, how many chunks above and below do we pull in by default
# Note this is not in any of the deployment configs yet
# Currently only applies to search flow not chat

View File

@@ -31,6 +31,8 @@ DISABLED_GEN_AI_MSG = (
"You can still use Danswer as a search engine."
)
DEFAULT_PERSONA_ID = 0
# Postgres connection constants for application_name
POSTGRES_WEB_APP_NAME = "web"
POSTGRES_INDEXER_APP_NAME = "indexer"
@@ -60,7 +62,6 @@ KV_GMAIL_CRED_KEY = "gmail_app_credential"
KV_GMAIL_SERVICE_ACCOUNT_KEY = "gmail_service_account_key"
KV_GOOGLE_DRIVE_CRED_KEY = "google_drive_app_credential"
KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY = "google_drive_service_account_key"
KV_SLACK_BOT_TOKENS_CONFIG_KEY = "slack_bot_tokens_config_key"
KV_GEN_AI_KEY_CHECK_TIME = "genai_api_key_last_check_time"
KV_SETTINGS_KEY = "danswer_settings"
KV_CUSTOMER_UUID_KEY = "customer_uuid"
@@ -74,12 +75,16 @@ CELERY_PRIMARY_WORKER_LOCK_TIMEOUT = 120
# needs to be long enough to cover the maximum time it takes to download an object
# if we can get callbacks as object bytes download, we could lower this a lot.
CELERY_INDEXING_LOCK_TIMEOUT = 60 * 60 # 60 min
CELERY_INDEXING_LOCK_TIMEOUT = 3 * 60 * 60  # 3 hours
# needs to be long enough to cover the maximum time it takes to download an object
# if we can get callbacks as object bytes download, we could lower this a lot.
CELERY_PRUNING_LOCK_TIMEOUT = 300 # 5 min
CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT = 300 # 5 min
CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT = 300 # 5 min
DANSWER_REDIS_FUNCTION_LOCK_PREFIX = "da_function_lock:"
@@ -209,9 +214,17 @@ class PostgresAdvisoryLocks(Enum):
class DanswerCeleryQueues:
# Light queue
VESPA_METADATA_SYNC = "vespa_metadata_sync"
DOC_PERMISSIONS_UPSERT = "doc_permissions_upsert"
CONNECTOR_DELETION = "connector_deletion"
# Heavy queue
CONNECTOR_PRUNING = "connector_pruning"
CONNECTOR_DOC_PERMISSIONS_SYNC = "connector_doc_permissions_sync"
CONNECTOR_EXTERNAL_GROUP_SYNC = "connector_external_group_sync"
# Indexing queue
CONNECTOR_INDEXING = "connector_indexing"
@@ -221,8 +234,18 @@ class DanswerRedisLocks:
CHECK_CONNECTOR_DELETION_BEAT_LOCK = "da_lock:check_connector_deletion_beat"
CHECK_PRUNE_BEAT_LOCK = "da_lock:check_prune_beat"
CHECK_INDEXING_BEAT_LOCK = "da_lock:check_indexing_beat"
CHECK_CONNECTOR_DOC_PERMISSIONS_SYNC_BEAT_LOCK = (
"da_lock:check_connector_doc_permissions_sync_beat"
)
CHECK_CONNECTOR_EXTERNAL_GROUP_SYNC_BEAT_LOCK = (
"da_lock:check_connector_external_group_sync_beat"
)
MONITOR_VESPA_SYNC_BEAT_LOCK = "da_lock:monitor_vespa_sync_beat"
CONNECTOR_DOC_PERMISSIONS_SYNC_LOCK_PREFIX = (
"da_lock:connector_doc_permissions_sync"
)
CONNECTOR_EXTERNAL_GROUP_SYNC_LOCK_PREFIX = "da_lock:connector_external_group_sync"
PRUNING_LOCK_PREFIX = "da_lock:pruning"
INDEXING_METADATA_PREFIX = "da_metadata:indexing"
@@ -238,6 +261,32 @@ class DanswerCeleryPriority(int, Enum):
LOWEST = auto()
class DanswerCeleryTask:
CHECK_FOR_CONNECTOR_DELETION = "check_for_connector_deletion_task"
CHECK_FOR_VESPA_SYNC_TASK = "check_for_vespa_sync_task"
CHECK_FOR_INDEXING = "check_for_indexing"
CHECK_FOR_PRUNING = "check_for_pruning"
CHECK_FOR_DOC_PERMISSIONS_SYNC = "check_for_doc_permissions_sync"
CHECK_FOR_EXTERNAL_GROUP_SYNC = "check_for_external_group_sync"
MONITOR_VESPA_SYNC = "monitor_vespa_sync"
KOMBU_MESSAGE_CLEANUP_TASK = "kombu_message_cleanup_task"
CONNECTOR_PERMISSION_SYNC_GENERATOR_TASK = (
"connector_permission_sync_generator_task"
)
UPDATE_EXTERNAL_DOCUMENT_PERMISSIONS_TASK = (
"update_external_document_permissions_task"
)
CONNECTOR_EXTERNAL_GROUP_SYNC_GENERATOR_TASK = (
"connector_external_group_sync_generator_task"
)
CONNECTOR_INDEXING_PROXY_TASK = "connector_indexing_proxy_task"
CONNECTOR_PRUNING_GENERATOR_TASK = "connector_pruning_generator_task"
DOCUMENT_BY_CC_PAIR_CLEANUP_TASK = "document_by_cc_pair_cleanup_task"
VESPA_METADATA_SYNC_TASK = "vespa_metadata_sync_task"
CHECK_TTL_MANAGEMENT_TASK = "check_ttl_management_task"
AUTOGENERATE_USAGE_REPORT_TASK = "autogenerate_usage_report_task"
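A sketch of how these name constants might be used when registering and dispatching tasks (the decorator and send_task call are standard Celery usage and an assumption here, not something shown in this diff):

# Illustrative only: register a task under its constant name and enqueue it onto
# the matching queue.
from celery import shared_task

@shared_task(name=DanswerCeleryTask.CHECK_FOR_INDEXING)
def check_for_indexing_task() -> None:
    ...  # body lives elsewhere in the codebase

celery_app.send_task(  # `celery_app` is the application's Celery instance (assumed)
    DanswerCeleryTask.CHECK_FOR_INDEXING,
    queue=DanswerCeleryQueues.CONNECTOR_INDEXING,
)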
REDIS_SOCKET_KEEPALIVE_OPTIONS = {}
REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPINTVL] = 15
REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPCNT] = 3

View File

@@ -4,11 +4,8 @@ import os
# Danswer Slack Bot Configs
#####
DANSWER_BOT_NUM_RETRIES = int(os.environ.get("DANSWER_BOT_NUM_RETRIES", "5"))
DANSWER_BOT_ANSWER_GENERATION_TIMEOUT = int(
os.environ.get("DANSWER_BOT_ANSWER_GENERATION_TIMEOUT", "90")
)
# How much of the available input context can be used for thread context
DANSWER_BOT_TARGET_CHUNK_PERCENTAGE = 512 * 2 / 3072
MAX_THREAD_CONTEXT_PERCENTAGE = 512 * 2 / 3072
# Number of docs to display in "Reference Documents"
DANSWER_BOT_NUM_DOCS_TO_DISPLAY = int(
os.environ.get("DANSWER_BOT_NUM_DOCS_TO_DISPLAY", "5")
@@ -47,17 +44,6 @@ DANSWER_BOT_DISPLAY_ERROR_MSGS = os.environ.get(
DANSWER_BOT_RESPOND_EVERY_CHANNEL = (
os.environ.get("DANSWER_BOT_RESPOND_EVERY_CHANNEL", "").lower() == "true"
)
# Add a second LLM call post Answer to verify if the Answer is valid
# Throws out answers that don't directly or fully answer the user query
# This is the default for all DanswerBot channels unless the channel is configured individually
# Set/unset by "Hide Non Answers"
ENABLE_DANSWERBOT_REFLEXION = (
os.environ.get("ENABLE_DANSWERBOT_REFLEXION", "").lower() == "true"
)
# Currently not support chain of thought, probably will add back later
DANSWER_BOT_DISABLE_COT = True
# if set, will default DanswerBot to use quotes and reference documents
DANSWER_BOT_USE_QUOTES = os.environ.get("DANSWER_BOT_USE_QUOTES", "").lower() == "true"
# Maximum Questions Per Minute, Default Uncapped
DANSWER_BOT_MAX_QPM = int(os.environ.get("DANSWER_BOT_MAX_QPM") or 0) or None

View File

@@ -70,7 +70,9 @@ GEN_AI_NUM_RESERVED_OUTPUT_TOKENS = int(
)
# Typically, GenAI models nowadays are at least 4K tokens
GEN_AI_MODEL_FALLBACK_MAX_TOKENS = 4096
GEN_AI_MODEL_FALLBACK_MAX_TOKENS = int(
os.environ.get("GEN_AI_MODEL_FALLBACK_MAX_TOKENS") or 4096
)
# Number of tokens from chat history to include at maximum
# 3000 should be enough context regardless of use, no need to include as much as possible
@@ -119,3 +121,14 @@ if _LITELLM_PASS_THROUGH_HEADERS_RAW:
logger.error(
"Failed to parse LITELLM_PASS_THROUGH_HEADERS, must be a valid JSON object"
)
# if specified, will merge the specified JSON with the existing body of the
# request before sending it to the LLM
LITELLM_EXTRA_BODY: dict | None = None
_LITELLM_EXTRA_BODY_RAW = os.environ.get("LITELLM_EXTRA_BODY")
if _LITELLM_EXTRA_BODY_RAW:
try:
LITELLM_EXTRA_BODY = json.loads(_LITELLM_EXTRA_BODY_RAW)
except Exception:
pass
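As a small sketch of the intended effect (the merge site itself is not part of this diff), the parsed object would be folded into an outgoing request body roughly like this:

# Illustrative merge: extra fields from LITELLM_EXTRA_BODY are layered on top of
# the request that would otherwise be sent to the LLM.
request_body: dict = {"model": "gpt-4o", "messages": []}  # invented baseline request
if LITELLM_EXTRA_BODY:
    request_body = {**request_body, **LITELLM_EXTRA_BODY}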

View File

@@ -11,11 +11,16 @@ Connectors come in 3 different flows:
- Load Connector:
- Bulk indexes documents to reflect a point in time. This type of connector generally works by either pulling all
documents via a connector's API or loading the documents from some sort of dump file.
- Poll connector:
- Poll Connector:
- Incrementally updates documents based on a provided time range. It is used by the background job to pull the latest
changes and additions since the last round of polling. This connector helps keep the document index up to date
without needing to fetch/embed/index every document which would be too slow to do frequently on large sets of
documents.
- Slim Connector:
- This connector should be a lighter weight method of checking all documents in the source to see if they still exist.
- This connector should be identical to the Poll or Load Connector except that it only fetches the IDs of the documents, not the documents themselves.
- This is used by our pruning job which removes old documents from the index.
- The optional start and end datetimes can be ignored.
- Event Based connectors:
- Connectors that listen to events and update documents accordingly.
- Currently not used by the background job, this exists for future design purposes.
@@ -26,8 +31,14 @@ Refer to [interfaces.py](https://github.com/danswer-ai/danswer/blob/main/backend
and this first contributor created Pull Request for a new connector (Shoutout to Dan Brown):
[Reference Pull Request](https://github.com/danswer-ai/danswer/pull/139)
For implementing a Slim Connector, refer to the comments in this PR:
[Slim Connector PR](https://github.com/danswer-ai/danswer/pull/3303/files)
All new connectors should have tests added to the `backend/tests/daily/connectors` directory. Refer to the above PR for an example of adding tests for a new connector.
#### Implementing the new Connector
The connector must subclass one or more of LoadConnector, PollConnector, or EventConnector.
The connector must subclass one or more of LoadConnector, PollConnector, SlimConnector, or EventConnector.
The `__init__` should take arguments for configuring what documents the connector will fetch and where it finds those
documents. For example, if you have a wiki site, it may include the configuration for the team, topic, folder, etc. of
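A minimal skeleton of what such a connector subclass could look like (class name, helper, and import paths are illustrative assumptions; only the interface names come from the documentation above):

# Illustrative skeleton; the real interfaces live in backend/danswer/connectors/interfaces.py.
from typing import Any

from danswer.connectors.interfaces import GenerateDocumentsOutput  # assumed import path
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch


class ExampleWikiConnector(LoadConnector, PollConnector):
    def __init__(self, team: str, topic: str, batch_size: int = 100) -> None:
        # configuration for *which* documents to fetch and *where* to find them
        self.team = team
        self.topic = topic
        self.batch_size = batch_size
        self._client: Any = None

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        self._client = build_example_client(credentials)  # hypothetical helper
        return None

    def load_from_state(self) -> GenerateDocumentsOutput:
        # Load flow: bulk index everything to reflect a point in time
        yield from self._fetch(start=None, end=None)

    def poll_source(
        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
    ) -> GenerateDocumentsOutput:
        # Poll flow: only documents changed within [start, end]
        yield from self._fetch(start=start, end=end)

    def _fetch(self, start: float | None, end: float | None) -> GenerateDocumentsOutput:
        # Fetch from the source API and convert to Document batches of self.batch_size;
        # elided here because the shape depends entirely on the source system.
        yield []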

View File

@@ -5,9 +5,9 @@ from io import BytesIO
from typing import Any
from typing import Optional
import boto3
from botocore.client import Config
from mypy_boto3_s3 import S3Client
import boto3 # type: ignore
from botocore.client import Config # type: ignore
from mypy_boto3_s3 import S3Client # type: ignore
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import BlobType

View File

@@ -1,15 +1,17 @@
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from typing import Any
from urllib.parse import quote
from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
from danswer.configs.app_configs import CONFLUENCE_TIMEZONE_OFFSET
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.confluence.onyx_confluence import build_confluence_client
from danswer.connectors.confluence.onyx_confluence import OnyxConfluence
from danswer.connectors.confluence.utils import attachment_to_content
from danswer.connectors.confluence.utils import build_confluence_client
from danswer.connectors.confluence.utils import build_confluence_document_id
from danswer.connectors.confluence.utils import datetime_from_string
from danswer.connectors.confluence.utils import extract_text_from_confluence_html
@@ -51,6 +53,8 @@ _RESTRICTIONS_EXPANSION_FIELDS = [
"restrictions.read.restrictions.group",
]
_SLIM_DOC_BATCH_SIZE = 5000
class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
def __init__(
@@ -67,10 +71,11 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
# skip it. This is generally used to avoid indexing extra sensitive
# pages.
labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
timezone_offset: float = CONFLUENCE_TIMEZONE_OFFSET,
) -> None:
self.batch_size = batch_size
self.continue_on_failure = continue_on_failure
self.confluence_client: OnyxConfluence | None = None
self._confluence_client: OnyxConfluence | None = None
self.is_cloud = is_cloud
# Remove trailing slash from wiki_base if present
@@ -81,15 +86,15 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
if cql_query:
# if a cql_query is provided, we will use it to fetch the pages
cql_page_query = cql_query
elif space:
# if no cql_query is provided, we will use the space to fetch the pages
cql_page_query += f" and space='{quote(space)}'"
elif page_id:
# if a cql_query is not provided, we will use the page_id to fetch the page
if index_recursively:
cql_page_query += f" and ancestor='{page_id}'"
else:
# if neither a space nor a cql_query is provided, we will use the page_id to fetch the page
cql_page_query += f" and id='{page_id}'"
elif space:
# if no cql_query or page_id is provided, we will use the space to fetch the pages
cql_page_query += f" and space='{quote(space)}'"
self.cql_page_query = cql_page_query
self.cql_time_filter = ""
@@ -97,39 +102,46 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
self.cql_label_filter = ""
if labels_to_skip:
labels_to_skip = list(set(labels_to_skip))
comma_separated_labels = ",".join(f"'{label}'" for label in labels_to_skip)
comma_separated_labels = ",".join(
f"'{quote(label)}'" for label in labels_to_skip
)
self.cql_label_filter = f" and label not in ({comma_separated_labels})"
self.timezone: timezone = timezone(offset=timedelta(hours=timezone_offset))
@property
def confluence_client(self) -> OnyxConfluence:
if self._confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
return self._confluence_client
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
# see https://github.com/atlassian-api/atlassian-python-api/blob/master/atlassian/rest_client.py
# for a list of other hidden constructor args
self.confluence_client = build_confluence_client(
credentials_json=credentials,
self._confluence_client = build_confluence_client(
credentials=credentials,
is_cloud=self.is_cloud,
wiki_base=self.wiki_base,
)
return None
def _get_comment_string_for_page_id(self, page_id: str) -> str:
if self.confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
comment_string = ""
comment_cql = f"type=comment and container='{page_id}'"
comment_cql += self.cql_label_filter
expand = ",".join(_COMMENT_EXPANSION_FIELDS)
for comments in self.confluence_client.paginated_cql_page_retrieval(
for comment in self.confluence_client.paginated_cql_retrieval(
cql=comment_cql,
expand=expand,
):
for comment in comments:
comment_string += "\nComment:\n"
comment_string += extract_text_from_confluence_html(
confluence_client=self.confluence_client,
confluence_object=comment,
)
comment_string += "\nComment:\n"
comment_string += extract_text_from_confluence_html(
confluence_client=self.confluence_client,
confluence_object=comment,
fetched_titles=set(),
)
return comment_string
@@ -141,28 +153,28 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
If it's a page, it extracts the text and adds the comments to the document text.
If it's an attachment, it just downloads the attachment and converts it into a document.
"""
if self.confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
# The url and the id are the same
object_url = build_confluence_document_id(
self.wiki_base, confluence_object["_links"]["webui"]
self.wiki_base, confluence_object["_links"]["webui"], self.is_cloud
)
object_text = None
# Extract text from page
if confluence_object["type"] == "page":
object_text = extract_text_from_confluence_html(
self.confluence_client, confluence_object
confluence_client=self.confluence_client,
confluence_object=confluence_object,
fetched_titles={confluence_object.get("title", "")},
)
# Add comments to text
object_text += self._get_comment_string_for_page_id(confluence_object["id"])
elif confluence_object["type"] == "attachment":
object_text = attachment_to_content(
self.confluence_client, confluence_object
confluence_client=self.confluence_client, attachment=confluence_object
)
if object_text is None:
# This only happens for attachments that are not parseable
return None
# Get space name
@@ -193,44 +205,41 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
)
def _fetch_document_batches(self) -> GenerateDocumentsOutput:
if self.confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
doc_batch: list[Document] = []
confluence_page_ids: list[str] = []
page_query = self.cql_page_query + self.cql_label_filter + self.cql_time_filter
logger.debug(f"page_query: {page_query}")
# Fetch pages as Documents
for page_batch in self.confluence_client.paginated_cql_page_retrieval(
for page in self.confluence_client.paginated_cql_retrieval(
cql=page_query,
expand=",".join(_PAGE_EXPANSION_FIELDS),
limit=self.batch_size,
):
for page in page_batch:
confluence_page_ids.append(page["id"])
doc = self._convert_object_to_document(page)
if doc is not None:
doc_batch.append(doc)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
logger.debug(f"_fetch_document_batches: {page['id']}")
confluence_page_ids.append(page["id"])
doc = self._convert_object_to_document(page)
if doc is not None:
doc_batch.append(doc)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
# Fetch attachments as Documents
for confluence_page_id in confluence_page_ids:
attachment_cql = f"type=attachment and container='{confluence_page_id}'"
attachment_cql += self.cql_label_filter
# TODO: maybe should add time filter as well?
for attachments in self.confluence_client.paginated_cql_page_retrieval(
for attachment in self.confluence_client.paginated_cql_retrieval(
cql=attachment_cql,
expand=",".join(_ATTACHMENT_EXPANSION_FIELDS),
):
for attachment in attachments:
doc = self._convert_object_to_document(attachment)
if doc is not None:
doc_batch.append(doc)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
doc = self._convert_object_to_document(attachment)
if doc is not None:
doc_batch.append(doc)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
if doc_batch:
yield doc_batch
@@ -240,10 +249,10 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
def poll_source(self, start: float, end: float) -> GenerateDocumentsOutput:
# Add time filters
formatted_start_time = datetime.fromtimestamp(start, tz=timezone.utc).strftime(
formatted_start_time = datetime.fromtimestamp(start, tz=self.timezone).strftime(
"%Y-%m-%d %H:%M"
)
formatted_end_time = datetime.fromtimestamp(end, tz=timezone.utc).strftime(
formatted_end_time = datetime.fromtimestamp(end, tz=self.timezone).strftime(
"%Y-%m-%d %H:%M"
)
self.cql_time_filter = f" and lastmodified >= '{formatted_start_time}'"
@@ -255,48 +264,52 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> GenerateSlimDocumentOutput:
if self.confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
doc_metadata_list: list[SlimDocument] = []
restrictions_expand = ",".join(_RESTRICTIONS_EXPANSION_FIELDS)
page_query = self.cql_page_query + self.cql_label_filter
for pages in self.confluence_client.cql_paginate_all_expansions(
for page in self.confluence_client.cql_paginate_all_expansions(
cql=page_query,
expand=restrictions_expand,
limit=_SLIM_DOC_BATCH_SIZE,
):
for page in pages:
# If the page has restrictions, add them to the perm_sync_data
# These will be used by doc_sync.py to sync permissions
perm_sync_data = {
"restrictions": page.get("restrictions", {}),
"space_key": page.get("space", {}).get("key"),
}
# If the page has restrictions, add them to the perm_sync_data
# These will be used by doc_sync.py to sync permissions
perm_sync_data = {
"restrictions": page.get("restrictions", {}),
"space_key": page.get("space", {}).get("key"),
}
doc_metadata_list.append(
SlimDocument(
id=build_confluence_document_id(
self.wiki_base,
page["_links"]["webui"],
self.is_cloud,
),
perm_sync_data=perm_sync_data,
)
)
attachment_cql = f"type=attachment and container='{page['id']}'"
attachment_cql += self.cql_label_filter
for attachment in self.confluence_client.cql_paginate_all_expansions(
cql=attachment_cql,
expand=restrictions_expand,
limit=_SLIM_DOC_BATCH_SIZE,
):
doc_metadata_list.append(
SlimDocument(
id=build_confluence_document_id(
self.wiki_base, page["_links"]["webui"]
self.wiki_base,
attachment["_links"]["webui"],
self.is_cloud,
),
perm_sync_data=perm_sync_data,
)
)
attachment_cql = f"type=attachment and container='{page['id']}'"
attachment_cql += self.cql_label_filter
for attachments in self.confluence_client.cql_paginate_all_expansions(
cql=attachment_cql,
expand=restrictions_expand,
):
for attachment in attachments:
doc_metadata_list.append(
SlimDocument(
id=build_confluence_document_id(
self.wiki_base, attachment["_links"]["webui"]
),
perm_sync_data=perm_sync_data,
)
)
yield doc_metadata_list
doc_metadata_list = []
if len(doc_metadata_list) > _SLIM_DOC_BATCH_SIZE:
yield doc_metadata_list[:_SLIM_DOC_BATCH_SIZE]
doc_metadata_list = doc_metadata_list[_SLIM_DOC_BATCH_SIZE:]
yield doc_metadata_list

View File

@@ -20,6 +20,10 @@ F = TypeVar("F", bound=Callable[..., Any])
RATE_LIMIT_MESSAGE_LOWERCASE = "Rate limit exceeded".lower()
# https://jira.atlassian.com/browse/CONFCLOUD-76433
_PROBLEMATIC_EXPANSIONS = "body.storage.value"
_REPLACEMENT_EXPANSIONS = "body.view.value"
class ConfluenceRateLimitError(Exception):
pass
@@ -80,7 +84,7 @@ def handle_confluence_rate_limit(confluence_call: F) -> F:
def wrapped_call(*args: list[Any], **kwargs: Any) -> Any:
MAX_RETRIES = 5
TIMEOUT = 3600
TIMEOUT = 600
timeout_at = time.monotonic() + TIMEOUT
for attempt in range(MAX_RETRIES):
@@ -95,6 +99,10 @@ def handle_confluence_rate_limit(confluence_call: F) -> F:
return confluence_call(*args, **kwargs)
except HTTPError as e:
delay_until = _handle_http_error(e, attempt)
logger.warning(
f"HTTPError in confluence call. "
f"Retrying in {delay_until} seconds..."
)
while time.monotonic() < delay_until:
# in the future, check a signal here to exit
time.sleep(1)
@@ -112,7 +120,7 @@ def handle_confluence_rate_limit(confluence_call: F) -> F:
return cast(F, wrapped_call)
_DEFAULT_PAGINATION_LIMIT = 100
_DEFAULT_PAGINATION_LIMIT = 1000
class OnyxConfluence(Confluence):
@@ -126,6 +134,32 @@ class OnyxConfluence(Confluence):
super(OnyxConfluence, self).__init__(url, *args, **kwargs)
self._wrap_methods()
def get_current_user(self, expand: str | None = None) -> Any:
"""
Implements a method that isn't in the third party client.
Get information about the current user
:param expand: OPTIONAL expand for get status of user.
Possible param is "status". Results are "Active, Deactivated"
:return: Returns the user details
"""
from atlassian.errors import ApiPermissionError # type:ignore
url = "rest/api/user/current"
params = {}
if expand:
params["expand"] = expand
try:
response = self.get(url, params=params)
except HTTPError as e:
if e.response.status_code == 403:
raise ApiPermissionError(
"The calling user does not have permission", reason=e
)
raise
return response
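A usage sketch for the added helper (the validation idea and field access are illustrative, based only on the docstring above):

# Illustrative: confirm the configured credentials resolve to an active user.
current_user = confluence_client.get_current_user(expand="status")
if current_user.get("status") == "Deactivated":
    raise RuntimeError("Confluence credentials belong to a deactivated user")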
def _wrap_methods(self) -> None:
"""
For each attribute that is callable (i.e., a method) and doesn't start with an underscore,
@@ -141,7 +175,7 @@ class OnyxConfluence(Confluence):
def _paginate_url(
self, url_suffix: str, limit: int | None = None
) -> Iterator[list[dict[str, Any]]]:
) -> Iterator[dict[str, Any]]:
"""
This will paginate through the top level query.
"""
@@ -153,46 +187,43 @@ class OnyxConfluence(Confluence):
while url_suffix:
try:
logger.debug(f"Making confluence call to {url_suffix}")
next_response = self.get(url_suffix)
except Exception as e:
logger.exception("Error in danswer_cql: \n")
raise e
yield next_response.get("results", [])
logger.warning(f"Error in confluence call to {url_suffix}")
# If the problematic expansion is in the url, replace it
# with the replacement expansion and try again
# If that fails, raise the error
if _PROBLEMATIC_EXPANSIONS not in url_suffix:
logger.exception(f"Error in confluence call to {url_suffix}")
raise e
logger.warning(
f"Replacing {_PROBLEMATIC_EXPANSIONS} with {_REPLACEMENT_EXPANSIONS}"
" and trying again."
)
url_suffix = url_suffix.replace(
_PROBLEMATIC_EXPANSIONS,
_REPLACEMENT_EXPANSIONS,
)
continue
# yield the results individually
yield from next_response.get("results", [])
url_suffix = next_response.get("_links", {}).get("next")
def paginated_groups_retrieval(
self,
limit: int | None = None,
) -> Iterator[list[dict[str, Any]]]:
return self._paginate_url("rest/api/group", limit)
def paginated_group_members_retrieval(
self,
group_name: str,
limit: int | None = None,
) -> Iterator[list[dict[str, Any]]]:
group_name = quote(group_name)
return self._paginate_url(f"rest/api/group/{group_name}/member", limit)
def paginated_cql_user_retrieval(
def paginated_cql_retrieval(
self,
cql: str,
expand: str | None = None,
limit: int | None = None,
) -> Iterator[list[dict[str, Any]]]:
) -> Iterator[dict[str, Any]]:
"""
The content/search endpoint can be used to fetch pages, attachments, and comments.
"""
expand_string = f"&expand={expand}" if expand else ""
return self._paginate_url(
f"rest/api/search/user?cql={cql}{expand_string}", limit
)
def paginated_cql_page_retrieval(
self,
cql: str,
expand: str | None = None,
limit: int | None = None,
) -> Iterator[list[dict[str, Any]]]:
expand_string = f"&expand={expand}" if expand else ""
return self._paginate_url(
yield from self._paginate_url(
f"rest/api/content/search?cql={cql}{expand_string}", limit
)
@@ -201,7 +232,7 @@ class OnyxConfluence(Confluence):
cql: str,
expand: str | None = None,
limit: int | None = None,
) -> Iterator[list[dict[str, Any]]]:
) -> Iterator[dict[str, Any]]:
"""
This function will paginate through the top level query first, then
paginate through all of the expansions.
@@ -221,6 +252,120 @@ class OnyxConfluence(Confluence):
for item in data:
_traverse_and_update(item)
for results in self.paginated_cql_page_retrieval(cql, expand, limit):
_traverse_and_update(results)
yield results
for confluence_object in self.paginated_cql_retrieval(cql, expand, limit):
_traverse_and_update(confluence_object)
yield confluence_object
def paginated_cql_user_retrieval(
self,
expand: str | None = None,
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
"""
The search/user endpoint can be used to fetch users.
It's a separate endpoint from the content/search endpoint used only for users.
Otherwise it's very similar to the content/search endpoint.
"""
cql = "type=user"
url = "rest/api/search/user" if self.cloud else "rest/api/search"
expand_string = f"&expand={expand}" if expand else ""
url += f"?cql={cql}{expand_string}"
yield from self._paginate_url(url, limit)
def paginated_groups_by_user_retrieval(
self,
user: dict[str, Any],
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
"""
This is not an SQL like query.
It's a confluence specific endpoint that can be used to fetch groups.
"""
user_field = "accountId" if self.cloud else "key"
user_value = user["accountId"] if self.cloud else user["userKey"]
# Server uses userKey (but calls it key during the API call), Cloud uses accountId
user_query = f"{user_field}={quote(user_value)}"
url = f"rest/api/user/memberof?{user_query}"
yield from self._paginate_url(url, limit)
def paginated_groups_retrieval(
self,
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
"""
This is not an SQL like query.
It's a confluence specific endpoint that can be used to fetch groups.
"""
yield from self._paginate_url("rest/api/group", limit)
def paginated_group_members_retrieval(
self,
group_name: str,
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
"""
This is not an SQL like query.
It's a confluence specific endpoint that can be used to fetch the members of a group.
THIS DOESN'T WORK FOR SERVER because it breaks when there is a slash in the group name.
E.g. neither "test/group" nor "test%2Fgroup" works for confluence.
"""
group_name = quote(group_name)
yield from self._paginate_url(f"rest/api/group/{group_name}/member", limit)
def _validate_connector_configuration(
credentials: dict[str, Any],
is_cloud: bool,
wiki_base: str,
) -> None:
# test connection with direct client, no retries
confluence_client_with_minimal_retries = Confluence(
api_version="cloud" if is_cloud else "latest",
url=wiki_base.rstrip("/"),
username=credentials["confluence_username"] if is_cloud else None,
password=credentials["confluence_access_token"] if is_cloud else None,
token=credentials["confluence_access_token"] if not is_cloud else None,
backoff_and_retry=True,
max_backoff_retries=6,
max_backoff_seconds=10,
)
spaces = confluence_client_with_minimal_retries.get_all_spaces(limit=1)
# uncomment the following for testing
# the following is an attempt to retrieve the user's timezone
# Unfortunately, all data is returned in UTC regardless of the user's time zone
# even though CQL parses incoming times based on the user's time zone
# space_key = spaces["results"][0]["key"]
# space_details = confluence_client_with_minimal_retries.cql(f"space.key={space_key}+AND+type=space")
if not spaces:
raise RuntimeError(
f"No spaces found at {wiki_base}! "
"Check your credentials and wiki_base and make sure "
"is_cloud is set correctly."
)
def build_confluence_client(
credentials: dict[str, Any],
is_cloud: bool,
wiki_base: str,
) -> OnyxConfluence:
_validate_connector_configuration(
credentials=credentials,
is_cloud=is_cloud,
wiki_base=wiki_base,
)
return OnyxConfluence(
api_version="cloud" if is_cloud else "latest",
# Remove trailing slash from wiki_base if present
url=wiki_base.rstrip("/"),
# passing in username causes issues for Confluence data center
username=credentials["confluence_username"] if is_cloud else None,
password=credentials["confluence_access_token"] if is_cloud else None,
token=credentials["confluence_access_token"] if not is_cloud else None,
backoff_and_retry=True,
max_backoff_retries=10,
max_backoff_seconds=60,
)

View File

@@ -2,6 +2,7 @@ import io
from datetime import datetime
from datetime import timezone
from typing import Any
from urllib.parse import quote
import bs4
@@ -31,7 +32,11 @@ def get_user_email_from_username__server(
response = confluence_client.get_mobile_parameters(user_name)
email = response.get("email")
except Exception:
email = None
# For now, we'll just return a string that indicates failure
# We may want to revert to returning None in the future
# email = None
email = f"FAILED TO GET CONFLUENCE EMAIL FOR {user_name}"
logger.warning(f"failed to get confluence email for {user_name}")
_USER_EMAIL_CACHE[user_name] = email
return _USER_EMAIL_CACHE[user_name]
@@ -71,7 +76,9 @@ def _get_user(confluence_client: OnyxConfluence, user_id: str) -> str:
def extract_text_from_confluence_html(
confluence_client: OnyxConfluence, confluence_object: dict[str, Any]
confluence_client: OnyxConfluence,
confluence_object: dict[str, Any],
fetched_titles: set[str],
) -> str:
"""Parse a Confluence html page and replace the 'user Id' by the real
User Display Name
@@ -79,7 +86,7 @@ def extract_text_from_confluence_html(
Args:
confluence_object (dict): The confluence object as a dict
confluence_client (Confluence): Confluence client
fetched_titles (set[str]): The titles of the pages that have already been fetched
Returns:
str: loaded and formatted Confluence page
"""
@@ -100,6 +107,73 @@ def extract_text_from_confluence_html(
continue
# Include @ sign for tagging, more clear for LLM
user.replaceWith("@" + _get_user(confluence_client, user_id))
for html_page_reference in soup.findAll("ac:structured-macro"):
# Here, we only want to process page within page macros
if html_page_reference.attrs.get("ac:name") != "include":
continue
page_data = html_page_reference.find("ri:page")
if not page_data:
logger.warning(
f"Skipping retrieval of {html_page_reference} because because page data is missing"
)
continue
page_title = page_data.attrs.get("ri:content-title")
if not page_title:
# only fetch pages that have a title
logger.warning(
f"Skipping retrieval of {html_page_reference} because it has no title"
)
continue
if page_title in fetched_titles:
# prevent recursive fetching of pages
logger.debug(f"Skipping {page_title} because it has already been fetched")
continue
fetched_titles.add(page_title)
# Wrap this in a try-except because there are some pages that might not exist
try:
page_query = f"type=page and title='{quote(page_title)}'"
page_contents: dict[str, Any] | None = None
# Confluence enforces title uniqueness, so we should only get one result here
for page in confluence_client.paginated_cql_retrieval(
cql=page_query,
expand="body.storage.value",
limit=1,
):
page_contents = page
break
except Exception as e:
logger.warning(
f"Error getting page contents for object {confluence_object}: {e}"
)
continue
if not page_contents:
continue
text_from_page = extract_text_from_confluence_html(
confluence_client=confluence_client,
confluence_object=page_contents,
fetched_titles=fetched_titles,
)
html_page_reference.replaceWith(text_from_page)
for html_link_body in soup.findAll("ac:link-body"):
# This extracts the text from inline links in the page so they can be
# represented in the document text as plain text
try:
text_from_link = html_link_body.text
html_link_body.replaceWith(f"(LINK TEXT: {text_from_link})")
except Exception as e:
logger.warning(f"Error processing ac:link-body: {e}")
return format_document_soup(soup)
@@ -153,7 +227,9 @@ def attachment_to_content(
return extracted_text
def build_confluence_document_id(base_url: str, content_url: str) -> str:
def build_confluence_document_id(
base_url: str, content_url: str, is_cloud: bool
) -> str:
"""For confluence, the document id is the page url for a page based document
or the attachment download url for an attachment based document
@@ -164,6 +240,8 @@ def build_confluence_document_id(base_url: str, content_url: str) -> str:
Returns:
str: The document id
"""
if is_cloud and not base_url.endswith("/wiki"):
base_url += "/wiki"
return f"{base_url}{content_url}"
@@ -195,20 +273,3 @@ def datetime_from_string(datetime_string: str) -> datetime:
datetime_object = datetime_object.astimezone(timezone.utc)
return datetime_object
def build_confluence_client(
credentials_json: dict[str, Any], is_cloud: bool, wiki_base: str
) -> OnyxConfluence:
return OnyxConfluence(
api_version="cloud" if is_cloud else "latest",
# Remove trailing slash from wiki_base if present
url=wiki_base.rstrip("/"),
# passing in username causes issues for Confluence data center
username=credentials_json["confluence_username"] if is_cloud else None,
password=credentials_json["confluence_access_token"] if is_cloud else None,
token=credentials_json["confluence_access_token"] if not is_cloud else None,
backoff_and_retry=True,
max_backoff_retries=60,
max_backoff_seconds=60,
)

Some files were not shown because too many files have changed in this diff.