mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-03-12 03:02:43 +00:00
Compare commits
43 Commits
test-tests
...
v2.9.8
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f1c30974f5 | ||
|
|
81bf07fb15 | ||
|
|
b565bf8291 | ||
|
|
b4da99cbdd | ||
|
|
f910feea0f | ||
|
|
e3af8c6c8a | ||
|
|
d6e46ed792 | ||
|
|
4ce1f4ecdd | ||
|
|
a4678884d7 | ||
|
|
c861ba68f1 | ||
|
|
b1d0e0bb0b | ||
|
|
0d78bf52e3 | ||
|
|
bd743282e6 | ||
|
|
d44d1d92b3 | ||
|
|
4cedcfee59 | ||
|
|
90a721a76e | ||
|
|
3ccd99e931 | ||
|
|
9076bf603f | ||
|
|
8c6e0a70c3 | ||
|
|
bebe9555d4 | ||
|
|
c530722c9f | ||
|
|
68380b4ddb | ||
|
|
b3380746ab | ||
|
|
56be114c87 | ||
|
|
54f467da5c | ||
|
|
8726b112fe | ||
|
|
92181d07b2 | ||
|
|
3a73f7fab2 | ||
|
|
7dabaca7cd | ||
|
|
dec4748825 | ||
|
|
072836cd86 | ||
|
|
2705b5fb0e | ||
|
|
37dcde4226 | ||
|
|
a765b5f622 | ||
|
|
5e093368d1 | ||
|
|
f945ab6b05 | ||
|
|
11b7a22404 | ||
|
|
8e34f944cc | ||
|
|
32606dc752 | ||
|
|
1f6c4b40bf | ||
|
|
1943f1c745 | ||
|
|
82460729a6 | ||
|
|
c445e6a8c0 |
24
.github/workflows/deployment.yml
vendored
24
.github/workflows/deployment.yml
vendored
@@ -404,7 +404,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
|
||||
@@ -477,7 +477,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
|
||||
@@ -537,7 +537,7 @@ jobs:
|
||||
parse-json-secrets: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
|
||||
@@ -615,7 +615,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
|
||||
@@ -696,7 +696,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
|
||||
@@ -764,7 +764,7 @@ jobs:
|
||||
parse-json-secrets: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
|
||||
@@ -839,7 +839,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
|
||||
@@ -911,7 +911,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
|
||||
@@ -970,7 +970,7 @@ jobs:
|
||||
parse-json-secrets: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
|
||||
@@ -1049,7 +1049,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
with:
|
||||
buildkitd-flags: ${{ vars.DOCKER_DEBUG == 'true' && '--debug' || '' }}
|
||||
|
||||
@@ -1128,7 +1128,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
with:
|
||||
buildkitd-flags: ${{ vars.DOCKER_DEBUG == 'true' && '--debug' || '' }}
|
||||
|
||||
@@ -1193,7 +1193,7 @@ jobs:
|
||||
parse-json-secrets: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
|
||||
|
||||
2
.github/workflows/docker-tag-beta.yml
vendored
2
.github/workflows/docker-tag-beta.yml
vendored
@@ -21,7 +21,7 @@ jobs:
|
||||
timeout-minutes: 45
|
||||
steps:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
|
||||
|
||||
2
.github/workflows/docker-tag-latest.yml
vendored
2
.github/workflows/docker-tag-latest.yml
vendored
@@ -21,7 +21,7 @@ jobs:
|
||||
timeout-minutes: 45
|
||||
steps:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
|
||||
|
||||
1
.github/workflows/helm-chart-releases.yml
vendored
1
.github/workflows/helm-chart-releases.yml
vendored
@@ -29,7 +29,6 @@ jobs:
|
||||
run: |
|
||||
helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
|
||||
helm repo add onyx-vespa https://onyx-dot-app.github.io/vespa-helm-charts
|
||||
helm repo add opensearch https://opensearch-project.github.io/helm-charts
|
||||
helm repo add cloudnative-pg https://cloudnative-pg.github.io/charts
|
||||
helm repo add ot-container-kit https://ot-container-kit.github.io/helm-charts
|
||||
helm repo add minio https://charts.min.io/
|
||||
|
||||
2
.github/workflows/nightly-scan-licenses.yml
vendored
2
.github/workflows/nightly-scan-licenses.yml
vendored
@@ -94,7 +94,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
|
||||
|
||||
@@ -45,9 +45,6 @@ env:
|
||||
# TODO: debug why this is failing and enable
|
||||
CODE_INTERPRETER_BASE_URL: http://localhost:8000
|
||||
|
||||
# OpenSearch
|
||||
OPENSEARCH_ADMIN_PASSWORD: "StrongPassword123!"
|
||||
|
||||
jobs:
|
||||
discover-test-dirs:
|
||||
# NOTE: Github-hosted runners have about 20s faster queue times and are preferred here.
|
||||
@@ -128,13 +125,11 @@ jobs:
|
||||
docker compose \
|
||||
-f docker-compose.yml \
|
||||
-f docker-compose.dev.yml \
|
||||
-f docker-compose.opensearch.yml \
|
||||
up -d \
|
||||
minio \
|
||||
relational_db \
|
||||
cache \
|
||||
index \
|
||||
opensearch \
|
||||
code-interpreter
|
||||
|
||||
- name: Run migrations
|
||||
@@ -163,7 +158,7 @@ jobs:
|
||||
cd deployment/docker_compose
|
||||
|
||||
# Get list of running containers
|
||||
containers=$(docker compose -f docker-compose.yml -f docker-compose.dev.yml -f docker-compose.opensearch.yml ps -q)
|
||||
containers=$(docker compose -f docker-compose.yml -f docker-compose.dev.yml ps -q)
|
||||
|
||||
# Collect logs from each container
|
||||
for container in $containers; do
|
||||
|
||||
8
.github/workflows/pr-helm-chart-testing.yml
vendored
8
.github/workflows/pr-helm-chart-testing.yml
vendored
@@ -88,7 +88,6 @@ jobs:
|
||||
echo "=== Adding Helm repositories ==="
|
||||
helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
|
||||
helm repo add vespa https://onyx-dot-app.github.io/vespa-helm-charts
|
||||
helm repo add opensearch https://opensearch-project.github.io/helm-charts
|
||||
helm repo add cloudnative-pg https://cloudnative-pg.github.io/charts
|
||||
helm repo add ot-container-kit https://ot-container-kit.github.io/helm-charts
|
||||
helm repo add minio https://charts.min.io/
|
||||
@@ -181,11 +180,6 @@ jobs:
|
||||
trap cleanup EXIT
|
||||
|
||||
# Run the actual installation with detailed logging
|
||||
# Note that opensearch.enabled is true whereas others in this install
|
||||
# are false. There is some work that needs to be done to get this
|
||||
# entire step working in CI, enabling opensearch here is a small step
|
||||
# in that direction. If this is causing issues, disabling it in this
|
||||
# step should be ok in the short term.
|
||||
echo "=== Starting ct install ==="
|
||||
set +e
|
||||
ct install --all \
|
||||
@@ -193,8 +187,6 @@ jobs:
|
||||
--set=nginx.enabled=false \
|
||||
--set=minio.enabled=false \
|
||||
--set=vespa.enabled=false \
|
||||
--set=opensearch.enabled=true \
|
||||
--set=auth.opensearch.enabled=true \
|
||||
--set=slackbot.enabled=false \
|
||||
--set=postgresql.enabled=true \
|
||||
--set=postgresql.nameOverride=cloudnative-pg \
|
||||
|
||||
6
.github/workflows/pr-integration-tests.yml
vendored
6
.github/workflows/pr-integration-tests.yml
vendored
@@ -103,7 +103,7 @@ jobs:
|
||||
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling Vespa, Redis, Postgres, and Minio images
|
||||
# otherwise, we hit the "Unauthenticated users" limit
|
||||
@@ -163,7 +163,7 @@ jobs:
|
||||
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling Vespa, Redis, Postgres, and Minio images
|
||||
# otherwise, we hit the "Unauthenticated users" limit
|
||||
@@ -208,7 +208,7 @@ jobs:
|
||||
persist-credentials: false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling openapitools/openapi-generator-cli
|
||||
# otherwise, we hit the "Unauthenticated users" limit
|
||||
|
||||
@@ -95,7 +95,7 @@ jobs:
|
||||
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling Vespa, Redis, Postgres, and Minio images
|
||||
# otherwise, we hit the "Unauthenticated users" limit
|
||||
@@ -155,7 +155,7 @@ jobs:
|
||||
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling Vespa, Redis, Postgres, and Minio images
|
||||
# otherwise, we hit the "Unauthenticated users" limit
|
||||
@@ -214,7 +214,7 @@ jobs:
|
||||
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling openapitools/openapi-generator-cli
|
||||
# otherwise, we hit the "Unauthenticated users" limit
|
||||
|
||||
6
.github/workflows/pr-playwright-tests.yml
vendored
6
.github/workflows/pr-playwright-tests.yml
vendored
@@ -85,7 +85,7 @@ jobs:
|
||||
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
|
||||
# https://docs.docker.com/docker-hub/usage/
|
||||
@@ -146,7 +146,7 @@ jobs:
|
||||
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
|
||||
# https://docs.docker.com/docker-hub/usage/
|
||||
@@ -207,7 +207,7 @@ jobs:
|
||||
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
|
||||
# https://docs.docker.com/docker-hub/usage/
|
||||
|
||||
3
.github/workflows/pr-python-checks.yml
vendored
3
.github/workflows/pr-python-checks.yml
vendored
@@ -50,8 +50,9 @@ jobs:
|
||||
uses: runs-on/cache@50350ad4242587b6c8c2baa2e740b1bc11285ff4 # ratchet:runs-on/cache@v4
|
||||
with:
|
||||
path: backend/.mypy_cache
|
||||
key: mypy-${{ runner.os }}-${{ hashFiles('**/*.py', '**/*.pyi', 'backend/pyproject.toml') }}
|
||||
key: mypy-${{ runner.os }}-${{ github.base_ref || github.event.merge_group.base_ref || 'main' }}-${{ hashFiles('**/*.py', '**/*.pyi', 'backend/pyproject.toml') }}
|
||||
restore-keys: |
|
||||
mypy-${{ runner.os }}-${{ github.base_ref || github.event.merge_group.base_ref || 'main' }}-
|
||||
mypy-${{ runner.os }}-
|
||||
|
||||
- name: Run MyPy
|
||||
|
||||
138
.github/workflows/pr-python-model-tests.yml
vendored
138
.github/workflows/pr-python-model-tests.yml
vendored
@@ -5,6 +5,11 @@ on:
|
||||
# This cron expression runs the job daily at 16:00 UTC (9am PT)
|
||||
- cron: "0 16 * * *"
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
branch:
|
||||
description: 'Branch to run the workflow on'
|
||||
required: false
|
||||
default: 'main'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
@@ -26,11 +31,7 @@ env:
|
||||
jobs:
|
||||
model-check:
|
||||
# See https://runs-on.com/runners/linux/
|
||||
runs-on:
|
||||
- runs-on
|
||||
- runner=4cpu-linux-arm64
|
||||
- "run-id=${{ github.run_id }}-model-check"
|
||||
- "extras=ecr-cache"
|
||||
runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}-model-check"]
|
||||
timeout-minutes: 45
|
||||
|
||||
env:
|
||||
@@ -42,83 +43,104 @@ jobs:
|
||||
with:
|
||||
persist-credentials: false
|
||||
|
||||
- name: Setup Python and Install Dependencies
|
||||
uses: ./.github/actions/setup-python-and-install-dependencies
|
||||
with:
|
||||
requirements: |
|
||||
backend/requirements/default.txt
|
||||
backend/requirements/dev.txt
|
||||
|
||||
- name: Format branch name for cache
|
||||
id: format-branch
|
||||
env:
|
||||
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||
REF_NAME: ${{ github.ref_name }}
|
||||
run: |
|
||||
if [ -n "${PR_NUMBER}" ]; then
|
||||
CACHE_SUFFIX="${PR_NUMBER}"
|
||||
else
|
||||
# shellcheck disable=SC2001
|
||||
CACHE_SUFFIX=$(echo "${REF_NAME}" | sed 's/[^A-Za-z0-9._-]/-/g')
|
||||
fi
|
||||
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef
|
||||
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f
|
||||
# tag every docker image with "test" so that we can spin up the correct set
|
||||
# of images during testing
|
||||
|
||||
- name: Build and load
|
||||
uses: docker/bake-action@5be5f02ff8819ecd3092ea6b2e6261c31774f2b4 # ratchet:docker/bake-action@v6
|
||||
env:
|
||||
TAG: model-server-${{ github.run_id }}
|
||||
# We don't need to build the Web Docker image since it's not yet used
|
||||
# in the integration tests. We have a separate action to verify that it builds
|
||||
# successfully.
|
||||
- name: Pull Model Server Docker image
|
||||
run: |
|
||||
docker pull onyxdotapp/onyx-model-server:latest
|
||||
docker tag onyxdotapp/onyx-model-server:latest onyxdotapp/onyx-model-server:test
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # ratchet:actions/setup-python@v6
|
||||
with:
|
||||
load: true
|
||||
targets: model-server
|
||||
set: |
|
||||
model-server.cache-from=type=registry,ref=${{ env.RUNS_ON_ECR_CACHE }}:model-server-cache-${{ github.event.pull_request.head.sha || github.sha }}
|
||||
model-server.cache-from=type=registry,ref=${{ env.RUNS_ON_ECR_CACHE }}:model-server-cache-${{ steps.format-branch.outputs.cache-suffix }}
|
||||
model-server.cache-from=type=registry,ref=${{ env.RUNS_ON_ECR_CACHE }}:model-server-cache
|
||||
model-server.cache-from=type=registry,ref=onyxdotapp/onyx-model-server:latest
|
||||
model-server.cache-to=type=registry,ref=${{ env.RUNS_ON_ECR_CACHE }}:model-server-cache-${{ github.event.pull_request.head.sha || github.sha }},mode=max
|
||||
model-server.cache-to=type=registry,ref=${{ env.RUNS_ON_ECR_CACHE }}:model-server-cache-${{ steps.format-branch.outputs.cache-suffix }},mode=max
|
||||
model-server.cache-to=type=registry,ref=${{ env.RUNS_ON_ECR_CACHE }}:model-server-cache,mode=max
|
||||
python-version: "3.11"
|
||||
cache: "pip"
|
||||
cache-dependency-path: |
|
||||
backend/requirements/default.txt
|
||||
backend/requirements/dev.txt
|
||||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
|
||||
|
||||
- name: Start Docker containers
|
||||
id: start_docker
|
||||
env:
|
||||
IMAGE_TAG: model-server-${{ github.run_id }}
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose \
|
||||
-f docker-compose.yml \
|
||||
-f docker-compose.dev.yml \
|
||||
up -d --wait \
|
||||
inference_model_server
|
||||
ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \
|
||||
AUTH_TYPE=basic \
|
||||
REQUIRE_EMAIL_VERIFICATION=false \
|
||||
DISABLE_TELEMETRY=true \
|
||||
IMAGE_TAG=test \
|
||||
docker compose -f docker-compose.model-server-test.yml up -d indexing_model_server
|
||||
id: start_docker
|
||||
|
||||
- name: Wait for service to be ready
|
||||
run: |
|
||||
echo "Starting wait-for-service script..."
|
||||
|
||||
start_time=$(date +%s)
|
||||
timeout=300 # 5 minutes in seconds
|
||||
|
||||
while true; do
|
||||
current_time=$(date +%s)
|
||||
elapsed_time=$((current_time - start_time))
|
||||
|
||||
if [ $elapsed_time -ge $timeout ]; then
|
||||
echo "Timeout reached. Service did not become ready in 5 minutes."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Use curl with error handling to ignore specific exit code 56
|
||||
response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:9000/api/health || echo "curl_error")
|
||||
|
||||
if [ "$response" = "200" ]; then
|
||||
echo "Service is ready!"
|
||||
break
|
||||
elif [ "$response" = "curl_error" ]; then
|
||||
echo "Curl encountered an error, possibly exit code 56. Continuing to retry..."
|
||||
else
|
||||
echo "Service not ready yet (HTTP status $response). Retrying in 5 seconds..."
|
||||
fi
|
||||
|
||||
sleep 5
|
||||
done
|
||||
echo "Finished waiting for service."
|
||||
|
||||
- name: Run Tests
|
||||
shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
|
||||
run: |
|
||||
py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/llm
|
||||
py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/embedding
|
||||
|
||||
- name: Alert on Failure
|
||||
if: failure() && github.event_name == 'schedule'
|
||||
uses: ./.github/actions/slack-notify
|
||||
with:
|
||||
webhook-url: ${{ secrets.SLACK_WEBHOOK }}
|
||||
failed-jobs: model-check
|
||||
title: "🚨 Scheduled Model Tests failed!"
|
||||
ref-name: ${{ github.ref_name }}
|
||||
env:
|
||||
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
|
||||
REPO: ${{ github.repository }}
|
||||
RUN_ID: ${{ github.run_id }}
|
||||
run: |
|
||||
curl -X POST \
|
||||
-H 'Content-type: application/json' \
|
||||
--data "{\"text\":\"Scheduled Model Tests failed! Check the run at: https://github.com/${REPO}/actions/runs/${RUN_ID}\"}" \
|
||||
$SLACK_WEBHOOK
|
||||
|
||||
- name: Dump all-container logs (optional)
|
||||
if: always()
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose logs --no-color > $GITHUB_WORKSPACE/docker-compose.log || true
|
||||
docker compose -f docker-compose.model-server-test.yml logs --no-color > $GITHUB_WORKSPACE/docker-compose.log || true
|
||||
|
||||
- name: Upload logs
|
||||
if: always()
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,8 +1,5 @@
|
||||
# editors
|
||||
.vscode
|
||||
!/.vscode/env_template.txt
|
||||
!/.vscode/launch.json
|
||||
!/.vscode/tasks.template.jsonc
|
||||
.zed
|
||||
.cursor
|
||||
|
||||
|
||||
@@ -74,13 +74,6 @@ repos:
|
||||
# pass_filenames: true
|
||||
# files: ^backend/.*\.py$
|
||||
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: 3e8a8703264a2f4a69428a0aa4dcb512790b2c8c # frozen: v6.0.0
|
||||
hooks:
|
||||
- id: check-added-large-files
|
||||
name: Check for added large files
|
||||
args: ["--maxkb=1500"]
|
||||
|
||||
- repo: https://github.com/rhysd/actionlint
|
||||
rev: a443f344ff32813837fa49f7aa6cbc478d770e62 # frozen: v1.7.9
|
||||
hooks:
|
||||
@@ -153,22 +146,6 @@ repos:
|
||||
pass_filenames: false
|
||||
files: \.tf$
|
||||
|
||||
- id: npm-install
|
||||
name: npm install
|
||||
description: "Automatically run 'npm install' after a checkout, pull or rebase"
|
||||
language: system
|
||||
entry: bash -c 'cd web && npm install --no-save'
|
||||
pass_filenames: false
|
||||
files: ^web/package(-lock)?\.json$
|
||||
stages: [post-checkout, post-merge, post-rewrite]
|
||||
- id: npm-install-check
|
||||
name: npm install --package-lock-only
|
||||
description: "Check the 'web/package-lock.json' is updated"
|
||||
language: system
|
||||
entry: bash -c 'cd web && npm install --package-lock-only'
|
||||
pass_filenames: false
|
||||
files: ^web/package(-lock)?\.json$
|
||||
|
||||
# Uses tsgo (TypeScript's native Go compiler) for ~10x faster type checking.
|
||||
# This is a preview package - if it breaks:
|
||||
# 1. Try updating: cd web && npm update @typescript/native-preview
|
||||
|
||||
6
.vscode/env_template.txt
vendored
6
.vscode/env_template.txt
vendored
@@ -17,6 +17,12 @@ LOG_ONYX_MODEL_INTERACTIONS=True
|
||||
LOG_LEVEL=debug
|
||||
|
||||
|
||||
# This passes top N results to LLM an additional time for reranking prior to
|
||||
# answer generation.
|
||||
# This step is quite heavy on token usage so we disable it for dev generally.
|
||||
DISABLE_LLM_DOC_RELEVANCE=False
|
||||
|
||||
|
||||
# Useful if you want to toggle auth on/off (google_oauth/OIDC specifically).
|
||||
OAUTH_CLIENT_ID=<REPLACE THIS>
|
||||
OAUTH_CLIENT_SECRET=<REPLACE THIS>
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
/* Copy this file into '.vscode/launch.json' or merge its contents into your existing configurations. */
|
||||
|
||||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
@@ -22,7 +24,7 @@
|
||||
"Slack Bot",
|
||||
"Celery primary",
|
||||
"Celery light",
|
||||
"Celery heavy",
|
||||
"Celery background",
|
||||
"Celery docfetching",
|
||||
"Celery docprocessing",
|
||||
"Celery beat"
|
||||
@@ -577,99 +579,6 @@
|
||||
"group": "3"
|
||||
}
|
||||
},
|
||||
{
|
||||
// Dummy entry used to label the group
|
||||
"name": "--- Database ---",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"presentation": {
|
||||
"group": "4",
|
||||
"order": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Clean restore seeded database dump (destructive)",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"runtimeExecutable": "uv",
|
||||
"runtimeArgs": [
|
||||
"run",
|
||||
"--with",
|
||||
"onyx-devtools",
|
||||
"ods",
|
||||
"db",
|
||||
"restore",
|
||||
"--fetch-seeded",
|
||||
"--clean",
|
||||
"--yes"
|
||||
],
|
||||
"cwd": "${workspaceFolder}",
|
||||
"console": "integratedTerminal",
|
||||
"presentation": {
|
||||
"group": "4"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Create database snapshot",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"runtimeExecutable": "uv",
|
||||
"runtimeArgs": [
|
||||
"run",
|
||||
"--with",
|
||||
"onyx-devtools",
|
||||
"ods",
|
||||
"db",
|
||||
"dump",
|
||||
"backup.dump"
|
||||
],
|
||||
"cwd": "${workspaceFolder}",
|
||||
"console": "integratedTerminal",
|
||||
"presentation": {
|
||||
"group": "4"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Clean restore database snapshot (destructive)",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"runtimeExecutable": "uv",
|
||||
"runtimeArgs": [
|
||||
"run",
|
||||
"--with",
|
||||
"onyx-devtools",
|
||||
"ods",
|
||||
"db",
|
||||
"restore",
|
||||
"--clean",
|
||||
"--yes",
|
||||
"backup.dump"
|
||||
],
|
||||
"cwd": "${workspaceFolder}",
|
||||
"console": "integratedTerminal",
|
||||
"presentation": {
|
||||
"group": "4"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Upgrade database to head revision",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"runtimeExecutable": "uv",
|
||||
"runtimeArgs": [
|
||||
"run",
|
||||
"--with",
|
||||
"onyx-devtools",
|
||||
"ods",
|
||||
"db",
|
||||
"upgrade"
|
||||
],
|
||||
"cwd": "${workspaceFolder}",
|
||||
"console": "integratedTerminal",
|
||||
"presentation": {
|
||||
"group": "4"
|
||||
}
|
||||
},
|
||||
{
|
||||
// script to generate the openapi schema
|
||||
"name": "Onyx OpenAPI Schema Generator",
|
||||
263
CONTRIBUTING.md
263
CONTRIBUTING.md
@@ -1,31 +1,262 @@
|
||||
<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/CONTRIBUTING.md"} -->
|
||||
|
||||
# Contributing to Onyx
|
||||
|
||||
Hey there! We are so excited that you're interested in Onyx.
|
||||
|
||||
As an open source project in a rapidly changing space, we welcome all contributions.
|
||||
|
||||
## Contribution Opportunities
|
||||
The [GitHub Issues](https://github.com/onyx-dot-app/onyx/issues) page is a great place to look for and share contribution ideas.
|
||||
## 💃 Guidelines
|
||||
|
||||
If you have your own feature that you would like to build please create an issue and community members can provide feedback and
|
||||
thumb it up if they feel a common need.
|
||||
### Contribution Opportunities
|
||||
|
||||
The [GitHub Issues](https://github.com/onyx-dot-app/onyx/issues) page is a great place to start for contribution ideas.
|
||||
|
||||
## Contributing Code
|
||||
Please reference the documents in contributing_guides folder to ensure that the code base is kept to a high standard.
|
||||
1. dev_setup.md (start here): gives you a guide to setting up a local development environment.
|
||||
2. contribution_process.md: how to ensure you are building valuable features that will get reviewed and merged.
|
||||
3. best_practices.md: before asking for reviews, ensure your changes meet the repo code quality standards.
|
||||
To ensure that your contribution is aligned with the project's direction, please reach out to any maintainer on the Onyx team
|
||||
via [Discord](https://discord.gg/4NA5SbzrWb) or [email](mailto:hello@onyx.app).
|
||||
|
||||
To contribute, please follow the
|
||||
Issues that have been explicitly approved by the maintainers (aligned with the direction of the project)
|
||||
will be marked with the `approved by maintainers` label.
|
||||
Issues marked `good first issue` are an especially great place to start.
|
||||
|
||||
**Connectors** to other tools are another great place to contribute. For details on how, refer to this
|
||||
[README.md](https://github.com/onyx-dot-app/onyx/blob/main/backend/onyx/connectors/README.md).
|
||||
|
||||
If you have a new/different contribution in mind, we'd love to hear about it!
|
||||
Your input is vital to making sure that Onyx moves in the right direction.
|
||||
Before starting on implementation, please raise a GitHub issue.
|
||||
|
||||
Also, always feel free to message the founders (Chris Weaver / Yuhong Sun) on
|
||||
[Discord](https://discord.gg/4NA5SbzrWb) directly about anything at all.
|
||||
|
||||
### Contributing Code
|
||||
|
||||
To contribute to this project, please follow the
|
||||
["fork and pull request"](https://docs.github.com/en/get-started/quickstart/contributing-to-projects) workflow.
|
||||
When opening a pull request, mention related issues and feel free to tag relevant maintainers.
|
||||
|
||||
Before creating a pull request please make sure that the new changes conform to the formatting and linting requirements.
|
||||
See the [Formatting and Linting](#formatting-and-linting) section for how to run these checks locally.
|
||||
|
||||
### Getting Help 🙋
|
||||
|
||||
Our goal is to make contributing as easy as possible. If you run into any issues please don't hesitate to reach out.
|
||||
That way we can help future contributors and users can avoid the same issue.
|
||||
|
||||
We also have support channels and generally interesting discussions on our
|
||||
[Discord](https://discord.gg/4NA5SbzrWb).
|
||||
|
||||
We would love to see you there!
|
||||
|
||||
## Get Started 🚀
|
||||
|
||||
Onyx being a fully functional app, relies on some external software, specifically:
|
||||
|
||||
- [Postgres](https://www.postgresql.org/) (Relational DB)
|
||||
- [Vespa](https://vespa.ai/) (Vector DB/Search Engine)
|
||||
- [Redis](https://redis.io/) (Cache)
|
||||
- [MinIO](https://min.io/) (File Store)
|
||||
- [Nginx](https://nginx.org/) (Not needed for development flows generally)
|
||||
|
||||
> **Note:**
|
||||
> This guide provides instructions to build and run Onyx locally from source with Docker containers providing the above external software. We believe this combination is easier for
|
||||
> development purposes. If you prefer to use pre-built container images, we provide instructions on running the full Onyx stack within Docker below.
|
||||
|
||||
### Local Set Up
|
||||
|
||||
Be sure to use Python version 3.11. For instructions on installing Python 3.11 on macOS, refer to the [CONTRIBUTING_MACOS.md](./CONTRIBUTING_MACOS.md) readme.
|
||||
|
||||
If using a lower version, modifications will have to be made to the code.
|
||||
If using a higher version, sometimes some libraries will not be available (i.e. we had problems with Tensorflow in the past with higher versions of python).
|
||||
|
||||
#### Backend: Python requirements
|
||||
|
||||
Currently, we use [uv](https://docs.astral.sh/uv/) and recommend creating a [virtual environment](https://docs.astral.sh/uv/pip/environments/#using-a-virtual-environment).
|
||||
|
||||
For convenience here's a command for it:
|
||||
|
||||
```bash
|
||||
uv venv .venv --python 3.11
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
_For Windows, activate the virtual environment using Command Prompt:_
|
||||
|
||||
```bash
|
||||
.venv\Scripts\activate
|
||||
```
|
||||
|
||||
If using PowerShell, the command slightly differs:
|
||||
|
||||
```powershell
|
||||
.venv\Scripts\Activate.ps1
|
||||
```
|
||||
|
||||
Install the required python dependencies:
|
||||
|
||||
```bash
|
||||
uv sync --all-extras
|
||||
```
|
||||
|
||||
Install Playwright for Python (headless browser required by the Web Connector):
|
||||
|
||||
```bash
|
||||
uv run playwright install
|
||||
```
|
||||
|
||||
#### Frontend: Node dependencies
|
||||
|
||||
Onyx uses Node v22.20.0. We highly recommend you use [Node Version Manager (nvm)](https://github.com/nvm-sh/nvm)
|
||||
to manage your Node installations. Once installed, you can run
|
||||
|
||||
```bash
|
||||
nvm install 22 && nvm use 22
|
||||
node -v # verify your active version
|
||||
```
|
||||
|
||||
Navigate to `onyx/web` and run:
|
||||
|
||||
```bash
|
||||
npm i
|
||||
```
|
||||
|
||||
## Formatting and Linting
|
||||
|
||||
### Backend
|
||||
|
||||
For the backend, you'll need to setup pre-commit hooks (black / reorder-python-imports).
|
||||
|
||||
Then run:
|
||||
|
||||
```bash
|
||||
uv run pre-commit install
|
||||
```
|
||||
|
||||
Additionally, we use `mypy` for static type checking.
|
||||
Onyx is fully type-annotated, and we want to keep it that way!
|
||||
To run the mypy checks manually, run `uv run mypy .` from the `onyx/backend` directory.
|
||||
|
||||
### Web
|
||||
|
||||
We use `prettier` for formatting. The desired version will be installed via a `npm i` from the `onyx/web` directory.
|
||||
To run the formatter, use `npx prettier --write .` from the `onyx/web` directory.
|
||||
|
||||
Pre-commit will also run prettier automatically on files you've recently touched. If re-formatted, your commit will fail.
|
||||
Re-stage your changes and commit again.
|
||||
|
||||
# Running the application for development
|
||||
|
||||
## Developing using VSCode Debugger (recommended)
|
||||
|
||||
**We highly recommend using VSCode debugger for development.**
|
||||
See [CONTRIBUTING_VSCODE.md](./CONTRIBUTING_VSCODE.md) for more details.
|
||||
|
||||
Otherwise, you can follow the instructions below to run the application for development.
|
||||
|
||||
## Manually running the application for development
|
||||
### Docker containers for external software
|
||||
|
||||
You will need Docker installed to run these containers.
|
||||
|
||||
First navigate to `onyx/deployment/docker_compose`, then start up Postgres/Vespa/Redis/MinIO with:
|
||||
|
||||
```bash
|
||||
docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d index relational_db cache minio
|
||||
```
|
||||
|
||||
(index refers to Vespa, relational_db refers to Postgres, and cache refers to Redis)
|
||||
|
||||
### Running Onyx locally
|
||||
|
||||
To start the frontend, navigate to `onyx/web` and run:
|
||||
|
||||
```bash
|
||||
npm run dev
|
||||
```
|
||||
|
||||
Next, start the model server which runs the local NLP models.
|
||||
Navigate to `onyx/backend` and run:
|
||||
|
||||
```bash
|
||||
uvicorn model_server.main:app --reload --port 9000
|
||||
```
|
||||
|
||||
_For Windows (for compatibility with both PowerShell and Command Prompt):_
|
||||
|
||||
```bash
|
||||
powershell -Command "uvicorn model_server.main:app --reload --port 9000"
|
||||
```
|
||||
|
||||
The first time running Onyx, you will need to run the DB migrations for Postgres.
|
||||
After the first time, this is no longer required unless the DB models change.
|
||||
|
||||
Navigate to `onyx/backend` and with the venv active, run:
|
||||
|
||||
```bash
|
||||
alembic upgrade head
|
||||
```
|
||||
|
||||
Next, start the task queue which orchestrates the background jobs.
|
||||
Jobs that take more time are run async from the API server.
|
||||
|
||||
Still in `onyx/backend`, run:
|
||||
|
||||
```bash
|
||||
python ./scripts/dev_run_background_jobs.py
|
||||
```
|
||||
|
||||
To run the backend API server, navigate back to `onyx/backend` and run:
|
||||
|
||||
```bash
|
||||
AUTH_TYPE=disabled uvicorn onyx.main:app --reload --port 8080
|
||||
```
|
||||
|
||||
_For Windows (for compatibility with both PowerShell and Command Prompt):_
|
||||
|
||||
```bash
|
||||
powershell -Command "
|
||||
$env:AUTH_TYPE='disabled'
|
||||
uvicorn onyx.main:app --reload --port 8080
|
||||
"
|
||||
```
|
||||
|
||||
> **Note:**
|
||||
> If you need finer logging, add the additional environment variable `LOG_LEVEL=DEBUG` to the relevant services.
|
||||
|
||||
#### Wrapping up
|
||||
|
||||
You should now have 4 servers running:
|
||||
|
||||
- Web server
|
||||
- Backend API
|
||||
- Model server
|
||||
- Background jobs
|
||||
|
||||
Now, visit `http://localhost:3000` in your browser. You should see the Onyx onboarding wizard where you can connect your external LLM provider to Onyx.
|
||||
|
||||
You've successfully set up a local Onyx instance! 🏁
|
||||
|
||||
#### Running the Onyx application in a container
|
||||
|
||||
You can run the full Onyx application stack from pre-built images including all external software dependencies.
|
||||
|
||||
Navigate to `onyx/deployment/docker_compose` and run:
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
After Docker pulls and starts these containers, navigate to `http://localhost:3000` to use Onyx.
|
||||
|
||||
If you want to make changes to Onyx and run those changes in Docker, you can also build a local version of the Onyx container images that incorporates your changes like so:
|
||||
|
||||
```bash
|
||||
docker compose up -d --build
|
||||
```
|
||||
|
||||
|
||||
## Getting Help 🙋
|
||||
We have support channels and generally interesting discussions on our [Discord](https://discord.gg/4NA5SbzrWb).
|
||||
### Release Process
|
||||
|
||||
See you there!
|
||||
|
||||
|
||||
## Release Process
|
||||
Onyx loosely follows the SemVer versioning standard.
|
||||
Major changes are released with a "minor" version bump. Currently we use patch release versions to indicate small feature changes.
|
||||
A set of Docker containers will be pushed automatically to DockerHub with every tag.
|
||||
|
||||
@@ -7,6 +7,8 @@ This guide explains how to set up and use VSCode's debugging capabilities with t
|
||||
1. **Environment Setup**:
|
||||
- Copy `.vscode/env_template.txt` to `.vscode/.env`
|
||||
- Fill in the necessary environment variables in `.vscode/.env`
|
||||
2. **launch.json**:
|
||||
- Copy `.vscode/launch.template.jsonc` to `.vscode/launch.json`
|
||||
|
||||
## Using the Debugger
|
||||
|
||||
@@ -37,6 +37,10 @@ CVE-2023-50868
|
||||
CVE-2023-52425
|
||||
CVE-2024-28757
|
||||
|
||||
# sqlite, only used by NLTK library to grab word lemmatizer and stopwords
|
||||
# No impact in our settings
|
||||
CVE-2023-7104
|
||||
|
||||
# libharfbuzz0b, O(n^2) growth, worst case is denial of service
|
||||
# Accept the risk
|
||||
CVE-2023-25193
|
||||
|
||||
@@ -89,6 +89,12 @@ RUN uv pip install --system --no-cache-dir --upgrade \
|
||||
RUN python -c "from tokenizers import Tokenizer; \
|
||||
Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"
|
||||
|
||||
# Pre-downloading NLTK for setups with limited egress
|
||||
RUN python -c "import nltk; \
|
||||
nltk.download('stopwords', quiet=True); \
|
||||
nltk.download('punkt_tab', quiet=True);"
|
||||
# nltk.download('wordnet', quiet=True); introduce this back if lemmatization is needed
|
||||
|
||||
# Pre-downloading tiktoken for setups with limited egress
|
||||
RUN python -c "import tiktoken; \
|
||||
tiktoken.get_encoding('cl100k_base')"
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
"""add_unique_constraint_to_inputprompt_prompt_user_id
|
||||
|
||||
Revision ID: 2c2430828bdf
|
||||
Revises: fb80bdd256de
|
||||
Create Date: 2026-01-20 16:01:54.314805
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "2c2430828bdf"
|
||||
down_revision = "fb80bdd256de"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Create unique constraint on (prompt, user_id) for user-owned prompts
|
||||
# This ensures each user can only have one shortcut with a given name
|
||||
op.create_unique_constraint(
|
||||
"uq_inputprompt_prompt_user_id",
|
||||
"inputprompt",
|
||||
["prompt", "user_id"],
|
||||
)
|
||||
|
||||
# Create partial unique index for public prompts (where user_id IS NULL)
|
||||
# PostgreSQL unique constraints don't enforce uniqueness for NULL values,
|
||||
# so we need a partial index to ensure public prompt names are also unique
|
||||
op.execute(
|
||||
"""
|
||||
CREATE UNIQUE INDEX uq_inputprompt_prompt_public
|
||||
ON inputprompt (prompt)
|
||||
WHERE user_id IS NULL
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.execute("DROP INDEX IF EXISTS uq_inputprompt_prompt_public")
|
||||
op.drop_constraint("uq_inputprompt_prompt_user_id", "inputprompt", type_="unique")
|
||||
@@ -1,29 +0,0 @@
|
||||
"""remove default prompt shortcuts
|
||||
|
||||
Revision ID: 41fa44bef321
|
||||
Revises: 2c2430828bdf
|
||||
Create Date: 2025-01-21
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "41fa44bef321"
|
||||
down_revision = "2c2430828bdf"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Delete any user associations for the default prompts first (foreign key constraint)
|
||||
op.execute(
|
||||
"DELETE FROM inputprompt__user WHERE input_prompt_id IN (SELECT id FROM inputprompt WHERE id < 0)"
|
||||
)
|
||||
# Delete the pre-seeded default prompt shortcuts (they have negative IDs)
|
||||
op.execute("DELETE FROM inputprompt WHERE id < 0")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# We don't restore the default prompts on downgrade
|
||||
pass
|
||||
@@ -1,47 +0,0 @@
|
||||
"""add_search_query_table
|
||||
|
||||
Revision ID: 73e9983e5091
|
||||
Revises: d1b637d7050a
|
||||
Create Date: 2026-01-14 14:16:52.837489
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "73e9983e5091"
|
||||
down_revision = "d1b637d7050a"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"search_query",
|
||||
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column(
|
||||
"user_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("user.id"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("query", sa.String(), nullable=False),
|
||||
sa.Column("query_expansions", postgresql.ARRAY(sa.String()), nullable=True),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
nullable=False,
|
||||
server_default=sa.func.now(),
|
||||
),
|
||||
)
|
||||
|
||||
op.create_index("ix_search_query_user_id", "search_query", ["user_id"])
|
||||
op.create_index("ix_search_query_created_at", "search_query", ["created_at"])
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_index("ix_search_query_created_at", table_name="search_query")
|
||||
op.drop_index("ix_search_query_user_id", table_name="search_query")
|
||||
op.drop_table("search_query")
|
||||
@@ -10,7 +10,8 @@ from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
from onyx.db.models import IndexModelStatus
|
||||
from onyx.context.search.enums import RecencyBiasSetting, SearchType
|
||||
from onyx.context.search.enums import RecencyBiasSetting
|
||||
from onyx.context.search.enums import SearchType
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "776b3bbe9092"
|
||||
|
||||
@@ -1,116 +0,0 @@
|
||||
"""Add Discord bot tables
|
||||
|
||||
Revision ID: 8b5ce697290e
|
||||
Revises: a1b2c3d4e5f7
|
||||
Create Date: 2025-01-14
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "8b5ce697290e"
|
||||
down_revision = "a1b2c3d4e5f7"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# DiscordBotConfig (singleton table - one per tenant)
|
||||
op.create_table(
|
||||
"discord_bot_config",
|
||||
sa.Column(
|
||||
"id",
|
||||
sa.String(),
|
||||
primary_key=True,
|
||||
server_default=sa.text("'SINGLETON'"),
|
||||
),
|
||||
sa.Column("bot_token", sa.LargeBinary(), nullable=False), # EncryptedString
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.func.now(),
|
||||
nullable=False,
|
||||
),
|
||||
sa.CheckConstraint("id = 'SINGLETON'", name="ck_discord_bot_config_singleton"),
|
||||
)
|
||||
|
||||
# DiscordGuildConfig
|
||||
op.create_table(
|
||||
"discord_guild_config",
|
||||
sa.Column("id", sa.Integer(), primary_key=True),
|
||||
sa.Column("guild_id", sa.BigInteger(), nullable=True, unique=True),
|
||||
sa.Column("guild_name", sa.String(), nullable=True),
|
||||
sa.Column("registration_key", sa.String(), nullable=False, unique=True),
|
||||
sa.Column("registered_at", sa.DateTime(timezone=True), nullable=True),
|
||||
sa.Column(
|
||||
"default_persona_id",
|
||||
sa.Integer(),
|
||||
sa.ForeignKey("persona.id", ondelete="SET NULL"),
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column(
|
||||
"enabled", sa.Boolean(), server_default=sa.text("true"), nullable=False
|
||||
),
|
||||
)
|
||||
|
||||
# DiscordChannelConfig
|
||||
op.create_table(
|
||||
"discord_channel_config",
|
||||
sa.Column("id", sa.Integer(), primary_key=True),
|
||||
sa.Column(
|
||||
"guild_config_id",
|
||||
sa.Integer(),
|
||||
sa.ForeignKey("discord_guild_config.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("channel_id", sa.BigInteger(), nullable=False),
|
||||
sa.Column("channel_name", sa.String(), nullable=False),
|
||||
sa.Column(
|
||||
"channel_type",
|
||||
sa.String(20),
|
||||
server_default=sa.text("'text'"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"is_private",
|
||||
sa.Boolean(),
|
||||
server_default=sa.text("false"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"thread_only_mode",
|
||||
sa.Boolean(),
|
||||
server_default=sa.text("false"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"require_bot_invocation",
|
||||
sa.Boolean(),
|
||||
server_default=sa.text("true"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"persona_override_id",
|
||||
sa.Integer(),
|
||||
sa.ForeignKey("persona.id", ondelete="SET NULL"),
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column(
|
||||
"enabled", sa.Boolean(), server_default=sa.text("false"), nullable=False
|
||||
),
|
||||
)
|
||||
|
||||
# Unique constraint: one config per channel per guild
|
||||
op.create_unique_constraint(
|
||||
"uq_discord_channel_guild_channel",
|
||||
"discord_channel_config",
|
||||
["guild_config_id", "channel_id"],
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_table("discord_channel_config")
|
||||
op.drop_table("discord_guild_config")
|
||||
op.drop_table("discord_bot_config")
|
||||
@@ -1,47 +0,0 @@
|
||||
"""drop agent_search_metrics table
|
||||
|
||||
Revision ID: a1b2c3d4e5f7
|
||||
Revises: 73e9983e5091
|
||||
Create Date: 2026-01-17
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "a1b2c3d4e5f7"
|
||||
down_revision = "73e9983e5091"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.drop_table("agent__search_metrics")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.create_table(
|
||||
"agent__search_metrics",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column("user_id", sa.UUID(), nullable=True),
|
||||
sa.Column("persona_id", sa.Integer(), nullable=True),
|
||||
sa.Column("agent_type", sa.String(), nullable=False),
|
||||
sa.Column("start_time", sa.DateTime(timezone=True), nullable=False),
|
||||
sa.Column("base_duration_s", sa.Float(), nullable=False),
|
||||
sa.Column("full_duration_s", sa.Float(), nullable=False),
|
||||
sa.Column("base_metrics", postgresql.JSONB(), nullable=True),
|
||||
sa.Column("refined_metrics", postgresql.JSONB(), nullable=True),
|
||||
sa.Column("all_metrics", postgresql.JSONB(), nullable=True),
|
||||
sa.ForeignKeyConstraint(
|
||||
["user_id"],
|
||||
["user.id"],
|
||||
ondelete="CASCADE",
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["persona_id"],
|
||||
["persona.id"],
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
@@ -1,31 +0,0 @@
|
||||
"""add chat_background to user
|
||||
|
||||
Revision ID: fb80bdd256de
|
||||
Revises: 8b5ce697290e
|
||||
Create Date: 2026-01-16 16:15:59.222617
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "fb80bdd256de"
|
||||
down_revision = "8b5ce697290e"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"user",
|
||||
sa.Column(
|
||||
"chat_background",
|
||||
sa.String(),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("user", "chat_background")
|
||||
@@ -128,8 +128,3 @@ MARKETING_POSTHOG_API_KEY = os.environ.get("MARKETING_POSTHOG_API_KEY")
|
||||
HUBSPOT_TRACKING_URL = os.environ.get("HUBSPOT_TRACKING_URL")
|
||||
|
||||
GATED_TENANTS_KEY = "gated_tenants"
|
||||
|
||||
# License enforcement - when True, blocks API access for gated/expired licenses
|
||||
LICENSE_ENFORCEMENT_ENABLED = (
|
||||
os.environ.get("LICENSE_ENFORCEMENT_ENABLED", "").lower() == "true"
|
||||
)
|
||||
|
||||
@@ -1,64 +0,0 @@
|
||||
import uuid
|
||||
from datetime import timedelta
|
||||
from uuid import UUID
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.db.engine.time_utils import get_db_current_time
|
||||
from onyx.db.models import SearchQuery
|
||||
|
||||
|
||||
def create_search_query(
|
||||
db_session: Session,
|
||||
user_id: UUID,
|
||||
query: str,
|
||||
query_expansions: list[str] | None = None,
|
||||
) -> SearchQuery:
|
||||
"""Create and persist a `SearchQuery` row.
|
||||
|
||||
Notes:
|
||||
- `SearchQuery.id` is a UUID PK without a server-side default, so we generate it.
|
||||
- `created_at` is filled by the DB (server_default=now()).
|
||||
"""
|
||||
search_query = SearchQuery(
|
||||
id=uuid.uuid4(),
|
||||
user_id=user_id,
|
||||
query=query,
|
||||
query_expansions=query_expansions,
|
||||
)
|
||||
db_session.add(search_query)
|
||||
db_session.commit()
|
||||
db_session.refresh(search_query)
|
||||
return search_query
|
||||
|
||||
|
||||
def fetch_search_queries_for_user(
|
||||
db_session: Session,
|
||||
user_id: UUID,
|
||||
filter_days: int | None = None,
|
||||
limit: int | None = None,
|
||||
) -> list[SearchQuery]:
|
||||
"""Fetch `SearchQuery` rows for a user.
|
||||
|
||||
Args:
|
||||
user_id: User UUID.
|
||||
filter_days: Optional time filter. If provided, only rows created within
|
||||
the last `filter_days` days are returned.
|
||||
limit: Optional max number of rows to return.
|
||||
"""
|
||||
if filter_days is not None and filter_days <= 0:
|
||||
raise ValueError("filter_days must be > 0")
|
||||
|
||||
stmt = select(SearchQuery).where(SearchQuery.user_id == user_id)
|
||||
|
||||
if filter_days is not None and filter_days > 0:
|
||||
cutoff = get_db_current_time(db_session) - timedelta(days=filter_days)
|
||||
stmt = stmt.where(SearchQuery.created_at >= cutoff)
|
||||
|
||||
stmt = stmt.order_by(SearchQuery.created_at.desc())
|
||||
|
||||
if limit is not None:
|
||||
stmt = stmt.limit(limit)
|
||||
|
||||
return list(db_session.scalars(stmt).all())
|
||||
@@ -16,17 +16,16 @@ from ee.onyx.server.enterprise_settings.api import (
|
||||
from ee.onyx.server.evals.api import router as evals_router
|
||||
from ee.onyx.server.license.api import router as license_router
|
||||
from ee.onyx.server.manage.standard_answer import router as standard_answer_router
|
||||
from ee.onyx.server.middleware.license_enforcement import (
|
||||
add_license_enforcement_middleware,
|
||||
)
|
||||
from ee.onyx.server.middleware.tenant_tracking import (
|
||||
add_api_server_tenant_id_middleware,
|
||||
)
|
||||
from ee.onyx.server.oauth.api import router as ee_oauth_router
|
||||
from ee.onyx.server.query_and_chat.chat_backend import (
|
||||
router as chat_router,
|
||||
)
|
||||
from ee.onyx.server.query_and_chat.query_backend import (
|
||||
basic_router as ee_query_router,
|
||||
)
|
||||
from ee.onyx.server.query_and_chat.search_backend import router as search_router
|
||||
from ee.onyx.server.query_history.api import router as query_history_router
|
||||
from ee.onyx.server.reporting.usage_export_api import router as usage_export_router
|
||||
from ee.onyx.server.seeding import seed_db
|
||||
@@ -86,10 +85,6 @@ def get_application() -> FastAPI:
|
||||
if MULTI_TENANT:
|
||||
add_api_server_tenant_id_middleware(application, logger)
|
||||
|
||||
# Add license enforcement middleware (runs after tenant tracking)
|
||||
# This blocks access when license is expired/gated
|
||||
add_license_enforcement_middleware(application, logger)
|
||||
|
||||
if AUTH_TYPE == AuthType.CLOUD:
|
||||
# For Google OAuth, refresh tokens are requested by:
|
||||
# 1. Adding the right scopes
|
||||
@@ -129,7 +124,7 @@ def get_application() -> FastAPI:
|
||||
# EE only backend APIs
|
||||
include_router_with_global_prefix_prepended(application, query_router)
|
||||
include_router_with_global_prefix_prepended(application, ee_query_router)
|
||||
include_router_with_global_prefix_prepended(application, search_router)
|
||||
include_router_with_global_prefix_prepended(application, chat_router)
|
||||
include_router_with_global_prefix_prepended(application, standard_answer_router)
|
||||
include_router_with_global_prefix_prepended(application, ee_oauth_router)
|
||||
include_router_with_global_prefix_prepended(application, ee_document_cc_pair_router)
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
# Single message is likely most reliable and generally better for this task
|
||||
# No final reminders at the end since the user query is expected to be short
|
||||
# If it is not short, it should go into the chat flow so we do not need to account for this.
|
||||
KEYWORD_EXPANSION_PROMPT = """
|
||||
Generate a set of keyword-only queries to help find relevant documents for the provided query. \
|
||||
These queries will be passed to a bm25-based keyword search engine. \
|
||||
Provide a single query per line (where each query consists of one or more keywords). \
|
||||
The queries must be purely keywords and not contain any filler natural language. \
|
||||
The each query should have as few keywords as necessary to represent the user's search intent. \
|
||||
If there are no useful expansions, simply return the original query with no additional keyword queries. \
|
||||
CRITICAL: Do not include any additional formatting, comments, or anything aside from the keyword queries.
|
||||
|
||||
The user query is:
|
||||
{user_query}
|
||||
""".strip()
|
||||
|
||||
|
||||
QUERY_TYPE_PROMPT = """
|
||||
Determine if the provided query is better suited for a keyword search or a semantic search.
|
||||
Respond with "keyword" or "semantic" literally and nothing else.
|
||||
Do not provide any additional text or reasoning to your response.
|
||||
|
||||
CRITICAL: It must only be 1 single word - EITHER "keyword" or "semantic".
|
||||
|
||||
The user query is:
|
||||
{user_query}
|
||||
""".strip()
|
||||
@@ -1,42 +0,0 @@
|
||||
# ruff: noqa: E501, W605 start
|
||||
SEARCH_CLASS = "search"
|
||||
CHAT_CLASS = "chat"
|
||||
|
||||
# Will note that with many larger LLMs the latency on running this prompt via third party APIs is as high as 2 seconds which is too slow for many
|
||||
# use cases.
|
||||
SEARCH_CHAT_PROMPT = f"""
|
||||
Determine if the following query is better suited for a search UI or a chat UI. Respond with "{SEARCH_CLASS}" or "{CHAT_CLASS}" literally and nothing else. \
|
||||
Do not provide any additional text or reasoning to your response. CRITICAL, IT MUST ONLY BE 1 SINGLE WORD - EITHER "{SEARCH_CLASS}" or "{CHAT_CLASS}".
|
||||
|
||||
# Classification Guidelines:
|
||||
## {SEARCH_CLASS}
|
||||
- If the query consists entirely of keywords or query doesn't require any answer from the AI
|
||||
- If the query is a short statement that seems like a search query rather than a question
|
||||
- If the query feels nonsensical or is a short phrase that possibly describes a document or information that could be found in a internal document
|
||||
|
||||
### Examples of {SEARCH_CLASS} queries:
|
||||
- Find me the document that goes over the onboarding process for a new hire
|
||||
- Pull requests since last week
|
||||
- Sales Runbook AMEA Region
|
||||
- Procurement process
|
||||
- Retrieve the PRD for project X
|
||||
|
||||
## {CHAT_CLASS}
|
||||
- If the query is asking a question that requires an answer rather than a document
|
||||
- If the query is asking for a solution, suggestion, or general help
|
||||
- If the query is seeking information that is on the web and likely not in a company internal document
|
||||
- If the query should be answered without any context from additional documents or searches
|
||||
|
||||
### Examples of {CHAT_CLASS} queries:
|
||||
- What led us to win the deal with company X? (seeking answer)
|
||||
- Google Drive not sync-ing files to my computer (seeking solution)
|
||||
- Review my email: <whatever the email is> (general help)
|
||||
- Write me a script to... (general help)
|
||||
- Cheap flights Europe to Tokyo (information likely found on the web, not internal)
|
||||
|
||||
# User Query:
|
||||
{{user_query}}
|
||||
|
||||
REMEMBER TO ONLY RESPOND WITH "{SEARCH_CLASS}" OR "{CHAT_CLASS}" AND NOTHING ELSE.
|
||||
""".strip()
|
||||
# ruff: noqa: E501, W605 end
|
||||
@@ -1,286 +0,0 @@
|
||||
from collections.abc import Generator
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ee.onyx.db.search import create_search_query
|
||||
from ee.onyx.secondary_llm_flows.query_expansion import expand_keywords
|
||||
from ee.onyx.server.query_and_chat.models import SearchDocWithContent
|
||||
from ee.onyx.server.query_and_chat.models import SearchFullResponse
|
||||
from ee.onyx.server.query_and_chat.models import SendSearchQueryRequest
|
||||
from ee.onyx.server.query_and_chat.streaming_models import LLMSelectedDocsPacket
|
||||
from ee.onyx.server.query_and_chat.streaming_models import SearchDocsPacket
|
||||
from ee.onyx.server.query_and_chat.streaming_models import SearchErrorPacket
|
||||
from ee.onyx.server.query_and_chat.streaming_models import SearchQueriesPacket
|
||||
from onyx.context.search.models import BaseFilters
|
||||
from onyx.context.search.models import ChunkSearchRequest
|
||||
from onyx.context.search.models import InferenceChunk
|
||||
from onyx.context.search.pipeline import merge_individual_chunks
|
||||
from onyx.context.search.pipeline import search_pipeline
|
||||
from onyx.db.models import User
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.interfaces import DocumentIndex
|
||||
from onyx.llm.factory import get_default_llm
|
||||
from onyx.secondary_llm_flows.document_filter import select_sections_for_expansion
|
||||
from onyx.tools.tool_implementations.search.search_utils import (
|
||||
weighted_reciprocal_rank_fusion,
|
||||
)
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
# This is just a heuristic that also happens to work well for the UI/UX
|
||||
# Users would not find it useful to see a huge list of suggested docs
|
||||
# but more than 1 is also likely good as many questions may target more than 1 doc.
|
||||
TARGET_NUM_SECTIONS_FOR_LLM_SELECTION = 3
|
||||
|
||||
|
||||
def _run_single_search(
    query: str,
    filters: BaseFilters | None,
    document_index: DocumentIndex,
    user: User | None,
    db_session: Session,
    num_hits: int | None = None,
) -> list[InferenceChunk]:
    """Run one query through the standard search pipeline and return its chunks.

    Builds a ChunkSearchRequest from the given query, user-selected filters,
    and optional result limit, then delegates to search_pipeline. No persona
    is applied, since this path serves direct (non-chat) search.
    """
    search_request = ChunkSearchRequest(
        query=query,
        user_selected_filters=filters,
        limit=num_hits,
    )
    return search_pipeline(
        chunk_search_request=search_request,
        document_index=document_index,
        user=user,
        # Direct search carries no persona context
        persona=None,
        db_session=db_session,
    )
|
||||
|
||||
|
||||
def stream_search_query(
    request: SendSearchQueryRequest,
    user: User | None,
    db_session: Session,
) -> Generator[
    SearchQueriesPacket | SearchDocsPacket | LLMSelectedDocsPacket | SearchErrorPacket,
    None,
    None,
]:
    """
    Core search function that yields streaming packets.
    Used by both streaming and non-streaming endpoints.

    Packet order: SearchQueriesPacket first, then SearchDocsPacket, then
    (only when LLM selection was requested) LLMSelectedDocsPacket.
    NOTE(review): SearchErrorPacket appears in the yield type but is never
    yielded in this body — presumably emitted by a caller/wrapper; confirm.
    """
    # Get document index
    search_settings = get_current_search_settings(db_session)
    # This flow is for search so we do not get all indices.
    document_index = get_default_document_index(search_settings, None)

    # Determine queries to execute
    original_query = request.search_query
    keyword_expansions: list[str] = []

    if request.run_query_expansion:
        try:
            llm = get_default_llm()
            keyword_expansions = expand_keywords(
                user_query=original_query,
                llm=llm,
            )
            if keyword_expansions:
                logger.debug(
                    f"Query expansion generated {len(keyword_expansions)} keyword queries"
                )
        except Exception as e:
            # Expansion is best-effort; fall back to the original query alone.
            logger.warning(f"Query expansion failed: {e}; using original query only.")
            keyword_expansions = []

    # Build list of all executed queries for tracking
    all_executed_queries = [original_query] + keyword_expansions

    # Persist the query (and its expansions) for history/analytics.
    # TODO remove this check, user should not be None
    if user is not None:
        create_search_query(
            db_session=db_session,
            user_id=user.id,
            query=request.search_query,
            query_expansions=keyword_expansions if keyword_expansions else None,
        )

    # Execute search(es)
    if not keyword_expansions:
        # Single query (original only) - no threading needed
        chunks = _run_single_search(
            query=original_query,
            filters=request.filters,
            document_index=document_index,
            user=user,
            db_session=db_session,
            num_hits=request.num_hits,
        )
    else:
        # Multiple queries - run in parallel and merge with RRF
        # First query is the original (semantic), rest are keyword expansions
        search_functions = [
            (
                _run_single_search,
                (
                    query,
                    request.filters,
                    document_index,
                    user,
                    db_session,
                    request.num_hits,
                ),
            )
            for query in all_executed_queries
        ]

        # Run all searches in parallel
        all_search_results: list[list[InferenceChunk]] = (
            run_functions_tuples_in_parallel(
                search_functions,
                allow_failures=True,
            )
        )

        # Separate original query results from keyword expansion results
        # Note that in rare cases, the original query may have failed and so we may be
        # just overweighting one set of keyword results, should be not a big deal though.
        original_result = all_search_results[0] if all_search_results else []
        keyword_results = all_search_results[1:] if len(all_search_results) > 1 else []

        # Build valid results and weights
        # Original query (semantic): weight 2.0
        # Keyword expansions: weight 1.0 each
        valid_results: list[list[InferenceChunk]] = []
        weights: list[float] = []

        if original_result:
            valid_results.append(original_result)
            weights.append(2.0)

        for keyword_result in keyword_results:
            if keyword_result:
                valid_results.append(keyword_result)
                weights.append(1.0)

        if not valid_results:
            logger.warning("All parallel searches returned empty results")
            chunks = []
        else:
            # Fuse the ranked lists; chunks are identified by doc-id + chunk-id.
            chunks = weighted_reciprocal_rank_fusion(
                ranked_results=valid_results,
                weights=weights,
                id_extractor=lambda chunk: f"{chunk.document_id}_{chunk.chunk_id}",
            )

    # Merge chunks into sections
    sections = merge_individual_chunks(chunks)

    # Truncate to the requested number of hits
    sections = sections[: request.num_hits]

    # Apply LLM document selection if requested
    # num_docs_fed_to_llm_selection specifies how many sections to feed to the LLM for selection
    # The LLM will always try to select TARGET_NUM_SECTIONS_FOR_LLM_SELECTION sections from those fed to it
    # llm_selected_doc_ids will be:
    # - None if LLM selection was not requested or failed
    # - Empty list if LLM selection ran but selected nothing
    # - List of doc IDs if LLM selection succeeded
    run_llm_selection = (
        request.num_docs_fed_to_llm_selection is not None
        and request.num_docs_fed_to_llm_selection >= 1
    )
    llm_selected_doc_ids: list[str] | None = None
    llm_selection_failed = False
    if run_llm_selection and sections:
        try:
            llm = get_default_llm()
            sections_to_evaluate = sections[: request.num_docs_fed_to_llm_selection]
            selected_sections, _ = select_sections_for_expansion(
                sections=sections_to_evaluate,
                user_query=original_query,
                llm=llm,
                max_sections=TARGET_NUM_SECTIONS_FOR_LLM_SELECTION,
                try_to_fill_to_max=True,
            )
            # Extract unique document IDs from selected sections (may be empty)
            # dict.fromkeys preserves first-seen order while de-duplicating.
            llm_selected_doc_ids = list(
                dict.fromkeys(
                    section.center_chunk.document_id for section in selected_sections
                )
            )
            logger.debug(
                f"LLM document selection evaluated {len(sections_to_evaluate)} sections, "
                f"selected {len(selected_sections)} sections with doc IDs: {llm_selected_doc_ids}"
            )
        except Exception as e:
            # Allowing a blanket exception here as this step is not critical and the rest of the results are still valid
            logger.warning(f"LLM document selection failed: {e}")
            llm_selection_failed = True
    elif run_llm_selection and not sections:
        # LLM selection requested but no sections to evaluate
        llm_selected_doc_ids = []

    # Convert to SearchDocWithContent list, optionally including content
    search_docs = SearchDocWithContent.from_inference_sections(
        sections,
        include_content=request.include_content,
        is_internet=False,
    )

    # Yield queries packet
    yield SearchQueriesPacket(all_executed_queries=all_executed_queries)

    # Yield docs packet
    yield SearchDocsPacket(search_docs=search_docs)

    # Yield LLM selected docs packet if LLM selection was requested
    # - llm_selected_doc_ids is None if selection failed
    # - llm_selected_doc_ids is empty list if no docs were selected
    # - llm_selected_doc_ids is list of IDs if docs were selected
    if run_llm_selection:
        yield LLMSelectedDocsPacket(
            llm_selected_doc_ids=None if llm_selection_failed else llm_selected_doc_ids
        )
|
||||
|
||||
|
||||
def gather_search_stream(
    packets: Generator[
        SearchQueriesPacket
        | SearchDocsPacket
        | LLMSelectedDocsPacket
        | SearchErrorPacket,
        None,
        None,
    ],
) -> SearchFullResponse:
    """Drain a search packet stream and collapse it into one SearchFullResponse.

    Later packets of the same type overwrite earlier ones; fields for packet
    types that never arrive keep their defaults (empty list / None).
    """
    queries: list[str] = []
    docs: list[SearchDocWithContent] = []
    selected_ids: list[str] | None = None
    error_msg: str | None = None

    for pkt in packets:
        if isinstance(pkt, SearchQueriesPacket):
            queries = pkt.all_executed_queries
        elif isinstance(pkt, SearchDocsPacket):
            docs = pkt.search_docs
        elif isinstance(pkt, LLMSelectedDocsPacket):
            selected_ids = pkt.llm_selected_doc_ids
        elif isinstance(pkt, SearchErrorPacket):
            error_msg = pkt.error

    return SearchFullResponse(
        all_executed_queries=queries,
        search_docs=docs,
        doc_selection_reasoning=None,
        llm_selected_doc_ids=selected_ids,
        error=error_msg,
    )
|
||||
@@ -1,92 +0,0 @@
|
||||
import re
|
||||
|
||||
from ee.onyx.prompts.query_expansion import KEYWORD_EXPANSION_PROMPT
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.llm.models import LanguageModelInput
|
||||
from onyx.llm.models import ReasoningEffort
|
||||
from onyx.llm.models import UserMessage
|
||||
from onyx.llm.utils import llm_response_to_string
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
# Pattern to remove common LLM artifacts: brackets, quotes, list markers, etc.
|
||||
CLEANUP_PATTERN = re.compile(r'[\[\]"\'`]')
|
||||
|
||||
|
||||
def _clean_keyword_line(line: str) -> str:
|
||||
"""Clean a keyword line by removing common LLM artifacts.
|
||||
|
||||
Removes brackets, quotes, and other characters that LLMs may accidentally
|
||||
include in their output.
|
||||
"""
|
||||
# Remove common artifacts
|
||||
cleaned = CLEANUP_PATTERN.sub("", line)
|
||||
# Remove leading list markers like "1.", "2.", "-", "*"
|
||||
cleaned = re.sub(r"^\s*(?:\d+[\.\)]\s*|[-*]\s*)", "", cleaned)
|
||||
return cleaned.strip()
|
||||
|
||||
|
||||
def expand_keywords(
    user_query: str,
    llm: LLM,
) -> list[str]:
    """Generate keyword-only variants of a query for BM25-style retrieval.

    Prompts the LLM for keyword search queries that cover different facets of
    the user's search intent. The original query itself is never included in
    the returned list.

    Args:
        user_query: The original search query from the user
        llm: Language model to use for keyword expansion

    Returns:
        Expanded keyword queries only (original excluded). Empty list when
        expansion fails or produces nothing usable.
    """
    messages: LanguageModelInput = [
        UserMessage(content=KEYWORD_EXPANSION_PROMPT.format(user_query=user_query))
    ]

    try:
        response = llm.invoke(
            prompt=messages,
            reasoning_effort=ReasoningEffort.OFF,
            # Limit output - we only expect a few short keyword queries
            max_tokens=150,
        )

        content = llm_response_to_string(response).strip()
        if not content:
            logger.warning("Keyword expansion returned empty response.")
            return []

        # One keyword query per line; clean LLM artifacts and drop blanks.
        parsed_queries = [
            cleaned
            for cleaned in (_clean_keyword_line(raw) for raw in content.split("\n"))
            if cleaned
        ]
        if not parsed_queries:
            logger.warning("Keyword expansion parsing returned no queries.")
            return []

        # Case-insensitive de-duplication, also excluding the original query.
        seen_lower: set[str] = {user_query.lower()}
        expanded_queries: list[str] = []
        for candidate in parsed_queries:
            lowered = candidate.lower()
            if lowered in seen_lower:
                continue
            seen_lower.add(lowered)
            expanded_queries.append(candidate)

        logger.debug(f"Keyword expansion generated {len(expanded_queries)} queries")
        return expanded_queries

    except Exception as e:
        logger.warning(f"Keyword expansion failed: {e}")
        return []
|
||||
@@ -1,50 +0,0 @@
|
||||
from ee.onyx.prompts.search_flow_classification import CHAT_CLASS
|
||||
from ee.onyx.prompts.search_flow_classification import SEARCH_CHAT_PROMPT
|
||||
from ee.onyx.prompts.search_flow_classification import SEARCH_CLASS
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.llm.models import LanguageModelInput
|
||||
from onyx.llm.models import ReasoningEffort
|
||||
from onyx.llm.models import UserMessage
|
||||
from onyx.llm.utils import llm_response_to_string
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.timing import log_function_time
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@log_function_time(print_only=True)
def classify_is_search_flow(
    query: str,
    llm: LLM,
) -> bool:
    """Classify whether a query belongs to the search flow (True) or chat flow (False).

    Defaults to the chat flow on empty or unrecognized LLM output, and prefers
    chat when the response mentions both class names.
    """
    messages: LanguageModelInput = [
        UserMessage(content=SEARCH_CHAT_PROMPT.format(user_query=query))
    ]
    response = llm.invoke(
        prompt=messages,
        reasoning_effort=ReasoningEffort.OFF,
        # Nothing can happen in the UI until this call finishes so we need to be aggressive with the timeout
        timeout_override=2,
        # Well more than necessary but just to ensure completion and in case it succeeds with classifying but
        # ends up rambling
        max_tokens=20,
    )

    normalized = llm_response_to_string(response).strip().lower()
    if not normalized:
        logger.warning(
            "Search flow classification returned empty response; defaulting to chat flow."
        )
        return False

    # Check CHAT_CLASS first so chat wins ties when both class names appear.
    for class_name, is_search in ((CHAT_CLASS, False), (SEARCH_CLASS, True)):
        if class_name in normalized:
            return is_search

    logger.warning(
        "Search flow classification returned unexpected response; defaulting to chat flow. Response=%r",
        normalized,
    )
    return False
|
||||
@@ -19,9 +19,9 @@ from ee.onyx.db.analytics import fetch_query_analytics
|
||||
from ee.onyx.db.analytics import user_can_view_assistant_stats
|
||||
from onyx.auth.users import current_admin_user
|
||||
from onyx.auth.users import current_user
|
||||
from onyx.configs.constants import PUBLIC_API_TAGS
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.models import User
|
||||
from onyx.server.utils import PUBLIC_API_TAGS
|
||||
|
||||
router = APIRouter(prefix="/analytics", tags=PUBLIC_API_TAGS)
|
||||
|
||||
|
||||
@@ -10,8 +10,6 @@ EE_PUBLIC_ENDPOINT_SPECS = PUBLIC_ENDPOINT_SPECS + [
|
||||
("/enterprise-settings/logo", {"GET"}),
|
||||
("/enterprise-settings/logotype", {"GET"}),
|
||||
("/enterprise-settings/custom-analytics-script", {"GET"}),
|
||||
# Stripe publishable key is safe to expose publicly
|
||||
("/tenants/stripe-publishable-key", {"GET"}),
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -1,102 +0,0 @@
|
||||
"""Middleware to enforce license status application-wide."""
|
||||
|
||||
import logging
|
||||
from collections.abc import Awaitable
|
||||
from collections.abc import Callable
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi import Request
|
||||
from fastapi import Response
|
||||
from fastapi.responses import JSONResponse
|
||||
from redis.exceptions import RedisError
|
||||
|
||||
from ee.onyx.configs.app_configs import LICENSE_ENFORCEMENT_ENABLED
|
||||
from ee.onyx.db.license import get_cached_license_metadata
|
||||
from ee.onyx.server.tenants.product_gating import is_tenant_gated
|
||||
from onyx.server.settings.models import ApplicationStatus
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
# Paths that are ALWAYS accessible, even when license is expired/gated.
|
||||
# These enable users to:
|
||||
# /auth - Log in/out (users can't fix billing if locked out of auth)
|
||||
# /license - Fetch, upload, or check license status
|
||||
# /health - Health checks for load balancers/orchestrators
|
||||
# /me - Basic user info needed for UI rendering
|
||||
# /settings, /enterprise-settings - View app status and branding
|
||||
# /tenants/billing-* - Manage subscription to resolve gating
|
||||
ALLOWED_PATH_PREFIXES = {
|
||||
"/auth",
|
||||
"/license",
|
||||
"/health",
|
||||
"/me",
|
||||
"/settings",
|
||||
"/enterprise-settings",
|
||||
"/tenants/billing-information",
|
||||
"/tenants/create-customer-portal-session",
|
||||
"/tenants/create-subscription-session",
|
||||
}
|
||||
|
||||
|
||||
def _is_path_allowed(path: str) -> bool:
|
||||
"""Check if path is in allowlist (prefix match)."""
|
||||
return any(path.startswith(prefix) for prefix in ALLOWED_PATH_PREFIXES)
|
||||
|
||||
|
||||
def add_license_enforcement_middleware(
    app: FastAPI, logger: logging.LoggerAdapter
) -> None:
    """Register an HTTP middleware on *app* that blocks gated/expired tenants.

    Requests to allow-listed paths (auth, license, health, billing, etc.) are
    never blocked; everything else returns 402 when the tenant is gated.
    """
    logger.info("License enforcement middleware registered")

    @app.middleware("http")
    async def enforce_license(
        request: Request, call_next: Callable[[Request], Awaitable[Response]]
    ) -> Response:
        """Block requests when license is expired/gated."""
        if not LICENSE_ENFORCEMENT_ENABLED:
            return await call_next(request)

        # Strip a leading "/api" so the allowlist matches both proxied and
        # direct request paths.
        path = request.url.path
        if path.startswith("/api"):
            path = path[4:]

        if _is_path_allowed(path):
            return await call_next(request)

        is_gated = False
        tenant_id = get_current_tenant_id()

        if MULTI_TENANT:
            # Cloud: gating state is tracked per tenant in Redis.
            try:
                is_gated = is_tenant_gated(tenant_id)
            except RedisError as e:
                logger.warning(f"Failed to check tenant gating status: {e}")
                # Fail open - don't block users due to Redis connectivity issues
                is_gated = False
        else:
            # Self-hosted EE: gate based on cached license metadata.
            try:
                metadata = get_cached_license_metadata(tenant_id)
                if metadata:
                    if metadata.status == ApplicationStatus.GATED_ACCESS:
                        is_gated = True
                else:
                    # No license metadata = gated for self-hosted EE
                    is_gated = True
            except RedisError as e:
                logger.warning(f"Failed to check license metadata: {e}")
                # Fail open - don't block users due to Redis connectivity issues
                is_gated = False

        if is_gated:
            # 402 Payment Required signals the frontend to show billing UI.
            logger.info(f"Blocking request for gated tenant: {tenant_id}, path={path}")
            return JSONResponse(
                status_code=402,
                content={
                    "detail": {
                        "error": "license_expired",
                        "message": "Your subscription has expired. Please update your billing.",
                    }
                },
            )

        return await call_next(request)
|
||||
217
backend/ee/onyx/server/query_and_chat/chat_backend.py
Normal file
217
backend/ee/onyx/server/query_and_chat/chat_backend.py
Normal file
@@ -0,0 +1,217 @@
|
||||
from fastapi import APIRouter
|
||||
from fastapi import Depends
|
||||
from fastapi import HTTPException
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ee.onyx.server.query_and_chat.models import BasicCreateChatMessageRequest
|
||||
from ee.onyx.server.query_and_chat.models import (
|
||||
BasicCreateChatMessageWithHistoryRequest,
|
||||
)
|
||||
from onyx.auth.users import current_user
|
||||
from onyx.chat.chat_utils import create_chat_history_chain
|
||||
from onyx.chat.models import ChatBasicResponse
|
||||
from onyx.chat.process_message import gather_stream
|
||||
from onyx.chat.process_message import stream_chat_message_objects
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.context.search.models import OptionalSearchSetting
|
||||
from onyx.context.search.models import RetrievalDetails
|
||||
from onyx.db.chat import create_chat_session
|
||||
from onyx.db.chat import create_new_chat_message
|
||||
from onyx.db.chat import get_or_create_root_message
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.models import User
|
||||
from onyx.llm.factory import get_llm_for_persona
|
||||
from onyx.natural_language_processing.utils import get_tokenizer
|
||||
from onyx.server.query_and_chat.models import CreateChatMessageRequest
|
||||
from onyx.server.query_and_chat.models import MessageOrigin
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
router = APIRouter(prefix="/chat")
|
||||
|
||||
|
||||
@router.post("/send-message-simple-api")
def handle_simplified_chat_message(
    chat_message_req: BasicCreateChatMessageRequest,
    user: User | None = Depends(current_user),
    db_session: Session = Depends(get_session),
) -> ChatBasicResponse:
    """Non-streaming chat endpoint returning a minimal response.

    Creates a chat session on the fly when only a persona_id is supplied,
    appends the message to the end of the session's history chain, runs the
    full chat pipeline, and gathers the stream into one ChatBasicResponse.

    Raises:
        HTTPException(400): empty message, missing session/persona, or an
            invalid persona.
    """
    logger.notice(f"Received new simple api chat message: {chat_message_req.message}")

    if not chat_message_req.message:
        raise HTTPException(status_code=400, detail="Empty chat message is invalid")

    # Handle chat session creation if chat_session_id is not provided
    if chat_message_req.chat_session_id is None:
        if chat_message_req.persona_id is None:
            raise HTTPException(
                status_code=400,
                detail="Either chat_session_id or persona_id must be provided",
            )

        # Create a new chat session with the provided persona_id
        try:
            new_chat_session = create_chat_session(
                db_session=db_session,
                description="",  # Leave empty for simple API
                user_id=user.id if user else None,
                persona_id=chat_message_req.persona_id,
            )
            chat_session_id = new_chat_session.id
        except Exception as e:
            logger.exception(e)
            # Chain the original error so tracebacks show the real cause (B904).
            raise HTTPException(
                status_code=400, detail="Invalid Persona provided."
            ) from e
    else:
        chat_session_id = chat_message_req.chat_session_id

    # Use the latest message in the session as the parent; fall back to the
    # root message for a brand-new/empty session.
    try:
        parent_message = create_chat_history_chain(
            chat_session_id=chat_session_id, db_session=db_session
        )[-1]
    except Exception:
        parent_message = get_or_create_root_message(
            chat_session_id=chat_session_id, db_session=db_session
        )

    # Default to always running retrieval when the caller specified neither
    # retrieval options nor explicit documents.
    if (
        chat_message_req.retrieval_options is None
        and chat_message_req.search_doc_ids is None
    ):
        retrieval_options: RetrievalDetails | None = RetrievalDetails(
            run_search=OptionalSearchSetting.ALWAYS,
            real_time=False,
        )
    else:
        retrieval_options = chat_message_req.retrieval_options

    full_chat_msg_info = CreateChatMessageRequest(
        chat_session_id=chat_session_id,
        parent_message_id=parent_message.id,
        message=chat_message_req.message,
        file_descriptors=[],
        search_doc_ids=chat_message_req.search_doc_ids,
        retrieval_options=retrieval_options,
        # Simple API does not support reranking, hide complexity from user
        rerank_settings=None,
        query_override=chat_message_req.query_override,
        # Currently only applies to search flow not chat
        chunks_above=0,
        chunks_below=0,
        full_doc=chat_message_req.full_doc,
        structured_response_format=chat_message_req.structured_response_format,
        origin=MessageOrigin.API,
    )

    packets = stream_chat_message_objects(
        new_msg_req=full_chat_msg_info,
        user=user,
        db_session=db_session,
    )

    return gather_stream(packets)
|
||||
|
||||
|
||||
@router.post("/send-message-simple-with-history")
def handle_send_message_simple_with_history(
    req: BasicCreateChatMessageWithHistoryRequest,
    user: User | None = Depends(current_user),
    db_session: Session = Depends(get_session),
) -> ChatBasicResponse:
    """This is a Non-Streaming version that only gives back a minimal set of information.

    Takes in chat history maintained by the caller and does query rephrasing
    similar to answer-with-quote. A fresh chat session is created per call and
    the supplied history is persisted into it before the final query runs.

    Raises:
        HTTPException(400): no messages, an empty message, or roles that do
            not alternate starting from a user message.
    """

    if len(req.messages) == 0:
        raise HTTPException(status_code=400, detail="Messages cannot be zero length")

    # This is a sanity check to make sure the chat history is valid
    # It must start with a user message and alternate between user and assistant
    expected_role = MessageType.USER
    for msg in req.messages:
        if not msg.message:
            raise HTTPException(
                status_code=400, detail="One or more chat messages were empty"
            )

        if msg.role != expected_role:
            raise HTTPException(
                status_code=400,
                detail="Message roles must start and end with MessageType.USER and alternate in-between.",
            )
        if expected_role == MessageType.USER:
            expected_role = MessageType.ASSISTANT
        else:
            expected_role = MessageType.USER

    # Last element is the new query; everything before it is prior context.
    query = req.messages[-1].message
    msg_history = req.messages[:-1]

    logger.notice(f"Received new simple with history chat message: {query}")

    user_id = user.id if user is not None else None
    chat_session = create_chat_session(
        db_session=db_session,
        description="handle_send_message_simple_with_history",
        user_id=user_id,
        persona_id=req.persona_id,
    )

    llm = get_llm_for_persona(persona=chat_session.persona, user=user)

    # Tokenizer is needed to record token counts on the persisted messages.
    llm_tokenizer = get_tokenizer(
        model_name=llm.config.model_name,
        provider_type=llm.config.model_provider,
    )

    # Every chat Session begins with an empty root message
    root_message = get_or_create_root_message(
        chat_session_id=chat_session.id, db_session=db_session
    )

    # Persist the caller-supplied history as a linear chain under the root;
    # commit once at the end rather than per message.
    chat_message = root_message
    for msg in msg_history:
        chat_message = create_new_chat_message(
            chat_session_id=chat_session.id,
            parent_message=chat_message,
            message=msg.message,
            token_count=len(llm_tokenizer.encode(msg.message)),
            message_type=msg.role,
            db_session=db_session,
            commit=False,
        )
    db_session.commit()

    # Default to always running retrieval when the caller specified neither
    # retrieval options nor explicit documents.
    if req.retrieval_options is None and req.search_doc_ids is None:
        retrieval_options: RetrievalDetails | None = RetrievalDetails(
            run_search=OptionalSearchSetting.ALWAYS,
            real_time=False,
        )
    else:
        retrieval_options = req.retrieval_options

    full_chat_msg_info = CreateChatMessageRequest(
        chat_session_id=chat_session.id,
        parent_message_id=chat_message.id,
        message=query,
        file_descriptors=[],
        search_doc_ids=req.search_doc_ids,
        retrieval_options=retrieval_options,
        # Simple API does not support reranking, hide complexity from user
        rerank_settings=None,
        query_override=None,
        chunks_above=0,
        chunks_below=0,
        full_doc=req.full_doc,
        structured_response_format=req.structured_response_format,
        origin=MessageOrigin.API,
    )

    packets = stream_chat_message_objects(
        new_msg_req=full_chat_msg_info,
        user=user,
        db_session=db_session,
    )

    return gather_stream(packets)
|
||||
@@ -1,12 +1,18 @@
|
||||
from collections.abc import Sequence
|
||||
from datetime import datetime
|
||||
from collections import OrderedDict
|
||||
from typing import Literal
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import Field
|
||||
from pydantic import model_validator
|
||||
|
||||
from onyx.chat.models import ThreadMessage
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.context.search.models import BaseFilters
|
||||
from onyx.context.search.models import InferenceSection
|
||||
from onyx.context.search.models import SearchDoc
|
||||
from onyx.context.search.models import BasicChunkRequest
|
||||
from onyx.context.search.models import ChunkContext
|
||||
from onyx.context.search.models import InferenceChunk
|
||||
from onyx.context.search.models import RetrievalDetails
|
||||
from onyx.server.manage.models import StandardAnswer
|
||||
|
||||
|
||||
@@ -19,89 +25,119 @@ class StandardAnswerResponse(BaseModel):
|
||||
standard_answers: list[StandardAnswer] = Field(default_factory=list)
|
||||
|
||||
|
||||
class SearchFlowClassificationRequest(BaseModel):
|
||||
user_query: str
|
||||
class DocumentSearchRequest(BasicChunkRequest):
|
||||
user_selected_filters: BaseFilters | None = None
|
||||
|
||||
|
||||
class SearchFlowClassificationResponse(BaseModel):
|
||||
is_search_flow: bool
|
||||
class DocumentSearchResponse(BaseModel):
|
||||
top_documents: list[InferenceChunk]
|
||||
|
||||
|
||||
class SendSearchQueryRequest(BaseModel):
|
||||
search_query: str
|
||||
filters: BaseFilters | None = None
|
||||
num_docs_fed_to_llm_selection: int | None = None
|
||||
run_query_expansion: bool = False
|
||||
num_hits: int = 50
|
||||
class BasicCreateChatMessageRequest(ChunkContext):
|
||||
"""If a chat_session_id is not provided, a persona_id must be provided to automatically create a new chat session
|
||||
Note, for simplicity this option only allows for a single linear chain of messages
|
||||
"""
|
||||
|
||||
include_content: bool = False
|
||||
stream: bool = False
|
||||
chat_session_id: UUID | None = None
|
||||
# Optional persona_id to create a new chat session if chat_session_id is not provided
|
||||
persona_id: int | None = None
|
||||
# New message contents
|
||||
message: str
|
||||
# Defaults to using retrieval with no additional filters
|
||||
retrieval_options: RetrievalDetails | None = None
|
||||
# Allows the caller to specify the exact search query they want to use
|
||||
# will disable Query Rewording if specified
|
||||
query_override: str | None = None
|
||||
# If search_doc_ids provided, then retrieval options are unused
|
||||
search_doc_ids: list[int] | None = None
|
||||
# only works if using an OpenAI model. See the following for more details:
|
||||
# https://platform.openai.com/docs/guides/structured-outputs/introduction
|
||||
structured_response_format: dict | None = None
|
||||
|
||||
@model_validator(mode="after")
|
||||
def validate_chat_session_or_persona(self) -> "BasicCreateChatMessageRequest":
|
||||
if self.chat_session_id is None and self.persona_id is None:
|
||||
raise ValueError("Either chat_session_id or persona_id must be provided")
|
||||
return self
|
||||
|
||||
|
||||
class SearchDocWithContent(SearchDoc):
|
||||
# Allows None because this is determined by a flag but the object used in code
|
||||
# of the search path uses this type
|
||||
content: str | None
|
||||
class BasicCreateChatMessageWithHistoryRequest(ChunkContext):
|
||||
# Last element is the new query. All previous elements are historical context
|
||||
messages: list[ThreadMessage]
|
||||
persona_id: int
|
||||
retrieval_options: RetrievalDetails | None = None
|
||||
query_override: str | None = None
|
||||
skip_rerank: bool | None = None
|
||||
# If search_doc_ids provided, then retrieval options are unused
|
||||
search_doc_ids: list[int] | None = None
|
||||
# only works if using an OpenAI model. See the following for more details:
|
||||
# https://platform.openai.com/docs/guides/structured-outputs/introduction
|
||||
structured_response_format: dict | None = None
|
||||
|
||||
@classmethod
|
||||
def from_inference_sections(
|
||||
cls,
|
||||
sections: Sequence[InferenceSection],
|
||||
include_content: bool = False,
|
||||
is_internet: bool = False,
|
||||
) -> list["SearchDocWithContent"]:
|
||||
"""Convert InferenceSections to SearchDocWithContent objects.
|
||||
|
||||
Args:
|
||||
sections: Sequence of InferenceSection objects
|
||||
include_content: If True, populate content field with combined_content
|
||||
is_internet: Whether these are internet search results
|
||||
class SimpleDoc(BaseModel):
|
||||
id: str
|
||||
semantic_identifier: str
|
||||
link: str | None
|
||||
blurb: str
|
||||
match_highlights: list[str]
|
||||
source_type: DocumentSource
|
||||
metadata: dict | None
|
||||
|
||||
Returns:
|
||||
List of SearchDocWithContent with optional content
|
||||
|
||||
class AgentSubQuestion(BaseModel):
|
||||
sub_question: str
|
||||
document_ids: list[str]
|
||||
|
||||
|
||||
class AgentAnswer(BaseModel):
|
||||
answer: str
|
||||
answer_type: Literal["agent_sub_answer", "agent_level_answer"]
|
||||
|
||||
|
||||
class AgentSubQuery(BaseModel):
|
||||
sub_query: str
|
||||
query_id: int
|
||||
|
||||
@staticmethod
|
||||
def make_dict_by_level_and_question_index(
|
||||
original_dict: dict[tuple[int, int, int], "AgentSubQuery"],
|
||||
) -> dict[int, dict[int, list["AgentSubQuery"]]]:
|
||||
"""Takes a dict of tuple(level, question num, query_id) to sub queries.
|
||||
|
||||
returns a dict of level to dict[question num to list of query_id's]
|
||||
Ordering is asc for readability.
|
||||
"""
|
||||
if not sections:
|
||||
return []
|
||||
# In this function, when we sort int | None, we deliberately push None to the end
|
||||
|
||||
return [
|
||||
cls(
|
||||
document_id=(chunk := section.center_chunk).document_id,
|
||||
chunk_ind=chunk.chunk_id,
|
||||
semantic_identifier=chunk.semantic_identifier or "Unknown",
|
||||
link=chunk.source_links[0] if chunk.source_links else None,
|
||||
blurb=chunk.blurb,
|
||||
source_type=chunk.source_type,
|
||||
boost=chunk.boost,
|
||||
hidden=chunk.hidden,
|
||||
metadata=chunk.metadata,
|
||||
score=chunk.score,
|
||||
match_highlights=chunk.match_highlights,
|
||||
updated_at=chunk.updated_at,
|
||||
primary_owners=chunk.primary_owners,
|
||||
secondary_owners=chunk.secondary_owners,
|
||||
is_internet=is_internet,
|
||||
content=section.combined_content if include_content else None,
|
||||
# map entries to the level_question_dict
|
||||
level_question_dict: dict[int, dict[int, list["AgentSubQuery"]]] = {}
|
||||
for k1, obj in original_dict.items():
|
||||
level = k1[0]
|
||||
question = k1[1]
|
||||
|
||||
if level not in level_question_dict:
|
||||
level_question_dict[level] = {}
|
||||
|
||||
if question not in level_question_dict[level]:
|
||||
level_question_dict[level][question] = []
|
||||
|
||||
level_question_dict[level][question].append(obj)
|
||||
|
||||
# sort each query_id list and question_index
|
||||
for key1, obj1 in level_question_dict.items():
|
||||
for key2, value2 in obj1.items():
|
||||
# sort the query_id list of each question_index
|
||||
level_question_dict[key1][key2] = sorted(
|
||||
value2, key=lambda o: o.query_id
|
||||
)
|
||||
# sort the question_index dict of level
|
||||
level_question_dict[key1] = OrderedDict(
|
||||
sorted(level_question_dict[key1].items(), key=lambda x: (x is None, x))
|
||||
)
|
||||
for section in sections
|
||||
]
|
||||
|
||||
|
||||
class SearchFullResponse(BaseModel):
|
||||
all_executed_queries: list[str]
|
||||
search_docs: list[SearchDocWithContent]
|
||||
# Reasoning tokens output by the LLM for the document selection
|
||||
doc_selection_reasoning: str | None = None
|
||||
# This a list of document ids that are in the search_docs list
|
||||
llm_selected_doc_ids: list[str] | None = None
|
||||
# Error message if the search failed partway through
|
||||
error: str | None = None
|
||||
|
||||
|
||||
class SearchQueryResponse(BaseModel):
|
||||
query: str
|
||||
query_expansions: list[str] | None
|
||||
created_at: datetime
|
||||
|
||||
|
||||
class SearchHistoryResponse(BaseModel):
|
||||
search_queries: list[SearchQueryResponse]
|
||||
# sort the top dict of levels
|
||||
sorted_dict = OrderedDict(
|
||||
sorted(level_question_dict.items(), key=lambda x: (x is None, x))
|
||||
)
|
||||
return sorted_dict
|
||||
|
||||
@@ -1,170 +0,0 @@
|
||||
from collections.abc import Generator
|
||||
|
||||
from fastapi import APIRouter
|
||||
from fastapi import Depends
|
||||
from fastapi import HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ee.onyx.db.search import fetch_search_queries_for_user
|
||||
from ee.onyx.search.process_search_query import gather_search_stream
|
||||
from ee.onyx.search.process_search_query import stream_search_query
|
||||
from ee.onyx.secondary_llm_flows.search_flow_classification import (
|
||||
classify_is_search_flow,
|
||||
)
|
||||
from ee.onyx.server.query_and_chat.models import SearchFlowClassificationRequest
|
||||
from ee.onyx.server.query_and_chat.models import SearchFlowClassificationResponse
|
||||
from ee.onyx.server.query_and_chat.models import SearchFullResponse
|
||||
from ee.onyx.server.query_and_chat.models import SearchHistoryResponse
|
||||
from ee.onyx.server.query_and_chat.models import SearchQueryResponse
|
||||
from ee.onyx.server.query_and_chat.models import SendSearchQueryRequest
|
||||
from ee.onyx.server.query_and_chat.streaming_models import SearchErrorPacket
|
||||
from onyx.auth.users import current_user
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.db.models import User
|
||||
from onyx.llm.factory import get_default_llm
|
||||
from onyx.server.usage_limits import check_llm_cost_limit_for_provider
|
||||
from onyx.server.utils import get_json_line
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
router = APIRouter(prefix="/search")
|
||||
|
||||
|
||||
@router.post("/search-flow-classification")
|
||||
def search_flow_classification(
|
||||
request: SearchFlowClassificationRequest,
|
||||
# This is added just to ensure this endpoint isn't spammed by non-authorized users since there's an LLM call underneath it
|
||||
_: User | None = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> SearchFlowClassificationResponse:
|
||||
query = request.user_query
|
||||
# This is a heuristic that if the user is typing a lot of text, it's unlikely they're looking for some specific document
|
||||
# Most likely something needs to be done with the text included so we'll just classify it as a chat flow
|
||||
if len(query) > 200:
|
||||
return SearchFlowClassificationResponse(is_search_flow=False)
|
||||
|
||||
llm = get_default_llm()
|
||||
|
||||
check_llm_cost_limit_for_provider(
|
||||
db_session=db_session,
|
||||
tenant_id=get_current_tenant_id(),
|
||||
llm_provider_api_key=llm.config.api_key,
|
||||
)
|
||||
|
||||
try:
|
||||
is_search_flow = classify_is_search_flow(query=query, llm=llm)
|
||||
except Exception as e:
|
||||
logger.exception(
|
||||
"Search flow classification failed; defaulting to chat flow",
|
||||
exc_info=e,
|
||||
)
|
||||
is_search_flow = False
|
||||
|
||||
return SearchFlowClassificationResponse(is_search_flow=is_search_flow)
|
||||
|
||||
|
||||
@router.post("/send-search-message", response_model=None)
|
||||
def handle_send_search_message(
|
||||
request: SendSearchQueryRequest,
|
||||
user: User | None = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> StreamingResponse | SearchFullResponse:
|
||||
"""
|
||||
Execute a search query with optional streaming.
|
||||
|
||||
When stream=True: Returns StreamingResponse with SSE
|
||||
When stream=False: Returns SearchFullResponse
|
||||
"""
|
||||
logger.debug(f"Received search query: {request.search_query}")
|
||||
|
||||
# Non-streaming path
|
||||
if not request.stream:
|
||||
try:
|
||||
packets = stream_search_query(request, user, db_session)
|
||||
return gather_search_stream(packets)
|
||||
except NotImplementedError as e:
|
||||
return SearchFullResponse(
|
||||
all_executed_queries=[],
|
||||
search_docs=[],
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
# Streaming path
|
||||
def stream_generator() -> Generator[str, None, None]:
|
||||
try:
|
||||
with get_session_with_current_tenant() as streaming_db_session:
|
||||
for packet in stream_search_query(request, user, streaming_db_session):
|
||||
yield get_json_line(packet.model_dump())
|
||||
except NotImplementedError as e:
|
||||
yield get_json_line(SearchErrorPacket(error=str(e)).model_dump())
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.exception("Error in search streaming")
|
||||
yield get_json_line(SearchErrorPacket(error=str(e)).model_dump())
|
||||
|
||||
return StreamingResponse(stream_generator(), media_type="text/event-stream")
|
||||
|
||||
|
||||
@router.get("/search-history")
|
||||
def get_search_history(
|
||||
limit: int = 100,
|
||||
filter_days: int | None = None,
|
||||
user: User | None = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> SearchHistoryResponse:
|
||||
"""
|
||||
Fetch past search queries for the authenticated user.
|
||||
|
||||
Args:
|
||||
limit: Maximum number of queries to return (default 100)
|
||||
filter_days: Only return queries from the last N days (optional)
|
||||
|
||||
Returns:
|
||||
SearchHistoryResponse with list of search queries, ordered by most recent first.
|
||||
"""
|
||||
# Validate limit
|
||||
if limit <= 0:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="limit must be greater than 0",
|
||||
)
|
||||
if limit > 1000:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="limit must be at most 1000",
|
||||
)
|
||||
|
||||
# Validate filter_days
|
||||
if filter_days is not None and filter_days <= 0:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="filter_days must be greater than 0",
|
||||
)
|
||||
|
||||
# TODO(yuhong) remove this
|
||||
if user is None:
|
||||
# Return empty list for unauthenticated users
|
||||
return SearchHistoryResponse(search_queries=[])
|
||||
|
||||
search_queries = fetch_search_queries_for_user(
|
||||
db_session=db_session,
|
||||
user_id=user.id,
|
||||
filter_days=filter_days,
|
||||
limit=limit,
|
||||
)
|
||||
|
||||
return SearchHistoryResponse(
|
||||
search_queries=[
|
||||
SearchQueryResponse(
|
||||
query=sq.query,
|
||||
query_expansions=sq.query_expansions,
|
||||
created_at=sq.created_at,
|
||||
)
|
||||
for sq in search_queries
|
||||
]
|
||||
)
|
||||
@@ -1,35 +0,0 @@
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import ConfigDict
|
||||
|
||||
from ee.onyx.server.query_and_chat.models import SearchDocWithContent
|
||||
|
||||
|
||||
class SearchQueriesPacket(BaseModel):
|
||||
model_config = ConfigDict(frozen=True)
|
||||
|
||||
type: Literal["search_queries"] = "search_queries"
|
||||
all_executed_queries: list[str]
|
||||
|
||||
|
||||
class SearchDocsPacket(BaseModel):
|
||||
model_config = ConfigDict(frozen=True)
|
||||
|
||||
type: Literal["search_docs"] = "search_docs"
|
||||
search_docs: list[SearchDocWithContent]
|
||||
|
||||
|
||||
class SearchErrorPacket(BaseModel):
|
||||
model_config = ConfigDict(frozen=True)
|
||||
|
||||
type: Literal["search_error"] = "search_error"
|
||||
error: str
|
||||
|
||||
|
||||
class LLMSelectedDocsPacket(BaseModel):
|
||||
model_config = ConfigDict(frozen=True)
|
||||
|
||||
type: Literal["llm_selected_docs"] = "llm_selected_docs"
|
||||
# None if LLM selection failed, empty list if no docs selected, list of IDs otherwise
|
||||
llm_selected_doc_ids: list[str] | None
|
||||
@@ -32,7 +32,6 @@ from onyx.configs.constants import MessageType
|
||||
from onyx.configs.constants import OnyxCeleryPriority
|
||||
from onyx.configs.constants import OnyxCeleryQueues
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.configs.constants import PUBLIC_API_TAGS
|
||||
from onyx.configs.constants import QAFeedbackType
|
||||
from onyx.configs.constants import QueryHistoryType
|
||||
from onyx.configs.constants import SessionType
|
||||
@@ -49,6 +48,7 @@ from onyx.file_store.file_store import get_default_file_store
|
||||
from onyx.server.documents.models import PaginatedReturn
|
||||
from onyx.server.query_and_chat.models import ChatSessionDetails
|
||||
from onyx.server.query_and_chat.models import ChatSessionsResponse
|
||||
from onyx.server.utils import PUBLIC_API_TAGS
|
||||
from onyx.utils.threadpool_concurrency import parallel_yield
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
"""EE Settings API - provides license-aware settings override."""
|
||||
|
||||
from redis.exceptions import RedisError
|
||||
|
||||
from ee.onyx.configs.app_configs import LICENSE_ENFORCEMENT_ENABLED
|
||||
from ee.onyx.db.license import get_cached_license_metadata
|
||||
from onyx.server.settings.models import ApplicationStatus
|
||||
from onyx.server.settings.models import Settings
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
# Statuses that indicate a billing/license problem - propagate these to settings
|
||||
_GATED_STATUSES = frozenset(
|
||||
{
|
||||
ApplicationStatus.GATED_ACCESS,
|
||||
ApplicationStatus.GRACE_PERIOD,
|
||||
ApplicationStatus.PAYMENT_REMINDER,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def apply_license_status_to_settings(settings: Settings) -> Settings:
|
||||
"""EE version: checks license status for self-hosted deployments.
|
||||
|
||||
For self-hosted, looks up license metadata and overrides application_status
|
||||
if the license is missing or indicates a problem (expired, grace period, etc.).
|
||||
|
||||
For multi-tenant (cloud), the settings already have the correct status
|
||||
from the control plane, so no override is needed.
|
||||
|
||||
If LICENSE_ENFORCEMENT_ENABLED is false, settings are returned unchanged,
|
||||
allowing the product to function normally without license checks.
|
||||
"""
|
||||
if not LICENSE_ENFORCEMENT_ENABLED:
|
||||
return settings
|
||||
|
||||
if MULTI_TENANT:
|
||||
return settings
|
||||
|
||||
tenant_id = get_current_tenant_id()
|
||||
try:
|
||||
metadata = get_cached_license_metadata(tenant_id)
|
||||
if metadata and metadata.status in _GATED_STATUSES:
|
||||
settings.application_status = metadata.status
|
||||
elif not metadata:
|
||||
# No license = gated access for self-hosted EE
|
||||
settings.application_status = ApplicationStatus.GATED_ACCESS
|
||||
except RedisError as e:
|
||||
logger.warning(f"Failed to check license metadata for settings: {e}")
|
||||
|
||||
return settings
|
||||
@@ -1,14 +1,10 @@
|
||||
"""Tenant-specific usage limit overrides from the control plane (EE version)."""
|
||||
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
from ee.onyx.server.tenants.access import generate_data_plane_token
|
||||
from onyx.configs.app_configs import CONTROL_PLANE_API_BASE_URL
|
||||
from onyx.configs.app_configs import DEV_MODE
|
||||
from onyx.server.tenant_usage_limits import TenantUsageLimitOverrides
|
||||
from onyx.server.usage_limits import NO_LIMIT
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
@@ -16,12 +12,9 @@ logger = setup_logger()
|
||||
|
||||
# In-memory storage for tenant overrides (populated at startup)
|
||||
_tenant_usage_limit_overrides: dict[str, TenantUsageLimitOverrides] | None = None
|
||||
_last_fetch_time: float = 0.0
|
||||
_FETCH_INTERVAL = 60 * 60 * 24 # 24 hours
|
||||
_ERROR_FETCH_INTERVAL = 30 * 60 # 30 minutes (if the last fetch failed)
|
||||
|
||||
|
||||
def fetch_usage_limit_overrides() -> dict[str, TenantUsageLimitOverrides] | None:
|
||||
def fetch_usage_limit_overrides() -> dict[str, TenantUsageLimitOverrides]:
|
||||
"""
|
||||
Fetch tenant-specific usage limit overrides from the control plane.
|
||||
|
||||
@@ -52,52 +45,33 @@ def fetch_usage_limit_overrides() -> dict[str, TenantUsageLimitOverrides] | None
|
||||
f"Failed to parse usage limit overrides for tenant {tenant_id}: {e}"
|
||||
)
|
||||
|
||||
return (
|
||||
result or None
|
||||
) # if empty dictionary, something went wrong and we shouldn't enforce limits
|
||||
return result
|
||||
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.warning(f"Failed to fetch usage limit overrides from control plane: {e}")
|
||||
return None
|
||||
return {}
|
||||
except Exception as e:
|
||||
logger.error(f"Error parsing usage limit overrides: {e}")
|
||||
return None
|
||||
return {}
|
||||
|
||||
|
||||
def load_usage_limit_overrides() -> None:
|
||||
def load_usage_limit_overrides() -> dict[str, TenantUsageLimitOverrides]:
|
||||
"""
|
||||
Load tenant usage limit overrides from the control plane.
|
||||
|
||||
Called at server startup to populate the in-memory cache.
|
||||
"""
|
||||
global _tenant_usage_limit_overrides
|
||||
global _last_fetch_time
|
||||
|
||||
logger.info("Loading tenant usage limit overrides from control plane...")
|
||||
overrides = fetch_usage_limit_overrides()
|
||||
|
||||
_last_fetch_time = time.time()
|
||||
|
||||
# use the new result if it exists, otherwise use the old result
|
||||
# (prevents us from updating to a failed fetch result)
|
||||
_tenant_usage_limit_overrides = overrides or _tenant_usage_limit_overrides
|
||||
_tenant_usage_limit_overrides = overrides
|
||||
|
||||
if overrides:
|
||||
logger.info(f"Loaded usage limit overrides for {len(overrides)} tenants")
|
||||
else:
|
||||
logger.info("No tenant-specific usage limit overrides found")
|
||||
|
||||
|
||||
def unlimited(tenant_id: str) -> TenantUsageLimitOverrides:
|
||||
return TenantUsageLimitOverrides(
|
||||
tenant_id=tenant_id,
|
||||
llm_cost_cents_trial=NO_LIMIT,
|
||||
llm_cost_cents_paid=NO_LIMIT,
|
||||
chunks_indexed_trial=NO_LIMIT,
|
||||
chunks_indexed_paid=NO_LIMIT,
|
||||
api_calls_trial=NO_LIMIT,
|
||||
api_calls_paid=NO_LIMIT,
|
||||
non_streaming_calls_trial=NO_LIMIT,
|
||||
non_streaming_calls_paid=NO_LIMIT,
|
||||
)
|
||||
return overrides
|
||||
|
||||
|
||||
def get_tenant_usage_limit_overrides(
|
||||
@@ -112,22 +86,7 @@ def get_tenant_usage_limit_overrides(
|
||||
Returns:
|
||||
TenantUsageLimitOverrides if the tenant has overrides, None otherwise.
|
||||
"""
|
||||
|
||||
if DEV_MODE: # in dev mode, we return unlimited limits for all tenants
|
||||
return unlimited(tenant_id)
|
||||
|
||||
global _tenant_usage_limit_overrides
|
||||
time_since = time.time() - _last_fetch_time
|
||||
if (
|
||||
_tenant_usage_limit_overrides is None and time_since > _ERROR_FETCH_INTERVAL
|
||||
) or (time_since > _FETCH_INTERVAL):
|
||||
logger.debug(
|
||||
f"Last fetch time: {_last_fetch_time}, time since last fetch: {time_since}"
|
||||
)
|
||||
|
||||
load_usage_limit_overrides()
|
||||
|
||||
# If we have failed to fetch from the control plane or we're in dev mode, don't usage limit anyone.
|
||||
if _tenant_usage_limit_overrides is None or DEV_MODE:
|
||||
return unlimited(tenant_id)
|
||||
if _tenant_usage_limit_overrides is None:
|
||||
_tenant_usage_limit_overrides = load_usage_limit_overrides()
|
||||
return _tenant_usage_limit_overrides.get(tenant_id)
|
||||
|
||||
@@ -76,26 +76,6 @@ def fetch_billing_information(
|
||||
return BillingInformation(**response_data)
|
||||
|
||||
|
||||
def fetch_customer_portal_session(tenant_id: str, return_url: str | None = None) -> str:
|
||||
"""
|
||||
Fetch a Stripe customer portal session URL from the control plane.
|
||||
NOTE: This is currently only used for multi-tenant (cloud) deployments.
|
||||
Self-hosted proxy endpoints will be added in a future phase.
|
||||
"""
|
||||
token = generate_data_plane_token()
|
||||
headers = {
|
||||
"Authorization": f"Bearer {token}",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
url = f"{CONTROL_PLANE_API_BASE_URL}/create-customer-portal-session"
|
||||
payload = {"tenant_id": tenant_id}
|
||||
if return_url:
|
||||
payload["return_url"] = return_url
|
||||
response = requests.post(url, headers=headers, json=payload)
|
||||
response.raise_for_status()
|
||||
return response.json()["url"]
|
||||
|
||||
|
||||
def register_tenant_users(tenant_id: str, number_of_users: int) -> stripe.Subscription:
|
||||
"""
|
||||
Update the number of seats for a tenant's subscription.
|
||||
|
||||
@@ -1,41 +1,34 @@
|
||||
import asyncio
|
||||
|
||||
import httpx
|
||||
import stripe
|
||||
from fastapi import APIRouter
|
||||
from fastapi import Depends
|
||||
from fastapi import HTTPException
|
||||
|
||||
from ee.onyx.auth.users import current_admin_user
|
||||
from ee.onyx.configs.app_configs import STRIPE_SECRET_KEY
|
||||
from ee.onyx.server.tenants.access import control_plane_dep
|
||||
from ee.onyx.server.tenants.billing import fetch_billing_information
|
||||
from ee.onyx.server.tenants.billing import fetch_customer_portal_session
|
||||
from ee.onyx.server.tenants.billing import fetch_stripe_checkout_session
|
||||
from ee.onyx.server.tenants.billing import fetch_tenant_stripe_information
|
||||
from ee.onyx.server.tenants.models import BillingInformation
|
||||
from ee.onyx.server.tenants.models import CreateSubscriptionSessionRequest
|
||||
from ee.onyx.server.tenants.models import ProductGatingFullSyncRequest
|
||||
from ee.onyx.server.tenants.models import ProductGatingRequest
|
||||
from ee.onyx.server.tenants.models import ProductGatingResponse
|
||||
from ee.onyx.server.tenants.models import StripePublishableKeyResponse
|
||||
from ee.onyx.server.tenants.models import SubscriptionSessionResponse
|
||||
from ee.onyx.server.tenants.models import SubscriptionStatusResponse
|
||||
from ee.onyx.server.tenants.product_gating import overwrite_full_gated_set
|
||||
from ee.onyx.server.tenants.product_gating import store_product_gating
|
||||
from onyx.auth.users import User
|
||||
from onyx.configs.app_configs import STRIPE_PUBLISHABLE_KEY_OVERRIDE
|
||||
from onyx.configs.app_configs import STRIPE_PUBLISHABLE_KEY_URL
|
||||
from onyx.configs.app_configs import WEB_DOMAIN
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
stripe.api_key = STRIPE_SECRET_KEY
|
||||
logger = setup_logger()
|
||||
|
||||
router = APIRouter(prefix="/tenants")
|
||||
|
||||
# Cache for Stripe publishable key to avoid hitting S3 on every request
|
||||
_stripe_publishable_key_cache: str | None = None
|
||||
_stripe_key_lock = asyncio.Lock()
|
||||
|
||||
|
||||
@router.post("/product-gating")
|
||||
def gate_product(
|
||||
@@ -90,17 +83,21 @@ async def billing_information(
|
||||
async def create_customer_portal_session(
|
||||
_: User = Depends(current_admin_user),
|
||||
) -> dict:
|
||||
"""
|
||||
Create a Stripe customer portal session via the control plane.
|
||||
NOTE: This is currently only used for multi-tenant (cloud) deployments.
|
||||
Self-hosted proxy endpoints will be added in a future phase.
|
||||
"""
|
||||
tenant_id = get_current_tenant_id()
|
||||
return_url = f"{WEB_DOMAIN}/admin/billing"
|
||||
|
||||
try:
|
||||
portal_url = fetch_customer_portal_session(tenant_id, return_url)
|
||||
return {"url": portal_url}
|
||||
stripe_info = fetch_tenant_stripe_information(tenant_id)
|
||||
stripe_customer_id = stripe_info.get("stripe_customer_id")
|
||||
if not stripe_customer_id:
|
||||
raise HTTPException(status_code=400, detail="Stripe customer ID not found")
|
||||
logger.info(stripe_customer_id)
|
||||
|
||||
portal_session = stripe.billing_portal.Session.create(
|
||||
customer=stripe_customer_id,
|
||||
return_url=f"{WEB_DOMAIN}/admin/billing",
|
||||
)
|
||||
logger.info(portal_session)
|
||||
return {"url": portal_session.url}
|
||||
except Exception as e:
|
||||
logger.exception("Failed to create customer portal session")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
@@ -123,67 +120,3 @@ async def create_subscription_session(
|
||||
except Exception as e:
|
||||
logger.exception("Failed to create subscription session")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/stripe-publishable-key")
|
||||
async def get_stripe_publishable_key() -> StripePublishableKeyResponse:
|
||||
"""
|
||||
Fetch the Stripe publishable key.
|
||||
Priority: env var override (for testing) > S3 bucket (production).
|
||||
This endpoint is public (no auth required) since publishable keys are safe to expose.
|
||||
The key is cached in memory to avoid hitting S3 on every request.
|
||||
"""
|
||||
global _stripe_publishable_key_cache
|
||||
|
||||
# Fast path: return cached value without lock
|
||||
if _stripe_publishable_key_cache:
|
||||
return StripePublishableKeyResponse(
|
||||
publishable_key=_stripe_publishable_key_cache
|
||||
)
|
||||
|
||||
# Use lock to prevent concurrent S3 requests
|
||||
async with _stripe_key_lock:
|
||||
# Double-check after acquiring lock (another request may have populated cache)
|
||||
if _stripe_publishable_key_cache:
|
||||
return StripePublishableKeyResponse(
|
||||
publishable_key=_stripe_publishable_key_cache
|
||||
)
|
||||
|
||||
# Check for env var override first (for local testing with pk_test_* keys)
|
||||
if STRIPE_PUBLISHABLE_KEY_OVERRIDE:
|
||||
key = STRIPE_PUBLISHABLE_KEY_OVERRIDE.strip()
|
||||
if not key.startswith("pk_"):
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail="Invalid Stripe publishable key format",
|
||||
)
|
||||
_stripe_publishable_key_cache = key
|
||||
return StripePublishableKeyResponse(publishable_key=key)
|
||||
|
||||
# Fall back to S3 bucket
|
||||
if not STRIPE_PUBLISHABLE_KEY_URL:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail="Stripe publishable key is not configured",
|
||||
)
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.get(STRIPE_PUBLISHABLE_KEY_URL)
|
||||
response.raise_for_status()
|
||||
key = response.text.strip()
|
||||
|
||||
# Validate key format
|
||||
if not key.startswith("pk_"):
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail="Invalid Stripe publishable key format",
|
||||
)
|
||||
|
||||
_stripe_publishable_key_cache = key
|
||||
return StripePublishableKeyResponse(publishable_key=key)
|
||||
except httpx.HTTPError:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail="Failed to fetch Stripe publishable key",
|
||||
)
|
||||
|
||||
@@ -105,7 +105,3 @@ class PendingUserSnapshot(BaseModel):
|
||||
|
||||
class ApproveUserRequest(BaseModel):
|
||||
email: str
|
||||
|
||||
|
||||
class StripePublishableKeyResponse(BaseModel):
|
||||
publishable_key: str
|
||||
|
||||
@@ -65,9 +65,3 @@ def get_gated_tenants() -> set[str]:
|
||||
redis_client = get_redis_replica_client(tenant_id=ONYX_CLOUD_TENANT_ID)
|
||||
gated_tenants_bytes = cast(set[bytes], redis_client.smembers(GATED_TENANTS_KEY))
|
||||
return {tenant_id.decode("utf-8") for tenant_id in gated_tenants_bytes}
|
||||
|
||||
|
||||
def is_tenant_gated(tenant_id: str) -> bool:
|
||||
"""Fast O(1) check if tenant is in gated set (multi-tenant only)."""
|
||||
redis_client = get_redis_replica_client(tenant_id=ONYX_CLOUD_TENANT_ID)
|
||||
return bool(redis_client.sismember(GATED_TENANTS_KEY, tenant_id))
|
||||
|
||||
@@ -9,7 +9,6 @@ from ee.onyx.db.token_limit import fetch_user_group_token_rate_limits_for_user
|
||||
from ee.onyx.db.token_limit import insert_user_group_token_rate_limit
|
||||
from onyx.auth.users import current_admin_user
|
||||
from onyx.auth.users import current_curator_or_admin_user
|
||||
from onyx.configs.constants import PUBLIC_API_TAGS
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.models import User
|
||||
from onyx.db.token_limit import fetch_all_user_token_rate_limits
|
||||
@@ -17,6 +16,7 @@ from onyx.db.token_limit import insert_user_token_rate_limit
|
||||
from onyx.server.query_and_chat.token_limit import any_rate_limit_exists
|
||||
from onyx.server.token_rate_limits.models import TokenRateLimitArgs
|
||||
from onyx.server.token_rate_limits.models import TokenRateLimitDisplay
|
||||
from onyx.server.utils import PUBLIC_API_TAGS
|
||||
|
||||
router = APIRouter(prefix="/admin/token-rate-limits", tags=PUBLIC_API_TAGS)
|
||||
|
||||
|
||||
@@ -18,10 +18,10 @@ from ee.onyx.server.user_group.models import UserGroupCreate
|
||||
from ee.onyx.server.user_group.models import UserGroupUpdate
|
||||
from onyx.auth.users import current_admin_user
|
||||
from onyx.auth.users import current_curator_or_admin_user
|
||||
from onyx.configs.constants import PUBLIC_API_TAGS
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.models import User
|
||||
from onyx.db.models import UserRole
|
||||
from onyx.server.utils import PUBLIC_API_TAGS
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
@@ -11,7 +11,6 @@ from typing import Any
|
||||
from typing import cast
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
from typing import Literal
|
||||
from typing import Optional
|
||||
from typing import Protocol
|
||||
from typing import Tuple
|
||||
@@ -1457,9 +1456,6 @@ def get_default_admin_user_emails_() -> list[str]:
|
||||
|
||||
|
||||
STATE_TOKEN_AUDIENCE = "fastapi-users:oauth-state"
|
||||
STATE_TOKEN_LIFETIME_SECONDS = 3600
|
||||
CSRF_TOKEN_KEY = "csrftoken"
|
||||
CSRF_TOKEN_COOKIE_NAME = "fastapiusersoauthcsrf"
|
||||
|
||||
|
||||
class OAuth2AuthorizeResponse(BaseModel):
|
||||
@@ -1467,19 +1463,13 @@ class OAuth2AuthorizeResponse(BaseModel):
|
||||
|
||||
|
||||
def generate_state_token(
|
||||
data: Dict[str, str],
|
||||
secret: SecretType,
|
||||
lifetime_seconds: int = STATE_TOKEN_LIFETIME_SECONDS,
|
||||
data: Dict[str, str], secret: SecretType, lifetime_seconds: int = 3600
|
||||
) -> str:
|
||||
data["aud"] = STATE_TOKEN_AUDIENCE
|
||||
|
||||
return generate_jwt(data, secret, lifetime_seconds)
|
||||
|
||||
|
||||
def generate_csrf_token() -> str:
|
||||
return secrets.token_urlsafe(32)
|
||||
|
||||
|
||||
# refer to https://github.com/fastapi-users/fastapi-users/blob/42ddc241b965475390e2bce887b084152ae1a2cd/fastapi_users/fastapi_users.py#L91
|
||||
def create_onyx_oauth_router(
|
||||
oauth_client: BaseOAuth2,
|
||||
@@ -1508,13 +1498,6 @@ def get_oauth_router(
|
||||
redirect_url: Optional[str] = None,
|
||||
associate_by_email: bool = False,
|
||||
is_verified_by_default: bool = False,
|
||||
*,
|
||||
csrf_token_cookie_name: str = CSRF_TOKEN_COOKIE_NAME,
|
||||
csrf_token_cookie_path: str = "/",
|
||||
csrf_token_cookie_domain: Optional[str] = None,
|
||||
csrf_token_cookie_secure: Optional[bool] = None,
|
||||
csrf_token_cookie_httponly: bool = True,
|
||||
csrf_token_cookie_samesite: Optional[Literal["lax", "strict", "none"]] = "lax",
|
||||
) -> APIRouter:
|
||||
"""Generate a router with the OAuth routes."""
|
||||
router = APIRouter()
|
||||
@@ -1531,9 +1514,6 @@ def get_oauth_router(
|
||||
route_name=callback_route_name,
|
||||
)
|
||||
|
||||
if csrf_token_cookie_secure is None:
|
||||
csrf_token_cookie_secure = WEB_DOMAIN.startswith("https")
|
||||
|
||||
@router.get(
|
||||
"/authorize",
|
||||
name=f"oauth:{oauth_client.name}.{backend.name}.authorize",
|
||||
@@ -1541,10 +1521,8 @@ def get_oauth_router(
|
||||
)
|
||||
async def authorize(
|
||||
request: Request,
|
||||
response: Response,
|
||||
redirect: bool = Query(False),
|
||||
scopes: List[str] = Query(None),
|
||||
) -> Response | OAuth2AuthorizeResponse:
|
||||
) -> OAuth2AuthorizeResponse:
|
||||
referral_source = request.cookies.get("referral_source", None)
|
||||
|
||||
if redirect_url is not None:
|
||||
@@ -1554,11 +1532,9 @@ def get_oauth_router(
|
||||
|
||||
next_url = request.query_params.get("next", "/")
|
||||
|
||||
csrf_token = generate_csrf_token()
|
||||
state_data: Dict[str, str] = {
|
||||
"next_url": next_url,
|
||||
"referral_source": referral_source or "default_referral",
|
||||
CSRF_TOKEN_KEY: csrf_token,
|
||||
}
|
||||
state = generate_state_token(state_data, state_secret)
|
||||
|
||||
@@ -1575,31 +1551,6 @@ def get_oauth_router(
|
||||
authorization_url, {"access_type": "offline", "prompt": "consent"}
|
||||
)
|
||||
|
||||
if redirect:
|
||||
redirect_response = RedirectResponse(authorization_url, status_code=302)
|
||||
redirect_response.set_cookie(
|
||||
key=csrf_token_cookie_name,
|
||||
value=csrf_token,
|
||||
max_age=STATE_TOKEN_LIFETIME_SECONDS,
|
||||
path=csrf_token_cookie_path,
|
||||
domain=csrf_token_cookie_domain,
|
||||
secure=csrf_token_cookie_secure,
|
||||
httponly=csrf_token_cookie_httponly,
|
||||
samesite=csrf_token_cookie_samesite,
|
||||
)
|
||||
return redirect_response
|
||||
|
||||
response.set_cookie(
|
||||
key=csrf_token_cookie_name,
|
||||
value=csrf_token,
|
||||
max_age=STATE_TOKEN_LIFETIME_SECONDS,
|
||||
path=csrf_token_cookie_path,
|
||||
domain=csrf_token_cookie_domain,
|
||||
secure=csrf_token_cookie_secure,
|
||||
httponly=csrf_token_cookie_httponly,
|
||||
samesite=csrf_token_cookie_samesite,
|
||||
)
|
||||
|
||||
return OAuth2AuthorizeResponse(authorization_url=authorization_url)
|
||||
|
||||
@log_function_time(print_only=True)
|
||||
@@ -1649,33 +1600,7 @@ def get_oauth_router(
|
||||
try:
|
||||
state_data = decode_jwt(state, state_secret, [STATE_TOKEN_AUDIENCE])
|
||||
except jwt.DecodeError:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=getattr(
|
||||
ErrorCode, "ACCESS_TOKEN_DECODE_ERROR", "ACCESS_TOKEN_DECODE_ERROR"
|
||||
),
|
||||
)
|
||||
except jwt.ExpiredSignatureError:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=getattr(
|
||||
ErrorCode,
|
||||
"ACCESS_TOKEN_ALREADY_EXPIRED",
|
||||
"ACCESS_TOKEN_ALREADY_EXPIRED",
|
||||
),
|
||||
)
|
||||
|
||||
cookie_csrf_token = request.cookies.get(csrf_token_cookie_name)
|
||||
state_csrf_token = state_data.get(CSRF_TOKEN_KEY)
|
||||
if (
|
||||
not cookie_csrf_token
|
||||
or not state_csrf_token
|
||||
or not secrets.compare_digest(cookie_csrf_token, state_csrf_token)
|
||||
):
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=getattr(ErrorCode, "OAUTH_INVALID_STATE", "OAUTH_INVALID_STATE"),
|
||||
)
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST)
|
||||
|
||||
next_url = state_data.get("next_url", "/")
|
||||
referral_source = state_data.get("referral_source", None)
|
||||
|
||||
@@ -26,13 +26,10 @@ from onyx.background.celery.celery_utils import celery_is_worker_primary
|
||||
from onyx.background.celery.celery_utils import make_probe_path
|
||||
from onyx.background.celery.tasks.vespa.document_sync import DOCUMENT_SYNC_PREFIX
|
||||
from onyx.background.celery.tasks.vespa.document_sync import DOCUMENT_SYNC_TASKSET_KEY
|
||||
from onyx.configs.app_configs import ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
|
||||
from onyx.configs.app_configs import ENABLE_OPENSEARCH_FOR_ONYX
|
||||
from onyx.configs.constants import ONYX_CLOUD_CELERY_TASK_PREFIX
|
||||
from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.db.engine.sql_engine import get_sqlalchemy_engine
|
||||
from onyx.document_index.opensearch.client import (
|
||||
wait_for_opensearch_with_timeout,
|
||||
)
|
||||
from onyx.document_index.vespa.shared_utils.utils import wait_for_vespa_with_timeout
|
||||
from onyx.httpx.httpx_pool import HttpxPool
|
||||
from onyx.redis.redis_connector import RedisConnector
|
||||
@@ -519,17 +516,14 @@ def wait_for_vespa_or_shutdown(sender: Any, **kwargs: Any) -> None:
|
||||
"""Waits for Vespa to become ready subject to a timeout.
|
||||
Raises WorkerShutdown if the timeout is reached."""
|
||||
|
||||
if ENABLE_OPENSEARCH_FOR_ONYX:
|
||||
return
|
||||
|
||||
if not wait_for_vespa_with_timeout():
|
||||
msg = "[Vespa] Readiness probe did not succeed within the timeout. Exiting..."
|
||||
msg = "Vespa: Readiness probe did not succeed within the timeout. Exiting..."
|
||||
logger.error(msg)
|
||||
raise WorkerShutdown(msg)
|
||||
|
||||
if ENABLE_OPENSEARCH_INDEXING_FOR_ONYX:
|
||||
if not wait_for_opensearch_with_timeout():
|
||||
msg = "[OpenSearch] Readiness probe did not succeed within the timeout. Exiting..."
|
||||
logger.error(msg)
|
||||
raise WorkerShutdown(msg)
|
||||
|
||||
|
||||
# File for validating worker liveness
|
||||
class LivenessProbe(bootsteps.StartStopStep):
|
||||
|
||||
@@ -87,7 +87,7 @@ from onyx.db.models import SearchSettings
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.db.search_settings import get_secondary_search_settings
|
||||
from onyx.db.swap_index import check_and_perform_index_swap
|
||||
from onyx.document_index.factory import get_all_document_indices
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.file_store.document_batch_storage import DocumentBatchStorage
|
||||
from onyx.file_store.document_batch_storage import get_document_batch_storage
|
||||
from onyx.httpx.httpx_pool import HttpxPool
|
||||
@@ -1436,7 +1436,7 @@ def _docprocessing_task(
|
||||
callback=callback,
|
||||
)
|
||||
|
||||
document_indices = get_all_document_indices(
|
||||
document_index = get_default_document_index(
|
||||
index_attempt.search_settings,
|
||||
None,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
@@ -1473,7 +1473,7 @@ def _docprocessing_task(
|
||||
# real work happens here!
|
||||
index_pipeline_result = run_indexing_pipeline(
|
||||
embedder=embedding_model,
|
||||
document_indices=document_indices,
|
||||
document_index=document_index,
|
||||
ignore_time_skip=True, # Documents are already filtered during extraction
|
||||
db_session=db_session,
|
||||
tenant_id=tenant_id,
|
||||
|
||||
@@ -25,7 +25,7 @@ from onyx.db.document_set import fetch_document_sets_for_document
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.db.relationships import delete_document_references_from_kg
|
||||
from onyx.db.search_settings import get_active_search_settings
|
||||
from onyx.document_index.factory import get_all_document_indices
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.interfaces import VespaDocumentFields
|
||||
from onyx.httpx.httpx_pool import HttpxPool
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
@@ -97,17 +97,13 @@ def document_by_cc_pair_cleanup_task(
|
||||
action = "skip"
|
||||
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
# This flow is for updates and deletion so we get all indices.
|
||||
document_indices = get_all_document_indices(
|
||||
doc_index = get_default_document_index(
|
||||
active_search_settings.primary,
|
||||
active_search_settings.secondary,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
)
|
||||
|
||||
retry_document_indices: list[RetryDocumentIndex] = [
|
||||
RetryDocumentIndex(document_index)
|
||||
for document_index in document_indices
|
||||
]
|
||||
retry_index = RetryDocumentIndex(doc_index)
|
||||
|
||||
count = get_document_connector_count(db_session, document_id)
|
||||
if count == 1:
|
||||
@@ -117,12 +113,11 @@ def document_by_cc_pair_cleanup_task(
|
||||
|
||||
chunk_count = fetch_chunk_count_for_document(document_id, db_session)
|
||||
|
||||
for retry_document_index in retry_document_indices:
|
||||
_ = retry_document_index.delete_single(
|
||||
document_id,
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=chunk_count,
|
||||
)
|
||||
_ = retry_index.delete_single(
|
||||
document_id,
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=chunk_count,
|
||||
)
|
||||
|
||||
delete_document_references_from_kg(
|
||||
db_session=db_session,
|
||||
@@ -160,18 +155,14 @@ def document_by_cc_pair_cleanup_task(
|
||||
hidden=doc.hidden,
|
||||
)
|
||||
|
||||
for retry_document_index in retry_document_indices:
|
||||
# TODO(andrei): Previously there was a comment here saying
|
||||
# it was ok if a doc did not exist in the document index. I
|
||||
# don't agree with that claim, so keep an eye on this task
|
||||
# to see if this raises.
|
||||
retry_document_index.update_single(
|
||||
document_id,
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=doc.chunk_count,
|
||||
fields=fields,
|
||||
user_fields=None,
|
||||
)
|
||||
# update Vespa. OK if doc doesn't exist. Raises exception otherwise.
|
||||
retry_index.update_single(
|
||||
document_id,
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=doc.chunk_count,
|
||||
fields=fields,
|
||||
user_fields=None,
|
||||
)
|
||||
|
||||
# there are still other cc_pair references to the doc, so just resync to Vespa
|
||||
delete_document_by_connector_credential_pair__no_commit(
|
||||
|
||||
@@ -12,6 +12,7 @@ from retry import retry
|
||||
from sqlalchemy import select
|
||||
|
||||
from onyx.background.celery.apps.app_base import task_logger
|
||||
from onyx.background.celery.celery_redis import celery_get_queue_length
|
||||
from onyx.background.celery.celery_utils import httpx_init_vespa_pool
|
||||
from onyx.background.celery.tasks.shared.RetryDocumentIndex import RetryDocumentIndex
|
||||
from onyx.configs.app_configs import MANAGED_VESPA
|
||||
@@ -19,12 +20,14 @@ from onyx.configs.app_configs import VESPA_CLOUD_CERT_PATH
|
||||
from onyx.configs.app_configs import VESPA_CLOUD_KEY_PATH
|
||||
from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
|
||||
from onyx.configs.constants import CELERY_USER_FILE_PROCESSING_LOCK_TIMEOUT
|
||||
from onyx.configs.constants import CELERY_USER_FILE_PROCESSING_TASK_EXPIRES
|
||||
from onyx.configs.constants import CELERY_USER_FILE_PROJECT_SYNC_LOCK_TIMEOUT
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.configs.constants import OnyxCeleryPriority
|
||||
from onyx.configs.constants import OnyxCeleryQueues
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.configs.constants import USER_FILE_PROCESSING_MAX_QUEUE_DEPTH
|
||||
from onyx.connectors.file.connector import LocalFileConnector
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
@@ -32,7 +35,7 @@ from onyx.db.enums import UserFileStatus
|
||||
from onyx.db.models import UserFile
|
||||
from onyx.db.search_settings import get_active_search_settings
|
||||
from onyx.db.search_settings import get_active_search_settings_list
|
||||
from onyx.document_index.factory import get_all_document_indices
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.interfaces import VespaDocumentUserFields
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
|
||||
from onyx.file_store.file_store import get_default_file_store
|
||||
@@ -53,6 +56,17 @@ def _user_file_lock_key(user_file_id: str | UUID) -> str:
|
||||
return f"{OnyxRedisLocks.USER_FILE_PROCESSING_LOCK_PREFIX}:{user_file_id}"
|
||||
|
||||
|
||||
def _user_file_queued_key(user_file_id: str | UUID) -> str:
|
||||
"""Key that exists while a process_single_user_file task is sitting in the queue.
|
||||
|
||||
The beat generator sets this with a TTL equal to CELERY_USER_FILE_PROCESSING_TASK_EXPIRES
|
||||
before enqueuing and the worker deletes it as its first action. This prevents
|
||||
the beat from adding duplicate tasks for files that already have a live task
|
||||
in flight.
|
||||
"""
|
||||
return f"{OnyxRedisLocks.USER_FILE_QUEUED_PREFIX}:{user_file_id}"
|
||||
|
||||
|
||||
def _user_file_project_sync_lock_key(user_file_id: str | UUID) -> str:
|
||||
return f"{OnyxRedisLocks.USER_FILE_PROJECT_SYNC_LOCK_PREFIX}:{user_file_id}"
|
||||
|
||||
@@ -116,7 +130,24 @@ def _get_document_chunk_count(
|
||||
def check_user_file_processing(self: Task, *, tenant_id: str) -> None:
|
||||
"""Scan for user files with PROCESSING status and enqueue per-file tasks.
|
||||
|
||||
Uses direct Redis locks to avoid overlapping runs.
|
||||
Three mechanisms prevent queue runaway:
|
||||
|
||||
1. **Queue depth backpressure** – if the broker queue already has more than
|
||||
USER_FILE_PROCESSING_MAX_QUEUE_DEPTH items we skip this beat cycle
|
||||
entirely. Workers are clearly behind; adding more tasks would only make
|
||||
the backlog worse.
|
||||
|
||||
2. **Per-file queued guard** – before enqueuing a task we set a short-lived
|
||||
Redis key (TTL = CELERY_USER_FILE_PROCESSING_TASK_EXPIRES). If that key
|
||||
already exists the file already has a live task in the queue, so we skip
|
||||
it. The worker deletes the key the moment it picks up the task so the
|
||||
next beat cycle can re-enqueue if the file is still PROCESSING.
|
||||
|
||||
3. **Task expiry** – every enqueued task carries an `expires` value equal to
|
||||
CELERY_USER_FILE_PROCESSING_TASK_EXPIRES. If a task is still sitting in
|
||||
the queue after that deadline, Celery discards it without touching the DB.
|
||||
This is a belt-and-suspenders defence: even if the guard key is lost (e.g.
|
||||
Redis restart), stale tasks evict themselves rather than piling up forever.
|
||||
"""
|
||||
task_logger.info("check_user_file_processing - Starting")
|
||||
|
||||
@@ -131,7 +162,21 @@ def check_user_file_processing(self: Task, *, tenant_id: str) -> None:
|
||||
return None
|
||||
|
||||
enqueued = 0
|
||||
skipped_guard = 0
|
||||
try:
|
||||
# --- Protection 1: queue depth backpressure ---
|
||||
r_celery = self.app.broker_connection().channel().client # type: ignore
|
||||
queue_len = celery_get_queue_length(
|
||||
OnyxCeleryQueues.USER_FILE_PROCESSING, r_celery
|
||||
)
|
||||
if queue_len > USER_FILE_PROCESSING_MAX_QUEUE_DEPTH:
|
||||
task_logger.warning(
|
||||
f"check_user_file_processing - Queue depth {queue_len} exceeds "
|
||||
f"{USER_FILE_PROCESSING_MAX_QUEUE_DEPTH}, skipping enqueue for "
|
||||
f"tenant={tenant_id}"
|
||||
)
|
||||
return None
|
||||
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
user_file_ids = (
|
||||
db_session.execute(
|
||||
@@ -144,12 +189,35 @@ def check_user_file_processing(self: Task, *, tenant_id: str) -> None:
|
||||
)
|
||||
|
||||
for user_file_id in user_file_ids:
|
||||
self.app.send_task(
|
||||
OnyxCeleryTask.PROCESS_SINGLE_USER_FILE,
|
||||
kwargs={"user_file_id": str(user_file_id), "tenant_id": tenant_id},
|
||||
queue=OnyxCeleryQueues.USER_FILE_PROCESSING,
|
||||
priority=OnyxCeleryPriority.HIGH,
|
||||
# --- Protection 2: per-file queued guard ---
|
||||
queued_key = _user_file_queued_key(user_file_id)
|
||||
guard_set = redis_client.set(
|
||||
queued_key,
|
||||
1,
|
||||
ex=CELERY_USER_FILE_PROCESSING_TASK_EXPIRES,
|
||||
nx=True,
|
||||
)
|
||||
if not guard_set:
|
||||
skipped_guard += 1
|
||||
continue
|
||||
|
||||
# --- Protection 3: task expiry ---
|
||||
# If task submission fails, clear the guard immediately so the
|
||||
# next beat cycle can retry enqueuing this file.
|
||||
try:
|
||||
self.app.send_task(
|
||||
OnyxCeleryTask.PROCESS_SINGLE_USER_FILE,
|
||||
kwargs={
|
||||
"user_file_id": str(user_file_id),
|
||||
"tenant_id": tenant_id,
|
||||
},
|
||||
queue=OnyxCeleryQueues.USER_FILE_PROCESSING,
|
||||
priority=OnyxCeleryPriority.HIGH,
|
||||
expires=CELERY_USER_FILE_PROCESSING_TASK_EXPIRES,
|
||||
)
|
||||
except Exception:
|
||||
redis_client.delete(queued_key)
|
||||
raise
|
||||
enqueued += 1
|
||||
|
||||
finally:
|
||||
@@ -157,7 +225,8 @@ def check_user_file_processing(self: Task, *, tenant_id: str) -> None:
|
||||
lock.release()
|
||||
|
||||
task_logger.info(
|
||||
f"check_user_file_processing - Enqueued {enqueued} tasks for tenant={tenant_id}"
|
||||
f"check_user_file_processing - Enqueued {enqueued} skipped_guard={skipped_guard} "
|
||||
f"tasks for tenant={tenant_id}"
|
||||
)
|
||||
return None
|
||||
|
||||
@@ -172,6 +241,12 @@ def process_single_user_file(self: Task, *, user_file_id: str, tenant_id: str) -
|
||||
start = time.monotonic()
|
||||
|
||||
redis_client = get_redis_client(tenant_id=tenant_id)
|
||||
|
||||
# Clear the "queued" guard set by the beat generator so that the next beat
|
||||
# cycle can re-enqueue this file if it is still in PROCESSING state after
|
||||
# this task completes or fails.
|
||||
redis_client.delete(_user_file_queued_key(user_file_id))
|
||||
|
||||
file_lock: RedisLock = redis_client.lock(
|
||||
_user_file_lock_key(user_file_id),
|
||||
timeout=CELERY_USER_FILE_PROCESSING_LOCK_TIMEOUT,
|
||||
@@ -244,8 +319,7 @@ def process_single_user_file(self: Task, *, user_file_id: str, tenant_id: str) -
|
||||
search_settings=current_search_settings,
|
||||
)
|
||||
|
||||
# This flow is for indexing so we get all indices.
|
||||
document_indices = get_all_document_indices(
|
||||
document_index = get_default_document_index(
|
||||
current_search_settings,
|
||||
None,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
@@ -259,7 +333,7 @@ def process_single_user_file(self: Task, *, user_file_id: str, tenant_id: str) -
|
||||
# real work happens here!
|
||||
index_pipeline_result = run_indexing_pipeline(
|
||||
embedder=embedding_model,
|
||||
document_indices=document_indices,
|
||||
document_index=document_index,
|
||||
ignore_time_skip=True,
|
||||
db_session=db_session,
|
||||
tenant_id=tenant_id,
|
||||
@@ -413,16 +487,12 @@ def process_single_user_file_delete(
|
||||
httpx_init_vespa_pool(20)
|
||||
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
# This flow is for deletion so we get all indices.
|
||||
document_indices = get_all_document_indices(
|
||||
document_index = get_default_document_index(
|
||||
search_settings=active_search_settings.primary,
|
||||
secondary_search_settings=active_search_settings.secondary,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
)
|
||||
retry_document_indices: list[RetryDocumentIndex] = [
|
||||
RetryDocumentIndex(document_index)
|
||||
for document_index in document_indices
|
||||
]
|
||||
retry_index = RetryDocumentIndex(document_index)
|
||||
index_name = active_search_settings.primary.index_name
|
||||
selection = f"{index_name}.document_id=='{user_file_id}'"
|
||||
|
||||
@@ -443,12 +513,11 @@ def process_single_user_file_delete(
|
||||
else:
|
||||
chunk_count = user_file.chunk_count
|
||||
|
||||
for retry_document_index in retry_document_indices:
|
||||
retry_document_index.delete_single(
|
||||
doc_id=user_file_id,
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=chunk_count,
|
||||
)
|
||||
retry_index.delete_single(
|
||||
doc_id=user_file_id,
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=chunk_count,
|
||||
)
|
||||
|
||||
# 2) Delete the user-uploaded file content from filestore (blob + metadata)
|
||||
file_store = get_default_file_store()
|
||||
@@ -570,16 +639,12 @@ def process_single_user_file_project_sync(
|
||||
httpx_init_vespa_pool(20)
|
||||
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
# This flow is for updates so we get all indices.
|
||||
document_indices = get_all_document_indices(
|
||||
doc_index = get_default_document_index(
|
||||
search_settings=active_search_settings.primary,
|
||||
secondary_search_settings=active_search_settings.secondary,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
)
|
||||
retry_document_indices: list[RetryDocumentIndex] = [
|
||||
RetryDocumentIndex(document_index)
|
||||
for document_index in document_indices
|
||||
]
|
||||
retry_index = RetryDocumentIndex(doc_index)
|
||||
|
||||
user_file = db_session.get(UserFile, _as_uuid(user_file_id))
|
||||
if not user_file:
|
||||
@@ -589,14 +654,13 @@ def process_single_user_file_project_sync(
|
||||
return None
|
||||
|
||||
project_ids = [project.id for project in user_file.projects]
|
||||
for retry_document_index in retry_document_indices:
|
||||
retry_document_index.update_single(
|
||||
doc_id=str(user_file.id),
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=user_file.chunk_count,
|
||||
fields=None,
|
||||
user_fields=VespaDocumentUserFields(user_projects=project_ids),
|
||||
)
|
||||
retry_index.update_single(
|
||||
doc_id=str(user_file.id),
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=user_file.chunk_count,
|
||||
fields=None,
|
||||
user_fields=VespaDocumentUserFields(user_projects=project_ids),
|
||||
)
|
||||
|
||||
task_logger.info(
|
||||
f"process_single_user_file_project_sync - User file id={user_file_id}"
|
||||
|
||||
@@ -49,7 +49,7 @@ from onyx.db.search_settings import get_active_search_settings
|
||||
from onyx.db.sync_record import cleanup_sync_records
|
||||
from onyx.db.sync_record import insert_sync_record
|
||||
from onyx.db.sync_record import update_sync_record_status
|
||||
from onyx.document_index.factory import get_all_document_indices
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.interfaces import VespaDocumentFields
|
||||
from onyx.httpx.httpx_pool import HttpxPool
|
||||
from onyx.redis.redis_document_set import RedisDocumentSet
|
||||
@@ -70,8 +70,6 @@ logger = setup_logger()
|
||||
|
||||
# celery auto associates tasks created inside another task,
|
||||
# which bloats the result metadata considerably. trail=False prevents this.
|
||||
# TODO(andrei): Rename all these kinds of functions from *vespa* to a more
|
||||
# generic *document_index*.
|
||||
@shared_task(
|
||||
name=OnyxCeleryTask.CHECK_FOR_VESPA_SYNC_TASK,
|
||||
ignore_result=True,
|
||||
@@ -467,17 +465,13 @@ def vespa_metadata_sync_task(self: Task, document_id: str, *, tenant_id: str) ->
|
||||
try:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
# This flow is for updates so we get all indices.
|
||||
document_indices = get_all_document_indices(
|
||||
doc_index = get_default_document_index(
|
||||
search_settings=active_search_settings.primary,
|
||||
secondary_search_settings=active_search_settings.secondary,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
)
|
||||
|
||||
retry_document_indices: list[RetryDocumentIndex] = [
|
||||
RetryDocumentIndex(document_index)
|
||||
for document_index in document_indices
|
||||
]
|
||||
retry_index = RetryDocumentIndex(doc_index)
|
||||
|
||||
doc = get_document(document_id, db_session)
|
||||
if not doc:
|
||||
@@ -506,18 +500,14 @@ def vespa_metadata_sync_task(self: Task, document_id: str, *, tenant_id: str) ->
|
||||
# aggregated_boost_factor=doc.aggregated_boost_factor,
|
||||
)
|
||||
|
||||
for retry_document_index in retry_document_indices:
|
||||
# TODO(andrei): Previously there was a comment here saying
|
||||
# it was ok if a doc did not exist in the document index. I
|
||||
# don't agree with that claim, so keep an eye on this task
|
||||
# to see if this raises.
|
||||
retry_document_index.update_single(
|
||||
document_id,
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=doc.chunk_count,
|
||||
fields=fields,
|
||||
user_fields=None,
|
||||
)
|
||||
# update Vespa. OK if doc doesn't exist. Raises exception otherwise.
|
||||
retry_index.update_single(
|
||||
document_id,
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=doc.chunk_count,
|
||||
fields=fields,
|
||||
user_fields=None,
|
||||
)
|
||||
|
||||
# update db last. Worst case = we crash right before this and
|
||||
# the sync might repeat again later
|
||||
|
||||
@@ -7,7 +7,6 @@ from typing import Any
|
||||
|
||||
from onyx.chat.citation_processor import CitationMapping
|
||||
from onyx.chat.emitter import Emitter
|
||||
from onyx.context.search.models import SearchDoc
|
||||
from onyx.server.query_and_chat.placement import Placement
|
||||
from onyx.server.query_and_chat.streaming_models import OverallStop
|
||||
from onyx.server.query_and_chat.streaming_models import Packet
|
||||
@@ -16,11 +15,6 @@ from onyx.tools.models import ToolCallInfo
|
||||
from onyx.utils.threadpool_concurrency import run_in_background
|
||||
from onyx.utils.threadpool_concurrency import wait_on_background
|
||||
|
||||
# Type alias for search doc deduplication key
|
||||
# Simple key: just document_id (str)
|
||||
# Full key: (document_id, chunk_ind, match_highlights)
|
||||
SearchDocKey = str | tuple[str, int, tuple[str, ...]]
|
||||
|
||||
|
||||
class ChatStateContainer:
|
||||
"""Container for accumulating state during LLM loop execution.
|
||||
@@ -46,10 +40,6 @@ class ChatStateContainer:
|
||||
# True if this turn is a clarification question (deep research flow)
|
||||
self.is_clarification: bool = False
|
||||
# Note: LLM cost tracking is now handled in multi_llm.py
|
||||
# Search doc collection - maps dedup key to SearchDoc for all docs from tool calls
|
||||
self._all_search_docs: dict[SearchDocKey, SearchDoc] = {}
|
||||
# Track which citation numbers were actually emitted during streaming
|
||||
self._emitted_citations: set[int] = set()
|
||||
|
||||
def add_tool_call(self, tool_call: ToolCallInfo) -> None:
|
||||
"""Add a tool call to the accumulated state."""
|
||||
@@ -101,54 +91,6 @@ class ChatStateContainer:
|
||||
with self._lock:
|
||||
return self.is_clarification
|
||||
|
||||
@staticmethod
|
||||
def create_search_doc_key(
|
||||
search_doc: SearchDoc, use_simple_key: bool = True
|
||||
) -> SearchDocKey:
|
||||
"""Create a unique key for a SearchDoc for deduplication.
|
||||
|
||||
Args:
|
||||
search_doc: The SearchDoc to create a key for
|
||||
use_simple_key: If True (default), use only document_id for deduplication.
|
||||
If False, include chunk_ind and match_highlights so that the same
|
||||
document/chunk with different highlights are stored separately.
|
||||
"""
|
||||
if use_simple_key:
|
||||
return search_doc.document_id
|
||||
match_highlights_tuple = tuple(sorted(search_doc.match_highlights or []))
|
||||
return (search_doc.document_id, search_doc.chunk_ind, match_highlights_tuple)
|
||||
|
||||
def add_search_docs(
|
||||
self, search_docs: list[SearchDoc], use_simple_key: bool = True
|
||||
) -> None:
|
||||
"""Add search docs to the accumulated collection with deduplication.
|
||||
|
||||
Args:
|
||||
search_docs: List of SearchDoc objects to add
|
||||
use_simple_key: If True (default), deduplicate by document_id only.
|
||||
If False, deduplicate by document_id + chunk_ind + match_highlights.
|
||||
"""
|
||||
with self._lock:
|
||||
for doc in search_docs:
|
||||
key = self.create_search_doc_key(doc, use_simple_key)
|
||||
if key not in self._all_search_docs:
|
||||
self._all_search_docs[key] = doc
|
||||
|
||||
def get_all_search_docs(self) -> dict[SearchDocKey, SearchDoc]:
|
||||
"""Thread-safe getter for all accumulated search docs (returns a copy)."""
|
||||
with self._lock:
|
||||
return self._all_search_docs.copy()
|
||||
|
||||
def add_emitted_citation(self, citation_num: int) -> None:
|
||||
"""Add a citation number that was actually emitted during streaming."""
|
||||
with self._lock:
|
||||
self._emitted_citations.add(citation_num)
|
||||
|
||||
def get_emitted_citations(self) -> set[int]:
|
||||
"""Thread-safe getter for emitted citations (returns a copy)."""
|
||||
with self._lock:
|
||||
return self._emitted_citations.copy()
|
||||
|
||||
|
||||
def run_chat_loop_with_state_containers(
|
||||
func: Callable[..., None],
|
||||
|
||||
@@ -18,10 +18,12 @@ from onyx.background.celery.tasks.kg_processing.kg_indexing import (
|
||||
from onyx.chat.models import ChatLoadedFile
|
||||
from onyx.chat.models import ChatMessageSimple
|
||||
from onyx.chat.models import PersonaOverrideConfig
|
||||
from onyx.chat.models import ThreadMessage
|
||||
from onyx.configs.constants import DEFAULT_PERSONA_ID
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.configs.constants import TMP_DRALPHA_PERSONA_NAME
|
||||
from onyx.context.search.enums import RecencyBiasSetting
|
||||
from onyx.context.search.models import RerankingDetails
|
||||
from onyx.context.search.models import RetrievalDetails
|
||||
from onyx.db.chat import create_chat_session
|
||||
from onyx.db.chat import get_chat_messages_by_session
|
||||
from onyx.db.chat import get_or_create_root_message
|
||||
@@ -46,10 +48,14 @@ from onyx.kg.models import KGException
|
||||
from onyx.kg.setup.kg_default_entity_definitions import (
|
||||
populate_missing_default_entity_types__commit,
|
||||
)
|
||||
from onyx.llm.override_models import LLMOverride
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
from onyx.prompts.chat_prompts import ADDITIONAL_CONTEXT_PROMPT
|
||||
from onyx.prompts.chat_prompts import TOOL_CALL_RESPONSE_CROSS_MESSAGE
|
||||
from onyx.prompts.tool_prompts import TOOL_CALL_FAILURE_PROMPT
|
||||
from onyx.server.query_and_chat.models import ChatSessionCreationRequest
|
||||
from onyx.server.query_and_chat.models import CreateChatMessageRequest
|
||||
from onyx.server.query_and_chat.models import MessageOrigin
|
||||
from onyx.server.query_and_chat.streaming_models import CitationInfo
|
||||
from onyx.tools.models import ToolCallKickoff
|
||||
from onyx.tools.tool_implementations.custom.custom_tool import (
|
||||
@@ -98,6 +104,91 @@ def create_chat_session_from_request(
|
||||
)
|
||||
|
||||
|
||||
def prepare_chat_message_request(
|
||||
message_text: str,
|
||||
user: User | None,
|
||||
persona_id: int | None,
|
||||
# Does the question need to have a persona override
|
||||
persona_override_config: PersonaOverrideConfig | None,
|
||||
message_ts_to_respond_to: str | None,
|
||||
retrieval_details: RetrievalDetails | None,
|
||||
rerank_settings: RerankingDetails | None,
|
||||
db_session: Session,
|
||||
skip_gen_ai_answer_generation: bool = False,
|
||||
llm_override: LLMOverride | None = None,
|
||||
allowed_tool_ids: list[int] | None = None,
|
||||
forced_tool_ids: list[int] | None = None,
|
||||
origin: MessageOrigin | None = None,
|
||||
) -> CreateChatMessageRequest:
|
||||
# Typically used for one shot flows like SlackBot or non-chat API endpoint use cases
|
||||
new_chat_session = create_chat_session(
|
||||
db_session=db_session,
|
||||
description=None,
|
||||
user_id=user.id if user else None,
|
||||
# If using an override, this id will be ignored later on
|
||||
persona_id=persona_id or DEFAULT_PERSONA_ID,
|
||||
onyxbot_flow=True,
|
||||
slack_thread_id=message_ts_to_respond_to,
|
||||
)
|
||||
|
||||
return CreateChatMessageRequest(
|
||||
chat_session_id=new_chat_session.id,
|
||||
parent_message_id=None, # It's a standalone chat session each time
|
||||
message=message_text,
|
||||
file_descriptors=[], # Currently SlackBot/answer api do not support files in the context
|
||||
# Can always override the persona for the single query, if it's a normal persona
|
||||
# then it will be treated the same
|
||||
persona_override_config=persona_override_config,
|
||||
search_doc_ids=None,
|
||||
retrieval_options=retrieval_details,
|
||||
rerank_settings=rerank_settings,
|
||||
skip_gen_ai_answer_generation=skip_gen_ai_answer_generation,
|
||||
llm_override=llm_override,
|
||||
allowed_tool_ids=allowed_tool_ids,
|
||||
forced_tool_ids=forced_tool_ids,
|
||||
origin=origin or MessageOrigin.UNKNOWN,
|
||||
)
|
||||
|
||||
|
||||
def combine_message_thread(
    messages: list[ThreadMessage],
    max_tokens: int | None,
    llm_tokenizer: BaseTokenizer,
) -> str:
    """Flatten a message thread into one combined context string.

    Walks the thread from newest to oldest so that, when a token budget is
    given, the most recent messages are the ones that survive truncation.
    """
    if not messages:
        return ""

    kept_newest_first: list[str] = []
    used_tokens = 0

    for msg in reversed(messages):
        role_label = msg.role.value.upper()
        if msg.role == MessageType.USER:
            # Other messages may carry user-identifying information, so tag
            # anonymous user messages as "Unknown" for symmetry.
            role_label += " " + (msg.sender if msg.sender else "Unknown")

        rendered = f"{role_label}:\n{msg.message}"
        cost = len(llm_tokenizer.encode(rendered))

        if max_tokens is not None and used_tokens + cost > max_tokens:
            break

        kept_newest_first.append(rendered)
        used_tokens += cost

    # Restore chronological order for the final combined string.
    return "\n\n".join(reversed(kept_newest_first))
|
||||
|
||||
|
||||
def create_chat_history_chain(
|
||||
chat_session_id: UUID,
|
||||
db_session: Session,
|
||||
@@ -159,6 +250,31 @@ def create_chat_history_chain(
|
||||
return mainline_messages
|
||||
|
||||
|
||||
def combine_message_chain(
    messages: list[ChatMessage],
    token_limit: int,
    msg_limit: int | None = None,
) -> str:
    """Render chat history as one string for secondary LLM flows.

    Messages are consumed newest-first until the token budget would be
    exceeded, then emitted in chronological order.
    """
    if msg_limit is not None:
        messages = messages[-msg_limit:]

    rendered_newest_first: list[str] = []
    spent_tokens = 0

    for msg in reversed(messages):
        cost = msg.token_count
        if spent_tokens + cost > token_limit:
            break

        label = msg.message_type.value.upper()
        rendered_newest_first.append(f"{label}:\n{msg.message}")
        spent_tokens += cost

    return "\n\n".join(reversed(rendered_newest_first))
|
||||
|
||||
|
||||
def reorganize_citations(
|
||||
answer: str, citations: list[CitationInfo]
|
||||
) -> tuple[str, list[CitationInfo]]:
|
||||
@@ -299,7 +415,7 @@ def create_temporary_persona(
|
||||
num_chunks=persona_config.num_chunks,
|
||||
llm_relevance_filter=persona_config.llm_relevance_filter,
|
||||
llm_filter_extraction=persona_config.llm_filter_extraction,
|
||||
recency_bias=RecencyBiasSetting.BASE_DECAY,
|
||||
recency_bias=persona_config.recency_bias,
|
||||
llm_model_provider_override=persona_config.llm_model_provider_override,
|
||||
llm_model_version_override=persona_config.llm_model_version_override,
|
||||
)
|
||||
@@ -469,71 +585,6 @@ def load_all_chat_files(
|
||||
return files
|
||||
|
||||
|
||||
def convert_chat_history_basic(
    chat_history: list[ChatMessage],
    token_counter: Callable[[str], int],
    max_individual_message_tokens: int | None = None,
    max_total_tokens: int | None = None,
) -> list[ChatMessageSimple]:
    """Convert ChatMessage history to ChatMessageSimple, omitting tools/files.

    Args:
        chat_history: List of ChatMessage objects to convert
        token_counter: Function to count tokens in a message string
        max_individual_message_tokens: If set, messages exceeding this number
            of tokens are dropped. If None, no per-message limit is applied.
        max_total_tokens: If set, maximum number of tokens allowed for the
            entire history. If None, the history is not trimmed overall.

    Returns:
        List of ChatMessageSimple objects
    """
    # Defensive: a non-positive total budget means no history fits at all.
    if max_total_tokens is not None and max_total_tokens <= 0:
        return []

    # Keep only the core USER/ASSISTANT turns; files and tool calls omitted.
    simplified: list[ChatMessageSimple] = []
    for original in chat_history:
        if original.message_type not in (MessageType.USER, MessageType.ASSISTANT):
            continue

        text = original.message or ""
        tokens = getattr(original, "token_count", None)
        if tokens is None:
            tokens = token_counter(text)

        # A single oversized message would dominate the context window.
        if (
            max_individual_message_tokens is not None
            and tokens > max_individual_message_tokens
        ):
            continue

        simplified.append(
            ChatMessageSimple(
                message=text,
                token_count=tokens,
                message_type=original.message_type,
                image_files=None,
            )
        )

    if max_total_tokens is None:
        return simplified

    # Enforce the total budget by keeping the largest contiguous suffix
    # of the conversation that fits.
    kept_newest_first: list[ChatMessageSimple] = []
    budget_used = 0
    for candidate in reversed(simplified):
        if budget_used + candidate.token_count > max_total_tokens:
            break
        kept_newest_first.append(candidate)
        budget_used += candidate.token_count

    return list(reversed(kept_newest_first))
|
||||
|
||||
|
||||
def convert_chat_history(
|
||||
chat_history: list[ChatMessage],
|
||||
files: list[ChatLoadedFile],
|
||||
|
||||
@@ -4,15 +4,14 @@ Dynamic Citation Processor for LLM Responses
|
||||
This module provides a citation processor that can:
|
||||
- Accept citation number to SearchDoc mappings dynamically
|
||||
- Process token streams from LLMs to extract citations
|
||||
- Handle citations in three modes: REMOVE, KEEP_MARKERS, or HYPERLINK
|
||||
- Emit CitationInfo objects for detected citations (in HYPERLINK mode)
|
||||
- Track all seen citations regardless of mode
|
||||
- Optionally replace citation markers with formatted markdown links
|
||||
- Emit CitationInfo objects for detected citations (when replacing)
|
||||
- Track all seen citations regardless of replacement mode
|
||||
- Maintain a list of cited documents in order of first citation
|
||||
"""
|
||||
|
||||
import re
|
||||
from collections.abc import Generator
|
||||
from enum import Enum
|
||||
from typing import TypeAlias
|
||||
|
||||
from onyx.configs.chat_configs import STOP_STREAM_PAT
|
||||
@@ -24,29 +23,6 @@ from onyx.utils.logger import setup_logger
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class CitationMode(Enum):
    """How detected citations are rendered in the processed output.

    REMOVE: Citation markers are stripped from the text entirely; no
        CitationInfo objects are emitted. Used when citations must not be
        shown to the audience (e.g. Discord bot, public Slack bot).

    KEEP_MARKERS: Markers such as [1], [2] pass through unchanged; no
        CitationInfo objects are emitted. Used when citations need later
        processing, e.g. renumbering via collapse_citations() in the
        research agent.

    HYPERLINK: Markers are rewritten as markdown links like [[1]](url) and
        CitationInfo objects are emitted for UI tracking. Used for final
        reports shown to users with clickable links.
    """

    REMOVE = "remove"
    KEEP_MARKERS = "keep_markers"
    HYPERLINK = "hyperlink"
|
||||
|
||||
|
||||
CitationMapping: TypeAlias = dict[int, SearchDoc]
|
||||
|
||||
|
||||
@@ -72,37 +48,29 @@ class DynamicCitationProcessor:
|
||||
|
||||
This processor is designed for multi-turn conversations where the citation
|
||||
number to document mapping is provided externally. It processes streaming
|
||||
tokens from an LLM, detects citations (e.g., [1], [2,3], [[4]]), and handles
|
||||
them according to the configured CitationMode:
|
||||
tokens from an LLM, detects citations (e.g., [1], [2,3], [[4]]), and based
|
||||
on the `replace_citation_tokens` setting:
|
||||
|
||||
CitationMode.HYPERLINK (default):
|
||||
When replace_citation_tokens=True (default):
|
||||
1. Replaces citation markers with formatted markdown links (e.g., [[1]](url))
|
||||
2. Emits CitationInfo objects for tracking
|
||||
3. Maintains the order in which documents were first cited
|
||||
Use case: Final reports shown to users with clickable links.
|
||||
|
||||
CitationMode.KEEP_MARKERS:
|
||||
1. Preserves original citation markers like [1], [2] unchanged
|
||||
When replace_citation_tokens=False:
|
||||
1. Preserves original citation markers in the output text
|
||||
2. Does NOT emit CitationInfo objects
|
||||
3. Still tracks all seen citations via get_seen_citations()
|
||||
Use case: When citations need later processing (e.g., renumbering).
|
||||
|
||||
CitationMode.REMOVE:
|
||||
1. Removes citation markers entirely from the output text
|
||||
2. Does NOT emit CitationInfo objects
|
||||
3. Still tracks all seen citations via get_seen_citations()
|
||||
Use case: Research agent intermediate reports.
|
||||
|
||||
Features:
|
||||
- Accepts citation number → SearchDoc mapping via update_citation_mapping()
|
||||
- Configurable citation mode at initialization
|
||||
- Always tracks seen citations regardless of mode
|
||||
- Configurable citation replacement behavior at initialization
|
||||
- Always tracks seen citations regardless of replacement mode
|
||||
- Holds back tokens that might be partial citations
|
||||
- Maintains list of cited SearchDocs in order of first citation
|
||||
- Handles unicode bracket variants (【】, [])
|
||||
- Skips citation processing inside code blocks
|
||||
|
||||
Example (HYPERLINK mode - default):
|
||||
Example (with citation replacement - default):
|
||||
processor = DynamicCitationProcessor()
|
||||
|
||||
# Set up citation mapping
|
||||
@@ -119,8 +87,8 @@ class DynamicCitationProcessor:
|
||||
# Get cited documents at the end
|
||||
cited_docs = processor.get_cited_documents()
|
||||
|
||||
Example (KEEP_MARKERS mode):
|
||||
processor = DynamicCitationProcessor(citation_mode=CitationMode.KEEP_MARKERS)
|
||||
Example (without citation replacement):
|
||||
processor = DynamicCitationProcessor(replace_citation_tokens=False)
|
||||
processor.update_citation_mapping({1: search_doc1, 2: search_doc2})
|
||||
|
||||
# Process tokens from LLM
|
||||
@@ -131,42 +99,26 @@ class DynamicCitationProcessor:
|
||||
|
||||
# Get all seen citations after processing
|
||||
seen_citations = processor.get_seen_citations() # {1: search_doc1, ...}
|
||||
|
||||
Example (REMOVE mode):
|
||||
processor = DynamicCitationProcessor(citation_mode=CitationMode.REMOVE)
|
||||
processor.update_citation_mapping({1: search_doc1, 2: search_doc2})
|
||||
|
||||
# Process tokens - citations are removed but tracked
|
||||
for token in llm_stream:
|
||||
for result in processor.process_token(token):
|
||||
print(result) # Text without any citation markers
|
||||
|
||||
# Citations are still tracked
|
||||
seen_citations = processor.get_seen_citations()
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
citation_mode: CitationMode = CitationMode.HYPERLINK,
|
||||
replace_citation_tokens: bool = True,
|
||||
stop_stream: str | None = STOP_STREAM_PAT,
|
||||
):
|
||||
"""
|
||||
Initialize the citation processor.
|
||||
|
||||
Args:
|
||||
citation_mode: How to handle citations in the output. One of:
|
||||
- CitationMode.HYPERLINK (default): Replace [1] with [[1]](url)
|
||||
and emit CitationInfo objects.
|
||||
- CitationMode.KEEP_MARKERS: Keep original [1] markers unchanged,
|
||||
no CitationInfo objects emitted.
|
||||
- CitationMode.REMOVE: Remove citations entirely from output,
|
||||
no CitationInfo objects emitted.
|
||||
All modes track seen citations via get_seen_citations().
|
||||
replace_citation_tokens: If True (default), citations like [1] are replaced
|
||||
with formatted markdown links like [[1]](url) and CitationInfo objects
|
||||
are emitted. If False, original citation text is preserved in output
|
||||
and no CitationInfo objects are emitted. Regardless of this setting,
|
||||
all seen citations are tracked and available via get_seen_citations().
|
||||
stop_stream: Optional stop token pattern to halt processing early.
|
||||
When this pattern is detected in the token stream, processing stops.
|
||||
Defaults to STOP_STREAM_PAT from chat configs.
|
||||
"""
|
||||
|
||||
# Citation mapping from citation number to SearchDoc
|
||||
self.citation_to_doc: CitationMapping = {}
|
||||
self.seen_citations: CitationMapping = {} # citation num -> SearchDoc
|
||||
@@ -176,7 +128,7 @@ class DynamicCitationProcessor:
|
||||
self.curr_segment = "" # tokens held for citation processing
|
||||
self.hold = "" # tokens held for stop token processing
|
||||
self.stop_stream = stop_stream
|
||||
self.citation_mode = citation_mode
|
||||
self.replace_citation_tokens = replace_citation_tokens
|
||||
|
||||
# Citation tracking
|
||||
self.cited_documents_in_order: list[SearchDoc] = (
|
||||
@@ -247,21 +199,19 @@ class DynamicCitationProcessor:
|
||||
5. Handles stop tokens
|
||||
6. Always tracks seen citations in self.seen_citations
|
||||
|
||||
Behavior depends on the `citation_mode` setting from __init__:
|
||||
- HYPERLINK: Citations are replaced with [[n]](url) format and CitationInfo
|
||||
Behavior depends on the `replace_citation_tokens` setting from __init__:
|
||||
- If True: Citations are replaced with [[n]](url) format and CitationInfo
|
||||
objects are yielded before each formatted citation
|
||||
- KEEP_MARKERS: Original citation markers like [1] are preserved unchanged,
|
||||
no CitationInfo objects are yielded
|
||||
- REMOVE: Citations are removed entirely from output,
|
||||
no CitationInfo objects are yielded
|
||||
- If False: Original citation text (e.g., [1]) is preserved in output
|
||||
and no CitationInfo objects are yielded
|
||||
|
||||
Args:
|
||||
token: The next token from the LLM stream, or None to signal end of stream.
|
||||
Pass None to flush any remaining buffered text at end of stream.
|
||||
|
||||
Yields:
|
||||
str: Text chunks to display. Citation format depends on citation_mode.
|
||||
CitationInfo: Citation metadata (only when citation_mode=HYPERLINK)
|
||||
str: Text chunks to display. Citation format depends on replace_citation_tokens.
|
||||
CitationInfo: Citation metadata (only when replace_citation_tokens=True)
|
||||
"""
|
||||
# None -> end of stream, flush remaining segment
|
||||
if token is None:
|
||||
@@ -349,17 +299,17 @@ class DynamicCitationProcessor:
|
||||
if self.non_citation_count > 5:
|
||||
self.recent_cited_documents.clear()
|
||||
|
||||
# Yield text before citation FIRST (preserve order)
|
||||
if intermatch_str:
|
||||
yield intermatch_str
|
||||
|
||||
# Process the citation (returns formatted citation text and CitationInfo objects)
|
||||
# Always tracks seen citations regardless of citation_mode
|
||||
# Always tracks seen citations regardless of strip_citations flag
|
||||
citation_text, citation_info_list = self._process_citation(
|
||||
match, has_leading_space
|
||||
match, has_leading_space, self.replace_citation_tokens
|
||||
)
|
||||
|
||||
if self.citation_mode == CitationMode.HYPERLINK:
|
||||
# HYPERLINK mode: Replace citations with markdown links [[n]](url)
|
||||
# Yield text before citation FIRST (preserve order)
|
||||
if intermatch_str:
|
||||
yield intermatch_str
|
||||
if self.replace_citation_tokens:
|
||||
# Yield CitationInfo objects BEFORE the citation text
|
||||
# This allows the frontend to receive citation metadata before the token
|
||||
# that contains [[n]](link), enabling immediate rendering
|
||||
@@ -368,34 +318,10 @@ class DynamicCitationProcessor:
|
||||
# Then yield the formatted citation text
|
||||
if citation_text:
|
||||
yield citation_text
|
||||
|
||||
elif self.citation_mode == CitationMode.KEEP_MARKERS:
|
||||
# KEEP_MARKERS mode: Preserve original citation markers unchanged
|
||||
# Yield text before citation
|
||||
if intermatch_str:
|
||||
yield intermatch_str
|
||||
# Yield the original citation marker as-is
|
||||
else:
|
||||
# When not stripping, yield the original citation text unchanged
|
||||
yield match.group()
|
||||
|
||||
else: # CitationMode.REMOVE
|
||||
# REMOVE mode: Remove citations entirely from output
|
||||
# This strips citation markers like [1], [2], 【1】 from the output text
|
||||
# When removing citations, we need to handle spacing to avoid issues like:
|
||||
# - "text [1] more" -> "text more" (double space)
|
||||
# - "text [1]." -> "text ." (space before punctuation)
|
||||
if intermatch_str:
|
||||
remaining_text = self.curr_segment[match_span[1] :]
|
||||
# Strip trailing space from intermatch if:
|
||||
# 1. Remaining text starts with space (avoids double space)
|
||||
# 2. Remaining text starts with punctuation (avoids space before punctuation)
|
||||
if intermatch_str[-1].isspace() and remaining_text:
|
||||
first_char = remaining_text[0]
|
||||
# Check if next char is space or common punctuation
|
||||
if first_char.isspace() or first_char in ".,;:!?)]}":
|
||||
intermatch_str = intermatch_str.rstrip()
|
||||
if intermatch_str:
|
||||
yield intermatch_str
|
||||
|
||||
self.non_citation_count = 0
|
||||
|
||||
# Leftover text could be part of next citation
|
||||
@@ -412,7 +338,7 @@ class DynamicCitationProcessor:
|
||||
yield result
|
||||
|
||||
def _process_citation(
|
||||
self, match: re.Match, has_leading_space: bool
|
||||
self, match: re.Match, has_leading_space: bool, replace_tokens: bool = True
|
||||
) -> tuple[str, list[CitationInfo]]:
|
||||
"""
|
||||
Process a single citation match and return formatted citation text and citation info objects.
|
||||
@@ -423,28 +349,31 @@ class DynamicCitationProcessor:
|
||||
This method always:
|
||||
1. Extracts citation numbers from the match
|
||||
2. Looks up the corresponding SearchDoc from the mapping
|
||||
3. Tracks seen citations in self.seen_citations (regardless of citation_mode)
|
||||
3. Tracks seen citations in self.seen_citations (regardless of replace_tokens)
|
||||
|
||||
When citation_mode is HYPERLINK:
|
||||
When replace_tokens=True (controlled by self.replace_citation_tokens):
|
||||
4. Creates formatted citation text as [[n]](url)
|
||||
5. Creates CitationInfo objects for new citations
|
||||
6. Handles deduplication of recently cited documents
|
||||
|
||||
When citation_mode is REMOVE or KEEP_MARKERS:
|
||||
4. Returns empty string and empty list (caller handles output based on mode)
|
||||
When replace_tokens=False:
|
||||
4. Returns empty string and empty list (caller yields original match text)
|
||||
|
||||
Args:
|
||||
match: Regex match object containing the citation pattern
|
||||
has_leading_space: Whether the text immediately before this citation
|
||||
ends with whitespace. Used to determine if a leading space should
|
||||
be added to the formatted output.
|
||||
replace_tokens: If True, return formatted text and CitationInfo objects.
|
||||
If False, only track seen citations and return empty results.
|
||||
This is passed from self.replace_citation_tokens by the caller.
|
||||
|
||||
Returns:
|
||||
Tuple of (formatted_citation_text, citation_info_list):
|
||||
- formatted_citation_text: Markdown-formatted citation text like
|
||||
"[[1]](https://example.com)" or empty string if not in HYPERLINK mode
|
||||
"[[1]](https://example.com)" or empty string if replace_tokens=False
|
||||
- citation_info_list: List of CitationInfo objects for newly cited
|
||||
documents, or empty list if not in HYPERLINK mode
|
||||
documents, or empty list if replace_tokens=False
|
||||
"""
|
||||
citation_str: str = match.group() # e.g., '[1]', '[1, 2, 3]', '[[1]]', '【1】'
|
||||
formatted = (
|
||||
@@ -482,11 +411,11 @@ class DynamicCitationProcessor:
|
||||
doc_id = search_doc.document_id
|
||||
link = search_doc.link or ""
|
||||
|
||||
# Always track seen citations regardless of citation_mode setting
|
||||
# Always track seen citations regardless of replace_tokens setting
|
||||
self.seen_citations[num] = search_doc
|
||||
|
||||
# Only generate formatted citations and CitationInfo in HYPERLINK mode
|
||||
if self.citation_mode != CitationMode.HYPERLINK:
|
||||
# When not replacing citation tokens, skip the rest of the processing
|
||||
if not replace_tokens:
|
||||
continue
|
||||
|
||||
# Format the citation text as [[n]](link)
|
||||
@@ -521,14 +450,14 @@ class DynamicCitationProcessor:
|
||||
"""
|
||||
Get the list of cited SearchDoc objects in the order they were first cited.
|
||||
|
||||
Note: This list is only populated when `citation_mode=HYPERLINK`.
|
||||
When using REMOVE or KEEP_MARKERS mode, this will return an empty list.
|
||||
Note: This list is only populated when `replace_citation_tokens=True`.
|
||||
When `replace_citation_tokens=False`, this will return an empty list.
|
||||
Use get_seen_citations() instead if you need to track citations without
|
||||
emitting CitationInfo objects.
|
||||
replacing them.
|
||||
|
||||
Returns:
|
||||
List of SearchDoc objects in the order they were first cited.
|
||||
Empty list if citation_mode is not HYPERLINK.
|
||||
Empty list if replace_citation_tokens=False.
|
||||
"""
|
||||
return self.cited_documents_in_order
|
||||
|
||||
@@ -536,14 +465,14 @@ class DynamicCitationProcessor:
|
||||
"""
|
||||
Get the list of cited document IDs in the order they were first cited.
|
||||
|
||||
Note: This list is only populated when `citation_mode=HYPERLINK`.
|
||||
When using REMOVE or KEEP_MARKERS mode, this will return an empty list.
|
||||
Note: This list is only populated when `replace_citation_tokens=True`.
|
||||
When `replace_citation_tokens=False`, this will return an empty list.
|
||||
Use get_seen_citations() instead if you need to track citations without
|
||||
emitting CitationInfo objects.
|
||||
replacing them.
|
||||
|
||||
Returns:
|
||||
List of document IDs (strings) in the order they were first cited.
|
||||
Empty list if citation_mode is not HYPERLINK.
|
||||
Empty list if replace_citation_tokens=False.
|
||||
"""
|
||||
return [doc.document_id for doc in self.cited_documents_in_order]
|
||||
|
||||
@@ -552,12 +481,12 @@ class DynamicCitationProcessor:
|
||||
Get all seen citations as a mapping from citation number to SearchDoc.
|
||||
|
||||
This returns all citations that have been encountered during processing,
|
||||
regardless of the `citation_mode` setting. Citations are tracked
|
||||
regardless of the `replace_citation_tokens` setting. Citations are tracked
|
||||
whenever they are parsed, making this useful for cases where you need to
|
||||
know which citations appeared in the text without emitting CitationInfo objects.
|
||||
know which citations appeared in the text without replacing them.
|
||||
|
||||
This is particularly useful when using REMOVE or KEEP_MARKERS mode, as
|
||||
get_cited_documents() will be empty in those cases, but get_seen_citations()
|
||||
This is particularly useful when `replace_citation_tokens=False`, as
|
||||
get_cited_documents() will be empty in that case, but get_seen_citations()
|
||||
will still contain all the citations that were found.
|
||||
|
||||
Returns:
|
||||
@@ -572,13 +501,13 @@ class DynamicCitationProcessor:
|
||||
"""
|
||||
Get the number of unique documents that have been cited.
|
||||
|
||||
Note: This count is only updated when `citation_mode=HYPERLINK`.
|
||||
When using REMOVE or KEEP_MARKERS mode, this will always return 0.
|
||||
Note: This count is only updated when `replace_citation_tokens=True`.
|
||||
When `replace_citation_tokens=False`, this will always return 0.
|
||||
Use len(get_seen_citations()) instead if you need to count citations
|
||||
without emitting CitationInfo objects.
|
||||
without replacing them.
|
||||
|
||||
Returns:
|
||||
Number of unique documents cited. 0 if citation_mode is not HYPERLINK.
|
||||
Number of unique documents cited. 0 if replace_citation_tokens=False.
|
||||
"""
|
||||
return len(self.cited_document_ids)
|
||||
|
||||
@@ -590,9 +519,9 @@ class DynamicCitationProcessor:
|
||||
CitationInfo objects for the same document when it's cited multiple times
|
||||
in close succession. This method clears that tracker.
|
||||
|
||||
This is primarily useful when `citation_mode=HYPERLINK` to allow
|
||||
This is primarily useful when `replace_citation_tokens=True` to allow
|
||||
previously cited documents to emit CitationInfo objects again. Has no
|
||||
effect when using REMOVE or KEEP_MARKERS mode.
|
||||
effect when `replace_citation_tokens=False`.
|
||||
|
||||
The recent citation tracker is also automatically cleared when more than
|
||||
5 non-citation characters are processed between citations.
|
||||
|
||||
@@ -53,50 +53,6 @@ def update_citation_processor_from_tool_response(
|
||||
citation_processor.update_citation_mapping(citation_to_doc)
|
||||
|
||||
|
||||
def extract_citation_order_from_text(text: str) -> list[int]:
    """Extract citation numbers from text in order of first appearance.

    Parses citation patterns like [1], [1, 2], [[1]], 【1】 etc. and returns
    the citation numbers in the order they first appear in the text.

    Args:
        text: The text containing citations

    Returns:
        List of citation numbers in order of first appearance (no duplicates)
    """
    # Same pattern used in collapse_citations and DynamicCitationProcessor.
    # Group 2 captures the number in double bracket format: [[1]], 【【1】】
    # Group 4 captures the numbers in single bracket format: [1], [1, 2]
    citation_pattern = re.compile(
        r"([\[【[]{2}(\d+)[\]】]]{2})|([\[【[]([\d]+(?: *, *\d+)*)[\]】]])"
    )

    # Dict keys double as an insertion-ordered de-duplicated set.
    ordered_unique: dict[int, None] = {}

    for match in citation_pattern.finditer(text):
        # Group 2 is the double-bracket single number; group 4 is the
        # single-bracket (possibly comma-separated) number list.
        nums_str = match.group(2) or match.group(4)
        if not nums_str:
            continue

        for piece in nums_str.split(","):
            piece = piece.strip()
            if not piece:
                continue
            try:
                ordered_unique.setdefault(int(piece), None)
            except ValueError:
                continue

    return list(ordered_unique)
|
||||
|
||||
|
||||
def collapse_citations(
|
||||
answer_text: str,
|
||||
existing_citation_mapping: CitationMapping,
|
||||
|
||||
@@ -5,11 +5,9 @@ from sqlalchemy.orm import Session
|
||||
from onyx.chat.chat_state import ChatStateContainer
|
||||
from onyx.chat.chat_utils import create_tool_call_failure_messages
|
||||
from onyx.chat.citation_processor import CitationMapping
|
||||
from onyx.chat.citation_processor import CitationMode
|
||||
from onyx.chat.citation_processor import DynamicCitationProcessor
|
||||
from onyx.chat.citation_utils import update_citation_processor_from_tool_response
|
||||
from onyx.chat.emitter import Emitter
|
||||
from onyx.chat.llm_step import extract_tool_calls_from_response_text
|
||||
from onyx.chat.llm_step import run_llm_step
|
||||
from onyx.chat.models import ChatMessageSimple
|
||||
from onyx.chat.models import ExtractedProjectFiles
|
||||
@@ -39,13 +37,11 @@ from onyx.tools.built_in_tools import CITEABLE_TOOLS_NAMES
|
||||
from onyx.tools.built_in_tools import STOPPING_TOOLS_NAMES
|
||||
from onyx.tools.interface import Tool
|
||||
from onyx.tools.models import ToolCallInfo
|
||||
from onyx.tools.models import ToolCallKickoff
|
||||
from onyx.tools.models import ToolResponse
|
||||
from onyx.tools.tool_implementations.images.models import (
|
||||
FinalImageGenerationResponse,
|
||||
)
|
||||
from onyx.tools.tool_implementations.search.search_tool import SearchTool
|
||||
from onyx.tools.tool_implementations.web_search.utils import extract_url_snippet_map
|
||||
from onyx.tools.tool_implementations.web_search.web_search_tool import WebSearchTool
|
||||
from onyx.tools.tool_runner import run_tool_calls
|
||||
from onyx.tracing.framework.create import trace
|
||||
@@ -54,78 +50,6 @@ from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def _try_fallback_tool_extraction(
    llm_step_result: LlmStepResult,
    tool_choice: ToolChoiceOptions,
    fallback_extraction_attempted: bool,
    tool_defs: list[dict],
    turn_index: int,
) -> tuple[LlmStepResult, bool]:
    """Attempt to extract tool calls from response text as a fallback.

    This is a last-resort path for low quality LLMs, or ones whose serving
    layer lacks native tool calling. It also triggers when the model produced
    reasoning but neither an answer nor any tool calls.

    Args:
        llm_step_result: The result from the LLM step
        tool_choice: The tool choice option used for this step
        fallback_extraction_attempted: Whether fallback extraction already ran
        tool_defs: List of tool definitions
        turn_index: The current turn index for placement

    Returns:
        Tuple of (possibly updated LlmStepResult, whether fallback was
        attempted this call)
    """
    # Only ever attempt the fallback once per loop.
    if fallback_extraction_attempted:
        return llm_step_result, False

    has_no_tool_calls = not llm_step_result.tool_calls
    reasoning_without_output = (
        llm_step_result.reasoning and not llm_step_result.answer and has_no_tool_calls
    )
    required_but_missing = (
        tool_choice == ToolChoiceOptions.REQUIRED and has_no_tool_calls
    )

    if not (required_but_missing or reasoning_without_output):
        return llm_step_result, False

    # Prefer extracting from the answer text; fall back to the reasoning text.
    recovered: list[ToolCallKickoff] = []
    if llm_step_result.answer:
        recovered = extract_tool_calls_from_response_text(
            response_text=llm_step_result.answer,
            tool_definitions=tool_defs,
            placement=Placement(turn_index=turn_index),
        )
    if not recovered and llm_step_result.reasoning:
        recovered = extract_tool_calls_from_response_text(
            response_text=llm_step_result.reasoning,
            tool_definitions=tool_defs,
            placement=Placement(turn_index=turn_index),
        )

    if not recovered:
        # Fallback was attempted but found nothing usable.
        return llm_step_result, True

    logger.info(
        f"Extracted {len(recovered)} tool call(s) from response text "
        f"as fallback (tool_choice was REQUIRED but no tool calls returned)"
    )
    return (
        LlmStepResult(
            reasoning=llm_step_result.reasoning,
            answer=llm_step_result.answer,
            tool_calls=recovered,
        ),
        True,
    )
|
||||
|
||||
|
||||
# Hardcoded opinionated value; might break down to something like:
|
||||
# Cycle 1: Calls web_search for something
|
||||
# Cycle 2: Calls open_url for some results
|
||||
@@ -373,7 +297,6 @@ def run_llm_loop(
|
||||
forced_tool_id: int | None = None,
|
||||
user_identity: LLMUserIdentity | None = None,
|
||||
chat_session_id: str | None = None,
|
||||
include_citations: bool = True,
|
||||
) -> None:
|
||||
with trace(
|
||||
"run_llm_loop",
|
||||
@@ -391,13 +314,7 @@ def run_llm_loop(
|
||||
initialize_litellm()
|
||||
|
||||
# Initialize citation processor for handling citations dynamically
|
||||
# When include_citations is True, use HYPERLINK mode to format citations as [[1]](url)
|
||||
# When include_citations is False, use REMOVE mode to strip citations from output
|
||||
citation_processor = DynamicCitationProcessor(
|
||||
citation_mode=(
|
||||
CitationMode.HYPERLINK if include_citations else CitationMode.REMOVE
|
||||
)
|
||||
)
|
||||
citation_processor = DynamicCitationProcessor()
|
||||
|
||||
# Add project file citation mappings if project files are present
|
||||
project_citation_mapping: CitationMapping = {}
|
||||
@@ -427,7 +344,6 @@ def run_llm_loop(
|
||||
ran_image_gen: bool = False
|
||||
just_ran_web_search: bool = False
|
||||
has_called_search_tool: bool = False
|
||||
fallback_extraction_attempted: bool = False
|
||||
citation_mapping: dict[int, str] = {} # Maps citation_num -> document_id/URL
|
||||
|
||||
default_base_system_prompt: str = get_default_base_system_prompt(db_session)
|
||||
@@ -454,16 +370,12 @@ def run_llm_loop(
|
||||
|
||||
# The section below calculates the available tokens for history a bit more accurately
|
||||
# now that project files are loaded in.
|
||||
if persona and persona.replace_base_system_prompt:
|
||||
if persona and persona.replace_base_system_prompt and persona.system_prompt:
|
||||
# Handles the case where user has checked off the "Replace base system prompt" checkbox
|
||||
system_prompt = (
|
||||
ChatMessageSimple(
|
||||
message=persona.system_prompt,
|
||||
token_count=token_counter(persona.system_prompt),
|
||||
message_type=MessageType.SYSTEM,
|
||||
)
|
||||
if persona.system_prompt
|
||||
else None
|
||||
system_prompt = ChatMessageSimple(
|
||||
message=persona.system_prompt,
|
||||
token_count=token_counter(persona.system_prompt),
|
||||
message_type=MessageType.SYSTEM,
|
||||
)
|
||||
custom_agent_prompt_msg = None
|
||||
else:
|
||||
@@ -550,11 +462,10 @@ def run_llm_loop(
|
||||
|
||||
# This calls the LLM, yields packets (reasoning, answers, etc.) and returns the result
|
||||
# It also pre-processes the tool calls in preparation for running them
|
||||
tool_defs = [tool.tool_definition() for tool in final_tools]
|
||||
llm_step_result, has_reasoned = run_llm_step(
|
||||
emitter=emitter,
|
||||
history=truncated_message_history,
|
||||
tool_definitions=tool_defs,
|
||||
tool_definitions=[tool.tool_definition() for tool in final_tools],
|
||||
tool_choice=tool_choice,
|
||||
llm=llm,
|
||||
placement=Placement(turn_index=llm_cycle_count + reasoning_cycles),
|
||||
@@ -569,19 +480,6 @@ def run_llm_loop(
|
||||
if has_reasoned:
|
||||
reasoning_cycles += 1
|
||||
|
||||
# Fallback extraction for LLMs that don't support tool calling natively or are lower quality
|
||||
# and might incorrectly output tool calls in other channels
|
||||
llm_step_result, attempted = _try_fallback_tool_extraction(
|
||||
llm_step_result=llm_step_result,
|
||||
tool_choice=tool_choice,
|
||||
fallback_extraction_attempted=fallback_extraction_attempted,
|
||||
tool_defs=tool_defs,
|
||||
turn_index=llm_cycle_count + reasoning_cycles,
|
||||
)
|
||||
if attempted:
|
||||
# To prevent the case of excessive looping with bad models, we only allow one fallback attempt
|
||||
fallback_extraction_attempted = True
|
||||
|
||||
# Save citation mapping after each LLM step for incremental state updates
|
||||
state_container.set_citation_mapping(citation_processor.citation_to_doc)
|
||||
|
||||
@@ -617,7 +515,6 @@ def run_llm_loop(
|
||||
next_citation_num=citation_processor.get_next_citation_number(),
|
||||
max_concurrent_tools=None,
|
||||
skip_search_query_expansion=has_called_search_tool,
|
||||
url_snippet_map=extract_url_snippet_map(gathered_documents or []),
|
||||
)
|
||||
tool_responses = parallel_tool_call_results.tool_responses
|
||||
citation_mapping = parallel_tool_call_results.updated_citation_mapping
|
||||
@@ -656,15 +553,8 @@ def run_llm_loop(
|
||||
|
||||
# Extract search_docs if this is a search tool response
|
||||
search_docs = None
|
||||
displayed_docs = None
|
||||
if isinstance(tool_response.rich_response, SearchDocsResponse):
|
||||
search_docs = tool_response.rich_response.search_docs
|
||||
displayed_docs = tool_response.rich_response.displayed_docs
|
||||
|
||||
# Add ALL search docs to state container for DB persistence
|
||||
if search_docs:
|
||||
state_container.add_search_docs(search_docs)
|
||||
|
||||
if gathered_documents:
|
||||
gathered_documents.extend(search_docs)
|
||||
else:
|
||||
@@ -682,12 +572,6 @@ def run_llm_loop(
|
||||
):
|
||||
generated_images = tool_response.rich_response.generated_images
|
||||
|
||||
saved_response = (
|
||||
tool_response.rich_response
|
||||
if isinstance(tool_response.rich_response, str)
|
||||
else tool_response.llm_facing_response
|
||||
)
|
||||
|
||||
tool_call_info = ToolCallInfo(
|
||||
parent_tool_call_id=None, # Top-level tool calls are attached to the chat message
|
||||
turn_index=llm_cycle_count + reasoning_cycles,
|
||||
@@ -697,8 +581,8 @@ def run_llm_loop(
|
||||
tool_id=tool.id,
|
||||
reasoning_tokens=llm_step_result.reasoning, # All tool calls from this loop share the same reasoning
|
||||
tool_call_arguments=tool_call.tool_args,
|
||||
tool_call_response=saved_response,
|
||||
search_docs=displayed_docs or search_docs,
|
||||
tool_call_response=tool_response.llm_facing_response,
|
||||
search_docs=search_docs,
|
||||
generated_images=generated_images,
|
||||
)
|
||||
# Add to state container for partial save support
|
||||
@@ -753,12 +637,7 @@ def run_llm_loop(
|
||||
should_cite_documents = True
|
||||
|
||||
if not llm_step_result or not llm_step_result.answer:
|
||||
raise RuntimeError(
|
||||
"The LLM did not return an answer. "
|
||||
"Typically this is an issue with LLMs that do not support tool calling natively, "
|
||||
"or the model serving API is not configured correctly. "
|
||||
"This may also happen with models that are lower quality outputting invalid tool calls."
|
||||
)
|
||||
raise RuntimeError("LLM did not return an answer.")
|
||||
|
||||
emitter.emit(
|
||||
Packet(
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import json
|
||||
import time
|
||||
import uuid
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Generator
|
||||
from collections.abc import Mapping
|
||||
@@ -14,7 +13,6 @@ from onyx.chat.emitter import Emitter
|
||||
from onyx.chat.models import ChatMessageSimple
|
||||
from onyx.chat.models import LlmStepResult
|
||||
from onyx.configs.app_configs import LOG_ONYX_MODEL_INTERACTIONS
|
||||
from onyx.configs.app_configs import PROMPT_CACHE_CHAT_HISTORY
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.context.search.models import SearchDoc
|
||||
from onyx.file_store.models import ChatFileType
|
||||
@@ -50,7 +48,6 @@ from onyx.tools.models import ToolCallKickoff
|
||||
from onyx.tracing.framework.create import generation_span
|
||||
from onyx.utils.b64 import get_image_type_from_bytes
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.text_processing import find_all_json_objects
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
@@ -139,11 +136,12 @@ def _format_message_history_for_logging(
|
||||
|
||||
separator = "================================================"
|
||||
|
||||
# Handle single ChatCompletionMessage - wrap in list for uniform processing
|
||||
if isinstance(
|
||||
message_history, (SystemMessage, UserMessage, AssistantMessage, ToolMessage)
|
||||
):
|
||||
message_history = [message_history]
|
||||
# Handle string input
|
||||
if isinstance(message_history, str):
|
||||
formatted_lines.append("Message [string]:")
|
||||
formatted_lines.append(separator)
|
||||
formatted_lines.append(f"{message_history}")
|
||||
return "\n".join(formatted_lines)
|
||||
|
||||
# Handle sequence of messages
|
||||
for i, msg in enumerate(message_history):
|
||||
@@ -213,8 +211,7 @@ def _update_tool_call_with_delta(
|
||||
|
||||
if index not in tool_calls_in_progress:
|
||||
tool_calls_in_progress[index] = {
|
||||
# Fallback ID in case the provider never sends one via deltas.
|
||||
"id": f"fallback_{uuid.uuid4().hex}",
|
||||
"id": None,
|
||||
"name": None,
|
||||
"arguments": "",
|
||||
}
|
||||
@@ -280,144 +277,6 @@ def _extract_tool_call_kickoffs(
|
||||
return tool_calls
|
||||
|
||||
|
||||
def extract_tool_calls_from_response_text(
|
||||
response_text: str | None,
|
||||
tool_definitions: list[dict],
|
||||
placement: Placement,
|
||||
) -> list[ToolCallKickoff]:
|
||||
"""Extract tool calls from LLM response text by matching JSON against tool definitions.
|
||||
|
||||
This is a fallback mechanism for when the LLM was expected to return tool calls
|
||||
but didn't use the proper tool call format. It searches for JSON objects in the
|
||||
response text that match the structure of available tools.
|
||||
|
||||
Args:
|
||||
response_text: The LLM's text response to search for tool calls
|
||||
tool_definitions: List of tool definitions to match against
|
||||
placement: Placement information for the tool calls
|
||||
|
||||
Returns:
|
||||
List of ToolCallKickoff objects for any matched tool calls
|
||||
"""
|
||||
if not response_text or not tool_definitions:
|
||||
return []
|
||||
|
||||
# Build a map of tool names to their definitions
|
||||
tool_name_to_def: dict[str, dict] = {}
|
||||
for tool_def in tool_definitions:
|
||||
if tool_def.get("type") == "function" and "function" in tool_def:
|
||||
func_def = tool_def["function"]
|
||||
tool_name = func_def.get("name")
|
||||
if tool_name:
|
||||
tool_name_to_def[tool_name] = func_def
|
||||
|
||||
if not tool_name_to_def:
|
||||
return []
|
||||
|
||||
# Find all JSON objects in the response text
|
||||
json_objects = find_all_json_objects(response_text)
|
||||
|
||||
tool_calls: list[ToolCallKickoff] = []
|
||||
tab_index = 0
|
||||
|
||||
for json_obj in json_objects:
|
||||
matched_tool_call = _try_match_json_to_tool(json_obj, tool_name_to_def)
|
||||
if matched_tool_call:
|
||||
tool_name, tool_args = matched_tool_call
|
||||
tool_calls.append(
|
||||
ToolCallKickoff(
|
||||
tool_call_id=f"extracted_{uuid.uuid4().hex[:8]}",
|
||||
tool_name=tool_name,
|
||||
tool_args=tool_args,
|
||||
placement=Placement(
|
||||
turn_index=placement.turn_index,
|
||||
tab_index=tab_index,
|
||||
sub_turn_index=placement.sub_turn_index,
|
||||
),
|
||||
)
|
||||
)
|
||||
tab_index += 1
|
||||
|
||||
logger.info(
|
||||
f"Extracted {len(tool_calls)} tool call(s) from response text as fallback"
|
||||
)
|
||||
|
||||
return tool_calls
|
||||
|
||||
|
||||
def _try_match_json_to_tool(
|
||||
json_obj: dict[str, Any],
|
||||
tool_name_to_def: dict[str, dict],
|
||||
) -> tuple[str, dict[str, Any]] | None:
|
||||
"""Try to match a JSON object to a tool definition.
|
||||
|
||||
Supports several formats:
|
||||
1. Direct tool call format: {"name": "tool_name", "arguments": {...}}
|
||||
2. Function call format: {"function": {"name": "tool_name", "arguments": {...}}}
|
||||
3. Tool name as key: {"tool_name": {...arguments...}}
|
||||
4. Arguments matching a tool's parameter schema
|
||||
|
||||
Args:
|
||||
json_obj: The JSON object to match
|
||||
tool_name_to_def: Map of tool names to their function definitions
|
||||
|
||||
Returns:
|
||||
Tuple of (tool_name, tool_args) if matched, None otherwise
|
||||
"""
|
||||
# Format 1: Direct tool call format {"name": "...", "arguments": {...}}
|
||||
if "name" in json_obj and json_obj["name"] in tool_name_to_def:
|
||||
tool_name = json_obj["name"]
|
||||
arguments = json_obj.get("arguments", json_obj.get("parameters", {}))
|
||||
if isinstance(arguments, str):
|
||||
try:
|
||||
arguments = json.loads(arguments)
|
||||
except json.JSONDecodeError:
|
||||
arguments = {}
|
||||
if isinstance(arguments, dict):
|
||||
return (tool_name, arguments)
|
||||
|
||||
# Format 2: Function call format {"function": {"name": "...", "arguments": {...}}}
|
||||
if "function" in json_obj and isinstance(json_obj["function"], dict):
|
||||
func_obj = json_obj["function"]
|
||||
if "name" in func_obj and func_obj["name"] in tool_name_to_def:
|
||||
tool_name = func_obj["name"]
|
||||
arguments = func_obj.get("arguments", func_obj.get("parameters", {}))
|
||||
if isinstance(arguments, str):
|
||||
try:
|
||||
arguments = json.loads(arguments)
|
||||
except json.JSONDecodeError:
|
||||
arguments = {}
|
||||
if isinstance(arguments, dict):
|
||||
return (tool_name, arguments)
|
||||
|
||||
# Format 3: Tool name as key {"tool_name": {...arguments...}}
|
||||
for tool_name in tool_name_to_def:
|
||||
if tool_name in json_obj:
|
||||
arguments = json_obj[tool_name]
|
||||
if isinstance(arguments, dict):
|
||||
return (tool_name, arguments)
|
||||
|
||||
# Format 4: Check if the JSON object matches a tool's parameter schema
|
||||
for tool_name, func_def in tool_name_to_def.items():
|
||||
params = func_def.get("parameters", {})
|
||||
properties = params.get("properties", {})
|
||||
required = params.get("required", [])
|
||||
|
||||
if not properties:
|
||||
continue
|
||||
|
||||
# Check if all required parameters are present (empty required = all optional)
|
||||
if all(req in json_obj for req in required):
|
||||
# Check if any of the tool's properties are in the JSON object
|
||||
matching_props = [prop for prop in properties if prop in json_obj]
|
||||
if matching_props:
|
||||
# Filter to only include known properties
|
||||
filtered_args = {k: v for k, v in json_obj.items() if k in properties}
|
||||
return (tool_name, filtered_args)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def translate_history_to_llm_format(
|
||||
history: list[ChatMessageSimple],
|
||||
llm_config: LLMConfig,
|
||||
@@ -433,7 +292,7 @@ def translate_history_to_llm_format(
|
||||
|
||||
for idx, msg in enumerate(history):
|
||||
# if the message is being added to the history
|
||||
if PROMPT_CACHE_CHAT_HISTORY and msg.message_type in [
|
||||
if msg.message_type in [
|
||||
MessageType.SYSTEM,
|
||||
MessageType.USER,
|
||||
MessageType.ASSISTANT,
|
||||
@@ -722,18 +581,6 @@ def run_llm_step_pkt_generator(
|
||||
}
|
||||
# Note: LLM cost tracking is now handled in multi_llm.py
|
||||
delta = packet.choice.delta
|
||||
|
||||
# Weird behavior from some model providers, just log and ignore for now
|
||||
if (
|
||||
delta.content is None
|
||||
and delta.reasoning_content is None
|
||||
and delta.tool_calls is None
|
||||
):
|
||||
logger.warning(
|
||||
f"LLM packet is empty (no contents, reasoning or tool calls). Skipping: {packet}"
|
||||
)
|
||||
continue
|
||||
|
||||
if not first_action_recorded and _delta_has_action(delta):
|
||||
span_generation.span_data.time_to_first_action_seconds = (
|
||||
time.monotonic() - stream_start_time
|
||||
@@ -860,11 +707,6 @@ def run_llm_step_pkt_generator(
|
||||
),
|
||||
obj=result,
|
||||
)
|
||||
# Track emitted citation for saving
|
||||
if state_container:
|
||||
state_container.add_emitted_citation(
|
||||
result.citation_number
|
||||
)
|
||||
else:
|
||||
# When citation_processor is None, use delta.content directly without modification
|
||||
accumulated_answer += delta.content
|
||||
@@ -991,9 +833,6 @@ def run_llm_step_pkt_generator(
|
||||
),
|
||||
obj=result,
|
||||
)
|
||||
# Track emitted citation for saving
|
||||
if state_container:
|
||||
state_container.add_emitted_citation(result.citation_number)
|
||||
|
||||
# Note: Content (AgentResponseDelta) doesn't need an explicit end packet - OverallStop handles it
|
||||
# Tool calls are handled by tool execution code and emit their own packets (e.g., SectionEnd)
|
||||
@@ -1001,14 +840,14 @@ def run_llm_step_pkt_generator(
|
||||
logger.debug(f"Accumulated reasoning: {accumulated_reasoning}")
|
||||
logger.debug(f"Accumulated answer: {accumulated_answer}")
|
||||
|
||||
if tool_calls:
|
||||
tool_calls_str = "\n".join(
|
||||
f" - {tc.tool_name}: {json.dumps(tc.tool_args, indent=4)}"
|
||||
for tc in tool_calls
|
||||
)
|
||||
logger.debug(f"Tool calls:\n{tool_calls_str}")
|
||||
else:
|
||||
logger.debug("Tool calls: []")
|
||||
if tool_calls:
|
||||
tool_calls_str = "\n".join(
|
||||
f" - {tc.tool_name}: {json.dumps(tc.tool_args, indent=4)}"
|
||||
for tc in tool_calls
|
||||
)
|
||||
logger.debug(f"Tool calls:\n{tool_calls_str}")
|
||||
else:
|
||||
logger.debug("Tool calls: []")
|
||||
|
||||
return (
|
||||
LlmStepResult(
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterator
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
@@ -7,7 +8,10 @@ from uuid import UUID
|
||||
from pydantic import BaseModel
|
||||
from pydantic import Field
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.context.search.enums import QueryFlow
|
||||
from onyx.context.search.enums import RecencyBiasSetting
|
||||
from onyx.context.search.enums import SearchType
|
||||
from onyx.context.search.models import SearchDoc
|
||||
from onyx.file_store.models import FileDescriptor
|
||||
@@ -20,6 +24,25 @@ from onyx.tools.models import ToolCallKickoff
|
||||
from onyx.tools.tool_implementations.custom.base_tool_types import ToolResultType
|
||||
|
||||
|
||||
# First chunk of info for streaming QA
|
||||
class QADocsResponse(BaseModel):
|
||||
top_documents: list[SearchDoc]
|
||||
rephrased_query: str | None = None
|
||||
predicted_flow: QueryFlow | None
|
||||
predicted_search: SearchType | None
|
||||
applied_source_filters: list[DocumentSource] | None
|
||||
applied_time_cutoff: datetime | None
|
||||
recency_bias_multiplier: float
|
||||
|
||||
def model_dump(self, *args: list, **kwargs: dict[str, Any]) -> dict[str, Any]: # type: ignore
|
||||
initial_dict = super().model_dump(mode="json", *args, **kwargs) # type: ignore
|
||||
initial_dict["applied_time_cutoff"] = (
|
||||
self.applied_time_cutoff.isoformat() if self.applied_time_cutoff else None
|
||||
)
|
||||
|
||||
return initial_dict
|
||||
|
||||
|
||||
class StreamStopReason(Enum):
|
||||
CONTEXT_LENGTH = "context_length"
|
||||
CANCELLED = "cancelled"
|
||||
@@ -47,11 +70,22 @@ class UserKnowledgeFilePacket(BaseModel):
|
||||
user_files: list[FileDescriptor]
|
||||
|
||||
|
||||
class LLMRelevanceFilterResponse(BaseModel):
|
||||
llm_selected_doc_indices: list[int]
|
||||
|
||||
|
||||
class RelevanceAnalysis(BaseModel):
|
||||
relevant: bool
|
||||
content: str | None = None
|
||||
|
||||
|
||||
class SectionRelevancePiece(RelevanceAnalysis):
|
||||
"""LLM analysis mapped to an Inference Section"""
|
||||
|
||||
document_id: str
|
||||
chunk_id: int # ID of the center chunk for a given inference section
|
||||
|
||||
|
||||
class DocumentRelevance(BaseModel):
|
||||
"""Contains all relevance information for a given search"""
|
||||
|
||||
@@ -82,6 +116,12 @@ class OnyxAnswer(BaseModel):
|
||||
answer: str | None
|
||||
|
||||
|
||||
class ThreadMessage(BaseModel):
|
||||
message: str
|
||||
sender: str | None = None
|
||||
role: MessageType = MessageType.USER
|
||||
|
||||
|
||||
class FileChatDisplay(BaseModel):
|
||||
file_ids: list[str]
|
||||
|
||||
@@ -118,6 +158,7 @@ class PersonaOverrideConfig(BaseModel):
|
||||
num_chunks: float | None = None
|
||||
llm_relevance_filter: bool = False
|
||||
llm_filter_extraction: bool = False
|
||||
recency_bias: RecencyBiasSetting = RecencyBiasSetting.AUTO
|
||||
llm_model_provider_override: str | None = None
|
||||
llm_model_version_override: str | None = None
|
||||
|
||||
|
||||
@@ -38,10 +38,10 @@ from onyx.chat.save_chat import save_chat_turn
|
||||
from onyx.chat.stop_signal_checker import is_connected as check_stop_signal
|
||||
from onyx.chat.stop_signal_checker import reset_cancel_status
|
||||
from onyx.configs.constants import DEFAULT_PERSONA_ID
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.configs.constants import MilestoneRecordType
|
||||
from onyx.context.search.models import BaseFilters
|
||||
from onyx.context.search.enums import OptionalSearchSetting
|
||||
from onyx.context.search.models import CitationDocInfo
|
||||
from onyx.context.search.models import SearchDoc
|
||||
from onyx.db.chat import create_new_chat_message
|
||||
from onyx.db.chat import get_chat_session_by_id
|
||||
@@ -50,7 +50,6 @@ from onyx.db.chat import reserve_message_id
|
||||
from onyx.db.memory import get_memories
|
||||
from onyx.db.models import ChatMessage
|
||||
from onyx.db.models import ChatSession
|
||||
from onyx.db.models import Persona
|
||||
from onyx.db.models import User
|
||||
from onyx.db.projects import get_project_token_count
|
||||
from onyx.db.projects import get_user_files_from_project
|
||||
@@ -68,7 +67,6 @@ from onyx.onyxbot.slack.models import SlackContext
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.server.query_and_chat.models import AUTO_PLACE_AFTER_LATEST_MESSAGE
|
||||
from onyx.server.query_and_chat.models import CreateChatMessageRequest
|
||||
from onyx.server.query_and_chat.models import OptionalSearchSetting
|
||||
from onyx.server.query_and_chat.models import SendMessageRequest
|
||||
from onyx.server.query_and_chat.streaming_models import AgentResponseDelta
|
||||
from onyx.server.query_and_chat.streaming_models import AgentResponseStart
|
||||
@@ -95,22 +93,6 @@ logger = setup_logger()
|
||||
ERROR_TYPE_CANCELLED = "cancelled"
|
||||
|
||||
|
||||
def _should_enable_slack_search(
|
||||
persona: Persona,
|
||||
filters: BaseFilters | None,
|
||||
) -> bool:
|
||||
"""Determine if Slack search should be enabled.
|
||||
|
||||
Returns True if:
|
||||
- Source type filter exists and includes Slack, OR
|
||||
- Default persona with no source type filter
|
||||
"""
|
||||
source_types = filters.source_type if filters else None
|
||||
return (source_types is not None and DocumentSource.SLACK in source_types) or (
|
||||
persona.id == DEFAULT_PERSONA_ID and source_types is None
|
||||
)
|
||||
|
||||
|
||||
def _extract_project_file_texts_and_images(
|
||||
project_id: int | None,
|
||||
user_id: UUID | None,
|
||||
@@ -299,7 +281,6 @@ def handle_stream_message_objects(
|
||||
# on the `new_msg_req.message`. Currently, requires a state where the last message is a
|
||||
litellm_additional_headers: dict[str, str] | None = None,
|
||||
custom_tool_additional_headers: dict[str, str] | None = None,
|
||||
mcp_headers: dict[str, str] | None = None,
|
||||
bypass_acl: bool = False,
|
||||
# Additional context that should be included in the chat history, for example:
|
||||
# Slack threads where the conversation cannot be represented by a chain of User/Assistant
|
||||
@@ -523,15 +504,11 @@ def handle_stream_message_objects(
|
||||
),
|
||||
bypass_acl=bypass_acl,
|
||||
slack_context=slack_context,
|
||||
enable_slack_search=_should_enable_slack_search(
|
||||
persona, new_msg_req.internal_search_filters
|
||||
),
|
||||
),
|
||||
custom_tool_config=CustomToolConfig(
|
||||
chat_session_id=chat_session.id,
|
||||
message_id=user_message.id if user_message else None,
|
||||
additional_headers=custom_tool_additional_headers,
|
||||
mcp_headers=mcp_headers,
|
||||
),
|
||||
allowed_tool_ids=new_msg_req.allowed_tool_ids,
|
||||
search_usage_forcing_setting=project_search_config.search_usage,
|
||||
@@ -652,7 +629,6 @@ def handle_stream_message_objects(
|
||||
forced_tool_id=forced_tool_id,
|
||||
user_identity=user_identity,
|
||||
chat_session_id=str(chat_session.id),
|
||||
include_citations=new_msg_req.include_citations,
|
||||
)
|
||||
|
||||
except ValueError as e:
|
||||
@@ -743,16 +719,27 @@ def llm_loop_completion_handle(
|
||||
else:
|
||||
final_answer = "The generation was stopped by the user."
|
||||
|
||||
# Build citation_docs_info from accumulated citations in state container
|
||||
citation_docs_info: list[CitationDocInfo] = []
|
||||
seen_citation_nums: set[int] = set()
|
||||
for citation_num, search_doc in state_container.citation_to_doc.items():
|
||||
if citation_num not in seen_citation_nums:
|
||||
seen_citation_nums.add(citation_num)
|
||||
citation_docs_info.append(
|
||||
CitationDocInfo(
|
||||
search_doc=search_doc,
|
||||
citation_number=citation_num,
|
||||
)
|
||||
)
|
||||
|
||||
save_chat_turn(
|
||||
message_text=final_answer,
|
||||
reasoning_tokens=state_container.reasoning_tokens,
|
||||
citation_to_doc=state_container.citation_to_doc,
|
||||
citation_docs_info=citation_docs_info,
|
||||
tool_calls=state_container.tool_calls,
|
||||
all_search_docs=state_container.get_all_search_docs(),
|
||||
db_session=db_session,
|
||||
assistant_message=assistant_message,
|
||||
is_clarification=state_container.is_clarification,
|
||||
emitted_citations=state_container.get_emitted_citations(),
|
||||
)
|
||||
|
||||
|
||||
@@ -803,7 +790,6 @@ def stream_chat_message_objects(
|
||||
parent_message_id=new_msg_req.parent_message_id,
|
||||
chat_session_id=new_msg_req.chat_session_id,
|
||||
origin=new_msg_req.origin,
|
||||
include_citations=new_msg_req.include_citations,
|
||||
)
|
||||
return handle_stream_message_objects(
|
||||
new_msg_req=translated_new_msg_req,
|
||||
|
||||
@@ -18,7 +18,6 @@ from onyx.prompts.prompt_utils import handle_onyx_date_awareness
|
||||
from onyx.prompts.prompt_utils import replace_citation_guidance_tag
|
||||
from onyx.prompts.tool_prompts import GENERATE_IMAGE_GUIDANCE
|
||||
from onyx.prompts.tool_prompts import INTERNAL_SEARCH_GUIDANCE
|
||||
from onyx.prompts.tool_prompts import MEMORY_GUIDANCE
|
||||
from onyx.prompts.tool_prompts import OPEN_URLS_GUIDANCE
|
||||
from onyx.prompts.tool_prompts import PYTHON_TOOL_GUIDANCE
|
||||
from onyx.prompts.tool_prompts import TOOL_DESCRIPTION_SEARCH_GUIDANCE
|
||||
@@ -29,7 +28,6 @@ from onyx.tools.interface import Tool
|
||||
from onyx.tools.tool_implementations.images.image_generation_tool import (
|
||||
ImageGenerationTool,
|
||||
)
|
||||
from onyx.tools.tool_implementations.memory.memory_tool import MemoryTool
|
||||
from onyx.tools.tool_implementations.open_url.open_url_tool import OpenURLTool
|
||||
from onyx.tools.tool_implementations.python.python_tool import PythonTool
|
||||
from onyx.tools.tool_implementations.search.search_tool import SearchTool
|
||||
@@ -180,9 +178,8 @@ def build_system_prompt(
|
||||
site_colon_disabled=WEB_SEARCH_SITE_DISABLED_GUIDANCE
|
||||
)
|
||||
+ OPEN_URLS_GUIDANCE
|
||||
+ PYTHON_TOOL_GUIDANCE
|
||||
+ GENERATE_IMAGE_GUIDANCE
|
||||
+ MEMORY_GUIDANCE
|
||||
+ PYTHON_TOOL_GUIDANCE
|
||||
)
|
||||
return system_prompt
|
||||
|
||||
@@ -196,7 +193,6 @@ def build_system_prompt(
|
||||
has_generate_image = any(
|
||||
isinstance(tool, ImageGenerationTool) for tool in tools
|
||||
)
|
||||
has_memory = any(isinstance(tool, MemoryTool) for tool in tools)
|
||||
|
||||
if has_web_search or has_internal_search or include_all_guidance:
|
||||
system_prompt += TOOL_DESCRIPTION_SEARCH_GUIDANCE
|
||||
@@ -226,7 +222,4 @@ def build_system_prompt(
|
||||
if has_generate_image or include_all_guidance:
|
||||
system_prompt += GENERATE_IMAGE_GUIDANCE
|
||||
|
||||
if has_memory or include_all_guidance:
|
||||
system_prompt += MEMORY_GUIDANCE
|
||||
|
||||
return system_prompt
|
||||
|
||||
@@ -2,9 +2,8 @@ import json
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.chat.chat_state import ChatStateContainer
|
||||
from onyx.chat.chat_state import SearchDocKey
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.context.search.models import CitationDocInfo
|
||||
from onyx.context.search.models import SearchDoc
|
||||
from onyx.db.chat import add_search_docs_to_chat_message
|
||||
from onyx.db.chat import add_search_docs_to_tool_call
|
||||
@@ -20,6 +19,22 @@ from onyx.utils.logger import setup_logger
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def _create_search_doc_key(search_doc: SearchDoc) -> tuple[str, int, tuple[str, ...]]:
|
||||
"""
|
||||
Create a unique key for a SearchDoc that accounts for different versions of the same
|
||||
document/chunk with different match_highlights.
|
||||
|
||||
Args:
|
||||
search_doc: The SearchDoc pydantic model to create a key for
|
||||
|
||||
Returns:
|
||||
A tuple of (document_id, chunk_ind, sorted match_highlights) that uniquely identifies
|
||||
this specific version of the document
|
||||
"""
|
||||
match_highlights_tuple = tuple(sorted(search_doc.match_highlights or []))
|
||||
return (search_doc.document_id, search_doc.chunk_ind, match_highlights_tuple)
|
||||
|
||||
|
||||
def _create_and_link_tool_calls(
|
||||
tool_calls: list[ToolCallInfo],
|
||||
assistant_message: ChatMessage,
|
||||
@@ -139,36 +154,38 @@ def save_chat_turn(
|
||||
message_text: str,
|
||||
reasoning_tokens: str | None,
|
||||
tool_calls: list[ToolCallInfo],
|
||||
citation_to_doc: dict[int, SearchDoc],
|
||||
all_search_docs: dict[SearchDocKey, SearchDoc],
|
||||
citation_docs_info: list[CitationDocInfo],
|
||||
db_session: Session,
|
||||
assistant_message: ChatMessage,
|
||||
is_clarification: bool = False,
|
||||
emitted_citations: set[int] | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Save a chat turn by populating the assistant_message and creating related entities.
|
||||
|
||||
This function:
|
||||
1. Updates the ChatMessage with text, reasoning tokens, and token count
|
||||
2. Creates DB SearchDoc entries from pre-deduplicated all_search_docs
|
||||
3. Builds tool_call -> search_doc mapping for displayed docs
|
||||
4. Builds citation mapping from citation_to_doc
|
||||
5. Links all unique SearchDocs to the ChatMessage
|
||||
2. Creates SearchDoc entries from ToolCall search_docs (for tool calls that returned documents)
|
||||
3. Collects all unique SearchDocs from all tool calls and links them to ChatMessage
|
||||
4. Builds citation mapping from citation_docs_info
|
||||
5. Links all unique SearchDocs from tool calls to the ChatMessage
|
||||
6. Creates ToolCall entries and links SearchDocs to them
|
||||
7. Builds the citations mapping for the ChatMessage
|
||||
|
||||
Deduplication Logic:
|
||||
- SearchDocs are deduplicated using (document_id, chunk_ind, match_highlights) as the key
|
||||
- This ensures that the same document/chunk with different match_highlights (from different
|
||||
queries) are stored as separate SearchDoc entries
|
||||
- Each ToolCall and ChatMessage will map to the correct version of the SearchDoc that
|
||||
matches its specific query highlights
|
||||
|
||||
Args:
|
||||
message_text: The message content to save
|
||||
reasoning_tokens: Optional reasoning tokens for the message
|
||||
tool_calls: List of tool call information to create ToolCall entries (may include search_docs)
|
||||
citation_to_doc: Mapping from citation number to SearchDoc for building citations
|
||||
all_search_docs: Pre-deduplicated search docs from ChatStateContainer
|
||||
citation_docs_info: List of citation document information for building citations mapping
|
||||
db_session: Database session for persistence
|
||||
assistant_message: The ChatMessage object to populate (should already exist in DB)
|
||||
is_clarification: Whether this assistant message is a clarification question (deep research flow)
|
||||
emitted_citations: Set of citation numbers that were actually emitted during streaming.
|
||||
If provided, only citations in this set will be saved; others are filtered out.
|
||||
"""
|
||||
# 1. Update ChatMessage with message content, reasoning tokens, and token count
|
||||
assistant_message.message = message_text
|
||||
@@ -183,53 +200,53 @@ def save_chat_turn(
|
||||
else:
|
||||
assistant_message.token_count = 0
|
||||
|
||||
# 2. Create DB SearchDoc entries from pre-deduplicated all_search_docs
|
||||
search_doc_key_to_id: dict[SearchDocKey, int] = {}
|
||||
for key, search_doc_py in all_search_docs.items():
|
||||
db_search_doc = create_db_search_doc(
|
||||
server_search_doc=search_doc_py,
|
||||
db_session=db_session,
|
||||
commit=False,
|
||||
)
|
||||
search_doc_key_to_id[key] = db_search_doc.id
|
||||
|
||||
# 3. Build tool_call -> search_doc mapping (for displayed docs in each tool call)
|
||||
# 2. Create SearchDoc entries from tool_calls
|
||||
# Build mapping from SearchDoc to DB SearchDoc ID
|
||||
# Use (document_id, chunk_ind, match_highlights) as key to avoid duplicates
|
||||
# while ensuring different versions with different highlights are stored separately
|
||||
search_doc_key_to_id: dict[tuple[str, int, tuple[str, ...]], int] = {}
|
||||
tool_call_to_search_doc_ids: dict[str, list[int]] = {}
|
||||
|
||||
# Process tool calls and their search docs
|
||||
for tool_call_info in tool_calls:
|
||||
if tool_call_info.search_docs:
|
||||
search_doc_ids_for_tool: list[int] = []
|
||||
for search_doc_py in tool_call_info.search_docs:
|
||||
key = ChatStateContainer.create_search_doc_key(search_doc_py)
|
||||
if key in search_doc_key_to_id:
|
||||
search_doc_ids_for_tool.append(search_doc_key_to_id[key])
|
||||
# Create a unique key for this SearchDoc version
|
||||
search_doc_key = _create_search_doc_key(search_doc_py)
|
||||
|
||||
# Check if we've already created this exact SearchDoc version
|
||||
if search_doc_key in search_doc_key_to_id:
|
||||
search_doc_ids_for_tool.append(search_doc_key_to_id[search_doc_key])
|
||||
else:
|
||||
# Displayed doc not in all_search_docs - create it
|
||||
# This can happen if displayed_docs contains docs not in search_docs
|
||||
# Create new DB SearchDoc entry
|
||||
db_search_doc = create_db_search_doc(
|
||||
server_search_doc=search_doc_py,
|
||||
db_session=db_session,
|
||||
commit=False,
|
||||
)
|
||||
search_doc_key_to_id[key] = db_search_doc.id
|
||||
search_doc_key_to_id[search_doc_key] = db_search_doc.id
|
||||
search_doc_ids_for_tool.append(db_search_doc.id)
|
||||
|
||||
tool_call_to_search_doc_ids[tool_call_info.tool_call_id] = list(
|
||||
set(search_doc_ids_for_tool)
|
||||
)
|
||||
|
||||
# Collect all search doc IDs for ChatMessage linking
|
||||
all_search_doc_ids_set: set[int] = set(search_doc_key_to_id.values())
|
||||
# 3. Collect all unique SearchDoc IDs from all tool calls to link to ChatMessage
|
||||
# Use a set to deduplicate by ID (since we've already deduplicated by key above)
|
||||
all_search_doc_ids_set: set[int] = set()
|
||||
for search_doc_ids in tool_call_to_search_doc_ids.values():
|
||||
all_search_doc_ids_set.update(search_doc_ids)
|
||||
|
||||
# 4. Build a citation mapping from the citation number to the saved DB SearchDoc ID
|
||||
# Only include citations that were actually emitted during streaming
|
||||
# 4. Build citation mapping from citation_docs_info
|
||||
citation_number_to_search_doc_id: dict[int, int] = {}
|
||||
|
||||
for citation_num, search_doc_py in citation_to_doc.items():
|
||||
# Skip citations that weren't actually emitted (if emitted_citations is provided)
|
||||
if emitted_citations is not None and citation_num not in emitted_citations:
|
||||
continue
|
||||
for citation_doc_info in citation_docs_info:
|
||||
# Extract SearchDoc pydantic model
|
||||
search_doc_py = citation_doc_info.search_doc
|
||||
|
||||
# Create the unique key for this SearchDoc version
|
||||
search_doc_key = ChatStateContainer.create_search_doc_key(search_doc_py)
|
||||
search_doc_key = _create_search_doc_key(search_doc_py)
|
||||
|
||||
# Get the search doc ID (should already exist from processing tool_calls)
|
||||
if search_doc_key in search_doc_key_to_id:
|
||||
@@ -266,7 +283,10 @@ def save_chat_turn(
|
||||
all_search_doc_ids_set.add(db_search_doc_id)
|
||||
|
||||
# Build mapping from citation number to search doc ID
|
||||
citation_number_to_search_doc_id[citation_num] = db_search_doc_id
|
||||
if citation_doc_info.citation_number is not None:
|
||||
citation_number_to_search_doc_id[citation_doc_info.citation_number] = (
|
||||
db_search_doc_id
|
||||
)
|
||||
|
||||
# 5. Link all unique SearchDocs (from both tool calls and citations) to ChatMessage
|
||||
final_search_doc_ids: list[int] = list(all_search_doc_ids_set)
|
||||
@@ -286,10 +306,23 @@ def save_chat_turn(
|
||||
tool_call_to_search_doc_ids=tool_call_to_search_doc_ids,
|
||||
)
|
||||
|
||||
# 7. Build citations mapping - use the mapping we already built in step 4
|
||||
assistant_message.citations = (
|
||||
citation_number_to_search_doc_id if citation_number_to_search_doc_id else None
|
||||
)
|
||||
# 7. Build citations mapping from citation_docs_info
|
||||
# Any citation_doc_info with a citation_number appeared in the text and should be mapped
|
||||
citations: dict[int, int] = {}
|
||||
for citation_doc_info in citation_docs_info:
|
||||
if citation_doc_info.citation_number is not None:
|
||||
search_doc_id = citation_number_to_search_doc_id.get(
|
||||
citation_doc_info.citation_number
|
||||
)
|
||||
if search_doc_id is not None:
|
||||
citations[citation_doc_info.citation_number] = search_doc_id
|
||||
else:
|
||||
logger.warning(
|
||||
f"Citation number {citation_doc_info.citation_number} found in citation_docs_info "
|
||||
f"but no matching search doc ID in mapping"
|
||||
)
|
||||
|
||||
assistant_message.citations = citations if citations else None
|
||||
|
||||
# Finally save the messages, tool calls, and docs
|
||||
db_session.commit()
|
||||
|
||||
@@ -22,14 +22,6 @@ APP_PORT = 8080
|
||||
# prefix from requests directed towards the API server. In these cases, set this to `/api`
|
||||
APP_API_PREFIX = os.environ.get("API_PREFIX", "")
|
||||
|
||||
# Certain services need to make HTTP requests to the API server, such as the MCP server and Discord bot
|
||||
API_SERVER_PROTOCOL = os.environ.get("API_SERVER_PROTOCOL", "http")
|
||||
API_SERVER_HOST = os.environ.get("API_SERVER_HOST", "127.0.0.1")
|
||||
# This override allows self-hosting the MCP server with Onyx Cloud backend.
|
||||
API_SERVER_URL_OVERRIDE_FOR_HTTP_REQUESTS = os.environ.get(
|
||||
"API_SERVER_URL_OVERRIDE_FOR_HTTP_REQUESTS"
|
||||
)
|
||||
|
||||
# Whether to send user metadata (user_id/email and session_id) to the LLM provider.
|
||||
# Disabled by default.
|
||||
SEND_USER_METADATA_TO_LLM_PROVIDER = (
|
||||
@@ -208,19 +200,8 @@ OPENSEARCH_REST_API_PORT = int(os.environ.get("OPENSEARCH_REST_API_PORT") or 920
|
||||
OPENSEARCH_ADMIN_USERNAME = os.environ.get("OPENSEARCH_ADMIN_USERNAME", "admin")
|
||||
OPENSEARCH_ADMIN_PASSWORD = os.environ.get("OPENSEARCH_ADMIN_PASSWORD", "")
|
||||
|
||||
# This is the "base" config for now, the idea is that at least for our dev
|
||||
# environments we always want to be dual indexing into both OpenSearch and Vespa
|
||||
# to stress test the new codepaths. Only enable this if there is some instance
|
||||
# of OpenSearch running for the relevant Onyx instance.
|
||||
ENABLE_OPENSEARCH_INDEXING_FOR_ONYX = (
|
||||
os.environ.get("ENABLE_OPENSEARCH_INDEXING_FOR_ONYX", "").lower() == "true"
|
||||
)
|
||||
# Given that the "base" config above is true, this enables whether we want to
|
||||
# retrieve from OpenSearch or Vespa. We want to be able to quickly toggle this
|
||||
# in the event we see issues with OpenSearch retrieval in our dev environments.
|
||||
ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX = (
|
||||
ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
|
||||
and os.environ.get("ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX", "").lower() == "true"
|
||||
ENABLE_OPENSEARCH_FOR_ONYX = (
|
||||
os.environ.get("ENABLE_OPENSEARCH_FOR_ONYX", "").lower() == "true"
|
||||
)
|
||||
|
||||
VESPA_HOST = os.environ.get("VESPA_HOST") or "localhost"
|
||||
@@ -749,10 +730,6 @@ JOB_TIMEOUT = 60 * 60 * 6 # 6 hours default
|
||||
LOG_ONYX_MODEL_INTERACTIONS = (
|
||||
os.environ.get("LOG_ONYX_MODEL_INTERACTIONS", "").lower() == "true"
|
||||
)
|
||||
|
||||
PROMPT_CACHE_CHAT_HISTORY = (
|
||||
os.environ.get("PROMPT_CACHE_CHAT_HISTORY", "").lower() == "true"
|
||||
)
|
||||
# If set to `true` will enable additional logs about Vespa query performance
|
||||
# (time spent on finding the right docs + time spent fetching summaries from disk)
|
||||
LOG_VESPA_TIMING_INFORMATION = (
|
||||
@@ -873,7 +850,6 @@ AZURE_IMAGE_DEPLOYMENT_NAME = os.environ.get(
|
||||
|
||||
# configurable image model
|
||||
IMAGE_MODEL_NAME = os.environ.get("IMAGE_MODEL_NAME", "gpt-image-1")
|
||||
IMAGE_MODEL_PROVIDER = os.environ.get("IMAGE_MODEL_PROVIDER", "openai")
|
||||
|
||||
# Use managed Vespa (Vespa Cloud). If set, must also set VESPA_CLOUD_URL, VESPA_CLOUD_CERT_PATH and VESPA_CLOUD_KEY_PATH
|
||||
MANAGED_VESPA = os.environ.get("MANAGED_VESPA", "").lower() == "true"
|
||||
@@ -1026,19 +1002,3 @@ INSTANCE_TYPE = (
|
||||
if os.environ.get("IS_MANAGED_INSTANCE", "").lower() == "true"
|
||||
else "cloud" if AUTH_TYPE == AuthType.CLOUD else "self_hosted"
|
||||
)
|
||||
|
||||
|
||||
## Discord Bot Configuration
|
||||
DISCORD_BOT_TOKEN = os.environ.get("DISCORD_BOT_TOKEN")
|
||||
DISCORD_BOT_INVOKE_CHAR = os.environ.get("DISCORD_BOT_INVOKE_CHAR", "!")
|
||||
|
||||
|
||||
## Stripe Configuration
|
||||
# URL to fetch the Stripe publishable key from a public S3 bucket.
|
||||
# Publishable keys are safe to expose publicly - they can only initialize
|
||||
# Stripe.js and tokenize payment info, not make charges or access data.
|
||||
STRIPE_PUBLISHABLE_KEY_URL = (
|
||||
"https://onyx-stripe-public.s3.amazonaws.com/publishable-key.txt"
|
||||
)
|
||||
# Override for local testing with Stripe test keys (pk_test_*)
|
||||
STRIPE_PUBLISHABLE_KEY_OVERRIDE = os.environ.get("STRIPE_PUBLISHABLE_KEY")
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
|
||||
INPUT_PROMPT_YAML = "./onyx/seeding/input_prompts.yaml"
|
||||
PROMPTS_YAML = "./onyx/seeding/prompts.yaml"
|
||||
PERSONAS_YAML = "./onyx/seeding/personas.yaml"
|
||||
NUM_RETURNED_HITS = 50
|
||||
@@ -11,6 +12,9 @@ NUM_POSTPROCESSED_RESULTS = 20
|
||||
# May be less depending on model
|
||||
MAX_CHUNKS_FED_TO_CHAT = int(os.environ.get("MAX_CHUNKS_FED_TO_CHAT") or 25)
|
||||
|
||||
# Maximum percentage of the context window to fill with selected sections
|
||||
SELECTED_SECTIONS_MAX_WINDOW_PERCENTAGE = 0.8
|
||||
|
||||
# 1 / (1 + DOC_TIME_DECAY * doc-age-in-years), set to 0 to have no decay
|
||||
# Capped in Vespa at 0.5
|
||||
DOC_TIME_DECAY = float(
|
||||
@@ -23,6 +27,11 @@ FAVOR_RECENT_DECAY_MULTIPLIER = 2.0
|
||||
# Currently only applies to search flow not chat
|
||||
CONTEXT_CHUNKS_ABOVE = int(os.environ.get("CONTEXT_CHUNKS_ABOVE") or 1)
|
||||
CONTEXT_CHUNKS_BELOW = int(os.environ.get("CONTEXT_CHUNKS_BELOW") or 1)
|
||||
DISABLE_LLM_QUERY_REPHRASE = (
|
||||
os.environ.get("DISABLE_LLM_QUERY_REPHRASE", "").lower() == "true"
|
||||
)
|
||||
# 1 edit per 20 characters, currently unused due to fuzzy match being too slow
|
||||
QUOTE_ALLOWED_ERROR_PERCENT = 0.05
|
||||
QA_TIMEOUT = int(os.environ.get("QA_TIMEOUT") or "60") # 60 seconds
|
||||
# Weighting factor between Vector and Keyword Search, 1 for completely vector search
|
||||
HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.5)))
|
||||
@@ -37,6 +46,34 @@ TITLE_CONTENT_RATIO = max(
|
||||
0, min(1, float(os.environ.get("TITLE_CONTENT_RATIO") or 0.10))
|
||||
)
|
||||
|
||||
# A list of languages passed to the LLM to rephase the query
|
||||
# For example "English,French,Spanish", be sure to use the "," separator
|
||||
# TODO these are not used, should probably reintroduce these
|
||||
MULTILINGUAL_QUERY_EXPANSION = os.environ.get("MULTILINGUAL_QUERY_EXPANSION") or None
|
||||
LANGUAGE_HINT = "\n" + (
|
||||
os.environ.get("LANGUAGE_HINT")
|
||||
or "IMPORTANT: Respond in the same language as my query!"
|
||||
)
|
||||
LANGUAGE_CHAT_NAMING_HINT = (
|
||||
os.environ.get("LANGUAGE_CHAT_NAMING_HINT")
|
||||
or "The name of the conversation must be in the same language as the user query."
|
||||
)
|
||||
|
||||
# Number of prompts each persona should have
|
||||
NUM_PERSONA_PROMPTS = 4
|
||||
NUM_PERSONA_PROMPT_GENERATION_CHUNKS = 5
|
||||
|
||||
# Agentic search takes significantly more tokens and therefore has much higher cost.
|
||||
# This configuration allows users to get a search-only experience with instant results
|
||||
# and no involvement from the LLM.
|
||||
# Additionally, some LLM providers have strict rate limits which may prohibit
|
||||
# sending many API requests at once (as is done in agentic search).
|
||||
# Whether the LLM should evaluate all of the document chunks passed in for usefulness
|
||||
# in relation to the user query
|
||||
DISABLE_LLM_DOC_RELEVANCE = (
|
||||
os.environ.get("DISABLE_LLM_DOC_RELEVANCE", "").lower() == "true"
|
||||
)
|
||||
|
||||
# Stops streaming answers back to the UI if this pattern is seen:
|
||||
STOP_STREAM_PAT = os.environ.get("STOP_STREAM_PAT") or None
|
||||
|
||||
@@ -49,6 +86,9 @@ HARD_DELETE_CHATS = os.environ.get("HARD_DELETE_CHATS", "").lower() == "true"
|
||||
NUM_INTERNET_SEARCH_RESULTS = int(os.environ.get("NUM_INTERNET_SEARCH_RESULTS") or 10)
|
||||
NUM_INTERNET_SEARCH_CHUNKS = int(os.environ.get("NUM_INTERNET_SEARCH_CHUNKS") or 50)
|
||||
|
||||
# Enable in-house model for detecting connector-based filtering in queries
|
||||
ENABLE_CONNECTOR_CLASSIFIER = os.environ.get("ENABLE_CONNECTOR_CLASSIFIER", False)
|
||||
|
||||
VESPA_SEARCHER_THREADS = int(os.environ.get("VESPA_SEARCHER_THREADS") or 2)
|
||||
|
||||
# Whether or not to use the semantic & keyword search expansions for Basic Search
|
||||
@@ -56,3 +96,5 @@ USE_SEMANTIC_KEYWORD_EXPANSIONS_BASIC_SEARCH = (
|
||||
os.environ.get("USE_SEMANTIC_KEYWORD_EXPANSIONS_BASIC_SEARCH", "false").lower()
|
||||
== "true"
|
||||
)
|
||||
|
||||
USE_DIV_CON_AGENT = os.environ.get("USE_DIV_CON_AGENT", "false").lower() == "true"
|
||||
|
||||
@@ -23,9 +23,6 @@ PUBLIC_DOC_PAT = "PUBLIC"
|
||||
ID_SEPARATOR = ":;:"
|
||||
DEFAULT_BOOST = 0
|
||||
|
||||
# Tag for endpoints that should be included in the public API documentation
|
||||
PUBLIC_API_TAGS: list[str | Enum] = ["public"]
|
||||
|
||||
# Cookies
|
||||
FASTAPI_USERS_AUTH_COOKIE_NAME = (
|
||||
"fastapiusersauth" # Currently a constant, but logic allows for configuration
|
||||
@@ -93,7 +90,6 @@ SSL_CERT_FILE = "bundle.pem"
|
||||
DANSWER_API_KEY_PREFIX = "API_KEY__"
|
||||
DANSWER_API_KEY_DUMMY_EMAIL_DOMAIN = "onyxapikey.ai"
|
||||
UNNAMED_KEY_PLACEHOLDER = "Unnamed"
|
||||
DISCORD_SERVICE_API_KEY_NAME = "discord-bot-service"
|
||||
|
||||
# Key-Value store keys
|
||||
KV_REINDEX_KEY = "needs_reindexing"
|
||||
@@ -153,6 +149,17 @@ CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT = 300 # 5 min
|
||||
|
||||
CELERY_USER_FILE_PROCESSING_LOCK_TIMEOUT = 30 * 60 # 30 minutes (in seconds)
|
||||
|
||||
# How long a queued user-file task is valid before workers discard it.
|
||||
# Should be longer than the beat interval (20 s) but short enough to prevent
|
||||
# indefinite queue growth. Workers drop tasks older than this without touching
|
||||
# the DB, so a shorter value = faster drain of stale duplicates.
|
||||
CELERY_USER_FILE_PROCESSING_TASK_EXPIRES = 60 # 1 minute (in seconds)
|
||||
|
||||
# Maximum number of tasks allowed in the user-file-processing queue before the
|
||||
# beat generator stops adding more. Prevents unbounded queue growth when workers
|
||||
# fall behind.
|
||||
USER_FILE_PROCESSING_MAX_QUEUE_DEPTH = 500
|
||||
|
||||
CELERY_USER_FILE_PROJECT_SYNC_LOCK_TIMEOUT = 5 * 60 # 5 minutes (in seconds)
|
||||
|
||||
DANSWER_REDIS_FUNCTION_LOCK_PREFIX = "da_function_lock:"
|
||||
@@ -423,6 +430,9 @@ class OnyxRedisLocks:
|
||||
# User file processing
|
||||
USER_FILE_PROCESSING_BEAT_LOCK = "da_lock:check_user_file_processing_beat"
|
||||
USER_FILE_PROCESSING_LOCK_PREFIX = "da_lock:user_file_processing"
|
||||
# Short-lived key set when a task is enqueued; cleared when the worker picks it up.
|
||||
# Prevents the beat from re-enqueuing the same file while a task is already queued.
|
||||
USER_FILE_QUEUED_PREFIX = "da_lock:user_file_queued"
|
||||
USER_FILE_PROJECT_SYNC_BEAT_LOCK = "da_lock:check_user_file_project_sync_beat"
|
||||
USER_FILE_PROJECT_SYNC_LOCK_PREFIX = "da_lock:user_file_project_sync"
|
||||
USER_FILE_DELETE_BEAT_LOCK = "da_lock:check_user_file_delete_beat"
|
||||
|
||||
@@ -4,6 +4,8 @@ import os
|
||||
# Onyx Slack Bot Configs
|
||||
#####
|
||||
ONYX_BOT_NUM_RETRIES = int(os.environ.get("ONYX_BOT_NUM_RETRIES", "5"))
|
||||
# How much of the available input context can be used for thread context
|
||||
MAX_THREAD_CONTEXT_PERCENTAGE = 512 * 2 / 3072
|
||||
# Number of docs to display in "Reference Documents"
|
||||
ONYX_BOT_NUM_DOCS_TO_DISPLAY = int(os.environ.get("ONYX_BOT_NUM_DOCS_TO_DISPLAY", "5"))
|
||||
# If the LLM fails to answer, Onyx can still show the "Reference Documents"
|
||||
@@ -45,6 +47,10 @@ ONYX_BOT_MAX_WAIT_TIME = int(os.environ.get("ONYX_BOT_MAX_WAIT_TIME") or 180)
|
||||
# Time (in minutes) after which a Slack message is sent to the user to remind him to give feedback.
|
||||
# Set to 0 to disable it (default)
|
||||
ONYX_BOT_FEEDBACK_REMINDER = int(os.environ.get("ONYX_BOT_FEEDBACK_REMINDER") or 0)
|
||||
# Set to True to rephrase the Slack users messages
|
||||
ONYX_BOT_REPHRASE_MESSAGE = (
|
||||
os.environ.get("ONYX_BOT_REPHRASE_MESSAGE", "").lower() == "true"
|
||||
)
|
||||
|
||||
# ONYX_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD is the number of
|
||||
# responses OnyxBot can send in a given time period.
|
||||
|
||||
@@ -161,8 +161,6 @@ class DocumentBase(BaseModel):
|
||||
sections: list[TextSection | ImageSection]
|
||||
source: DocumentSource | None = None
|
||||
semantic_identifier: str # displayed in the UI as the main identifier for the doc
|
||||
# TODO(andrei): Ideally we could improve this to where each value is just a
|
||||
# list of strings.
|
||||
metadata: dict[str, str | list[str]]
|
||||
|
||||
# UTC time
|
||||
@@ -204,7 +202,13 @@ class DocumentBase(BaseModel):
|
||||
if not self.metadata:
|
||||
return None
|
||||
# Combined string for the key/value for easy filtering
|
||||
return convert_metadata_dict_to_list_of_strings(self.metadata)
|
||||
attributes: list[str] = []
|
||||
for k, v in self.metadata.items():
|
||||
if isinstance(v, list):
|
||||
attributes.extend([k + INDEX_SEPARATOR + vi for vi in v])
|
||||
else:
|
||||
attributes.append(k + INDEX_SEPARATOR + v)
|
||||
return attributes
|
||||
|
||||
def __sizeof__(self) -> int:
|
||||
size = sys.getsizeof(self.id)
|
||||
@@ -236,66 +240,6 @@ class DocumentBase(BaseModel):
|
||||
return " ".join([section.text for section in self.sections if section.text])
|
||||
|
||||
|
||||
def convert_metadata_dict_to_list_of_strings(
|
||||
metadata: dict[str, str | list[str]],
|
||||
) -> list[str]:
|
||||
"""Converts a metadata dict to a list of strings.
|
||||
|
||||
Each string is a key-value pair separated by the INDEX_SEPARATOR. If a key
|
||||
points to a list of values, each value generates a unique pair.
|
||||
|
||||
Args:
|
||||
metadata: The metadata dict to convert where values can be either a
|
||||
string or a list of strings.
|
||||
|
||||
Returns:
|
||||
A list of strings where each string is a key-value pair separated by the
|
||||
INDEX_SEPARATOR.
|
||||
"""
|
||||
attributes: list[str] = []
|
||||
for k, v in metadata.items():
|
||||
if isinstance(v, list):
|
||||
attributes.extend([k + INDEX_SEPARATOR + vi for vi in v])
|
||||
else:
|
||||
attributes.append(k + INDEX_SEPARATOR + v)
|
||||
return attributes
|
||||
|
||||
|
||||
def convert_metadata_list_of_strings_to_dict(
|
||||
metadata_list: list[str],
|
||||
) -> dict[str, str | list[str]]:
|
||||
"""
|
||||
Converts a list of strings to a metadata dict. The inverse of
|
||||
convert_metadata_dict_to_list_of_strings.
|
||||
|
||||
Assumes the input strings are formatted as in the output of
|
||||
convert_metadata_dict_to_list_of_strings.
|
||||
|
||||
The schema of the output metadata dict is suboptimal yet bound to legacy
|
||||
code. Ideally each key would just point to a list of strings, where each
|
||||
list might contain just one element.
|
||||
|
||||
Args:
|
||||
metadata_list: The list of strings to convert to a metadata dict.
|
||||
|
||||
Returns:
|
||||
A metadata dict where values can be either a string or a list of
|
||||
strings.
|
||||
"""
|
||||
metadata: dict[str, str | list[str]] = {}
|
||||
for item in metadata_list:
|
||||
key, value = item.split(INDEX_SEPARATOR, 1)
|
||||
if key in metadata:
|
||||
# We have already seen this key therefore it must point to a list.
|
||||
if isinstance(metadata[key], list):
|
||||
cast(list[str], metadata[key]).append(value)
|
||||
else:
|
||||
metadata[key] = [cast(str, metadata[key]), value]
|
||||
else:
|
||||
metadata[key] = value
|
||||
return metadata
|
||||
|
||||
|
||||
class Document(DocumentBase):
|
||||
"""Used for Onyx ingestion api, the ID is required"""
|
||||
|
||||
|
||||
@@ -13,6 +13,13 @@ class RecencyBiasSetting(str, Enum):
|
||||
AUTO = "auto"
|
||||
|
||||
|
||||
class OptionalSearchSetting(str, Enum):
|
||||
ALWAYS = "always"
|
||||
NEVER = "never"
|
||||
# Determine whether to run search based on history and latest query
|
||||
AUTO = "auto"
|
||||
|
||||
|
||||
class QueryType(str, Enum):
|
||||
"""
|
||||
The type of first-pass query to use for hybrid search.
|
||||
@@ -29,3 +36,15 @@ class SearchType(str, Enum):
|
||||
KEYWORD = "keyword"
|
||||
SEMANTIC = "semantic"
|
||||
INTERNET = "internet"
|
||||
|
||||
|
||||
class LLMEvaluationType(str, Enum):
|
||||
AGENTIC = "agentic" # applies agentic evaluation
|
||||
BASIC = "basic" # applies boolean evaluation
|
||||
SKIP = "skip" # skips evaluation
|
||||
UNSPECIFIED = "unspecified" # reverts to default
|
||||
|
||||
|
||||
class QueryFlow(str, Enum):
|
||||
SEARCH = "search"
|
||||
QUESTION_ANSWER = "question-answer"
|
||||
|
||||
@@ -31,6 +31,7 @@ from onyx.context.search.federated.slack_search_utils import is_recency_query
|
||||
from onyx.context.search.federated.slack_search_utils import should_include_message
|
||||
from onyx.context.search.models import ChunkIndexRequest
|
||||
from onyx.context.search.models import InferenceChunk
|
||||
from onyx.context.search.models import SearchQuery
|
||||
from onyx.db.document import DocumentSource
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.document_index.document_index_utils import (
|
||||
@@ -424,6 +425,7 @@ class SlackQueryResult(BaseModel):
|
||||
|
||||
def query_slack(
|
||||
query_string: str,
|
||||
original_query: SearchQuery,
|
||||
access_token: str,
|
||||
limit: int | None = None,
|
||||
allowed_private_channel: str | None = None,
|
||||
@@ -454,7 +456,7 @@ def query_slack(
|
||||
logger.info(f"Final query to slack: {final_query}")
|
||||
|
||||
# Detect if query asks for most recent results
|
||||
sort_by_time = is_recency_query(query_string)
|
||||
sort_by_time = is_recency_query(original_query.query)
|
||||
|
||||
slack_client = WebClient(token=access_token)
|
||||
try:
|
||||
@@ -534,7 +536,8 @@ def query_slack(
|
||||
)
|
||||
document_id = f"{channel_id}_{message_id}"
|
||||
|
||||
decay_factor = DOC_TIME_DECAY
|
||||
# compute recency bias (parallels vespa calculation) and metadata
|
||||
decay_factor = DOC_TIME_DECAY * original_query.recency_bias_multiplier
|
||||
doc_time = datetime.fromtimestamp(float(message_id))
|
||||
doc_age_years = (datetime.now() - doc_time).total_seconds() / (
|
||||
365 * 24 * 60 * 60
|
||||
@@ -999,6 +1002,7 @@ def slack_retrieval(
|
||||
query_slack,
|
||||
(
|
||||
query_string,
|
||||
query,
|
||||
access_token,
|
||||
query_limit,
|
||||
allowed_private_channel,
|
||||
@@ -1041,6 +1045,7 @@ def slack_retrieval(
|
||||
query_slack,
|
||||
(
|
||||
query_string,
|
||||
query,
|
||||
access_token,
|
||||
query_limit,
|
||||
allowed_private_channel,
|
||||
@@ -1220,6 +1225,7 @@ def slack_retrieval(
|
||||
source_type=DocumentSource.SLACK,
|
||||
title=chunk.title_prefix,
|
||||
boost=0,
|
||||
recency_bias=docid_to_message[document_id].recency_bias,
|
||||
score=convert_slack_score(docid_to_message[document_id].slack_score),
|
||||
hidden=False,
|
||||
is_relevant=None,
|
||||
|
||||
@@ -13,9 +13,7 @@ from onyx.context.search.federated.models import ChannelMetadata
|
||||
from onyx.context.search.models import ChunkIndexRequest
|
||||
from onyx.federated_connectors.slack.models import SlackEntities
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.llm.models import UserMessage
|
||||
from onyx.llm.utils import llm_response_to_string
|
||||
from onyx.natural_language_processing.english_stopwords import ENGLISH_STOPWORDS_SET
|
||||
from onyx.onyxbot.slack.models import ChannelType
|
||||
from onyx.prompts.federated_search import SLACK_DATE_EXTRACTION_PROMPT
|
||||
from onyx.prompts.federated_search import SLACK_QUERY_EXPANSION_PROMPT
|
||||
@@ -114,7 +112,7 @@ def is_recency_query(query: str) -> bool:
|
||||
if not has_recency_keyword:
|
||||
return False
|
||||
|
||||
# Get combined stop words (English + Slack-specific)
|
||||
# Get combined stop words (NLTK + Slack-specific)
|
||||
all_stop_words = _get_combined_stop_words()
|
||||
|
||||
# Extract content words (excluding stop words)
|
||||
@@ -192,7 +190,7 @@ def extract_date_range_from_query(
|
||||
|
||||
try:
|
||||
prompt = SLACK_DATE_EXTRACTION_PROMPT.format(query=query)
|
||||
response = llm_response_to_string(llm.invoke(UserMessage(content=prompt)))
|
||||
response = llm_response_to_string(llm.invoke(prompt))
|
||||
|
||||
response_clean = _parse_llm_code_block_response(response)
|
||||
|
||||
@@ -489,7 +487,7 @@ def build_channel_override_query(channel_references: set[str], time_filter: str)
|
||||
return f"__CHANNEL_OVERRIDE__ {channel_filter}{time_filter}"
|
||||
|
||||
|
||||
# Slack-specific stop words (in addition to standard English stop words)
|
||||
# Slack-specific stop words (in addition to standard NLTK stop words)
|
||||
# These include Slack-specific terms and temporal/recency keywords
|
||||
SLACK_SPECIFIC_STOP_WORDS = frozenset(
|
||||
RECENCY_KEYWORDS
|
||||
@@ -509,16 +507,27 @@ SLACK_SPECIFIC_STOP_WORDS = frozenset(
|
||||
)
|
||||
|
||||
|
||||
def _get_combined_stop_words() -> frozenset[str]:
|
||||
"""Get combined English + Slack-specific stop words.
|
||||
def _get_combined_stop_words() -> set[str]:
|
||||
"""Get combined NLTK + Slack-specific stop words.
|
||||
|
||||
Returns a frozenset of stop words for filtering content words.
|
||||
Returns a set of stop words for filtering content words.
|
||||
Falls back to just Slack-specific stop words if NLTK is unavailable.
|
||||
|
||||
Note: Currently only supports English stop words. Non-English queries
|
||||
may have suboptimal content word extraction. Future enhancement could
|
||||
detect query language and load appropriate stop words.
|
||||
"""
|
||||
return ENGLISH_STOPWORDS_SET | SLACK_SPECIFIC_STOP_WORDS
|
||||
try:
|
||||
from nltk.corpus import stopwords # type: ignore
|
||||
|
||||
# TODO: Support multiple languages - currently hardcoded to English
|
||||
# Could detect language or allow configuration
|
||||
nltk_stop_words = set(stopwords.words("english"))
|
||||
except Exception:
|
||||
# Fallback if NLTK not available
|
||||
nltk_stop_words = set()
|
||||
|
||||
return nltk_stop_words | SLACK_SPECIFIC_STOP_WORDS
|
||||
|
||||
|
||||
def extract_content_words_from_recency_query(
|
||||
@@ -526,7 +535,7 @@ def extract_content_words_from_recency_query(
|
||||
) -> list[str]:
|
||||
"""Extract meaningful content words from a recency query.
|
||||
|
||||
Filters out English stop words, Slack-specific terms, channel references, and proper nouns.
|
||||
Filters out NLTK stop words, Slack-specific terms, channel references, and proper nouns.
|
||||
|
||||
Args:
|
||||
query_text: The user's query text
|
||||
@@ -535,7 +544,7 @@ def extract_content_words_from_recency_query(
|
||||
Returns:
|
||||
List of content words (up to MAX_CONTENT_WORDS)
|
||||
"""
|
||||
# Get combined stop words (English + Slack-specific)
|
||||
# Get combined stop words (NLTK + Slack-specific)
|
||||
all_stop_words = _get_combined_stop_words()
|
||||
|
||||
words = query_text.split()
|
||||
@@ -584,10 +593,8 @@ def expand_query_with_llm(query_text: str, llm: LLM) -> list[str]:
|
||||
Returns:
|
||||
List of rephrased query strings (up to MAX_SLACK_QUERY_EXPANSIONS)
|
||||
"""
|
||||
prompt = UserMessage(
|
||||
content=SLACK_QUERY_EXPANSION_PROMPT.format(
|
||||
query=query_text, max_queries=MAX_SLACK_QUERY_EXPANSIONS
|
||||
)
|
||||
prompt = SLACK_QUERY_EXPANSION_PROMPT.format(
|
||||
query=query_text, max_queries=MAX_SLACK_QUERY_EXPANSIONS
|
||||
)
|
||||
|
||||
try:
|
||||
|
||||
@@ -5,15 +5,27 @@ from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import ConfigDict
|
||||
from pydantic import Field
|
||||
from pydantic import field_validator
|
||||
|
||||
from onyx.configs.chat_configs import NUM_RETURNED_HITS
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.context.search.enums import LLMEvaluationType
|
||||
from onyx.context.search.enums import OptionalSearchSetting
|
||||
from onyx.context.search.enums import SearchType
|
||||
from onyx.db.models import Persona
|
||||
from onyx.db.models import SearchSettings
|
||||
from onyx.indexing.models import BaseChunk
|
||||
from onyx.indexing.models import IndexingSetting
|
||||
from onyx.tools.tool_implementations.web_search.models import WEB_SEARCH_PREFIX
|
||||
from shared_configs.enums import RerankerProvider
|
||||
from shared_configs.model_server_models import Embedding
|
||||
|
||||
|
||||
MAX_METRICS_CONTENT = (
|
||||
200 # Just need enough characters to identify where in the doc the chunk is
|
||||
)
|
||||
|
||||
|
||||
class QueryExpansions(BaseModel):
|
||||
@@ -26,7 +38,6 @@ class QueryExpansionType(Enum):
|
||||
SEMANTIC = "semantic"
|
||||
|
||||
|
||||
# TODO clean up this stuff, reranking is no longer used
|
||||
class RerankingDetails(BaseModel):
|
||||
# If model is None (or num_rerank is 0), then reranking is turned off
|
||||
rerank_model_name: str | None
|
||||
@@ -120,6 +131,13 @@ class IndexFilters(BaseFilters, UserFileFilters):
|
||||
tenant_id: str | None = None
|
||||
|
||||
|
||||
class ChunkMetric(BaseModel):
|
||||
document_id: str
|
||||
chunk_content_start: str
|
||||
first_link: str | None
|
||||
score: float
|
||||
|
||||
|
||||
class ChunkContext(BaseModel):
|
||||
# If not specified (None), picked up from Persona settings if there is space
|
||||
# if specified (even if 0), it always uses the specified number of chunks above and below
|
||||
@@ -144,6 +162,10 @@ class BasicChunkRequest(BaseModel):
|
||||
# In case some queries favor recency more than other queries.
|
||||
recency_bias_multiplier: float = 1.0
|
||||
|
||||
# Sometimes we may want to extract specific keywords from a more semantic query for
|
||||
# a better keyword search.
|
||||
query_keywords: list[str] | None = None # Not used currently
|
||||
|
||||
limit: int | None = None
|
||||
offset: int | None = None # This one is not set currently
|
||||
|
||||
@@ -162,8 +184,6 @@ class ChunkIndexRequest(BasicChunkRequest):
|
||||
# Calculated final filters
|
||||
filters: IndexFilters
|
||||
|
||||
query_keywords: list[str] | None = None
|
||||
|
||||
|
||||
class ContextExpansionType(str, Enum):
|
||||
NOT_RELEVANT = "not_relevant"
|
||||
@@ -172,18 +192,94 @@ class ContextExpansionType(str, Enum):
|
||||
FULL_DOCUMENT = "full_document"
|
||||
|
||||
|
||||
class SearchRequest(ChunkContext):
    """Caller-facing search parameters.

    Fields left unset are resolved during preprocessing, generally falling back
    to Persona settings and then to global defaults.
    """

    query: str

    expanded_queries: QueryExpansions | None = None
    original_query: str | None = None

    search_type: SearchType = SearchType.SEMANTIC

    human_selected_filters: BaseFilters | None = None
    user_file_filters: UserFileFilters | None = None
    enable_auto_detect_filters: bool | None = None
    persona: Persona | None = None

    # if None, no offset / limit
    offset: int | None = None
    limit: int | None = None

    multilingual_expansion: list[str] | None = None
    recency_bias_multiplier: float = 1.0
    hybrid_alpha: float | None = None
    rerank_settings: RerankingDetails | None = None
    evaluation_type: LLMEvaluationType = LLMEvaluationType.UNSPECIFIED
    # Persona comes from onyx.db.models (not a pydantic type), hence
    # arbitrary_types_allowed
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Optionally precomputed elsewhere (e.g. in parallel with other heavy work)
    # so the embedding / query-analysis models don't have to run again here.
    precomputed_query_embedding: Embedding | None = None
    precomputed_is_keyword: bool | None = None
    precomputed_keywords: list[str] | None = None
|
||||
|
||||
|
||||
class SearchQuery(ChunkContext):
    """Fully-resolved search parameters produced by preprocessing.

    Unlike SearchRequest, settings with defaults have all been finalized here.
    The model is frozen (immutable after construction).
    """

    query: str
    processed_keywords: list[str]
    search_type: SearchType
    evaluation_type: LLMEvaluationType
    filters: IndexFilters

    # by this point, the chunks_above and chunks_below must be set
    chunks_above: int
    chunks_below: int

    rerank_settings: RerankingDetails | None
    hybrid_alpha: float
    recency_bias_multiplier: float

    # Only used if LLM evaluation type is not skip, None to use default settings
    max_llm_filter_sections: int

    num_hits: int = NUM_RETURNED_HITS
    offset: int = 0
    model_config = ConfigDict(frozen=True)

    # Avoids re-embedding the query when the embedding was computed upstream
    precomputed_query_embedding: Embedding | None = None

    expanded_queries: QueryExpansions | None = None
    original_query: str | None
|
||||
|
||||
|
||||
class RetrievalDetails(ChunkContext):
    """Per-request retrieval options layered on top of Persona defaults."""

    # Use LLM to determine whether to do a retrieval or only rely on existing history
    # If the Persona is configured to not run search (0 chunks), this is bypassed
    # If no Prompt is configured, the only search results are shown, this is bypassed
    run_search: OptionalSearchSetting = OptionalSearchSetting.AUTO
    # Is this a real-time/streaming call or a question where Onyx can take more time?
    # Used to determine reranking flow
    real_time: bool = True
    # The following have defaults in the Persona settings which can be overridden via
    # the query, if None, then use Persona settings
    filters: BaseFilters | None = None
    enable_auto_detect_filters: bool | None = None
    # if None, no offset / limit
    offset: int | None = None
    limit: int | None = None

    # If this is set, only the highest matching chunk (or merged chunks) is returned
    dedupe_docs: bool = False
|
||||
|
||||
|
||||
class InferenceChunk(BaseChunk):
|
||||
document_id: str
|
||||
source_type: DocumentSource
|
||||
semantic_identifier: str
|
||||
title: str | None # Separate from Semantic Identifier though often same
|
||||
boost: int
|
||||
recency_bias: float
|
||||
score: float | None
|
||||
hidden: bool
|
||||
is_relevant: bool | None = None
|
||||
relevance_explanation: str | None = None
|
||||
# TODO(andrei): Ideally we could improve this to where each value is just a
|
||||
# list of strings.
|
||||
metadata: dict[str, str | list[str]]
|
||||
# Matched sections in the chunk. Uses Vespa syntax e.g. <hi>TEXT</hi>
|
||||
# to specify that a set of words should be highlighted. For example:
|
||||
@@ -370,10 +466,6 @@ class SearchDocsResponse(BaseModel):
|
||||
# document id is the most staightforward way.
|
||||
citation_mapping: dict[int, str]
|
||||
|
||||
# For cases where the frontend only needs to display a subset of the search docs
|
||||
# The whole list is typically still needed for later steps but this set should be saved separately
|
||||
displayed_docs: list[SearchDoc] | None = None
|
||||
|
||||
|
||||
class SavedSearchDoc(SearchDoc):
|
||||
db_doc_id: int
|
||||
@@ -432,8 +524,25 @@ class SavedSearchDoc(SearchDoc):
|
||||
return self_score < other_score
|
||||
|
||||
|
||||
class CitationDocInfo(BaseModel):
    """Associates a search doc with its citation number."""

    search_doc: SearchDoc
    # NOTE(review): None presumably means no citation number was assigned to
    # this doc — confirm against the code that builds these
    citation_number: int | None
|
||||
|
||||
|
||||
class SavedSearchDocWithContent(SavedSearchDoc):
    """Used for endpoints that need to return the actual contents of the retrieved
    section in addition to the match_highlights."""

    # Full text of the retrieved section
    content: str
|
||||
|
||||
|
||||
class RetrievalMetricsContainer(BaseModel):
    """Bundle of per-chunk retrieval metrics for a single search."""

    search_type: SearchType
    metrics: list[ChunkMetric]  # This contains the scores for retrieval as well
|
||||
|
||||
|
||||
class RerankMetricsContainer(BaseModel):
    """The score held by this is the un-boosted, averaged score of the ensemble cross-encoders"""

    metrics: list[ChunkMetric]
    # NOTE(review): presumably parallel to `metrics` (one raw cross-encoder
    # score per chunk) — confirm against the reranking code that fills this in
    raw_similarity_scores: list[float]
|
||||
|
||||
@@ -19,7 +19,6 @@ from onyx.db.models import Persona
|
||||
from onyx.db.models import User
|
||||
from onyx.document_index.interfaces import DocumentIndex
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.natural_language_processing.english_stopwords import strip_stopwords
|
||||
from onyx.secondary_llm_flows.source_filter import extract_source_filter
|
||||
from onyx.secondary_llm_flows.time_filter import extract_time_filter
|
||||
from onyx.utils.logger import setup_logger
|
||||
@@ -279,16 +278,12 @@ def search_pipeline(
|
||||
bypass_acl=chunk_search_request.bypass_acl,
|
||||
)
|
||||
|
||||
query_keywords = strip_stopwords(chunk_search_request.query)
|
||||
|
||||
query_request = ChunkIndexRequest(
|
||||
query=chunk_search_request.query,
|
||||
hybrid_alpha=chunk_search_request.hybrid_alpha,
|
||||
recency_bias_multiplier=chunk_search_request.recency_bias_multiplier,
|
||||
query_keywords=query_keywords,
|
||||
query_keywords=chunk_search_request.query_keywords,
|
||||
filters=filters,
|
||||
limit=chunk_search_request.limit,
|
||||
offset=chunk_search_request.offset,
|
||||
)
|
||||
|
||||
retrieved_chunks = search_chunks(
|
||||
|
||||
272
backend/onyx/context/search/preprocessing/preprocessing.py
Normal file
272
backend/onyx/context/search/preprocessing/preprocessing.py
Normal file
@@ -0,0 +1,272 @@
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.configs.chat_configs import BASE_RECENCY_DECAY
|
||||
from onyx.configs.chat_configs import CONTEXT_CHUNKS_ABOVE
|
||||
from onyx.configs.chat_configs import CONTEXT_CHUNKS_BELOW
|
||||
from onyx.configs.chat_configs import DISABLE_LLM_DOC_RELEVANCE
|
||||
from onyx.configs.chat_configs import FAVOR_RECENT_DECAY_MULTIPLIER
|
||||
from onyx.configs.chat_configs import HYBRID_ALPHA
|
||||
from onyx.configs.chat_configs import HYBRID_ALPHA_KEYWORD
|
||||
from onyx.configs.chat_configs import NUM_POSTPROCESSED_RESULTS
|
||||
from onyx.configs.chat_configs import NUM_RETURNED_HITS
|
||||
from onyx.context.search.enums import LLMEvaluationType
|
||||
from onyx.context.search.enums import RecencyBiasSetting
|
||||
from onyx.context.search.enums import SearchType
|
||||
from onyx.context.search.models import BaseFilters
|
||||
from onyx.context.search.models import IndexFilters
|
||||
from onyx.context.search.models import RerankingDetails
|
||||
from onyx.context.search.models import SearchQuery
|
||||
from onyx.context.search.models import SearchRequest
|
||||
from onyx.context.search.preprocessing.access_filters import (
|
||||
build_access_filters_for_user,
|
||||
)
|
||||
from onyx.context.search.utils import (
|
||||
remove_stop_words_and_punctuation,
|
||||
)
|
||||
from onyx.db.models import User
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.natural_language_processing.search_nlp_models import QueryAnalysisModel
|
||||
from onyx.secondary_llm_flows.source_filter import extract_source_filter
|
||||
from onyx.secondary_llm_flows.time_filter import extract_time_filter
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.threadpool_concurrency import FunctionCall
|
||||
from onyx.utils.threadpool_concurrency import run_functions_in_parallel
|
||||
from onyx.utils.timing import log_function_time
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def query_analysis(query: str) -> tuple[bool, list[str]]:
    """Run the query-analysis model on *query*.

    Returns the model's prediction: a flag indicating whether the query is
    keyword-style, plus the list of extracted keywords.
    """
    return QueryAnalysisModel().predict(query)
|
||||
|
||||
|
||||
# TODO: This is unused code.
@log_function_time(print_only=True)
def retrieval_preprocessing(
    search_request: SearchRequest,
    user: User | None,
    llm: LLM,
    skip_query_analysis: bool,
    db_session: Session,
    favor_recent_decay_multiplier: float = FAVOR_RECENT_DECAY_MULTIPLIER,
    base_recency_decay: float = BASE_RECENCY_DECAY,
    bypass_acl: bool = False,
) -> SearchQuery:
    """Resolve a SearchRequest into a fully-specified, frozen SearchQuery.

    Logic is as follows:
    Any global disables apply first
    Then any filters or settings as part of the query are used
    Then defaults to Persona settings if not specified by the query
    """
    query = search_request.query
    limit = search_request.limit
    offset = search_request.offset
    persona = search_request.persona

    # Explicit user-selected filters take precedence; persona document sets
    # only apply when the user did not pick any document sets themselves
    preset_filters = search_request.human_selected_filters or BaseFilters()
    if persona and persona.document_sets and preset_filters.document_set is None:
        preset_filters.document_set = [
            document_set.name for document_set in persona.document_sets
        ]

    time_filter = preset_filters.time_cutoff
    if time_filter is None and persona:
        time_filter = persona.search_start_date

    source_filter = preset_filters.source_type

    # Decide whether to spend LLM calls auto-extracting time/source filters
    auto_detect_time_filter = True
    auto_detect_source_filter = True
    if not search_request.enable_auto_detect_filters:
        logger.debug("Retrieval details disables auto detect filters")
        auto_detect_time_filter = False
        auto_detect_source_filter = False
    elif persona and persona.llm_filter_extraction is False:
        logger.debug("Persona disables auto detect filters")
        auto_detect_time_filter = False
        auto_detect_source_filter = False
    else:
        logger.debug("Auto detect filters enabled")

    # An explicitly provided filter beats auto-detection
    if (
        time_filter is not None
        and persona
        and persona.recency_bias != RecencyBiasSetting.AUTO
    ):
        auto_detect_time_filter = False
        logger.debug("Not extract time filter - already provided")
    if source_filter is not None:
        logger.debug("Not extract source filter - already provided")
        auto_detect_source_filter = False

    # Based on the query figure out if we should apply any hard time filters /
    # if we should bias more recent docs even more strongly
    run_time_filters = (
        FunctionCall(extract_time_filter, (query, llm), {})
        if auto_detect_time_filter
        else None
    )

    # Based on the query, figure out if we should apply any source filters
    run_source_filters = (
        FunctionCall(extract_source_filter, (query, llm, db_session), {})
        if auto_detect_source_filter
        else None
    )

    # Sometimes this is pre-computed in parallel with other heavy tasks to improve
    # latency, and in that case we don't need to run the model again
    run_query_analysis = (
        None
        if (skip_query_analysis or search_request.precomputed_is_keyword is not None)
        else FunctionCall(query_analysis, (query,), {})
    )

    # Fan the LLM / model calls out in parallel to cut latency
    functions_to_run = [
        filter_fn
        for filter_fn in [
            run_time_filters,
            run_source_filters,
            run_query_analysis,
        ]
        if filter_fn
    ]
    parallel_results = run_functions_in_parallel(functions_to_run)

    predicted_time_cutoff, predicted_favor_recent = (
        parallel_results[run_time_filters.result_id]
        if run_time_filters
        else (None, None)
    )
    predicted_source_filters = (
        parallel_results[run_source_filters.result_id] if run_source_filters else None
    )

    # The extracted keywords right now are not very reliable, not using for now
    # Can maybe use for highlighting
    is_keyword, _extracted_keywords = False, None
    if search_request.precomputed_is_keyword is not None:
        is_keyword = search_request.precomputed_is_keyword
        _extracted_keywords = search_request.precomputed_keywords
    elif run_query_analysis:
        is_keyword, _extracted_keywords = parallel_results[run_query_analysis.result_id]

    all_query_terms = query.split()
    processed_keywords = (
        remove_stop_words_and_punctuation(all_query_terms)
        # If the user is using a different language, don't edit the query or remove english stopwords
        if not search_request.multilingual_expansion
        else all_query_terms
    )

    user_acl_filters = (
        None if bypass_acl else build_access_filters_for_user(user, db_session)
    )
    # Merge user-file filters from the request with files attached to the persona
    user_file_filters = search_request.user_file_filters
    user_file_ids = (user_file_filters.user_file_ids or []) if user_file_filters else []
    if persona and persona.user_files:
        user_file_ids = list(
            set(user_file_ids) | set([file.id for file in persona.user_files])
        )

    # Explicit (preset) filters win; auto-detected values only fill gaps
    final_filters = IndexFilters(
        user_file_ids=user_file_ids,
        project_id=user_file_filters.project_id if user_file_filters else None,
        source_type=preset_filters.source_type or predicted_source_filters,
        document_set=preset_filters.document_set,
        time_cutoff=time_filter or predicted_time_cutoff,
        tags=preset_filters.tags,  # Tags are never auto-extracted
        access_control_list=user_acl_filters,
        tenant_id=get_current_tenant_id() if MULTI_TENANT else None,
        # kg_entities=preset_filters.kg_entities,
        # kg_relationships=preset_filters.kg_relationships,
        # kg_terms=preset_filters.kg_terms,
        # kg_sources=preset_filters.kg_sources,
        # kg_chunk_id_zero_only=preset_filters.kg_chunk_id_zero_only,
    )

    # Request-level evaluation type beats persona; global disable beats both
    llm_evaluation_type = LLMEvaluationType.BASIC
    if search_request.evaluation_type is not LLMEvaluationType.UNSPECIFIED:
        llm_evaluation_type = search_request.evaluation_type

    elif persona:
        llm_evaluation_type = (
            LLMEvaluationType.BASIC
            if persona.llm_relevance_filter
            else LLMEvaluationType.SKIP
        )

    if DISABLE_LLM_DOC_RELEVANCE:
        # NOTE(review): enum members are normally always truthy, so this log
        # presumably fires even when the type is already SKIP — confirm whether
        # `llm_evaluation_type != LLMEvaluationType.SKIP` was intended
        if llm_evaluation_type:
            logger.info(
                "LLM chunk filtering would have run but has been globally disabled"
            )
        llm_evaluation_type = LLMEvaluationType.SKIP

    rerank_settings = search_request.rerank_settings
    # If not explicitly specified by the query, use the current settings
    if rerank_settings is None:
        search_settings = get_current_search_settings(db_session)

        # For non-streaming flows, the rerank settings are applied at the search_request level
        if not search_settings.disable_rerank_for_streaming:
            rerank_settings = RerankingDetails.from_db_model(search_settings)

    # Decays at 1 / (1 + (multiplier * num years))
    if persona and persona.recency_bias == RecencyBiasSetting.NO_DECAY:
        recency_bias_multiplier = 0.0
    elif persona and persona.recency_bias == RecencyBiasSetting.BASE_DECAY:
        recency_bias_multiplier = base_recency_decay
    elif persona and persona.recency_bias == RecencyBiasSetting.FAVOR_RECENT:
        recency_bias_multiplier = base_recency_decay * favor_recent_decay_multiplier
    else:
        # RecencyBiasSetting.AUTO (or no persona): follow the LLM's prediction
        if predicted_favor_recent:
            recency_bias_multiplier = base_recency_decay * favor_recent_decay_multiplier
        else:
            recency_bias_multiplier = base_recency_decay

    # Keyword-style queries lean more heavily on the keyword score component
    hybrid_alpha = HYBRID_ALPHA_KEYWORD if is_keyword else HYBRID_ALPHA
    if search_request.hybrid_alpha:
        hybrid_alpha = search_request.hybrid_alpha

    # Search request overrides anything else as it's explicitly set by the request
    # If not explicitly specified, use the persona settings if they exist
    # Otherwise, use the global defaults
    chunks_above = (
        search_request.chunks_above
        if search_request.chunks_above is not None
        else (persona.chunks_above if persona else CONTEXT_CHUNKS_ABOVE)
    )
    chunks_below = (
        search_request.chunks_below
        if search_request.chunks_below is not None
        else (persona.chunks_below if persona else CONTEXT_CHUNKS_BELOW)
    )

    return SearchQuery(
        query=query,
        original_query=search_request.original_query,
        processed_keywords=processed_keywords,
        search_type=SearchType.KEYWORD if is_keyword else SearchType.SEMANTIC,
        evaluation_type=llm_evaluation_type,
        filters=final_filters,
        hybrid_alpha=hybrid_alpha,
        recency_bias_multiplier=recency_bias_multiplier,
        num_hits=limit if limit is not None else NUM_RETURNED_HITS,
        offset=offset or 0,
        rerank_settings=rerank_settings,
        # Should match the LLM filtering to the same as the reranked, it's understood as this is the number of results
        # the user wants to do heavier processing on, so do the same for the LLM if reranking is on
        # if no reranking settings are set, then use the global default
        max_llm_filter_sections=(
            rerank_settings.num_rerank if rerank_settings else NUM_POSTPROCESSED_RESULTS
        ),
        chunks_above=chunks_above,
        chunks_below=chunks_below,
        full_doc=search_request.full_doc,
        precomputed_query_embedding=search_request.precomputed_query_embedding,
        expanded_queries=search_request.expanded_queries,
    )
|
||||
@@ -1,28 +1,98 @@
|
||||
import string
|
||||
from collections.abc import Callable
|
||||
from uuid import UUID
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.configs.chat_configs import HYBRID_ALPHA
|
||||
from onyx.configs.chat_configs import NUM_RETURNED_HITS
|
||||
from onyx.context.search.enums import SearchType
|
||||
from onyx.context.search.models import ChunkIndexRequest
|
||||
from onyx.context.search.models import ChunkMetric
|
||||
from onyx.context.search.models import IndexFilters
|
||||
from onyx.context.search.models import InferenceChunk
|
||||
from onyx.context.search.models import InferenceSection
|
||||
from onyx.context.search.models import MAX_METRICS_CONTENT
|
||||
from onyx.context.search.models import QueryExpansionType
|
||||
from onyx.context.search.models import RetrievalMetricsContainer
|
||||
from onyx.context.search.models import SearchQuery
|
||||
from onyx.context.search.preprocessing.preprocessing import HYBRID_ALPHA
|
||||
from onyx.context.search.preprocessing.preprocessing import HYBRID_ALPHA_KEYWORD
|
||||
from onyx.context.search.utils import get_query_embedding
|
||||
from onyx.context.search.utils import get_query_embeddings
|
||||
from onyx.context.search.utils import inference_section_from_chunks
|
||||
from onyx.db.search_settings import get_multilingual_expansion
|
||||
from onyx.document_index.interfaces import DocumentIndex
|
||||
from onyx.document_index.interfaces import VespaChunkRequest
|
||||
from onyx.document_index.vespa.shared_utils.utils import (
|
||||
replace_invalid_doc_id_characters,
|
||||
)
|
||||
from onyx.federated_connectors.federated_retrieval import (
|
||||
get_federated_retrieval_functions,
|
||||
)
|
||||
from onyx.secondary_llm_flows.query_expansion import multilingual_query_expansion
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
|
||||
from onyx.utils.threadpool_concurrency import run_in_background
|
||||
from onyx.utils.threadpool_concurrency import TimeoutThread
|
||||
from onyx.utils.threadpool_concurrency import wait_on_background
|
||||
from onyx.utils.timing import log_function_time
|
||||
from shared_configs.model_server_models import Embedding
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def _dedupe_chunks(
|
||||
chunks: list[InferenceChunk],
|
||||
) -> list[InferenceChunk]:
|
||||
used_chunks: dict[tuple[str, int], InferenceChunk] = {}
|
||||
for chunk in chunks:
|
||||
key = (chunk.document_id, chunk.chunk_id)
|
||||
if key not in used_chunks:
|
||||
used_chunks[key] = chunk
|
||||
else:
|
||||
stored_chunk_score = used_chunks[key].score or 0
|
||||
this_chunk_score = chunk.score or 0
|
||||
if stored_chunk_score < this_chunk_score:
|
||||
used_chunks[key] = chunk
|
||||
|
||||
return list(used_chunks.values())
|
||||
|
||||
|
||||
def download_nltk_data() -> None:
    """Ensure the NLTK resources this module relies on are available locally.

    Checks each required resource and downloads any that are missing.
    Download failures are logged rather than raised.
    """
    import nltk  # type: ignore[import-untyped]

    required = {
        "stopwords": "corpora/stopwords",
        # "wordnet": "corpora/wordnet", # Not in use
        "punkt_tab": "tokenizers/punkt_tab",
    }

    for name, lookup_path in required.items():
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            # Missing locally — attempt a quiet download, best-effort
            try:
                logger.info(f"Downloading {name}...")
                nltk.download(name, quiet=True)
                logger.info(f"{name} downloaded successfully.")
            except Exception as e:
                logger.error(f"Failed to download {name}. Error: {e}")
        else:
            logger.info(f"{name} is already downloaded.")
|
||||
|
||||
|
||||
def lemmatize_text(keywords: list[str]) -> list[str]:
    """Deliberately disabled: lemmatization is not part of the current flow.

    The previous WordNet-based implementation (lemmatize each token of the
    joined query and union the results with the original keywords) was
    removed; any caller reaching this is a bug.
    """
    raise NotImplementedError("Lemmatization should not be used currently")
|
||||
|
||||
|
||||
def combine_retrieval_results(
|
||||
chunk_sets: list[list[InferenceChunk]],
|
||||
) -> list[InferenceChunk]:
|
||||
@@ -47,6 +117,313 @@ def combine_retrieval_results(
|
||||
return sorted_chunks
|
||||
|
||||
|
||||
# TODO: This is unused code.
@log_function_time(print_only=True)
def doc_index_retrieval(
    query: SearchQuery,
    document_index: DocumentIndex,
    db_session: Session,
) -> list[InferenceChunk]:
    """
    This function performs the search to retrieve the chunks,
    extracts chunks from the large chunks, persists the scores
    from the large chunks to the referenced chunks,
    dedupes the chunks, and cleans the chunks.
    """
    query_embedding = query.precomputed_query_embedding or get_query_embedding(
        query.query, db_session
    )

    # Background-thread handles; populated only on the paths that need them
    keyword_embeddings_thread: TimeoutThread[list[Embedding]] | None = None
    semantic_embeddings_thread: TimeoutThread[list[Embedding]] | None = None
    top_base_chunks_standard_ranking_thread: (
        TimeoutThread[list[InferenceChunk]] | None
    ) = None

    top_semantic_chunks_thread: TimeoutThread[list[InferenceChunk]] | None = None

    keyword_embeddings: list[Embedding] | None = None
    semantic_embeddings: list[Embedding] | None = None

    top_semantic_chunks: list[InferenceChunk] | None = None

    # original retrieval method — always runs, in the background
    top_base_chunks_standard_ranking_thread = run_in_background(
        document_index.hybrid_retrieval,
        query.query,
        query_embedding,
        query.processed_keywords,
        query.filters,
        query.hybrid_alpha,
        query.recency_bias_multiplier,
        query.num_hits,
        QueryExpansionType.SEMANTIC,
        query.offset,
    )

    # Expanded-query path: additionally retrieve with the first keyword and
    # (for semantic searches) first semantic expansion, then merge
    if (
        query.expanded_queries
        and query.expanded_queries.keywords_expansions
        and query.expanded_queries.semantic_expansions
    ):

        keyword_embeddings_thread = run_in_background(
            get_query_embeddings,
            query.expanded_queries.keywords_expansions,
            db_session,
        )

        if query.search_type == SearchType.SEMANTIC:
            semantic_embeddings_thread = run_in_background(
                get_query_embeddings,
                query.expanded_queries.semantic_expansions,
                db_session,
            )

        keyword_embeddings = wait_on_background(keyword_embeddings_thread)
        if query.search_type == SearchType.SEMANTIC:
            assert semantic_embeddings_thread is not None
            semantic_embeddings = wait_on_background(semantic_embeddings_thread)

        # Use original query embedding for keyword retrieval embedding
        # NOTE(review): this overwrites the keyword-expansion embeddings that
        # were just awaited above, so that background call's result is
        # discarded — confirm whether the embedding thread is still needed
        keyword_embeddings = [query_embedding]

        # Note: we generally prepped earlier for multiple expansions, but for now we only use one.
        top_keyword_chunks_thread = run_in_background(
            document_index.hybrid_retrieval,
            query.expanded_queries.keywords_expansions[0],
            keyword_embeddings[0],
            query.processed_keywords,
            query.filters,
            HYBRID_ALPHA_KEYWORD,
            query.recency_bias_multiplier,
            query.num_hits,
            QueryExpansionType.KEYWORD,
            query.offset,
        )

        if query.search_type == SearchType.SEMANTIC:
            assert semantic_embeddings is not None

            top_semantic_chunks_thread = run_in_background(
                document_index.hybrid_retrieval,
                query.expanded_queries.semantic_expansions[0],
                semantic_embeddings[0],
                query.processed_keywords,
                query.filters,
                HYBRID_ALPHA,
                query.recency_bias_multiplier,
                query.num_hits,
                QueryExpansionType.SEMANTIC,
                query.offset,
            )

        top_base_chunks_standard_ranking = wait_on_background(
            top_base_chunks_standard_ranking_thread
        )

        top_keyword_chunks = wait_on_background(top_keyword_chunks_thread)

        if query.search_type == SearchType.SEMANTIC:
            assert top_semantic_chunks_thread is not None
            top_semantic_chunks = wait_on_background(top_semantic_chunks_thread)

        all_top_chunks = top_base_chunks_standard_ranking + top_keyword_chunks

        # use all three retrieval methods to retrieve top chunks

        if query.search_type == SearchType.SEMANTIC and top_semantic_chunks is not None:

            all_top_chunks += top_semantic_chunks

        top_chunks = _dedupe_chunks(all_top_chunks)

    else:

        # No expansions: just wait for the standard retrieval and dedupe
        top_base_chunks_standard_ranking = wait_on_background(
            top_base_chunks_standard_ranking_thread
        )

        top_chunks = _dedupe_chunks(top_base_chunks_standard_ranking)

    logger.info(f"Overall number of top initial retrieval chunks: {len(top_chunks)}")

    # Split results into normal chunks and "large chunks" that reference a
    # range of underlying chunks which must be fetched separately
    retrieval_requests: list[VespaChunkRequest] = []
    normal_chunks: list[InferenceChunk] = []
    referenced_chunk_scores: dict[tuple[str, int], float] = {}
    for chunk in top_chunks:
        if chunk.large_chunk_reference_ids:
            retrieval_requests.append(
                VespaChunkRequest(
                    document_id=replace_invalid_doc_id_characters(chunk.document_id),
                    min_chunk_ind=chunk.large_chunk_reference_ids[0],
                    max_chunk_ind=chunk.large_chunk_reference_ids[-1],
                )
            )
            # for each referenced chunk, persist the
            # highest score to the referenced chunk
            for chunk_id in chunk.large_chunk_reference_ids:
                key = (chunk.document_id, chunk_id)
                referenced_chunk_scores[key] = max(
                    referenced_chunk_scores.get(key, 0), chunk.score or 0
                )
        else:
            normal_chunks.append(chunk)

    # If there are no large chunks, just return the normal chunks
    if not retrieval_requests:
        return normal_chunks

    # Retrieve and return the referenced normal chunks from the large chunks
    retrieved_inference_chunks = document_index.id_based_retrieval(
        chunk_requests=retrieval_requests,
        filters=query.filters,
        batch_retrieval=True,
    )

    # Apply the scores from the large chunks to the chunks referenced
    # by each large chunk
    for chunk in retrieved_inference_chunks:
        if (chunk.document_id, chunk.chunk_id) in referenced_chunk_scores:
            chunk.score = referenced_chunk_scores[(chunk.document_id, chunk.chunk_id)]
            referenced_chunk_scores.pop((chunk.document_id, chunk.chunk_id))
        else:
            logger.error(
                f"Chunk {chunk.document_id} {chunk.chunk_id} not found in referenced chunk scores"
            )

    # Log any chunks that were not found in the retrieved chunks
    for reference in referenced_chunk_scores.keys():
        logger.error(f"Chunk {reference} not found in retrieved chunks")

    unique_chunks: dict[tuple[str, int], InferenceChunk] = {
        (chunk.document_id, chunk.chunk_id): chunk for chunk in normal_chunks
    }

    # persist the highest score of each deduped chunk
    for chunk in retrieved_inference_chunks:
        key = (chunk.document_id, chunk.chunk_id)
        # For duplicates, keep the highest score
        if key not in unique_chunks or (chunk.score or 0) > (
            unique_chunks[key].score or 0
        ):
            unique_chunks[key] = chunk

    # Deduplicate the chunks
    deduped_chunks = list(unique_chunks.values())
    deduped_chunks.sort(key=lambda chunk: chunk.score or 0, reverse=True)
    return deduped_chunks
|
||||
|
||||
|
||||
def _simplify_text(text: str) -> str:
|
||||
return "".join(
|
||||
char for char in text if char not in string.punctuation and not char.isspace()
|
||||
).lower()
|
||||
|
||||
|
||||
# TODO delete this
|
||||
def retrieve_chunks(
    query: SearchQuery,
    user_id: UUID | None,
    document_index: DocumentIndex,
    db_session: Session,
    retrieval_metrics_callback: (
        Callable[[RetrievalMetricsContainer], None] | None
    ) = None,
) -> list[InferenceChunk]:
    """Returns a list of the best chunks from an initial keyword/semantic/ hybrid search.

    Builds a list of retrieval callables (federated per-source retrievers
    plus, when applicable, the normal document-index retrieval — possibly
    once per multilingual rephrase), runs them in parallel, and merges the
    per-query result lists into a single ranked chunk list.

    Args:
        query: The search request including filters and search type.
        user_id: Requesting user's id, used to scope federated retrieval.
        document_index: Index used for the normal (non-federated) retrieval.
        db_session: Active database session.
        retrieval_metrics_callback: Optional hook invoked with per-chunk
            retrieval metrics after the search completes.

    Returns:
        The combined ranked chunks; empty list if nothing matched.
    """

    multilingual_expansion = get_multilingual_expansion(db_session)
    # Each entry is a (retrieval_function, args) pair executed in parallel below
    run_queries: list[tuple[Callable, tuple]] = []

    source_filters = (
        set(query.filters.source_type) if query.filters.source_type else None
    )

    # Federated retrieval
    federated_retrieval_infos = get_federated_retrieval_functions(
        db_session,
        user_id,
        list(query.filters.source_type) if query.filters.source_type else None,
        query.filters.document_set,
        user_file_ids=query.filters.user_file_ids,
    )
    federated_sources = set(
        federated_retrieval_info.source.to_non_federated_source()
        for federated_retrieval_info in federated_retrieval_infos
    )
    for federated_retrieval_info in federated_retrieval_infos:
        run_queries.append((federated_retrieval_info.retrieval_function, (query,)))

    # Normal retrieval
    # Skip the document-index search only when every requested source is
    # already covered by a federated retriever.
    normal_search_enabled = (source_filters is None) or (
        len(set(source_filters) - federated_sources) > 0
    )
    if normal_search_enabled and (
        not multilingual_expansion or "\n" in query.query or "\r" in query.query
    ):
        # Don't do query expansion on complex queries, rephrasings likely would not work well
        run_queries.append((doc_index_retrieval, (query, document_index, db_session)))
    elif normal_search_enabled:
        simplified_queries = set()

        # Currently only uses query expansion on multilingual use cases
        query_rephrases = multilingual_query_expansion(
            query.query, multilingual_expansion
        )
        # Just to be extra sure, add the original query.
        query_rephrases.append(query.query)
        for rephrase in set(query_rephrases):
            # Sometimes the model rephrases the query in the same language with minor changes
            # Avoid doing an extra search with the minor changes as this biases the results
            simplified_rephrase = _simplify_text(rephrase)
            if simplified_rephrase in simplified_queries:
                continue
            simplified_queries.add(simplified_rephrase)

            q_copy = query.model_copy(
                update={
                    "query": rephrase,
                    # need to recompute for each rephrase
                    # note that `SearchQuery` is a frozen model, so we can't update
                    # it below
                    "precomputed_query_embedding": None,
                },
                deep=True,
            )
            run_queries.append(
                (doc_index_retrieval, (q_copy, document_index, db_session))
            )

    parallel_search_results = run_functions_tuples_in_parallel(run_queries)
    top_chunks = combine_retrieval_results(parallel_search_results)

    if not top_chunks:
        logger.warning(
            f"Hybrid ({query.search_type.value.capitalize()}) search returned no results "
            f"with filters: {query.filters}"
        )
        return []

    # Report per-chunk metrics to the caller-supplied hook, if any
    if retrieval_metrics_callback is not None:
        chunk_metrics = [
            ChunkMetric(
                document_id=chunk.document_id,
                chunk_content_start=chunk.content[:MAX_METRICS_CONTENT],
                first_link=chunk.source_links[0] if chunk.source_links else None,
                score=chunk.score if chunk.score is not None else 0,
            )
            for chunk in top_chunks
        ]
        retrieval_metrics_callback(
            RetrievalMetricsContainer(
                search_type=query.search_type, metrics=chunk_metrics
            )
        )

    return top_chunks
|
||||
|
||||
|
||||
def _embed_and_search(
|
||||
query_request: ChunkIndexRequest,
|
||||
document_index: DocumentIndex,
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
import string
|
||||
from collections.abc import Sequence
|
||||
from typing import TypeVar
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.chat.models import SectionRelevancePiece
|
||||
from onyx.context.search.models import InferenceChunk
|
||||
from onyx.context.search.models import InferenceSection
|
||||
from onyx.context.search.models import SavedSearchDoc
|
||||
from onyx.context.search.models import SavedSearchDocWithContent
|
||||
from onyx.context.search.models import SearchDoc
|
||||
from onyx.db.models import SearchDoc as DBSearchDoc
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.natural_language_processing.search_nlp_models import EmbeddingModel
|
||||
from onyx.utils.logger import setup_logger
|
||||
@@ -37,6 +41,66 @@ TSection = TypeVar(
|
||||
)
|
||||
|
||||
|
||||
def dedupe_documents(items: list[T]) -> tuple[list[T], list[int]]:
    """Drop items whose document id has already been seen.

    Keeps the first occurrence per document id (InferenceSections are keyed
    by their center chunk's document id) and records the positions of the
    dropped duplicates so callers can remap dependent index lists.

    Returns:
        (deduped items in original order, indices of the dropped items)
    """
    kept: list[T] = []
    dropped_positions: list[int] = []
    seen_document_ids: set = set()

    for position, item in enumerate(items):
        doc_id = (
            item.center_chunk.document_id
            if isinstance(item, InferenceSection)
            else item.document_id
        )
        if doc_id in seen_document_ids:
            dropped_positions.append(position)
        else:
            seen_document_ids.add(doc_id)
            kept.append(item)

    return kept, dropped_positions
|
||||
|
||||
|
||||
def relevant_sections_to_indices(
    relevance_sections: list[SectionRelevancePiece] | None, items: list[TSection]
) -> list[int]:
    """Map relevance judgements back to positions in *items*.

    A relevance piece identifies a section by (document_id, chunk_id). An
    item matches when its own identifying pair is in the relevant set:
    InferenceSections are identified via their center chunk, other item
    types via (document_id, chunk_ind).
    """
    if not relevance_sections:
        return []

    relevant_pairs = {
        (piece.document_id, piece.chunk_id)
        for piece in relevance_sections
        if piece.relevant
    }

    def _is_relevant(item: TSection) -> bool:
        if isinstance(item, InferenceSection):
            key = (item.center_chunk.document_id, item.center_chunk.chunk_id)
        else:
            key = (item.document_id, item.chunk_ind)
        return key in relevant_pairs

    return [position for position, item in enumerate(items) if _is_relevant(item)]
|
||||
|
||||
|
||||
def drop_llm_indices(
    llm_indices: list[int],
    search_docs: Sequence[DBSearchDoc | SavedSearchDoc],
    dropped_indices: list[int],
) -> list[int]:
    """Remap LLM-selected doc indices after some docs were dropped.

    Builds one relevance flag per original doc position, removes the flags
    at `dropped_indices`, and returns the positions that are still flagged
    in the compacted list.
    """
    flagged = [position in llm_indices for position in range(len(search_docs))]
    if dropped_indices:
        dropped = set(dropped_indices)
        flagged = [
            flag for position, flag in enumerate(flagged) if position not in dropped
        ]
    return [position for position, flag in enumerate(flagged) if flag]
|
||||
|
||||
|
||||
def inference_section_from_chunks(
|
||||
center_chunk: InferenceChunk,
|
||||
chunks: list[InferenceChunk],
|
||||
@@ -64,6 +128,26 @@ def inference_section_from_single_chunk(
|
||||
)
|
||||
|
||||
|
||||
def remove_stop_words_and_punctuation(keywords: list[str]) -> list[str]:
    """Strip English stop words and punctuation tokens from *keywords*.

    The keywords are joined and re-tokenized with NLTK so filtering runs on
    proper word tokens. If filtering would remove everything, the full token
    list is returned instead; on any NLTK failure the original keywords are
    returned unchanged.
    """
    from nltk.corpus import stopwords  # type:ignore
    from nltk.tokenize import word_tokenize  # type:ignore

    try:
        # Re-tokenize using the NLTK tokenizer for better matching
        joined_query = " ".join(keywords)
        english_stop_words = set(stopwords.words("english"))
        tokens = word_tokenize(joined_query)
        filtered_tokens = [
            token
            for token in tokens
            if token.casefold() not in english_stop_words
            and token not in string.punctuation
        ]
        return filtered_tokens or tokens
    except Exception as e:
        logger.warning(f"Error removing stop words and punctuation: {e}")
        return keywords
|
||||
|
||||
|
||||
def get_query_embeddings(queries: list[str], db_session: Session) -> list[Embedding]:
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
|
||||
|
||||
@@ -91,6 +91,59 @@ def get_chat_sessions_by_slack_thread_id(
|
||||
return db_session.scalars(stmt).all()
|
||||
|
||||
|
||||
def get_valid_messages_from_query_sessions(
    chat_session_ids: list[UUID],
    db_session: Session,
) -> dict[UUID, str]:
    """Return the first user message for each session that produced documents.

    A session qualifies when it has a first (minimum-id) user message and a
    first assistant message that is linked to at least one search doc;
    sessions whose first assistant reply has no retrieved documents are
    omitted from the result.

    Returns:
        Mapping of chat_session_id -> text of that session's first user
        message.
    """
    # Earliest (minimum id) user message per session
    user_message_subquery = (
        select(
            ChatMessage.chat_session_id, func.min(ChatMessage.id).label("user_msg_id")
        )
        .where(
            ChatMessage.chat_session_id.in_(chat_session_ids),
            ChatMessage.message_type == MessageType.USER,
        )
        .group_by(ChatMessage.chat_session_id)
        .subquery()
    )

    # Earliest (minimum id) assistant message per session
    assistant_message_subquery = (
        select(
            ChatMessage.chat_session_id,
            func.min(ChatMessage.id).label("assistant_msg_id"),
        )
        .where(
            ChatMessage.chat_session_id.in_(chat_session_ids),
            ChatMessage.message_type == MessageType.ASSISTANT,
        )
        .group_by(ChatMessage.chat_session_id)
        .subquery()
    )

    # Select the first user message, but only for sessions whose first
    # assistant message has at least one associated search doc. The join to
    # ChatMessage__SearchDoc can produce one row per linked doc; the dict
    # comprehension below collapses those duplicates.
    query = (
        select(ChatMessage.chat_session_id, ChatMessage.message)
        .join(
            user_message_subquery,
            ChatMessage.chat_session_id == user_message_subquery.c.chat_session_id,
        )
        .join(
            assistant_message_subquery,
            ChatMessage.chat_session_id == assistant_message_subquery.c.chat_session_id,
        )
        .join(
            ChatMessage__SearchDoc,
            ChatMessage__SearchDoc.chat_message_id
            == assistant_message_subquery.c.assistant_msg_id,
        )
        .where(ChatMessage.id == user_message_subquery.c.user_msg_id)
    )

    first_messages = db_session.execute(query).all()
    logger.info(f"Retrieved {len(first_messages)} first messages with documents")

    return {row.chat_session_id: row.message for row in first_messages}
|
||||
|
||||
|
||||
# Retrieves chat sessions by user
|
||||
# Chat sessions do not include onyxbot flows
|
||||
def get_chat_sessions_by_user(
|
||||
@@ -457,6 +510,21 @@ def add_chats_to_session_from_slack_thread(
|
||||
)
|
||||
|
||||
|
||||
def get_search_docs_for_chat_message(
    chat_message_id: int, db_session: Session
) -> list[DBSearchDoc]:
    """Fetch every SearchDoc linked to the given chat message."""
    query = (
        select(DBSearchDoc)
        .join(
            ChatMessage__SearchDoc,
            ChatMessage__SearchDoc.search_doc_id == DBSearchDoc.id,
        )
        .where(ChatMessage__SearchDoc.chat_message_id == chat_message_id)
    )
    docs = db_session.scalars(query).all()
    return list(docs)
|
||||
|
||||
|
||||
def add_search_docs_to_chat_message(
|
||||
chat_message_id: int, search_doc_ids: list[int], db_session: Session
|
||||
) -> None:
|
||||
|
||||
@@ -1,451 +0,0 @@
|
||||
"""CRUD operations for Discord bot models."""
|
||||
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
|
||||
from sqlalchemy import delete
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.orm import joinedload
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.auth.api_key import build_displayable_api_key
|
||||
from onyx.auth.api_key import generate_api_key
|
||||
from onyx.auth.api_key import hash_api_key
|
||||
from onyx.auth.schemas import UserRole
|
||||
from onyx.configs.constants import DISCORD_SERVICE_API_KEY_NAME
|
||||
from onyx.db.api_key import insert_api_key
|
||||
from onyx.db.models import ApiKey
|
||||
from onyx.db.models import DiscordBotConfig
|
||||
from onyx.db.models import DiscordChannelConfig
|
||||
from onyx.db.models import DiscordGuildConfig
|
||||
from onyx.db.models import User
|
||||
from onyx.db.utils import DiscordChannelView
|
||||
from onyx.server.api_key.models import APIKeyArgs
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
# === DiscordBotConfig ===
|
||||
|
||||
|
||||
def get_discord_bot_config(db_session: Session) -> DiscordBotConfig | None:
    """Fetch this tenant's Discord bot config (at most one row exists)."""
    stmt = select(DiscordBotConfig).limit(1)
    return db_session.scalar(stmt)
|
||||
|
||||
|
||||
def create_discord_bot_config(
    db_session: Session,
    bot_token: str,
) -> DiscordBotConfig:
    """Create the Discord bot config. Raises ValueError if already exists.

    The check constraint on id='SINGLETON' ensures only one config per tenant.

    Args:
        db_session: Database session for the tenant.
        bot_token: Discord bot token to persist.

    Raises:
        ValueError: If a config already exists, including when another
            request creates it concurrently.
    """
    existing = get_discord_bot_config(db_session)
    if existing:
        raise ValueError("Discord bot config already exists")

    config = DiscordBotConfig(bot_token=bot_token)
    db_session.add(config)
    try:
        db_session.flush()
    except IntegrityError as e:
        # Race condition: another request created the config concurrently.
        # Chain the original DB error so the root cause stays visible.
        db_session.rollback()
        raise ValueError("Discord bot config already exists") from e
    return config
|
||||
|
||||
|
||||
def delete_discord_bot_config(db_session: Session) -> bool:
    """Delete the Discord bot config. Returns True if a row was removed."""
    deletion = db_session.execute(delete(DiscordBotConfig))
    db_session.flush()
    deleted_rows: int = deletion.rowcount  # type: ignore[attr-defined]
    return deleted_rows > 0
|
||||
|
||||
|
||||
# === Discord Service API Key ===
|
||||
|
||||
|
||||
def get_discord_service_api_key(db_session: Session) -> ApiKey | None:
    """Look up the Discord service API key by its well-known name."""
    stmt = select(ApiKey).where(ApiKey.name == DISCORD_SERVICE_API_KEY_NAME)
    return db_session.scalar(stmt)
|
||||
|
||||
|
||||
def get_or_create_discord_service_api_key(
    db_session: Session,
    tenant_id: str,
) -> str:
    """Get existing Discord service API key or create one.

    The API key is used by the Discord bot to authenticate with the
    Onyx API pods when sending chat requests.

    Note: when a key already exists, it is ROTATED (a new raw key is
    generated and the stored hash/display are overwritten) because the
    database only keeps the hash, never the raw key.

    Args:
        db_session: Database session for the tenant.
        tenant_id: The tenant ID (used for logging/context).

    Returns:
        The raw API key string (not hashed).

    Raises:
        RuntimeError: If API key creation fails.
    """
    # Check for existing key
    existing = get_discord_service_api_key(db_session)
    if existing:
        # Database only stores the hash, so we must regenerate to get the raw key.
        # This is safe since the Discord bot is the only consumer of this key.
        logger.debug(
            f"Found existing Discord service API key for tenant {tenant_id} that isn't in cache, "
            "regenerating to update cache"
        )
        new_api_key = generate_api_key(tenant_id)
        # Rotate stored hash and display value to match the fresh raw key
        existing.hashed_api_key = hash_api_key(new_api_key)
        existing.api_key_display = build_displayable_api_key(new_api_key)
        db_session.flush()
        return new_api_key

    # Create new API key
    logger.info(f"Creating Discord service API key for tenant {tenant_id}")
    api_key_args = APIKeyArgs(
        name=DISCORD_SERVICE_API_KEY_NAME,
        role=UserRole.LIMITED,  # Limited role is sufficient for chat requests
    )
    api_key_descriptor = insert_api_key(
        db_session=db_session,
        api_key_args=api_key_args,
        user_id=None,  # Service account, no owner
    )

    if not api_key_descriptor.api_key:
        raise RuntimeError(
            f"Failed to create Discord service API key for tenant {tenant_id}"
        )

    return api_key_descriptor.api_key
|
||||
|
||||
|
||||
def delete_discord_service_api_key(db_session: Session) -> bool:
    """Delete the Discord service API key for a tenant.

    Called when:
    - Bot config is deleted (self-hosted)
    - All guild configs are deleted (Cloud)

    Args:
        db_session: Database session for the tenant.

    Returns:
        True if the key was deleted, False if it didn't exist.
    """
    api_key = get_discord_service_api_key(db_session)
    if api_key is None:
        return False

    # The key has an associated service-account user; remove it as well
    owner = db_session.scalar(
        select(User).where(User.id == api_key.user_id)  # type: ignore[arg-type]
    )

    db_session.delete(api_key)
    if owner is not None:
        db_session.delete(owner)

    db_session.flush()
    logger.info("Deleted Discord service API key")
    return True
|
||||
|
||||
|
||||
# === DiscordGuildConfig ===
|
||||
|
||||
|
||||
def get_guild_configs(
    db_session: Session,
    include_channels: bool = False,
) -> list[DiscordGuildConfig]:
    """Return every guild config for this tenant.

    When include_channels is True, channel configs are eagerly loaded.
    """
    query = select(DiscordGuildConfig)
    if include_channels:
        query = query.options(joinedload(DiscordGuildConfig.channels))
    # unique() is required after joined-loading a collection relationship
    return list(db_session.scalars(query).unique().all())
|
||||
|
||||
|
||||
def get_guild_config_by_internal_id(
    db_session: Session,
    internal_id: int,
) -> DiscordGuildConfig | None:
    """Look up a guild config by its internal primary key."""
    stmt = select(DiscordGuildConfig).where(DiscordGuildConfig.id == internal_id)
    return db_session.scalar(stmt)
|
||||
|
||||
|
||||
def get_guild_config_by_discord_id(
    db_session: Session,
    guild_id: int,
) -> DiscordGuildConfig | None:
    """Look up a guild config by the Discord-side guild id."""
    stmt = select(DiscordGuildConfig).where(DiscordGuildConfig.guild_id == guild_id)
    return db_session.scalar(stmt)
|
||||
|
||||
|
||||
def get_guild_config_by_registration_key(
    db_session: Session,
    registration_key: str,
) -> DiscordGuildConfig | None:
    """Look up a guild config by its registration key."""
    stmt = select(DiscordGuildConfig).where(
        DiscordGuildConfig.registration_key == registration_key
    )
    return db_session.scalar(stmt)
|
||||
|
||||
|
||||
def create_guild_config(
    db_session: Session,
    registration_key: str,
) -> DiscordGuildConfig:
    """Create a new guild config with a registration key (guild_id=NULL)."""
    new_config = DiscordGuildConfig(registration_key=registration_key)
    db_session.add(new_config)
    db_session.flush()
    return new_config
|
||||
|
||||
|
||||
def register_guild(
    db_session: Session,
    config: DiscordGuildConfig,
    guild_id: int,
    guild_name: str,
) -> DiscordGuildConfig:
    """Complete registration by recording the guild's identity and timestamp."""
    config.guild_id = guild_id
    config.guild_name = guild_name
    # Record when registration completed, in UTC
    config.registered_at = datetime.now(timezone.utc)
    db_session.flush()
    return config
|
||||
|
||||
|
||||
def update_guild_config(
    db_session: Session,
    config: DiscordGuildConfig,
    enabled: bool,
    default_persona_id: int | None = None,
) -> DiscordGuildConfig:
    """Overwrite the mutable guild config fields and flush."""
    config.enabled = enabled
    config.default_persona_id = default_persona_id
    db_session.flush()
    return config
|
||||
|
||||
|
||||
def delete_guild_config(
    db_session: Session,
    internal_id: int,
) -> bool:
    """Delete guild config (cascades to channel configs). Returns True if deleted."""
    deletion = db_session.execute(
        delete(DiscordGuildConfig).where(DiscordGuildConfig.id == internal_id)
    )
    db_session.flush()
    deleted_rows: int = deletion.rowcount  # type: ignore[attr-defined]
    return deleted_rows > 0
|
||||
|
||||
|
||||
# === DiscordChannelConfig ===
|
||||
|
||||
|
||||
def get_channel_configs(
    db_session: Session,
    guild_config_id: int,
) -> list[DiscordChannelConfig]:
    """Return all channel configs belonging to the given guild config."""
    stmt = select(DiscordChannelConfig).where(
        DiscordChannelConfig.guild_config_id == guild_config_id
    )
    return list(db_session.scalars(stmt).all())
|
||||
|
||||
|
||||
def get_channel_config_by_discord_ids(
    db_session: Session,
    guild_id: int,
    channel_id: int,
) -> DiscordChannelConfig | None:
    """Find a channel config by the Discord-side guild and channel ids."""
    stmt = (
        select(DiscordChannelConfig)
        .join(DiscordGuildConfig)
        .where(
            DiscordGuildConfig.guild_id == guild_id,
            DiscordChannelConfig.channel_id == channel_id,
        )
    )
    return db_session.scalar(stmt)
|
||||
|
||||
|
||||
def get_channel_config_by_internal_ids(
    db_session: Session,
    guild_config_id: int,
    channel_config_id: int,
) -> DiscordChannelConfig | None:
    """Find a channel config by internal guild/channel config primary keys."""
    stmt = select(DiscordChannelConfig).where(
        DiscordChannelConfig.guild_config_id == guild_config_id,
        DiscordChannelConfig.id == channel_config_id,
    )
    return db_session.scalar(stmt)
|
||||
|
||||
|
||||
def update_discord_channel_config(
    db_session: Session,
    config: DiscordChannelConfig,
    channel_name: str,
    thread_only_mode: bool,
    require_bot_invocation: bool,
    enabled: bool,
    persona_override_id: int | None = None,
) -> DiscordChannelConfig:
    """Overwrite the editable channel config fields and flush."""
    config.channel_name = channel_name
    config.thread_only_mode = thread_only_mode
    config.require_bot_invocation = require_bot_invocation
    config.enabled = enabled
    config.persona_override_id = persona_override_id
    db_session.flush()
    return config
|
||||
|
||||
|
||||
def delete_discord_channel_config(
    db_session: Session,
    guild_config_id: int,
    channel_config_id: int,
) -> bool:
    """Delete a channel config. Returns True if a row was removed."""
    deletion = db_session.execute(
        delete(DiscordChannelConfig).where(
            DiscordChannelConfig.guild_config_id == guild_config_id,
            DiscordChannelConfig.id == channel_config_id,
        )
    )
    db_session.flush()
    deleted_rows: int = deletion.rowcount  # type: ignore[attr-defined]
    return deleted_rows > 0
|
||||
|
||||
|
||||
def create_channel_config(
    db_session: Session,
    guild_config_id: int,
    channel_view: DiscordChannelView,
) -> DiscordChannelConfig:
    """Create a new channel config with default settings (disabled by default, admin enables via UI)."""
    new_config = DiscordChannelConfig(
        guild_config_id=guild_config_id,
        channel_id=channel_view.channel_id,
        channel_name=channel_view.channel_name,
        channel_type=channel_view.channel_type,
        is_private=channel_view.is_private,
    )
    db_session.add(new_config)
    db_session.flush()
    return new_config
|
||||
|
||||
|
||||
def bulk_create_channel_configs(
    db_session: Session,
    guild_config_id: int,
    channels: list[DiscordChannelView],
) -> list[DiscordChannelConfig]:
    """Create multiple channel configs at once. Skips existing channels."""
    # Channel ids that already have a config for this guild
    existing_ids = set(
        db_session.scalars(
            select(DiscordChannelConfig.channel_id).where(
                DiscordChannelConfig.guild_config_id == guild_config_id
            )
        ).all()
    )

    # Only channels not seen before get a (default, disabled) config
    created: list[DiscordChannelConfig] = []
    for view in channels:
        if view.channel_id in existing_ids:
            continue
        new_config = DiscordChannelConfig(
            guild_config_id=guild_config_id,
            channel_id=view.channel_id,
            channel_name=view.channel_name,
            channel_type=view.channel_type,
            is_private=view.is_private,
        )
        db_session.add(new_config)
        created.append(new_config)

    db_session.flush()
    return created
|
||||
|
||||
|
||||
def sync_channel_configs(
    db_session: Session,
    guild_config_id: int,
    current_channels: list[DiscordChannelView],
) -> tuple[int, int, int]:
    """Sync channel configs with current Discord channels.

    - Creates configs for new channels (disabled by default)
    - Removes configs for deleted channels
    - Updates names and types for existing channels if changed

    Returns: (added_count, removed_count, updated_count)
    """
    # Index live Discord channels by channel id for O(1) lookups
    current_channel_map = {
        channel_view.channel_id: channel_view for channel_view in current_channels
    }
    current_channel_ids = set(current_channel_map.keys())

    # Get existing configs
    existing_configs = get_channel_configs(db_session, guild_config_id)
    existing_channel_ids = {c.channel_id for c in existing_configs}

    # Find channels to add, remove, and potentially update
    to_add = current_channel_ids - existing_channel_ids
    to_remove = existing_channel_ids - current_channel_ids

    # Add new channels
    added_count = 0
    for channel_id in to_add:
        channel_view = current_channel_map[channel_id]
        create_channel_config(db_session, guild_config_id, channel_view)
        added_count += 1

    # Remove deleted channels
    removed_count = 0
    for config in existing_configs:
        if config.channel_id in to_remove:
            db_session.delete(config)
            removed_count += 1

    # Update names, types, and privacy for existing channels if changed
    # (configs marked for removal are skipped: their ids are not in
    # current_channel_ids)
    updated_count = 0
    for config in existing_configs:
        if config.channel_id in current_channel_ids:
            channel_view = current_channel_map[config.channel_id]
            changed = False
            if config.channel_name != channel_view.channel_name:
                config.channel_name = channel_view.channel_name
                changed = True
            if config.channel_type != channel_view.channel_type:
                config.channel_type = channel_view.channel_type
                changed = True
            if config.is_private != channel_view.is_private:
                config.is_private = channel_view.is_private
                changed = True
            if changed:
                updated_count += 1

    # Single flush persists removals and updates together
    db_session.flush()
    return added_count, removed_count, updated_count
|
||||
@@ -3,8 +3,6 @@ from uuid import UUID
|
||||
from fastapi import HTTPException
|
||||
from sqlalchemy import or_
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.orm import aliased
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
@@ -20,6 +18,45 @@ from onyx.utils.logger import setup_logger
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def insert_input_prompt_if_not_exists(
    user: User | None,
    input_prompt_id: int | None,
    prompt: str,
    content: str,
    active: bool,
    is_public: bool,
    db_session: Session,
    commit: bool = True,
) -> InputPrompt:
    """Fetch an existing input prompt or create it if none matches.

    Lookup is by id when input_prompt_id is given; otherwise by prompt text
    scoped to the user (or to the NULL-user/public space when user is None).
    Prompts created without an owner are forced public.
    """
    if input_prompt_id is not None:
        existing = (
            db_session.query(InputPrompt).filter_by(id=input_prompt_id).first()
        )
    else:
        lookup = db_session.query(InputPrompt).filter(InputPrompt.prompt == prompt)
        if user:
            lookup = lookup.filter(InputPrompt.user_id == user.id)
        else:
            lookup = lookup.filter(InputPrompt.user_id.is_(None))
        existing = lookup.first()

    if existing is None:
        existing = InputPrompt(
            id=input_prompt_id,
            prompt=prompt,
            content=content,
            active=active,
            is_public=is_public or user is None,
            user_id=user.id if user else None,
        )
        db_session.add(existing)

    if commit:
        db_session.commit()

    return existing
|
||||
|
||||
|
||||
def insert_input_prompt(
|
||||
prompt: str,
|
||||
content: str,
|
||||
@@ -27,41 +64,16 @@ def insert_input_prompt(
|
||||
user: User | None,
|
||||
db_session: Session,
|
||||
) -> InputPrompt:
|
||||
user_id = user.id if user else None
|
||||
|
||||
# Use atomic INSERT ... ON CONFLICT DO NOTHING with RETURNING
|
||||
# to avoid race conditions with the uniqueness check
|
||||
stmt = pg_insert(InputPrompt).values(
|
||||
input_prompt = InputPrompt(
|
||||
prompt=prompt,
|
||||
content=content,
|
||||
active=True,
|
||||
is_public=is_public,
|
||||
user_id=user_id,
|
||||
user_id=user.id if user is not None else None,
|
||||
)
|
||||
|
||||
# Use the appropriate constraint based on whether this is a user-owned or public prompt
|
||||
if user_id is not None:
|
||||
stmt = stmt.on_conflict_do_nothing(constraint="uq_inputprompt_prompt_user_id")
|
||||
else:
|
||||
# Partial unique indexes cannot be targeted by constraint name;
|
||||
# must use index_elements + index_where
|
||||
stmt = stmt.on_conflict_do_nothing(
|
||||
index_elements=[InputPrompt.prompt],
|
||||
index_where=InputPrompt.user_id.is_(None),
|
||||
)
|
||||
|
||||
stmt = stmt.returning(InputPrompt)
|
||||
|
||||
result = db_session.execute(stmt)
|
||||
input_prompt = result.scalar_one_or_none()
|
||||
|
||||
if input_prompt is None:
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail=f"A prompt shortcut with the name '{prompt}' already exists",
|
||||
)
|
||||
|
||||
db_session.add(input_prompt)
|
||||
db_session.commit()
|
||||
|
||||
return input_prompt
|
||||
|
||||
|
||||
@@ -86,40 +98,23 @@ def update_input_prompt(
|
||||
input_prompt.content = content
|
||||
input_prompt.active = active
|
||||
|
||||
try:
|
||||
db_session.commit()
|
||||
except IntegrityError:
|
||||
db_session.rollback()
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail=f"A prompt shortcut with the name '{prompt}' already exists",
|
||||
)
|
||||
|
||||
db_session.commit()
|
||||
return input_prompt
|
||||
|
||||
|
||||
def validate_user_prompt_authorization(
|
||||
user: User | None, input_prompt: InputPrompt
|
||||
) -> bool:
|
||||
"""
|
||||
Check if the user is authorized to modify the given input prompt.
|
||||
Returns True only if the user owns the prompt.
|
||||
Returns False for public prompts (only admins can modify those),
|
||||
unless auth is disabled (then anyone can manage public prompts).
|
||||
"""
|
||||
prompt = InputPromptSnapshot.from_model(input_prompt=input_prompt)
|
||||
|
||||
# Public prompts cannot be modified via the user API (unless auth is disabled)
|
||||
if prompt.is_public or prompt.user_id is None:
|
||||
return AUTH_TYPE == AuthType.DISABLED
|
||||
if prompt.user_id is not None:
|
||||
if user is None:
|
||||
return False
|
||||
|
||||
# User must be logged in
|
||||
if user is None:
|
||||
return False
|
||||
|
||||
# User must own the prompt
|
||||
user_details = UserInfo.from_model(user)
|
||||
return str(user_details.id) == str(prompt.user_id)
|
||||
user_details = UserInfo.from_model(user)
|
||||
if str(user_details.id) != str(prompt.user_id):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def remove_public_input_prompt(input_prompt_id: int, db_session: Session) -> None:
|
||||
|
||||
@@ -9,9 +9,6 @@ def get_memories(user: User | None, db_session: Session) -> list[str]:
|
||||
if user is None:
|
||||
return []
|
||||
|
||||
if not user.use_memories:
|
||||
return []
|
||||
|
||||
user_info = [
|
||||
f"User's name: {user.personal_name}" if user.personal_name else "",
|
||||
f"User's role: {user.personal_role}" if user.personal_role else "",
|
||||
|
||||
@@ -26,7 +26,6 @@ from sqlalchemy import ForeignKey
|
||||
from sqlalchemy import func
|
||||
from sqlalchemy import Index
|
||||
from sqlalchemy import Integer
|
||||
from sqlalchemy import BigInteger
|
||||
|
||||
from sqlalchemy import Sequence
|
||||
from sqlalchemy import String
|
||||
@@ -84,6 +83,7 @@ from onyx.utils.special_types import JSON_ro
|
||||
from onyx.file_store.models import FileDescriptor
|
||||
from onyx.llm.override_models import LLMOverride
|
||||
from onyx.llm.override_models import PromptOverride
|
||||
from onyx.context.search.enums import RecencyBiasSetting
|
||||
from onyx.kg.models import KGStage
|
||||
from onyx.server.features.mcp.models import MCPConnectionData
|
||||
from onyx.utils.encryption import decrypt_bytes_to_string
|
||||
@@ -91,8 +91,6 @@ from onyx.utils.encryption import encrypt_string_to_bytes
|
||||
from onyx.utils.headers import HeaderItemDict
|
||||
from shared_configs.enums import EmbeddingProvider
|
||||
from shared_configs.enums import RerankerProvider
|
||||
from onyx.context.search.enums import RecencyBiasSetting
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
@@ -188,7 +186,6 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
|
||||
nullable=True,
|
||||
default=None,
|
||||
)
|
||||
chat_background: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
# personalization fields are exposed via the chat user settings "Personalization" tab
|
||||
personal_name: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
personal_role: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
@@ -2335,23 +2332,6 @@ class SearchDoc(Base):
|
||||
)
|
||||
|
||||
|
||||
class SearchQuery(Base):
|
||||
# This table contains search queries for the Search UI. There are no followups and less is stored because the reply
|
||||
# functionality is simply to rerun the search query again as things may have changed and this is more common for search.
|
||||
__tablename__ = "search_query"
|
||||
id: Mapped[UUID] = mapped_column(
|
||||
PGUUID(as_uuid=True), primary_key=True, default=uuid4
|
||||
)
|
||||
user_id: Mapped[UUID] = mapped_column(PGUUID(as_uuid=True), ForeignKey("user.id"))
|
||||
query: Mapped[str] = mapped_column(String)
|
||||
query_expansions: Mapped[list[str] | None] = mapped_column(
|
||||
postgresql.ARRAY(String), nullable=True
|
||||
)
|
||||
created_at: Mapped[datetime.datetime] = mapped_column(
|
||||
DateTime(timezone=True), server_default=func.now()
|
||||
)
|
||||
|
||||
|
||||
"""
|
||||
Feedback, Logging, Metrics Tables
|
||||
"""
|
||||
@@ -3040,124 +3020,6 @@ class SlackBot(Base):
|
||||
)
|
||||
|
||||
|
||||
class DiscordBotConfig(Base):
|
||||
"""Global Discord bot configuration (one per tenant).
|
||||
|
||||
Stores the bot token when not provided via DISCORD_BOT_TOKEN env var.
|
||||
Uses a fixed ID with check constraint to enforce only one row per tenant.
|
||||
"""
|
||||
|
||||
__tablename__ = "discord_bot_config"
|
||||
|
||||
id: Mapped[str] = mapped_column(
|
||||
String, primary_key=True, server_default=text("'SINGLETON'")
|
||||
)
|
||||
bot_token: Mapped[str] = mapped_column(EncryptedString(), nullable=False)
|
||||
created_at: Mapped[datetime.datetime] = mapped_column(
|
||||
DateTime(timezone=True), server_default=func.now(), nullable=False
|
||||
)
|
||||
|
||||
|
||||
class DiscordGuildConfig(Base):
|
||||
"""Configuration for a Discord guild (server) connected to this tenant.
|
||||
|
||||
registration_key is a one-time key used to link a Discord server to this tenant.
|
||||
Format: discord_<tenant_id>.<random_token>
|
||||
guild_id is NULL until the Discord admin runs !register with the key.
|
||||
"""
|
||||
|
||||
__tablename__ = "discord_guild_config"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True)
|
||||
|
||||
# Discord snowflake - NULL until registered via command in Discord
|
||||
guild_id: Mapped[int | None] = mapped_column(BigInteger, nullable=True, unique=True)
|
||||
guild_name: Mapped[str | None] = mapped_column(String(256), nullable=True)
|
||||
|
||||
# One-time registration key: discord_<tenant_id>.<random_token>
|
||||
registration_key: Mapped[str] = mapped_column(String, unique=True, nullable=False)
|
||||
|
||||
registered_at: Mapped[datetime.datetime | None] = mapped_column(
|
||||
DateTime(timezone=True), nullable=True
|
||||
)
|
||||
|
||||
# Configuration
|
||||
default_persona_id: Mapped[int | None] = mapped_column(
|
||||
ForeignKey("persona.id", ondelete="SET NULL"), nullable=True
|
||||
)
|
||||
enabled: Mapped[bool] = mapped_column(
|
||||
Boolean, server_default=text("true"), nullable=False
|
||||
)
|
||||
|
||||
# Relationships
|
||||
default_persona: Mapped["Persona | None"] = relationship(
|
||||
"Persona", foreign_keys=[default_persona_id]
|
||||
)
|
||||
channels: Mapped[list["DiscordChannelConfig"]] = relationship(
|
||||
back_populates="guild_config", cascade="all, delete-orphan"
|
||||
)
|
||||
|
||||
|
||||
class DiscordChannelConfig(Base):
|
||||
"""Per-channel configuration for Discord bot behavior.
|
||||
|
||||
Used to whitelist specific channels and configure per-channel behavior.
|
||||
"""
|
||||
|
||||
__tablename__ = "discord_channel_config"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True)
|
||||
guild_config_id: Mapped[int] = mapped_column(
|
||||
ForeignKey("discord_guild_config.id", ondelete="CASCADE"), nullable=False
|
||||
)
|
||||
|
||||
# Discord snowflake
|
||||
channel_id: Mapped[int] = mapped_column(BigInteger, nullable=False)
|
||||
channel_name: Mapped[str] = mapped_column(String(), nullable=False)
|
||||
|
||||
# Channel type from Discord (text, forum)
|
||||
channel_type: Mapped[str] = mapped_column(
|
||||
String(20), server_default=text("'text'"), nullable=False
|
||||
)
|
||||
|
||||
# True if @everyone cannot view the channel
|
||||
is_private: Mapped[bool] = mapped_column(
|
||||
Boolean, server_default=text("false"), nullable=False
|
||||
)
|
||||
|
||||
# If true, bot only responds to messages in threads
|
||||
# Otherwise, will reply in channel
|
||||
thread_only_mode: Mapped[bool] = mapped_column(
|
||||
Boolean, server_default=text("false"), nullable=False
|
||||
)
|
||||
|
||||
# If true (default), bot only responds when @mentioned
|
||||
# If false, bot responds to ALL messages in this channel
|
||||
require_bot_invocation: Mapped[bool] = mapped_column(
|
||||
Boolean, server_default=text("true"), nullable=False
|
||||
)
|
||||
|
||||
# Override the guild's default persona for this channel
|
||||
persona_override_id: Mapped[int | None] = mapped_column(
|
||||
ForeignKey("persona.id", ondelete="SET NULL"), nullable=True
|
||||
)
|
||||
|
||||
enabled: Mapped[bool] = mapped_column(
|
||||
Boolean, server_default=text("false"), nullable=False
|
||||
)
|
||||
|
||||
# Relationships
|
||||
guild_config: Mapped["DiscordGuildConfig"] = relationship(back_populates="channels")
|
||||
persona_override: Mapped["Persona | None"] = relationship()
|
||||
|
||||
# Constraints
|
||||
__table_args__ = (
|
||||
UniqueConstraint(
|
||||
"guild_config_id", "channel_id", name="uq_discord_channel_guild_channel"
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
class Milestone(Base):
|
||||
# This table is used to track significant events for a deployment towards finding value
|
||||
# The table is currently not used for features but it may be used in the future to inform
|
||||
@@ -3235,6 +3097,25 @@ class FileRecord(Base):
|
||||
)
|
||||
|
||||
|
||||
class AgentSearchMetrics(Base):
|
||||
__tablename__ = "agent__search_metrics"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True)
|
||||
user_id: Mapped[UUID | None] = mapped_column(
|
||||
ForeignKey("user.id", ondelete="CASCADE"), nullable=True
|
||||
)
|
||||
persona_id: Mapped[int | None] = mapped_column(
|
||||
ForeignKey("persona.id"), nullable=True
|
||||
)
|
||||
agent_type: Mapped[str] = mapped_column(String)
|
||||
start_time: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True))
|
||||
base_duration_s: Mapped[float] = mapped_column(Float)
|
||||
full_duration_s: Mapped[float] = mapped_column(Float)
|
||||
base_metrics: Mapped[JSON_ro] = mapped_column(postgresql.JSONB(), nullable=True)
|
||||
refined_metrics: Mapped[JSON_ro] = mapped_column(postgresql.JSONB(), nullable=True)
|
||||
all_metrics: Mapped[JSON_ro] = mapped_column(postgresql.JSONB(), nullable=True)
|
||||
|
||||
|
||||
"""
|
||||
************************************************************************
|
||||
Enterprise Edition Models
|
||||
@@ -3627,18 +3508,6 @@ class InputPrompt(Base):
|
||||
ForeignKey("user.id", ondelete="CASCADE"), nullable=True
|
||||
)
|
||||
|
||||
__table_args__ = (
|
||||
# Unique constraint on (prompt, user_id) for user-owned prompts
|
||||
UniqueConstraint("prompt", "user_id", name="uq_inputprompt_prompt_user_id"),
|
||||
# Partial unique index for public prompts (user_id IS NULL)
|
||||
Index(
|
||||
"uq_inputprompt_prompt_public",
|
||||
"prompt",
|
||||
unique=True,
|
||||
postgresql_where=text("user_id IS NULL"),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
class InputPrompt__User(Base):
|
||||
__tablename__ = "inputprompt__user"
|
||||
@@ -3647,7 +3516,7 @@ class InputPrompt__User(Base):
|
||||
ForeignKey("inputprompt.id"), primary_key=True
|
||||
)
|
||||
user_id: Mapped[UUID | None] = mapped_column(
|
||||
ForeignKey("user.id"), primary_key=True
|
||||
ForeignKey("inputprompt.id"), primary_key=True
|
||||
)
|
||||
disabled: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ from onyx.db.models import SearchSettings
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.db.search_settings import get_secondary_search_settings
|
||||
from onyx.db.search_settings import update_search_settings_status
|
||||
from onyx.document_index.factory import get_all_document_indices
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.key_value_store.factory import get_kv_store
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
@@ -80,43 +80,39 @@ def _perform_index_swap(
|
||||
db_session=db_session,
|
||||
)
|
||||
|
||||
# This flow is for checking and possibly creating an index so we get all
|
||||
# indices.
|
||||
document_indices = get_all_document_indices(new_search_settings, None, None)
|
||||
# remove the old index from the vector db
|
||||
document_index = get_default_document_index(new_search_settings, None)
|
||||
|
||||
WAIT_SECONDS = 5
|
||||
|
||||
for document_index in document_indices:
|
||||
success = False
|
||||
for x in range(VESPA_NUM_ATTEMPTS_ON_STARTUP):
|
||||
try:
|
||||
logger.notice(
|
||||
f"Document index {document_index.__class__.__name__} swap (attempt {x+1}/{VESPA_NUM_ATTEMPTS_ON_STARTUP})..."
|
||||
)
|
||||
document_index.ensure_indices_exist(
|
||||
primary_embedding_dim=new_search_settings.final_embedding_dim,
|
||||
primary_embedding_precision=new_search_settings.embedding_precision,
|
||||
# just finished swap, no more secondary index
|
||||
secondary_index_embedding_dim=None,
|
||||
secondary_index_embedding_precision=None,
|
||||
)
|
||||
|
||||
logger.notice("Document index swap complete.")
|
||||
success = True
|
||||
break
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Document index swap for {document_index.__class__.__name__} did not succeed. "
|
||||
f"The document index services may not be ready yet. Retrying in {WAIT_SECONDS} seconds."
|
||||
)
|
||||
time.sleep(WAIT_SECONDS)
|
||||
|
||||
if not success:
|
||||
logger.error(
|
||||
f"Document index swap for {document_index.__class__.__name__} did not succeed. "
|
||||
f"Attempt limit reached. ({VESPA_NUM_ATTEMPTS_ON_STARTUP})"
|
||||
success = False
|
||||
for x in range(VESPA_NUM_ATTEMPTS_ON_STARTUP):
|
||||
try:
|
||||
logger.notice(
|
||||
f"Vespa index swap (attempt {x+1}/{VESPA_NUM_ATTEMPTS_ON_STARTUP})..."
|
||||
)
|
||||
return None
|
||||
document_index.ensure_indices_exist(
|
||||
primary_embedding_dim=new_search_settings.final_embedding_dim,
|
||||
primary_embedding_precision=new_search_settings.embedding_precision,
|
||||
# just finished swap, no more secondary index
|
||||
secondary_index_embedding_dim=None,
|
||||
secondary_index_embedding_precision=None,
|
||||
)
|
||||
|
||||
logger.notice("Vespa index swap complete.")
|
||||
success = True
|
||||
break
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Vespa index swap did not succeed. The Vespa service may not be ready yet. Retrying in {WAIT_SECONDS} seconds."
|
||||
)
|
||||
time.sleep(WAIT_SECONDS)
|
||||
|
||||
if not success:
|
||||
logger.error(
|
||||
f"Vespa index swap did not succeed. Attempt limit reached. ({VESPA_NUM_ATTEMPTS_ON_STARTUP})"
|
||||
)
|
||||
return None
|
||||
|
||||
return current_search_settings
|
||||
|
||||
|
||||
@@ -139,20 +139,6 @@ def update_user_theme_preference(
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def update_user_chat_background(
|
||||
user_id: UUID,
|
||||
chat_background: str | None,
|
||||
db_session: Session,
|
||||
) -> None:
|
||||
"""Update user's chat background setting."""
|
||||
db_session.execute(
|
||||
update(User)
|
||||
.where(User.id == user_id) # type: ignore
|
||||
.values(chat_background=chat_background)
|
||||
)
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def update_user_personalization(
|
||||
user_id: UUID,
|
||||
*,
|
||||
|
||||
@@ -40,10 +40,3 @@ class DocumentRow(BaseModel):
|
||||
class SortOrder(str, Enum):
|
||||
ASC = "asc"
|
||||
DESC = "desc"
|
||||
|
||||
|
||||
class DiscordChannelView(BaseModel):
|
||||
channel_id: int
|
||||
channel_name: str
|
||||
channel_type: str = "text" # text, forum
|
||||
is_private: bool = False # True if @everyone cannot view the channel
|
||||
|
||||
@@ -287,7 +287,6 @@ def run_deep_research_llm_loop(
|
||||
token_count=100,
|
||||
message_type=MessageType.USER,
|
||||
)
|
||||
|
||||
truncated_message_history = construct_message_history(
|
||||
system_prompt=system_prompt,
|
||||
custom_agent_prompt=None,
|
||||
|
||||
@@ -1,106 +0,0 @@
|
||||
from onyx.configs.app_configs import BLURB_SIZE
|
||||
from onyx.configs.constants import RETURN_SEPARATOR
|
||||
from onyx.context.search.models import InferenceChunk
|
||||
from onyx.context.search.models import InferenceChunkUncleaned
|
||||
from onyx.indexing.models import DocAwareChunk
|
||||
from onyx.indexing.models import DocMetadataAwareIndexChunk
|
||||
|
||||
|
||||
def generate_enriched_content_for_chunk_text(chunk: DocMetadataAwareIndexChunk) -> str:
|
||||
return f"{chunk.title_prefix}{chunk.doc_summary}{chunk.content}{chunk.chunk_context}{chunk.metadata_suffix_keyword}"
|
||||
|
||||
|
||||
def generate_enriched_content_for_chunk_embedding(chunk: DocAwareChunk) -> str:
|
||||
return f"{chunk.title_prefix}{chunk.doc_summary}{chunk.content}{chunk.chunk_context}{chunk.metadata_suffix_semantic}"
|
||||
|
||||
|
||||
def cleanup_content_for_chunks(
|
||||
chunks: list[InferenceChunkUncleaned],
|
||||
) -> list[InferenceChunk]:
|
||||
"""
|
||||
Removes indexing-time content additions from chunks. Inverse of
|
||||
generate_enriched_content_for_chunk.
|
||||
|
||||
During indexing, chunks are augmented with additional text to improve search
|
||||
quality:
|
||||
- Title prepended to content (for better keyword/semantic matching)
|
||||
- Metadata suffix appended to content
|
||||
- Contextual RAG: doc_summary (beginning) and chunk_context (end)
|
||||
|
||||
This function strips these additions before returning chunks to users,
|
||||
restoring the original document content. Cleaning is applied in sequence:
|
||||
1. Title removal:
|
||||
- Full match: Strips exact title from beginning
|
||||
- Partial match: If content starts with title[:BLURB_SIZE], splits on
|
||||
RETURN_SEPARATOR to remove title section
|
||||
2. Metadata suffix removal:
|
||||
- Strips metadata_suffix from end, plus trailing RETURN_SEPARATOR
|
||||
3. Contextual RAG removal:
|
||||
- Strips doc_summary from beginning (if present)
|
||||
- Strips chunk_context from end (if present)
|
||||
|
||||
TODO(andrei): This entire function is not that fantastic, clean it up during
|
||||
QA before rolling out OpenSearch.
|
||||
|
||||
Args:
|
||||
chunks: Chunks as retrieved from the document index with indexing
|
||||
augmentations intact.
|
||||
|
||||
Returns:
|
||||
Clean InferenceChunk objects with augmentations removed, containing only
|
||||
the original document content that should be shown to users.
|
||||
"""
|
||||
|
||||
def _remove_title(chunk: InferenceChunkUncleaned) -> str:
|
||||
# TODO(andrei): This was ported over from
|
||||
# backend/onyx/document_index/vespa/vespa_document_index.py but I don't
|
||||
# think this logic is correct. In Vespa at least we set the title field
|
||||
# from the output of get_title_for_document_index, which is not
|
||||
# necessarily the same data that is prepended to the content; that comes
|
||||
# from title_prefix.
|
||||
# This was added in
|
||||
# https://github.com/onyx-dot-app/onyx/commit/e90c66c1b61c5b7da949652d703f7c906863e6e4#diff-2a2a29d5929de75cdaea77867a397934d9f8b785ce40a861c0d704033e3663ab,
|
||||
# see postprocessing.py. At that time the content enrichment logic was
|
||||
# also added in that commit, see
|
||||
# https://github.com/onyx-dot-app/onyx/commit/e90c66c1b61c5b7da949652d703f7c906863e6e4#diff-d807718aa263a15c1d991a4ab063c360c8419eaad210b4ba70e1e9f47d2aa6d2R77
|
||||
# chunker.py.
|
||||
if not chunk.title or not chunk.content:
|
||||
return chunk.content
|
||||
|
||||
if chunk.content.startswith(chunk.title):
|
||||
return chunk.content[len(chunk.title) :].lstrip()
|
||||
|
||||
# BLURB SIZE is by token instead of char but each token is at least 1 char
|
||||
# If this prefix matches the content, it's assumed the title was prepended
|
||||
if chunk.content.startswith(chunk.title[:BLURB_SIZE]):
|
||||
return (
|
||||
chunk.content.split(RETURN_SEPARATOR, 1)[-1]
|
||||
if RETURN_SEPARATOR in chunk.content
|
||||
else chunk.content
|
||||
)
|
||||
return chunk.content
|
||||
|
||||
def _remove_metadata_suffix(chunk: InferenceChunkUncleaned) -> str:
|
||||
if not chunk.metadata_suffix:
|
||||
return chunk.content
|
||||
return chunk.content.removesuffix(chunk.metadata_suffix).rstrip(
|
||||
RETURN_SEPARATOR
|
||||
)
|
||||
|
||||
def _remove_contextual_rag(chunk: InferenceChunkUncleaned) -> str:
|
||||
# remove document summary
|
||||
if chunk.doc_summary and chunk.content.startswith(chunk.doc_summary):
|
||||
chunk.content = chunk.content[len(chunk.doc_summary) :].lstrip()
|
||||
# remove chunk context
|
||||
if chunk.chunk_context and chunk.content.endswith(chunk.chunk_context):
|
||||
chunk.content = chunk.content[
|
||||
: len(chunk.content) - len(chunk.chunk_context)
|
||||
].rstrip()
|
||||
return chunk.content
|
||||
|
||||
for chunk in chunks:
|
||||
chunk.content = _remove_title(chunk)
|
||||
chunk.content = _remove_metadata_suffix(chunk)
|
||||
chunk.content = _remove_contextual_rag(chunk)
|
||||
|
||||
return [chunk.to_inference_chunk() for chunk in chunks]
|
||||
@@ -1,8 +1,9 @@
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.configs.app_configs import ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
|
||||
from onyx.configs.app_configs import ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX
|
||||
from onyx.configs.app_configs import ENABLE_OPENSEARCH_FOR_ONYX
|
||||
from onyx.db.models import SearchSettings
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.document_index.interfaces import DocumentIndex
|
||||
from onyx.document_index.opensearch.opensearch_document_index import (
|
||||
OpenSearchOldDocumentIndex,
|
||||
@@ -16,24 +17,17 @@ def get_default_document_index(
|
||||
secondary_search_settings: SearchSettings | None,
|
||||
httpx_client: httpx.Client | None = None,
|
||||
) -> DocumentIndex:
|
||||
"""Gets the default document index from env vars.
|
||||
"""Primary index is the index that is used for querying/updating etc.
|
||||
Secondary index is for when both the currently used index and the upcoming
|
||||
index both need to be updated, updates are applied to both indices"""
|
||||
|
||||
To be used for retrieval only. Indexing should be done through both indices
|
||||
until Vespa is deprecated.
|
||||
|
||||
Pre-existing docstring for this function, although secondary indices are not
|
||||
currently supported:
|
||||
Primary index is the index that is used for querying/updating etc. Secondary
|
||||
index is for when both the currently used index and the upcoming index both
|
||||
need to be updated, updates are applied to both indices.
|
||||
"""
|
||||
secondary_index_name: str | None = None
|
||||
secondary_large_chunks_enabled: bool | None = None
|
||||
if secondary_search_settings:
|
||||
secondary_index_name = secondary_search_settings.index_name
|
||||
secondary_large_chunks_enabled = secondary_search_settings.large_chunks_enabled
|
||||
|
||||
if ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX:
|
||||
if ENABLE_OPENSEARCH_FOR_ONYX:
|
||||
return OpenSearchOldDocumentIndex(
|
||||
index_name=search_settings.index_name,
|
||||
secondary_index_name=secondary_index_name,
|
||||
@@ -53,48 +47,12 @@ def get_default_document_index(
|
||||
)
|
||||
|
||||
|
||||
def get_all_document_indices(
|
||||
search_settings: SearchSettings,
|
||||
secondary_search_settings: SearchSettings | None,
|
||||
httpx_client: httpx.Client | None = None,
|
||||
) -> list[DocumentIndex]:
|
||||
"""Gets all document indices.
|
||||
|
||||
NOTE: Will only return an OpenSearch index interface if
|
||||
ENABLE_OPENSEARCH_INDEXING_FOR_ONYX is True. This is so we don't break flows
|
||||
where we know it won't be enabled.
|
||||
|
||||
Used for indexing only. Until Vespa is deprecated we will index into both
|
||||
document indices. Retrieval is done through only one index however.
|
||||
|
||||
Large chunks and secondary indices are not currently supported so we
|
||||
hardcode appropriate values.
|
||||
def get_current_primary_default_document_index(db_session: Session) -> DocumentIndex:
|
||||
"""
|
||||
vespa_document_index = VespaIndex(
|
||||
index_name=search_settings.index_name,
|
||||
secondary_index_name=(
|
||||
secondary_search_settings.index_name if secondary_search_settings else None
|
||||
),
|
||||
large_chunks_enabled=search_settings.large_chunks_enabled,
|
||||
secondary_large_chunks_enabled=(
|
||||
secondary_search_settings.large_chunks_enabled
|
||||
if secondary_search_settings
|
||||
else None
|
||||
),
|
||||
multitenant=MULTI_TENANT,
|
||||
httpx_client=httpx_client,
|
||||
TODO: Use redis to cache this or something
|
||||
"""
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
return get_default_document_index(
|
||||
search_settings,
|
||||
None,
|
||||
)
|
||||
opensearch_document_index: OpenSearchOldDocumentIndex | None = None
|
||||
if ENABLE_OPENSEARCH_INDEXING_FOR_ONYX:
|
||||
opensearch_document_index = OpenSearchOldDocumentIndex(
|
||||
index_name=search_settings.index_name,
|
||||
secondary_index_name=None,
|
||||
large_chunks_enabled=False,
|
||||
secondary_large_chunks_enabled=None,
|
||||
multitenant=MULTI_TENANT,
|
||||
httpx_client=httpx_client,
|
||||
)
|
||||
result: list[DocumentIndex] = [vespa_document_index]
|
||||
if opensearch_document_index:
|
||||
result.append(opensearch_document_index)
|
||||
return result
|
||||
|
||||
@@ -167,9 +167,9 @@ class IndexRetrievalFilters(BaseModel):
|
||||
|
||||
class SchemaVerifiable(abc.ABC):
|
||||
"""
|
||||
Class must implement document index schema verification. For example, verify
|
||||
that all of the necessary attributes for indexing, querying, filtering, and
|
||||
fields to return from search are all valid in the schema.
|
||||
Class must implement document index schema verification. For example, verify that all of the
|
||||
necessary attributes for indexing, querying, filtering, and fields to return from search are
|
||||
all valid in the schema.
|
||||
"""
|
||||
|
||||
@abc.abstractmethod
|
||||
@@ -179,18 +179,13 @@ class SchemaVerifiable(abc.ABC):
|
||||
embedding_precision: EmbeddingPrecision,
|
||||
) -> None:
|
||||
"""
|
||||
Verifies that the document index exists and is consistent with the
|
||||
expectations in the code.
|
||||
Verify that the document index exists and is consistent with the expectations in the code. For certain search
|
||||
engines, the schema needs to be created before indexing can happen. This call should create the schema if it
|
||||
does not exist.
|
||||
|
||||
For certain search engines, the schema needs to be created before
|
||||
indexing can happen. This call should create the schema if it does not
|
||||
exist.
|
||||
|
||||
Args:
|
||||
embedding_dim: Vector dimensionality for the vector similarity part
|
||||
of the search.
|
||||
embedding_precision: Precision of the values of the vectors for the
|
||||
similarity part of the search.
|
||||
Parameters:
|
||||
- embedding_dim: Vector dimensionality for the vector similarity part of the search
|
||||
- embedding_precision: Precision of the vector similarity part of the search
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@@ -243,8 +238,8 @@ class Deletable(abc.ABC):
|
||||
@abc.abstractmethod
|
||||
def delete(
|
||||
self,
|
||||
# TODO(andrei): Fine for now but this can probably be a batch operation
|
||||
# that takes in a list of IDs.
|
||||
# TODO(andrei): Fine for now but this can probably be a batch operation that
|
||||
# takes in a list of IDs.
|
||||
document_id: str,
|
||||
chunk_count: int | None = None,
|
||||
# TODO(andrei): Shouldn't this also have some acl filtering at minimum?
|
||||
@@ -288,7 +283,10 @@ class Updatable(abc.ABC):
|
||||
self,
|
||||
update_requests: list[MetadataUpdateRequest],
|
||||
) -> None:
|
||||
"""Updates some set of chunks.
|
||||
"""
|
||||
Updates some set of chunks. The document and fields to update are specified in the update
|
||||
requests. Each update request in the list applies its changes to a list of document ids.
|
||||
None values mean that the field does not need an update.
|
||||
|
||||
The document and fields to update are specified in the update requests.
|
||||
Each update request in the list applies its changes to a list of
|
||||
|
||||
@@ -1,12 +1,8 @@
|
||||
import logging
|
||||
import time
|
||||
from typing import Any
|
||||
from typing import Generic
|
||||
from typing import TypeVar
|
||||
|
||||
from opensearchpy import OpenSearch
|
||||
from opensearchpy.exceptions import TransportError
|
||||
from pydantic import BaseModel
|
||||
|
||||
from onyx.configs.app_configs import OPENSEARCH_ADMIN_PASSWORD
|
||||
from onyx.configs.app_configs import OPENSEARCH_ADMIN_USERNAME
|
||||
@@ -21,36 +17,10 @@ from onyx.utils.logger import setup_logger
|
||||
logger = setup_logger(__name__)
|
||||
# Set the logging level to WARNING to ignore INFO and DEBUG logs from
|
||||
# opensearch. By default it emits INFO-level logs for every request.
|
||||
# TODO(andrei): I don't think this is working as intended, I still see spam in
|
||||
# logs. The module name is probably wrong or opensearchpy initializes a logger
|
||||
# dynamically along with an instance of a client class. Look at the constructor
|
||||
# for OpenSearch.
|
||||
opensearch_logger = logging.getLogger("opensearchpy")
|
||||
opensearch_logger.setLevel(logging.WARNING)
|
||||
|
||||
|
||||
SchemaDocumentModel = TypeVar("SchemaDocumentModel")
|
||||
|
||||
|
||||
class SearchHit(BaseModel, Generic[SchemaDocumentModel]):
|
||||
"""Represents a hit from OpenSearch in response to a query.
|
||||
|
||||
Templated on the specific document model as defined by a schema.
|
||||
"""
|
||||
|
||||
model_config = {"frozen": True}
|
||||
|
||||
# The document chunk source retrieved from OpenSearch.
|
||||
document_chunk: SchemaDocumentModel
|
||||
# The match score for the document chunk as calculated by OpenSearch. Only
|
||||
# relevant for "fuzzy searches"; this will be None for direct queries where
|
||||
# score is not relevant like direct retrieval on ID.
|
||||
score: float | None = None
|
||||
# Maps schema property name to a list of highlighted snippets with match
|
||||
# terms wrapped in tags (e.g. "something <hi>keyword</hi> other thing").
|
||||
match_highlights: dict[str, list[str]] = {}
|
||||
|
||||
|
||||
class OpenSearchClient:
|
||||
"""Client for interacting with OpenSearch.
|
||||
|
||||
@@ -260,9 +230,9 @@ class OpenSearchClient:
|
||||
)
|
||||
result_string: str = result.get("result", "")
|
||||
match result_string:
|
||||
# Sanity check.
|
||||
case "created":
|
||||
return
|
||||
# Sanity check.
|
||||
case "updated":
|
||||
raise RuntimeError(
|
||||
f'The OpenSearch client returned result "updated" for indexing document chunk "{document_chunk_id}". '
|
||||
@@ -337,49 +307,9 @@ class OpenSearchClient:
|
||||
|
||||
return num_deleted
|
||||
|
||||
def update_document(
|
||||
self, document_chunk_id: str, properties_to_update: dict[str, Any]
|
||||
) -> None:
|
||||
"""Updates a document's properties.
|
||||
|
||||
Args:
|
||||
document_chunk_id: The OpenSearch ID of the document chunk to
|
||||
update.
|
||||
properties_to_update: The properties of the document to update. Each
|
||||
property should exist in the schema.
|
||||
|
||||
Raises:
|
||||
Exception: There was an error updating the document.
|
||||
"""
|
||||
update_body: dict[str, Any] = {"doc": properties_to_update}
|
||||
result = self._client.update(
|
||||
index=self._index_name,
|
||||
id=document_chunk_id,
|
||||
body=update_body,
|
||||
_source=False,
|
||||
)
|
||||
result_id = result.get("_id", "")
|
||||
# Sanity check.
|
||||
if result_id != document_chunk_id:
|
||||
raise RuntimeError(
|
||||
f'Upon trying to update a document, OpenSearch responded with ID "{result_id}" '
|
||||
f'instead of "{document_chunk_id}" which is the ID it was given.'
|
||||
)
|
||||
result_string: str = result.get("result", "")
|
||||
match result_string:
|
||||
# Sanity check.
|
||||
case "updated":
|
||||
return
|
||||
case "noop":
|
||||
logger.warning(
|
||||
f'OpenSearch reported a no-op when trying to update document with ID "{document_chunk_id}".'
|
||||
)
|
||||
return
|
||||
case _:
|
||||
raise RuntimeError(
|
||||
f'The OpenSearch client returned result "{result_string}" for updating document chunk "{document_chunk_id}". '
|
||||
"This is unexpected."
|
||||
)
|
||||
def update_document(self) -> None:
|
||||
# TODO(andrei): Implement this.
|
||||
raise NotImplementedError("Not implemented.")
|
||||
|
||||
def get_document(self, document_chunk_id: str) -> DocumentChunk:
|
||||
"""Gets a document.
|
||||
@@ -448,13 +378,12 @@ class OpenSearchClient:
|
||||
|
||||
def search(
|
||||
self, body: dict[str, Any], search_pipeline_id: str | None
|
||||
) -> list[SearchHit[DocumentChunk]]:
|
||||
) -> list[DocumentChunk]:
|
||||
"""Searches the index.
|
||||
|
||||
TODO(andrei): Ideally we could check that every field in the body is
|
||||
present in the index, to avoid a class of runtime bugs that could easily
|
||||
be caught during development. Or change the function signature to accept
|
||||
a predefined pydantic model of allowed fields.
|
||||
be caught during development.
|
||||
|
||||
Args:
|
||||
body: The body of the search request. See the OpenSearch
|
||||
@@ -466,7 +395,7 @@ class OpenSearchClient:
|
||||
Exception: There was an error searching the index.
|
||||
|
||||
Returns:
|
||||
List of search hits that match the search request.
|
||||
List of document chunks that match the search request.
|
||||
"""
|
||||
result: dict[str, Any]
|
||||
if search_pipeline_id:
|
||||
@@ -478,22 +407,15 @@ class OpenSearchClient:
|
||||
|
||||
hits = self._get_hits_from_search_result(result)
|
||||
|
||||
search_hits: list[SearchHit[DocumentChunk]] = []
|
||||
result_chunks: list[DocumentChunk] = []
|
||||
for hit in hits:
|
||||
document_chunk_source: dict[str, Any] | None = hit.get("_source")
|
||||
if not document_chunk_source:
|
||||
raise RuntimeError(
|
||||
f"Document chunk with ID \"{hit.get('_id', '')}\" has no data."
|
||||
)
|
||||
document_chunk_score = hit.get("_score", None)
|
||||
match_highlights: dict[str, list[str]] = hit.get("highlight", {})
|
||||
search_hit = SearchHit[DocumentChunk](
|
||||
document_chunk=DocumentChunk.model_validate(document_chunk_source),
|
||||
score=document_chunk_score,
|
||||
match_highlights=match_highlights,
|
||||
)
|
||||
search_hits.append(search_hit)
|
||||
return search_hits
|
||||
result_chunks.append(DocumentChunk.model_validate(document_chunk_source))
|
||||
return result_chunks
|
||||
|
||||
def search_for_document_ids(self, body: dict[str, Any]) -> list[str]:
|
||||
"""Searches the index and returns only document chunk IDs.
|
||||
@@ -570,9 +492,6 @@ class OpenSearchClient:
|
||||
def close(self) -> None:
|
||||
"""Closes the client.
|
||||
|
||||
TODO(andrei): Can we have some way to auto close when the client no
|
||||
longer has any references?
|
||||
|
||||
Raises:
|
||||
Exception: There was an error closing the client.
|
||||
"""
|
||||
@@ -600,55 +519,3 @@ class OpenSearchClient:
|
||||
)
|
||||
hits_second_layer: list[Any] = hits_first_layer.get("hits", [])
|
||||
return hits_second_layer
|
||||
|
||||
|
||||
def wait_for_opensearch_with_timeout(
|
||||
wait_interval_s: int = 5,
|
||||
wait_limit_s: int = 60,
|
||||
client: OpenSearchClient | None = None,
|
||||
) -> bool:
|
||||
"""Waits for OpenSearch to become ready subject to a timeout.
|
||||
|
||||
Will create a new dummy client if no client is provided. Will close this
|
||||
client at the end of the function. Will not close the client if it was
|
||||
supplied.
|
||||
|
||||
Args:
|
||||
wait_interval_s: The interval in seconds to wait between checks.
|
||||
Defaults to 5.
|
||||
wait_limit_s: The total timeout in seconds to wait for OpenSearch to
|
||||
become ready. Defaults to 60.
|
||||
client: The OpenSearch client to use for pinging. If None, a new dummy
|
||||
client will be created. Defaults to None.
|
||||
|
||||
Returns:
|
||||
True if OpenSearch is ready, False otherwise.
|
||||
"""
|
||||
made_client = False
|
||||
try:
|
||||
if client is None:
|
||||
# NOTE: index_name does not matter because we are only using this object
|
||||
# to ping.
|
||||
# TODO(andrei): Make this better.
|
||||
client = OpenSearchClient(index_name="")
|
||||
made_client = True
|
||||
time_start = time.monotonic()
|
||||
while True:
|
||||
if client.ping():
|
||||
logger.info("[OpenSearch] Readiness probe succeeded. Continuing...")
|
||||
return True
|
||||
time_elapsed = time.monotonic() - time_start
|
||||
if time_elapsed > wait_limit_s:
|
||||
logger.info(
|
||||
f"[OpenSearch] Readiness probe did not succeed within the timeout "
|
||||
f"({wait_limit_s} seconds)."
|
||||
)
|
||||
return False
|
||||
logger.info(
|
||||
f"[OpenSearch] Readiness probe ongoing. elapsed={time_elapsed:.1f} timeout={wait_limit_s:.1f}"
|
||||
)
|
||||
time.sleep(wait_interval_s)
|
||||
finally:
|
||||
if made_client:
|
||||
assert client is not None
|
||||
client.close()
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
@@ -7,7 +6,6 @@ from onyx.configs.chat_configs import TITLE_CONTENT_RATIO
|
||||
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
get_experts_stores_representations,
|
||||
)
|
||||
from onyx.connectors.models import convert_metadata_list_of_strings_to_dict
|
||||
from onyx.context.search.enums import QueryType
|
||||
from onyx.context.search.models import IndexFilters
|
||||
from onyx.context.search.models import InferenceChunk
|
||||
@@ -15,10 +13,6 @@ from onyx.context.search.models import InferenceChunkUncleaned
|
||||
from onyx.context.search.models import QueryExpansionType
|
||||
from onyx.db.enums import EmbeddingPrecision
|
||||
from onyx.db.models import DocumentSource
|
||||
from onyx.document_index.chunk_content_enrichment import cleanup_content_for_chunks
|
||||
from onyx.document_index.chunk_content_enrichment import (
|
||||
generate_enriched_content_for_chunk_text,
|
||||
)
|
||||
from onyx.document_index.interfaces import DocumentIndex as OldDocumentIndex
|
||||
from onyx.document_index.interfaces import (
|
||||
DocumentInsertionRecord as OldDocumentInsertionRecord,
|
||||
@@ -35,16 +29,8 @@ from onyx.document_index.interfaces_new import IndexingMetadata
|
||||
from onyx.document_index.interfaces_new import MetadataUpdateRequest
|
||||
from onyx.document_index.interfaces_new import TenantState
|
||||
from onyx.document_index.opensearch.client import OpenSearchClient
|
||||
from onyx.document_index.opensearch.client import SearchHit
|
||||
from onyx.document_index.opensearch.schema import ACCESS_CONTROL_LIST_FIELD_NAME
|
||||
from onyx.document_index.opensearch.schema import CONTENT_FIELD_NAME
|
||||
from onyx.document_index.opensearch.schema import DOCUMENT_SETS_FIELD_NAME
|
||||
from onyx.document_index.opensearch.schema import DocumentChunk
|
||||
from onyx.document_index.opensearch.schema import DocumentSchema
|
||||
from onyx.document_index.opensearch.schema import get_opensearch_doc_chunk_id
|
||||
from onyx.document_index.opensearch.schema import GLOBAL_BOOST_FIELD_NAME
|
||||
from onyx.document_index.opensearch.schema import HIDDEN_FIELD_NAME
|
||||
from onyx.document_index.opensearch.schema import USER_PROJECTS_FIELD_NAME
|
||||
from onyx.document_index.opensearch.search import DocumentQuery
|
||||
from onyx.document_index.opensearch.search import (
|
||||
MIN_MAX_NORMALIZATION_PIPELINE_CONFIG,
|
||||
@@ -68,40 +54,14 @@ from shared_configs.model_server_models import Embedding
|
||||
logger = setup_logger(__name__)
|
||||
|
||||
|
||||
def _convert_retrieved_opensearch_chunk_to_inference_chunk_uncleaned(
|
||||
def _convert_opensearch_chunk_to_inference_chunk_uncleaned(
|
||||
chunk: DocumentChunk,
|
||||
score: float | None,
|
||||
highlights: dict[str, list[str]],
|
||||
) -> InferenceChunkUncleaned:
|
||||
"""
|
||||
Generates an inference chunk from an OpenSearch document chunk, its score,
|
||||
and its match highlights.
|
||||
|
||||
Args:
|
||||
chunk: The document chunk returned by OpenSearch.
|
||||
score: The document chunk match score as calculated by OpenSearch. Only
|
||||
relevant for searches like hybrid search. It is acceptable for this
|
||||
value to be None for results from other queries like ID-based
|
||||
retrieval as a match score makes no sense in those contexts.
|
||||
highlights: Maps schema property name to a list of highlighted snippets
|
||||
with match terms wrapped in tags (e.g. "something <hi>keyword</hi>
|
||||
other thing").
|
||||
|
||||
Returns:
|
||||
An Onyx inference chunk representation.
|
||||
"""
|
||||
return InferenceChunkUncleaned(
|
||||
chunk_id=chunk.chunk_index,
|
||||
blurb=chunk.blurb,
|
||||
# Includes extra content prepended/appended during indexing.
|
||||
content=chunk.content,
|
||||
# When we read a string and turn it into a dict the keys will be
|
||||
# strings, but in this case they need to be ints.
|
||||
source_links=(
|
||||
{int(k): v for k, v in json.loads(chunk.source_links).items()}
|
||||
if chunk.source_links
|
||||
else None
|
||||
),
|
||||
source_links=json.loads(chunk.source_links) if chunk.source_links else None,
|
||||
image_file_id=chunk.image_file_id,
|
||||
# Deprecated. Fill in some reasonable default.
|
||||
section_continuation=False,
|
||||
@@ -110,70 +70,66 @@ def _convert_retrieved_opensearch_chunk_to_inference_chunk_uncleaned(
|
||||
semantic_identifier=chunk.semantic_identifier,
|
||||
title=chunk.title,
|
||||
boost=chunk.global_boost,
|
||||
score=score,
|
||||
# TODO(andrei): Do in a followup. We should be able to get this from
|
||||
# OpenSearch.
|
||||
recency_bias=1.0,
|
||||
# TODO(andrei): This is how good the match is, we need this, key insight
|
||||
# is we can order chunks by this. Should not be hard to plumb this from
|
||||
# a search result, do that in a followup.
|
||||
score=None,
|
||||
hidden=chunk.hidden,
|
||||
metadata=(
|
||||
convert_metadata_list_of_strings_to_dict(chunk.metadata_list)
|
||||
if chunk.metadata_list
|
||||
else {}
|
||||
),
|
||||
# Extract highlighted snippets from the content field, if available. In
|
||||
# the future we may want to match on other fields too, currently we only
|
||||
# use the content field.
|
||||
match_highlights=highlights.get(CONTENT_FIELD_NAME, []),
|
||||
metadata=json.loads(chunk.metadata),
|
||||
# TODO(andrei): The vector DB needs to supply this. I vaguely know
|
||||
# OpenSearch can from the documentation I've seen till now, look at this
|
||||
# in a followup.
|
||||
match_highlights=[],
|
||||
# TODO(andrei) Consider storing a chunk content index instead of a full
|
||||
# string when working on chunk content augmentation.
|
||||
doc_summary=chunk.doc_summary,
|
||||
# TODO(andrei) Same thing as above.
|
||||
# TODO(andrei) Same thing as contx ret above, LLM gens context for each
|
||||
# chunk.
|
||||
chunk_context=chunk.chunk_context,
|
||||
updated_at=chunk.last_updated,
|
||||
primary_owners=chunk.primary_owners,
|
||||
secondary_owners=chunk.secondary_owners,
|
||||
# TODO(andrei) Same thing as chunk_context above.
|
||||
metadata_suffix=chunk.metadata_suffix,
|
||||
# TODO(andrei): This is the suffix appended to the end of the chunk
|
||||
# content to assist querying. There are better ways we can do this, for
|
||||
# ex. keeping an index of where to string split from.
|
||||
metadata_suffix=None,
|
||||
)
|
||||
|
||||
|
||||
def _convert_inference_chunk_uncleaned_to_inference_chunk(
|
||||
inference_chunk_uncleaned: InferenceChunkUncleaned,
|
||||
) -> InferenceChunk:
|
||||
# TODO(andrei): Implement this.
|
||||
return inference_chunk_uncleaned.to_inference_chunk()
|
||||
|
||||
|
||||
def _convert_onyx_chunk_to_opensearch_document(
|
||||
chunk: DocMetadataAwareIndexChunk,
|
||||
) -> DocumentChunk:
|
||||
return DocumentChunk(
|
||||
document_id=chunk.source_document.id,
|
||||
chunk_index=chunk.chunk_id,
|
||||
# Use get_title_for_document_index to match the logic used when creating
|
||||
# the title_embedding in the embedder. This method falls back to
|
||||
# semantic_identifier when title is None (but not empty string).
|
||||
title=chunk.source_document.get_title_for_document_index(),
|
||||
title=chunk.source_document.title,
|
||||
title_vector=chunk.title_embedding,
|
||||
content=generate_enriched_content_for_chunk_text(chunk),
|
||||
content=chunk.content,
|
||||
content_vector=chunk.embeddings.full_embedding,
|
||||
source_type=chunk.source_document.source.value,
|
||||
metadata_list=chunk.source_document.get_metadata_str_attributes(),
|
||||
metadata_suffix=chunk.metadata_suffix_keyword,
|
||||
metadata=json.dumps(chunk.source_document.metadata),
|
||||
last_updated=chunk.source_document.doc_updated_at,
|
||||
public=chunk.access.is_public,
|
||||
# TODO(andrei): When going over ACL look very carefully at
|
||||
# access_control_list. Notice DocumentAccess::to_acl prepends every
|
||||
# string with a type.
|
||||
access_control_list=list(chunk.access.to_acl()),
|
||||
global_boost=chunk.boost,
|
||||
semantic_identifier=chunk.source_document.semantic_identifier,
|
||||
image_file_id=chunk.image_file_id,
|
||||
# Small optimization, if this list is empty we can supply None to
|
||||
# OpenSearch and it will not store any data at all for this field, which
|
||||
# is different from supplying an empty list.
|
||||
source_links=json.dumps(chunk.source_links) if chunk.source_links else None,
|
||||
blurb=chunk.blurb,
|
||||
doc_summary=chunk.doc_summary,
|
||||
chunk_context=chunk.chunk_context,
|
||||
# Small optimization, if this list is empty we can supply None to
|
||||
# OpenSearch and it will not store any data at all for this field, which
|
||||
# is different from supplying an empty list.
|
||||
document_sets=list(chunk.document_sets) if chunk.document_sets else None,
|
||||
# Small optimization, if this list is empty we can supply None to
|
||||
# OpenSearch and it will not store any data at all for this field, which
|
||||
# is different from supplying an empty list.
|
||||
user_projects=chunk.user_project or None,
|
||||
project_ids=list(chunk.user_project) if chunk.user_project else None,
|
||||
primary_owners=get_experts_stores_representations(
|
||||
chunk.source_document.primary_owners
|
||||
),
|
||||
@@ -188,6 +144,23 @@ def _convert_onyx_chunk_to_opensearch_document(
|
||||
)
|
||||
|
||||
|
||||
def _enrich_chunk_info() -> None: # pyright: ignore[reportUnusedFunction]
|
||||
# TODO(andrei): Implement this. Until then, we do not enrich chunk content
|
||||
# with title, etc.
|
||||
raise NotImplementedError(
|
||||
"[ANDREI]: Enrich chunk info is not implemented for OpenSearch."
|
||||
)
|
||||
|
||||
|
||||
def _clean_chunk_info() -> None: # pyright: ignore[reportUnusedFunction]
|
||||
# Analogous to _cleanup_chunks in vespa_document_index.py.
|
||||
# TODO(andrei): Implement this. Until then, we do not enrich chunk content
|
||||
# with title, etc.
|
||||
raise NotImplementedError(
|
||||
"[ANDREI]: Clean chunk info is not implemented for OpenSearch."
|
||||
)
|
||||
|
||||
|
||||
class OpenSearchOldDocumentIndex(OldDocumentIndex):
|
||||
"""
|
||||
Wrapper for OpenSearch to adapt the new DocumentIndex interface with
|
||||
@@ -213,10 +186,6 @@ class OpenSearchOldDocumentIndex(OldDocumentIndex):
|
||||
index_name=index_name,
|
||||
secondary_index_name=secondary_index_name,
|
||||
)
|
||||
if multitenant:
|
||||
raise ValueError(
|
||||
"Bug: OpenSearch is not yet ready for multitenant environments but something tried to use it."
|
||||
)
|
||||
self._real_index = OpenSearchDocumentIndex(
|
||||
index_name=index_name,
|
||||
# TODO(andrei): Sus. Do not plug this into production until all
|
||||
@@ -424,24 +393,6 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
def verify_and_create_index_if_necessary(
|
||||
self, embedding_dim: int, embedding_precision: EmbeddingPrecision
|
||||
) -> None:
|
||||
"""Verifies and creates the index if necessary.
|
||||
|
||||
Also puts the desired search pipeline state, creating the pipelines if
|
||||
they do not exist and updating them otherwise.
|
||||
|
||||
Args:
|
||||
embedding_dim: Vector dimensionality for the vector similarity part
|
||||
of the search.
|
||||
embedding_precision: Precision of the values of the vectors for the
|
||||
similarity part of the search.
|
||||
|
||||
Raises:
|
||||
RuntimeError: There was an error verifying or creating the index or
|
||||
search pipelines.
|
||||
"""
|
||||
logger.debug(
|
||||
f"[OpenSearchDocumentIndex] Verifying and creating index {self._index_name} if necessary."
|
||||
)
|
||||
expected_mappings = DocumentSchema.get_document_schema(
|
||||
embedding_dim, self._tenant_state.multitenant
|
||||
)
|
||||
@@ -471,9 +422,6 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
chunks: list[DocMetadataAwareIndexChunk],
|
||||
indexing_metadata: IndexingMetadata,
|
||||
) -> list[DocumentInsertionRecord]:
|
||||
logger.debug(
|
||||
f"[OpenSearchDocumentIndex] Indexing {len(chunks)} chunks for index {self._index_name}."
|
||||
)
|
||||
# Set of doc IDs.
|
||||
unique_docs_to_be_indexed: set[str] = set()
|
||||
document_indexing_results: list[DocumentInsertionRecord] = []
|
||||
@@ -504,6 +452,7 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
opensearch_document_chunk = _convert_onyx_chunk_to_opensearch_document(
|
||||
chunk
|
||||
)
|
||||
# TODO(andrei): Enrich chunk content here.
|
||||
# TODO(andrei): After our client supports batch indexing, use that
|
||||
# here.
|
||||
self._os_client.index_document(opensearch_document_chunk)
|
||||
@@ -518,8 +467,6 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
def delete(self, document_id: str, chunk_count: int | None = None) -> int:
|
||||
"""Deletes all chunks for a given document.
|
||||
|
||||
Does nothing if the specified document ID does not exist.
|
||||
|
||||
TODO(andrei): Make this method require supplying source type.
|
||||
TODO(andrei): Consider implementing this method to delete on document
|
||||
chunk IDs vs querying for matching document chunks.
|
||||
@@ -536,9 +483,6 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
Returns:
|
||||
The number of chunks successfully deleted.
|
||||
"""
|
||||
logger.debug(
|
||||
f"[OpenSearchDocumentIndex] Deleting document {document_id} from index {self._index_name}."
|
||||
)
|
||||
query_body = DocumentQuery.delete_from_document_id_query(
|
||||
document_id=document_id,
|
||||
tenant_state=self._tenant_state,
|
||||
@@ -550,84 +494,15 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
self,
|
||||
update_requests: list[MetadataUpdateRequest],
|
||||
) -> None:
|
||||
"""Updates some set of chunks.
|
||||
|
||||
NOTE: Will raise if the specified document chunks do not exist.
|
||||
NOTE: Requires document chunk count be known; will raise if it is not.
|
||||
NOTE: Each update request must have some field to update; if not it is
|
||||
assumed there is a bug in the caller and this will raise.
|
||||
|
||||
TODO(andrei): Consider exploring a batch API for OpenSearch for this
|
||||
operation.
|
||||
|
||||
Args:
|
||||
update_requests: A list of update requests, each containing a list
|
||||
of document IDs and the fields to update. The field updates
|
||||
apply to all of the specified documents in each update request.
|
||||
|
||||
Raises:
|
||||
RuntimeError: Failed to update some or all of the chunks for the
|
||||
specified documents.
|
||||
"""
|
||||
logger.debug(
|
||||
f"[OpenSearchDocumentIndex] Updating {len(update_requests)} chunks for index {self._index_name}."
|
||||
)
|
||||
for update_request in update_requests:
|
||||
properties_to_update: dict[str, Any] = dict()
|
||||
# TODO(andrei): Nit but consider if we can use DocumentChunk
|
||||
# here so we don't have to think about passing in the
|
||||
# appropriate types into this dict.
|
||||
if update_request.access is not None:
|
||||
properties_to_update[ACCESS_CONTROL_LIST_FIELD_NAME] = list(
|
||||
update_request.access.to_acl()
|
||||
)
|
||||
if update_request.document_sets is not None:
|
||||
properties_to_update[DOCUMENT_SETS_FIELD_NAME] = list(
|
||||
update_request.document_sets
|
||||
)
|
||||
if update_request.boost is not None:
|
||||
properties_to_update[GLOBAL_BOOST_FIELD_NAME] = int(
|
||||
update_request.boost
|
||||
)
|
||||
if update_request.hidden is not None:
|
||||
properties_to_update[HIDDEN_FIELD_NAME] = update_request.hidden
|
||||
if update_request.project_ids is not None:
|
||||
properties_to_update[USER_PROJECTS_FIELD_NAME] = list(
|
||||
update_request.project_ids
|
||||
)
|
||||
|
||||
for doc_id in update_request.document_ids:
|
||||
if not properties_to_update:
|
||||
raise ValueError(
|
||||
f"Bug: Tried to update document {doc_id} with no updated fields or user fields."
|
||||
)
|
||||
|
||||
doc_chunk_count = update_request.doc_id_to_chunk_cnt.get(doc_id, -1)
|
||||
if doc_chunk_count < 0:
|
||||
raise ValueError(
|
||||
f"Tried to update document {doc_id} but its chunk count is not known. Older versions of the "
|
||||
"application used to permit this but is not a supported state for a document when using OpenSearch."
|
||||
)
|
||||
if doc_chunk_count == 0:
|
||||
raise ValueError(
|
||||
f"Bug: Tried to update document {doc_id} but its chunk count was 0."
|
||||
)
|
||||
|
||||
for chunk_index in range(doc_chunk_count):
|
||||
document_chunk_id = get_opensearch_doc_chunk_id(
|
||||
document_id=doc_id, chunk_index=chunk_index
|
||||
)
|
||||
self._os_client.update_document(
|
||||
document_chunk_id=document_chunk_id,
|
||||
properties_to_update=properties_to_update,
|
||||
)
|
||||
logger.info("[ANDREI]: Updating documents...")
|
||||
# TODO(andrei): This needs to be implemented. I explicitly do not raise
|
||||
# here despite this not being implemented because indexing calls this
|
||||
# method so it is very hard to test other methods of this class if this
|
||||
# raises.
|
||||
|
||||
def id_based_retrieval(
|
||||
self,
|
||||
chunk_requests: list[DocumentSectionRequest],
|
||||
# TODO(andrei): When going over ACL look very carefully at
|
||||
# access_control_list. Notice DocumentAccess::to_acl prepends every
|
||||
# string with a type.
|
||||
filters: IndexFilters,
|
||||
# TODO(andrei): Remove this from the new interface at some point; we
|
||||
# should not be exposing this.
|
||||
@@ -637,12 +512,9 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
TODO(andrei): Consider implementing this method to retrieve on document
|
||||
chunk IDs vs querying for matching document chunks.
|
||||
"""
|
||||
logger.debug(
|
||||
f"[OpenSearchDocumentIndex] Retrieving {len(chunk_requests)} chunks for index {self._index_name}."
|
||||
)
|
||||
results: list[InferenceChunk] = []
|
||||
for chunk_request in chunk_requests:
|
||||
search_hits: list[SearchHit[DocumentChunk]] = []
|
||||
document_chunks: list[DocumentChunk] = []
|
||||
query_body = DocumentQuery.get_from_document_id_query(
|
||||
document_id=chunk_request.document_id,
|
||||
tenant_state=self._tenant_state,
|
||||
@@ -650,20 +522,22 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
min_chunk_index=chunk_request.min_chunk_ind,
|
||||
max_chunk_index=chunk_request.max_chunk_ind,
|
||||
)
|
||||
search_hits = self._os_client.search(
|
||||
document_chunks = self._os_client.search(
|
||||
body=query_body,
|
||||
search_pipeline_id=None,
|
||||
)
|
||||
inference_chunks_uncleaned: list[InferenceChunkUncleaned] = [
|
||||
_convert_retrieved_opensearch_chunk_to_inference_chunk_uncleaned(
|
||||
search_hit.document_chunk, None, {}
|
||||
)
|
||||
for search_hit in search_hits
|
||||
inference_chunks_uncleaned = [
|
||||
_convert_opensearch_chunk_to_inference_chunk_uncleaned(document_chunk)
|
||||
for document_chunk in document_chunks
|
||||
]
|
||||
inference_chunks = [
|
||||
_convert_inference_chunk_uncleaned_to_inference_chunk(
|
||||
inference_chunk_uncleaned
|
||||
)
|
||||
for inference_chunk_uncleaned in inference_chunks_uncleaned
|
||||
]
|
||||
inference_chunks: list[InferenceChunk] = cleanup_content_for_chunks(
|
||||
inference_chunks_uncleaned
|
||||
)
|
||||
results.extend(inference_chunks)
|
||||
# TODO(andrei): Clean chunk content here.
|
||||
return results
|
||||
|
||||
def hybrid_retrieval(
|
||||
@@ -672,16 +546,10 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
query_embedding: Embedding,
|
||||
final_keywords: list[str] | None,
|
||||
query_type: QueryType,
|
||||
# TODO(andrei): When going over ACL look very carefully at
|
||||
# access_control_list. Notice DocumentAccess::to_acl prepends every
|
||||
# string with a type.
|
||||
filters: IndexFilters,
|
||||
num_to_retrieve: int,
|
||||
offset: int = 0,
|
||||
) -> list[InferenceChunk]:
|
||||
logger.debug(
|
||||
f"[OpenSearchDocumentIndex] Hybrid retrieving {num_to_retrieve} chunks for index {self._index_name}."
|
||||
)
|
||||
query_body = DocumentQuery.get_hybrid_search_query(
|
||||
query_text=query,
|
||||
query_vector=query_embedding,
|
||||
@@ -689,27 +557,25 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
num_hits=num_to_retrieve,
|
||||
tenant_state=self._tenant_state,
|
||||
)
|
||||
search_hits: list[SearchHit[DocumentChunk]] = self._os_client.search(
|
||||
document_chunks = self._os_client.search(
|
||||
body=query_body,
|
||||
search_pipeline_id=MIN_MAX_NORMALIZATION_PIPELINE_NAME,
|
||||
)
|
||||
inference_chunks_uncleaned: list[InferenceChunkUncleaned] = [
|
||||
_convert_retrieved_opensearch_chunk_to_inference_chunk_uncleaned(
|
||||
search_hit.document_chunk, search_hit.score, search_hit.match_highlights
|
||||
)
|
||||
for search_hit in search_hits
|
||||
# TODO(andrei): Clean chunk content here.
|
||||
inference_chunks_uncleaned = [
|
||||
_convert_opensearch_chunk_to_inference_chunk_uncleaned(document_chunk)
|
||||
for document_chunk in document_chunks
|
||||
]
|
||||
inference_chunks = [
|
||||
_convert_inference_chunk_uncleaned_to_inference_chunk(
|
||||
inference_chunk_uncleaned
|
||||
)
|
||||
for inference_chunk_uncleaned in inference_chunks_uncleaned
|
||||
]
|
||||
inference_chunks: list[InferenceChunk] = cleanup_content_for_chunks(
|
||||
inference_chunks_uncleaned
|
||||
)
|
||||
|
||||
return inference_chunks
|
||||
|
||||
def random_retrieval(
|
||||
self,
|
||||
# TODO(andrei): When going over ACL look very carefully at
|
||||
# access_control_list. Notice DocumentAccess::to_acl prepends every
|
||||
# string with a type.
|
||||
filters: IndexFilters,
|
||||
num_to_retrieve: int = 100,
|
||||
dirty: bool | None = None,
|
||||
|
||||
@@ -25,7 +25,7 @@ TITLE_VECTOR_FIELD_NAME = "title_vector"
|
||||
CONTENT_FIELD_NAME = "content"
|
||||
CONTENT_VECTOR_FIELD_NAME = "content_vector"
|
||||
SOURCE_TYPE_FIELD_NAME = "source_type"
|
||||
METADATA_LIST_FIELD_NAME = "metadata_list"
|
||||
METADATA_FIELD_NAME = "metadata"
|
||||
LAST_UPDATED_FIELD_NAME = "last_updated"
|
||||
PUBLIC_FIELD_NAME = "public"
|
||||
ACCESS_CONTROL_LIST_FIELD_NAME = "access_control_list"
|
||||
@@ -35,7 +35,7 @@ SEMANTIC_IDENTIFIER_FIELD_NAME = "semantic_identifier"
|
||||
IMAGE_FILE_ID_FIELD_NAME = "image_file_id"
|
||||
SOURCE_LINKS_FIELD_NAME = "source_links"
|
||||
DOCUMENT_SETS_FIELD_NAME = "document_sets"
|
||||
USER_PROJECTS_FIELD_NAME = "user_projects"
|
||||
PROJECT_IDS_FIELD_NAME = "project_ids"
|
||||
DOCUMENT_ID_FIELD_NAME = "document_id"
|
||||
CHUNK_INDEX_FIELD_NAME = "chunk_index"
|
||||
MAX_CHUNK_SIZE_FIELD_NAME = "max_chunk_size"
|
||||
@@ -43,7 +43,6 @@ TENANT_ID_FIELD_NAME = "tenant_id"
|
||||
BLURB_FIELD_NAME = "blurb"
|
||||
DOC_SUMMARY_FIELD_NAME = "doc_summary"
|
||||
CHUNK_CONTEXT_FIELD_NAME = "chunk_context"
|
||||
METADATA_SUFFIX_FIELD_NAME = "metadata_suffix"
|
||||
PRIMARY_OWNERS_FIELD_NAME = "primary_owners"
|
||||
SECONDARY_OWNERS_FIELD_NAME = "secondary_owners"
|
||||
|
||||
@@ -102,9 +101,12 @@ class DocumentChunk(BaseModel):
|
||||
content_vector: list[float]
|
||||
|
||||
source_type: str
|
||||
# A list of key-value pairs separated by INDEX_SEPARATOR. See
|
||||
# convert_metadata_dict_to_list_of_strings.
|
||||
metadata_list: list[str] | None = None
|
||||
# Contains a string representation of a dict which maps string key to either
|
||||
# string value or list of string values.
|
||||
# TODO(andrei): When we augment content with metadata this can just be an
|
||||
# index pointer, and when we support metadata list that will just be a list
|
||||
# of strings.
|
||||
metadata: str
|
||||
# If it exists, time zone should always be UTC.
|
||||
last_updated: datetime | None = None
|
||||
|
||||
@@ -121,16 +123,12 @@ class DocumentChunk(BaseModel):
|
||||
# chunk text to the link corresponding to that point.
|
||||
source_links: str | None = None
|
||||
blurb: str
|
||||
# doc_summary, chunk_context, and metadata_suffix are all stored simply to
|
||||
# reverse the augmentations to content. Ideally these would just be start
|
||||
# and stop indices into the content string. For legacy reasons they are not
|
||||
# right now.
|
||||
doc_summary: str
|
||||
chunk_context: str
|
||||
metadata_suffix: str | None = None
|
||||
|
||||
document_sets: list[str] | None = None
|
||||
user_projects: list[int] | None = None
|
||||
# User projects.
|
||||
project_ids: list[int] | None = None
|
||||
primary_owners: list[str] | None = None
|
||||
secondary_owners: list[str] | None = None
|
||||
|
||||
@@ -285,12 +283,6 @@ class DocumentSchema:
|
||||
full-text searches.
|
||||
- "store": True fields are stored and can be returned on their own,
|
||||
independent of the parent document.
|
||||
- "index": True fields can be queried on.
|
||||
- "doc_values": True fields can be sorted and aggregated efficiently.
|
||||
Not supported for "text" type fields.
|
||||
- "store": True fields are stored separately from the source document
|
||||
and can thus be returned from a query separately from _source.
|
||||
Generally this is not necessary.
|
||||
|
||||
Args:
|
||||
vector_dimension: The dimension of vector embeddings. Must be a
|
||||
@@ -317,18 +309,10 @@ class DocumentSchema:
|
||||
# TODO(andrei): Ask Yuhong do we want this?
|
||||
"keyword": {"type": "keyword", "ignore_above": 256}
|
||||
},
|
||||
# This makes highlighting text during queries more efficient
|
||||
# at the cost of disk space. See
|
||||
# https://docs.opensearch.org/latest/search-plugins/searching-data/highlight/#methods-of-obtaining-offsets
|
||||
"index_options": "offsets",
|
||||
},
|
||||
CONTENT_FIELD_NAME: {
|
||||
"type": "text",
|
||||
"store": True,
|
||||
# This makes highlighting text during queries more efficient
|
||||
# at the cost of disk space. See
|
||||
# https://docs.opensearch.org/latest/search-plugins/searching-data/highlight/#methods-of-obtaining-offsets
|
||||
"index_options": "offsets",
|
||||
},
|
||||
TITLE_VECTOR_FIELD_NAME: {
|
||||
"type": "knn_vector",
|
||||
@@ -353,7 +337,7 @@ class DocumentSchema:
|
||||
},
|
||||
},
|
||||
SOURCE_TYPE_FIELD_NAME: {"type": "keyword"},
|
||||
METADATA_LIST_FIELD_NAME: {"type": "keyword"},
|
||||
METADATA_FIELD_NAME: {"type": "keyword"},
|
||||
# TODO(andrei): Check if Vespa stores seconds, we may wanna do
|
||||
# seconds here not millis.
|
||||
LAST_UPDATED_FIELD_NAME: {
|
||||
@@ -378,13 +362,11 @@ class DocumentSchema:
|
||||
GLOBAL_BOOST_FIELD_NAME: {"type": "integer"},
|
||||
# This field is only used for displaying a useful name for the
|
||||
# doc in the UI and is not used for searching. Disabling these
|
||||
# features to increase perf. This field is therefore essentially
|
||||
# just metadata.
|
||||
# features to increase perf.
|
||||
SEMANTIC_IDENTIFIER_FIELD_NAME: {
|
||||
"type": "keyword",
|
||||
"index": False,
|
||||
"doc_values": False,
|
||||
# Generally False by default; just making sure.
|
||||
"store": False,
|
||||
},
|
||||
# Same as above; used to display an image along with the doc.
|
||||
@@ -392,7 +374,6 @@ class DocumentSchema:
|
||||
"type": "keyword",
|
||||
"index": False,
|
||||
"doc_values": False,
|
||||
# Generally False by default; just making sure.
|
||||
"store": False,
|
||||
},
|
||||
# Same as above; used to link to the source doc.
|
||||
@@ -400,7 +381,6 @@ class DocumentSchema:
|
||||
"type": "keyword",
|
||||
"index": False,
|
||||
"doc_values": False,
|
||||
# Generally False by default; just making sure.
|
||||
"store": False,
|
||||
},
|
||||
# Same as above; used to quickly summarize the doc in the UI.
|
||||
@@ -408,7 +388,6 @@ class DocumentSchema:
|
||||
"type": "keyword",
|
||||
"index": False,
|
||||
"doc_values": False,
|
||||
# Generally False by default; just making sure.
|
||||
"store": False,
|
||||
},
|
||||
# Same as above.
|
||||
@@ -418,21 +397,12 @@ class DocumentSchema:
|
||||
"type": "keyword",
|
||||
"index": False,
|
||||
"doc_values": False,
|
||||
# Generally False by default; just making sure.
|
||||
"store": False,
|
||||
},
|
||||
# Same as above.
|
||||
# TODO(andrei): If we want to search on this this needs to be
|
||||
# changed.
|
||||
CHUNK_CONTEXT_FIELD_NAME: {
|
||||
"type": "keyword",
|
||||
"index": False,
|
||||
"doc_values": False,
|
||||
# Generally False by default; just making sure.
|
||||
"store": False,
|
||||
},
|
||||
# Same as above.
|
||||
METADATA_SUFFIX_FIELD_NAME: {
|
||||
"type": "keyword",
|
||||
"index": False,
|
||||
"doc_values": False,
|
||||
@@ -440,7 +410,7 @@ class DocumentSchema:
|
||||
},
|
||||
# Product-specific fields.
|
||||
DOCUMENT_SETS_FIELD_NAME: {"type": "keyword"},
|
||||
USER_PROJECTS_FIELD_NAME: {"type": "integer"},
|
||||
PROJECT_IDS_FIELD_NAME: {"type": "integer"},
|
||||
PRIMARY_OWNERS_FIELD_NAME: {"type": "keyword"},
|
||||
SECONDARY_OWNERS_FIELD_NAME: {"type": "keyword"},
|
||||
# OpenSearch metadata fields.
|
||||
|
||||
@@ -244,9 +244,6 @@ class DocumentQuery:
|
||||
query_text, query_vector, num_candidates
|
||||
)
|
||||
hybrid_search_filters = DocumentQuery._get_hybrid_search_filters(tenant_state)
|
||||
match_highlights_configuration = (
|
||||
DocumentQuery._get_match_highlights_configuration()
|
||||
)
|
||||
|
||||
hybrid_search_query: dict[str, Any] = {
|
||||
"bool": {
|
||||
@@ -257,8 +254,6 @@ class DocumentQuery:
|
||||
}
|
||||
}
|
||||
],
|
||||
# TODO(andrei): When revisiting our hybrid query logic see if
|
||||
# this needs to be nested one level down.
|
||||
"filter": hybrid_search_filters,
|
||||
}
|
||||
}
|
||||
@@ -266,7 +261,6 @@ class DocumentQuery:
|
||||
final_hybrid_search_body: dict[str, Any] = {
|
||||
"query": hybrid_search_query,
|
||||
"size": num_hits,
|
||||
"highlight": match_highlights_configuration,
|
||||
}
|
||||
return final_hybrid_search_body
|
||||
|
||||
@@ -352,30 +346,3 @@ class DocumentQuery:
|
||||
{"term": {TENANT_ID_FIELD_NAME: {"value": tenant_state.tenant_id}}}
|
||||
)
|
||||
return hybrid_search_filters
|
||||
|
||||
@staticmethod
|
||||
def _get_match_highlights_configuration() -> dict[str, Any]:
|
||||
"""
|
||||
Gets configuration for returning match highlights for a hit.
|
||||
"""
|
||||
match_highlights_configuration: dict[str, Any] = {
|
||||
"fields": {
|
||||
CONTENT_FIELD_NAME: {
|
||||
# See https://docs.opensearch.org/latest/search-plugins/searching-data/highlight/#highlighter-types
|
||||
"type": "unified",
|
||||
# The length in chars of a match snippet. Somewhat
|
||||
# arbitrarily-chosen. The Vespa codepath limited total
|
||||
# highlights length to 400 chars. fragment_size *
|
||||
# number_of_fragments = 400 should be good enough.
|
||||
"fragment_size": 100,
|
||||
# The number of snippets to return per field per document
|
||||
# hit.
|
||||
"number_of_fragments": 4,
|
||||
# These tags wrap matched keywords and they match what Vespa
|
||||
# used to return. Use them to minimize changes to our code.
|
||||
"pre_tags": ["<hi>"],
|
||||
"post_tags": ["</hi>"],
|
||||
}
|
||||
}
|
||||
}
|
||||
return match_highlights_configuration
|
||||
|
||||
@@ -41,6 +41,7 @@ from onyx.document_index.vespa_constants import MAX_OR_CONDITIONS
|
||||
from onyx.document_index.vespa_constants import METADATA
|
||||
from onyx.document_index.vespa_constants import METADATA_SUFFIX
|
||||
from onyx.document_index.vespa_constants import PRIMARY_OWNERS
|
||||
from onyx.document_index.vespa_constants import RECENCY_BIAS
|
||||
from onyx.document_index.vespa_constants import SEARCH_ENDPOINT
|
||||
from onyx.document_index.vespa_constants import SECONDARY_OWNERS
|
||||
from onyx.document_index.vespa_constants import SECTION_CONTINUATION
|
||||
@@ -141,6 +142,7 @@ def _vespa_hit_to_inference_chunk(
|
||||
title=fields.get(TITLE),
|
||||
semantic_identifier=fields[SEMANTIC_IDENTIFIER],
|
||||
boost=fields.get(BOOST, 1),
|
||||
recency_bias=fields.get("matchfeatures", {}).get(RECENCY_BIAS, 1.0),
|
||||
score=None if null_score else hit.get("relevance", 0),
|
||||
hidden=fields.get(HIDDEN, False),
|
||||
primary_owners=fields.get(PRIMARY_OWNERS),
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user