Compare commits

...

204 Commits

Author SHA1 Message Date
Justin Tahara
f0eb86fb9f fix(ui): Updating Dropdown Modal component (#8033) 2026-02-06 11:59:09 -08:00
Justin Tahara
b422496a4c fix(agents): Removing Label Dependency (#8189) 2026-02-06 11:39:09 -08:00
Justin Tahara
31d6a45b23 chore(chat): Cleaning Error Codes + Tests (#8186) 2026-02-06 11:02:41 -08:00
Justin Tahara
36f3ac1ec5 feat: onyx discord bot - supervisord and kube deployment (#7706) 2026-02-02 15:05:21 -08:00
Wenxi Onyx
74f5b3025a fix: discord svg (can't cherry-pick) 2026-02-02 10:03:39 -08:00
Justin Tahara
c18545d74c feat(desktop): Ensure that UI reflects Light/Dark Toggle (#7684) 2026-02-02 10:03:39 -08:00
Justin Tahara
48171e3700 fix(ui): Agent Saving with other people files (#8095) 2026-02-02 10:03:39 -08:00
Wenxi
f5a5709876 feat: onyx discord bot - frontend (#7497) 2026-02-02 10:03:39 -08:00
Justin Tahara
85868b1b83 fix(desktop): Remove Global Shortcuts (#7914) 2026-01-30 13:46:20 -08:00
Justin Tahara
8dc14c23e6 fix(asana): Workspace Team ID mismatch (#7674) 2026-01-30 13:19:02 -08:00
Jamison Lahman
23821cc0e8 chore(mypy): fix mypy cache issues switching between HEAD and release (#7732) 2026-01-27 15:52:57 -08:00
Jamison Lahman
b359e13281 fix(citations): enable citation sidebar w/ web_search-only assistants (#7888) 2026-01-27 13:26:29 -08:00
Justin Tahara
717f410a4a fix(llm): Hide private models from Agent Creation (#7873) 2026-01-27 12:21:06 -08:00
SubashMohan
ada0946a62 fix(layout): adjust footer margin and prevent page refresh on chatsession drop (#7759) 2026-01-27 11:57:18 -08:00
Jamison Lahman
eb2ac8f5a3 fix(fe): inline code text wraps (#7574) 2026-01-27 11:33:03 -08:00
Nikolas Garza
fbeb57c592 fix(slack): Extract person names and filter garbage in query expansion (#7632) 2026-01-27 11:26:52 -08:00
Nikolas Garza
d6da9c9b85 fix: scroll to bottom when loading existing conversations (#7614) 2026-01-27 11:26:52 -08:00
Nikolas Garza
5aea2e223e fix(billing): remove grandfathered pricing option when subscription lapses (#7583) 2026-01-27 11:26:52 -08:00
Nikolas Garza
1ff91de07e fix: deflake chat user journey test (#7646) 2026-01-27 11:18:27 -08:00
Nikolas Garza
b3dbc69faf fix(tests): use crawler-friendly search query in Exa integration test (#7746) 2026-01-27 11:13:01 -08:00
Yuhong Sun
431597b0f9 fix: LiteLLM Azure models don't stream (#7761) 2026-01-27 10:49:17 -08:00
Yuhong Sun
51b4e5f2fb fix: Azure OpenAI Tool Calls (#7727) 2026-01-27 10:49:17 -08:00
Justin Tahara
9afa04a26b fix(ui): Coda Logo (#7656) 2026-01-26 17:43:54 -08:00
Justin Tahara
70a3a9c0cd fix(ui): User Groups Connectors Fix (#7658) 2026-01-26 17:43:45 -08:00
Justin Tahara
080165356c fix(ui): First Connector Result (#7657) 2026-01-26 17:43:35 -08:00
Justin Tahara
3ae974bdf6 fix(ui): Fix Token Rate Limits Page (#7659) 2026-01-26 17:42:57 -08:00
Justin Tahara
1471658151 fix(vertex ai): Extra Args for Opus 4.5 (#7586) 2026-01-26 17:42:43 -08:00
Justin Tahara
3e85e9c1a3 feat(desktop): Domain Configuration (#7655) 2026-01-26 17:12:33 -08:00
Justin Tahara
851033be5f feat(desktop): Properly Sign Mac App (#7608) 2026-01-26 17:12:24 -08:00
Jamison Lahman
91e974a6cc chore(desktop): make artifact filename version-agnostic (#7679) 2026-01-26 16:20:39 -08:00
Jamison Lahman
38ba4f8a1c chore(deployments): fix region (#7640) 2026-01-26 16:20:39 -08:00
Jamison Lahman
6f02473064 chore(deployments): fetch secrets from AWS (#7584) 2026-01-26 16:20:39 -08:00
Nikolas Garza
f89432009f fix(fe): show scroll-down button when user scrolls up during streaming (#7562) 2026-01-20 07:07:55 +00:00
Jamison Lahman
8ab2bab34e chore(fe): fix sticky header parent height (#7561) 2026-01-20 06:18:32 +00:00
Jamison Lahman
59e0d62512 chore(fe): align assistant icon with chat bar (#7537) 2026-01-19 19:47:18 -08:00
Jamison Lahman
a1471b16a5 fix(fe): chat header is sticky and transparent (#7487) 2026-01-19 19:20:03 -08:00
Yuhong Sun
9d3811cb58 fix: prompt tuning (#7550) 2026-01-19 19:04:18 -08:00
Yuhong Sun
3cd9505383 feat: Memory initial (#7547) 2026-01-19 18:57:13 -08:00
Nikolas Garza
d11829b393 refactor: proxy customer portal session through control plane (#7544) 2026-01-20 01:24:30 +00:00
Nikolas Garza
f6e068e914 feat(billing): add annual pricing support to subscription checkout (#7506) 2026-01-20 00:17:18 +00:00
roshan
0c84edd980 feat: onyx embeddable widget (#7427)
Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
2026-01-20 00:01:10 +00:00
Wenxi
2b274a7683 feat: onyx discord bot - discord client (#7496) 2026-01-20 00:00:20 +00:00
Wenxi
ddd91f2d71 feat: onyx discord bot - api client and cache manager (#7495) 2026-01-19 23:15:17 +00:00
Yuhong Sun
a7c7da0dfc fix: tool call handling for weak models (#7538) 2026-01-19 13:37:00 -08:00
Evan Lohn
b00a3e8b5d fix(test): confluence group sync (#7536) 2026-01-19 21:20:48 +00:00
Raunak Bhagat
d77d1a48f1 fix: Line item fixes (#7513) 2026-01-19 20:25:35 +00:00
Raunak Bhagat
7b4fc6729c fix: Popover size fix (#7521) 2026-01-19 18:44:29 +00:00
Nikolas Garza
1f113c86ef feat(ee): license enforcement middleware (#7483) 2026-01-19 18:03:39 +00:00
Raunak Bhagat
8e38ba3e21 refactor: Fix some onboarding inaccuracies (#7511) 2026-01-19 04:33:27 +00:00
Raunak Bhagat
bb9708a64f refactor: Small styling / prop-naming refactors (#7503) 2026-01-19 02:49:27 +00:00
Raunak Bhagat
8cae97e145 fix: Fix connector-setup modal (#7502) 2026-01-19 00:29:36 +00:00
Wenxi
7e4abca224 feat: onyx discord bot - backend, crud, and apis (#7494) 2026-01-18 23:13:58 +00:00
Yuhong Sun
233a91ea65 chore: drop dead table (#7500) 2026-01-17 20:05:22 -08:00
Yuhong Sun
b30737b6b2 fix: memory leak possibility (#7493) 2026-01-18 02:00:09 +00:00
Yuhong Sun
caf8b85ec2 feat: LLM filter on query endpoint (#7492) 2026-01-17 15:56:07 -08:00
Yuhong Sun
1d13580b63 feat: Keyword Expansions (#7485) 2026-01-17 02:08:53 +00:00
acaprau
00390c53e0 fix(vespa): Make ID retrieval always check for tenant ID; Add additional tenant ID checks in the new interface (#7480) 2026-01-17 01:58:13 +00:00
Raunak Bhagat
66656df9e6 refactor: Layout fixes (#7475) 2026-01-17 01:49:45 +00:00
Jamison Lahman
51d26d7e4c chore(git): git rm plans/ -r (#7482) 2026-01-16 17:03:32 -08:00
Yuhong Sun
198ac8ccbc feat: Doc search optionally returns contents (#7481) 2026-01-16 16:33:01 -08:00
Jamison Lahman
ee6d33f484 refactor(fe): remove redundant as="span" usage (#7479) 2026-01-16 23:57:39 +00:00
Danelegend
7bcb72d055 feat(image-gen): nano banana addition on fe (#7375) 2026-01-16 23:48:43 +00:00
Danelegend
876894e097 feat(img-gen): Add nanobanana to backend (#7403) 2026-01-16 23:35:15 +00:00
Yuhong Sun
7215f56b25 chore: reenable some tests (#7476) 2026-01-16 15:26:18 -08:00
dependabot[bot]
0fd1c34014 chore(deps): bump distributed from 2025.11.0 to 2026.1.1 in /backend/requirements (#7462)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-01-16 22:08:04 +00:00
Jamison Lahman
9e24b41b7b fix(db): ensure migrations are atomic (#7474) 2026-01-16 21:40:19 +00:00
Jamison Lahman
ab3853578b chore(fe): fix WelcomeMessage hydration issue (#7473) 2026-01-16 20:25:48 +00:00
dependabot[bot]
7db969d36a chore(deps): bump pyasn1 from 0.6.1 to 0.6.2 (#7472)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-01-16 20:19:32 +00:00
Nikolas Garza
6cdeb71656 fix(playwright): waitFor instead of waitForSelector for action popover button (#7464) 2026-01-16 20:08:29 +00:00
Yuhong Sun
2c4b2c68b4 enhancement: prompt tuning (#7469) 2026-01-16 11:50:59 -08:00
Yuhong Sun
5301ee7cef Contribution Guidelines (#7468) 2026-01-16 11:24:09 -08:00
Wenxi
f8e6716875 feat: override tenant usage limits for dev mode (#7463) 2026-01-16 18:09:44 +00:00
Wenxi
755c65fd8a feat: url builder for api server http requests (#7442) 2026-01-16 17:52:47 +00:00
Wenxi
90cf5f49e3 fix: delete old notifications first in migration (#7454) 2026-01-16 17:52:10 +00:00
Nikolas Garza
d4068c2b07 fix: improve scroll behavior (#7364) 2026-01-16 16:32:09 +00:00
dependabot[bot]
fd6fa43fe1 chore(deps): bump langchain-text-splitters from 0.3.8 to 0.3.9 (#7459)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-01-16 10:31:39 +00:00
dependabot[bot]
8d5013bf01 chore(deps): bump langchain-core from 0.3.51 to 0.3.81 (#7456)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-01-16 10:06:08 +00:00
dependabot[bot]
dabd7c6263 chore(deps-dev): Bump storybook from 8.6.14 to 8.6.15 in /web (#6847)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-01-16 10:00:52 +00:00
dependabot[bot]
c8c0389675 chore(deps-dev): bump js-yaml from 3.14.1 to 3.14.2 in /web (#7458)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-01-16 09:59:28 +00:00
dependabot[bot]
9cfcfb12e1 chore(deps): remove diff and npm in /web (#7422)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-01-16 09:30:03 +00:00
Jamison Lahman
786a0c2bd0 chore(deps): upgrade widget deps (#7457) 2026-01-16 01:02:51 -08:00
dependabot[bot]
0cd8d3402b chore(deps): bump torch from 2.6.0 to 2.9.1 in /backend/requirements (#5667)
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-01-16 08:41:58 +00:00
Raunak Bhagat
3fa397b24d fix: Fix notifications popover that broke with some modal/popover changes (#7453) 2026-01-16 03:48:40 +00:00
acaprau
e0a97230b8 feat(opensearch): Fix some stuff around metadata to improve code and match what we store in Vespa (#7448) 2026-01-16 03:46:22 +00:00
Raunak Bhagat
7f1272117a fix: Update modal sizings (#7452) 2026-01-16 03:12:20 +00:00
Evan Lohn
79302f19be fix: bedrock non-anthropic prompt caching (#7435) 2026-01-16 02:02:41 +00:00
Raunak Bhagat
4a91e644d4 refactor: User settings hooks (#7445) 2026-01-16 01:41:04 +00:00
Jamison Lahman
ca0318f16e fix(fe): assistant icon is inline with chat (#7449) 2026-01-16 01:40:54 +00:00
Jamison Lahman
be8e0b3a98 refactor(fe): simplify AIMessage render (#7447) 2026-01-16 01:02:15 +00:00
Raunak Bhagat
49c4814c70 fix: Fix invite buttons (#7444)
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-01-16 00:13:00 +00:00
Yuhong Sun
2f945613a2 feat: Backend Search APIs (#7431)
Co-authored-by: acaprau <48705707+acaprau@users.noreply.github.com>
2026-01-15 23:53:56 +00:00
acaprau
e9242ca3a8 feat(opensearch): Implement match highlighting (#7437) 2026-01-15 23:05:07 +00:00
Jamison Lahman
a150de761a chore(devtools): upgrade ods -> v0.3.2 (#7438) 2026-01-15 12:36:06 -08:00
Jamison Lahman
0e792ca6c9 chore(devtools): fix ods README typo (#7441) 2026-01-15 12:27:17 -08:00
Jamison Lahman
6be467a4ac chore(devtools): #7432 follow ups (#7436) 2026-01-15 11:50:11 -08:00
Jamison Lahman
dd91bfcfe6 chore(devtools): ods run-ci (#7432) 2026-01-15 11:10:24 -08:00
SubashMohan
8a72291781 feat(chat): enable Slack federated search based on user preference (#7355) 2026-01-15 17:47:48 +00:00
roshan
b2d71da4eb feat(citations): Add include_citations parameter to control citation processing (#7412)
Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
2026-01-15 17:09:59 +00:00
Jamison Lahman
6e2f851c62 chore(tests): fix nightly model-server tests (#7421) 2026-01-15 08:08:14 -08:00
Yuhong Sun
be078edcb4 feat: Search Backend (#7426) 2026-01-15 02:22:30 +00:00
acaprau
194c54aca3 feat(opensearch): Propagate search scores (#7425) 2026-01-15 01:44:15 +00:00
Raunak Bhagat
9fa7221e24 feat: Agent deletion (#7361)
Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-15 01:15:58 +00:00
Raunak Bhagat
3a5c7ef8ee feat: Agent sharing (#7359)
Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-15 00:42:14 +00:00
Evan Lohn
84458aa0bf chore: default usage limits off2 (#7424) 2026-01-14 23:54:03 +00:00
Danelegend
de57bfa35f refactor(img-gen): encapsulate provider quirks (#7386) 2026-01-14 23:19:08 +00:00
Yuhong Sun
386f8f31ed chore: Turn off reasoning for chat naming (#7423) 2026-01-14 14:06:04 -08:00
Evan Lohn
376f04caea chore: usage limit defaults to off (#7420) 2026-01-14 21:05:51 +00:00
Raunak Bhagat
4b0a3c2b04 fix: Agent editor fix (#7419) 2026-01-14 20:38:11 +00:00
Yuhong Sun
1bd9f9d9a6 chore: Cleanup dead code (#7418) 2026-01-14 20:05:41 +00:00
acaprau
4ac10abaea feat(OpenSearch): Implement update (#7416) 2026-01-14 20:00:08 +00:00
Raunak Bhagat
a66a283af4 fix: Fix small UI rendering bugs in AgentEditorPage (#7417) 2026-01-14 19:52:14 +00:00
Yuhong Sun
bf5da04166 fix: Chat naming for long messages (#7415) 2026-01-14 19:51:10 +00:00
roshan
693487f855 feat(mcp): add support for passing custom headers through send-chat-message API (#7390) 2026-01-14 19:36:49 +00:00
Jamison Lahman
d02a76d7d1 chore(docs): fix is_creation description (#7414) 2026-01-14 19:34:58 +00:00
Danelegend
28e05c6e90 refactor(llm): replace credential_file w/ custom_config in llmconfig (#7401) 2026-01-14 17:52:38 +00:00
Danelegend
a18f546921 fix(chat): Internal search enablement matches source enablement (#7338) 2026-01-14 17:20:38 +00:00
Yuhong Sun
e98dea149e feat: Deep Research Multilingual (#7405) 2026-01-14 05:13:15 +00:00
Yuhong Sun
027c165794 chore: Refactor pre search UI backend (#7399)
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-14 03:08:48 +00:00
Nikolas Garza
14ebe912c8 feat(tools): auto-pin internal search when sources change (#7376) 2026-01-14 02:48:51 +00:00
Evan Lohn
a63b906789 fix(mcp): per-user auth (#7400) 2026-01-14 02:01:47 +00:00
Yuhong Sun
92a68a3c22 fix: LLM failing to give answer on tool call (#7398) 2026-01-14 00:28:01 +00:00
Chris Weaver
95db4ed9c7 feat: add back indexed slack (#7392) 2026-01-14 00:06:35 +00:00
Yuhong Sun
5134d60d48 fix: _url_lookup_variants swallows all non-url document ids (#7387) 2026-01-13 23:38:29 +00:00
Evan Lohn
651a54470d fix: prevent updates from overwriting perm syncing (#7384) 2026-01-13 23:36:01 +00:00
dependabot[bot]
269d243b67 chore(deps): Bump pandas from 2.2.3 to 2.3.3 in /backend (#6670)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-01-13 22:33:53 +00:00
dependabot[bot]
0286dd7da9 chore(deps): Bump dask from 2023.8.1 to 2025.11.0 in /backend (#6671)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-01-13 22:15:33 +00:00
dependabot[bot]
f3a0710d69 chore(deps): Bump docker/metadata-action from 5.9.0 to 5.10.0 (#6669)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-01-13 14:09:33 -08:00
Jamison Lahman
037c2aee3a chore(playwright): skip dall-e test (#7395) 2026-01-13 13:58:20 -08:00
dependabot[bot]
9b2f3d234d chore(deps): bump filelock from 3.20.1 to 3.20.3 in /backend/requirements (#7389)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-01-13 13:22:07 -08:00
Jamison Lahman
7646399cd4 revert: "feat: Enable triple click on content in the chat" (#7393) 2026-01-13 13:21:30 -08:00
dependabot[bot]
d913b93d10 chore(deps-dev): bump virtualenv from 20.35.4 to 20.36.1 in /backend/requirements (#7388)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-01-13 20:41:37 +00:00
Raunak Bhagat
8a0ce4c294 feat: Feedback modals update (#7380) 2026-01-13 19:48:45 +00:00
Wenxi
862c140763 chore: move public tag constant and tag ingestion api (#7383) 2026-01-13 19:36:05 +00:00
Jamison Lahman
47487f1940 chore(fe): fix undefined className in tooltip (#7324) 2026-01-13 19:19:16 +00:00
Jamison Lahman
e3471df940 chore(devtools): upgrade ods to v0.2.2 (#7282) 2026-01-13 11:22:09 -08:00
acaprau
fb33c815b3 feat(opensearch): Refactor and implement chunk content enrichment and cleanup (#7385) 2026-01-13 19:04:49 +00:00
Jamison Lahman
5c6594be73 chore(pre-commit): run npm install after web/package.json changes (#7382) 2026-01-13 18:35:49 +00:00
SubashMohan
8d30a03d7f fix(chat): prevent adding chat sessions to recents that belong to a project (#7377) 2026-01-13 17:57:29 +00:00
Raunak Bhagat
277428f579 refactor: consolidate tabs components into single Tabs.tsx (#7370) 2026-01-13 03:51:48 +00:00
acaprau
9f8c0d4237 feat(opensearch): Even more feature parity, more strict tenant ID checks, OpenSearch client test improvements (#7372)
Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
2026-01-13 03:39:02 +00:00
Jessica Singh
9ccbb6a04b feat(web search): exa crawler (#7326) 2026-01-13 01:42:16 +00:00
Danelegend
58a943f782 fix(tools): Tool name should align with what llm knows (#7352) 2026-01-13 01:04:20 +00:00
roshan
9021c607f2 chore(dr): finer grained tracing for clarification step, research plan step, and orchestration step (#7374)
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2026-01-12 23:58:27 +00:00
Jamison Lahman
c03b0d80fd chore(deps): remove requires-python < 3.13 (#7367) 2026-01-12 23:21:02 +00:00
acaprau
fcf0b316a4 feat(opensearch): More feature parity (#7286) 2026-01-12 23:01:55 +00:00
Jamison Lahman
157f672b4b chore(deps): upgrade numpy, unstructured, unstructured-client (#7369) 2026-01-12 22:58:11 +00:00
dependabot[bot]
51b9484b96 chore(deps): bump actions/upload-artifact from 5.0.0 to 6.0.0 (#6964)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-01-12 21:53:48 +00:00
Danelegend
0c8f55c049 fix(tools): persist enabled tools in ui (#7347) 2026-01-12 21:47:29 +00:00
dependabot[bot]
c7be2571d1 chore(deps): bump tauri-apps/tauri-action from 0.6.0 to 0.6.1 (#7371)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-01-12 13:48:46 -08:00
dependabot[bot]
4948b6cca9 chore(deps): bump actions/stale from 10.1.0 to 10.1.1 (#6965)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-01-12 13:12:24 -08:00
Jamison Lahman
638ea5f316 chore(deps): fix uv-lock hook (#7368) 2026-01-12 12:52:17 -08:00
dependabot[bot]
6e3268ca75 chore(deps): bump pypdf from 6.1.3 to 6.6.0 (#7319)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-01-12 20:36:47 +00:00
Wenxi
d8921df60c fix: onboarding modal styling (#7363) 2026-01-12 20:29:23 +00:00
Yuhong Sun
693d9f5f69 fix: Editing First Message (#7366) 2026-01-12 19:45:01 +00:00
Jamison Lahman
02e17871cc chore(devtools): recommend starting dev dockers with --wait (#7365) 2026-01-12 19:13:00 +00:00
Wenxi
209cfd00b0 fix: only show latest release notification for nightly versions (#7362) 2026-01-12 11:10:28 -08:00
Jessica Singh
cd36baa484 fix(web search): removing site: operator from exa query (#7248) 2026-01-12 18:22:18 +00:00
Raunak Bhagat
c78fe275af refactor: Popover cleanup (#7356) 2026-01-12 12:08:30 +00:00
Raunak Bhagat
c935c4808f fix: More actions cards fixes (#7358) 2026-01-12 03:27:42 -08:00
Raunak Bhagat
4ebcfef541 fix: Fix actions cards (#7357) 2026-01-12 10:57:22 +00:00
SubashMohan
e320ef9d9c Fix/agent creation files (#7346) 2026-01-12 07:00:47 +00:00
Nikolas Garza
9e02438af5 chore: standardize password/secret inputs and update per design docs (#7316) 2026-01-12 06:26:09 +00:00
Danelegend
177e097ddb fix(chat): newly created chats being marked as failed (#7310)
Co-authored-by: Dane Urban <durban@Danes-MacBook-Pro.local>
2026-01-12 02:02:49 +00:00
Wenxi
9ecd47ec31 feat: in app notifications for changelog (#7253) 2026-01-12 01:09:04 +00:00
Nikolas Garza
83f3d29b10 fix: stop federated OAuth modal from appearing permanently after skips (#7351) 2026-01-11 22:20:13 +00:00
Yuhong Sun
12e668cc0f feat: Deep Research Replay (#7340) 2026-01-11 22:17:09 +00:00
SubashMohan
afe8376d5e feat: Exclude image generation providers from LLM fetch in API calls (#7348) 2026-01-11 21:13:25 +00:00
Wenxi
082ef3e096 fix: always start onboarding at first step and track by user (#7315) 2026-01-11 21:03:17 +00:00
Nikolas Garza
cb2951a1c0 perf: switch BeautifulSoup parser from html.parser to lxml for web crawler (#7350) 2026-01-11 20:46:35 +00:00
Corey Auger
eda5598af5 fix: update docs link (#7349)
Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
2026-01-11 12:44:48 -08:00
Justin Tahara
0bbb4b6988 fix(ui): Action Strikethrough when not configured (#7273) 2026-01-11 11:21:17 +00:00
Jamison Lahman
4768aadb20 refactor(fe): WelcomeMessage nits (#7344) 2026-01-10 22:01:48 -08:00
Jamison Lahman
e05e85e782 fix(fe): "Pick a date range" button wrapping (#7343) 2026-01-10 21:22:20 -08:00
Jamison Lahman
6408f61307 fix(fe): avoid internal table scroll on query history page (#7342) 2026-01-10 20:39:17 -08:00
Jamison Lahman
5a5cd51e4f fix(fe): SidebarTabs are Links (#7341) 2026-01-10 20:01:31 -08:00
Danelegend
7c047c47a0 fix(chat): Chat in-progress messages (#7318)
Co-authored-by: Dane Urban <durban@Danes-MacBook-Pro.local>
2026-01-11 00:29:39 +00:00
Evan Lohn
22138bbb33 fix: vertex prompt caching (#7339)
Co-authored-by: Weves <chrisweaver101@gmail.com>
2026-01-11 00:23:39 +00:00
Chris Weaver
7cff1064a8 chore: reenable auto update test (#7146) 2026-01-10 16:00:48 -08:00
Wenxi
deeb6fdcd2 fix: anonymous users cookie and admin panel config (#7321) 2026-01-10 15:12:27 -08:00
Chris Weaver
3e7f4e0aa5 fix: auto-sync (#7337) 2026-01-10 13:43:40 -08:00
Raunak Bhagat
ac73671e35 refactor: Components updates (#7308) 2026-01-10 06:30:39 +00:00
Raunak Bhagat
3c20d132e0 feat: Modal updates (#7306) 2026-01-10 05:13:09 +00:00
Yuhong Sun
0e3e7eb4a2 feat: Create new chat session button after msg send (#7332)
Co-authored-by: Raunak Bhagat <r@rabh.io>
2026-01-10 04:56:54 +00:00
Yuhong Sun
c85aebe8ab Tables (#7333) 2026-01-09 20:40:15 -08:00
Yuhong Sun
a47e6a3146 feat: Enable triple click on content in the chat (#7331)
Co-authored-by: Raunak Bhagat <r@rabh.io>
2026-01-09 20:37:36 -08:00
Jamison Lahman
1e61737e03 fix(fe): Tags have consistent height on hover (#7328) 2026-01-09 20:20:36 -08:00
Wenxi
c7fc1cd5ae chore: allow tenant cleanup script to skip control plane if tenant not found (#7290) 2026-01-10 00:17:26 +00:00
roshan
e2b60bf67c feat(posthog): track message origin analytics in posthog (#7313) 2026-01-10 00:11:17 +00:00
Danelegend
f4d4d14286 fix(chat): post llm loop callback (#7309)
Co-authored-by: Dane Urban <durban@Danes-MacBook-Pro.local>
2026-01-09 23:53:22 +00:00
Yuhong Sun
1c24bc6ea2 Opensearch README (#7327) 2026-01-09 15:53:22 -08:00
Yuhong Sun
cacbd18dcd feat: Opensearch README (#7325) 2026-01-09 15:28:08 -08:00
Nikolas Garza
8527b83b15 fix(sidebar): Allow unpinning all agents and fix icon flicker (#7241) 2026-01-09 14:20:46 -08:00
Nikolas Garza
33e37a1846 fix: make autocomplete opt in (#7317) 2026-01-09 20:04:22 +00:00
Jamison Lahman
d454d8a878 fix(chat): wide tables can be scrolled (#7311) 2026-01-09 19:07:40 +00:00
roshan
00ad65a6a8 feat: chrome extension (#6704) 2026-01-09 18:45:23 +00:00
Nikolas Garza
dac60d403c fix(chat): show "User has stopped generation" indicator when user cancels (#7312) 2026-01-09 18:14:35 +00:00
Evan Lohn
6256b2854d chore: bump indexing usage (#7307) 2026-01-09 17:46:27 +00:00
Danelegend
8acb8e191d fix(chat): use url when name unknown (#7278)
Co-authored-by: Dane Urban <durban@Danes-MacBook-Pro.local>
2026-01-09 17:16:20 +00:00
Evan Lohn
8c4cbddc43 fix: minor perm sync improvements (#7296) 2026-01-09 05:46:23 +00:00
Yuhong Sun
f6cd006bd6 chore: Refactor tool exceptions (#7280) 2026-01-09 04:01:12 +00:00
Jamison Lahman
0033934319 chore(perf): remove isEqual memoization check (#7304)
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2026-01-09 03:20:37 +00:00
Raunak Bhagat
ff87b79d14 fix: Section layout component fix (#7305) 2026-01-08 19:25:33 -08:00
Raunak Bhagat
ebf18af7c9 refactor: UI components cleanup (#7301)
Co-authored-by: Nikolas Garza <90273783+nmgarza5@users.noreply.github.com>
2026-01-09 03:09:20 +00:00
Raunak Bhagat
cf67ae962c feat: Add a new GeneralLayouts file and update layout components (#7297)
Co-authored-by: Nikolas Garza <90273783+nmgarza5@users.noreply.github.com>
2026-01-09 02:50:21 +00:00
639 changed files with 43857 additions and 21054 deletions

View File

@@ -8,7 +8,9 @@ on:
# Set restrictive default permissions for all jobs. Jobs that need more permissions
# should explicitly declare them.
permissions: {}
permissions:
# Required for OIDC authentication with AWS
id-token: write # zizmor: ignore[excessive-permissions]
env:
EDGE_TAG: ${{ startsWith(github.ref_name, 'nightly-latest') }}
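Note: dropping `permissions: {}` for an explicit `id-token: write` lets jobs request a GitHub OIDC token, which `aws-actions/configure-aws-credentials` exchanges for short-lived AWS credentials against an IAM role's trust policy, so no long-lived AWS keys need to live in repo secrets. A minimal sketch of the pattern (job name illustrative; role ARN and region as in this diff; the real workflow pins the action by SHA):

    jobs:
      aws-oidc-example:
        runs-on: ubuntu-latest
        permissions:
          id-token: write   # allow this job to mint an OIDC token
        steps:
          - name: Configure AWS credentials
            uses: aws-actions/configure-aws-credentials@v4
            with:
              role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
              aws-region: us-east-2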
@@ -150,16 +152,30 @@ jobs:
if: always() && needs.check-version-tag.result == 'failure' && github.event_name != 'workflow_dispatch'
runs-on: ubuntu-slim
timeout-minutes: 10
environment: release
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
with:
persist-credentials: false
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
MONITOR_DEPLOYMENTS_WEBHOOK, deploy/monitor-deployments-webhook
parse-json-secrets: true
- name: Send Slack notification
uses: ./.github/actions/slack-notify
with:
webhook-url: ${{ secrets.MONITOR_DEPLOYMENTS_WEBHOOK }}
webhook-url: ${{ env.MONITOR_DEPLOYMENTS_WEBHOOK }}
failed-jobs: "• check-version-tag"
title: "🚨 Version Tag Check Failed"
ref-name: ${{ github.ref_name }}
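Note: `aws-actions/aws-secretsmanager-get-secrets` injects each fetched secret into the job environment as a masked variable named after the alias to the left of the comma (with `parse-json-secrets: true`, JSON-valued secrets are additionally flattened into `ALIAS_KEY` variables). That is why the webhook reference changes from `secrets.MONITOR_DEPLOYMENTS_WEBHOOK` to `env.MONITOR_DEPLOYMENTS_WEBHOOK`. A sketch, assuming the secret is a plain string:

    - name: Get AWS Secrets
      uses: aws-actions/aws-secretsmanager-get-secrets@v2   # pinned by SHA in the real workflow
      with:
        secret-ids: |
          MONITOR_DEPLOYMENTS_WEBHOOK, deploy/monitor-deployments-webhook
        parse-json-secrets: true
    - name: Use the fetched value
      run: |
        # The value is now in the job environment and masked in logs.
        curl -fsS -X POST --data '{"text":"ping"}' "$MONITOR_DEPLOYMENTS_WEBHOOK"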
@@ -168,6 +184,7 @@ jobs:
needs: determine-builds
if: needs.determine-builds.outputs.build-desktop == 'true'
permissions:
id-token: write
contents: write
actions: read
strategy:
@@ -185,12 +202,33 @@ jobs:
runs-on: ${{ matrix.platform }}
timeout-minutes: 90
environment: release
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6.0.1
with:
# NOTE: persist-credentials is needed for tauri-action to create GitHub releases.
persist-credentials: true # zizmor: ignore[artipacked]
- name: Configure AWS credentials
if: startsWith(matrix.platform, 'macos-')
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
if: startsWith(matrix.platform, 'macos-')
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
APPLE_ID, deploy/apple-id
APPLE_PASSWORD, deploy/apple-password
APPLE_CERTIFICATE, deploy/apple-certificate
APPLE_CERTIFICATE_PASSWORD, deploy/apple-certificate-password
KEYCHAIN_PASSWORD, deploy/keychain-password
APPLE_TEAM_ID, deploy/apple-team-id
parse-json-secrets: true
- name: install dependencies (ubuntu only)
if: startsWith(matrix.platform, 'ubuntu-')
run: |
@@ -285,15 +323,40 @@ jobs:
Write-Host "Versions set to: $VERSION"
- uses: tauri-apps/tauri-action@19b93bb55601e3e373a93cfb6eb4242e45f5af20 # ratchet:tauri-apps/tauri-action@action-v0.6.0
- name: Import Apple Developer Certificate
if: startsWith(matrix.platform, 'macos-')
run: |
echo $APPLE_CERTIFICATE | base64 --decode > certificate.p12
security create-keychain -p "$KEYCHAIN_PASSWORD" build.keychain
security default-keychain -s build.keychain
security unlock-keychain -p "$KEYCHAIN_PASSWORD" build.keychain
security set-keychain-settings -t 3600 -u build.keychain
security import certificate.p12 -k build.keychain -P "$APPLE_CERTIFICATE_PASSWORD" -T /usr/bin/codesign
security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k "$KEYCHAIN_PASSWORD" build.keychain
security find-identity -v -p codesigning build.keychain
- name: Verify Certificate
if: startsWith(matrix.platform, 'macos-')
run: |
CERT_INFO=$(security find-identity -v -p codesigning build.keychain | grep -E "(Developer ID Application|Apple Distribution|Apple Development)" | head -n 1)
CERT_ID=$(echo "$CERT_INFO" | awk -F'"' '{print $2}')
echo "CERT_ID=$CERT_ID" >> $GITHUB_ENV
echo "Certificate imported."
- uses: tauri-apps/tauri-action@73fb865345c54760d875b94642314f8c0c894afa # ratchet:tauri-apps/tauri-action@action-v0.6.1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
APPLE_ID: ${{ env.APPLE_ID }}
APPLE_PASSWORD: ${{ env.APPLE_PASSWORD }}
APPLE_SIGNING_IDENTITY: ${{ env.CERT_ID }}
APPLE_TEAM_ID: ${{ env.APPLE_TEAM_ID }}
with:
tagName: ${{ needs.determine-builds.outputs.is-test-run != 'true' && 'v__VERSION__' || format('v0.0.0-dev+{0}', needs.determine-builds.outputs.short-sha) }}
releaseName: ${{ needs.determine-builds.outputs.is-test-run != 'true' && 'v__VERSION__' || format('v0.0.0-dev+{0}', needs.determine-builds.outputs.short-sha) }}
releaseBody: "See the assets to download this version and install."
releaseDraft: true
prerelease: false
assetNamePattern: "[name]_[arch][ext]"
args: ${{ matrix.args }}
build-web-amd64:
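Note: the macOS-only steps above decode the base64 certificate into a throwaway keychain, import it for codesign, and then recover the signing identity name that tauri-action consumes as `APPLE_SIGNING_IDENTITY`. An annotated restatement of the Verify Certificate step (the sample output line is illustrative):

    - name: Verify Certificate
      if: startsWith(matrix.platform, 'macos-')
      run: |
        # `security find-identity` prints lines like:
        #   1) 0123ABCD... "Developer ID Application: Example Corp (TEAM123456)"
        # grep keeps the first identity of an accepted type; awk -F'"' extracts
        # field 2, the quoted identity name, which is what codesign expects.
        CERT_INFO=$(security find-identity -v -p codesigning build.keychain \
          | grep -E "(Developer ID Application|Apple Distribution|Apple Development)" | head -n 1)
        CERT_ID=$(echo "$CERT_INFO" | awk -F'"' '{print $2}')
        echo "CERT_ID=$CERT_ID" >> "$GITHUB_ENV"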
@@ -305,6 +368,7 @@ jobs:
- run-id=${{ github.run_id }}-web-amd64
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -317,6 +381,20 @@ jobs:
with:
persist-credentials: false
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -331,8 +409,8 @@ jobs:
- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}
- name: Build and push AMD64
id: build
@@ -363,6 +441,7 @@ jobs:
- run-id=${{ github.run_id }}-web-arm64
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -375,6 +454,20 @@ jobs:
with:
persist-credentials: false
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -389,8 +482,8 @@ jobs:
- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}
- name: Build and push ARM64
id: build
@@ -423,19 +516,34 @@ jobs:
- run-id=${{ github.run_id }}-merge-web
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: onyxdotapp/onyx-web-server
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}
- name: Docker meta
id: meta
@@ -471,6 +579,7 @@ jobs:
- run-id=${{ github.run_id }}-web-cloud-amd64
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -483,6 +592,20 @@ jobs:
with:
persist-credentials: false
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -497,8 +620,8 @@ jobs:
- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}
- name: Build and push AMD64
id: build
@@ -537,6 +660,7 @@ jobs:
- run-id=${{ github.run_id }}-web-cloud-arm64
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -549,6 +673,20 @@ jobs:
with:
persist-credentials: false
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -563,8 +701,8 @@ jobs:
- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}
- name: Build and push ARM64
id: build
@@ -605,19 +743,34 @@ jobs:
- run-id=${{ github.run_id }}-merge-web-cloud
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: onyxdotapp/onyx-web-server-cloud
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}
- name: Docker meta
id: meta
@@ -650,6 +803,7 @@ jobs:
- run-id=${{ github.run_id }}-backend-amd64
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -662,6 +816,20 @@ jobs:
with:
persist-credentials: false
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -676,8 +844,8 @@ jobs:
- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}
- name: Build and push AMD64
id: build
@@ -707,6 +875,7 @@ jobs:
- run-id=${{ github.run_id }}-backend-arm64
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -719,6 +888,20 @@ jobs:
with:
persist-credentials: false
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -733,8 +916,8 @@ jobs:
- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}
- name: Build and push ARM64
id: build
@@ -766,19 +949,34 @@ jobs:
- run-id=${{ github.run_id }}-merge-backend
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'onyxdotapp/onyx-backend-cloud' || 'onyxdotapp/onyx-backend' }}
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}
- name: Docker meta
id: meta
@@ -815,6 +1013,7 @@ jobs:
- volume=40gb
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -827,6 +1026,20 @@ jobs:
with:
persist-credentials: false
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -843,8 +1056,8 @@ jobs:
- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}
- name: Build and push AMD64
id: build
@@ -879,6 +1092,7 @@ jobs:
- volume=40gb
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -891,6 +1105,20 @@ jobs:
with:
persist-credentials: false
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -907,8 +1135,8 @@ jobs:
- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}
- name: Build and push ARM64
id: build
@@ -944,19 +1172,34 @@ jobs:
- run-id=${{ github.run_id }}-merge-model-server
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'onyxdotapp/onyx-model-server-cloud' || 'onyxdotapp/onyx-model-server' }}
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}
- name: Docker meta
id: meta
@@ -994,11 +1237,26 @@ jobs:
- run-id=${{ github.run_id }}-trivy-scan-web
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: onyxdotapp/onyx-web-server
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Run Trivy vulnerability scanner
uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # ratchet:nick-fields/retry@v3
with:
@@ -1014,8 +1272,8 @@ jobs:
docker run --rm -v $HOME/.cache/trivy:/root/.cache/trivy \
-e TRIVY_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-db:2" \
-e TRIVY_JAVA_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-java-db:1" \
-e TRIVY_USERNAME="${{ secrets.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ secrets.DOCKER_TOKEN }}" \
-e TRIVY_USERNAME="${{ env.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ env.DOCKER_TOKEN }}" \
aquasec/trivy@sha256:a22415a38938a56c379387a8163fcb0ce38b10ace73e593475d3658d578b2436 \
image \
--skip-version-check \
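Note: `TRIVY_USERNAME`/`TRIVY_PASSWORD` let Trivy authenticate its pull of the image under scan from Docker Hub; they now come from the Secrets Manager step (`env.*`) rather than repo secrets, and the same substitution repeats in the web-cloud, backend, and model-server scan jobs below. A stripped-down equivalent of the containerized scan (image tag assumed):

    docker run --rm -v "$HOME/.cache/trivy:/root/.cache/trivy" \
      -e TRIVY_USERNAME="$DOCKER_USERNAME" \
      -e TRIVY_PASSWORD="$DOCKER_TOKEN" \
      aquasec/trivy image --skip-version-check onyxdotapp/onyx-web-server:latest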
@@ -1034,11 +1292,26 @@ jobs:
- run-id=${{ github.run_id }}-trivy-scan-web-cloud
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: onyxdotapp/onyx-web-server-cloud
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Run Trivy vulnerability scanner
uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # ratchet:nick-fields/retry@v3
with:
@@ -1054,8 +1327,8 @@ jobs:
docker run --rm -v $HOME/.cache/trivy:/root/.cache/trivy \
-e TRIVY_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-db:2" \
-e TRIVY_JAVA_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-java-db:1" \
-e TRIVY_USERNAME="${{ secrets.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ secrets.DOCKER_TOKEN }}" \
-e TRIVY_USERNAME="${{ env.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ env.DOCKER_TOKEN }}" \
aquasec/trivy@sha256:a22415a38938a56c379387a8163fcb0ce38b10ace73e593475d3658d578b2436 \
image \
--skip-version-check \
@@ -1074,6 +1347,7 @@ jobs:
- run-id=${{ github.run_id }}-trivy-scan-backend
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'onyxdotapp/onyx-backend-cloud' || 'onyxdotapp/onyx-backend' }}
steps:
@@ -1084,6 +1358,20 @@ jobs:
with:
persist-credentials: false
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Run Trivy vulnerability scanner
uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # ratchet:nick-fields/retry@v3
with:
@@ -1100,8 +1388,8 @@ jobs:
-v ${{ github.workspace }}/backend/.trivyignore:/tmp/.trivyignore:ro \
-e TRIVY_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-db:2" \
-e TRIVY_JAVA_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-java-db:1" \
-e TRIVY_USERNAME="${{ secrets.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ secrets.DOCKER_TOKEN }}" \
-e TRIVY_USERNAME="${{ env.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ env.DOCKER_TOKEN }}" \
aquasec/trivy@sha256:a22415a38938a56c379387a8163fcb0ce38b10ace73e593475d3658d578b2436 \
image \
--skip-version-check \
@@ -1121,11 +1409,26 @@ jobs:
- run-id=${{ github.run_id }}-trivy-scan-model-server
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'onyxdotapp/onyx-model-server-cloud' || 'onyxdotapp/onyx-model-server' }}
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true
- name: Run Trivy vulnerability scanner
uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # ratchet:nick-fields/retry@v3
with:
@@ -1141,8 +1444,8 @@ jobs:
docker run --rm -v $HOME/.cache/trivy:/root/.cache/trivy \
-e TRIVY_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-db:2" \
-e TRIVY_JAVA_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-java-db:1" \
-e TRIVY_USERNAME="${{ secrets.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ secrets.DOCKER_TOKEN }}" \
-e TRIVY_USERNAME="${{ env.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ env.DOCKER_TOKEN }}" \
aquasec/trivy@sha256:a22415a38938a56c379387a8163fcb0ce38b10ace73e593475d3658d578b2436 \
image \
--skip-version-check \
@@ -1170,12 +1473,26 @@ jobs:
# NOTE: Github-hosted runners have about 20s faster queue times and are preferred here.
runs-on: ubuntu-slim
timeout-minutes: 90
environment: release
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
with:
persist-credentials: false
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2
- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
MONITOR_DEPLOYMENTS_WEBHOOK, deploy/monitor-deployments-webhook
parse-json-secrets: true
- name: Determine failed jobs
id: failed-jobs
shell: bash
@@ -1241,7 +1558,7 @@ jobs:
- name: Send Slack notification
uses: ./.github/actions/slack-notify
with:
webhook-url: ${{ secrets.MONITOR_DEPLOYMENTS_WEBHOOK }}
webhook-url: ${{ env.MONITOR_DEPLOYMENTS_WEBHOOK }}
failed-jobs: ${{ steps.failed-jobs.outputs.jobs }}
title: "🚨 Deployment Workflow Failed"
ref-name: ${{ github.ref_name }}

View File

@@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
timeout-minutes: 45
steps:
- uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # ratchet:actions/stale@v10
- uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # ratchet:actions/stale@v10
with:
stale-issue-message: 'This issue is stale because it has been open 75 days with no activity. Remove stale label or comment or this will be closed in 15 days.'
stale-pr-message: 'This PR is stale because it has been open 75 days with no activity. Remove stale label or comment or this will be closed in 15 days.'

View File

@@ -172,7 +172,7 @@ jobs:
- name: Upload Docker logs
if: failure()
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # ratchet:actions/upload-artifact@v5
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
with:
name: docker-logs-${{ matrix.test-dir }}
path: docker-logs/

View File

@@ -310,8 +310,9 @@ jobs:
ONYX_MODEL_SERVER_IMAGE=${ECR_CACHE}:integration-test-model-server-test-${RUN_ID}
INTEGRATION_TESTS_MODE=true
CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS=0.001
AUTO_LLM_UPDATE_INTERVAL_SECONDS=1
AUTO_LLM_UPDATE_INTERVAL_SECONDS=10
MCP_SERVER_ENABLED=true
USE_LIGHTWEIGHT_BACKGROUND_WORKER=false
EOF
- name: Start Docker containers
@@ -438,7 +439,7 @@ jobs:
- name: Upload logs
if: always()
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # ratchet:actions/upload-artifact@v4
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
with:
name: docker-all-logs-${{ matrix.test-dir.name }}
path: ${{ github.workspace }}/docker-compose.log
@@ -567,7 +568,7 @@ jobs:
- name: Upload logs (multi-tenant)
if: always()
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # ratchet:actions/upload-artifact@v4
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
with:
name: docker-all-logs-multitenant
path: ${{ github.workspace }}/docker-compose-multitenant.log

View File

@@ -44,7 +44,7 @@ jobs:
- name: Upload coverage reports
if: always()
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # ratchet:actions/upload-artifact@v4
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
with:
name: jest-coverage-${{ github.run_id }}
path: ./web/coverage

View File

@@ -301,7 +301,7 @@ jobs:
ONYX_MODEL_SERVER_IMAGE=${ECR_CACHE}:integration-test-model-server-test-${RUN_ID}
INTEGRATION_TESTS_MODE=true
MCP_SERVER_ENABLED=true
AUTO_LLM_UPDATE_INTERVAL_SECONDS=1
AUTO_LLM_UPDATE_INTERVAL_SECONDS=10
EOF
- name: Start Docker containers
@@ -424,7 +424,7 @@ jobs:
- name: Upload logs
if: always()
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # ratchet:actions/upload-artifact@v4
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
with:
name: docker-all-logs-${{ matrix.test-dir.name }}
path: ${{ github.workspace }}/docker-compose.log

View File

@@ -435,7 +435,7 @@ jobs:
fi
npx playwright test --project ${PROJECT}
- uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # ratchet:actions/upload-artifact@v4
- uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
if: always()
with:
# Includes test results and trace.zip files
@@ -455,7 +455,7 @@ jobs:
- name: Upload logs
if: success() || failure()
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # ratchet:actions/upload-artifact@v4
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
with:
name: docker-logs-${{ matrix.project }}-${{ github.run_id }}
path: ${{ github.workspace }}/docker-compose.log

View File

@@ -50,8 +50,9 @@ jobs:
uses: runs-on/cache@50350ad4242587b6c8c2baa2e740b1bc11285ff4 # ratchet:runs-on/cache@v4
with:
path: backend/.mypy_cache
key: mypy-${{ runner.os }}-${{ hashFiles('**/*.py', '**/*.pyi', 'backend/pyproject.toml') }}
key: mypy-${{ runner.os }}-${{ github.base_ref || github.event.merge_group.base_ref || 'main' }}-${{ hashFiles('**/*.py', '**/*.pyi', 'backend/pyproject.toml') }}
restore-keys: |
mypy-${{ runner.os }}-${{ github.base_ref || github.event.merge_group.base_ref || 'main' }}-
mypy-${{ runner.os }}-
- name: Run MyPy
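Note: per "chore(mypy): fix mypy cache issues switching between HEAD and release" (#7732) in the commit list above, the cache key now embeds the target branch, so PRs into main and into release branches stop evicting each other's `.mypy_cache`; `restore-keys` then falls back from the branch-scoped prefix to any cache for the OS. An illustrative expansion for a PR targeting main on a Linux runner (hash abbreviated):

    key:          mypy-Linux-main-3f9c1a...
    restore-keys: mypy-Linux-main-   # newest cache from the same base branch
                  mypy-Linux-        # any mypy cache for this OS, as a last resort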

View File

@@ -5,11 +5,6 @@ on:
# This cron expression runs the job daily at 16:00 UTC (9am PT)
- cron: "0 16 * * *"
workflow_dispatch:
inputs:
branch:
description: 'Branch to run the workflow on'
required: false
default: 'main'
permissions:
contents: read
@@ -31,7 +26,11 @@ env:
jobs:
model-check:
# See https://runs-on.com/runners/linux/
runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}-model-check"]
runs-on:
- runs-on
- runner=4cpu-linux-arm64
- "run-id=${{ github.run_id }}-model-check"
- "extras=ecr-cache"
timeout-minutes: 45
env:
@@ -43,108 +42,87 @@ jobs:
with:
persist-credentials: false
- name: Setup Python and Install Dependencies
uses: ./.github/actions/setup-python-and-install-dependencies
with:
requirements: |
backend/requirements/default.txt
backend/requirements/dev.txt
- name: Format branch name for cache
id: format-branch
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
REF_NAME: ${{ github.ref_name }}
run: |
if [ -n "${PR_NUMBER}" ]; then
CACHE_SUFFIX="${PR_NUMBER}"
else
# shellcheck disable=SC2001
CACHE_SUFFIX=$(echo "${REF_NAME}" | sed 's/[^A-Za-z0-9._-]/-/g')
fi
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
# tag every docker image with "test" so that we can spin up the correct set
# of images during testing
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435
# We don't need to build the Web Docker image since it's not yet used
# in the integration tests. We have a separate action to verify that it builds
# successfully.
- name: Pull Model Server Docker image
run: |
docker pull onyxdotapp/onyx-model-server:latest
docker tag onyxdotapp/onyx-model-server:latest onyxdotapp/onyx-model-server:test
- name: Set up Python
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # ratchet:actions/setup-python@v6
- name: Build and load
uses: docker/bake-action@5be5f02ff8819ecd3092ea6b2e6261c31774f2b4 # ratchet:docker/bake-action@v6
env:
TAG: model-server-${{ github.run_id }}
with:
python-version: "3.11"
cache: "pip"
cache-dependency-path: |
backend/requirements/default.txt
backend/requirements/dev.txt
- name: Install Dependencies
run: |
python -m pip install --upgrade pip
pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
load: true
targets: model-server
set: |
model-server.cache-from=type=registry,ref=${{ env.RUNS_ON_ECR_CACHE }}:model-server-cache-${{ github.event.pull_request.head.sha || github.sha }}
model-server.cache-from=type=registry,ref=${{ env.RUNS_ON_ECR_CACHE }}:model-server-cache-${{ steps.format-branch.outputs.cache-suffix }}
model-server.cache-from=type=registry,ref=${{ env.RUNS_ON_ECR_CACHE }}:model-server-cache
model-server.cache-from=type=registry,ref=onyxdotapp/onyx-model-server:latest
model-server.cache-to=type=registry,ref=${{ env.RUNS_ON_ECR_CACHE }}:model-server-cache-${{ github.event.pull_request.head.sha || github.sha }},mode=max
model-server.cache-to=type=registry,ref=${{ env.RUNS_ON_ECR_CACHE }}:model-server-cache-${{ steps.format-branch.outputs.cache-suffix }},mode=max
model-server.cache-to=type=registry,ref=${{ env.RUNS_ON_ECR_CACHE }}:model-server-cache,mode=max
- name: Start Docker containers
id: start_docker
env:
IMAGE_TAG: model-server-${{ github.run_id }}
run: |
cd deployment/docker_compose
ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \
AUTH_TYPE=basic \
REQUIRE_EMAIL_VERIFICATION=false \
DISABLE_TELEMETRY=true \
IMAGE_TAG=test \
docker compose -f docker-compose.model-server-test.yml up -d indexing_model_server
id: start_docker
- name: Wait for service to be ready
run: |
echo "Starting wait-for-service script..."
start_time=$(date +%s)
timeout=300 # 5 minutes in seconds
while true; do
current_time=$(date +%s)
elapsed_time=$((current_time - start_time))
if [ $elapsed_time -ge $timeout ]; then
echo "Timeout reached. Service did not become ready in 5 minutes."
exit 1
fi
# Use curl with error handling to ignore specific exit code 56
response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:9000/api/health || echo "curl_error")
if [ "$response" = "200" ]; then
echo "Service is ready!"
break
elif [ "$response" = "curl_error" ]; then
echo "Curl encountered an error, possibly exit code 56. Continuing to retry..."
else
echo "Service not ready yet (HTTP status $response). Retrying in 5 seconds..."
fi
sleep 5
done
echo "Finished waiting for service."
docker compose \
-f docker-compose.yml \
-f docker-compose.dev.yml \
up -d --wait \
inference_model_server
- name: Run Tests
shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
run: |
py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/llm
py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/embedding
- name: Alert on Failure
if: failure() && github.event_name == 'schedule'
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPO: ${{ github.repository }}
RUN_ID: ${{ github.run_id }}
run: |
curl -X POST \
-H 'Content-type: application/json' \
--data "{\"text\":\"Scheduled Model Tests failed! Check the run at: https://github.com/${REPO}/actions/runs/${RUN_ID}\"}" \
$SLACK_WEBHOOK
uses: ./.github/actions/slack-notify
with:
webhook-url: ${{ secrets.SLACK_WEBHOOK }}
failed-jobs: model-check
title: "🚨 Scheduled Model Tests failed!"
ref-name: ${{ github.ref_name }}
- name: Dump all-container logs (optional)
if: always()
run: |
cd deployment/docker_compose
docker compose -f docker-compose.model-server-test.yml logs --no-color > $GITHUB_WORKSPACE/docker-compose.log || true
docker compose logs --no-color > $GITHUB_WORKSPACE/docker-compose.log || true
- name: Upload logs
if: always()
-uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # ratchet:actions/upload-artifact@v4
+uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
with:
name: docker-all-logs
path: ${{ github.workspace }}/docker-compose.log

.gitignore

@@ -21,6 +21,7 @@ backend/tests/regression/search_quality/*.json
backend/onyx/evals/data/
backend/onyx/evals/one_off/*.json
*.log
+*.csv
# secret files
.env


@@ -11,7 +11,6 @@ repos:
- id: uv-sync
args: ["--locked", "--all-extras"]
- id: uv-lock
-files: ^pyproject\.toml$
- id: uv-export
name: uv-export default.txt
args:
@@ -147,6 +146,22 @@ repos:
pass_filenames: false
files: \.tf$
+- id: npm-install
+  name: npm install
+  description: "Automatically run 'npm install' after a checkout, pull or rebase"
+  language: system
+  entry: bash -c 'cd web && npm install --no-save'
+  pass_filenames: false
+  files: ^web/package(-lock)?\.json$
+  stages: [post-checkout, post-merge, post-rewrite]
+- id: npm-install-check
+  name: npm install --package-lock-only
+  description: "Check the 'web/package-lock.json' is updated"
+  language: system
+  entry: bash -c 'cd web && npm install --package-lock-only'
+  pass_filenames: false
+  files: ^web/package(-lock)?\.json$
# Uses tsgo (TypeScript's native Go compiler) for ~10x faster type checking.
# This is a preview package - if it breaks:
# 1. Try updating: cd web && npm update @typescript/native-preview


@@ -17,12 +17,6 @@ LOG_ONYX_MODEL_INTERACTIONS=True
LOG_LEVEL=debug
# This passes top N results to LLM an additional time for reranking prior to
# answer generation.
# This step is quite heavy on token usage so we disable it for dev generally.
DISABLE_LLM_DOC_RELEVANCE=False
# Useful if you want to toggle auth on/off (google_oauth/OIDC specifically).
OAUTH_CLIENT_ID=<REPLACE THIS>
OAUTH_CLIENT_SECRET=<REPLACE THIS>


@@ -151,6 +151,24 @@
},
"consoleTitle": "Slack Bot Console"
},
+{
+  "name": "Discord Bot",
+  "consoleName": "Discord Bot",
+  "type": "debugpy",
+  "request": "launch",
+  "program": "onyx/onyxbot/discord/client.py",
+  "cwd": "${workspaceFolder}/backend",
+  "envFile": "${workspaceFolder}/.vscode/.env",
+  "env": {
+    "LOG_LEVEL": "DEBUG",
+    "PYTHONUNBUFFERED": "1",
+    "PYTHONPATH": "."
+  },
+  "presentation": {
+    "group": "2"
+  },
+  "consoleTitle": "Discord Bot Console"
+},
{
"name": "MCP Server",
"consoleName": "MCP Server",


@@ -1,262 +1,31 @@
<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/CONTRIBUTING.md"} -->
# Contributing to Onyx
Hey there! We are so excited that you're interested in Onyx.
As an open source project in a rapidly changing space, we welcome all contributions.
## 💃 Guidelines
## Contribution Opportunities
The [GitHub Issues](https://github.com/onyx-dot-app/onyx/issues) page is a great place to look for and share contribution ideas.
### Contribution Opportunities
If you have your own feature that you would like to build, please create an issue so that community members can provide
feedback and thumb it up if they share the need.
The [GitHub Issues](https://github.com/onyx-dot-app/onyx/issues) page is a great place to start for contribution ideas.
To ensure that your contribution is aligned with the project's direction, please reach out to any maintainer on the Onyx team
via [Discord](https://discord.gg/4NA5SbzrWb) or [email](mailto:hello@onyx.app).
## Contributing Code
Please reference the documents in the contributing_guides folder to ensure that the code base is kept to a high standard.
1. dev_setup.md (start here): a guide to setting up a local development environment.
2. contribution_process.md: how to ensure you are building valuable features that will get reviewed and merged.
3. best_practices.md: before asking for reviews, ensure your changes meet the repo code quality standards.
Issues that have been explicitly approved by the maintainers (aligned with the direction of the project)
will be marked with the `approved by maintainers` label.
Issues marked `good first issue` are an especially great place to start.
**Connectors** to other tools are another great place to contribute. For details on how, refer to this
[README.md](https://github.com/onyx-dot-app/onyx/blob/main/backend/onyx/connectors/README.md).
If you have a new/different contribution in mind, we'd love to hear about it!
Your input is vital to making sure that Onyx moves in the right direction.
Before starting on implementation, please raise a GitHub issue.
Also, always feel free to message the founders (Chris Weaver / Yuhong Sun) on
[Discord](https://discord.gg/4NA5SbzrWb) directly about anything at all.
### Contributing Code
To contribute to this project, please follow the
To contribute, please follow the
["fork and pull request"](https://docs.github.com/en/get-started/quickstart/contributing-to-projects) workflow.
When opening a pull request, mention related issues and feel free to tag relevant maintainers.
Before creating a pull request please make sure that the new changes conform to the formatting and linting requirements.
See the [Formatting and Linting](#formatting-and-linting) section for how to run these checks locally.
### Getting Help 🙋
## Getting Help 🙋
We have support channels and generally interesting discussions on our [Discord](https://discord.gg/4NA5SbzrWb).
Our goal is to make contributing as easy as possible. If you run into any issues, please don't hesitate to reach out.
That way, we can help future contributors and users avoid the same issue.
See you there!
We also have support channels and generally interesting discussions on our
[Discord](https://discord.gg/4NA5SbzrWb).
We would love to see you there!
## Get Started 🚀
Onyx, being a fully functional app, relies on some external software, specifically:
- [Postgres](https://www.postgresql.org/) (Relational DB)
- [Vespa](https://vespa.ai/) (Vector DB/Search Engine)
- [Redis](https://redis.io/) (Cache)
- [MinIO](https://min.io/) (File Store)
- [Nginx](https://nginx.org/) (Not needed for development flows generally)
> **Note:**
> This guide provides instructions to build and run Onyx locally from source with Docker containers providing the above external software. We believe this combination is easier for
> development purposes. If you prefer to use pre-built container images, we provide instructions on running the full Onyx stack within Docker below.
### Local Set Up
Be sure to use Python version 3.11. For instructions on installing Python 3.11 on macOS, refer to the [CONTRIBUTING_MACOS.md](./CONTRIBUTING_MACOS.md) readme.
If using a lower version, modifications will have to be made to the code.
If using a higher version, some libraries may not be available (e.g., we had problems with TensorFlow in the past with higher versions of Python).
#### Backend: Python requirements
Currently, we use [uv](https://docs.astral.sh/uv/) and recommend creating a [virtual environment](https://docs.astral.sh/uv/pip/environments/#using-a-virtual-environment).
For convenience, here's a command for it:
```bash
uv venv .venv --python 3.11
source .venv/bin/activate
```
_For Windows, activate the virtual environment using Command Prompt:_
```bash
.venv\Scripts\activate
```
If using PowerShell, the command slightly differs:
```powershell
.venv\Scripts\Activate.ps1
```
Install the required python dependencies:
```bash
uv sync --all-extras
```
Install Playwright for Python (headless browser required by the Web Connector):
```bash
uv run playwright install
```
#### Frontend: Node dependencies
Onyx uses Node v22.20.0. We highly recommend you use [Node Version Manager (nvm)](https://github.com/nvm-sh/nvm)
to manage your Node installations. Once installed, you can run
```bash
nvm install 22 && nvm use 22
node -v # verify your active version
```
Navigate to `onyx/web` and run:
```bash
npm i
```
## Formatting and Linting
### Backend
For the backend, you'll need to set up pre-commit hooks (black / reorder-python-imports).
Then run:
```bash
uv run pre-commit install
```
Additionally, we use `mypy` for static type checking.
Onyx is fully type-annotated, and we want to keep it that way!
To run the mypy checks manually, run `uv run mypy .` from the `onyx/backend` directory.
### Web
We use `prettier` for formatting. The desired version will be installed via `npm i` from the `onyx/web` directory.
To run the formatter, use `npx prettier --write .` from the `onyx/web` directory.
Pre-commit will also run prettier automatically on files you've recently touched. If files get re-formatted, your commit will fail;
re-stage your changes and commit again.
# Running the application for development
## Developing using VSCode Debugger (recommended)
**We highly recommend using VSCode debugger for development.**
See [CONTRIBUTING_VSCODE.md](./CONTRIBUTING_VSCODE.md) for more details.
Otherwise, you can follow the instructions below to run the application for development.
## Manually running the application for development
### Docker containers for external software
You will need Docker installed to run these containers.
First navigate to `onyx/deployment/docker_compose`, then start up Postgres/Vespa/Redis/MinIO with:
```bash
docker compose -f docker-compose.yml -f docker-compose.dev.yml up -d index relational_db cache minio
```
(index refers to Vespa, relational_db refers to Postgres, and cache refers to Redis)
### Running Onyx locally
To start the frontend, navigate to `onyx/web` and run:
```bash
npm run dev
```
Next, start the model server which runs the local NLP models.
Navigate to `onyx/backend` and run:
```bash
uvicorn model_server.main:app --reload --port 9000
```
_For Windows (for compatibility with both PowerShell and Command Prompt):_
```bash
powershell -Command "uvicorn model_server.main:app --reload --port 9000"
```
The first time running Onyx, you will need to run the DB migrations for Postgres.
After the first time, this is no longer required unless the DB models change.
Navigate to `onyx/backend` and with the venv active, run:
```bash
alembic upgrade head
```
Next, start the task queue which orchestrates the background jobs.
Jobs that take more time are run async from the API server.
Still in `onyx/backend`, run:
```bash
python ./scripts/dev_run_background_jobs.py
```
To run the backend API server, navigate back to `onyx/backend` and run:
```bash
AUTH_TYPE=disabled uvicorn onyx.main:app --reload --port 8080
```
_For Windows (for compatibility with both PowerShell and Command Prompt):_
```bash
powershell -Command "
$env:AUTH_TYPE='disabled'
uvicorn onyx.main:app --reload --port 8080
"
```
> **Note:**
> If you need finer logging, add the additional environment variable `LOG_LEVEL=DEBUG` to the relevant services.
#### Wrapping up
You should now have 4 servers running:
- Web server
- Backend API
- Model server
- Background jobs
Now, visit `http://localhost:3000` in your browser. You should see the Onyx onboarding wizard where you can connect your external LLM provider to Onyx.
You've successfully set up a local Onyx instance! 🏁
#### Running the Onyx application in a container
You can run the full Onyx application stack from pre-built images including all external software dependencies.
Navigate to `onyx/deployment/docker_compose` and run:
```bash
docker compose up -d
```
After Docker pulls and starts these containers, navigate to `http://localhost:3000` to use Onyx.
If you want to make changes to Onyx and run those changes in Docker, you can also build a local version of the Onyx container images that incorporates your changes like so:
```bash
docker compose up -d --build
```
### Release Process
## Release Process
Onyx loosely follows the SemVer versioning standard.
Major changes are released with a "minor" version bump. Currently we use patch release versions to indicate small feature changes.
A set of Docker containers will be pushed automatically to DockerHub with every tag.


@@ -225,7 +225,6 @@ def do_run_migrations(
) -> None:
if create_schema:
connection.execute(text(f'CREATE SCHEMA IF NOT EXISTS "{schema_name}"'))
-connection.execute(text("COMMIT"))
connection.execute(text(f'SET search_path TO "{schema_name}"'))
@@ -309,6 +308,7 @@ async def run_async_migrations() -> None:
schema_name=schema,
create_schema=create_schema,
)
+await connection.commit()
except Exception as e:
logger.error(f"Error migrating schema {schema}: {e}")
if not continue_on_error:
@@ -346,6 +346,7 @@ async def run_async_migrations() -> None:
schema_name=schema,
create_schema=create_schema,
)
+await connection.commit()
except Exception as e:
logger.error(f"Error migrating schema {schema}: {e}")
if not continue_on_error:


@@ -85,103 +85,122 @@ class UserRow(NamedTuple):
def upgrade() -> None:
conn = op.get_bind()
# Start transaction
conn.execute(sa.text("BEGIN"))
# Step 1: Create or update the unified assistant (ID 0)
search_assistant = conn.execute(
sa.text("SELECT * FROM persona WHERE id = 0")
).fetchone()
try:
# Step 1: Create or update the unified assistant (ID 0)
search_assistant = conn.execute(
sa.text("SELECT * FROM persona WHERE id = 0")
).fetchone()
if search_assistant:
# Update existing Search assistant to be the unified assistant
conn.execute(
sa.text(
"""
UPDATE persona
SET name = :name,
description = :description,
system_prompt = :system_prompt,
num_chunks = :num_chunks,
is_default_persona = true,
is_visible = true,
deleted = false,
display_priority = :display_priority,
llm_filter_extraction = :llm_filter_extraction,
llm_relevance_filter = :llm_relevance_filter,
recency_bias = :recency_bias,
chunks_above = :chunks_above,
chunks_below = :chunks_below,
datetime_aware = :datetime_aware,
starter_messages = null
WHERE id = 0
"""
),
INSERT_DICT,
)
else:
# Create new unified assistant with ID 0
conn.execute(
sa.text(
"""
INSERT INTO persona (
id, name, description, system_prompt, num_chunks,
is_default_persona, is_visible, deleted, display_priority,
llm_filter_extraction, llm_relevance_filter, recency_bias,
chunks_above, chunks_below, datetime_aware, starter_messages,
builtin_persona
) VALUES (
0, :name, :description, :system_prompt, :num_chunks,
true, true, false, :display_priority, :llm_filter_extraction,
:llm_relevance_filter, :recency_bias, :chunks_above, :chunks_below,
:datetime_aware, null, true
)
"""
),
INSERT_DICT,
)
# Step 2: Mark ALL builtin assistants as deleted (except the unified assistant ID 0)
if search_assistant:
# Update existing Search assistant to be the unified assistant
conn.execute(
sa.text(
"""
UPDATE persona
SET deleted = true, is_visible = false, is_default_persona = false
WHERE builtin_persona = true AND id != 0
SET name = :name,
description = :description,
system_prompt = :system_prompt,
num_chunks = :num_chunks,
is_default_persona = true,
is_visible = true,
deleted = false,
display_priority = :display_priority,
llm_filter_extraction = :llm_filter_extraction,
llm_relevance_filter = :llm_relevance_filter,
recency_bias = :recency_bias,
chunks_above = :chunks_above,
chunks_below = :chunks_below,
datetime_aware = :datetime_aware,
starter_messages = null
WHERE id = 0
"""
)
),
INSERT_DICT,
)
else:
# Create new unified assistant with ID 0
conn.execute(
sa.text(
"""
INSERT INTO persona (
id, name, description, system_prompt, num_chunks,
is_default_persona, is_visible, deleted, display_priority,
llm_filter_extraction, llm_relevance_filter, recency_bias,
chunks_above, chunks_below, datetime_aware, starter_messages,
builtin_persona
) VALUES (
0, :name, :description, :system_prompt, :num_chunks,
true, true, false, :display_priority, :llm_filter_extraction,
:llm_relevance_filter, :recency_bias, :chunks_above, :chunks_below,
:datetime_aware, null, true
)
"""
),
INSERT_DICT,
)
# Step 3: Add all built-in tools to the unified assistant
# First, get the tool IDs for SearchTool, ImageGenerationTool, and WebSearchTool
search_tool = conn.execute(
sa.text("SELECT id FROM tool WHERE in_code_tool_id = 'SearchTool'")
).fetchone()
# Step 2: Mark ALL builtin assistants as deleted (except the unified assistant ID 0)
conn.execute(
sa.text(
"""
UPDATE persona
SET deleted = true, is_visible = false, is_default_persona = false
WHERE builtin_persona = true AND id != 0
"""
)
)
if not search_tool:
raise ValueError(
"SearchTool not found in database. Ensure tools migration has run first."
)
# Step 3: Add all built-in tools to the unified assistant
# First, get the tool IDs for SearchTool, ImageGenerationTool, and WebSearchTool
search_tool = conn.execute(
sa.text("SELECT id FROM tool WHERE in_code_tool_id = 'SearchTool'")
).fetchone()
image_gen_tool = conn.execute(
sa.text("SELECT id FROM tool WHERE in_code_tool_id = 'ImageGenerationTool'")
).fetchone()
if not search_tool:
raise ValueError(
"SearchTool not found in database. Ensure tools migration has run first."
)
if not image_gen_tool:
raise ValueError(
"ImageGenerationTool not found in database. Ensure tools migration has run first."
)
image_gen_tool = conn.execute(
sa.text("SELECT id FROM tool WHERE in_code_tool_id = 'ImageGenerationTool'")
).fetchone()
# WebSearchTool is optional - may not be configured
web_search_tool = conn.execute(
sa.text("SELECT id FROM tool WHERE in_code_tool_id = 'WebSearchTool'")
).fetchone()
if not image_gen_tool:
raise ValueError(
"ImageGenerationTool not found in database. Ensure tools migration has run first."
)
# Clear existing tool associations for persona 0
conn.execute(sa.text("DELETE FROM persona__tool WHERE persona_id = 0"))
# WebSearchTool is optional - may not be configured
web_search_tool = conn.execute(
sa.text("SELECT id FROM tool WHERE in_code_tool_id = 'WebSearchTool'")
).fetchone()
# Add tools to the unified assistant
# Clear existing tool associations for persona 0
conn.execute(sa.text("DELETE FROM persona__tool WHERE persona_id = 0"))
# Add tools to the unified assistant
conn.execute(
sa.text(
"""
INSERT INTO persona__tool (persona_id, tool_id)
VALUES (0, :tool_id)
ON CONFLICT DO NOTHING
"""
),
{"tool_id": search_tool[0]},
)
conn.execute(
sa.text(
"""
INSERT INTO persona__tool (persona_id, tool_id)
VALUES (0, :tool_id)
ON CONFLICT DO NOTHING
"""
),
{"tool_id": image_gen_tool[0]},
)
if web_search_tool:
conn.execute(
sa.text(
"""
@@ -190,191 +209,148 @@ def upgrade() -> None:
ON CONFLICT DO NOTHING
"""
),
{"tool_id": search_tool[0]},
{"tool_id": web_search_tool[0]},
)
conn.execute(
sa.text(
"""
INSERT INTO persona__tool (persona_id, tool_id)
VALUES (0, :tool_id)
ON CONFLICT DO NOTHING
# Step 4: Migrate existing chat sessions from all builtin assistants to unified assistant
conn.execute(
sa.text(
"""
),
{"tool_id": image_gen_tool[0]},
UPDATE chat_session
SET persona_id = 0
WHERE persona_id IN (
SELECT id FROM persona WHERE builtin_persona = true AND id != 0
)
"""
)
)
if web_search_tool:
# Step 5: Migrate user preferences - remove references to all builtin assistants
# First, get all builtin assistant IDs (except 0)
builtin_assistants_result = conn.execute(
sa.text(
"""
SELECT id FROM persona
WHERE builtin_persona = true AND id != 0
"""
)
).fetchall()
builtin_assistant_ids = [row[0] for row in builtin_assistants_result]
# Get all users with preferences
users_result = conn.execute(
sa.text(
"""
SELECT id, chosen_assistants, visible_assistants,
hidden_assistants, pinned_assistants
FROM "user"
"""
)
).fetchall()
for user_row in users_result:
user = UserRow(*user_row)
user_id: UUID = user.id
updates: dict[str, Any] = {}
# Remove all builtin assistants from chosen_assistants
if user.chosen_assistants:
new_chosen: list[int] = [
assistant_id
for assistant_id in user.chosen_assistants
if assistant_id not in builtin_assistant_ids
]
if new_chosen != user.chosen_assistants:
updates["chosen_assistants"] = json.dumps(new_chosen)
# Remove all builtin assistants from visible_assistants
if user.visible_assistants:
new_visible: list[int] = [
assistant_id
for assistant_id in user.visible_assistants
if assistant_id not in builtin_assistant_ids
]
if new_visible != user.visible_assistants:
updates["visible_assistants"] = json.dumps(new_visible)
# Add all builtin assistants to hidden_assistants
if user.hidden_assistants:
new_hidden: list[int] = list(user.hidden_assistants)
for old_id in builtin_assistant_ids:
if old_id not in new_hidden:
new_hidden.append(old_id)
if new_hidden != user.hidden_assistants:
updates["hidden_assistants"] = json.dumps(new_hidden)
else:
updates["hidden_assistants"] = json.dumps(builtin_assistant_ids)
# Remove all builtin assistants from pinned_assistants
if user.pinned_assistants:
new_pinned: list[int] = [
assistant_id
for assistant_id in user.pinned_assistants
if assistant_id not in builtin_assistant_ids
]
if new_pinned != user.pinned_assistants:
updates["pinned_assistants"] = json.dumps(new_pinned)
# Apply updates if any
if updates:
set_clause = ", ".join([f"{k} = :{k}" for k in updates.keys()])
updates["user_id"] = str(user_id) # Convert UUID to string for SQL
conn.execute(
sa.text(
"""
INSERT INTO persona__tool (persona_id, tool_id)
VALUES (0, :tool_id)
ON CONFLICT DO NOTHING
"""
),
{"tool_id": web_search_tool[0]},
sa.text(f'UPDATE "user" SET {set_clause} WHERE id = :user_id'),
updates,
)
# Step 4: Migrate existing chat sessions from all builtin assistants to unified assistant
conn.execute(
sa.text(
"""
UPDATE chat_session
SET persona_id = 0
WHERE persona_id IN (
SELECT id FROM persona WHERE builtin_persona = true AND id != 0
)
"""
)
)
# Step 5: Migrate user preferences - remove references to all builtin assistants
# First, get all builtin assistant IDs (except 0)
builtin_assistants_result = conn.execute(
sa.text(
"""
SELECT id FROM persona
WHERE builtin_persona = true AND id != 0
"""
)
).fetchall()
builtin_assistant_ids = [row[0] for row in builtin_assistants_result]
# Get all users with preferences
users_result = conn.execute(
sa.text(
"""
SELECT id, chosen_assistants, visible_assistants,
hidden_assistants, pinned_assistants
FROM "user"
"""
)
).fetchall()
for user_row in users_result:
user = UserRow(*user_row)
user_id: UUID = user.id
updates: dict[str, Any] = {}
# Remove all builtin assistants from chosen_assistants
if user.chosen_assistants:
new_chosen: list[int] = [
assistant_id
for assistant_id in user.chosen_assistants
if assistant_id not in builtin_assistant_ids
]
if new_chosen != user.chosen_assistants:
updates["chosen_assistants"] = json.dumps(new_chosen)
# Remove all builtin assistants from visible_assistants
if user.visible_assistants:
new_visible: list[int] = [
assistant_id
for assistant_id in user.visible_assistants
if assistant_id not in builtin_assistant_ids
]
if new_visible != user.visible_assistants:
updates["visible_assistants"] = json.dumps(new_visible)
# Add all builtin assistants to hidden_assistants
if user.hidden_assistants:
new_hidden: list[int] = list(user.hidden_assistants)
for old_id in builtin_assistant_ids:
if old_id not in new_hidden:
new_hidden.append(old_id)
if new_hidden != user.hidden_assistants:
updates["hidden_assistants"] = json.dumps(new_hidden)
else:
updates["hidden_assistants"] = json.dumps(builtin_assistant_ids)
# Remove all builtin assistants from pinned_assistants
if user.pinned_assistants:
new_pinned: list[int] = [
assistant_id
for assistant_id in user.pinned_assistants
if assistant_id not in builtin_assistant_ids
]
if new_pinned != user.pinned_assistants:
updates["pinned_assistants"] = json.dumps(new_pinned)
# Apply updates if any
if updates:
set_clause = ", ".join([f"{k} = :{k}" for k in updates.keys()])
updates["user_id"] = str(user_id) # Convert UUID to string for SQL
conn.execute(
sa.text(f'UPDATE "user" SET {set_clause} WHERE id = :user_id'),
updates,
)
# Commit transaction
conn.execute(sa.text("COMMIT"))
except Exception as e:
# Rollback on error
conn.execute(sa.text("ROLLBACK"))
raise e
def downgrade() -> None:
conn = op.get_bind()
# Start transaction
conn.execute(sa.text("BEGIN"))
try:
# Only restore General (ID -1) and Art (ID -3) assistants
# Step 1: Keep Search assistant (ID 0) as default but restore original state
conn.execute(
sa.text(
"""
UPDATE persona
SET is_default_persona = true,
is_visible = true,
deleted = false
WHERE id = 0
# Only restore General (ID -1) and Art (ID -3) assistants
# Step 1: Keep Search assistant (ID 0) as default but restore original state
conn.execute(
sa.text(
"""
)
UPDATE persona
SET is_default_persona = true,
is_visible = true,
deleted = false
WHERE id = 0
"""
)
)
# Step 2: Restore General assistant (ID -1)
conn.execute(
sa.text(
"""
UPDATE persona
SET deleted = false,
is_visible = true,
is_default_persona = true
WHERE id = :general_assistant_id
# Step 2: Restore General assistant (ID -1)
conn.execute(
sa.text(
"""
),
{"general_assistant_id": GENERAL_ASSISTANT_ID},
)
UPDATE persona
SET deleted = false,
is_visible = true,
is_default_persona = true
WHERE id = :general_assistant_id
"""
),
{"general_assistant_id": GENERAL_ASSISTANT_ID},
)
# Step 3: Restore Art assistant (ID -3)
conn.execute(
sa.text(
"""
UPDATE persona
SET deleted = false,
is_visible = true,
is_default_persona = true
WHERE id = :art_assistant_id
# Step 3: Restore Art assistant (ID -3)
conn.execute(
sa.text(
"""
),
{"art_assistant_id": ART_ASSISTANT_ID},
)
UPDATE persona
SET deleted = false,
is_visible = true,
is_default_persona = true
WHERE id = :art_assistant_id
"""
),
{"art_assistant_id": ART_ASSISTANT_ID},
)
# Note: We don't restore the original tool associations, names, or descriptions
# as those would require more complex logic to determine original state.
# We also cannot restore original chat session persona_ids as we don't
# have the original mappings.
# Other builtin assistants remain deleted as per the requirement.
# Commit transaction
conn.execute(sa.text("COMMIT"))
except Exception as e:
# Rollback on error
conn.execute(sa.text("ROLLBACK"))
raise e
# Note: We don't restore the original tool associations, names, or descriptions
# as those would require more complex logic to determine original state.
# We also cannot restore original chat session persona_ids as we don't
# have the original mappings.
# Other builtin assistants remain deleted as per the requirement.
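
To make Step 5 of the upgrade concrete, here is a small worked example of the preference-list surgery. The IDs are hypothetical; at migration time the builtin IDs come from the persona table:

```python
# Hypothetical IDs for illustration only.
builtin_assistant_ids = [-1, -3]

# chosen/visible/pinned lists: builtin assistants are filtered out.
chosen_before = [7, -1, 12]
chosen_after = [a for a in chosen_before if a not in builtin_assistant_ids]
assert chosen_after == [7, 12]

# hidden_assistants: every builtin assistant is appended if not already present.
hidden_before = [-1]
hidden_after = list(hidden_before)
for old_id in builtin_assistant_ids:
    if old_id not in hidden_after:
        hidden_after.append(old_id)
assert hidden_after == [-1, -3]
```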


@@ -0,0 +1,47 @@
"""add_search_query_table
Revision ID: 73e9983e5091
Revises: d1b637d7050a
Create Date: 2026-01-14 14:16:52.837489
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "73e9983e5091"
down_revision = "d1b637d7050a"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.create_table(
"search_query",
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
sa.Column(
"user_id",
postgresql.UUID(as_uuid=True),
sa.ForeignKey("user.id"),
nullable=False,
),
sa.Column("query", sa.String(), nullable=False),
sa.Column("query_expansions", postgresql.ARRAY(sa.String()), nullable=True),
sa.Column(
"created_at",
sa.DateTime(timezone=True),
nullable=False,
server_default=sa.func.now(),
),
)
op.create_index("ix_search_query_user_id", "search_query", ["user_id"])
op.create_index("ix_search_query_created_at", "search_query", ["created_at"])
def downgrade() -> None:
op.drop_index("ix_search_query_created_at", table_name="search_query")
op.drop_index("ix_search_query_user_id", table_name="search_query")
op.drop_table("search_query")


@@ -10,8 +10,7 @@ from alembic import op
import sqlalchemy as sa
from onyx.db.models import IndexModelStatus
-from onyx.context.search.enums import RecencyBiasSetting
-from onyx.context.search.enums import SearchType
+from onyx.context.search.enums import RecencyBiasSetting, SearchType
# revision identifiers, used by Alembic.
revision = "776b3bbe9092"


@@ -0,0 +1,49 @@
"""notifications constraint, sort index, and cleanup old notifications
Revision ID: 8405ca81cc83
Revises: a3c1a7904cd0
Create Date: 2026-01-07 16:43:44.855156
"""
from alembic import op
# revision identifiers, used by Alembic.
revision = "8405ca81cc83"
down_revision = "a3c1a7904cd0"
branch_labels = None
depends_on = None
def upgrade() -> None:
# Create unique index for notification deduplication.
# This enables atomic ON CONFLICT DO NOTHING inserts in batch_create_notifications.
#
# Uses COALESCE to handle NULL additional_data (NULLs are normally distinct
# in unique constraints, but we want NULL == NULL for deduplication).
# The '{}' represents an empty JSONB object as the NULL replacement.
# Clean up legacy notifications first
op.execute("DELETE FROM notification WHERE title = 'New Notification'")
op.execute(
"""
CREATE UNIQUE INDEX IF NOT EXISTS ix_notification_user_type_data
ON notification (user_id, notif_type, COALESCE(additional_data, '{}'::jsonb))
"""
)
# Create index for efficient notification sorting by user
# Covers: WHERE user_id = ? ORDER BY dismissed, first_shown DESC
op.execute(
"""
CREATE INDEX IF NOT EXISTS ix_notification_user_sort
ON notification (user_id, dismissed, first_shown DESC)
"""
)
def downgrade() -> None:
op.execute("DROP INDEX IF EXISTS ix_notification_user_type_data")
op.execute("DROP INDEX IF EXISTS ix_notification_user_sort")


@@ -0,0 +1,116 @@
"""Add Discord bot tables
Revision ID: 8b5ce697290e
Revises: a1b2c3d4e5f7
Create Date: 2025-01-14
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "8b5ce697290e"
down_revision = "a1b2c3d4e5f7"
branch_labels: None = None
depends_on: None = None
def upgrade() -> None:
# DiscordBotConfig (singleton table - one per tenant)
op.create_table(
"discord_bot_config",
sa.Column(
"id",
sa.String(),
primary_key=True,
server_default=sa.text("'SINGLETON'"),
),
sa.Column("bot_token", sa.LargeBinary(), nullable=False), # EncryptedString
sa.Column(
"created_at",
sa.DateTime(timezone=True),
server_default=sa.func.now(),
nullable=False,
),
sa.CheckConstraint("id = 'SINGLETON'", name="ck_discord_bot_config_singleton"),
)
# DiscordGuildConfig
op.create_table(
"discord_guild_config",
sa.Column("id", sa.Integer(), primary_key=True),
sa.Column("guild_id", sa.BigInteger(), nullable=True, unique=True),
sa.Column("guild_name", sa.String(), nullable=True),
sa.Column("registration_key", sa.String(), nullable=False, unique=True),
sa.Column("registered_at", sa.DateTime(timezone=True), nullable=True),
sa.Column(
"default_persona_id",
sa.Integer(),
sa.ForeignKey("persona.id", ondelete="SET NULL"),
nullable=True,
),
sa.Column(
"enabled", sa.Boolean(), server_default=sa.text("true"), nullable=False
),
)
# DiscordChannelConfig
op.create_table(
"discord_channel_config",
sa.Column("id", sa.Integer(), primary_key=True),
sa.Column(
"guild_config_id",
sa.Integer(),
sa.ForeignKey("discord_guild_config.id", ondelete="CASCADE"),
nullable=False,
),
sa.Column("channel_id", sa.BigInteger(), nullable=False),
sa.Column("channel_name", sa.String(), nullable=False),
sa.Column(
"channel_type",
sa.String(20),
server_default=sa.text("'text'"),
nullable=False,
),
sa.Column(
"is_private",
sa.Boolean(),
server_default=sa.text("false"),
nullable=False,
),
sa.Column(
"thread_only_mode",
sa.Boolean(),
server_default=sa.text("false"),
nullable=False,
),
sa.Column(
"require_bot_invocation",
sa.Boolean(),
server_default=sa.text("true"),
nullable=False,
),
sa.Column(
"persona_override_id",
sa.Integer(),
sa.ForeignKey("persona.id", ondelete="SET NULL"),
nullable=True,
),
sa.Column(
"enabled", sa.Boolean(), server_default=sa.text("false"), nullable=False
),
)
# Unique constraint: one config per channel per guild
op.create_unique_constraint(
"uq_discord_channel_guild_channel",
"discord_channel_config",
["guild_config_id", "channel_id"],
)
def downgrade() -> None:
op.drop_table("discord_channel_config")
op.drop_table("discord_guild_config")
op.drop_table("discord_bot_config")


@@ -42,20 +42,13 @@ TOOL_DESCRIPTIONS = {
def upgrade() -> None:
conn = op.get_bind()
conn.execute(sa.text("BEGIN"))
try:
for tool_id, description in TOOL_DESCRIPTIONS.items():
conn.execute(
sa.text(
"UPDATE tool SET description = :description WHERE in_code_tool_id = :tool_id"
),
{"description": description, "tool_id": tool_id},
)
conn.execute(sa.text("COMMIT"))
except Exception as e:
conn.execute(sa.text("ROLLBACK"))
raise e
for tool_id, description in TOOL_DESCRIPTIONS.items():
conn.execute(
sa.text(
"UPDATE tool SET description = :description WHERE in_code_tool_id = :tool_id"
),
{"description": description, "tool_id": tool_id},
)
def downgrade() -> None:


@@ -0,0 +1,47 @@
"""drop agent_search_metrics table
Revision ID: a1b2c3d4e5f7
Revises: 73e9983e5091
Create Date: 2026-01-17
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "a1b2c3d4e5f7"
down_revision = "73e9983e5091"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.drop_table("agent__search_metrics")
def downgrade() -> None:
op.create_table(
"agent__search_metrics",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("user_id", sa.UUID(), nullable=True),
sa.Column("persona_id", sa.Integer(), nullable=True),
sa.Column("agent_type", sa.String(), nullable=False),
sa.Column("start_time", sa.DateTime(timezone=True), nullable=False),
sa.Column("base_duration_s", sa.Float(), nullable=False),
sa.Column("full_duration_s", sa.Float(), nullable=False),
sa.Column("base_metrics", postgresql.JSONB(), nullable=True),
sa.Column("refined_metrics", postgresql.JSONB(), nullable=True),
sa.Column("all_metrics", postgresql.JSONB(), nullable=True),
sa.ForeignKeyConstraint(
["user_id"],
["user.id"],
ondelete="CASCADE",
),
sa.ForeignKeyConstraint(
["persona_id"],
["persona.id"],
),
sa.PrimaryKeyConstraint("id"),
)


@@ -7,7 +7,6 @@ Create Date: 2025-12-18 16:00:00.000000
"""
from alembic import op
-from onyx.deep_research.dr_mock_tools import RESEARCH_AGENT_DB_NAME
import sqlalchemy as sa
@@ -19,7 +18,7 @@ depends_on = None
DEEP_RESEARCH_TOOL = {
"name": RESEARCH_AGENT_DB_NAME,
"name": "ResearchAgent",
"display_name": "Research Agent",
"description": "The Research Agent is a sub-agent that conducts research on a specific topic.",
"in_code_tool_id": "ResearchAgent",


@@ -70,80 +70,66 @@ BUILT_IN_TOOLS = [
def upgrade() -> None:
conn = op.get_bind()
# Start transaction
conn.execute(sa.text("BEGIN"))
# Get existing tools to check what already exists
existing_tools = conn.execute(
sa.text("SELECT in_code_tool_id FROM tool WHERE in_code_tool_id IS NOT NULL")
).fetchall()
existing_tool_ids = {row[0] for row in existing_tools}
try:
# Get existing tools to check what already exists
existing_tools = conn.execute(
sa.text(
"SELECT in_code_tool_id FROM tool WHERE in_code_tool_id IS NOT NULL"
# Insert or update built-in tools
for tool in BUILT_IN_TOOLS:
in_code_id = tool["in_code_tool_id"]
# Handle historical rename: InternetSearchTool -> WebSearchTool
if (
in_code_id == "WebSearchTool"
and "WebSearchTool" not in existing_tool_ids
and "InternetSearchTool" in existing_tool_ids
):
# Rename the existing InternetSearchTool row in place and update fields
conn.execute(
sa.text(
"""
UPDATE tool
SET name = :name,
display_name = :display_name,
description = :description,
in_code_tool_id = :in_code_tool_id
WHERE in_code_tool_id = 'InternetSearchTool'
"""
),
tool,
)
).fetchall()
existing_tool_ids = {row[0] for row in existing_tools}
# Keep the local view of existing ids in sync to avoid duplicate insert
existing_tool_ids.discard("InternetSearchTool")
existing_tool_ids.add("WebSearchTool")
continue
# Insert or update built-in tools
for tool in BUILT_IN_TOOLS:
in_code_id = tool["in_code_tool_id"]
# Handle historical rename: InternetSearchTool -> WebSearchTool
if (
in_code_id == "WebSearchTool"
and "WebSearchTool" not in existing_tool_ids
and "InternetSearchTool" in existing_tool_ids
):
# Rename the existing InternetSearchTool row in place and update fields
conn.execute(
sa.text(
"""
UPDATE tool
SET name = :name,
display_name = :display_name,
description = :description,
in_code_tool_id = :in_code_tool_id
WHERE in_code_tool_id = 'InternetSearchTool'
"""
),
tool,
)
# Keep the local view of existing ids in sync to avoid duplicate insert
existing_tool_ids.discard("InternetSearchTool")
existing_tool_ids.add("WebSearchTool")
continue
if in_code_id in existing_tool_ids:
# Update existing tool
conn.execute(
sa.text(
"""
UPDATE tool
SET name = :name,
display_name = :display_name,
description = :description
WHERE in_code_tool_id = :in_code_tool_id
"""
),
tool,
)
else:
# Insert new tool
conn.execute(
sa.text(
"""
INSERT INTO tool (name, display_name, description, in_code_tool_id)
VALUES (:name, :display_name, :description, :in_code_tool_id)
"""
),
tool,
)
# Commit transaction
conn.execute(sa.text("COMMIT"))
except Exception as e:
# Rollback on error
conn.execute(sa.text("ROLLBACK"))
raise e
if in_code_id in existing_tool_ids:
# Update existing tool
conn.execute(
sa.text(
"""
UPDATE tool
SET name = :name,
display_name = :display_name,
description = :description
WHERE in_code_tool_id = :in_code_tool_id
"""
),
tool,
)
else:
# Insert new tool
conn.execute(
sa.text(
"""
INSERT INTO tool (name, display_name, description, in_code_tool_id)
VALUES (:name, :display_name, :description, :in_code_tool_id)
"""
),
tool,
)
def downgrade() -> None:


@@ -0,0 +1,64 @@
"""sync_exa_api_key_to_content_provider
Revision ID: d1b637d7050a
Revises: d25168c2beee
Create Date: 2026-01-09 15:54:15.646249
"""
from alembic import op
from sqlalchemy import text
# revision identifiers, used by Alembic.
revision = "d1b637d7050a"
down_revision = "d25168c2beee"
branch_labels = None
depends_on = None
def upgrade() -> None:
# Exa uses a shared API key between search and content providers.
# For existing Exa search providers with API keys, create the corresponding
# content provider if it doesn't exist yet.
connection = op.get_bind()
# Check if Exa search provider exists with an API key
result = connection.execute(
text(
"""
SELECT api_key FROM internet_search_provider
WHERE provider_type = 'exa' AND api_key IS NOT NULL
LIMIT 1
"""
)
)
row = result.fetchone()
if row:
api_key = row[0]
# Create Exa content provider with the shared key
connection.execute(
text(
"""
INSERT INTO internet_content_provider
(name, provider_type, api_key, is_active)
VALUES ('Exa', 'exa', :api_key, false)
ON CONFLICT (name) DO NOTHING
"""
),
{"api_key": api_key},
)
def downgrade() -> None:
# Remove the Exa content provider that was created by this migration
connection = op.get_bind()
connection.execute(
text(
"""
DELETE FROM internet_content_provider
WHERE provider_type = 'exa'
"""
)
)


@@ -0,0 +1,86 @@
"""tool_name_consistency
Revision ID: d25168c2beee
Revises: 8405ca81cc83
Create Date: 2026-01-11 17:54:40.135777
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "d25168c2beee"
down_revision = "8405ca81cc83"
branch_labels = None
depends_on = None
# Currently the seeded tools have the in_code_tool_id == name
CURRENT_TOOL_NAME_MAPPING = [
"SearchTool",
"WebSearchTool",
"ImageGenerationTool",
"PythonTool",
"OpenURLTool",
"KnowledgeGraphTool",
"ResearchAgent",
]
# Mapping of in_code_tool_id -> name
# These are the expected names that we want in the database
EXPECTED_TOOL_NAME_MAPPING = {
"SearchTool": "internal_search",
"WebSearchTool": "web_search",
"ImageGenerationTool": "generate_image",
"PythonTool": "python",
"OpenURLTool": "open_url",
"KnowledgeGraphTool": "run_kg_search",
"ResearchAgent": "research_agent",
}
def upgrade() -> None:
conn = op.get_bind()
# Mapping of in_code_tool_id to the NAME constant from each tool class
# These match the .name property of each tool implementation
tool_name_mapping = EXPECTED_TOOL_NAME_MAPPING
# Update the name column for each tool based on its in_code_tool_id
for in_code_tool_id, expected_name in tool_name_mapping.items():
conn.execute(
sa.text(
"""
UPDATE tool
SET name = :expected_name
WHERE in_code_tool_id = :in_code_tool_id
"""
),
{
"expected_name": expected_name,
"in_code_tool_id": in_code_tool_id,
},
)
def downgrade() -> None:
conn = op.get_bind()
# Reverse the migration by setting name back to in_code_tool_id
# This matches the original pattern where name was the class name
for in_code_tool_id in CURRENT_TOOL_NAME_MAPPING:
conn.execute(
sa.text(
"""
UPDATE tool
SET name = :current_name
WHERE in_code_tool_id = :in_code_tool_id
"""
),
{
"current_name": in_code_tool_id,
"in_code_tool_id": in_code_tool_id,
},
)


@@ -109,7 +109,6 @@ CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS = float(
STRIPE_SECRET_KEY = os.environ.get("STRIPE_SECRET_KEY")
STRIPE_PRICE_ID = os.environ.get("STRIPE_PRICE")
# JWT Public Key URL
JWT_PUBLIC_KEY_URL: str | None = os.getenv("JWT_PUBLIC_KEY_URL", None)
@@ -129,3 +128,8 @@ MARKETING_POSTHOG_API_KEY = os.environ.get("MARKETING_POSTHOG_API_KEY")
HUBSPOT_TRACKING_URL = os.environ.get("HUBSPOT_TRACKING_URL")
GATED_TENANTS_KEY = "gated_tenants"
+# License enforcement - when True, blocks API access for gated/expired licenses
+LICENSE_ENFORCEMENT_ENABLED = (
+    os.environ.get("LICENSE_ENFORCEMENT_ENABLED", "").lower() == "true"
+)


@@ -3,30 +3,42 @@ from uuid import UUID
from sqlalchemy.orm import Session
from onyx.configs.constants import NotificationType
from onyx.db.models import Persona
from onyx.db.models import Persona__User
from onyx.db.models import Persona__UserGroup
from onyx.db.notification import create_notification
from onyx.server.features.persona.models import PersonaSharedNotificationData
def make_persona_private(
def update_persona_access(
persona_id: int,
creator_user_id: UUID | None,
user_ids: list[UUID] | None,
group_ids: list[int] | None,
db_session: Session,
is_public: bool | None = None,
user_ids: list[UUID] | None = None,
group_ids: list[int] | None = None,
) -> None:
"""NOTE(rkuo): This function batches all updates into a single commit. If we don't
dedupe the inputs, the commit will exception."""
"""Updates the access settings for a persona including public status, user shares,
and group shares.
db_session.query(Persona__User).filter(
Persona__User.persona_id == persona_id
).delete(synchronize_session="fetch")
db_session.query(Persona__UserGroup).filter(
Persona__UserGroup.persona_id == persona_id
).delete(synchronize_session="fetch")
NOTE: This function batches all updates. If we don't dedupe the inputs,
the commit will exception.
NOTE: Callers are responsible for committing."""
if is_public is not None:
persona = db_session.query(Persona).filter(Persona.id == persona_id).first()
if persona:
persona.is_public = is_public
# NOTE: For user-ids and group-ids, `None` means "leave unchanged", `[]` means "clear all shares",
# and a non-empty list means "replace with these shares".
if user_ids is not None:
db_session.query(Persona__User).filter(
Persona__User.persona_id == persona_id
).delete(synchronize_session="fetch")
if user_ids:
user_ids_set = set(user_ids)
for user_id in user_ids_set:
db_session.add(Persona__User(persona_id=persona_id, user_id=user_id))
@@ -41,11 +53,13 @@ def make_persona_private(
).model_dump(),
)
if group_ids:
if group_ids is not None:
db_session.query(Persona__UserGroup).filter(
Persona__UserGroup.persona_id == persona_id
).delete(synchronize_session="fetch")
group_ids_set = set(group_ids)
for group_id in group_ids_set:
db_session.add(
Persona__UserGroup(persona_id=persona_id, user_group_id=group_id)
)
db_session.commit()
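
A minimal caller sketch of the new semantics. The IDs, helper name, and import path are assumptions; the argument behavior comes from the docstring and comments above:

```python
from uuid import UUID

from sqlalchemy.orm import Session

# Import path assumed for illustration:
# from onyx.db.persona_access import update_persona_access

def make_private_and_share(
    db_session: Session, persona_id: int, owner_id: UUID, alice_id: UUID
) -> None:
    update_persona_access(
        persona_id=persona_id,
        creator_user_id=owner_id,
        db_session=db_session,
        is_public=False,      # flip the persona to private
        user_ids=[alice_id],  # replace user shares with exactly this list
        group_ids=None,       # None leaves group shares unchanged; [] would clear them
    )
    db_session.commit()  # the function no longer commits; callers must
```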


@@ -0,0 +1,64 @@
import uuid
from datetime import timedelta
from uuid import UUID
from sqlalchemy import select
from sqlalchemy.orm import Session
from onyx.db.engine.time_utils import get_db_current_time
from onyx.db.models import SearchQuery
def create_search_query(
db_session: Session,
user_id: UUID,
query: str,
query_expansions: list[str] | None = None,
) -> SearchQuery:
"""Create and persist a `SearchQuery` row.
Notes:
- `SearchQuery.id` is a UUID PK without a server-side default, so we generate it.
- `created_at` is filled by the DB (server_default=now()).
"""
search_query = SearchQuery(
id=uuid.uuid4(),
user_id=user_id,
query=query,
query_expansions=query_expansions,
)
db_session.add(search_query)
db_session.commit()
db_session.refresh(search_query)
return search_query
def fetch_search_queries_for_user(
db_session: Session,
user_id: UUID,
filter_days: int | None = None,
limit: int | None = None,
) -> list[SearchQuery]:
"""Fetch `SearchQuery` rows for a user.
Args:
user_id: User UUID.
filter_days: Optional time filter. If provided, only rows created within
the last `filter_days` days are returned.
limit: Optional max number of rows to return.
"""
if filter_days is not None and filter_days <= 0:
raise ValueError("filter_days must be > 0")
stmt = select(SearchQuery).where(SearchQuery.user_id == user_id)
if filter_days is not None and filter_days > 0:
cutoff = get_db_current_time(db_session) - timedelta(days=filter_days)
stmt = stmt.where(SearchQuery.created_at >= cutoff)
stmt = stmt.order_by(SearchQuery.created_at.desc())
if limit is not None:
stmt = stmt.limit(limit)
return list(db_session.scalars(stmt).all())
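
A short usage sketch of the two helpers above, assuming a live `Session` and a user UUID are available (the import path matches the one used by search_backend later in this diff):

```python
from uuid import UUID

from sqlalchemy.orm import Session

from ee.onyx.db.search import create_search_query
from ee.onyx.db.search import fetch_search_queries_for_user

def record_and_list_searches(db_session: Session, user_id: UUID) -> None:
    # Persist one search; expansions are optional.
    create_search_query(
        db_session=db_session,
        user_id=user_id,
        query="procurement process",
        query_expansions=["procurement policy", "purchasing process"],
    )
    # Up to 50 of this user's searches from the last 30 days, newest first.
    recent = fetch_search_queries_for_user(
        db_session=db_session,
        user_id=user_id,
        filter_days=30,
        limit=50,
    )
    for row in recent:
        print(row.created_at, row.query)
```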


@@ -16,16 +16,17 @@ from ee.onyx.server.enterprise_settings.api import (
from ee.onyx.server.evals.api import router as evals_router
from ee.onyx.server.license.api import router as license_router
from ee.onyx.server.manage.standard_answer import router as standard_answer_router
+from ee.onyx.server.middleware.license_enforcement import (
+    add_license_enforcement_middleware,
+)
from ee.onyx.server.middleware.tenant_tracking import (
add_api_server_tenant_id_middleware,
)
from ee.onyx.server.oauth.api import router as ee_oauth_router
-from ee.onyx.server.query_and_chat.chat_backend import (
-    router as chat_router,
-)
from ee.onyx.server.query_and_chat.query_backend import (
basic_router as ee_query_router,
)
+from ee.onyx.server.query_and_chat.search_backend import router as search_router
from ee.onyx.server.query_history.api import router as query_history_router
from ee.onyx.server.reporting.usage_export_api import router as usage_export_router
from ee.onyx.server.seeding import seed_db
@@ -85,6 +86,10 @@ def get_application() -> FastAPI:
if MULTI_TENANT:
add_api_server_tenant_id_middleware(application, logger)
+# Add license enforcement middleware (runs after tenant tracking)
+# This blocks access when license is expired/gated
+add_license_enforcement_middleware(application, logger)
if AUTH_TYPE == AuthType.CLOUD:
# For Google OAuth, refresh tokens are requested by:
# 1. Adding the right scopes
@@ -124,7 +129,7 @@ def get_application() -> FastAPI:
# EE only backend APIs
include_router_with_global_prefix_prepended(application, query_router)
include_router_with_global_prefix_prepended(application, ee_query_router)
-include_router_with_global_prefix_prepended(application, chat_router)
+include_router_with_global_prefix_prepended(application, search_router)
include_router_with_global_prefix_prepended(application, standard_answer_router)
include_router_with_global_prefix_prepended(application, ee_oauth_router)
include_router_with_global_prefix_prepended(application, ee_document_cc_pair_router)


@@ -0,0 +1,27 @@
# Single message is likely most reliable and generally better for this task
# No final reminders at the end since the user query is expected to be short
# If it is not short, it should go into the chat flow so we do not need to account for this.
KEYWORD_EXPANSION_PROMPT = """
Generate a set of keyword-only queries to help find relevant documents for the provided query. \
These queries will be passed to a bm25-based keyword search engine. \
Provide a single query per line (where each query consists of one or more keywords). \
The queries must be purely keywords and not contain any filler natural language. \
Each query should have as few keywords as necessary to represent the user's search intent. \
If there are no useful expansions, simply return the original query with no additional keyword queries. \
CRITICAL: Do not include any additional formatting, comments, or anything aside from the keyword queries.
The user query is:
{user_query}
""".strip()
QUERY_TYPE_PROMPT = """
Determine if the provided query is better suited for a keyword search or a semantic search.
Respond with "keyword" or "semantic" literally and nothing else.
Do not provide any additional text or reasoning to your response.
CRITICAL: It must only be 1 single word - EITHER "keyword" or "semantic".
The user query is:
{user_query}
""".strip()


@@ -0,0 +1,42 @@
# ruff: noqa: E501, W605 start
SEARCH_CLASS = "search"
CHAT_CLASS = "chat"
# Note that with many larger LLMs, the latency of running this prompt via third-party APIs is as high as 2 seconds, which is too slow for many
# use cases.
SEARCH_CHAT_PROMPT = f"""
Determine if the following query is better suited for a search UI or a chat UI. Respond with "{SEARCH_CLASS}" or "{CHAT_CLASS}" literally and nothing else. \
Do not provide any additional text or reasoning to your response. CRITICAL, IT MUST ONLY BE 1 SINGLE WORD - EITHER "{SEARCH_CLASS}" or "{CHAT_CLASS}".
# Classification Guidelines:
## {SEARCH_CLASS}
- If the query consists entirely of keywords or the query doesn't require any answer from the AI
- If the query is a short statement that seems like a search query rather than a question
- If the query feels nonsensical or is a short phrase that possibly describes a document or information that could be found in an internal document
### Examples of {SEARCH_CLASS} queries:
- Find me the document that goes over the onboarding process for a new hire
- Pull requests since last week
- Sales Runbook AMEA Region
- Procurement process
- Retrieve the PRD for project X
## {CHAT_CLASS}
- If the query is asking a question that requires an answer rather than a document
- If the query is asking for a solution, suggestion, or general help
- If the query is seeking information that is on the web and likely not in a company internal document
- If the query should be answered without any context from additional documents or searches
### Examples of {CHAT_CLASS} queries:
- What led us to win the deal with company X? (seeking answer)
- Google Drive not sync-ing files to my computer (seeking solution)
- Review my email: <whatever the email is> (general help)
- Write me a script to... (general help)
- Cheap flights Europe to Tokyo (information likely found on the web, not internal)
# User Query:
{{user_query}}
REMEMBER TO ONLY RESPOND WITH "{SEARCH_CLASS}" OR "{CHAT_CLASS}" AND NOTHING ELSE.
""".strip()
# ruff: noqa: E501, W605 end
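
Because the classifier must answer with exactly one word, the consuming code can be correspondingly strict. A sketch; the fallback choice is an assumption, not from the source:

```python
def classify_query_ui(raw_llm_output: str) -> str:
    answer = raw_llm_output.strip().strip('"').lower()
    if answer in (SEARCH_CLASS, CHAT_CLASS):
        return answer
    # Fallback is an assumption: treat anything malformed as chat,
    # the more forgiving of the two UIs.
    return CHAT_CLASS
```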


@@ -0,0 +1,270 @@
from collections.abc import Generator
from sqlalchemy.orm import Session
from ee.onyx.db.search import create_search_query
from ee.onyx.secondary_llm_flows.query_expansion import expand_keywords
from ee.onyx.server.query_and_chat.models import SearchDocWithContent
from ee.onyx.server.query_and_chat.models import SearchFullResponse
from ee.onyx.server.query_and_chat.models import SendSearchQueryRequest
from ee.onyx.server.query_and_chat.streaming_models import LLMSelectedDocsPacket
from ee.onyx.server.query_and_chat.streaming_models import SearchDocsPacket
from ee.onyx.server.query_and_chat.streaming_models import SearchErrorPacket
from ee.onyx.server.query_and_chat.streaming_models import SearchQueriesPacket
from onyx.context.search.models import BaseFilters
from onyx.context.search.models import ChunkSearchRequest
from onyx.context.search.models import InferenceChunk
from onyx.context.search.pipeline import merge_individual_chunks
from onyx.context.search.pipeline import search_pipeline
from onyx.db.models import User
from onyx.document_index.factory import get_current_primary_default_document_index
from onyx.document_index.interfaces import DocumentIndex
from onyx.llm.factory import get_default_llm
from onyx.secondary_llm_flows.document_filter import select_sections_for_expansion
from onyx.tools.tool_implementations.search.search_utils import (
weighted_reciprocal_rank_fusion,
)
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
logger = setup_logger()
# This is just a heuristic that also happens to work well for the UI/UX.
# Users would not find it useful to see a huge list of suggested docs,
# but more than 1 is also likely good, as many questions may target more than 1 doc.
TARGET_NUM_SECTIONS_FOR_LLM_SELECTION = 3
def _run_single_search(
query: str,
filters: BaseFilters | None,
document_index: DocumentIndex,
user: User | None,
db_session: Session,
) -> list[InferenceChunk]:
"""Execute a single search query and return chunks."""
chunk_search_request = ChunkSearchRequest(
query=query,
user_selected_filters=filters,
)
return search_pipeline(
chunk_search_request=chunk_search_request,
document_index=document_index,
user=user,
persona=None, # No persona for direct search
db_session=db_session,
)
def stream_search_query(
request: SendSearchQueryRequest,
user: User | None,
db_session: Session,
) -> Generator[
SearchQueriesPacket | SearchDocsPacket | LLMSelectedDocsPacket | SearchErrorPacket,
None,
None,
]:
"""
Core search function that yields streaming packets.
Used by both streaming and non-streaming endpoints.
"""
# Get document index
document_index = get_current_primary_default_document_index(db_session)
# Determine queries to execute
original_query = request.search_query
keyword_expansions: list[str] = []
if request.run_query_expansion:
try:
llm = get_default_llm()
keyword_expansions = expand_keywords(
user_query=original_query,
llm=llm,
)
if keyword_expansions:
logger.debug(
f"Query expansion generated {len(keyword_expansions)} keyword queries"
)
except Exception as e:
logger.warning(f"Query expansion failed: {e}; using original query only.")
keyword_expansions = []
# Build list of all executed queries for tracking
all_executed_queries = [original_query] + keyword_expansions
# TODO remove this check, user should not be None
if user is not None:
create_search_query(
db_session=db_session,
user_id=user.id,
query=request.search_query,
query_expansions=keyword_expansions if keyword_expansions else None,
)
# Execute search(es)
if not keyword_expansions:
# Single query (original only) - no threading needed
chunks = _run_single_search(
query=original_query,
filters=request.filters,
document_index=document_index,
user=user,
db_session=db_session,
)
else:
# Multiple queries - run in parallel and merge with RRF
# First query is the original (semantic), rest are keyword expansions
search_functions = [
(
_run_single_search,
(query, request.filters, document_index, user, db_session),
)
for query in all_executed_queries
]
# Run all searches in parallel
all_search_results: list[list[InferenceChunk]] = (
run_functions_tuples_in_parallel(
search_functions,
allow_failures=True,
)
)
# Separate original query results from keyword expansion results
# Note that in rare cases, the original query may have failed, in which case we
# are just overweighting one set of keyword results; this should not be a big deal though.
original_result = all_search_results[0] if all_search_results else []
keyword_results = all_search_results[1:] if len(all_search_results) > 1 else []
# Build valid results and weights
# Original query (semantic): weight 2.0
# Keyword expansions: weight 1.0 each
valid_results: list[list[InferenceChunk]] = []
weights: list[float] = []
if original_result:
valid_results.append(original_result)
weights.append(2.0)
for keyword_result in keyword_results:
if keyword_result:
valid_results.append(keyword_result)
weights.append(1.0)
if not valid_results:
logger.warning("All parallel searches returned empty results")
chunks = []
else:
chunks = weighted_reciprocal_rank_fusion(
ranked_results=valid_results,
weights=weights,
id_extractor=lambda chunk: f"{chunk.document_id}_{chunk.chunk_id}",
)
# Merge chunks into sections
sections = merge_individual_chunks(chunks)
# Apply LLM document selection if requested
# num_docs_fed_to_llm_selection specifies how many sections to feed to the LLM for selection
# The LLM will always try to select TARGET_NUM_SECTIONS_FOR_LLM_SELECTION sections from those fed to it
# llm_selected_doc_ids will be:
# - None if LLM selection was not requested or failed
# - Empty list if LLM selection ran but selected nothing
# - List of doc IDs if LLM selection succeeded
run_llm_selection = (
request.num_docs_fed_to_llm_selection is not None
and request.num_docs_fed_to_llm_selection >= 1
)
llm_selected_doc_ids: list[str] | None = None
llm_selection_failed = False
if run_llm_selection and sections:
try:
llm = get_default_llm()
sections_to_evaluate = sections[: request.num_docs_fed_to_llm_selection]
selected_sections, _ = select_sections_for_expansion(
sections=sections_to_evaluate,
user_query=original_query,
llm=llm,
max_sections=TARGET_NUM_SECTIONS_FOR_LLM_SELECTION,
try_to_fill_to_max=True,
)
# Extract unique document IDs from selected sections (may be empty)
llm_selected_doc_ids = list(
dict.fromkeys(
section.center_chunk.document_id for section in selected_sections
)
)
logger.debug(
f"LLM document selection evaluated {len(sections_to_evaluate)} sections, "
f"selected {len(selected_sections)} sections with doc IDs: {llm_selected_doc_ids}"
)
except Exception as e:
# Allowing a blanket exception here as this step is not critical and the rest of the results are still valid
logger.warning(f"LLM document selection failed: {e}")
llm_selection_failed = True
elif run_llm_selection and not sections:
# LLM selection requested but no sections to evaluate
llm_selected_doc_ids = []
# Convert to SearchDocWithContent list, optionally including content
search_docs = SearchDocWithContent.from_inference_sections(
sections,
include_content=request.include_content,
is_internet=False,
)
# Yield queries packet
yield SearchQueriesPacket(all_executed_queries=all_executed_queries)
# Yield docs packet
yield SearchDocsPacket(search_docs=search_docs)
# Yield LLM selected docs packet if LLM selection was requested
# - llm_selected_doc_ids is None if selection failed
# - llm_selected_doc_ids is empty list if no docs were selected
# - llm_selected_doc_ids is list of IDs if docs were selected
if run_llm_selection:
yield LLMSelectedDocsPacket(
llm_selected_doc_ids=None if llm_selection_failed else llm_selected_doc_ids
)
def gather_search_stream(
packets: Generator[
SearchQueriesPacket
| SearchDocsPacket
| LLMSelectedDocsPacket
| SearchErrorPacket,
None,
None,
],
) -> SearchFullResponse:
"""
Aggregate all streaming packets into SearchFullResponse.
"""
all_executed_queries: list[str] = []
search_docs: list[SearchDocWithContent] = []
llm_selected_doc_ids: list[str] | None = None
error: str | None = None
for packet in packets:
if isinstance(packet, SearchQueriesPacket):
all_executed_queries = packet.all_executed_queries
elif isinstance(packet, SearchDocsPacket):
search_docs = packet.search_docs
elif isinstance(packet, LLMSelectedDocsPacket):
llm_selected_doc_ids = packet.llm_selected_doc_ids
elif isinstance(packet, SearchErrorPacket):
error = packet.error
return SearchFullResponse(
all_executed_queries=all_executed_queries,
search_docs=search_docs,
doc_selection_reasoning=None,
llm_selected_doc_ids=llm_selected_doc_ids,
error=error,
)
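The weighted_reciprocal_rank_fusion helper imported from onyx.tools.tool_implementations.search.search_utils is not shown in this diff. The sketch below is a minimal stand-in, assuming the standard RRF formula (score = sum of weight / (k + rank) per list, with the conventional k = 60), to make the 2.0/1.0 weighting above concrete; the real helper may differ in details.

from collections import defaultdict
from typing import Callable, TypeVar

T = TypeVar("T")

def weighted_rrf_sketch(
    ranked_results: list[list[T]],
    weights: list[float],
    id_extractor: Callable[[T], str],
    k: int = 60,
) -> list[T]:
    scores: dict[str, float] = defaultdict(float)
    first_seen: dict[str, T] = {}
    for result_list, weight in zip(ranked_results, weights):
        for rank, item in enumerate(result_list):
            item_id = id_extractor(item)
            # Items ranked higher (smaller rank) contribute more; weight scales the list
            scores[item_id] += weight / (k + rank + 1)
            first_seen.setdefault(item_id, item)
    ordered_ids = sorted(scores, key=lambda i: scores[i], reverse=True)
    return [first_seen[i] for i in ordered_ids]

# Toy usage with string "chunks": the semantic list gets weight 2.0,
# the keyword expansion weight 1.0, matching the code above.
fused = weighted_rrf_sketch(
    ranked_results=[["a", "b", "c"], ["b", "d"]],
    weights=[2.0, 1.0],
    id_extractor=lambda s: s,
)
print(fused)  # ['b', 'a', 'c', 'd'] - "b" benefits from appearing in both lists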

View File

@@ -0,0 +1,92 @@
import re
from ee.onyx.prompts.query_expansion import KEYWORD_EXPANSION_PROMPT
from onyx.llm.interfaces import LLM
from onyx.llm.models import LanguageModelInput
from onyx.llm.models import ReasoningEffort
from onyx.llm.models import UserMessage
from onyx.llm.utils import llm_response_to_string
from onyx.utils.logger import setup_logger
logger = setup_logger()
# Pattern to remove common LLM artifacts: brackets, quotes, list markers, etc.
CLEANUP_PATTERN = re.compile(r'[\[\]"\'`]')
def _clean_keyword_line(line: str) -> str:
"""Clean a keyword line by removing common LLM artifacts.
Removes brackets, quotes, and other characters that LLMs may accidentally
include in their output.
"""
# Remove common artifacts
cleaned = CLEANUP_PATTERN.sub("", line)
# Remove leading list markers like "1.", "2.", "-", "*"
cleaned = re.sub(r"^\s*(?:\d+[\.\)]\s*|[-*]\s*)", "", cleaned)
return cleaned.strip()
def expand_keywords(
user_query: str,
llm: LLM,
) -> list[str]:
"""Expand a user query into multiple keyword-only queries for BM25 search.
Uses an LLM to generate keyword-based search queries that capture different
aspects of the user's search intent. Returns only the expanded queries,
not the original query.
Args:
user_query: The original search query from the user
llm: Language model to use for keyword expansion
Returns:
List of expanded keyword queries (excluding the original query).
Returns empty list if expansion fails or produces no useful expansions.
"""
messages: LanguageModelInput = [
UserMessage(content=KEYWORD_EXPANSION_PROMPT.format(user_query=user_query))
]
try:
response = llm.invoke(
prompt=messages,
reasoning_effort=ReasoningEffort.OFF,
# Limit output - we only expect a few short keyword queries
max_tokens=150,
)
content = llm_response_to_string(response).strip()
if not content:
logger.warning("Keyword expansion returned empty response.")
return []
# Parse response - each line is a separate keyword query
# Clean each line to remove LLM artifacts and drop empty lines
parsed_queries = []
for line in content.strip().split("\n"):
cleaned = _clean_keyword_line(line)
if cleaned:
parsed_queries.append(cleaned)
if not parsed_queries:
logger.warning("Keyword expansion parsing returned no queries.")
return []
# Filter out duplicates and queries that match the original
expanded_queries: list[str] = []
seen_lower: set[str] = {user_query.lower()}
for query in parsed_queries:
query_lower = query.lower()
if query_lower not in seen_lower:
seen_lower.add(query_lower)
expanded_queries.append(query)
logger.debug(f"Keyword expansion generated {len(expanded_queries)} queries")
return expanded_queries
except Exception as e:
logger.warning(f"Keyword expansion failed: {e}")
return []
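For illustration, here is the cleanup and parsing logic applied to a hypothetical raw LLM response (the sample text is invented; the helper mirrors _clean_keyword_line above):

import re

CLEANUP_PATTERN = re.compile(r'[\[\]"\'`]')

def clean_keyword_line(line: str) -> str:
    # Mirrors _clean_keyword_line above: strip artifacts, then list markers
    cleaned = CLEANUP_PATTERN.sub("", line)
    cleaned = re.sub(r"^\s*(?:\d+[\.\)]\s*|[-*]\s*)", "", cleaned)
    return cleaned.strip()

raw = '1. "onboarding checklist"\n- [new hire process]\n2. onboarding checklist'
print([clean_keyword_line(l) for l in raw.split("\n") if clean_keyword_line(l)])
# -> ['onboarding checklist', 'new hire process', 'onboarding checklist']
# (the case-insensitive de-dup in expand_keywords then drops the repeat)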

View File

@@ -0,0 +1,50 @@
from ee.onyx.prompts.search_flow_classification import CHAT_CLASS
from ee.onyx.prompts.search_flow_classification import SEARCH_CHAT_PROMPT
from ee.onyx.prompts.search_flow_classification import SEARCH_CLASS
from onyx.llm.interfaces import LLM
from onyx.llm.models import LanguageModelInput
from onyx.llm.models import ReasoningEffort
from onyx.llm.models import UserMessage
from onyx.llm.utils import llm_response_to_string
from onyx.utils.logger import setup_logger
from onyx.utils.timing import log_function_time
logger = setup_logger()
@log_function_time(print_only=True)
def classify_is_search_flow(
query: str,
llm: LLM,
) -> bool:
messages: LanguageModelInput = [
UserMessage(content=SEARCH_CHAT_PROMPT.format(user_query=query))
]
response = llm.invoke(
prompt=messages,
reasoning_effort=ReasoningEffort.OFF,
# Nothing can happen in the UI until this call finishes so we need to be aggressive with the timeout
timeout_override=2,
# Well more than necessary, but this ensures completion in case the model classifies
# correctly and then ends up rambling
max_tokens=20,
)
content = llm_response_to_string(response).strip().lower()
if not content:
logger.warning(
"Search flow classification returned empty response; defaulting to chat flow."
)
return False
# Prefer chat if both appear.
if CHAT_CLASS in content:
return False
if SEARCH_CLASS in content:
return True
logger.warning(
"Search flow classification returned unexpected response; defaulting to chat flow. Response=%r",
content,
)
return False
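A toy restatement of the tie-breaking rules above, assuming the class constants render to lowercase tokens like "search" and "chat" (the real values live in ee.onyx.prompts.search_flow_classification):

def decide(content: str, search_class: str = "search", chat_class: str = "chat") -> bool:
    content = content.strip().lower()
    if not content:
        return False  # empty response defaults to chat flow
    if chat_class in content:
        return False  # chat wins when both tokens appear
    if search_class in content:
        return True
    return False  # unexpected output also defaults to chat

assert decide("SEARCH") is True
assert decide("chat or search, hard to say") is False  # chat preferred
assert decide("") is False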

View File

@@ -19,9 +19,9 @@ from ee.onyx.db.analytics import fetch_query_analytics
from ee.onyx.db.analytics import user_can_view_assistant_stats
from onyx.auth.users import current_admin_user
from onyx.auth.users import current_user
from onyx.configs.constants import PUBLIC_API_TAGS
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.server.utils import PUBLIC_API_TAGS
router = APIRouter(prefix="/analytics", tags=PUBLIC_API_TAGS)

View File

@@ -0,0 +1,102 @@
"""Middleware to enforce license status application-wide."""
import logging
from collections.abc import Awaitable
from collections.abc import Callable
from fastapi import FastAPI
from fastapi import Request
from fastapi import Response
from fastapi.responses import JSONResponse
from redis.exceptions import RedisError
from ee.onyx.configs.app_configs import LICENSE_ENFORCEMENT_ENABLED
from ee.onyx.db.license import get_cached_license_metadata
from ee.onyx.server.tenants.product_gating import is_tenant_gated
from onyx.server.settings.models import ApplicationStatus
from shared_configs.configs import MULTI_TENANT
from shared_configs.contextvars import get_current_tenant_id
# Paths that are ALWAYS accessible, even when license is expired/gated.
# These enable users to:
# /auth - Log in/out (users can't fix billing if locked out of auth)
# /license - Fetch, upload, or check license status
# /health - Health checks for load balancers/orchestrators
# /me - Basic user info needed for UI rendering
# /settings, /enterprise-settings - View app status and branding
# /tenants/billing-* - Manage subscription to resolve gating
ALLOWED_PATH_PREFIXES = {
"/auth",
"/license",
"/health",
"/me",
"/settings",
"/enterprise-settings",
"/tenants/billing-information",
"/tenants/create-customer-portal-session",
"/tenants/create-subscription-session",
}
def _is_path_allowed(path: str) -> bool:
"""Check if path is in allowlist (prefix match)."""
return any(path.startswith(prefix) for prefix in ALLOWED_PATH_PREFIXES)
def add_license_enforcement_middleware(
app: FastAPI, logger: logging.LoggerAdapter
) -> None:
logger.info("License enforcement middleware registered")
@app.middleware("http")
async def enforce_license(
request: Request, call_next: Callable[[Request], Awaitable[Response]]
) -> Response:
"""Block requests when license is expired/gated."""
if not LICENSE_ENFORCEMENT_ENABLED:
return await call_next(request)
path = request.url.path
if path.startswith("/api"):
path = path[4:]
if _is_path_allowed(path):
return await call_next(request)
is_gated = False
tenant_id = get_current_tenant_id()
if MULTI_TENANT:
try:
is_gated = is_tenant_gated(tenant_id)
except RedisError as e:
logger.warning(f"Failed to check tenant gating status: {e}")
# Fail open - don't block users due to Redis connectivity issues
is_gated = False
else:
try:
metadata = get_cached_license_metadata(tenant_id)
if metadata:
if metadata.status == ApplicationStatus.GATED_ACCESS:
is_gated = True
else:
# No license metadata = gated for self-hosted EE
is_gated = True
except RedisError as e:
logger.warning(f"Failed to check license metadata: {e}")
# Fail open - don't block users due to Redis connectivity issues
is_gated = False
if is_gated:
logger.info(f"Blocking request for gated tenant: {tenant_id}, path={path}")
return JSONResponse(
status_code=402,
content={
"detail": {
"error": "license_expired",
"message": "Your subscription has expired. Please update your billing.",
}
},
)
return await call_next(request)
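A quick illustration of the path handling above: the optional /api segment is stripped before the allowlist prefix match, so both prefixed and unprefixed forms pass. The allowlist here is truncated for brevity.

ALLOWED = {"/auth", "/license", "/health"}

def is_allowed(path: str) -> bool:
    if path.startswith("/api"):
        path = path[4:]
    return any(path.startswith(prefix) for prefix in ALLOWED)

assert is_allowed("/api/health")
assert is_allowed("/health")
assert not is_allowed("/chat/send-message")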

View File

@@ -1,214 +0,0 @@
from fastapi import APIRouter
from fastapi import Depends
from fastapi import HTTPException
from sqlalchemy.orm import Session
from ee.onyx.server.query_and_chat.models import BasicCreateChatMessageRequest
from ee.onyx.server.query_and_chat.models import (
BasicCreateChatMessageWithHistoryRequest,
)
from onyx.auth.users import current_user
from onyx.chat.chat_utils import create_chat_history_chain
from onyx.chat.models import ChatBasicResponse
from onyx.chat.process_message import gather_stream
from onyx.chat.process_message import stream_chat_message_objects
from onyx.configs.constants import MessageType
from onyx.context.search.models import OptionalSearchSetting
from onyx.context.search.models import RetrievalDetails
from onyx.db.chat import create_chat_session
from onyx.db.chat import create_new_chat_message
from onyx.db.chat import get_or_create_root_message
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.llm.factory import get_llm_for_persona
from onyx.natural_language_processing.utils import get_tokenizer
from onyx.server.query_and_chat.models import CreateChatMessageRequest
from onyx.utils.logger import setup_logger
logger = setup_logger()
router = APIRouter(prefix="/chat")
@router.post("/send-message-simple-api")
def handle_simplified_chat_message(
chat_message_req: BasicCreateChatMessageRequest,
user: User | None = Depends(current_user),
db_session: Session = Depends(get_session),
) -> ChatBasicResponse:
"""This is a Non-Streaming version that only gives back a minimal set of information"""
logger.notice(f"Received new simple api chat message: {chat_message_req.message}")
if not chat_message_req.message:
raise HTTPException(status_code=400, detail="Empty chat message is invalid")
# Handle chat session creation if chat_session_id is not provided
if chat_message_req.chat_session_id is None:
if chat_message_req.persona_id is None:
raise HTTPException(
status_code=400,
detail="Either chat_session_id or persona_id must be provided",
)
# Create a new chat session with the provided persona_id
try:
new_chat_session = create_chat_session(
db_session=db_session,
description="", # Leave empty for simple API
user_id=user.id if user else None,
persona_id=chat_message_req.persona_id,
)
chat_session_id = new_chat_session.id
except Exception as e:
logger.exception(e)
raise HTTPException(status_code=400, detail="Invalid Persona provided.")
else:
chat_session_id = chat_message_req.chat_session_id
try:
parent_message = create_chat_history_chain(
chat_session_id=chat_session_id, db_session=db_session
)[-1]
except Exception:
parent_message = get_or_create_root_message(
chat_session_id=chat_session_id, db_session=db_session
)
if (
chat_message_req.retrieval_options is None
and chat_message_req.search_doc_ids is None
):
retrieval_options: RetrievalDetails | None = RetrievalDetails(
run_search=OptionalSearchSetting.ALWAYS,
real_time=False,
)
else:
retrieval_options = chat_message_req.retrieval_options
full_chat_msg_info = CreateChatMessageRequest(
chat_session_id=chat_session_id,
parent_message_id=parent_message.id,
message=chat_message_req.message,
file_descriptors=[],
search_doc_ids=chat_message_req.search_doc_ids,
retrieval_options=retrieval_options,
# Simple API does not support reranking, hide complexity from user
rerank_settings=None,
query_override=chat_message_req.query_override,
# Currently only applies to search flow not chat
chunks_above=0,
chunks_below=0,
full_doc=chat_message_req.full_doc,
structured_response_format=chat_message_req.structured_response_format,
)
packets = stream_chat_message_objects(
new_msg_req=full_chat_msg_info,
user=user,
db_session=db_session,
)
return gather_stream(packets)
@router.post("/send-message-simple-with-history")
def handle_send_message_simple_with_history(
req: BasicCreateChatMessageWithHistoryRequest,
user: User | None = Depends(current_user),
db_session: Session = Depends(get_session),
) -> ChatBasicResponse:
"""This is a Non-Streaming version that only gives back a minimal set of information.
takes in chat history maintained by the caller
and does query rephrasing similar to answer-with-quote"""
if len(req.messages) == 0:
raise HTTPException(status_code=400, detail="Messages cannot be zero length")
# This is a sanity check to make sure the chat history is valid
# It must start with a user message and alternate between user and assistant
expected_role = MessageType.USER
for msg in req.messages:
if not msg.message:
raise HTTPException(
status_code=400, detail="One or more chat messages were empty"
)
if msg.role != expected_role:
raise HTTPException(
status_code=400,
detail="Message roles must start and end with MessageType.USER and alternate in-between.",
)
if expected_role == MessageType.USER:
expected_role = MessageType.ASSISTANT
else:
expected_role = MessageType.USER
query = req.messages[-1].message
msg_history = req.messages[:-1]
logger.notice(f"Received new simple with history chat message: {query}")
user_id = user.id if user is not None else None
chat_session = create_chat_session(
db_session=db_session,
description="handle_send_message_simple_with_history",
user_id=user_id,
persona_id=req.persona_id,
)
llm = get_llm_for_persona(persona=chat_session.persona, user=user)
llm_tokenizer = get_tokenizer(
model_name=llm.config.model_name,
provider_type=llm.config.model_provider,
)
# Every chat Session begins with an empty root message
root_message = get_or_create_root_message(
chat_session_id=chat_session.id, db_session=db_session
)
chat_message = root_message
for msg in msg_history:
chat_message = create_new_chat_message(
chat_session_id=chat_session.id,
parent_message=chat_message,
message=msg.message,
token_count=len(llm_tokenizer.encode(msg.message)),
message_type=msg.role,
db_session=db_session,
commit=False,
)
db_session.commit()
if req.retrieval_options is None and req.search_doc_ids is None:
retrieval_options: RetrievalDetails | None = RetrievalDetails(
run_search=OptionalSearchSetting.ALWAYS,
real_time=False,
)
else:
retrieval_options = req.retrieval_options
full_chat_msg_info = CreateChatMessageRequest(
chat_session_id=chat_session.id,
parent_message_id=chat_message.id,
message=query,
file_descriptors=[],
search_doc_ids=req.search_doc_ids,
retrieval_options=retrieval_options,
# Simple API does not support reranking, hide complexity from user
rerank_settings=None,
query_override=None,
chunks_above=0,
chunks_below=0,
full_doc=req.full_doc,
structured_response_format=req.structured_response_format,
)
packets = stream_chat_message_objects(
new_msg_req=full_chat_msg_info,
user=user,
db_session=db_session,
)
return gather_stream(packets)

View File

@@ -1,18 +1,12 @@
from collections import OrderedDict
from typing import Literal
from uuid import UUID
from collections.abc import Sequence
from datetime import datetime
from pydantic import BaseModel
from pydantic import Field
from pydantic import model_validator
from onyx.chat.models import ThreadMessage
from onyx.configs.constants import DocumentSource
from onyx.context.search.models import BaseFilters
from onyx.context.search.models import BasicChunkRequest
from onyx.context.search.models import ChunkContext
from onyx.context.search.models import InferenceChunk
from onyx.context.search.models import RetrievalDetails
from onyx.context.search.models import InferenceSection
from onyx.context.search.models import SearchDoc
from onyx.server.manage.models import StandardAnswer
@@ -25,119 +19,88 @@ class StandardAnswerResponse(BaseModel):
standard_answers: list[StandardAnswer] = Field(default_factory=list)
class DocumentSearchRequest(BasicChunkRequest):
user_selected_filters: BaseFilters | None = None
class SearchFlowClassificationRequest(BaseModel):
user_query: str
class DocumentSearchResponse(BaseModel):
top_documents: list[InferenceChunk]
class SearchFlowClassificationResponse(BaseModel):
is_search_flow: bool
class BasicCreateChatMessageRequest(ChunkContext):
"""If a chat_session_id is not provided, a persona_id must be provided to automatically create a new chat session
Note, for simplicity this option only allows for a single linear chain of messages
"""
class SendSearchQueryRequest(BaseModel):
search_query: str
filters: BaseFilters | None = None
num_docs_fed_to_llm_selection: int | None = None
run_query_expansion: bool = False
chat_session_id: UUID | None = None
# Optional persona_id to create a new chat session if chat_session_id is not provided
persona_id: int | None = None
# New message contents
message: str
# Defaults to using retrieval with no additional filters
retrieval_options: RetrievalDetails | None = None
# Allows the caller to specify the exact search query they want to use
# will disable Query Rewording if specified
query_override: str | None = None
# If search_doc_ids provided, then retrieval options are unused
search_doc_ids: list[int] | None = None
# only works if using an OpenAI model. See the following for more details:
# https://platform.openai.com/docs/guides/structured-outputs/introduction
structured_response_format: dict | None = None
@model_validator(mode="after")
def validate_chat_session_or_persona(self) -> "BasicCreateChatMessageRequest":
if self.chat_session_id is None and self.persona_id is None:
raise ValueError("Either chat_session_id or persona_id must be provided")
return self
include_content: bool = False
stream: bool = False
class BasicCreateChatMessageWithHistoryRequest(ChunkContext):
# Last element is the new query. All previous elements are historical context
messages: list[ThreadMessage]
persona_id: int
retrieval_options: RetrievalDetails | None = None
query_override: str | None = None
skip_rerank: bool | None = None
# If search_doc_ids provided, then retrieval options are unused
search_doc_ids: list[int] | None = None
# only works if using an OpenAI model. See the following for more details:
# https://platform.openai.com/docs/guides/structured-outputs/introduction
structured_response_format: dict | None = None
class SearchDocWithContent(SearchDoc):
# Allows None because content inclusion is determined by a flag, but the code
# along the search path uses this type throughout
content: str | None
@classmethod
def from_inference_sections(
cls,
sections: Sequence[InferenceSection],
include_content: bool = False,
is_internet: bool = False,
) -> list["SearchDocWithContent"]:
"""Convert InferenceSections to SearchDocWithContent objects.
class SimpleDoc(BaseModel):
id: str
semantic_identifier: str
link: str | None
blurb: str
match_highlights: list[str]
source_type: DocumentSource
metadata: dict | None
Args:
sections: Sequence of InferenceSection objects
include_content: If True, populate content field with combined_content
is_internet: Whether these are internet search results
class AgentSubQuestion(BaseModel):
sub_question: str
document_ids: list[str]
class AgentAnswer(BaseModel):
answer: str
answer_type: Literal["agent_sub_answer", "agent_level_answer"]
class AgentSubQuery(BaseModel):
sub_query: str
query_id: int
@staticmethod
def make_dict_by_level_and_question_index(
original_dict: dict[tuple[int, int, int], "AgentSubQuery"],
) -> dict[int, dict[int, list["AgentSubQuery"]]]:
"""Takes a dict of tuple(level, question num, query_id) to sub queries.
returns a dict of level to dict[question num to list of query_id's]
Ordering is asc for readability.
Returns:
List of SearchDocWithContent with optional content
"""
# In this function, when we sort int | None, we deliberately push None to the end
if not sections:
return []
# map entries to the level_question_dict
level_question_dict: dict[int, dict[int, list["AgentSubQuery"]]] = {}
for k1, obj in original_dict.items():
level = k1[0]
question = k1[1]
if level not in level_question_dict:
level_question_dict[level] = {}
if question not in level_question_dict[level]:
level_question_dict[level][question] = []
level_question_dict[level][question].append(obj)
# sort each query_id list and question_index
for key1, obj1 in level_question_dict.items():
for key2, value2 in obj1.items():
# sort the query_id list of each question_index
level_question_dict[key1][key2] = sorted(
value2, key=lambda o: o.query_id
)
# sort the question_index dict of level
level_question_dict[key1] = OrderedDict(
sorted(level_question_dict[key1].items(), key=lambda x: (x is None, x))
return [
cls(
document_id=(chunk := section.center_chunk).document_id,
chunk_ind=chunk.chunk_id,
semantic_identifier=chunk.semantic_identifier or "Unknown",
link=chunk.source_links[0] if chunk.source_links else None,
blurb=chunk.blurb,
source_type=chunk.source_type,
boost=chunk.boost,
hidden=chunk.hidden,
metadata=chunk.metadata,
score=chunk.score,
match_highlights=chunk.match_highlights,
updated_at=chunk.updated_at,
primary_owners=chunk.primary_owners,
secondary_owners=chunk.secondary_owners,
is_internet=is_internet,
content=section.combined_content if include_content else None,
)
for section in sections
]
# sort the top dict of levels
sorted_dict = OrderedDict(
sorted(level_question_dict.items(), key=lambda x: (x is None, x))
)
return sorted_dict
class SearchFullResponse(BaseModel):
all_executed_queries: list[str]
search_docs: list[SearchDocWithContent]
# Reasoning tokens output by the LLM for the document selection
doc_selection_reasoning: str | None = None
# This is a list of document ids that are in the search_docs list
llm_selected_doc_ids: list[str] | None = None
# Error message if the search failed partway through
error: str | None = None
class SearchQueryResponse(BaseModel):
query: str
query_expansions: list[str] | None
created_at: datetime
class SearchHistoryResponse(BaseModel):
search_queries: list[SearchQueryResponse]
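The llm_selected_doc_ids field follows the same tri-state contract documented in process_search_query.py above; restated as data for clarity (the IDs are illustrative):

llm_selected_doc_ids_cases = {
    "selection not requested or failed": None,
    "selection ran but picked nothing": [],
    "selection succeeded": ["doc_a", "doc_b"],  # illustrative document IDs
}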

View File

@@ -0,0 +1,170 @@
from collections.abc import Generator
from fastapi import APIRouter
from fastapi import Depends
from fastapi import HTTPException
from fastapi.responses import StreamingResponse
from sqlalchemy.orm import Session
from ee.onyx.db.search import fetch_search_queries_for_user
from ee.onyx.search.process_search_query import gather_search_stream
from ee.onyx.search.process_search_query import stream_search_query
from ee.onyx.secondary_llm_flows.search_flow_classification import (
classify_is_search_flow,
)
from ee.onyx.server.query_and_chat.models import SearchFlowClassificationRequest
from ee.onyx.server.query_and_chat.models import SearchFlowClassificationResponse
from ee.onyx.server.query_and_chat.models import SearchFullResponse
from ee.onyx.server.query_and_chat.models import SearchHistoryResponse
from ee.onyx.server.query_and_chat.models import SearchQueryResponse
from ee.onyx.server.query_and_chat.models import SendSearchQueryRequest
from ee.onyx.server.query_and_chat.streaming_models import SearchErrorPacket
from onyx.auth.users import current_user
from onyx.db.engine.sql_engine import get_session
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.models import User
from onyx.llm.factory import get_default_llm
from onyx.server.usage_limits import check_llm_cost_limit_for_provider
from onyx.server.utils import get_json_line
from onyx.utils.logger import setup_logger
from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()
router = APIRouter(prefix="/search")
@router.post("/search-flow-classification")
def search_flow_classification(
request: SearchFlowClassificationRequest,
# This is added just to ensure this endpoint isn't spammed by non-authorized users since there's an LLM call underneath it
_: User | None = Depends(current_user),
db_session: Session = Depends(get_session),
) -> SearchFlowClassificationResponse:
query = request.user_query
# Heuristic: if the user typed a lot of text, it's unlikely they're looking for a specific document.
# Most likely something needs to be done with the included text, so we just classify it as a chat flow.
if len(query) > 200:
return SearchFlowClassificationResponse(is_search_flow=False)
llm = get_default_llm()
check_llm_cost_limit_for_provider(
db_session=db_session,
tenant_id=get_current_tenant_id(),
llm_provider_api_key=llm.config.api_key,
)
try:
is_search_flow = classify_is_search_flow(query=query, llm=llm)
except Exception as e:
logger.exception(
"Search flow classification failed; defaulting to chat flow",
exc_info=e,
)
is_search_flow = False
return SearchFlowClassificationResponse(is_search_flow=is_search_flow)
@router.post("/send-search-message", response_model=None)
def handle_send_search_message(
request: SendSearchQueryRequest,
user: User | None = Depends(current_user),
db_session: Session = Depends(get_session),
) -> StreamingResponse | SearchFullResponse:
"""
Execute a search query with optional streaming.
When stream=True: Returns StreamingResponse with SSE
When stream=False: Returns SearchFullResponse
"""
logger.debug(f"Received search query: {request.search_query}")
# Non-streaming path
if not request.stream:
try:
packets = stream_search_query(request, user, db_session)
return gather_search_stream(packets)
except NotImplementedError as e:
return SearchFullResponse(
all_executed_queries=[],
search_docs=[],
error=str(e),
)
# Streaming path
def stream_generator() -> Generator[str, None, None]:
try:
with get_session_with_current_tenant() as streaming_db_session:
for packet in stream_search_query(request, user, streaming_db_session):
yield get_json_line(packet.model_dump())
except NotImplementedError as e:
yield get_json_line(SearchErrorPacket(error=str(e)).model_dump())
except HTTPException:
raise
except Exception as e:
logger.exception("Error in search streaming")
yield get_json_line(SearchErrorPacket(error=str(e)).model_dump())
return StreamingResponse(stream_generator(), media_type="text/event-stream")
@router.get("/search-history")
def get_search_history(
limit: int = 100,
filter_days: int | None = None,
user: User | None = Depends(current_user),
db_session: Session = Depends(get_session),
) -> SearchHistoryResponse:
"""
Fetch past search queries for the authenticated user.
Args:
limit: Maximum number of queries to return (default 100)
filter_days: Only return queries from the last N days (optional)
Returns:
SearchHistoryResponse with list of search queries, ordered by most recent first.
"""
# Validate limit
if limit <= 0:
raise HTTPException(
status_code=400,
detail="limit must be greater than 0",
)
if limit > 1000:
raise HTTPException(
status_code=400,
detail="limit must be at most 1000",
)
# Validate filter_days
if filter_days is not None and filter_days <= 0:
raise HTTPException(
status_code=400,
detail="filter_days must be greater than 0",
)
# TODO(yuhong) remove this
if user is None:
# Return empty list for unauthenticated users
return SearchHistoryResponse(search_queries=[])
search_queries = fetch_search_queries_for_user(
db_session=db_session,
user_id=user.id,
filter_days=filter_days,
limit=limit,
)
return SearchHistoryResponse(
search_queries=[
SearchQueryResponse(
query=sq.query,
query_expansions=sq.query_expansions,
created_at=sq.created_at,
)
for sq in search_queries
]
)
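An illustrative request body for POST /search/send-search-message, assembled from the SendSearchQueryRequest fields above. The values are examples only; the mount point and auth are deployment-specific.

import json

payload = {
    "search_query": "Retrieve the PRD for project X",
    "filters": None,
    "num_docs_fed_to_llm_selection": 10,  # feed the top 10 sections to the LLM
    "run_query_expansion": True,
    "include_content": False,
    "stream": True,  # True -> SSE packets; False -> one SearchFullResponse
}
print(json.dumps(payload, indent=2))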

View File

@@ -0,0 +1,35 @@
from typing import Literal
from pydantic import BaseModel
from pydantic import ConfigDict
from ee.onyx.server.query_and_chat.models import SearchDocWithContent
class SearchQueriesPacket(BaseModel):
model_config = ConfigDict(frozen=True)
type: Literal["search_queries"] = "search_queries"
all_executed_queries: list[str]
class SearchDocsPacket(BaseModel):
model_config = ConfigDict(frozen=True)
type: Literal["search_docs"] = "search_docs"
search_docs: list[SearchDocWithContent]
class SearchErrorPacket(BaseModel):
model_config = ConfigDict(frozen=True)
type: Literal["search_error"] = "search_error"
error: str
class LLMSelectedDocsPacket(BaseModel):
model_config = ConfigDict(frozen=True)
type: Literal["llm_selected_docs"] = "llm_selected_docs"
# None if LLM selection failed, empty list if no docs selected, list of IDs otherwise
llm_selected_doc_ids: list[str] | None
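A minimal consumer-side sketch that dispatches streamed packets on the type discriminator. It assumes one JSON object per line, as produced by get_json_line in the router; the sample line is invented for illustration.

import json

def handle_packet(raw_line: str) -> None:
    packet = json.loads(raw_line)
    if packet["type"] == "search_queries":
        print("queries:", packet["all_executed_queries"])
    elif packet["type"] == "search_docs":
        print("docs:", len(packet["search_docs"]))
    elif packet["type"] == "llm_selected_docs":
        print("selected:", packet["llm_selected_doc_ids"])
    elif packet["type"] == "search_error":
        print("error:", packet["error"])

handle_packet('{"type": "search_queries", "all_executed_queries": ["prd project x"]}')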

View File

@@ -32,6 +32,7 @@ from onyx.configs.constants import MessageType
from onyx.configs.constants import OnyxCeleryPriority
from onyx.configs.constants import OnyxCeleryQueues
from onyx.configs.constants import OnyxCeleryTask
from onyx.configs.constants import PUBLIC_API_TAGS
from onyx.configs.constants import QAFeedbackType
from onyx.configs.constants import QueryHistoryType
from onyx.configs.constants import SessionType
@@ -48,7 +49,6 @@ from onyx.file_store.file_store import get_default_file_store
from onyx.server.documents.models import PaginatedReturn
from onyx.server.query_and_chat.models import ChatSessionDetails
from onyx.server.query_and_chat.models import ChatSessionsResponse
from onyx.server.utils import PUBLIC_API_TAGS
from onyx.utils.threadpool_concurrency import parallel_yield
from shared_configs.contextvars import get_current_tenant_id

View File

@@ -0,0 +1,54 @@
"""EE Settings API - provides license-aware settings override."""
from redis.exceptions import RedisError
from ee.onyx.configs.app_configs import LICENSE_ENFORCEMENT_ENABLED
from ee.onyx.db.license import get_cached_license_metadata
from onyx.server.settings.models import ApplicationStatus
from onyx.server.settings.models import Settings
from onyx.utils.logger import setup_logger
from shared_configs.configs import MULTI_TENANT
from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()
# Statuses that indicate a billing/license problem - propagate these to settings
_GATED_STATUSES = frozenset(
{
ApplicationStatus.GATED_ACCESS,
ApplicationStatus.GRACE_PERIOD,
ApplicationStatus.PAYMENT_REMINDER,
}
)
def apply_license_status_to_settings(settings: Settings) -> Settings:
"""EE version: checks license status for self-hosted deployments.
For self-hosted, looks up license metadata and overrides application_status
if the license is missing or indicates a problem (expired, grace period, etc.).
For multi-tenant (cloud), the settings already have the correct status
from the control plane, so no override is needed.
If LICENSE_ENFORCEMENT_ENABLED is false, settings are returned unchanged,
allowing the product to function normally without license checks.
"""
if not LICENSE_ENFORCEMENT_ENABLED:
return settings
if MULTI_TENANT:
return settings
tenant_id = get_current_tenant_id()
try:
metadata = get_cached_license_metadata(tenant_id)
if metadata and metadata.status in _GATED_STATUSES:
settings.application_status = metadata.status
elif not metadata:
# No license = gated access for self-hosted EE
settings.application_status = ApplicationStatus.GATED_ACCESS
except RedisError as e:
logger.warning(f"Failed to check license metadata for settings: {e}")
return settings

View File

@@ -1,10 +1,14 @@
"""Tenant-specific usage limit overrides from the control plane (EE version)."""
import time
import requests
from ee.onyx.server.tenants.access import generate_data_plane_token
from onyx.configs.app_configs import CONTROL_PLANE_API_BASE_URL
from onyx.configs.app_configs import DEV_MODE
from onyx.server.tenant_usage_limits import TenantUsageLimitOverrides
from onyx.server.usage_limits import NO_LIMIT
from onyx.utils.logger import setup_logger
logger = setup_logger()
@@ -12,9 +16,12 @@ logger = setup_logger()
# In-memory storage for tenant overrides (populated at startup)
_tenant_usage_limit_overrides: dict[str, TenantUsageLimitOverrides] | None = None
_last_fetch_time: float = 0.0
_FETCH_INTERVAL = 60 * 60 * 24 # 24 hours
_ERROR_FETCH_INTERVAL = 30 * 60 # 30 minutes (if the last fetch failed)
def fetch_usage_limit_overrides() -> dict[str, TenantUsageLimitOverrides]:
def fetch_usage_limit_overrides() -> dict[str, TenantUsageLimitOverrides] | None:
"""
Fetch tenant-specific usage limit overrides from the control plane.
@@ -45,33 +52,52 @@ def fetch_usage_limit_overrides() -> dict[str, TenantUsageLimitOverrides]:
f"Failed to parse usage limit overrides for tenant {tenant_id}: {e}"
)
return result
return (
result or None
) # if empty dictionary, something went wrong and we shouldn't enforce limits
except requests.exceptions.RequestException as e:
logger.warning(f"Failed to fetch usage limit overrides from control plane: {e}")
return {}
return None
except Exception as e:
logger.error(f"Error parsing usage limit overrides: {e}")
return {}
return None
def load_usage_limit_overrides() -> dict[str, TenantUsageLimitOverrides]:
def load_usage_limit_overrides() -> None:
"""
Load tenant usage limit overrides from the control plane.
Called at server startup to populate the in-memory cache.
"""
global _tenant_usage_limit_overrides
global _last_fetch_time
logger.info("Loading tenant usage limit overrides from control plane...")
overrides = fetch_usage_limit_overrides()
_tenant_usage_limit_overrides = overrides
_last_fetch_time = time.time()
# use the new result if it exists, otherwise use the old result
# (prevents us from updating to a failed fetch result)
_tenant_usage_limit_overrides = overrides or _tenant_usage_limit_overrides
if overrides:
logger.info(f"Loaded usage limit overrides for {len(overrides)} tenants")
else:
logger.info("No tenant-specific usage limit overrides found")
return overrides
def unlimited(tenant_id: str) -> TenantUsageLimitOverrides:
return TenantUsageLimitOverrides(
tenant_id=tenant_id,
llm_cost_cents_trial=NO_LIMIT,
llm_cost_cents_paid=NO_LIMIT,
chunks_indexed_trial=NO_LIMIT,
chunks_indexed_paid=NO_LIMIT,
api_calls_trial=NO_LIMIT,
api_calls_paid=NO_LIMIT,
non_streaming_calls_trial=NO_LIMIT,
non_streaming_calls_paid=NO_LIMIT,
)
def get_tenant_usage_limit_overrides(
@@ -86,7 +112,22 @@ def get_tenant_usage_limit_overrides(
Returns:
TenantUsageLimitOverrides if the tenant has overrides, None otherwise.
"""
if DEV_MODE: # in dev mode, we return unlimited limits for all tenants
return unlimited(tenant_id)
global _tenant_usage_limit_overrides
if _tenant_usage_limit_overrides is None:
_tenant_usage_limit_overrides = load_usage_limit_overrides()
time_since = time.time() - _last_fetch_time
if (
_tenant_usage_limit_overrides is None and time_since > _ERROR_FETCH_INTERVAL
) or (time_since > _FETCH_INTERVAL):
logger.debug(
f"Last fetch time: {_last_fetch_time}, time since last fetch: {time_since}"
)
load_usage_limit_overrides()
# If we have failed to fetch from the control plane or we're in dev mode, don't apply usage limits to anyone.
if _tenant_usage_limit_overrides is None or DEV_MODE:
return unlimited(tenant_id)
return _tenant_usage_limit_overrides.get(tenant_id)
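Restating the refresh policy introduced above as a small predicate, under the same intervals: successful fetches are refreshed daily, while failed fetches (cache still None) are retried every 30 minutes.

import time

FETCH_INTERVAL = 60 * 60 * 24  # 24 hours
ERROR_FETCH_INTERVAL = 30 * 60  # 30 minutes

def should_refetch(cached_overrides: dict | None, last_fetch_time: float) -> bool:
    # Mirrors the condition in get_tenant_usage_limit_overrides above
    time_since = time.time() - last_fetch_time
    if cached_overrides is None:
        return time_since > ERROR_FETCH_INTERVAL
    return time_since > FETCH_INTERVAL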

View File

@@ -1,9 +1,9 @@
from typing import cast
from typing import Literal
import requests
import stripe
from ee.onyx.configs.app_configs import STRIPE_PRICE_ID
from ee.onyx.configs.app_configs import STRIPE_SECRET_KEY
from ee.onyx.server.tenants.access import generate_data_plane_token
from ee.onyx.server.tenants.models import BillingInformation
@@ -16,15 +16,21 @@ stripe.api_key = STRIPE_SECRET_KEY
logger = setup_logger()
def fetch_stripe_checkout_session(tenant_id: str) -> str:
def fetch_stripe_checkout_session(
tenant_id: str,
billing_period: Literal["monthly", "annual"] = "monthly",
) -> str:
token = generate_data_plane_token()
headers = {
"Authorization": f"Bearer {token}",
"Content-Type": "application/json",
}
url = f"{CONTROL_PLANE_API_BASE_URL}/create-checkout-session"
params = {"tenant_id": tenant_id}
response = requests.post(url, headers=headers, params=params)
payload = {
"tenant_id": tenant_id,
"billing_period": billing_period,
}
response = requests.post(url, headers=headers, json=payload)
response.raise_for_status()
return response.json()["sessionId"]
@@ -70,24 +76,46 @@ def fetch_billing_information(
return BillingInformation(**response_data)
def fetch_customer_portal_session(tenant_id: str, return_url: str | None = None) -> str:
"""
Fetch a Stripe customer portal session URL from the control plane.
NOTE: This is currently only used for multi-tenant (cloud) deployments.
Self-hosted proxy endpoints will be added in a future phase.
"""
token = generate_data_plane_token()
headers = {
"Authorization": f"Bearer {token}",
"Content-Type": "application/json",
}
url = f"{CONTROL_PLANE_API_BASE_URL}/create-customer-portal-session"
payload = {"tenant_id": tenant_id}
if return_url:
payload["return_url"] = return_url
response = requests.post(url, headers=headers, json=payload)
response.raise_for_status()
return response.json()["url"]
def register_tenant_users(tenant_id: str, number_of_users: int) -> stripe.Subscription:
"""
Send a request to the control service to register the number of users for a tenant.
Update the number of seats for a tenant's subscription.
Preserves the existing price (monthly, annual, or grandfathered).
"""
if not STRIPE_PRICE_ID:
raise Exception("STRIPE_PRICE_ID is not set")
response = fetch_tenant_stripe_information(tenant_id)
stripe_subscription_id = cast(str, response.get("stripe_subscription_id"))
subscription = stripe.Subscription.retrieve(stripe_subscription_id)
subscription_item = subscription["items"]["data"][0]
# Use existing price to preserve the customer's current plan
current_price_id = subscription_item.price.id
updated_subscription = stripe.Subscription.modify(
stripe_subscription_id,
items=[
{
"id": subscription["items"]["data"][0].id,
"price": STRIPE_PRICE_ID,
"id": subscription_item.id,
"price": current_price_id,
"quantity": number_of_users,
}
],

View File

@@ -1,15 +1,14 @@
import stripe
from fastapi import APIRouter
from fastapi import Depends
from fastapi import HTTPException
from ee.onyx.auth.users import current_admin_user
from ee.onyx.configs.app_configs import STRIPE_SECRET_KEY
from ee.onyx.server.tenants.access import control_plane_dep
from ee.onyx.server.tenants.billing import fetch_billing_information
from ee.onyx.server.tenants.billing import fetch_customer_portal_session
from ee.onyx.server.tenants.billing import fetch_stripe_checkout_session
from ee.onyx.server.tenants.billing import fetch_tenant_stripe_information
from ee.onyx.server.tenants.models import BillingInformation
from ee.onyx.server.tenants.models import CreateSubscriptionSessionRequest
from ee.onyx.server.tenants.models import ProductGatingFullSyncRequest
from ee.onyx.server.tenants.models import ProductGatingRequest
from ee.onyx.server.tenants.models import ProductGatingResponse
@@ -23,7 +22,6 @@ from onyx.utils.logger import setup_logger
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
from shared_configs.contextvars import get_current_tenant_id
stripe.api_key = STRIPE_SECRET_KEY
logger = setup_logger()
router = APIRouter(prefix="/tenants")
@@ -82,21 +80,17 @@ async def billing_information(
async def create_customer_portal_session(
_: User = Depends(current_admin_user),
) -> dict:
"""
Create a Stripe customer portal session via the control plane.
NOTE: This is currently only used for multi-tenant (cloud) deployments.
Self-hosted proxy endpoints will be added in a future phase.
"""
tenant_id = get_current_tenant_id()
return_url = f"{WEB_DOMAIN}/admin/billing"
try:
stripe_info = fetch_tenant_stripe_information(tenant_id)
stripe_customer_id = stripe_info.get("stripe_customer_id")
if not stripe_customer_id:
raise HTTPException(status_code=400, detail="Stripe customer ID not found")
logger.info(stripe_customer_id)
portal_session = stripe.billing_portal.Session.create(
customer=stripe_customer_id,
return_url=f"{WEB_DOMAIN}/admin/billing",
)
logger.info(portal_session)
return {"url": portal_session.url}
portal_url = fetch_customer_portal_session(tenant_id, return_url)
return {"url": portal_url}
except Exception as e:
logger.exception("Failed to create customer portal session")
raise HTTPException(status_code=500, detail=str(e))
@@ -104,15 +98,18 @@ async def create_customer_portal_session(
@router.post("/create-subscription-session")
async def create_subscription_session(
request: CreateSubscriptionSessionRequest | None = None,
_: User = Depends(current_admin_user),
) -> SubscriptionSessionResponse:
try:
tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get()
if not tenant_id:
raise HTTPException(status_code=400, detail="Tenant ID not found")
session_id = fetch_stripe_checkout_session(tenant_id)
billing_period = request.billing_period if request else "monthly"
session_id = fetch_stripe_checkout_session(tenant_id, billing_period)
return SubscriptionSessionResponse(sessionId=session_id)
except Exception as e:
logger.exception("Failed to create resubscription session")
logger.exception("Failed to create subscription session")
raise HTTPException(status_code=500, detail=str(e))

View File

@@ -1,4 +1,5 @@
from datetime import datetime
from typing import Literal
from pydantic import BaseModel
@@ -73,6 +74,12 @@ class SubscriptionSessionResponse(BaseModel):
sessionId: str
class CreateSubscriptionSessionRequest(BaseModel):
"""Request to create a subscription checkout session."""
billing_period: Literal["monthly", "annual"] = "monthly"
class TenantByDomainResponse(BaseModel):
tenant_id: str
number_of_users: int

View File

@@ -65,3 +65,9 @@ def get_gated_tenants() -> set[str]:
redis_client = get_redis_replica_client(tenant_id=ONYX_CLOUD_TENANT_ID)
gated_tenants_bytes = cast(set[bytes], redis_client.smembers(GATED_TENANTS_KEY))
return {tenant_id.decode("utf-8") for tenant_id in gated_tenants_bytes}
def is_tenant_gated(tenant_id: str) -> bool:
"""Fast O(1) check if tenant is in gated set (multi-tenant only)."""
redis_client = get_redis_replica_client(tenant_id=ONYX_CLOUD_TENANT_ID)
return bool(redis_client.sismember(GATED_TENANTS_KEY, tenant_id))

View File

@@ -9,6 +9,7 @@ from ee.onyx.db.token_limit import fetch_user_group_token_rate_limits_for_user
from ee.onyx.db.token_limit import insert_user_group_token_rate_limit
from onyx.auth.users import current_admin_user
from onyx.auth.users import current_curator_or_admin_user
from onyx.configs.constants import PUBLIC_API_TAGS
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.db.token_limit import fetch_all_user_token_rate_limits
@@ -16,7 +17,6 @@ from onyx.db.token_limit import insert_user_token_rate_limit
from onyx.server.query_and_chat.token_limit import any_rate_limit_exists
from onyx.server.token_rate_limits.models import TokenRateLimitArgs
from onyx.server.token_rate_limits.models import TokenRateLimitDisplay
from onyx.server.utils import PUBLIC_API_TAGS
router = APIRouter(prefix="/admin/token-rate-limits", tags=PUBLIC_API_TAGS)

View File

@@ -1,8 +1,5 @@
"""EE Usage limits - trial detection via billing information."""
from datetime import datetime
from datetime import timezone
from ee.onyx.server.tenants.billing import fetch_billing_information
from ee.onyx.server.tenants.models import BillingInformation
from ee.onyx.server.tenants.models import SubscriptionStatusResponse
@@ -31,13 +28,7 @@ def is_tenant_on_trial(tenant_id: str) -> bool:
return True
if isinstance(billing_info, BillingInformation):
# Check if trial is active
if billing_info.trial_end is not None:
now = datetime.now(timezone.utc)
# Trial active if trial_end is in the future
# and subscription status indicates trialing
if billing_info.trial_end > now and billing_info.status == "trialing":
return True
return billing_info.status == "trialing"
return False

View File

@@ -18,10 +18,10 @@ from ee.onyx.server.user_group.models import UserGroupCreate
from ee.onyx.server.user_group.models import UserGroupUpdate
from onyx.auth.users import current_admin_user
from onyx.auth.users import current_curator_or_admin_user
from onyx.configs.constants import PUBLIC_API_TAGS
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.db.models import UserRole
from onyx.server.utils import PUBLIC_API_TAGS
from onyx.utils.logger import setup_logger
logger = setup_logger()

View File

@@ -105,6 +105,8 @@ class DocExternalAccess:
)
# TODO(andrei): First refactor this into a pydantic model, then get rid of
# duplicate fields.
@dataclass(frozen=True, init=False)
class DocumentAccess(ExternalAccess):
# User emails for Onyx users, None indicates admin

View File

@@ -517,6 +517,7 @@ def wait_for_vespa_or_shutdown(sender: Any, **kwargs: Any) -> None:
Raises WorkerShutdown if the timeout is reached."""
if ENABLE_OPENSEARCH_FOR_ONYX:
# TODO(andrei): Do some similar liveness checking for OpenSearch.
return
if not wait_for_vespa_with_timeout():

View File

@@ -124,6 +124,7 @@ celery_app.autodiscover_tasks(
"onyx.background.celery.tasks.kg_processing",
"onyx.background.celery.tasks.monitoring",
"onyx.background.celery.tasks.user_file_processing",
"onyx.background.celery.tasks.llm_model_update",
# Light worker tasks
"onyx.background.celery.tasks.shared",
"onyx.background.celery.tasks.vespa",

View File

@@ -174,7 +174,7 @@ if AUTO_LLM_CONFIG_URL:
"schedule": timedelta(seconds=AUTO_LLM_UPDATE_INTERVAL_SECONDS),
"options": {
"priority": OnyxCeleryPriority.LOW,
"expires": AUTO_LLM_UPDATE_INTERVAL_SECONDS,
"expires": BEAT_EXPIRES_DEFAULT,
},
}
)

View File

@@ -5,6 +5,9 @@ from onyx.background.celery.apps.app_base import task_logger
from onyx.configs.app_configs import AUTO_LLM_CONFIG_URL
from onyx.configs.constants import OnyxCeleryTask
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.llm.well_known_providers.auto_update_service import (
sync_llm_models_from_github,
)
@shared_task(
@@ -26,24 +29,9 @@ def check_for_auto_llm_updates(self: Task, *, tenant_id: str) -> bool | None:
return None
try:
# Import here to avoid circular imports
from onyx.llm.well_known_providers.auto_update_service import (
fetch_llm_recommendations_from_github,
)
from onyx.llm.well_known_providers.auto_update_service import (
sync_llm_models_from_github,
)
# Fetch config from GitHub
config = fetch_llm_recommendations_from_github()
if not config:
task_logger.warning("Failed to fetch GitHub config")
return None
# Sync to database
with get_session_with_current_tenant() as db_session:
results = sync_llm_models_from_github(db_session, config)
results = sync_llm_models_from_github(db_session)
if results:
task_logger.info(f"Auto mode sync results: {results}")

View File

@@ -0,0 +1,57 @@
from uuid import UUID
from redis.client import Redis
# Redis key prefixes for chat message processing
PREFIX = "chatprocessing"
FENCE_PREFIX = f"{PREFIX}_fence"
FENCE_TTL = 30 * 60 # 30 minutes
def _get_fence_key(chat_session_id: UUID) -> str:
"""
Generate the Redis key for a chat session processing a message.
Args:
chat_session_id: The UUID of the chat session
Returns:
The fence key string (tenant_id is automatically added by the Redis client)
"""
return f"{FENCE_PREFIX}_{chat_session_id}"
def set_processing_status(
chat_session_id: UUID, redis_client: Redis, value: bool
) -> None:
"""
Set or clear the fence for a chat session processing a message.
The key's existence indicates that a message is currently being processed; its absence indicates that none is.
Args:
chat_session_id: The UUID of the chat session
redis_client: The Redis client to use
value: True to set the fence, False to clear it
"""
fence_key = _get_fence_key(chat_session_id)
if value:
redis_client.set(fence_key, 0, ex=FENCE_TTL)
else:
redis_client.delete(fence_key)
def is_chat_session_processing(chat_session_id: UUID, redis_client: Redis) -> bool:
"""
Check if the chat session is processing a message.
Args:
chat_session_id: The UUID of the chat session
redis_client: The Redis client to use
Returns:
True if the chat session is processing a message, False otherwise
"""
fence_key = _get_fence_key(chat_session_id)
return bool(redis_client.exists(fence_key))
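Hypothetical round-trip usage of the fence helpers defined above; the default Redis() connection is an assumption for illustration (real callers obtain a tenant-aware client).

from uuid import uuid4
from redis import Redis

redis_client = Redis()  # assumed local default; real code gets a tenant-aware client
session_id = uuid4()

set_processing_status(session_id, redis_client, value=True)
assert is_chat_session_processing(session_id, redis_client)
set_processing_status(session_id, redis_client, value=False)
assert not is_chat_session_processing(session_id, redis_client)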

View File

@@ -94,6 +94,7 @@ class ChatStateContainer:
def run_chat_loop_with_state_containers(
func: Callable[..., None],
completion_callback: Callable[[ChatStateContainer], None],
is_connected: Callable[[], bool],
emitter: Emitter,
state_container: ChatStateContainer,
@@ -196,3 +197,12 @@ def run_chat_loop_with_state_containers(
# Skip waiting if user disconnected to exit quickly.
if is_connected():
wait_on_background(thread)
try:
completion_callback(state_container)
except Exception as e:
emitter.emit(
Packet(
placement=Placement(turn_index=last_turn_index + 1),
obj=PacketException(type="error", exception=e),
)
)

View File

@@ -18,12 +18,10 @@ from onyx.background.celery.tasks.kg_processing.kg_indexing import (
from onyx.chat.models import ChatLoadedFile
from onyx.chat.models import ChatMessageSimple
from onyx.chat.models import PersonaOverrideConfig
from onyx.chat.models import ThreadMessage
from onyx.configs.constants import DEFAULT_PERSONA_ID
from onyx.configs.constants import MessageType
from onyx.configs.constants import TMP_DRALPHA_PERSONA_NAME
from onyx.context.search.models import RerankingDetails
from onyx.context.search.models import RetrievalDetails
from onyx.context.search.enums import RecencyBiasSetting
from onyx.db.chat import create_chat_session
from onyx.db.chat import get_chat_messages_by_session
from onyx.db.chat import get_or_create_root_message
@@ -48,13 +46,10 @@ from onyx.kg.models import KGException
from onyx.kg.setup.kg_default_entity_definitions import (
populate_missing_default_entity_types__commit,
)
from onyx.llm.override_models import LLMOverride
from onyx.natural_language_processing.utils import BaseTokenizer
from onyx.prompts.chat_prompts import ADDITIONAL_CONTEXT_PROMPT
from onyx.prompts.chat_prompts import TOOL_CALL_RESPONSE_CROSS_MESSAGE
from onyx.prompts.tool_prompts import TOOL_CALL_FAILURE_PROMPT
from onyx.server.query_and_chat.models import ChatSessionCreationRequest
from onyx.server.query_and_chat.models import CreateChatMessageRequest
from onyx.server.query_and_chat.streaming_models import CitationInfo
from onyx.tools.models import ToolCallKickoff
from onyx.tools.tool_implementations.custom.custom_tool import (
@@ -103,89 +98,6 @@ def create_chat_session_from_request(
)
def prepare_chat_message_request(
message_text: str,
user: User | None,
persona_id: int | None,
# Does the question need to have a persona override
persona_override_config: PersonaOverrideConfig | None,
message_ts_to_respond_to: str | None,
retrieval_details: RetrievalDetails | None,
rerank_settings: RerankingDetails | None,
db_session: Session,
skip_gen_ai_answer_generation: bool = False,
llm_override: LLMOverride | None = None,
allowed_tool_ids: list[int] | None = None,
forced_tool_ids: list[int] | None = None,
) -> CreateChatMessageRequest:
# Typically used for one shot flows like SlackBot or non-chat API endpoint use cases
new_chat_session = create_chat_session(
db_session=db_session,
description=None,
user_id=user.id if user else None,
# If using an override, this id will be ignored later on
persona_id=persona_id or DEFAULT_PERSONA_ID,
onyxbot_flow=True,
slack_thread_id=message_ts_to_respond_to,
)
return CreateChatMessageRequest(
chat_session_id=new_chat_session.id,
parent_message_id=None, # It's a standalone chat session each time
message=message_text,
file_descriptors=[], # Currently SlackBot/answer API do not support files in the context
# Can always override the persona for the single query; if it's a normal persona
# then it will be treated the same
persona_override_config=persona_override_config,
search_doc_ids=None,
retrieval_options=retrieval_details,
rerank_settings=rerank_settings,
skip_gen_ai_answer_generation=skip_gen_ai_answer_generation,
llm_override=llm_override,
allowed_tool_ids=allowed_tool_ids,
forced_tool_ids=forced_tool_ids,
)
def combine_message_thread(
messages: list[ThreadMessage],
max_tokens: int | None,
llm_tokenizer: BaseTokenizer,
) -> str:
"""Used to create a single combined message context from threads"""
if not messages:
return ""
message_strs: list[str] = []
total_token_count = 0
for message in reversed(messages):
if message.role == MessageType.USER:
role_str = message.role.value.upper()
if message.sender:
role_str += " " + message.sender
else:
# Since other messages might include user-identifying information,
# it's better to use "Unknown" for symmetry
role_str += " Unknown"
else:
role_str = message.role.value.upper()
msg_str = f"{role_str}:\n{message.message}"
message_token_count = len(llm_tokenizer.encode(msg_str))
if (
max_tokens is not None
and total_token_count + message_token_count > max_tokens
):
break
message_strs.insert(0, msg_str)
total_token_count += message_token_count
return "\n\n".join(message_strs)
def create_chat_history_chain(
chat_session_id: UUID,
db_session: Session,
@@ -247,31 +159,6 @@ def create_chat_history_chain(
return mainline_messages
def combine_message_chain(
messages: list[ChatMessage],
token_limit: int,
msg_limit: int | None = None,
) -> str:
"""Used for secondary LLM flows that require the chat history,"""
message_strs: list[str] = []
total_token_count = 0
if msg_limit is not None:
messages = messages[-msg_limit:]
for message in cast(list[ChatMessage], reversed(messages)):
message_token_count = message.token_count
if total_token_count + message_token_count > token_limit:
break
role = message.message_type.value.upper()
message_strs.insert(0, f"{role}:\n{message.message}")
total_token_count += message_token_count
return "\n\n".join(message_strs)
def reorganize_citations(
answer: str, citations: list[CitationInfo]
) -> tuple[str, list[CitationInfo]]:
@@ -412,7 +299,7 @@ def create_temporary_persona(
num_chunks=persona_config.num_chunks,
llm_relevance_filter=persona_config.llm_relevance_filter,
llm_filter_extraction=persona_config.llm_filter_extraction,
recency_bias=persona_config.recency_bias,
recency_bias=RecencyBiasSetting.BASE_DECAY,
llm_model_provider_override=persona_config.llm_model_provider_override,
llm_model_version_override=persona_config.llm_model_version_override,
)
@@ -582,6 +469,71 @@ def load_all_chat_files(
return files
def convert_chat_history_basic(
chat_history: list[ChatMessage],
token_counter: Callable[[str], int],
max_individual_message_tokens: int | None = None,
max_total_tokens: int | None = None,
) -> list[ChatMessageSimple]:
"""Convert ChatMessage history to ChatMessageSimple format with no tool calls or files included.
Args:
chat_history: List of ChatMessage objects to convert
token_counter: Function to count tokens in a message string
max_individual_message_tokens: If set, messages exceeding this number of tokens are dropped.
If None, no messages are dropped based on individual token count.
max_total_tokens: If set, maximum number of tokens allowed for the entire history.
If None, the history is not trimmed based on total token count.
Returns:
List of ChatMessageSimple objects
"""
# Defensive: treat a non-positive total budget as "no history".
if max_total_tokens is not None and max_total_tokens <= 0:
return []
# Convert only the core USER/ASSISTANT messages; omit files and tool calls.
converted: list[ChatMessageSimple] = []
for chat_message in chat_history:
if chat_message.message_type not in (MessageType.USER, MessageType.ASSISTANT):
continue
message = chat_message.message or ""
token_count = getattr(chat_message, "token_count", None)
if token_count is None:
token_count = token_counter(message)
# Drop any single message that would dominate the context window.
if (
max_individual_message_tokens is not None
and token_count > max_individual_message_tokens
):
continue
converted.append(
ChatMessageSimple(
message=message,
token_count=token_count,
message_type=chat_message.message_type,
image_files=None,
)
)
if max_total_tokens is None:
return converted
# Enforce a max total budget by keeping a contiguous suffix of the conversation.
trimmed_reversed: list[ChatMessageSimple] = []
total_tokens = 0
for msg in reversed(converted):
if total_tokens + msg.token_count > max_total_tokens:
break
trimmed_reversed.append(msg)
total_tokens += msg.token_count
return list(reversed(trimmed_reversed))
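
A minimal sketch of the suffix-trimming behavior, with duck-typed stand-in messages (the real ChatMessage is an ORM model; only the attributes read above are needed):

from types import SimpleNamespace

history = [
    SimpleNamespace(message="hi", token_count=1, message_type=MessageType.USER),
    SimpleNamespace(message="hello!", token_count=2, message_type=MessageType.ASSISTANT),
    SimpleNamespace(message="summarize the doc", token_count=4, message_type=MessageType.USER),
]
trimmed = convert_chat_history_basic(
    history, token_counter=lambda s: len(s.split()), max_total_tokens=6
)
# Keeps the most recent contiguous messages that fit the budget (2 + 4 = 6).
assert [m.message for m in trimmed] == ["hello!", "summarize the doc"]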
def convert_chat_history(
chat_history: list[ChatMessage],
files: list[ChatLoadedFile],

View File

@@ -4,14 +4,15 @@ Dynamic Citation Processor for LLM Responses
This module provides a citation processor that can:
- Accept citation number to SearchDoc mappings dynamically
- Process token streams from LLMs to extract citations
- Optionally replace citation markers with formatted markdown links
- Emit CitationInfo objects for detected citations (when replacing)
- Track all seen citations regardless of replacement mode
- Handle citations in three modes: REMOVE, KEEP_MARKERS, or HYPERLINK
- Emit CitationInfo objects for detected citations (in HYPERLINK mode)
- Track all seen citations regardless of mode
- Maintain a list of cited documents in order of first citation
"""
import re
from collections.abc import Generator
from enum import Enum
from typing import TypeAlias
from onyx.configs.chat_configs import STOP_STREAM_PAT
@@ -23,6 +24,29 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()
class CitationMode(Enum):
"""Defines how citations should be handled in the output.
REMOVE: Citations are completely removed from output text.
No CitationInfo objects are emitted.
Use case: When the cited sources are not shared with the user, so markers must be removed
(e.g., the Discord bot or a public Slack bot).
KEEP_MARKERS: Original citation markers like [1], [2] are preserved unchanged.
No CitationInfo objects are emitted.
Use case: When citations in a research agent need to be tracked and later
processed with collapse_citations() for renumbering.
HYPERLINK: Citations are replaced with markdown links like [[1]](url).
CitationInfo objects are emitted for UI tracking.
Use case: Final reports shown to users with clickable links.
"""
REMOVE = "remove"
KEEP_MARKERS = "keep_markers"
HYPERLINK = "hyperlink"
CitationMapping: TypeAlias = dict[int, SearchDoc]
@@ -48,29 +72,37 @@ class DynamicCitationProcessor:
This processor is designed for multi-turn conversations where the citation
number to document mapping is provided externally. It processes streaming
tokens from an LLM, detects citations (e.g., [1], [2,3], [[4]]), and based
on the `replace_citation_tokens` setting:
tokens from an LLM, detects citations (e.g., [1], [2,3], [[4]]), and handles
them according to the configured CitationMode:
When replace_citation_tokens=True (default):
CitationMode.HYPERLINK (default):
1. Replaces citation markers with formatted markdown links (e.g., [[1]](url))
2. Emits CitationInfo objects for tracking
3. Maintains the order in which documents were first cited
Use case: Final reports shown to users with clickable links.
When replace_citation_tokens=False:
1. Preserves original citation markers in the output text
CitationMode.KEEP_MARKERS:
1. Preserves original citation markers like [1], [2] unchanged
2. Does NOT emit CitationInfo objects
3. Still tracks all seen citations via get_seen_citations()
Use case: When citations need later processing (e.g., renumbering).
CitationMode.REMOVE:
1. Removes citation markers entirely from the output text
2. Does NOT emit CitationInfo objects
3. Still tracks all seen citations via get_seen_citations()
Use case: Research agent intermediate reports.
Features:
- Accepts citation number → SearchDoc mapping via update_citation_mapping()
- Configurable citation replacement behavior at initialization
- Always tracks seen citations regardless of replacement mode
- Configurable citation mode at initialization
- Always tracks seen citations regardless of mode
- Holds back tokens that might be partial citations
- Maintains list of cited SearchDocs in order of first citation
- Handles unicode bracket variants (e.g., 【】)
- Skips citation processing inside code blocks
Example (with citation replacement - default):
Example (HYPERLINK mode - default):
processor = DynamicCitationProcessor()
# Set up citation mapping
@@ -87,8 +119,8 @@ class DynamicCitationProcessor:
# Get cited documents at the end
cited_docs = processor.get_cited_documents()
Example (without citation replacement):
processor = DynamicCitationProcessor(replace_citation_tokens=False)
Example (KEEP_MARKERS mode):
processor = DynamicCitationProcessor(citation_mode=CitationMode.KEEP_MARKERS)
processor.update_citation_mapping({1: search_doc1, 2: search_doc2})
# Process tokens from LLM
@@ -99,26 +131,42 @@ class DynamicCitationProcessor:
# Get all seen citations after processing
seen_citations = processor.get_seen_citations() # {1: search_doc1, ...}
Example (REMOVE mode):
processor = DynamicCitationProcessor(citation_mode=CitationMode.REMOVE)
processor.update_citation_mapping({1: search_doc1, 2: search_doc2})
# Process tokens - citations are removed but tracked
for token in llm_stream:
for result in processor.process_token(token):
print(result) # Text without any citation markers
# Citations are still tracked
seen_citations = processor.get_seen_citations()
"""
def __init__(
self,
replace_citation_tokens: bool = True,
citation_mode: CitationMode = CitationMode.HYPERLINK,
stop_stream: str | None = STOP_STREAM_PAT,
):
"""
Initialize the citation processor.
Args:
replace_citation_tokens: If True (default), citations like [1] are replaced
with formatted markdown links like [[1]](url) and CitationInfo objects
are emitted. If False, original citation text is preserved in output
and no CitationInfo objects are emitted. Regardless of this setting,
all seen citations are tracked and available via get_seen_citations().
citation_mode: How to handle citations in the output. One of:
- CitationMode.HYPERLINK (default): Replace [1] with [[1]](url)
and emit CitationInfo objects.
- CitationMode.KEEP_MARKERS: Keep original [1] markers unchanged,
no CitationInfo objects emitted.
- CitationMode.REMOVE: Remove citations entirely from output,
no CitationInfo objects emitted.
All modes track seen citations via get_seen_citations().
stop_stream: Optional stop token pattern to halt processing early.
When this pattern is detected in the token stream, processing stops.
Defaults to STOP_STREAM_PAT from chat configs.
"""
# Citation mapping from citation number to SearchDoc
self.citation_to_doc: CitationMapping = {}
self.seen_citations: CitationMapping = {} # citation num -> SearchDoc
@@ -128,7 +176,7 @@ class DynamicCitationProcessor:
self.curr_segment = "" # tokens held for citation processing
self.hold = "" # tokens held for stop token processing
self.stop_stream = stop_stream
self.replace_citation_tokens = replace_citation_tokens
self.citation_mode = citation_mode
# Citation tracking
self.cited_documents_in_order: list[SearchDoc] = (
@@ -199,19 +247,21 @@ class DynamicCitationProcessor:
5. Handles stop tokens
6. Always tracks seen citations in self.seen_citations
Behavior depends on the `replace_citation_tokens` setting from __init__:
- If True: Citations are replaced with [[n]](url) format and CitationInfo
Behavior depends on the `citation_mode` setting from __init__:
- HYPERLINK: Citations are replaced with [[n]](url) format and CitationInfo
objects are yielded before each formatted citation
- If False: Original citation text (e.g., [1]) is preserved in output
and no CitationInfo objects are yielded
- KEEP_MARKERS: Original citation markers like [1] are preserved unchanged,
no CitationInfo objects are yielded
- REMOVE: Citations are removed entirely from output,
no CitationInfo objects are yielded
Args:
token: The next token from the LLM stream, or None to signal end of stream.
Pass None to flush any remaining buffered text at end of stream.
Yields:
str: Text chunks to display. Citation format depends on replace_citation_tokens.
CitationInfo: Citation metadata (only when replace_citation_tokens=True)
str: Text chunks to display. Citation format depends on citation_mode.
CitationInfo: Citation metadata (only when citation_mode=HYPERLINK)
"""
# None -> end of stream, flush remaining segment
if token is None:
@@ -299,17 +349,17 @@ class DynamicCitationProcessor:
if self.non_citation_count > 5:
self.recent_cited_documents.clear()
# Yield text before citation FIRST (preserve order)
if intermatch_str:
yield intermatch_str
# Process the citation (returns formatted citation text and CitationInfo objects)
# Always tracks seen citations regardless of strip_citations flag
# Always tracks seen citations regardless of citation_mode
citation_text, citation_info_list = self._process_citation(
match, has_leading_space, self.replace_citation_tokens
match, has_leading_space
)
if self.replace_citation_tokens:
if self.citation_mode == CitationMode.HYPERLINK:
# HYPERLINK mode: Replace citations with markdown links [[n]](url)
# Yield text before citation FIRST (preserve order)
if intermatch_str:
yield intermatch_str
# Yield CitationInfo objects BEFORE the citation text
# This allows the frontend to receive citation metadata before the token
# that contains [[n]](link), enabling immediate rendering
@@ -318,10 +368,34 @@ class DynamicCitationProcessor:
# Then yield the formatted citation text
if citation_text:
yield citation_text
else:
# When not stripping, yield the original citation text unchanged
elif self.citation_mode == CitationMode.KEEP_MARKERS:
# KEEP_MARKERS mode: Preserve original citation markers unchanged
# Yield text before citation
if intermatch_str:
yield intermatch_str
# Yield the original citation marker as-is
yield match.group()
else: # CitationMode.REMOVE
# REMOVE mode: Remove citations entirely from output
# This strips citation markers like [1], [2], 【1】 from the output text
# When removing citations, we need to handle spacing to avoid issues like:
# - "text [1] more" -> "text more" (double space)
# - "text [1]." -> "text ." (space before punctuation)
if intermatch_str:
remaining_text = self.curr_segment[match_span[1] :]
# Strip trailing space from intermatch if:
# 1. Remaining text starts with space (avoids double space)
# 2. Remaining text starts with punctuation (avoids space before punctuation)
if intermatch_str[-1].isspace() and remaining_text:
first_char = remaining_text[0]
# Check if next char is space or common punctuation
if first_char.isspace() or first_char in ".,;:!?)]}":
intermatch_str = intermatch_str.rstrip()
if intermatch_str:
yield intermatch_str
self.non_citation_count = 0
# Leftover text could be part of next citation
@@ -338,7 +412,7 @@ class DynamicCitationProcessor:
yield result
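
A standalone sketch of the REMOVE-mode spacing cleanup described above (not the class's exact code path): strip markers like [1] while avoiding doubled spaces and spaces left before punctuation.

import re

def strip_citations(text: str) -> str:
    out = re.sub(r"\[\d+(?:,\s*\d+)*\]", "", text)    # drop the marker itself
    out = re.sub(r"  +", " ", out)                    # collapse doubled spaces
    out = re.sub(r"\s+([.,;:!?\)\]\}])", r"\1", out)  # no space before punctuation
    return out

assert strip_citations("text [1] more") == "text more"
assert strip_citations("text [1].") == "text."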
def _process_citation(
self, match: re.Match, has_leading_space: bool, replace_tokens: bool = True
self, match: re.Match, has_leading_space: bool
) -> tuple[str, list[CitationInfo]]:
"""
Process a single citation match and return formatted citation text and citation info objects.
@@ -349,31 +423,28 @@ class DynamicCitationProcessor:
This method always:
1. Extracts citation numbers from the match
2. Looks up the corresponding SearchDoc from the mapping
3. Tracks seen citations in self.seen_citations (regardless of replace_tokens)
3. Tracks seen citations in self.seen_citations (regardless of citation_mode)
When replace_tokens=True (controlled by self.replace_citation_tokens):
When citation_mode is HYPERLINK:
4. Creates formatted citation text as [[n]](url)
5. Creates CitationInfo objects for new citations
6. Handles deduplication of recently cited documents
When replace_tokens=False:
4. Returns empty string and empty list (caller yields original match text)
When citation_mode is REMOVE or KEEP_MARKERS:
4. Returns empty string and empty list (caller handles output based on mode)
Args:
match: Regex match object containing the citation pattern
has_leading_space: Whether the text immediately before this citation
ends with whitespace. Used to determine if a leading space should
be added to the formatted output.
replace_tokens: If True, return formatted text and CitationInfo objects.
If False, only track seen citations and return empty results.
This is passed from self.replace_citation_tokens by the caller.
Returns:
Tuple of (formatted_citation_text, citation_info_list):
- formatted_citation_text: Markdown-formatted citation text like
"[[1]](https://example.com)" or empty string if replace_tokens=False
"[[1]](https://example.com)" or empty string if not in HYPERLINK mode
- citation_info_list: List of CitationInfo objects for newly cited
documents, or empty list if replace_tokens=False
documents, or empty list if not in HYPERLINK mode
"""
citation_str: str = match.group() # e.g., '[1]', '[1, 2, 3]', '[[1]]', '【1】'
formatted = (
@@ -411,11 +482,11 @@ class DynamicCitationProcessor:
doc_id = search_doc.document_id
link = search_doc.link or ""
# Always track seen citations regardless of replace_tokens setting
# Always track seen citations regardless of citation_mode setting
self.seen_citations[num] = search_doc
# When not replacing citation tokens, skip the rest of the processing
if not replace_tokens:
# Only generate formatted citations and CitationInfo in HYPERLINK mode
if self.citation_mode != CitationMode.HYPERLINK:
continue
# Format the citation text as [[n]](link)
@@ -450,14 +521,14 @@ class DynamicCitationProcessor:
"""
Get the list of cited SearchDoc objects in the order they were first cited.
Note: This list is only populated when `replace_citation_tokens=True`.
When `replace_citation_tokens=False`, this will return an empty list.
Note: This list is only populated when `citation_mode=HYPERLINK`.
When using REMOVE or KEEP_MARKERS mode, this will return an empty list.
Use get_seen_citations() instead if you need to track citations without
replacing them.
emitting CitationInfo objects.
Returns:
List of SearchDoc objects in the order they were first cited.
Empty list if replace_citation_tokens=False.
Empty list if citation_mode is not HYPERLINK.
"""
return self.cited_documents_in_order
@@ -465,14 +536,14 @@ class DynamicCitationProcessor:
"""
Get the list of cited document IDs in the order they were first cited.
Note: This list is only populated when `replace_citation_tokens=True`.
When `replace_citation_tokens=False`, this will return an empty list.
Note: This list is only populated when `citation_mode=HYPERLINK`.
When using REMOVE or KEEP_MARKERS mode, this will return an empty list.
Use get_seen_citations() instead if you need to track citations without
replacing them.
emitting CitationInfo objects.
Returns:
List of document IDs (strings) in the order they were first cited.
Empty list if replace_citation_tokens=False.
Empty list if citation_mode is not HYPERLINK.
"""
return [doc.document_id for doc in self.cited_documents_in_order]
@@ -481,12 +552,12 @@ class DynamicCitationProcessor:
Get all seen citations as a mapping from citation number to SearchDoc.
This returns all citations that have been encountered during processing,
regardless of the `replace_citation_tokens` setting. Citations are tracked
regardless of the `citation_mode` setting. Citations are tracked
whenever they are parsed, making this useful for cases where you need to
know which citations appeared in the text without replacing them.
know which citations appeared in the text without emitting CitationInfo objects.
This is particularly useful when `replace_citation_tokens=False`, as
get_cited_documents() will be empty in that case, but get_seen_citations()
This is particularly useful when using REMOVE or KEEP_MARKERS mode, as
get_cited_documents() will be empty in those cases, but get_seen_citations()
will still contain all the citations that were found.
Returns:
@@ -501,13 +572,13 @@ class DynamicCitationProcessor:
"""
Get the number of unique documents that have been cited.
Note: This count is only updated when `replace_citation_tokens=True`.
When `replace_citation_tokens=False`, this will always return 0.
Note: This count is only updated when `citation_mode=HYPERLINK`.
When using REMOVE or KEEP_MARKERS mode, this will always return 0.
Use len(get_seen_citations()) instead if you need to count citations
without replacing them.
without emitting CitationInfo objects.
Returns:
Number of unique documents cited. 0 if replace_citation_tokens=False.
Number of unique documents cited. 0 if citation_mode is not HYPERLINK.
"""
return len(self.cited_document_ids)
@@ -519,9 +590,9 @@ class DynamicCitationProcessor:
CitationInfo objects for the same document when it's cited multiple times
in close succession. This method clears that tracker.
This is primarily useful when `replace_citation_tokens=True` to allow
This is primarily useful when `citation_mode=HYPERLINK` to allow
previously cited documents to emit CitationInfo objects again. Has no
effect when `replace_citation_tokens=False`.
effect when using REMOVE or KEEP_MARKERS mode.
The recent citation tracker is also automatically cleared when more than
5 non-citation characters are processed between citations.

View File

@@ -5,9 +5,11 @@ from sqlalchemy.orm import Session
from onyx.chat.chat_state import ChatStateContainer
from onyx.chat.chat_utils import create_tool_call_failure_messages
from onyx.chat.citation_processor import CitationMapping
from onyx.chat.citation_processor import CitationMode
from onyx.chat.citation_processor import DynamicCitationProcessor
from onyx.chat.citation_utils import update_citation_processor_from_tool_response
from onyx.chat.emitter import Emitter
from onyx.chat.llm_step import extract_tool_calls_from_response_text
from onyx.chat.llm_step import run_llm_step
from onyx.chat.models import ChatMessageSimple
from onyx.chat.models import ExtractedProjectFiles
@@ -37,6 +39,7 @@ from onyx.tools.built_in_tools import CITEABLE_TOOLS_NAMES
from onyx.tools.built_in_tools import STOPPING_TOOLS_NAMES
from onyx.tools.interface import Tool
from onyx.tools.models import ToolCallInfo
from onyx.tools.models import ToolCallKickoff
from onyx.tools.models import ToolResponse
from onyx.tools.tool_implementations.images.models import (
FinalImageGenerationResponse,
@@ -50,6 +53,78 @@ from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()
def _try_fallback_tool_extraction(
llm_step_result: LlmStepResult,
tool_choice: ToolChoiceOptions,
fallback_extraction_attempted: bool,
tool_defs: list[dict],
turn_index: int,
) -> tuple[LlmStepResult, bool]:
"""Attempt to extract tool calls from response text as a fallback.
This is a last-resort fallback for low-quality LLMs or those without tool-calling
support at the serving layer. It also triggers when there is reasoning but
no answer and no tool calls.
Args:
llm_step_result: The result from the LLM step
tool_choice: The tool choice option used for this step
fallback_extraction_attempted: Whether fallback extraction was already attempted
tool_defs: List of tool definitions
turn_index: The current turn index for placement
Returns:
Tuple of (possibly updated LlmStepResult, whether fallback was attempted this call)
"""
if fallback_extraction_attempted:
return llm_step_result, False
no_tool_calls = (
not llm_step_result.tool_calls or len(llm_step_result.tool_calls) == 0
)
reasoning_but_no_answer_or_tools = (
llm_step_result.reasoning and not llm_step_result.answer and no_tool_calls
)
should_try_fallback = (
tool_choice == ToolChoiceOptions.REQUIRED and no_tool_calls
) or reasoning_but_no_answer_or_tools
if not should_try_fallback:
return llm_step_result, False
# Try to extract from answer first, then fall back to reasoning
extracted_tool_calls: list[ToolCallKickoff] = []
if llm_step_result.answer:
extracted_tool_calls = extract_tool_calls_from_response_text(
response_text=llm_step_result.answer,
tool_definitions=tool_defs,
placement=Placement(turn_index=turn_index),
)
if not extracted_tool_calls and llm_step_result.reasoning:
extracted_tool_calls = extract_tool_calls_from_response_text(
response_text=llm_step_result.reasoning,
tool_definitions=tool_defs,
placement=Placement(turn_index=turn_index),
)
if extracted_tool_calls:
logger.info(
f"Extracted {len(extracted_tool_calls)} tool call(s) from response text "
f"as fallback (tool_choice was REQUIRED but no tool calls returned)"
)
return (
LlmStepResult(
reasoning=llm_step_result.reasoning,
answer=llm_step_result.answer,
tool_calls=extracted_tool_calls,
),
True,
)
return llm_step_result, True
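
The trigger condition above, distilled into a standalone predicate (a sketch, not the production code path):

def should_try_fallback(tool_choice_required: bool, has_tool_calls: bool,
                        has_reasoning: bool, has_answer: bool) -> bool:
    no_tool_calls = not has_tool_calls
    reasoning_but_nothing_else = has_reasoning and not has_answer and no_tool_calls
    return (tool_choice_required and no_tool_calls) or reasoning_but_nothing_else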
# Hardcoded opinionated value; might break down to something like:
# Cycle 1: Calls web_search for something
# Cycle 2: Calls open_url for some results
@@ -297,6 +372,7 @@ def run_llm_loop(
forced_tool_id: int | None = None,
user_identity: LLMUserIdentity | None = None,
chat_session_id: str | None = None,
include_citations: bool = True,
) -> None:
with trace(
"run_llm_loop",
@@ -314,7 +390,13 @@ def run_llm_loop(
initialize_litellm()
# Initialize citation processor for handling citations dynamically
citation_processor = DynamicCitationProcessor()
# When include_citations is True, use HYPERLINK mode to format citations as [[1]](url)
# When include_citations is False, use REMOVE mode to strip citations from output
citation_processor = DynamicCitationProcessor(
citation_mode=(
CitationMode.HYPERLINK if include_citations else CitationMode.REMOVE
)
)
# Add project file citation mappings if project files are present
project_citation_mapping: CitationMapping = {}
@@ -344,6 +426,7 @@ def run_llm_loop(
ran_image_gen: bool = False
just_ran_web_search: bool = False
has_called_search_tool: bool = False
fallback_extraction_attempted: bool = False
citation_mapping: dict[int, str] = {} # Maps citation_num -> document_id/URL
default_base_system_prompt: str = get_default_base_system_prompt(db_session)
@@ -462,10 +545,11 @@ def run_llm_loop(
# This calls the LLM, yields packets (reasoning, answers, etc.) and returns the result
# It also pre-processes the tool calls in preparation for running them
tool_defs = [tool.tool_definition() for tool in final_tools]
llm_step_result, has_reasoned = run_llm_step(
emitter=emitter,
history=truncated_message_history,
tool_definitions=[tool.tool_definition() for tool in final_tools],
tool_definitions=tool_defs,
tool_choice=tool_choice,
llm=llm,
placement=Placement(turn_index=llm_cycle_count + reasoning_cycles),
@@ -480,6 +564,19 @@ def run_llm_loop(
if has_reasoned:
reasoning_cycles += 1
# Fallback extraction for LLMs that don't support native tool calling, or lower-quality models
# that may incorrectly emit tool calls in other channels
llm_step_result, attempted = _try_fallback_tool_extraction(
llm_step_result=llm_step_result,
tool_choice=tool_choice,
fallback_extraction_attempted=fallback_extraction_attempted,
tool_defs=tool_defs,
turn_index=llm_cycle_count + reasoning_cycles,
)
if attempted:
# To prevent excessive looping with bad models, we only allow one fallback attempt
fallback_extraction_attempted = True
# Save citation mapping after each LLM step for incremental state updates
state_container.set_citation_mapping(citation_processor.citation_to_doc)
@@ -505,7 +602,7 @@ def run_llm_loop(
# in-flight citations
# It can be cleaned up but not super trivial or worthwhile right now
just_ran_web_search = False
tool_responses, citation_mapping = run_tool_calls(
parallel_tool_call_results = run_tool_calls(
tool_calls=tool_calls,
tools=final_tools,
message_history=truncated_message_history,
@@ -516,6 +613,8 @@ def run_llm_loop(
max_concurrent_tools=None,
skip_search_query_expansion=has_called_search_tool,
)
tool_responses = parallel_tool_call_results.tool_responses
citation_mapping = parallel_tool_call_results.updated_citation_mapping
# Failure case, give something reasonable to the LLM to try again
if tool_calls and not tool_responses:
@@ -570,6 +669,12 @@ def run_llm_loop(
):
generated_images = tool_response.rich_response.generated_images
saved_response = (
tool_response.rich_response
if isinstance(tool_response.rich_response, str)
else tool_response.llm_facing_response
)
tool_call_info = ToolCallInfo(
parent_tool_call_id=None, # Top-level tool calls are attached to the chat message
turn_index=llm_cycle_count + reasoning_cycles,
@@ -579,7 +684,7 @@ def run_llm_loop(
tool_id=tool.id,
reasoning_tokens=llm_step_result.reasoning, # All tool calls from this loop share the same reasoning
tool_call_arguments=tool_call.tool_args,
tool_call_response=tool_response.llm_facing_response,
tool_call_response=saved_response,
search_docs=search_docs,
generated_images=generated_images,
)
@@ -635,7 +740,12 @@ def run_llm_loop(
should_cite_documents = True
if not llm_step_result or not llm_step_result.answer:
raise RuntimeError("LLM did not return an answer.")
raise RuntimeError(
"The LLM did not return an answer. "
"Typically this is an issue with LLMs that do not support tool calling natively, "
"or the model serving API is not configured correctly. "
"This may also happen with models that are lower quality outputting invalid tool calls."
)
emitter.emit(
Packet(

View File

@@ -1,5 +1,6 @@
import json
import time
import uuid
from collections.abc import Callable
from collections.abc import Generator
from collections.abc import Mapping
@@ -48,6 +49,7 @@ from onyx.tools.models import ToolCallKickoff
from onyx.tracing.framework.create import generation_span
from onyx.utils.b64 import get_image_type_from_bytes
from onyx.utils.logger import setup_logger
from onyx.utils.text_processing import find_all_json_objects
logger = setup_logger()
@@ -136,12 +138,11 @@ def _format_message_history_for_logging(
separator = "================================================"
# Handle string input
if isinstance(message_history, str):
formatted_lines.append("Message [string]:")
formatted_lines.append(separator)
formatted_lines.append(f"{message_history}")
return "\n".join(formatted_lines)
# Handle single ChatCompletionMessage - wrap in list for uniform processing
if isinstance(
message_history, (SystemMessage, UserMessage, AssistantMessage, ToolMessage)
):
message_history = [message_history]
# Handle sequence of messages
for i, msg in enumerate(message_history):
@@ -211,7 +212,8 @@ def _update_tool_call_with_delta(
if index not in tool_calls_in_progress:
tool_calls_in_progress[index] = {
"id": None,
# Fallback ID in case the provider never sends one via deltas.
"id": f"fallback_{uuid.uuid4().hex}",
"name": None,
"arguments": "",
}
@@ -277,6 +279,144 @@ def _extract_tool_call_kickoffs(
return tool_calls
def extract_tool_calls_from_response_text(
response_text: str | None,
tool_definitions: list[dict],
placement: Placement,
) -> list[ToolCallKickoff]:
"""Extract tool calls from LLM response text by matching JSON against tool definitions.
This is a fallback mechanism for when the LLM was expected to return tool calls
but didn't use the proper tool call format. It searches for JSON objects in the
response text that match the structure of available tools.
Args:
response_text: The LLM's text response to search for tool calls
tool_definitions: List of tool definitions to match against
placement: Placement information for the tool calls
Returns:
List of ToolCallKickoff objects for any matched tool calls
"""
if not response_text or not tool_definitions:
return []
# Build a map of tool names to their definitions
tool_name_to_def: dict[str, dict] = {}
for tool_def in tool_definitions:
if tool_def.get("type") == "function" and "function" in tool_def:
func_def = tool_def["function"]
tool_name = func_def.get("name")
if tool_name:
tool_name_to_def[tool_name] = func_def
if not tool_name_to_def:
return []
# Find all JSON objects in the response text
json_objects = find_all_json_objects(response_text)
tool_calls: list[ToolCallKickoff] = []
tab_index = 0
for json_obj in json_objects:
matched_tool_call = _try_match_json_to_tool(json_obj, tool_name_to_def)
if matched_tool_call:
tool_name, tool_args = matched_tool_call
tool_calls.append(
ToolCallKickoff(
tool_call_id=f"extracted_{uuid.uuid4().hex[:8]}",
tool_name=tool_name,
tool_args=tool_args,
placement=Placement(
turn_index=placement.turn_index,
tab_index=tab_index,
sub_turn_index=placement.sub_turn_index,
),
)
)
tab_index += 1
logger.info(
f"Extracted {len(tool_calls)} tool call(s) from response text as fallback"
)
return tool_calls
def _try_match_json_to_tool(
json_obj: dict[str, Any],
tool_name_to_def: dict[str, dict],
) -> tuple[str, dict[str, Any]] | None:
"""Try to match a JSON object to a tool definition.
Supports several formats:
1. Direct tool call format: {"name": "tool_name", "arguments": {...}}
2. Function call format: {"function": {"name": "tool_name", "arguments": {...}}}
3. Tool name as key: {"tool_name": {...arguments...}}
4. Arguments matching a tool's parameter schema
Args:
json_obj: The JSON object to match
tool_name_to_def: Map of tool names to their function definitions
Returns:
Tuple of (tool_name, tool_args) if matched, None otherwise
"""
# Format 1: Direct tool call format {"name": "...", "arguments": {...}}
if "name" in json_obj and json_obj["name"] in tool_name_to_def:
tool_name = json_obj["name"]
arguments = json_obj.get("arguments", json_obj.get("parameters", {}))
if isinstance(arguments, str):
try:
arguments = json.loads(arguments)
except json.JSONDecodeError:
arguments = {}
if isinstance(arguments, dict):
return (tool_name, arguments)
# Format 2: Function call format {"function": {"name": "...", "arguments": {...}}}
if "function" in json_obj and isinstance(json_obj["function"], dict):
func_obj = json_obj["function"]
if "name" in func_obj and func_obj["name"] in tool_name_to_def:
tool_name = func_obj["name"]
arguments = func_obj.get("arguments", func_obj.get("parameters", {}))
if isinstance(arguments, str):
try:
arguments = json.loads(arguments)
except json.JSONDecodeError:
arguments = {}
if isinstance(arguments, dict):
return (tool_name, arguments)
# Format 3: Tool name as key {"tool_name": {...arguments...}}
for tool_name in tool_name_to_def:
if tool_name in json_obj:
arguments = json_obj[tool_name]
if isinstance(arguments, dict):
return (tool_name, arguments)
# Format 4: Check if the JSON object matches a tool's parameter schema
for tool_name, func_def in tool_name_to_def.items():
params = func_def.get("parameters", {})
properties = params.get("properties", {})
required = params.get("required", [])
if not properties:
continue
# Check if all required parameters are present (empty required = all optional)
if all(req in json_obj for req in required):
# Check if any of the tool's properties are in the JSON object
matching_props = [prop for prop in properties if prop in json_obj]
if matching_props:
# Filter to only include known properties
filtered_args = {k: v for k, v in json_obj.items() if k in properties}
return (tool_name, filtered_args)
return None
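
A minimal sketch of the formats matched above, using a hypothetical "web_search" tool definition (the dict shape mirrors the OpenAI-style function schema the code reads):

tool_defs = {
    "web_search": {
        "name": "web_search",
        "parameters": {
            "type": "object",
            "properties": {"query": {"type": "string"}},
            "required": ["query"],
        },
    }
}
# Format 1: direct tool call
assert _try_match_json_to_tool(
    {"name": "web_search", "arguments": {"query": "onyx"}}, tool_defs
) == ("web_search", {"query": "onyx"})
# Format 3: tool name as key
assert _try_match_json_to_tool({"web_search": {"query": "onyx"}}, tool_defs) == (
    "web_search", {"query": "onyx"}
)
# Format 4: bare arguments matching the parameter schema
assert _try_match_json_to_tool({"query": "onyx"}, tool_defs) == (
    "web_search", {"query": "onyx"}
)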
def translate_history_to_llm_format(
history: list[ChatMessageSimple],
llm_config: LLMConfig,
@@ -581,6 +721,18 @@ def run_llm_step_pkt_generator(
}
# Note: LLM cost tracking is now handled in multi_llm.py
delta = packet.choice.delta
# Weird behavior from some model providers; just log and ignore for now
if (
delta.content is None
and delta.reasoning_content is None
and delta.tool_calls is None
):
logger.warning(
f"LLM packet is empty (no contents, reasoning or tool calls). Skipping: {packet}"
)
continue
if not first_action_recorded and _delta_has_action(delta):
span_generation.span_data.time_to_first_action_seconds = (
time.monotonic() - stream_start_time
@@ -840,14 +992,14 @@ def run_llm_step_pkt_generator(
logger.debug(f"Accumulated reasoning: {accumulated_reasoning}")
logger.debug(f"Accumulated answer: {accumulated_answer}")
if tool_calls:
tool_calls_str = "\n".join(
f" - {tc.tool_name}: {json.dumps(tc.tool_args, indent=4)}"
for tc in tool_calls
)
logger.debug(f"Tool calls:\n{tool_calls_str}")
else:
logger.debug("Tool calls: []")
return (
LlmStepResult(

View File

@@ -1,6 +1,5 @@
from collections.abc import Callable
from collections.abc import Iterator
from datetime import datetime
from enum import Enum
from typing import Any
from uuid import UUID
@@ -8,10 +7,7 @@ from uuid import UUID
from pydantic import BaseModel
from pydantic import Field
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import MessageType
from onyx.context.search.enums import QueryFlow
from onyx.context.search.enums import RecencyBiasSetting
from onyx.context.search.enums import SearchType
from onyx.context.search.models import SearchDoc
from onyx.file_store.models import FileDescriptor
@@ -24,25 +20,6 @@ from onyx.tools.models import ToolCallKickoff
from onyx.tools.tool_implementations.custom.base_tool_types import ToolResultType
# First chunk of info for streaming QA
class QADocsResponse(BaseModel):
top_documents: list[SearchDoc]
rephrased_query: str | None = None
predicted_flow: QueryFlow | None
predicted_search: SearchType | None
applied_source_filters: list[DocumentSource] | None
applied_time_cutoff: datetime | None
recency_bias_multiplier: float
def model_dump(self, *args: list, **kwargs: dict[str, Any]) -> dict[str, Any]: # type: ignore
initial_dict = super().model_dump(mode="json", *args, **kwargs) # type: ignore
initial_dict["applied_time_cutoff"] = (
self.applied_time_cutoff.isoformat() if self.applied_time_cutoff else None
)
return initial_dict
class StreamStopReason(Enum):
CONTEXT_LENGTH = "context_length"
CANCELLED = "cancelled"
@@ -70,22 +47,11 @@ class UserKnowledgeFilePacket(BaseModel):
user_files: list[FileDescriptor]
class LLMRelevanceFilterResponse(BaseModel):
llm_selected_doc_indices: list[int]
class RelevanceAnalysis(BaseModel):
relevant: bool
content: str | None = None
class SectionRelevancePiece(RelevanceAnalysis):
"""LLM analysis mapped to an Inference Section"""
document_id: str
chunk_id: int # ID of the center chunk for a given inference section
class DocumentRelevance(BaseModel):
"""Contains all relevance information for a given search"""
@@ -116,12 +82,6 @@ class OnyxAnswer(BaseModel):
answer: str | None
class ThreadMessage(BaseModel):
message: str
sender: str | None = None
role: MessageType = MessageType.USER
class FileChatDisplay(BaseModel):
file_ids: list[str]
@@ -158,7 +118,6 @@ class PersonaOverrideConfig(BaseModel):
num_chunks: float | None = None
llm_relevance_filter: bool = False
llm_filter_extraction: bool = False
recency_bias: RecencyBiasSetting = RecencyBiasSetting.AUTO
llm_model_provider_override: str | None = None
llm_model_version_override: str | None = None

View File

@@ -5,10 +5,13 @@ An overview can be found in the README.md file in this directory.
import re
import traceback
from collections.abc import Callable
from uuid import UUID
from redis.client import Redis
from sqlalchemy.orm import Session
from onyx.chat.chat_processing_checker import set_processing_status
from onyx.chat.chat_state import ChatStateContainer
from onyx.chat.chat_state import run_chat_loop_with_state_containers
from onyx.chat.chat_utils import convert_chat_history
@@ -35,9 +38,10 @@ from onyx.chat.save_chat import save_chat_turn
from onyx.chat.stop_signal_checker import is_connected as check_stop_signal
from onyx.chat.stop_signal_checker import reset_cancel_status
from onyx.configs.constants import DEFAULT_PERSONA_ID
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import MessageType
from onyx.configs.constants import MilestoneRecordType
from onyx.context.search.enums import OptionalSearchSetting
from onyx.context.search.models import BaseFilters
from onyx.context.search.models import CitationDocInfo
from onyx.context.search.models import SearchDoc
from onyx.db.chat import create_new_chat_message
@@ -45,6 +49,9 @@ from onyx.db.chat import get_chat_session_by_id
from onyx.db.chat import get_or_create_root_message
from onyx.db.chat import reserve_message_id
from onyx.db.memory import get_memories
from onyx.db.models import ChatMessage
from onyx.db.models import ChatSession
from onyx.db.models import Persona
from onyx.db.models import User
from onyx.db.projects import get_project_token_count
from onyx.db.projects import get_user_files_from_project
@@ -62,6 +69,7 @@ from onyx.onyxbot.slack.models import SlackContext
from onyx.redis.redis_pool import get_redis_client
from onyx.server.query_and_chat.models import AUTO_PLACE_AFTER_LATEST_MESSAGE
from onyx.server.query_and_chat.models import CreateChatMessageRequest
from onyx.server.query_and_chat.models import OptionalSearchSetting
from onyx.server.query_and_chat.models import SendMessageRequest
from onyx.server.query_and_chat.streaming_models import AgentResponseDelta
from onyx.server.query_and_chat.streaming_models import AgentResponseStart
@@ -78,18 +86,30 @@ from onyx.utils.logger import setup_logger
from onyx.utils.long_term_log import LongTermLogger
from onyx.utils.telemetry import mt_cloud_telemetry
from onyx.utils.timing import log_function_time
from onyx.utils.variable_functionality import (
fetch_versioned_implementation_with_fallback,
)
from onyx.utils.variable_functionality import noop_fallback
from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()
ERROR_TYPE_CANCELLED = "cancelled"
class ToolCallException(Exception):
    """Exception raised for errors during tool calls."""

    def __init__(self, message: str, tool_name: str | None = None):
        super().__init__(message)
        self.tool_name = tool_name

def _should_enable_slack_search(
    persona: Persona,
    filters: BaseFilters | None,
) -> bool:
    """Determine if Slack search should be enabled.

    Returns True if:
    - Source type filter exists and includes Slack, OR
    - Default persona with no source type filter
    """
    source_types = filters.source_type if filters else None
    return (source_types is not None and DocumentSource.SLACK in source_types) or (
        persona.id == DEFAULT_PERSONA_ID and source_types is None
    )
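
A distilled truth sketch of the predicate above, with plain values standing in for the real Persona and BaseFilters models:

def slack_search_enabled(source_types: list[str] | None, persona_id: int,
                         default_persona_id: int = 0) -> bool:
    # DEFAULT_PERSONA_ID is assumed to be 0 here for illustration.
    return (source_types is not None and "slack" in source_types) or (
        persona_id == default_persona_id and source_types is None
    )

assert slack_search_enabled(["slack", "web"], persona_id=7)
assert slack_search_enabled(None, persona_id=0)        # default persona, no filter
assert not slack_search_enabled(["web"], persona_id=0)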
def _extract_project_file_texts_and_images(
@@ -280,6 +300,7 @@ def handle_stream_message_objects(
# on the `new_msg_req.message`. Currently, requires a state where the last message is a
litellm_additional_headers: dict[str, str] | None = None,
custom_tool_additional_headers: dict[str, str] | None = None,
mcp_headers: dict[str, str] | None = None,
bypass_acl: bool = False,
# Additional context that should be included in the chat history, for example:
# Slack threads where the conversation cannot be represented by a chain of User/Assistant
@@ -294,6 +315,8 @@ def handle_stream_message_objects(
tenant_id = get_current_tenant_id()
llm: LLM | None = None
chat_session: ChatSession | None = None
redis_client: Redis | None = None
user_id = user.id if user is not None else None
llm_user_identifier = (
@@ -339,6 +362,24 @@ def handle_stream_message_objects(
event=MilestoneRecordType.MULTIPLE_ASSISTANTS,
)
# Track user message in PostHog for analytics
fetch_versioned_implementation_with_fallback(
module="onyx.utils.telemetry",
attribute="event_telemetry",
fallback=noop_fallback,
)(
distinct_id=user.email if user else tenant_id,
event="user_message_sent",
properties={
"origin": new_msg_req.origin.value,
"has_files": len(new_msg_req.file_descriptors) > 0,
"has_project": chat_session.project_id is not None,
"has_persona": persona is not None and persona.id != DEFAULT_PERSONA_ID,
"deep_research": new_msg_req.deep_research,
"tenant_id": tenant_id,
},
)
llm = get_llm_for_persona(
persona=persona,
user=user,
@@ -380,7 +421,10 @@ def handle_stream_message_objects(
if new_msg_req.parent_message_id == AUTO_PLACE_AFTER_LATEST_MESSAGE:
# Auto-place after the latest message in the chain
parent_message = chat_history[-1] if chat_history else root_message
elif new_msg_req.parent_message_id is None:
elif (
new_msg_req.parent_message_id is None
or new_msg_req.parent_message_id == root_message.id
):
# None = regeneration from root
parent_message = root_message
# Truncate history since we're starting from root
@@ -480,11 +524,15 @@ def handle_stream_message_objects(
),
bypass_acl=bypass_acl,
slack_context=slack_context,
enable_slack_search=_should_enable_slack_search(
persona, new_msg_req.internal_search_filters
),
),
custom_tool_config=CustomToolConfig(
chat_session_id=chat_session.id,
message_id=user_message.id if user_message else None,
additional_headers=custom_tool_additional_headers,
mcp_headers=mcp_headers,
),
allowed_tool_ids=new_msg_req.allowed_tool_ids,
search_usage_forcing_setting=project_search_config.search_usage,
@@ -536,10 +584,27 @@ def handle_stream_message_objects(
def check_is_connected() -> bool:
return check_stop_signal(chat_session.id, redis_client)
set_processing_status(
chat_session_id=chat_session.id,
redis_client=redis_client,
value=True,
)
# Use external state container if provided, otherwise create internal one
# External container allows non-streaming callers to access accumulated state
state_container = external_state_container or ChatStateContainer()
def llm_loop_completion_callback(
state_container: ChatStateContainer,
) -> None:
llm_loop_completion_handle(
state_container=state_container,
db_session=db_session,
chat_session_id=str(chat_session.id),
is_connected=check_is_connected,
assistant_message=assistant_response,
)
# Run the LLM loop with explicit wrapper for stop signal handling
# The wrapper runs run_llm_loop in a background thread and polls every 300ms
# for stop signals. run_llm_loop itself doesn't know about stopping.
@@ -555,6 +620,7 @@ def handle_stream_message_objects(
yield from run_chat_loop_with_state_containers(
run_deep_research_llm_loop,
llm_loop_completion_callback,
is_connected=check_is_connected,
emitter=emitter,
state_container=state_container,
@@ -571,6 +637,7 @@ def handle_stream_message_objects(
else:
yield from run_chat_loop_with_state_containers(
run_llm_loop,
llm_loop_completion_callback,
is_connected=check_is_connected, # Not passed through to run_llm_loop
emitter=emitter,
state_container=state_container,
@@ -586,53 +653,9 @@ def handle_stream_message_objects(
forced_tool_id=forced_tool_id,
user_identity=user_identity,
chat_session_id=str(chat_session.id),
include_citations=new_msg_req.include_citations,
)
# Determine if stopped by user
completed_normally = check_is_connected()
if not completed_normally:
logger.debug(f"Chat session {chat_session.id} stopped by user")
# Build final answer based on completion status
if completed_normally:
if state_container.answer_tokens is None:
raise RuntimeError(
"LLM run completed normally but did not return an answer."
)
final_answer = state_container.answer_tokens
else:
# Stopped by user - append stop message
if state_container.answer_tokens:
final_answer = (
state_container.answer_tokens
+ " ... The generation was stopped by the user here."
)
else:
final_answer = "The generation was stopped by the user."
# Build citation_docs_info from accumulated citations in state container
citation_docs_info: list[CitationDocInfo] = []
seen_citation_nums: set[int] = set()
for citation_num, search_doc in state_container.citation_to_doc.items():
if citation_num not in seen_citation_nums:
seen_citation_nums.add(citation_num)
citation_docs_info.append(
CitationDocInfo(
search_doc=search_doc,
citation_number=citation_num,
)
)
save_chat_turn(
message_text=final_answer,
reasoning_tokens=state_container.reasoning_tokens,
citation_docs_info=citation_docs_info,
tool_calls=state_container.tool_calls,
db_session=db_session,
assistant_message=assistant_response,
is_clarification=state_container.is_clarification,
)
except ValueError as e:
logger.exception("Failed to process chat message.")
@@ -650,15 +673,7 @@ def handle_stream_message_objects(
error_msg = str(e)
stack_trace = traceback.format_exc()
if isinstance(e, ToolCallException):
yield StreamingError(
error=error_msg,
stack_trace=stack_trace,
error_code="TOOL_CALL_FAILED",
is_retryable=True,
details={"tool_name": e.tool_name} if e.tool_name else None,
)
elif llm:
if llm:
client_error_msg, error_code, is_retryable = litellm_exception_to_error_msg(
e, llm
)
@@ -690,7 +705,67 @@ def handle_stream_message_objects(
)
db_session.rollback()
return
finally:
try:
if redis_client is not None and chat_session is not None:
set_processing_status(
chat_session_id=chat_session.id,
redis_client=redis_client,
value=False,
)
except Exception:
logger.exception("Error in setting processing status")
def llm_loop_completion_handle(
state_container: ChatStateContainer,
is_connected: Callable[[], bool],
db_session: Session,
chat_session_id: str,
assistant_message: ChatMessage,
) -> None:
# Determine if stopped by user
completed_normally = is_connected()
# Build final answer based on completion status
if completed_normally:
if state_container.answer_tokens is None:
raise RuntimeError(
"LLM run completed normally but did not return an answer."
)
final_answer = state_container.answer_tokens
else:
# Stopped by user - append stop message
logger.debug(f"Chat session {chat_session_id} stopped by user")
if state_container.answer_tokens:
final_answer = (
state_container.answer_tokens
+ " ... \n\nGeneration was stopped by the user."
)
else:
final_answer = "The generation was stopped by the user."
# Build citation_docs_info from accumulated citations in state container
citation_docs_info: list[CitationDocInfo] = []
seen_citation_nums: set[int] = set()
for citation_num, search_doc in state_container.citation_to_doc.items():
if citation_num not in seen_citation_nums:
seen_citation_nums.add(citation_num)
citation_docs_info.append(
CitationDocInfo(
search_doc=search_doc,
citation_number=citation_num,
)
)
save_chat_turn(
message_text=final_answer,
reasoning_tokens=state_container.reasoning_tokens,
citation_docs_info=citation_docs_info,
tool_calls=state_container.tool_calls,
db_session=db_session,
assistant_message=assistant_message,
is_clarification=state_container.is_clarification,
)
def stream_chat_message_objects(
@@ -739,6 +814,8 @@ def stream_chat_message_objects(
deep_research=new_msg_req.deep_research,
parent_message_id=new_msg_req.parent_message_id,
chat_session_id=new_msg_req.chat_session_id,
origin=new_msg_req.origin,
include_citations=new_msg_req.include_citations,
)
return handle_stream_message_objects(
new_msg_req=translated_new_msg_req,

View File

@@ -18,6 +18,7 @@ from onyx.prompts.prompt_utils import handle_onyx_date_awareness
from onyx.prompts.prompt_utils import replace_citation_guidance_tag
from onyx.prompts.tool_prompts import GENERATE_IMAGE_GUIDANCE
from onyx.prompts.tool_prompts import INTERNAL_SEARCH_GUIDANCE
from onyx.prompts.tool_prompts import MEMORY_GUIDANCE
from onyx.prompts.tool_prompts import OPEN_URLS_GUIDANCE
from onyx.prompts.tool_prompts import PYTHON_TOOL_GUIDANCE
from onyx.prompts.tool_prompts import TOOL_DESCRIPTION_SEARCH_GUIDANCE
@@ -28,6 +29,7 @@ from onyx.tools.interface import Tool
from onyx.tools.tool_implementations.images.image_generation_tool import (
ImageGenerationTool,
)
from onyx.tools.tool_implementations.memory.memory_tool import MemoryTool
from onyx.tools.tool_implementations.open_url.open_url_tool import OpenURLTool
from onyx.tools.tool_implementations.python.python_tool import PythonTool
from onyx.tools.tool_implementations.search.search_tool import SearchTool
@@ -178,8 +180,9 @@ def build_system_prompt(
site_colon_disabled=WEB_SEARCH_SITE_DISABLED_GUIDANCE
)
+ OPEN_URLS_GUIDANCE
+ PYTHON_TOOL_GUIDANCE
+ GENERATE_IMAGE_GUIDANCE
+ MEMORY_GUIDANCE
)
return system_prompt
@@ -193,6 +196,7 @@ def build_system_prompt(
has_generate_image = any(
isinstance(tool, ImageGenerationTool) for tool in tools
)
has_memory = any(isinstance(tool, MemoryTool) for tool in tools)
if has_web_search or has_internal_search or include_all_guidance:
system_prompt += TOOL_DESCRIPTION_SEARCH_GUIDANCE
@@ -222,4 +226,7 @@ def build_system_prompt(
if has_generate_image or include_all_guidance:
system_prompt += GENERATE_IMAGE_GUIDANCE
if has_memory or include_all_guidance:
system_prompt += MEMORY_GUIDANCE
return system_prompt

View File

@@ -22,6 +22,14 @@ APP_PORT = 8080
# prefix from requests directed towards the API server. In these cases, set this to `/api`
APP_API_PREFIX = os.environ.get("API_PREFIX", "")
# Certain services need to make HTTP requests to the API server, such as the MCP server and Discord bot
API_SERVER_PROTOCOL = os.environ.get("API_SERVER_PROTOCOL", "http")
API_SERVER_HOST = os.environ.get("API_SERVER_HOST", "127.0.0.1")
# This override allows self-hosting the MCP server with Onyx Cloud backend.
API_SERVER_URL_OVERRIDE_FOR_HTTP_REQUESTS = os.environ.get(
"API_SERVER_URL_OVERRIDE_FOR_HTTP_REQUESTS"
)
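
A hypothetical helper (not part of this diff) showing how a service could resolve its API base URL from the settings above:

def get_api_server_url() -> str:
    if API_SERVER_URL_OVERRIDE_FOR_HTTP_REQUESTS:
        return API_SERVER_URL_OVERRIDE_FOR_HTTP_REQUESTS
    return f"{API_SERVER_PROTOCOL}://{API_SERVER_HOST}:{APP_PORT}"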
# Whether to send user metadata (user_id/email and session_id) to the LLM provider.
# Disabled by default.
SEND_USER_METADATA_TO_LLM_PROVIDER = (
@@ -568,6 +576,7 @@ JIRA_CONNECTOR_LABELS_TO_SKIP = [
JIRA_CONNECTOR_MAX_TICKET_SIZE = int(
os.environ.get("JIRA_CONNECTOR_MAX_TICKET_SIZE", 100 * 1024)
)
JIRA_SLIM_PAGE_SIZE = int(os.environ.get("JIRA_SLIM_PAGE_SIZE", 500))
GONG_CONNECTOR_START_TIME = os.environ.get("GONG_CONNECTOR_START_TIME")
@@ -849,6 +858,7 @@ AZURE_IMAGE_DEPLOYMENT_NAME = os.environ.get(
# configurable image model
IMAGE_MODEL_NAME = os.environ.get("IMAGE_MODEL_NAME", "gpt-image-1")
IMAGE_MODEL_PROVIDER = os.environ.get("IMAGE_MODEL_PROVIDER", "openai")
# Use managed Vespa (Vespa Cloud). If set, must also set VESPA_CLOUD_URL, VESPA_CLOUD_CERT_PATH and VESPA_CLOUD_KEY_PATH
MANAGED_VESPA = os.environ.get("MANAGED_VESPA", "").lower() == "true"
@@ -995,3 +1005,14 @@ COHERE_DEFAULT_API_KEY = os.environ.get("COHERE_DEFAULT_API_KEY")
VERTEXAI_DEFAULT_CREDENTIALS = os.environ.get("VERTEXAI_DEFAULT_CREDENTIALS")
VERTEXAI_DEFAULT_LOCATION = os.environ.get("VERTEXAI_DEFAULT_LOCATION", "global")
OPENROUTER_DEFAULT_API_KEY = os.environ.get("OPENROUTER_DEFAULT_API_KEY")
INSTANCE_TYPE = (
"managed"
if os.environ.get("IS_MANAGED_INSTANCE", "").lower() == "true"
else "cloud" if AUTH_TYPE == AuthType.CLOUD else "self_hosted"
)
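
The nested conditional above, restated as a small function for clarity (a sketch; the real code reads environment variables directly):

def resolve_instance_type(is_managed: bool, auth_is_cloud: bool) -> str:
    if is_managed:
        return "managed"
    return "cloud" if auth_is_cloud else "self_hosted"

assert resolve_instance_type(True, False) == "managed"
assert resolve_instance_type(False, True) == "cloud"
assert resolve_instance_type(False, False) == "self_hosted"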
## Discord Bot Configuration
DISCORD_BOT_TOKEN = os.environ.get("DISCORD_BOT_TOKEN")
DISCORD_BOT_INVOKE_CHAR = os.environ.get("DISCORD_BOT_INVOKE_CHAR", "!")

View File

@@ -12,9 +12,6 @@ NUM_POSTPROCESSED_RESULTS = 20
# May be less depending on model
MAX_CHUNKS_FED_TO_CHAT = int(os.environ.get("MAX_CHUNKS_FED_TO_CHAT") or 25)
# Maximum percentage of the context window to fill with selected sections
SELECTED_SECTIONS_MAX_WINDOW_PERCENTAGE = 0.8
# 1 / (1 + DOC_TIME_DECAY * doc-age-in-years), set to 0 to have no decay
# Capped in Vespa at 0.5
DOC_TIME_DECAY = float(
@@ -27,11 +24,6 @@ FAVOR_RECENT_DECAY_MULTIPLIER = 2.0
# Currently only applies to search flow not chat
CONTEXT_CHUNKS_ABOVE = int(os.environ.get("CONTEXT_CHUNKS_ABOVE") or 1)
CONTEXT_CHUNKS_BELOW = int(os.environ.get("CONTEXT_CHUNKS_BELOW") or 1)
DISABLE_LLM_QUERY_REPHRASE = (
os.environ.get("DISABLE_LLM_QUERY_REPHRASE", "").lower() == "true"
)
# 1 edit per 20 characters, currently unused due to fuzzy match being too slow
QUOTE_ALLOWED_ERROR_PERCENT = 0.05
QA_TIMEOUT = int(os.environ.get("QA_TIMEOUT") or "60") # 60 seconds
# Weighting factor between Vector and Keyword Search, 1 for completely vector search
HYBRID_ALPHA = max(0, min(1, float(os.environ.get("HYBRID_ALPHA") or 0.5)))
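
A hedged illustration of how such an alpha is typically applied (the actual ranking expression lives in Vespa and may differ):

def hybrid_score(vector_score: float, keyword_score: float, alpha: float = 0.5) -> float:
    # alpha = 1.0 -> pure vector search; alpha = 0.0 -> pure keyword search
    return alpha * vector_score + (1 - alpha) * keyword_score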
@@ -46,34 +38,6 @@ TITLE_CONTENT_RATIO = max(
0, min(1, float(os.environ.get("TITLE_CONTENT_RATIO") or 0.10))
)
# A list of languages passed to the LLM to rephase the query
# For example "English,French,Spanish", be sure to use the "," separator
# TODO these are not used, should probably reintroduce these
MULTILINGUAL_QUERY_EXPANSION = os.environ.get("MULTILINGUAL_QUERY_EXPANSION") or None
LANGUAGE_HINT = "\n" + (
os.environ.get("LANGUAGE_HINT")
or "IMPORTANT: Respond in the same language as my query!"
)
LANGUAGE_CHAT_NAMING_HINT = (
os.environ.get("LANGUAGE_CHAT_NAMING_HINT")
or "The name of the conversation must be in the same language as the user query."
)
# Number of prompts each persona should have
NUM_PERSONA_PROMPTS = 4
NUM_PERSONA_PROMPT_GENERATION_CHUNKS = 5
# Agentic search takes significantly more tokens and therefore has much higher cost.
# This configuration allows users to get a search-only experience with instant results
# and no involvement from the LLM.
# Additionally, some LLM providers have strict rate limits which may prohibit
# sending many API requests at once (as is done in agentic search).
# Whether the LLM should evaluate all of the document chunks passed in for usefulness
# in relation to the user query
DISABLE_LLM_DOC_RELEVANCE = (
os.environ.get("DISABLE_LLM_DOC_RELEVANCE", "").lower() == "true"
)
# Stops streaming answers back to the UI if this pattern is seen:
STOP_STREAM_PAT = os.environ.get("STOP_STREAM_PAT") or None
@@ -86,9 +50,6 @@ HARD_DELETE_CHATS = os.environ.get("HARD_DELETE_CHATS", "").lower() == "true"
NUM_INTERNET_SEARCH_RESULTS = int(os.environ.get("NUM_INTERNET_SEARCH_RESULTS") or 10)
NUM_INTERNET_SEARCH_CHUNKS = int(os.environ.get("NUM_INTERNET_SEARCH_CHUNKS") or 50)
# Enable in-house model for detecting connector-based filtering in queries
ENABLE_CONNECTOR_CLASSIFIER = os.environ.get("ENABLE_CONNECTOR_CLASSIFIER", False)
VESPA_SEARCHER_THREADS = int(os.environ.get("VESPA_SEARCHER_THREADS") or 2)
# Whether or not to use the semantic & keyword search expansions for Basic Search
@@ -96,5 +57,3 @@ USE_SEMANTIC_KEYWORD_EXPANSIONS_BASIC_SEARCH = (
os.environ.get("USE_SEMANTIC_KEYWORD_EXPANSIONS_BASIC_SEARCH", "false").lower()
== "true"
)
USE_DIV_CON_AGENT = os.environ.get("USE_DIV_CON_AGENT", "false").lower() == "true"

View File

@@ -7,6 +7,7 @@ from enum import Enum
ONYX_DEFAULT_APPLICATION_NAME = "Onyx"
ONYX_DISCORD_URL = "https://discord.gg/4NA5SbzrWb"
ONYX_UTM_SOURCE = "onyx_app"
SLACK_USER_TOKEN_PREFIX = "xoxp-"
SLACK_BOT_TOKEN_PREFIX = "xoxb-"
ONYX_EMAILABLE_LOGO_MAX_DIM = 512
@@ -22,6 +23,9 @@ PUBLIC_DOC_PAT = "PUBLIC"
ID_SEPARATOR = ":;:"
DEFAULT_BOOST = 0
# Tag for endpoints that should be included in the public API documentation
PUBLIC_API_TAGS: list[str | Enum] = ["public"]
# Cookies
FASTAPI_USERS_AUTH_COOKIE_NAME = (
"fastapiusersauth" # Currently a constant, but logic allows for configuration
@@ -89,6 +93,7 @@ SSL_CERT_FILE = "bundle.pem"
DANSWER_API_KEY_PREFIX = "API_KEY__"
DANSWER_API_KEY_DUMMY_EMAIL_DOMAIN = "onyxapikey.ai"
UNNAMED_KEY_PLACEHOLDER = "Unnamed"
DISCORD_SERVICE_API_KEY_NAME = "discord-bot-service"
# Key-Value store keys
KV_REINDEX_KEY = "needs_reindexing"
@@ -235,6 +240,7 @@ class NotificationType(str, Enum):
PERSONA_SHARED = "persona_shared"
TRIAL_ENDS_TWO_DAYS = "two_day_trial_ending" # 2 days left in trial
RELEASE_NOTES = "release_notes"
ASSISTANT_FILES_READY = "assistant_files_ready"
class BlobType(str, Enum):
@@ -422,6 +428,9 @@ class OnyxRedisLocks:
USER_FILE_DELETE_BEAT_LOCK = "da_lock:check_user_file_delete_beat"
USER_FILE_DELETE_LOCK_PREFIX = "da_lock:user_file_delete"
# Release notes
RELEASE_NOTES_FETCH_LOCK = "da_lock:release_notes_fetch"
class OnyxRedisSignals:
BLOCK_VALIDATE_INDEXING_FENCES = "signal:block_validate_indexing_fences"

View File

@@ -4,8 +4,6 @@ import os
# Onyx Slack Bot Configs
#####
ONYX_BOT_NUM_RETRIES = int(os.environ.get("ONYX_BOT_NUM_RETRIES", "5"))
# How much of the available input context can be used for thread context
MAX_THREAD_CONTEXT_PERCENTAGE = 512 * 2 / 3072
# Number of docs to display in "Reference Documents"
ONYX_BOT_NUM_DOCS_TO_DISPLAY = int(os.environ.get("ONYX_BOT_NUM_DOCS_TO_DISPLAY", "5"))
# If the LLM fails to answer, Onyx can still show the "Reference Documents"
@@ -47,10 +45,6 @@ ONYX_BOT_MAX_WAIT_TIME = int(os.environ.get("ONYX_BOT_MAX_WAIT_TIME") or 180)
# Time (in minutes) after which a Slack message is sent to the user to remind them to give feedback.
# Set to 0 to disable it (default)
ONYX_BOT_FEEDBACK_REMINDER = int(os.environ.get("ONYX_BOT_FEEDBACK_REMINDER") or 0)
# Set to True to rephrase the Slack users messages
ONYX_BOT_REPHRASE_MESSAGE = (
os.environ.get("ONYX_BOT_REPHRASE_MESSAGE", "").lower() == "true"
)
# ONYX_BOT_RESPONSE_LIMIT_PER_TIME_PERIOD is the number of
# responses OnyxBot can send in a given time period.

View File

@@ -93,7 +93,7 @@ if __name__ == "__main__":
#### Docs Changes
Create the new connector page (with guiding images!) with how to get the connector credentials and how to set up the
connector in Onyx. Then create a Pull Request in https://github.com/onyx-dot-app/onyx-docs.
connector in Onyx. Then create a Pull Request in [https://github.com/onyx-dot-app/documentation](https://github.com/onyx-dot-app/documentation).
### Before opening PR

View File

@@ -25,11 +25,17 @@ class AsanaConnector(LoadConnector, PollConnector):
batch_size: int = INDEX_BATCH_SIZE,
continue_on_failure: bool = CONTINUE_ON_CONNECTOR_FAILURE,
) -> None:
self.workspace_id = asana_workspace_id
self.project_ids_to_index: list[str] | None = (
asana_project_ids.split(",") if asana_project_ids is not None else None
)
self.asana_team_id = asana_team_id
self.workspace_id = asana_workspace_id.strip()
if asana_project_ids:
project_ids = [
project_id.strip()
for project_id in asana_project_ids.split(",")
if project_id.strip()
]
self.project_ids_to_index = project_ids or None
else:
self.project_ids_to_index = None
self.asana_team_id = (asana_team_id.strip() or None) if asana_team_id else None
self.batch_size = batch_size
self.continue_on_failure = continue_on_failure
logger.info(

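For illustration, a minimal standalone sketch of the ID-cleaning behavior introduced above; the helper name and sample inputs are hypothetical:

def _clean_project_ids(asana_project_ids: str | None) -> list[str] | None:
    # Strip whitespace around each comma-separated ID and drop empties,
    # so " 123, 456 ,," becomes ["123", "456"] and blank input becomes None.
    if not asana_project_ids:
        return None
    project_ids = [
        project_id.strip()
        for project_id in asana_project_ids.split(",")
        if project_id.strip()
    ]
    return project_ids or None

assert _clean_project_ids(" 123, 456 ,,") == ["123", "456"]
assert _clean_project_ids("   ") is None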
View File

@@ -901,13 +901,16 @@ class OnyxConfluence:
space_key: str,
) -> list[dict[str, Any]]:
"""
This is a confluence server specific method that can be used to
This is a confluence server/data center specific method that can be used to
fetch the permissions of a space.
This provides better logging than calling the get_space_permissions method
because it returns a jsonrpc response.
TODO: Make this call these endpoints for newer confluence versions:
- /rest/api/space/{spaceKey}/permissions
- /rest/api/space/{spaceKey}/permissions/anonymous
NOTE: This uses the JSON-RPC API which is the ONLY way to get space permissions
on Confluence Server/Data Center. The REST API equivalent (expand=permissions)
is Cloud-only and not available on Data Center as of version 8.9.x.
If this fails with 401 Unauthorized, the customer needs to enable JSON-RPC:
Confluence Admin -> General Configuration -> Further Configuration
-> Enable "Remote API (XML-RPC & SOAP)"
"""
url = "rpc/json-rpc/confluenceservice-v2"
data = {
@@ -916,7 +919,18 @@ class OnyxConfluence:
"id": 7,
"params": [space_key],
}
response = self.post(url, data=data)
try:
response = self.post(url, data=data)
except HTTPError as e:
if e.response is not None and e.response.status_code == 401:
raise HTTPError(
"Unauthorized (401) when calling JSON-RPC API for space permissions. "
"This is likely because the Remote API is disabled. "
"To fix: Confluence Admin -> General Configuration -> Further Configuration "
"-> Enable 'Remote API (XML-RPC & SOAP)'",
response=e.response,
) from e
raise
logger.debug(f"jsonrpc response: {response}")
if not response.get("result"):
logger.warning(

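For context, a standalone sketch of such a JSON-RPC request using requests; the base URL and credentials are placeholders, and the getSpacePermissionSets method name is an assumption rather than taken from the hunk above:

import requests

def fetch_space_permissions(base_url: str, auth: tuple[str, str], space_key: str) -> dict:
    # Placeholder endpoint/auth; "getSpacePermissionSets" is an assumed method name.
    payload = {
        "jsonrpc": "2.0",
        "method": "getSpacePermissionSets",
        "id": 7,
        "params": [space_key],
    }
    response = requests.post(
        f"{base_url}/rpc/json-rpc/confluenceservice-v2", json=payload, auth=auth
    )
    if response.status_code == 401:
        # Mirrors the guidance above: enable "Remote API (XML-RPC & SOAP)" under
        # Confluence Admin -> General Configuration -> Further Configuration.
        raise RuntimeError("JSON-RPC is disabled; enable the Remote API in Confluence admin")
    response.raise_for_status()
    return response.json()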
View File

@@ -97,10 +97,17 @@ def basic_expert_info_representation(info: BasicExpertInfo) -> str | None:
def get_experts_stores_representations(
experts: list[BasicExpertInfo] | None,
) -> list[str] | None:
"""Gets string representations of experts supplied.
If an expert cannot be represented as a string, it is omitted from the
result.
"""
if not experts:
return None
reps = [basic_expert_info_representation(owner) for owner in experts]
reps: list[str | None] = [
basic_expert_info_representation(owner) for owner in experts
]
return [owner for owner in reps if owner is not None]

View File

@@ -18,6 +18,7 @@ from typing_extensions import override
from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.app_configs import JIRA_CONNECTOR_LABELS_TO_SKIP
from onyx.configs.app_configs import JIRA_CONNECTOR_MAX_TICKET_SIZE
from onyx.configs.app_configs import JIRA_SLIM_PAGE_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
is_atlassian_date_error,
@@ -57,7 +58,6 @@ logger = setup_logger()
ONE_HOUR = 3600
_MAX_RESULTS_FETCH_IDS = 5000 # 5000
_JIRA_SLIM_PAGE_SIZE = 500
_JIRA_FULL_PAGE_SIZE = 50
# Constants for Jira field names
@@ -683,7 +683,7 @@ class JiraConnector(
jira_client=self.jira_client,
jql=jql,
start=current_offset,
max_results=_JIRA_SLIM_PAGE_SIZE,
max_results=JIRA_SLIM_PAGE_SIZE,
all_issue_ids=checkpoint.all_issue_ids,
checkpoint_callback=checkpoint_callback,
nextPageToken=checkpoint.cursor,
@@ -703,11 +703,11 @@ class JiraConnector(
)
)
current_offset += 1
if len(slim_doc_batch) >= _JIRA_SLIM_PAGE_SIZE:
if len(slim_doc_batch) >= JIRA_SLIM_PAGE_SIZE:
yield slim_doc_batch
slim_doc_batch = []
self.update_checkpoint_for_next_run(
checkpoint, current_offset, prev_offset, _JIRA_SLIM_PAGE_SIZE
checkpoint, current_offset, prev_offset, JIRA_SLIM_PAGE_SIZE
)
prev_offset = current_offset

View File

@@ -161,6 +161,8 @@ class DocumentBase(BaseModel):
sections: list[TextSection | ImageSection]
source: DocumentSource | None = None
semantic_identifier: str # displayed in the UI as the main identifier for the doc
# TODO(andrei): Ideally we could improve this to where each value is just a
# list of strings.
metadata: dict[str, str | list[str]]
# UTC time
@@ -202,13 +204,7 @@ class DocumentBase(BaseModel):
if not self.metadata:
return None
# Combined string for the key/value for easy filtering
attributes: list[str] = []
for k, v in self.metadata.items():
if isinstance(v, list):
attributes.extend([k + INDEX_SEPARATOR + vi for vi in v])
else:
attributes.append(k + INDEX_SEPARATOR + v)
return attributes
return convert_metadata_dict_to_list_of_strings(self.metadata)
def __sizeof__(self) -> int:
size = sys.getsizeof(self.id)
@@ -240,6 +236,66 @@ class DocumentBase(BaseModel):
return " ".join([section.text for section in self.sections if section.text])
def convert_metadata_dict_to_list_of_strings(
metadata: dict[str, str | list[str]],
) -> list[str]:
"""Converts a metadata dict to a list of strings.
Each string is a key-value pair separated by the INDEX_SEPARATOR. If a key
points to a list of values, each value generates a unique pair.
Args:
metadata: The metadata dict to convert where values can be either a
string or a list of strings.
Returns:
A list of strings where each string is a key-value pair separated by the
INDEX_SEPARATOR.
"""
attributes: list[str] = []
for k, v in metadata.items():
if isinstance(v, list):
attributes.extend([k + INDEX_SEPARATOR + vi for vi in v])
else:
attributes.append(k + INDEX_SEPARATOR + v)
return attributes
def convert_metadata_list_of_strings_to_dict(
metadata_list: list[str],
) -> dict[str, str | list[str]]:
"""
Converts a list of strings to a metadata dict. The inverse of
convert_metadata_dict_to_list_of_strings.
Assumes the input strings are formatted as in the output of
convert_metadata_dict_to_list_of_strings.
The schema of the output metadata dict is suboptimal yet bound to legacy
code. Ideally each key would just point to a list of strings, where each
list might contain just one element.
Args:
metadata_list: The list of strings to convert to a metadata dict.
Returns:
A metadata dict where values can be either a string or a list of
strings.
"""
metadata: dict[str, str | list[str]] = {}
for item in metadata_list:
key, value = item.split(INDEX_SEPARATOR, 1)
if key in metadata:
# We have already seen this key, so accumulate its values into a list.
if isinstance(metadata[key], list):
cast(list[str], metadata[key]).append(value)
else:
metadata[key] = [cast(str, metadata[key]), value]
else:
metadata[key] = value
return metadata
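As a quick round-trip illustration of the two helpers (standalone, with a stand-in separator, since INDEX_SEPARATOR is defined elsewhere):

SEP = "|"  # stand-in for INDEX_SEPARATOR, purely for illustration

metadata: dict[str, str | list[str]] = {"team": "search", "tags": ["jira", "slack"]}

# dict -> list of "key<sep>value" strings, one entry per value
as_list = [
    f"{key}{SEP}{value}"
    for key, values in metadata.items()
    for value in (values if isinstance(values, list) else [values])
]
# ["team|search", "tags|jira", "tags|slack"]

# list -> dict, promoting repeated keys to lists (the inverse operation)
roundtrip: dict[str, str | list[str]] = {}
for item in as_list:
    key, value = item.split(SEP, 1)
    if key not in roundtrip:
        roundtrip[key] = value
    elif isinstance(roundtrip[key], list):
        roundtrip[key].append(value)
    else:
        roundtrip[key] = [roundtrip[key], value]
assert roundtrip == metadata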
class Document(DocumentBase):
"""Used for Onyx ingestion api, the ID is required"""

View File

@@ -13,13 +13,6 @@ class RecencyBiasSetting(str, Enum):
AUTO = "auto"
class OptionalSearchSetting(str, Enum):
ALWAYS = "always"
NEVER = "never"
# Determine whether to run search based on history and latest query
AUTO = "auto"
class QueryType(str, Enum):
"""
The type of first-pass query to use for hybrid search.
@@ -36,15 +29,3 @@ class SearchType(str, Enum):
KEYWORD = "keyword"
SEMANTIC = "semantic"
INTERNET = "internet"
class LLMEvaluationType(str, Enum):
AGENTIC = "agentic" # applies agentic evaluation
BASIC = "basic" # applies boolean evaluation
SKIP = "skip" # skips evaluation
UNSPECIFIED = "unspecified" # reverts to default
class QueryFlow(str, Enum):
SEARCH = "search"
QUESTION_ANSWER = "question-answer"

View File

@@ -31,7 +31,6 @@ from onyx.context.search.federated.slack_search_utils import is_recency_query
from onyx.context.search.federated.slack_search_utils import should_include_message
from onyx.context.search.models import ChunkIndexRequest
from onyx.context.search.models import InferenceChunk
from onyx.context.search.models import SearchQuery
from onyx.db.document import DocumentSource
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.document_index_utils import (
@@ -425,7 +424,6 @@ class SlackQueryResult(BaseModel):
def query_slack(
query_string: str,
original_query: SearchQuery,
access_token: str,
limit: int | None = None,
allowed_private_channel: str | None = None,
@@ -456,7 +454,7 @@ def query_slack(
logger.info(f"Final query to slack: {final_query}")
# Detect if query asks for most recent results
sort_by_time = is_recency_query(original_query.query)
sort_by_time = is_recency_query(query_string)
slack_client = WebClient(token=access_token)
try:
@@ -536,8 +534,7 @@ def query_slack(
)
document_id = f"{channel_id}_{message_id}"
# compute recency bias (parallels vespa calculation) and metadata
decay_factor = DOC_TIME_DECAY * original_query.recency_bias_multiplier
decay_factor = DOC_TIME_DECAY
doc_time = datetime.fromtimestamp(float(message_id))
doc_age_years = (datetime.now() - doc_time).total_seconds() / (
365 * 24 * 60 * 60
@@ -1002,7 +999,6 @@ def slack_retrieval(
query_slack,
(
query_string,
query,
access_token,
query_limit,
allowed_private_channel,
@@ -1045,7 +1041,6 @@ def slack_retrieval(
query_slack,
(
query_string,
query,
access_token,
query_limit,
allowed_private_channel,
@@ -1225,7 +1220,6 @@ def slack_retrieval(
source_type=DocumentSource.SLACK,
title=chunk.title_prefix,
boost=0,
recency_bias=docid_to_message[document_id].recency_bias,
score=convert_slack_score(docid_to_message[document_id].slack_score),
hidden=False,
is_relevant=None,

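A worked sketch of the recency weighting this hunk simplifies, following the formula noted in the configs above (1 / (1 + DOC_TIME_DECAY * doc-age-in-years)); the decay value here is illustrative only:

from datetime import datetime

DOC_TIME_DECAY = 0.5  # illustrative; the real value is configured elsewhere

def recency_bias(message_ts: float) -> float:
    # Slack message IDs are epoch timestamps, so document age can be derived
    # directly, paralleling the Vespa-side calculation.
    doc_age_years = (
        datetime.now() - datetime.fromtimestamp(message_ts)
    ).total_seconds() / (365 * 24 * 60 * 60)
    return 1 / (1 + DOC_TIME_DECAY * max(doc_age_years, 0.0))

# A two-year-old message gets 1 / (1 + 0.5 * 2) = 0.5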
View File

@@ -13,6 +13,7 @@ from onyx.context.search.federated.models import ChannelMetadata
from onyx.context.search.models import ChunkIndexRequest
from onyx.federated_connectors.slack.models import SlackEntities
from onyx.llm.interfaces import LLM
from onyx.llm.models import UserMessage
from onyx.llm.utils import llm_response_to_string
from onyx.onyxbot.slack.models import ChannelType
from onyx.prompts.federated_search import SLACK_DATE_EXTRACTION_PROMPT
@@ -190,7 +191,7 @@ def extract_date_range_from_query(
try:
prompt = SLACK_DATE_EXTRACTION_PROMPT.format(query=query)
response = llm_response_to_string(llm.invoke(prompt))
response = llm_response_to_string(llm.invoke(UserMessage(content=prompt)))
response_clean = _parse_llm_code_block_response(response)
@@ -566,6 +567,23 @@ def extract_content_words_from_recency_query(
return content_words_filtered[:MAX_CONTENT_WORDS]
def _is_valid_keyword_query(line: str) -> bool:
"""Check if a line looks like a valid keyword query vs explanatory text.
Returns False for lines that appear to be LLM explanations rather than keywords.
"""
# Reject lines that start with parentheses (explanatory notes)
if line.startswith("("):
return False
# Reject lines that are too long (likely sentences, not keywords)
# Keywords should be short - reject if > 50 chars or > 6 words
if len(line) > 50 or len(line.split()) > 6:
return False
return True
def expand_query_with_llm(query_text: str, llm: LLM) -> list[str]:
"""Use LLM to expand query into multiple search variations.
@@ -576,8 +594,10 @@ def expand_query_with_llm(query_text: str, llm: LLM) -> list[str]:
Returns:
List of rephrased query strings (up to MAX_SLACK_QUERY_EXPANSIONS)
"""
prompt = SLACK_QUERY_EXPANSION_PROMPT.format(
query=query_text, max_queries=MAX_SLACK_QUERY_EXPANSIONS
prompt = UserMessage(
content=SLACK_QUERY_EXPANSION_PROMPT.format(
query=query_text, max_queries=MAX_SLACK_QUERY_EXPANSIONS
)
)
try:
@@ -586,10 +606,18 @@ def expand_query_with_llm(query_text: str, llm: LLM) -> list[str]:
response_clean = _parse_llm_code_block_response(response)
# Split into lines and filter out empty lines
rephrased_queries = [
raw_queries = [
line.strip() for line in response_clean.split("\n") if line.strip()
]
# Filter out lines that look like explanatory text rather than keywords
rephrased_queries = [q for q in raw_queries if _is_valid_keyword_query(q)]
# Log if we filtered out garbage
if len(raw_queries) != len(rephrased_queries):
filtered_out = set(raw_queries) - set(rephrased_queries)
logger.warning(f"Filtered out non-keyword LLM responses: {filtered_out}")
# If no queries generated, use empty query
if not rephrased_queries:
logger.debug("No content keywords extracted from query expansion")

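To make the filtering concrete, a small standalone sketch of how lines are pruned; the sample LLM output is hypothetical:

def is_valid_keyword_query(line: str) -> bool:
    # Reject explanatory notes (parenthesized) and sentence-length lines;
    # keep short keyword-style queries.
    if line.startswith("("):
        return False
    return len(line) <= 50 and len(line.split()) <= 6

raw_queries = [
    "onboarding checklist",
    "(Note: I expanded the query as requested)",
    "Here is a longer sentence explaining what I did instead of giving keywords",
]
assert [q for q in raw_queries if is_valid_keyword_query(q)] == ["onboarding checklist"]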
View File

@@ -5,27 +5,15 @@ from typing import Any
from uuid import UUID
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field
from pydantic import field_validator
from onyx.configs.chat_configs import NUM_RETURNED_HITS
from onyx.configs.constants import DocumentSource
from onyx.context.search.enums import LLMEvaluationType
from onyx.context.search.enums import OptionalSearchSetting
from onyx.context.search.enums import SearchType
from onyx.db.models import Persona
from onyx.db.models import SearchSettings
from onyx.indexing.models import BaseChunk
from onyx.indexing.models import IndexingSetting
from onyx.tools.tool_implementations.web_search.models import WEB_SEARCH_PREFIX
from shared_configs.enums import RerankerProvider
from shared_configs.model_server_models import Embedding
MAX_METRICS_CONTENT = (
200 # Just need enough characters to identify where in the doc the chunk is
)
class QueryExpansions(BaseModel):
@@ -38,6 +26,7 @@ class QueryExpansionType(Enum):
SEMANTIC = "semantic"
# TODO clean up this stuff, reranking is no longer used
class RerankingDetails(BaseModel):
# If model is None (or num_rerank is 0), then reranking is turned off
rerank_model_name: str | None
@@ -131,13 +120,6 @@ class IndexFilters(BaseFilters, UserFileFilters):
tenant_id: str | None = None
class ChunkMetric(BaseModel):
document_id: str
chunk_content_start: str
first_link: str | None
score: float
class ChunkContext(BaseModel):
# If not specified (None), picked up from Persona settings if there is space
# if specified (even if 0), it always uses the specified number of chunks above and below
@@ -192,94 +174,18 @@ class ContextExpansionType(str, Enum):
FULL_DOCUMENT = "full_document"
class SearchRequest(ChunkContext):
query: str
expanded_queries: QueryExpansions | None = None
original_query: str | None = None
search_type: SearchType = SearchType.SEMANTIC
human_selected_filters: BaseFilters | None = None
user_file_filters: UserFileFilters | None = None
enable_auto_detect_filters: bool | None = None
persona: Persona | None = None
# if None, no offset / limit
offset: int | None = None
limit: int | None = None
multilingual_expansion: list[str] | None = None
recency_bias_multiplier: float = 1.0
hybrid_alpha: float | None = None
rerank_settings: RerankingDetails | None = None
evaluation_type: LLMEvaluationType = LLMEvaluationType.UNSPECIFIED
model_config = ConfigDict(arbitrary_types_allowed=True)
precomputed_query_embedding: Embedding | None = None
precomputed_is_keyword: bool | None = None
precomputed_keywords: list[str] | None = None
class SearchQuery(ChunkContext):
query: str
processed_keywords: list[str]
search_type: SearchType
evaluation_type: LLMEvaluationType
filters: IndexFilters
# by this point, the chunks_above and chunks_below must be set
chunks_above: int
chunks_below: int
rerank_settings: RerankingDetails | None
hybrid_alpha: float
recency_bias_multiplier: float
# Only used if LLM evaluation type is not skip, None to use default settings
max_llm_filter_sections: int
num_hits: int = NUM_RETURNED_HITS
offset: int = 0
model_config = ConfigDict(frozen=True)
precomputed_query_embedding: Embedding | None = None
expanded_queries: QueryExpansions | None = None
original_query: str | None
class RetrievalDetails(ChunkContext):
# Use LLM to determine whether to do a retrieval or only rely on existing history
# If the Persona is configured to not run search (0 chunks), this is bypassed
# If no Prompt is configured, only the search results are shown, and this is bypassed
run_search: OptionalSearchSetting = OptionalSearchSetting.AUTO
# Is this a real-time/streaming call or a question where Onyx can take more time?
# Used to determine reranking flow
real_time: bool = True
# The following have defaults in the Persona settings which can be overridden via
# the query, if None, then use Persona settings
filters: BaseFilters | None = None
enable_auto_detect_filters: bool | None = None
# if None, no offset / limit
offset: int | None = None
limit: int | None = None
# If this is set, only the highest matching chunk (or merged chunks) is returned
dedupe_docs: bool = False
class InferenceChunk(BaseChunk):
document_id: str
source_type: DocumentSource
semantic_identifier: str
title: str | None # Separate from Semantic Identifier though often same
boost: int
recency_bias: float
score: float | None
hidden: bool
is_relevant: bool | None = None
relevance_explanation: str | None = None
# TODO(andrei): Ideally we could improve this to where each value is just a
# list of strings.
metadata: dict[str, str | list[str]]
# Matched sections in the chunk. Uses Vespa syntax e.g. <hi>TEXT</hi>
# to specify that a set of words should be highlighted. For example:
@@ -534,15 +440,3 @@ class SavedSearchDocWithContent(SavedSearchDoc):
section in addition to the match_highlights."""
content: str
class RetrievalMetricsContainer(BaseModel):
search_type: SearchType
metrics: list[ChunkMetric] # This contains the scores for retrieval as well
class RerankMetricsContainer(BaseModel):
"""The score held by this is the un-boosted, averaged score of the ensemble cross-encoders"""
metrics: list[ChunkMetric]
raw_similarity_scores: list[float]

View File

@@ -1,272 +0,0 @@
from sqlalchemy.orm import Session
from onyx.configs.chat_configs import BASE_RECENCY_DECAY
from onyx.configs.chat_configs import CONTEXT_CHUNKS_ABOVE
from onyx.configs.chat_configs import CONTEXT_CHUNKS_BELOW
from onyx.configs.chat_configs import DISABLE_LLM_DOC_RELEVANCE
from onyx.configs.chat_configs import FAVOR_RECENT_DECAY_MULTIPLIER
from onyx.configs.chat_configs import HYBRID_ALPHA
from onyx.configs.chat_configs import HYBRID_ALPHA_KEYWORD
from onyx.configs.chat_configs import NUM_POSTPROCESSED_RESULTS
from onyx.configs.chat_configs import NUM_RETURNED_HITS
from onyx.context.search.enums import LLMEvaluationType
from onyx.context.search.enums import RecencyBiasSetting
from onyx.context.search.enums import SearchType
from onyx.context.search.models import BaseFilters
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import RerankingDetails
from onyx.context.search.models import SearchQuery
from onyx.context.search.models import SearchRequest
from onyx.context.search.preprocessing.access_filters import (
build_access_filters_for_user,
)
from onyx.context.search.utils import (
remove_stop_words_and_punctuation,
)
from onyx.db.models import User
from onyx.db.search_settings import get_current_search_settings
from onyx.llm.interfaces import LLM
from onyx.natural_language_processing.search_nlp_models import QueryAnalysisModel
from onyx.secondary_llm_flows.source_filter import extract_source_filter
from onyx.secondary_llm_flows.time_filter import extract_time_filter
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import FunctionCall
from onyx.utils.threadpool_concurrency import run_functions_in_parallel
from onyx.utils.timing import log_function_time
from shared_configs.configs import MULTI_TENANT
from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()
def query_analysis(query: str) -> tuple[bool, list[str]]:
analysis_model = QueryAnalysisModel()
return analysis_model.predict(query)
# TODO: This is unused code.
@log_function_time(print_only=True)
def retrieval_preprocessing(
search_request: SearchRequest,
user: User | None,
llm: LLM,
skip_query_analysis: bool,
db_session: Session,
favor_recent_decay_multiplier: float = FAVOR_RECENT_DECAY_MULTIPLIER,
base_recency_decay: float = BASE_RECENCY_DECAY,
bypass_acl: bool = False,
) -> SearchQuery:
"""Logic is as follows:
Any global disables apply first
Then any filters or settings as part of the query are used
Then defaults to Persona settings if not specified by the query
"""
query = search_request.query
limit = search_request.limit
offset = search_request.offset
persona = search_request.persona
preset_filters = search_request.human_selected_filters or BaseFilters()
if persona and persona.document_sets and preset_filters.document_set is None:
preset_filters.document_set = [
document_set.name for document_set in persona.document_sets
]
time_filter = preset_filters.time_cutoff
if time_filter is None and persona:
time_filter = persona.search_start_date
source_filter = preset_filters.source_type
auto_detect_time_filter = True
auto_detect_source_filter = True
if not search_request.enable_auto_detect_filters:
logger.debug("Retrieval details disables auto detect filters")
auto_detect_time_filter = False
auto_detect_source_filter = False
elif persona and persona.llm_filter_extraction is False:
logger.debug("Persona disables auto detect filters")
auto_detect_time_filter = False
auto_detect_source_filter = False
else:
logger.debug("Auto detect filters enabled")
if (
time_filter is not None
and persona
and persona.recency_bias != RecencyBiasSetting.AUTO
):
auto_detect_time_filter = False
logger.debug("Not extract time filter - already provided")
if source_filter is not None:
logger.debug("Not extract source filter - already provided")
auto_detect_source_filter = False
# Based on the query figure out if we should apply any hard time filters /
# if we should bias more recent docs even more strongly
run_time_filters = (
FunctionCall(extract_time_filter, (query, llm), {})
if auto_detect_time_filter
else None
)
# Based on the query, figure out if we should apply any source filters
run_source_filters = (
FunctionCall(extract_source_filter, (query, llm, db_session), {})
if auto_detect_source_filter
else None
)
# Sometimes this is pre-computed in parallel with other heavy tasks to improve
# latency, and in that case we don't need to run the model again
run_query_analysis = (
None
if (skip_query_analysis or search_request.precomputed_is_keyword is not None)
else FunctionCall(query_analysis, (query,), {})
)
functions_to_run = [
filter_fn
for filter_fn in [
run_time_filters,
run_source_filters,
run_query_analysis,
]
if filter_fn
]
parallel_results = run_functions_in_parallel(functions_to_run)
predicted_time_cutoff, predicted_favor_recent = (
parallel_results[run_time_filters.result_id]
if run_time_filters
else (None, None)
)
predicted_source_filters = (
parallel_results[run_source_filters.result_id] if run_source_filters else None
)
# The extracted keywords right now are not very reliable, not using for now
# Can maybe use for highlighting
is_keyword, _extracted_keywords = False, None
if search_request.precomputed_is_keyword is not None:
is_keyword = search_request.precomputed_is_keyword
_extracted_keywords = search_request.precomputed_keywords
elif run_query_analysis:
is_keyword, _extracted_keywords = parallel_results[run_query_analysis.result_id]
all_query_terms = query.split()
processed_keywords = (
remove_stop_words_and_punctuation(all_query_terms)
# If the user is using a different language, don't edit the query or remove English stopwords
if not search_request.multilingual_expansion
else all_query_terms
)
user_acl_filters = (
None if bypass_acl else build_access_filters_for_user(user, db_session)
)
user_file_filters = search_request.user_file_filters
user_file_ids = (user_file_filters.user_file_ids or []) if user_file_filters else []
if persona and persona.user_files:
user_file_ids = list(
set(user_file_ids) | set([file.id for file in persona.user_files])
)
final_filters = IndexFilters(
user_file_ids=user_file_ids,
project_id=user_file_filters.project_id if user_file_filters else None,
source_type=preset_filters.source_type or predicted_source_filters,
document_set=preset_filters.document_set,
time_cutoff=time_filter or predicted_time_cutoff,
tags=preset_filters.tags, # Tags are never auto-extracted
access_control_list=user_acl_filters,
tenant_id=get_current_tenant_id() if MULTI_TENANT else None,
# kg_entities=preset_filters.kg_entities,
# kg_relationships=preset_filters.kg_relationships,
# kg_terms=preset_filters.kg_terms,
# kg_sources=preset_filters.kg_sources,
# kg_chunk_id_zero_only=preset_filters.kg_chunk_id_zero_only,
)
llm_evaluation_type = LLMEvaluationType.BASIC
if search_request.evaluation_type is not LLMEvaluationType.UNSPECIFIED:
llm_evaluation_type = search_request.evaluation_type
elif persona:
llm_evaluation_type = (
LLMEvaluationType.BASIC
if persona.llm_relevance_filter
else LLMEvaluationType.SKIP
)
if DISABLE_LLM_DOC_RELEVANCE:
if llm_evaluation_type:
logger.info(
"LLM chunk filtering would have run but has been globally disabled"
)
llm_evaluation_type = LLMEvaluationType.SKIP
rerank_settings = search_request.rerank_settings
# If not explicitly specified by the query, use the current settings
if rerank_settings is None:
search_settings = get_current_search_settings(db_session)
# For non-streaming flows, the rerank settings are applied at the search_request level
if not search_settings.disable_rerank_for_streaming:
rerank_settings = RerankingDetails.from_db_model(search_settings)
# Decays at 1 / (1 + (multiplier * num years))
if persona and persona.recency_bias == RecencyBiasSetting.NO_DECAY:
recency_bias_multiplier = 0.0
elif persona and persona.recency_bias == RecencyBiasSetting.BASE_DECAY:
recency_bias_multiplier = base_recency_decay
elif persona and persona.recency_bias == RecencyBiasSetting.FAVOR_RECENT:
recency_bias_multiplier = base_recency_decay * favor_recent_decay_multiplier
else:
if predicted_favor_recent:
recency_bias_multiplier = base_recency_decay * favor_recent_decay_multiplier
else:
recency_bias_multiplier = base_recency_decay
hybrid_alpha = HYBRID_ALPHA_KEYWORD if is_keyword else HYBRID_ALPHA
if search_request.hybrid_alpha:
hybrid_alpha = search_request.hybrid_alpha
# Search request overrides anything else as it's explicitly set by the request
# If not explicitly specified, use the persona settings if they exist
# Otherwise, use the global defaults
chunks_above = (
search_request.chunks_above
if search_request.chunks_above is not None
else (persona.chunks_above if persona else CONTEXT_CHUNKS_ABOVE)
)
chunks_below = (
search_request.chunks_below
if search_request.chunks_below is not None
else (persona.chunks_below if persona else CONTEXT_CHUNKS_BELOW)
)
return SearchQuery(
query=query,
original_query=search_request.original_query,
processed_keywords=processed_keywords,
search_type=SearchType.KEYWORD if is_keyword else SearchType.SEMANTIC,
evaluation_type=llm_evaluation_type,
filters=final_filters,
hybrid_alpha=hybrid_alpha,
recency_bias_multiplier=recency_bias_multiplier,
num_hits=limit if limit is not None else NUM_RETURNED_HITS,
offset=offset or 0,
rerank_settings=rerank_settings,
# Match the LLM filtering count to the reranked count; this is understood as the number of results
# the user wants heavier processing on, so apply the same limit to the LLM if reranking is on.
# If no reranking settings are set, use the global default.
max_llm_filter_sections=(
rerank_settings.num_rerank if rerank_settings else NUM_POSTPROCESSED_RESULTS
),
chunks_above=chunks_above,
chunks_below=chunks_below,
full_doc=search_request.full_doc,
precomputed_query_embedding=search_request.precomputed_query_embedding,
expanded_queries=search_request.expanded_queries,
)

View File

@@ -1,42 +1,24 @@
import string
from collections.abc import Callable
from uuid import UUID
from sqlalchemy.orm import Session
from onyx.configs.chat_configs import HYBRID_ALPHA
from onyx.configs.chat_configs import NUM_RETURNED_HITS
from onyx.context.search.enums import SearchType
from onyx.context.search.models import ChunkIndexRequest
from onyx.context.search.models import ChunkMetric
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceChunk
from onyx.context.search.models import InferenceSection
from onyx.context.search.models import MAX_METRICS_CONTENT
from onyx.context.search.models import QueryExpansionType
from onyx.context.search.models import RetrievalMetricsContainer
from onyx.context.search.models import SearchQuery
from onyx.context.search.preprocessing.preprocessing import HYBRID_ALPHA
from onyx.context.search.preprocessing.preprocessing import HYBRID_ALPHA_KEYWORD
from onyx.context.search.utils import get_query_embedding
from onyx.context.search.utils import get_query_embeddings
from onyx.context.search.utils import inference_section_from_chunks
from onyx.db.search_settings import get_multilingual_expansion
from onyx.document_index.interfaces import DocumentIndex
from onyx.document_index.interfaces import VespaChunkRequest
from onyx.document_index.vespa.shared_utils.utils import (
replace_invalid_doc_id_characters,
)
from onyx.federated_connectors.federated_retrieval import (
get_federated_retrieval_functions,
)
from onyx.secondary_llm_flows.query_expansion import multilingual_query_expansion
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
from onyx.utils.threadpool_concurrency import run_in_background
from onyx.utils.threadpool_concurrency import TimeoutThread
from onyx.utils.threadpool_concurrency import wait_on_background
from onyx.utils.timing import log_function_time
from shared_configs.model_server_models import Embedding
logger = setup_logger()
@@ -80,19 +62,6 @@ def download_nltk_data() -> None:
logger.error(f"Failed to download {resource_name}. Error: {e}")
def lemmatize_text(keywords: list[str]) -> list[str]:
raise NotImplementedError("Lemmatization should not be used currently")
# try:
# query = " ".join(keywords)
# lemmatizer = WordNetLemmatizer()
# word_tokens = word_tokenize(query)
# lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokens]
# combined_keywords = list(set(keywords + lemmatized_words))
# return combined_keywords
# except Exception:
# return keywords
def combine_retrieval_results(
chunk_sets: list[list[InferenceChunk]],
) -> list[InferenceChunk]:
@@ -117,313 +86,6 @@ def combine_retrieval_results(
return sorted_chunks
# TODO: This is unused code.
@log_function_time(print_only=True)
def doc_index_retrieval(
query: SearchQuery,
document_index: DocumentIndex,
db_session: Session,
) -> list[InferenceChunk]:
"""
This function performs the search to retrieve the chunks,
extracts chunks from the large chunks, persists the scores
from the large chunks to the referenced chunks,
dedupes the chunks, and cleans the chunks.
"""
query_embedding = query.precomputed_query_embedding or get_query_embedding(
query.query, db_session
)
keyword_embeddings_thread: TimeoutThread[list[Embedding]] | None = None
semantic_embeddings_thread: TimeoutThread[list[Embedding]] | None = None
top_base_chunks_standard_ranking_thread: (
TimeoutThread[list[InferenceChunk]] | None
) = None
top_semantic_chunks_thread: TimeoutThread[list[InferenceChunk]] | None = None
keyword_embeddings: list[Embedding] | None = None
semantic_embeddings: list[Embedding] | None = None
top_semantic_chunks: list[InferenceChunk] | None = None
# original retrieval method
top_base_chunks_standard_ranking_thread = run_in_background(
document_index.hybrid_retrieval,
query.query,
query_embedding,
query.processed_keywords,
query.filters,
query.hybrid_alpha,
query.recency_bias_multiplier,
query.num_hits,
QueryExpansionType.SEMANTIC,
query.offset,
)
if (
query.expanded_queries
and query.expanded_queries.keywords_expansions
and query.expanded_queries.semantic_expansions
):
keyword_embeddings_thread = run_in_background(
get_query_embeddings,
query.expanded_queries.keywords_expansions,
db_session,
)
if query.search_type == SearchType.SEMANTIC:
semantic_embeddings_thread = run_in_background(
get_query_embeddings,
query.expanded_queries.semantic_expansions,
db_session,
)
keyword_embeddings = wait_on_background(keyword_embeddings_thread)
if query.search_type == SearchType.SEMANTIC:
assert semantic_embeddings_thread is not None
semantic_embeddings = wait_on_background(semantic_embeddings_thread)
# Use original query embedding for keyword retrieval embedding
keyword_embeddings = [query_embedding]
# Note: we generally prepped earlier for multiple expansions, but for now we only use one.
top_keyword_chunks_thread = run_in_background(
document_index.hybrid_retrieval,
query.expanded_queries.keywords_expansions[0],
keyword_embeddings[0],
query.processed_keywords,
query.filters,
HYBRID_ALPHA_KEYWORD,
query.recency_bias_multiplier,
query.num_hits,
QueryExpansionType.KEYWORD,
query.offset,
)
if query.search_type == SearchType.SEMANTIC:
assert semantic_embeddings is not None
top_semantic_chunks_thread = run_in_background(
document_index.hybrid_retrieval,
query.expanded_queries.semantic_expansions[0],
semantic_embeddings[0],
query.processed_keywords,
query.filters,
HYBRID_ALPHA,
query.recency_bias_multiplier,
query.num_hits,
QueryExpansionType.SEMANTIC,
query.offset,
)
top_base_chunks_standard_ranking = wait_on_background(
top_base_chunks_standard_ranking_thread
)
top_keyword_chunks = wait_on_background(top_keyword_chunks_thread)
if query.search_type == SearchType.SEMANTIC:
assert top_semantic_chunks_thread is not None
top_semantic_chunks = wait_on_background(top_semantic_chunks_thread)
all_top_chunks = top_base_chunks_standard_ranking + top_keyword_chunks
# use all three retrieval methods to retrieve top chunks
if query.search_type == SearchType.SEMANTIC and top_semantic_chunks is not None:
all_top_chunks += top_semantic_chunks
top_chunks = _dedupe_chunks(all_top_chunks)
else:
top_base_chunks_standard_ranking = wait_on_background(
top_base_chunks_standard_ranking_thread
)
top_chunks = _dedupe_chunks(top_base_chunks_standard_ranking)
logger.info(f"Overall number of top initial retrieval chunks: {len(top_chunks)}")
retrieval_requests: list[VespaChunkRequest] = []
normal_chunks: list[InferenceChunk] = []
referenced_chunk_scores: dict[tuple[str, int], float] = {}
for chunk in top_chunks:
if chunk.large_chunk_reference_ids:
retrieval_requests.append(
VespaChunkRequest(
document_id=replace_invalid_doc_id_characters(chunk.document_id),
min_chunk_ind=chunk.large_chunk_reference_ids[0],
max_chunk_ind=chunk.large_chunk_reference_ids[-1],
)
)
# for each referenced chunk, persist the
# highest score to the referenced chunk
for chunk_id in chunk.large_chunk_reference_ids:
key = (chunk.document_id, chunk_id)
referenced_chunk_scores[key] = max(
referenced_chunk_scores.get(key, 0), chunk.score or 0
)
else:
normal_chunks.append(chunk)
# If there are no large chunks, just return the normal chunks
if not retrieval_requests:
return normal_chunks
# Retrieve and return the referenced normal chunks from the large chunks
retrieved_inference_chunks = document_index.id_based_retrieval(
chunk_requests=retrieval_requests,
filters=query.filters,
batch_retrieval=True,
)
# Apply the scores from the large chunks to the chunks referenced
# by each large chunk
for chunk in retrieved_inference_chunks:
if (chunk.document_id, chunk.chunk_id) in referenced_chunk_scores:
chunk.score = referenced_chunk_scores[(chunk.document_id, chunk.chunk_id)]
referenced_chunk_scores.pop((chunk.document_id, chunk.chunk_id))
else:
logger.error(
f"Chunk {chunk.document_id} {chunk.chunk_id} not found in referenced chunk scores"
)
# Log any chunks that were not found in the retrieved chunks
for reference in referenced_chunk_scores.keys():
logger.error(f"Chunk {reference} not found in retrieved chunks")
unique_chunks: dict[tuple[str, int], InferenceChunk] = {
(chunk.document_id, chunk.chunk_id): chunk for chunk in normal_chunks
}
# persist the highest score of each deduped chunk
for chunk in retrieved_inference_chunks:
key = (chunk.document_id, chunk.chunk_id)
# For duplicates, keep the highest score
if key not in unique_chunks or (chunk.score or 0) > (
unique_chunks[key].score or 0
):
unique_chunks[key] = chunk
# Deduplicate the chunks
deduped_chunks = list(unique_chunks.values())
deduped_chunks.sort(key=lambda chunk: chunk.score or 0, reverse=True)
return deduped_chunks
def _simplify_text(text: str) -> str:
return "".join(
char for char in text if char not in string.punctuation and not char.isspace()
).lower()
# TODO delete this
def retrieve_chunks(
query: SearchQuery,
user_id: UUID | None,
document_index: DocumentIndex,
db_session: Session,
retrieval_metrics_callback: (
Callable[[RetrievalMetricsContainer], None] | None
) = None,
) -> list[InferenceChunk]:
"""Returns a list of the best chunks from an initial keyword/semantic/ hybrid search."""
multilingual_expansion = get_multilingual_expansion(db_session)
run_queries: list[tuple[Callable, tuple]] = []
source_filters = (
set(query.filters.source_type) if query.filters.source_type else None
)
# Federated retrieval
federated_retrieval_infos = get_federated_retrieval_functions(
db_session,
user_id,
list(query.filters.source_type) if query.filters.source_type else None,
query.filters.document_set,
user_file_ids=query.filters.user_file_ids,
)
federated_sources = set(
federated_retrieval_info.source.to_non_federated_source()
for federated_retrieval_info in federated_retrieval_infos
)
for federated_retrieval_info in federated_retrieval_infos:
run_queries.append((federated_retrieval_info.retrieval_function, (query,)))
# Normal retrieval
normal_search_enabled = (source_filters is None) or (
len(set(source_filters) - federated_sources) > 0
)
if normal_search_enabled and (
not multilingual_expansion or "\n" in query.query or "\r" in query.query
):
# Don't do query expansion on complex queries; rephrasings likely would not work well
run_queries.append((doc_index_retrieval, (query, document_index, db_session)))
elif normal_search_enabled:
simplified_queries = set()
# Currently only uses query expansion on multilingual use cases
query_rephrases = multilingual_query_expansion(
query.query, multilingual_expansion
)
# Just to be extra sure, add the original query.
query_rephrases.append(query.query)
for rephrase in set(query_rephrases):
# Sometimes the model rephrases the query in the same language with minor changes
# Avoid doing an extra search with the minor changes as this biases the results
simplified_rephrase = _simplify_text(rephrase)
if simplified_rephrase in simplified_queries:
continue
simplified_queries.add(simplified_rephrase)
q_copy = query.model_copy(
update={
"query": rephrase,
# need to recompute for each rephrase
# note that `SearchQuery` is a frozen model, so we can't update
# it below
"precomputed_query_embedding": None,
},
deep=True,
)
run_queries.append(
(doc_index_retrieval, (q_copy, document_index, db_session))
)
parallel_search_results = run_functions_tuples_in_parallel(run_queries)
top_chunks = combine_retrieval_results(parallel_search_results)
if not top_chunks:
logger.warning(
f"Hybrid ({query.search_type.value.capitalize()}) search returned no results "
f"with filters: {query.filters}"
)
return []
if retrieval_metrics_callback is not None:
chunk_metrics = [
ChunkMetric(
document_id=chunk.document_id,
chunk_content_start=chunk.content[:MAX_METRICS_CONTENT],
first_link=chunk.source_links[0] if chunk.source_links else None,
score=chunk.score if chunk.score is not None else 0,
)
for chunk in top_chunks
]
retrieval_metrics_callback(
RetrievalMetricsContainer(
search_type=query.search_type, metrics=chunk_metrics
)
)
return top_chunks
def _embed_and_search(
query_request: ChunkIndexRequest,
document_index: DocumentIndex,

View File

@@ -1,16 +1,12 @@
import string
from collections.abc import Sequence
from typing import TypeVar
from sqlalchemy.orm import Session
from onyx.chat.models import SectionRelevancePiece
from onyx.context.search.models import InferenceChunk
from onyx.context.search.models import InferenceSection
from onyx.context.search.models import SavedSearchDoc
from onyx.context.search.models import SavedSearchDocWithContent
from onyx.context.search.models import SearchDoc
from onyx.db.models import SearchDoc as DBSearchDoc
from onyx.db.search_settings import get_current_search_settings
from onyx.natural_language_processing.search_nlp_models import EmbeddingModel
from onyx.utils.logger import setup_logger
@@ -41,66 +37,6 @@ TSection = TypeVar(
)
def dedupe_documents(items: list[T]) -> tuple[list[T], list[int]]:
seen_ids = set()
deduped_items = []
dropped_indices = []
for index, item in enumerate(items):
if isinstance(item, InferenceSection):
document_id = item.center_chunk.document_id
else:
document_id = item.document_id
if document_id not in seen_ids:
seen_ids.add(document_id)
deduped_items.append(item)
else:
dropped_indices.append(index)
return deduped_items, dropped_indices
def relevant_sections_to_indices(
relevance_sections: list[SectionRelevancePiece] | None, items: list[TSection]
) -> list[int]:
if not relevance_sections:
return []
relevant_set = {
(chunk.document_id, chunk.chunk_id)
for chunk in relevance_sections
if chunk.relevant
}
return [
index
for index, item in enumerate(items)
if (
(
isinstance(item, InferenceSection)
and (item.center_chunk.document_id, item.center_chunk.chunk_id)
in relevant_set
)
or (
not isinstance(item, (InferenceSection))
and (item.document_id, item.chunk_ind) in relevant_set
)
)
]
def drop_llm_indices(
llm_indices: list[int],
search_docs: Sequence[DBSearchDoc | SavedSearchDoc],
dropped_indices: list[int],
) -> list[int]:
llm_bools = [i in llm_indices for i in range(len(search_docs))]
if dropped_indices:
llm_bools = [
val for ind, val in enumerate(llm_bools) if ind not in dropped_indices
]
return [i for i, val in enumerate(llm_bools) if val]
def inference_section_from_chunks(
center_chunk: InferenceChunk,
chunks: list[InferenceChunk],
@@ -128,26 +64,6 @@ def inference_section_from_single_chunk(
)
def remove_stop_words_and_punctuation(keywords: list[str]) -> list[str]:
from nltk.corpus import stopwords # type:ignore
from nltk.tokenize import word_tokenize # type:ignore
try:
# Re-tokenize using the NLTK tokenizer for better matching
query = " ".join(keywords)
stop_words = set(stopwords.words("english"))
word_tokens = word_tokenize(query)
text_trimmed = [
word
for word in word_tokens
if (word.casefold() not in stop_words and word not in string.punctuation)
]
return text_trimmed or word_tokens
except Exception as e:
logger.warning(f"Error removing stop words and punctuation: {e}")
return keywords
def get_query_embeddings(queries: list[str], db_session: Session) -> list[Embedding]:
search_settings = get_current_search_settings(db_session)

View File

@@ -1,6 +1,7 @@
from collections.abc import Sequence
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from typing import Tuple
from uuid import UUID
@@ -90,59 +91,6 @@ def get_chat_sessions_by_slack_thread_id(
return db_session.scalars(stmt).all()
def get_valid_messages_from_query_sessions(
chat_session_ids: list[UUID],
db_session: Session,
) -> dict[UUID, str]:
user_message_subquery = (
select(
ChatMessage.chat_session_id, func.min(ChatMessage.id).label("user_msg_id")
)
.where(
ChatMessage.chat_session_id.in_(chat_session_ids),
ChatMessage.message_type == MessageType.USER,
)
.group_by(ChatMessage.chat_session_id)
.subquery()
)
assistant_message_subquery = (
select(
ChatMessage.chat_session_id,
func.min(ChatMessage.id).label("assistant_msg_id"),
)
.where(
ChatMessage.chat_session_id.in_(chat_session_ids),
ChatMessage.message_type == MessageType.ASSISTANT,
)
.group_by(ChatMessage.chat_session_id)
.subquery()
)
query = (
select(ChatMessage.chat_session_id, ChatMessage.message)
.join(
user_message_subquery,
ChatMessage.chat_session_id == user_message_subquery.c.chat_session_id,
)
.join(
assistant_message_subquery,
ChatMessage.chat_session_id == assistant_message_subquery.c.chat_session_id,
)
.join(
ChatMessage__SearchDoc,
ChatMessage__SearchDoc.chat_message_id
== assistant_message_subquery.c.assistant_msg_id,
)
.where(ChatMessage.id == user_message_subquery.c.user_msg_id)
)
first_messages = db_session.execute(query).all()
logger.info(f"Retrieved {len(first_messages)} first messages with documents")
return {row.chat_session_id: row.message for row in first_messages}
# Retrieves chat sessions by user
# Chat sessions do not include onyxbot flows
def get_chat_sessions_by_user(
@@ -181,7 +129,11 @@ def get_chat_sessions_by_user(
.correlate(ChatSession)
)
stmt = stmt.where(non_system_message_exists_subq)
# Leeway for newly created chats that don't have messages yet
time = datetime.now(timezone.utc) - timedelta(minutes=5)
recently_created = ChatSession.time_created >= time
stmt = stmt.where(or_(non_system_message_exists_subq, recently_created))
result = db_session.execute(stmt)
chat_sessions = result.scalars().all()
@@ -505,21 +457,6 @@ def add_chats_to_session_from_slack_thread(
)
def get_search_docs_for_chat_message(
chat_message_id: int, db_session: Session
) -> list[DBSearchDoc]:
stmt = (
select(DBSearchDoc)
.join(
ChatMessage__SearchDoc,
ChatMessage__SearchDoc.search_doc_id == DBSearchDoc.id,
)
.where(ChatMessage__SearchDoc.chat_message_id == chat_message_id)
)
return list(db_session.scalars(stmt).all())
def add_search_docs_to_chat_message(
chat_message_id: int, search_doc_ids: list[int], db_session: Session
) -> None:

View File

@@ -0,0 +1,451 @@
"""CRUD operations for Discord bot models."""
from datetime import datetime
from datetime import timezone
from sqlalchemy import delete
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import joinedload
from sqlalchemy.orm import Session
from onyx.auth.api_key import build_displayable_api_key
from onyx.auth.api_key import generate_api_key
from onyx.auth.api_key import hash_api_key
from onyx.auth.schemas import UserRole
from onyx.configs.constants import DISCORD_SERVICE_API_KEY_NAME
from onyx.db.api_key import insert_api_key
from onyx.db.models import ApiKey
from onyx.db.models import DiscordBotConfig
from onyx.db.models import DiscordChannelConfig
from onyx.db.models import DiscordGuildConfig
from onyx.db.models import User
from onyx.db.utils import DiscordChannelView
from onyx.server.api_key.models import APIKeyArgs
from onyx.utils.logger import setup_logger
logger = setup_logger()
# === DiscordBotConfig ===
def get_discord_bot_config(db_session: Session) -> DiscordBotConfig | None:
"""Get the Discord bot config for this tenant (at most one)."""
return db_session.scalar(select(DiscordBotConfig).limit(1))
def create_discord_bot_config(
db_session: Session,
bot_token: str,
) -> DiscordBotConfig:
"""Create the Discord bot config. Raises ValueError if already exists.
The check constraint on id='SINGLETON' ensures only one config per tenant.
"""
existing = get_discord_bot_config(db_session)
if existing:
raise ValueError("Discord bot config already exists")
config = DiscordBotConfig(bot_token=bot_token)
db_session.add(config)
try:
db_session.flush()
except IntegrityError:
# Race condition: another request created the config concurrently
db_session.rollback()
raise ValueError("Discord bot config already exists")
return config
def delete_discord_bot_config(db_session: Session) -> bool:
"""Delete the Discord bot config. Returns True if deleted."""
result = db_session.execute(delete(DiscordBotConfig))
db_session.flush()
return result.rowcount > 0 # type: ignore[attr-defined]
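A usage sketch of the singleton semantics above; the session and token are placeholders:

# Hypothetical caller; db_session comes from the app's usual session factory.
config = get_discord_bot_config(db_session)
if config is None:
    try:
        config = create_discord_bot_config(db_session, bot_token="placeholder-token")
        db_session.commit()
    except ValueError:
        # Another request won the race; read the row it created.
        db_session.rollback()
        config = get_discord_bot_config(db_session)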
# === Discord Service API Key ===
def get_discord_service_api_key(db_session: Session) -> ApiKey | None:
"""Get the Discord service API key if it exists."""
return db_session.scalar(
select(ApiKey).where(ApiKey.name == DISCORD_SERVICE_API_KEY_NAME)
)
def get_or_create_discord_service_api_key(
db_session: Session,
tenant_id: str,
) -> str:
"""Get existing Discord service API key or create one.
The API key is used by the Discord bot to authenticate with the
Onyx API pods when sending chat requests.
Args:
db_session: Database session for the tenant.
tenant_id: The tenant ID (used for logging/context).
Returns:
The raw API key string (not hashed).
Raises:
RuntimeError: If API key creation fails.
"""
# Check for existing key
existing = get_discord_service_api_key(db_session)
if existing:
# Database only stores the hash, so we must regenerate to get the raw key.
# This is safe since the Discord bot is the only consumer of this key.
logger.debug(
f"Found existing Discord service API key for tenant {tenant_id} that isn't in cache, "
"regenerating to update cache"
)
new_api_key = generate_api_key(tenant_id)
existing.hashed_api_key = hash_api_key(new_api_key)
existing.api_key_display = build_displayable_api_key(new_api_key)
db_session.flush()
return new_api_key
# Create new API key
logger.info(f"Creating Discord service API key for tenant {tenant_id}")
api_key_args = APIKeyArgs(
name=DISCORD_SERVICE_API_KEY_NAME,
role=UserRole.LIMITED, # Limited role is sufficient for chat requests
)
api_key_descriptor = insert_api_key(
db_session=db_session,
api_key_args=api_key_args,
user_id=None, # Service account, no owner
)
if not api_key_descriptor.api_key:
raise RuntimeError(
f"Failed to create Discord service API key for tenant {tenant_id}"
)
return api_key_descriptor.api_key
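A hypothetical bot-side bootstrap using this helper; the tenant ID is a placeholder and the bearer-header scheme is an assumption:

raw_key = get_or_create_discord_service_api_key(db_session, tenant_id="tenant-123")
headers = {"Authorization": f"Bearer {raw_key}"}  # assumed auth scheme
# The raw key is only available at creation/regeneration time, so callers
# should cache it rather than re-calling (each call rotates the stored hash).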
def delete_discord_service_api_key(db_session: Session) -> bool:
"""Delete the Discord service API key for a tenant.
Called when:
- Bot config is deleted (self-hosted)
- All guild configs are deleted (Cloud)
Args:
db_session: Database session for the tenant.
Returns:
True if the key was deleted, False if it didn't exist.
"""
existing_key = get_discord_service_api_key(db_session)
if not existing_key:
return False
# Also delete the associated user
api_key_user = db_session.scalar(
select(User).where(User.id == existing_key.user_id) # type: ignore[arg-type]
)
db_session.delete(existing_key)
if api_key_user:
db_session.delete(api_key_user)
db_session.flush()
logger.info("Deleted Discord service API key")
return True
# === DiscordGuildConfig ===
def get_guild_configs(
db_session: Session,
include_channels: bool = False,
) -> list[DiscordGuildConfig]:
"""Get all guild configs for this tenant."""
stmt = select(DiscordGuildConfig)
if include_channels:
stmt = stmt.options(joinedload(DiscordGuildConfig.channels))
return list(db_session.scalars(stmt).unique().all())
def get_guild_config_by_internal_id(
db_session: Session,
internal_id: int,
) -> DiscordGuildConfig | None:
"""Get a specific guild config by its ID."""
return db_session.scalar(
select(DiscordGuildConfig).where(DiscordGuildConfig.id == internal_id)
)
def get_guild_config_by_discord_id(
db_session: Session,
guild_id: int,
) -> DiscordGuildConfig | None:
"""Get a guild config by Discord guild ID."""
return db_session.scalar(
select(DiscordGuildConfig).where(DiscordGuildConfig.guild_id == guild_id)
)
def get_guild_config_by_registration_key(
db_session: Session,
registration_key: str,
) -> DiscordGuildConfig | None:
"""Get a guild config by its registration key."""
return db_session.scalar(
select(DiscordGuildConfig).where(
DiscordGuildConfig.registration_key == registration_key
)
)
def create_guild_config(
db_session: Session,
registration_key: str,
) -> DiscordGuildConfig:
"""Create a new guild config with a registration key (guild_id=NULL)."""
config = DiscordGuildConfig(registration_key=registration_key)
db_session.add(config)
db_session.flush()
return config
def register_guild(
db_session: Session,
config: DiscordGuildConfig,
guild_id: int,
guild_name: str,
) -> DiscordGuildConfig:
"""Complete registration by setting guild_id and guild_name."""
config.guild_id = guild_id
config.guild_name = guild_name
config.registered_at = datetime.now(timezone.utc)
db_session.flush()
return config
def update_guild_config(
db_session: Session,
config: DiscordGuildConfig,
enabled: bool,
default_persona_id: int | None = None,
) -> DiscordGuildConfig:
"""Update guild config fields."""
config.enabled = enabled
config.default_persona_id = default_persona_id
db_session.flush()
return config
def delete_guild_config(
db_session: Session,
internal_id: int,
) -> bool:
"""Delete guild config (cascades to channel configs). Returns True if deleted."""
result = db_session.execute(
delete(DiscordGuildConfig).where(DiscordGuildConfig.id == internal_id)
)
db_session.flush()
return result.rowcount > 0 # type: ignore[attr-defined]
# === DiscordChannelConfig ===
def get_channel_configs(
db_session: Session,
guild_config_id: int,
) -> list[DiscordChannelConfig]:
"""Get all channel configs for a guild."""
return list(
db_session.scalars(
select(DiscordChannelConfig).where(
DiscordChannelConfig.guild_config_id == guild_config_id
)
).all()
)
def get_channel_config_by_discord_ids(
db_session: Session,
guild_id: int,
channel_id: int,
) -> DiscordChannelConfig | None:
"""Get a specific channel config by guild_id and channel_id."""
return db_session.scalar(
select(DiscordChannelConfig)
.join(DiscordGuildConfig)
.where(
DiscordGuildConfig.guild_id == guild_id,
DiscordChannelConfig.channel_id == channel_id,
)
)
def get_channel_config_by_internal_ids(
db_session: Session,
guild_config_id: int,
channel_config_id: int,
) -> DiscordChannelConfig | None:
"""Get a specific channel config by guild_config_id and channel_config_id"""
return db_session.scalar(
select(DiscordChannelConfig).where(
DiscordChannelConfig.guild_config_id == guild_config_id,
DiscordChannelConfig.id == channel_config_id,
)
)
def update_discord_channel_config(
db_session: Session,
config: DiscordChannelConfig,
channel_name: str,
thread_only_mode: bool,
require_bot_invocation: bool,
enabled: bool,
persona_override_id: int | None = None,
) -> DiscordChannelConfig:
"""Update channel config fields."""
config.channel_name = channel_name
config.require_bot_invocation = require_bot_invocation
config.persona_override_id = persona_override_id
config.enabled = enabled
config.thread_only_mode = thread_only_mode
db_session.flush()
return config
def delete_discord_channel_config(
db_session: Session,
guild_config_id: int,
channel_config_id: int,
) -> bool:
"""Delete a channel config. Returns True if deleted."""
result = db_session.execute(
delete(DiscordChannelConfig).where(
DiscordChannelConfig.guild_config_id == guild_config_id,
DiscordChannelConfig.id == channel_config_id,
)
)
db_session.flush()
return result.rowcount > 0 # type: ignore[attr-defined]
def create_channel_config(
db_session: Session,
guild_config_id: int,
channel_view: DiscordChannelView,
) -> DiscordChannelConfig:
"""Create a new channel config with default settings (disabled by default, admin enables via UI)."""
config = DiscordChannelConfig(
guild_config_id=guild_config_id,
channel_id=channel_view.channel_id,
channel_name=channel_view.channel_name,
channel_type=channel_view.channel_type,
is_private=channel_view.is_private,
)
db_session.add(config)
db_session.flush()
return config
def bulk_create_channel_configs(
db_session: Session,
guild_config_id: int,
channels: list[DiscordChannelView],
) -> list[DiscordChannelConfig]:
"""Create multiple channel configs at once. Skips existing channels."""
# Get existing channel IDs for this guild
existing_channel_ids = set(
db_session.scalars(
select(DiscordChannelConfig.channel_id).where(
DiscordChannelConfig.guild_config_id == guild_config_id
)
).all()
)
# Create configs for new channels only
new_configs = []
for channel_view in channels:
if channel_view.channel_id not in existing_channel_ids:
config = DiscordChannelConfig(
guild_config_id=guild_config_id,
channel_id=channel_view.channel_id,
channel_name=channel_view.channel_name,
channel_type=channel_view.channel_type,
is_private=channel_view.is_private,
)
db_session.add(config)
new_configs.append(config)
db_session.flush()
return new_configs
def sync_channel_configs(
db_session: Session,
guild_config_id: int,
current_channels: list[DiscordChannelView],
) -> tuple[int, int, int]:
"""Sync channel configs with current Discord channels.
- Creates configs for new channels (disabled by default)
- Removes configs for deleted channels
- Updates names and types for existing channels if changed
Returns: (added_count, removed_count, updated_count)
"""
current_channel_map = {
channel_view.channel_id: channel_view for channel_view in current_channels
}
current_channel_ids = set(current_channel_map.keys())
# Get existing configs
existing_configs = get_channel_configs(db_session, guild_config_id)
existing_channel_ids = {c.channel_id for c in existing_configs}
# Find channels to add, remove, and potentially update
to_add = current_channel_ids - existing_channel_ids
to_remove = existing_channel_ids - current_channel_ids
# Add new channels
added_count = 0
for channel_id in to_add:
channel_view = current_channel_map[channel_id]
create_channel_config(db_session, guild_config_id, channel_view)
added_count += 1
# Remove deleted channels
removed_count = 0
for config in existing_configs:
if config.channel_id in to_remove:
db_session.delete(config)
removed_count += 1
# Update names, types, and privacy for existing channels if changed
updated_count = 0
for config in existing_configs:
if config.channel_id in current_channel_ids:
channel_view = current_channel_map[config.channel_id]
changed = False
if config.channel_name != channel_view.channel_name:
config.channel_name = channel_view.channel_name
changed = True
if config.channel_type != channel_view.channel_type:
config.channel_type = channel_view.channel_type
changed = True
if config.is_private != channel_view.is_private:
config.is_private = channel_view.is_private
changed = True
if changed:
updated_count += 1
db_session.flush()
return added_count, removed_count, updated_count
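A minimal usage sketch of the sync flow above, assuming a SQLAlchemy session and a channel snapshot fetched from Discord (the IDs and names here are illustrative):
def sync_guild_channels_example(db_session: Session, guild_config_id: int) -> None:
    # Hypothetical snapshot of the guild's current channels.
    current = [
        DiscordChannelView(channel_id=111, channel_name="general"),
        DiscordChannelView(channel_id=222, channel_name="support", is_private=True),
    ]
    added, removed, updated = sync_channel_configs(
        db_session, guild_config_id, current
    )
    # Newly added channels start disabled; an admin enables them via the UI.
    # These helpers only flush, so the caller owns the commit.
    db_session.commit()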

View File

@@ -444,6 +444,8 @@ def upsert_documents(
logger.info("No documents to upsert. Skipping.")
return
includes_permissions = any(doc.external_access for doc in seen_documents.values())
insert_stmt = insert(DbDocument).values(
[
model_to_dict(
@@ -479,21 +481,38 @@ def upsert_documents(
]
)
update_set = {
"from_ingestion_api": insert_stmt.excluded.from_ingestion_api,
"boost": insert_stmt.excluded.boost,
"hidden": insert_stmt.excluded.hidden,
"semantic_id": insert_stmt.excluded.semantic_id,
"link": insert_stmt.excluded.link,
"primary_owners": insert_stmt.excluded.primary_owners,
"secondary_owners": insert_stmt.excluded.secondary_owners,
"doc_metadata": insert_stmt.excluded.doc_metadata,
}
if includes_permissions:
# Use COALESCE to preserve existing permissions when new values are NULL.
# This prevents subsequent indexing runs (which don't fetch permissions)
# from overwriting permissions set by permission sync jobs.
update_set.update(
{
"external_user_emails": func.coalesce(
insert_stmt.excluded.external_user_emails,
DbDocument.external_user_emails,
),
"external_user_group_ids": func.coalesce(
insert_stmt.excluded.external_user_group_ids,
DbDocument.external_user_group_ids,
),
"is_public": func.coalesce(
insert_stmt.excluded.is_public,
DbDocument.is_public,
),
}
)
on_conflict_stmt = insert_stmt.on_conflict_do_update(
    index_elements=["id"],  # Conflict target
    set_=update_set,
)
db_session.execute(on_conflict_stmt)
db_session.commit()
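A minimal sketch of the COALESCE merge semantics above, emulated in pure Python for illustration (the real merge happens in Postgres): an incoming NULL permission value keeps the existing row's value instead of overwriting it.
def merge_permissions(incoming: dict, existing: dict) -> dict:
    permission_fields = ("external_user_emails", "external_user_group_ids", "is_public")
    merged = dict(incoming)
    for field in permission_fields:
        if merged.get(field) is None:
            # COALESCE(excluded.<field>, document.<field>)
            merged[field] = existing.get(field)
    return merged
# An indexing run that fetched no permissions (all None) preserves the values
# previously written by a permission sync job.
assert merge_permissions(
    {"external_user_emails": None, "external_user_group_ids": None, "is_public": None},
    {"external_user_emails": ["a@example.com"], "external_user_group_ids": None, "is_public": False},
)["external_user_emails"] == ["a@example.com"]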

View File

@@ -374,7 +374,7 @@ def fetch_existing_tools(db_session: Session, tool_ids: list[int]) -> list[ToolM
def fetch_existing_llm_providers(
db_session: Session,
only_public: bool = False,
exclude_image_generation_providers: bool = True,
) -> list[LLMProviderModel]:
"""Fetch all LLM providers with optional filtering.
@@ -585,13 +585,12 @@ def update_default_vision_provider(
def fetch_auto_mode_providers(db_session: Session) -> list[LLMProviderModel]:
"""Fetch all LLM providers that are in Auto mode."""
query = (
    select(LLMProviderModel)
    .where(LLMProviderModel.is_auto_mode.is_(True))
    .options(selectinload(LLMProviderModel.model_configurations))
)
return list(db_session.scalars(query).all())
def sync_auto_mode_models(
@@ -620,7 +619,9 @@ def sync_auto_mode_models(
# Build the list of all visible models from the config
# All models in the config are visible (default + additional_visible_models)
recommended_visible_models = llm_recommendations.get_visible_models(
    provider.provider
)
recommended_visible_model_names = [
model.name for model in recommended_visible_models
]
@@ -635,11 +636,12 @@ def sync_auto_mode_models(
).all()
}
# Mark models that are no longer in GitHub config as not visible
for model_name, model in existing_models.items():
    if model_name not in recommended_visible_model_names:
        if model.is_visible:
            model.is_visible = False
            changes += 1
# Add or update models from GitHub config
for model_config in recommended_visible_models:
@@ -669,7 +671,7 @@ def sync_auto_mode_models(
changes += 1
# In Auto mode, default model is always set from GitHub config
default_model = llm_recommendations.get_default_model(provider.provider)
if default_model and provider.default_model_name != default_model.name:
provider.default_model_name = default_model.name
changes += 1
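A minimal sketch of the reconciliation above on plain data; only the hide-instead-of-delete behavior is shown verbatim in this diff, so the re-show/add branch is an assumption:
def reconcile_visibility(existing_visible: dict[str, bool], recommended: list[str]) -> int:
    changes = 0
    # Models missing from the GitHub config are hidden, not deleted.
    for name, visible in existing_visible.items():
        if name not in recommended and visible:
            existing_visible[name] = False
            changes += 1
    # Models present in the config are marked visible (assumed behavior).
    for name in recommended:
        if not existing_visible.get(name, False):
            existing_visible[name] = True
            changes += 1
    return changes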

View File

@@ -26,6 +26,7 @@ from sqlalchemy import ForeignKey
from sqlalchemy import func
from sqlalchemy import Index
from sqlalchemy import Integer
from sqlalchemy import BigInteger
from sqlalchemy import Sequence
from sqlalchemy import String
@@ -83,7 +84,6 @@ from onyx.utils.special_types import JSON_ro
from onyx.file_store.models import FileDescriptor
from onyx.llm.override_models import LLMOverride
from onyx.llm.override_models import PromptOverride
from onyx.context.search.enums import RecencyBiasSetting
from onyx.kg.models import KGStage
from onyx.server.features.mcp.models import MCPConnectionData
from onyx.utils.encryption import decrypt_bytes_to_string
@@ -91,6 +91,8 @@ from onyx.utils.encryption import encrypt_string_to_bytes
from onyx.utils.headers import HeaderItemDict
from shared_configs.enums import EmbeddingProvider
from shared_configs.enums import RerankerProvider
from onyx.context.search.enums import RecencyBiasSetting
logger = setup_logger()
@@ -377,6 +379,17 @@ class Notification(Base):
postgresql.JSONB(), nullable=True
)
# Unique constraint ix_notification_user_type_data on (user_id, notif_type, additional_data)
# ensures notification deduplication for batch inserts. Defined in migration 8405ca81cc83.
__table_args__ = (
Index(
"ix_notification_user_sort",
"user_id",
"dismissed",
desc("first_shown"),
),
)
"""
Association Tables
@@ -2321,6 +2334,23 @@ class SearchDoc(Base):
)
class SearchQuery(Base):
# This table contains search queries for the Search UI. No follow-ups are supported and less data is stored
# than for chat, because the "reply" functionality simply reruns the search query (results may have changed),
# which is the more common pattern for search.
__tablename__ = "search_query"
id: Mapped[UUID] = mapped_column(
PGUUID(as_uuid=True), primary_key=True, default=uuid4
)
user_id: Mapped[UUID] = mapped_column(PGUUID(as_uuid=True), ForeignKey("user.id"))
query: Mapped[str] = mapped_column(String)
query_expansions: Mapped[list[str] | None] = mapped_column(
postgresql.ARRAY(String), nullable=True
)
created_at: Mapped[datetime.datetime] = mapped_column(
DateTime(timezone=True), server_default=func.now()
)
"""
Feedback, Logging, Metrics Tables
"""
@@ -2605,6 +2635,7 @@ class Tool(Base):
__tablename__ = "tool"
id: Mapped[int] = mapped_column(Integer, primary_key=True)
# The name of the tool that the LLM will see
name: Mapped[str] = mapped_column(String, nullable=False)
description: Mapped[str] = mapped_column(Text, nullable=True)
# ID of the tool in the codebase, only applies for in-code tools.
@@ -2901,8 +2932,6 @@ class PersonaLabel(Base):
"Persona",
secondary=Persona__PersonaLabel.__table__,
back_populates="labels",
cascade="all, delete-orphan",
single_parent=True,
)
@@ -3008,6 +3037,124 @@ class SlackBot(Base):
)
class DiscordBotConfig(Base):
"""Global Discord bot configuration (one per tenant).
Stores the bot token when not provided via DISCORD_BOT_TOKEN env var.
Uses a fixed ID with check constraint to enforce only one row per tenant.
"""
__tablename__ = "discord_bot_config"
id: Mapped[str] = mapped_column(
String, primary_key=True, server_default=text("'SINGLETON'")
)
bot_token: Mapped[str] = mapped_column(EncryptedString(), nullable=False)
created_at: Mapped[datetime.datetime] = mapped_column(
DateTime(timezone=True), server_default=func.now(), nullable=False
)
class DiscordGuildConfig(Base):
"""Configuration for a Discord guild (server) connected to this tenant.
registration_key is a one-time key used to link a Discord server to this tenant.
Format: discord_<tenant_id>.<random_token>
guild_id is NULL until the Discord admin runs !register with the key.
"""
__tablename__ = "discord_guild_config"
id: Mapped[int] = mapped_column(primary_key=True)
# Discord snowflake - NULL until registered via command in Discord
guild_id: Mapped[int | None] = mapped_column(BigInteger, nullable=True, unique=True)
guild_name: Mapped[str | None] = mapped_column(String(256), nullable=True)
# One-time registration key: discord_<tenant_id>.<random_token>
registration_key: Mapped[str] = mapped_column(String, unique=True, nullable=False)
registered_at: Mapped[datetime.datetime | None] = mapped_column(
DateTime(timezone=True), nullable=True
)
# Configuration
default_persona_id: Mapped[int | None] = mapped_column(
ForeignKey("persona.id", ondelete="SET NULL"), nullable=True
)
enabled: Mapped[bool] = mapped_column(
Boolean, server_default=text("true"), nullable=False
)
# Relationships
default_persona: Mapped["Persona | None"] = relationship(
"Persona", foreign_keys=[default_persona_id]
)
channels: Mapped[list["DiscordChannelConfig"]] = relationship(
back_populates="guild_config", cascade="all, delete-orphan"
)
class DiscordChannelConfig(Base):
"""Per-channel configuration for Discord bot behavior.
Used to whitelist specific channels and configure per-channel behavior.
"""
__tablename__ = "discord_channel_config"
id: Mapped[int] = mapped_column(primary_key=True)
guild_config_id: Mapped[int] = mapped_column(
ForeignKey("discord_guild_config.id", ondelete="CASCADE"), nullable=False
)
# Discord snowflake
channel_id: Mapped[int] = mapped_column(BigInteger, nullable=False)
channel_name: Mapped[str] = mapped_column(String(), nullable=False)
# Channel type from Discord (text, forum)
channel_type: Mapped[str] = mapped_column(
String(20), server_default=text("'text'"), nullable=False
)
# True if @everyone cannot view the channel
is_private: Mapped[bool] = mapped_column(
Boolean, server_default=text("false"), nullable=False
)
# If true, bot only responds to messages in threads
# Otherwise, will reply in channel
thread_only_mode: Mapped[bool] = mapped_column(
Boolean, server_default=text("false"), nullable=False
)
# If true (default), bot only responds when @mentioned
# If false, bot responds to ALL messages in this channel
require_bot_invocation: Mapped[bool] = mapped_column(
Boolean, server_default=text("true"), nullable=False
)
# Override the guild's default persona for this channel
persona_override_id: Mapped[int | None] = mapped_column(
ForeignKey("persona.id", ondelete="SET NULL"), nullable=True
)
enabled: Mapped[bool] = mapped_column(
Boolean, server_default=text("false"), nullable=False
)
# Relationships
guild_config: Mapped["DiscordGuildConfig"] = relationship(back_populates="channels")
persona_override: Mapped["Persona | None"] = relationship()
# Constraints
__table_args__ = (
UniqueConstraint(
"guild_config_id", "channel_id", name="uq_discord_channel_guild_channel"
),
)
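A minimal sketch of the registration flow implied by these models, wiring together the helpers defined earlier in this diff (generating the key via secrets is an assumption; only the key format is documented):
import secrets
def registration_flow_example(db_session: Session, tenant_id: str) -> DiscordGuildConfig:
    # Admin-side: create a pending config; guild_id stays NULL for now.
    key = f"discord_{tenant_id}.{secrets.token_urlsafe(32)}"
    pending = create_guild_config(db_session, registration_key=key)
    # Bot-side: a Discord admin runs !register <key>; the bot resolves the
    # one-time key back to this tenant's pending config and binds the guild.
    config = get_guild_config_by_registration_key(db_session, key)
    assert config is not None and config.id == pending.id
    return register_guild(
        db_session, config, guild_id=123456789012345678, guild_name="Example Server"
    )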
class Milestone(Base):
# This table is used to track significant events for a deployment towards finding value
# The table is currently not used for features but it may be used in the future to inform
@@ -3085,25 +3232,6 @@ class FileRecord(Base):
)
class AgentSearchMetrics(Base):
__tablename__ = "agent__search_metrics"
id: Mapped[int] = mapped_column(primary_key=True)
user_id: Mapped[UUID | None] = mapped_column(
ForeignKey("user.id", ondelete="CASCADE"), nullable=True
)
persona_id: Mapped[int | None] = mapped_column(
ForeignKey("persona.id"), nullable=True
)
agent_type: Mapped[str] = mapped_column(String)
start_time: Mapped[datetime.datetime] = mapped_column(DateTime(timezone=True))
base_duration_s: Mapped[float] = mapped_column(Float)
full_duration_s: Mapped[float] = mapped_column(Float)
base_metrics: Mapped[JSON_ro] = mapped_column(postgresql.JSONB(), nullable=True)
refined_metrics: Mapped[JSON_ro] = mapped_column(postgresql.JSONB(), nullable=True)
all_metrics: Mapped[JSON_ro] = mapped_column(postgresql.JSONB(), nullable=True)
"""
************************************************************************
Enterprise Edition Models

View File

@@ -1,6 +1,11 @@
from datetime import datetime
from datetime import timezone
from uuid import UUID
from sqlalchemy import cast
from sqlalchemy import select
from sqlalchemy.dialects import postgresql
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import Session
from sqlalchemy.sql import func
@@ -17,23 +22,33 @@ def create_notification(
title: str,
description: str | None = None,
additional_data: dict | None = None,
autocommit: bool = True,
) -> Notification:
# Check if a notification of the same type and data exists.
# Previously, we only matched the first identical, undismissed notification.
# Now, we assume some uniqueness to notifications: if we previously issued a
# notification that was later dismissed, we do not issue a new one.
# Normalize additional_data to match the unique index behavior
# The index uses COALESCE(additional_data, '{}'::jsonb)
# We need to match this logic in our query
additional_data_normalized = additional_data if additional_data is not None else {}
existing_notification = (
db_session.query(Notification)
.filter_by(user_id=user_id, notif_type=notif_type)
.filter(
    func.coalesce(Notification.additional_data, cast({}, postgresql.JSONB))
    == additional_data_normalized
)
.first()
)
if existing_notification:
# Update the last_shown timestamp if the notification is not dismissed
if not existing_notification.dismissed:
    existing_notification.last_shown = func.now()
if autocommit:
    db_session.commit()
return existing_notification
# Create a new notification if none exists
@@ -48,7 +63,8 @@ def create_notification(
additional_data=additional_data,
)
db_session.add(notification)
if autocommit:
    db_session.commit()
return notification
@@ -81,6 +97,11 @@ def get_notifications(
query = query.where(Notification.dismissed.is_(False))
if notif_type:
query = query.where(Notification.notif_type == notif_type)
# Sort: undismissed first, then by date (newest first)
query = query.order_by(
Notification.dismissed.asc(),
Notification.first_shown.desc(),
)
return list(db_session.execute(query).scalars().all())
@@ -99,6 +120,63 @@ def dismiss_notification(notification: Notification, db_session: Session) -> Non
db_session.commit()
def batch_dismiss_notifications(
notifications: list[Notification],
db_session: Session,
) -> None:
for notification in notifications:
notification.dismissed = True
db_session.commit()
def batch_create_notifications(
user_ids: list[UUID],
notif_type: NotificationType,
db_session: Session,
title: str,
description: str | None = None,
additional_data: dict | None = None,
) -> int:
"""
Create notifications for multiple users in a single batch operation.
Uses ON CONFLICT DO NOTHING for atomic idempotent inserts - if a user already
has a notification with the same (user_id, notif_type, additional_data), the
insert is silently skipped.
Returns the number of notifications created.
Relies on unique index on (user_id, notif_type, COALESCE(additional_data, '{}'))
"""
if not user_ids:
return 0
now = datetime.now(timezone.utc)
# Use empty dict instead of None to match COALESCE behavior in the unique index
additional_data_normalized = additional_data if additional_data is not None else {}
values = [
{
"user_id": uid,
"notif_type": notif_type.value,
"title": title,
"description": description,
"dismissed": False,
"last_shown": now,
"first_shown": now,
"additional_data": additional_data_normalized,
}
for uid in user_ids
]
stmt = insert(Notification).values(values).on_conflict_do_nothing()
result = db_session.execute(stmt)
db_session.commit()
# rowcount returns number of rows inserted (excludes conflicts)
# CursorResult has rowcount but session.execute type hints are too broad
return result.rowcount if result.rowcount >= 0 else 0 # type: ignore[attr-defined]
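A minimal usage sketch, assuming NotificationType.RELEASE_NOTES (referenced later in this diff) and a previously fetched list of user IDs:
def notify_release_example(db_session: Session, user_ids: list[UUID]) -> int:
    created = batch_create_notifications(
        user_ids,
        NotificationType.RELEASE_NOTES,
        db_session,
        title="What's new in v2.7.0",
        description="Check out the changelog",
        additional_data={"version": "v2.7.0"},  # part of the dedup key
    )
    # Calling this again with identical arguments inserts nothing: the unique
    # index plus ON CONFLICT DO NOTHING makes the operation idempotent.
    return created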
def update_notification_last_shown(
notification: Notification, db_session: Session
) -> None:

View File

@@ -187,13 +187,25 @@ def _get_persona_by_name(
return result
def update_persona_access(
    persona_id: int,
    creator_user_id: UUID | None,
    db_session: Session,
    is_public: bool | None = None,
    user_ids: list[UUID] | None = None,
    group_ids: list[int] | None = None,
) -> None:
"""Updates the access settings for a persona including public status and user shares.
NOTE: Callers are responsible for committing."""
if is_public is not None:
persona = db_session.query(Persona).filter(Persona.id == persona_id).first()
if persona:
persona.is_public = is_public
# NOTE: For user-ids and group-ids, `None` means "leave unchanged", `[]` means "clear all shares",
# and a non-empty list means "replace with these shares".
if user_ids is not None:
db_session.query(Persona__User).filter(
Persona__User.persona_id == persona_id
@@ -212,11 +224,15 @@ def make_persona_private(
).model_dump(),
)
db_session.commit()
# MIT doesn't support group-based sharing, so we allow clearing (no-op since
# there shouldn't be any) but raise an error if trying to add actual groups.
if group_ids is not None:
db_session.query(Persona__UserGroup).filter(
Persona__UserGroup.persona_id == persona_id
).delete(synchronize_session="fetch")
if group_ids:
    raise NotImplementedError("Onyx MIT does not support group-based sharing")
def create_update_persona(
@@ -282,20 +298,21 @@ def create_update_persona(
llm_filter_extraction=create_persona_request.llm_filter_extraction,
is_default_persona=create_persona_request.is_default_persona,
user_file_ids=converted_user_file_ids,
commit=False,
)
versioned_update_persona_access = fetch_versioned_implementation(
    "onyx.db.persona", "update_persona_access"
)
versioned_update_persona_access(
    persona_id=persona.id,
    creator_user_id=user.id if user else None,
    db_session=db_session,
    user_ids=create_persona_request.users,
    group_ids=create_persona_request.groups,
)
db_session.commit()
except ValueError as e:
logger.exception("Failed to create persona")
@@ -304,11 +321,13 @@ def create_update_persona(
return FullPersonaSnapshot.from_model(persona)
def update_persona_shared(
    persona_id: int,
    user: User | None,
    db_session: Session,
    user_ids: list[UUID] | None = None,
    group_ids: list[int] | None = None,
    is_public: bool | None = None,
) -> None:
"""Simplified version of `create_update_persona` which only touches the
accessibility rather than any of the logic (e.g. prompt, connected data sources,
@@ -317,22 +336,25 @@ def update_persona_shared_users(
db_session=db_session, persona_id=persona_id, user=user, get_editable=True
)
if persona.is_public:
raise HTTPException(status_code=400, detail="Cannot share public persona")
if user and user.role != UserRole.ADMIN and persona.user_id != user.id:
raise HTTPException(
status_code=403, detail="You don't have permission to modify this persona"
)
versioned_update_persona_access = fetch_versioned_implementation(
    "onyx.db.persona", "update_persona_access"
)
versioned_update_persona_access(
    persona_id=persona_id,
    creator_user_id=user.id if user else None,
    db_session=db_session,
    is_public=is_public,
    user_ids=user_ids,
    group_ids=group_ids,
)
db_session.commit()
def update_persona_public_status(
persona_id: int,
@@ -895,7 +917,9 @@ def upsert_persona(
existing_persona.icon_name = icon_name
existing_persona.is_visible = is_visible
existing_persona.search_start_date = search_start_date
if label_ids is not None:
    existing_persona.labels.clear()
    existing_persona.labels = labels or []
existing_persona.is_default_persona = (
is_default_persona
if is_default_persona is not None

View File

@@ -0,0 +1,94 @@
"""Database functions for release notes functionality."""
from urllib.parse import urlencode
from sqlalchemy import select
from sqlalchemy.orm import Session
from onyx.auth.schemas import UserRole
from onyx.configs.app_configs import INSTANCE_TYPE
from onyx.configs.constants import DANSWER_API_KEY_DUMMY_EMAIL_DOMAIN
from onyx.configs.constants import NotificationType
from onyx.configs.constants import ONYX_UTM_SOURCE
from onyx.db.models import User
from onyx.db.notification import batch_create_notifications
from onyx.server.features.release_notes.constants import DOCS_CHANGELOG_BASE_URL
from onyx.server.features.release_notes.models import ReleaseNoteEntry
from onyx.utils.logger import setup_logger
logger = setup_logger()
def create_release_notifications_for_versions(
db_session: Session,
release_note_entries: list[ReleaseNoteEntry],
) -> int:
"""
Create release notes notifications for each release note entry.
Uses batch_create_notifications for efficient bulk insertion.
If a user already has a notification for a specific version (dismissed or not),
no new one is created (handled by unique constraint on additional_data).
Note: Entries should already be filtered by app_version before calling this
function. The filtering happens in _parse_mdx_to_release_note_entries().
Args:
db_session: Database session
release_note_entries: List of release note entries to notify about (pre-filtered)
Returns:
Total number of notifications created across all versions.
"""
if not release_note_entries:
logger.debug("No release note entries to notify about")
return 0
# Get active users and exclude API key users
user_ids = list(
db_session.scalars(
select(User.id).where( # type: ignore
User.is_active == True, # noqa: E712
User.role.notin_([UserRole.SLACK_USER, UserRole.EXT_PERM_USER]),
User.email.endswith(DANSWER_API_KEY_DUMMY_EMAIL_DOMAIN).is_(False), # type: ignore[attr-defined]
)
).all()
)
total_created = 0
for entry in release_note_entries:
# Convert version to anchor format for external docs links
# v2.7.0 -> v2-7-0
version_anchor = entry.version.replace(".", "-")
# Build UTM parameters for tracking
utm_params = {
"utm_source": ONYX_UTM_SOURCE,
"utm_medium": "notification",
"utm_campaign": INSTANCE_TYPE,
"utm_content": f"release_notes-{entry.version}",
}
link = f"{DOCS_CHANGELOG_BASE_URL}#{version_anchor}?{urlencode(utm_params)}"
additional_data: dict[str, str] = {
"version": entry.version,
"link": link,
}
created_count = batch_create_notifications(
user_ids,
NotificationType.RELEASE_NOTES,
db_session,
title=entry.title,
description=f"Check out what's new in {entry.version}",
additional_data=additional_data,
)
total_created += created_count
logger.debug(
f"Created {created_count} release notes notifications "
f"(version {entry.version}, {len(user_ids)} eligible users)"
)
return total_created
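A worked example of the link construction above (the UTM source value is a stand-in for ONYX_UTM_SOURCE; the query string is placed after the anchor fragment, mirroring the code above):
from urllib.parse import urlencode
def build_release_link(base_url: str, version: str, instance_type: str) -> str:
    anchor = version.replace(".", "-")  # v2.7.0 -> v2-7-0
    params = urlencode(
        {
            "utm_source": "onyx",
            "utm_medium": "notification",
            "utm_campaign": instance_type,
            "utm_content": f"release_notes-{version}",
        }
    )
    return f"{base_url}#{anchor}?{params}"
# e.g. build_release_link(DOCS_CHANGELOG_BASE_URL, "v2.7.0", INSTANCE_TYPE)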

View File

@@ -40,3 +40,10 @@ class DocumentRow(BaseModel):
class SortOrder(str, Enum):
ASC = "asc"
DESC = "desc"
class DiscordChannelView(BaseModel):
channel_id: int
channel_name: str
channel_type: str = "text" # text, forum
is_private: bool = False # True if @everyone cannot view the channel

View File

@@ -113,7 +113,6 @@ def upsert_web_search_provider(
if activate:
set_active_web_search_provider(provider_id=provider.id, db_session=db_session)
db_session.commit()
db_session.refresh(provider)
return provider
@@ -269,7 +268,6 @@ def upsert_web_content_provider(
if activate:
set_active_web_content_provider(provider_id=provider.id, db_session=db_session)
db_session.commit()
db_session.refresh(provider)
return provider

View File

@@ -21,7 +21,6 @@ from onyx.configs.constants import MessageType
from onyx.db.tools import get_tool_by_name
from onyx.deep_research.dr_mock_tools import get_clarification_tool_definitions
from onyx.deep_research.dr_mock_tools import get_orchestrator_tools
from onyx.deep_research.dr_mock_tools import RESEARCH_AGENT_DB_NAME
from onyx.deep_research.dr_mock_tools import RESEARCH_AGENT_TOOL_NAME
from onyx.deep_research.dr_mock_tools import THINK_TOOL_RESPONSE_MESSAGE
from onyx.deep_research.dr_mock_tools import THINK_TOOL_RESPONSE_TOKEN_COUNT
@@ -150,6 +149,9 @@ def generate_final_report(
is_deep_research=True,
)
# Save citation mapping to state_container so citations are persisted
state_container.set_citation_mapping(citation_processor.citation_to_doc)
final_report = llm_step_result.answer
if final_report is None:
raise ValueError("LLM failed to generate the final deep research report")
@@ -217,35 +219,90 @@ def run_deep_research_llm_loop(
else ""
)
if not skip_clarification:
clarification_prompt = CLARIFICATION_PROMPT.format(
current_datetime=get_current_llm_day_time(full_sentence=False),
internal_search_clarification_guidance=internal_search_clarification_guidance,
)
with function_span("clarification_step") as span:
clarification_prompt = CLARIFICATION_PROMPT.format(
current_datetime=get_current_llm_day_time(full_sentence=False),
internal_search_clarification_guidance=internal_search_clarification_guidance,
)
system_prompt = ChatMessageSimple(
message=clarification_prompt,
token_count=300, # Skips the exact token count but has enough leeway
message_type=MessageType.SYSTEM,
)
truncated_message_history = construct_message_history(
system_prompt=system_prompt,
custom_agent_prompt=None,
simple_chat_history=simple_chat_history,
reminder_message=None,
project_files=None,
available_tokens=available_tokens,
last_n_user_messages=MAX_USER_MESSAGES_FOR_CONTEXT,
)
llm_step_result, _ = run_llm_step(
emitter=emitter,
history=truncated_message_history,
tool_definitions=get_clarification_tool_definitions(),
tool_choice=ToolChoiceOptions.AUTO,
llm=llm,
placement=Placement(turn_index=0),
# No citations in this step, it should just pass through all
# tokens directly so initialized as an empty citation processor
citation_processor=None,
state_container=state_container,
final_documents=None,
user_identity=user_identity,
is_deep_research=True,
)
if not llm_step_result.tool_calls:
# Mark this turn as a clarification question
state_container.set_is_clarification(True)
span.span_data.output = "clarification_required"
emitter.emit(
Packet(
placement=Placement(turn_index=0),
obj=OverallStop(type="stop"),
)
)
# If a clarification is asked, we need to end this turn and wait on user input
return
#########################################################
# RESEARCH PLAN STEP
#########################################################
with function_span("research_plan_step") as span:
system_prompt = ChatMessageSimple(
message=clarification_prompt,
token_count=300, # Skips the exact token count but has enough leeway
message=RESEARCH_PLAN_PROMPT.format(
current_datetime=get_current_llm_day_time(full_sentence=False)
),
token_count=300,
message_type=MessageType.SYSTEM,
)
reminder_message = ChatMessageSimple(
message=RESEARCH_PLAN_REMINDER,
token_count=100,
message_type=MessageType.USER,
)
truncated_message_history = construct_message_history(
system_prompt=system_prompt,
custom_agent_prompt=None,
simple_chat_history=simple_chat_history,
simple_chat_history=simple_chat_history + [reminder_message],
reminder_message=None,
project_files=None,
available_tokens=available_tokens,
last_n_user_messages=MAX_USER_MESSAGES_FOR_CONTEXT,
last_n_user_messages=MAX_USER_MESSAGES_FOR_CONTEXT + 1,
)
llm_step_result, _ = run_llm_step(
emitter=emitter,
research_plan_generator = run_llm_step_pkt_generator(
history=truncated_message_history,
tool_definitions=get_clarification_tool_definitions(),
tool_choice=ToolChoiceOptions.AUTO,
tool_definitions=[],
tool_choice=ToolChoiceOptions.NONE,
llm=llm,
placement=Placement(turn_index=0),
# No citations in this step, it should just pass through all
# tokens directly so initialized as an empty citation processor
citation_processor=None,
state_container=state_container,
final_documents=None,
@@ -253,301 +310,177 @@ def run_deep_research_llm_loop(
is_deep_research=True,
)
if not llm_step_result.tool_calls:
# Mark this turn as a clarification question
state_container.set_is_clarification(True)
emitter.emit(
Packet(
placement=Placement(turn_index=0), obj=OverallStop(type="stop")
)
)
# If a clarification is asked, we need to end this turn and wait on user input
return
#########################################################
# RESEARCH PLAN STEP
#########################################################
system_prompt = ChatMessageSimple(
message=RESEARCH_PLAN_PROMPT.format(
current_datetime=get_current_llm_day_time(full_sentence=False)
),
token_count=300,
message_type=MessageType.SYSTEM,
)
reminder_message = ChatMessageSimple(
message=RESEARCH_PLAN_REMINDER,
token_count=100,
message_type=MessageType.USER,
)
truncated_message_history = construct_message_history(
system_prompt=system_prompt,
custom_agent_prompt=None,
simple_chat_history=simple_chat_history + [reminder_message],
reminder_message=None,
project_files=None,
available_tokens=available_tokens,
last_n_user_messages=MAX_USER_MESSAGES_FOR_CONTEXT + 1,
)
research_plan_generator = run_llm_step_pkt_generator(
history=truncated_message_history,
tool_definitions=[],
tool_choice=ToolChoiceOptions.NONE,
llm=llm,
placement=Placement(turn_index=0),
citation_processor=None,
state_container=state_container,
final_documents=None,
user_identity=user_identity,
is_deep_research=True,
)
while True:
try:
packet = next(research_plan_generator)
# Translate AgentResponseStart/Delta packets to DeepResearchPlanStart/Delta
# The LLM response from this prompt is the research plan
if isinstance(packet.obj, AgentResponseStart):
while True:
try:
packet = next(research_plan_generator)
# Translate AgentResponseStart/Delta packets to DeepResearchPlanStart/Delta
# The LLM response from this prompt is the research plan
if isinstance(packet.obj, AgentResponseStart):
emitter.emit(
Packet(
placement=packet.placement,
obj=DeepResearchPlanStart(),
)
)
elif isinstance(packet.obj, AgentResponseDelta):
emitter.emit(
Packet(
placement=packet.placement,
obj=DeepResearchPlanDelta(content=packet.obj.content),
)
)
else:
# Pass through other packet types (e.g., ReasoningStart, ReasoningDelta, etc.)
emitter.emit(packet)
except StopIteration as e:
llm_step_result, reasoned = e.value
emitter.emit(
Packet(
placement=packet.placement,
obj=DeepResearchPlanStart(),
# Marks the last turn end which should be the plan generation
placement=Placement(
turn_index=1 if reasoned else 0,
),
obj=SectionEnd(),
)
)
elif isinstance(packet.obj, AgentResponseDelta):
emitter.emit(
Packet(
placement=packet.placement,
obj=DeepResearchPlanDelta(content=packet.obj.content),
)
)
else:
# Pass through other packet types (e.g., ReasoningStart, ReasoningDelta, etc.)
emitter.emit(packet)
except StopIteration as e:
llm_step_result, reasoned = e.value
emitter.emit(
Packet(
# Marks the last turn end which should be the plan generation
placement=Placement(
turn_index=1 if reasoned else 0,
),
obj=SectionEnd(),
)
)
if reasoned:
orchestrator_start_turn_index += 1
break
llm_step_result = cast(LlmStepResult, llm_step_result)
if reasoned:
orchestrator_start_turn_index += 1
break
llm_step_result = cast(LlmStepResult, llm_step_result)
research_plan = llm_step_result.answer
research_plan = llm_step_result.answer
span.span_data.output = research_plan if research_plan else None
#########################################################
# RESEARCH EXECUTION STEP
#########################################################
is_reasoning_model = model_is_reasoning_model(
llm.config.model_name, llm.config.model_provider
)
with function_span("research_execution_step") as span:
is_reasoning_model = model_is_reasoning_model(
llm.config.model_name, llm.config.model_provider
)
max_orchestrator_cycles = (
MAX_ORCHESTRATOR_CYCLES
if not is_reasoning_model
else MAX_ORCHESTRATOR_CYCLES_REASONING
)
max_orchestrator_cycles = (
MAX_ORCHESTRATOR_CYCLES
if not is_reasoning_model
else MAX_ORCHESTRATOR_CYCLES_REASONING
)
orchestrator_prompt_template = (
ORCHESTRATOR_PROMPT
if not is_reasoning_model
else ORCHESTRATOR_PROMPT_REASONING
)
orchestrator_prompt_template = (
ORCHESTRATOR_PROMPT
if not is_reasoning_model
else ORCHESTRATOR_PROMPT_REASONING
)
internal_search_research_task_guidance = (
INTERNAL_SEARCH_RESEARCH_TASK_GUIDANCE
if include_internal_search_tunings
else ""
)
token_count_prompt = orchestrator_prompt_template.format(
current_datetime=get_current_llm_day_time(full_sentence=False),
current_cycle_count=1,
max_cycles=max_orchestrator_cycles,
research_plan=research_plan,
internal_search_research_task_guidance=internal_search_research_task_guidance,
)
orchestration_tokens = token_counter(token_count_prompt)
reasoning_cycles = 0
most_recent_reasoning: str | None = None
citation_mapping: CitationMapping = {}
final_turn_index: int = (
orchestrator_start_turn_index # Track the final turn_index for stop packet
)
for cycle in range(max_orchestrator_cycles):
if cycle == max_orchestrator_cycles - 1:
# If it's the last cycle, forcibly generate the final report
report_turn_index = (
orchestrator_start_turn_index + cycle + reasoning_cycles
)
report_reasoned = generate_final_report(
history=simple_chat_history,
llm=llm,
token_counter=token_counter,
state_container=state_container,
emitter=emitter,
turn_index=report_turn_index,
citation_mapping=citation_mapping,
user_identity=user_identity,
)
# Update final_turn_index: base + 1 for the report itself + 1 if reasoning occurred
final_turn_index = report_turn_index + (1 if report_reasoned else 0)
break
research_agent_calls: list[ToolCallKickoff] = []
orchestrator_prompt = orchestrator_prompt_template.format(
internal_search_research_task_guidance = (
INTERNAL_SEARCH_RESEARCH_TASK_GUIDANCE
if include_internal_search_tunings
else ""
)
token_count_prompt = orchestrator_prompt_template.format(
current_datetime=get_current_llm_day_time(full_sentence=False),
current_cycle_count=cycle,
current_cycle_count=1,
max_cycles=max_orchestrator_cycles,
research_plan=research_plan,
internal_search_research_task_guidance=internal_search_research_task_guidance,
)
orchestration_tokens = token_counter(token_count_prompt)
system_prompt = ChatMessageSimple(
message=orchestrator_prompt,
token_count=orchestration_tokens,
message_type=MessageType.SYSTEM,
reasoning_cycles = 0
most_recent_reasoning: str | None = None
citation_mapping: CitationMapping = {}
final_turn_index: int = (
orchestrator_start_turn_index # Track the final turn_index for stop packet
)
for cycle in range(max_orchestrator_cycles):
if cycle == max_orchestrator_cycles - 1:
# If it's the last cycle, forcibly generate the final report
report_turn_index = (
orchestrator_start_turn_index + cycle + reasoning_cycles
)
report_reasoned = generate_final_report(
history=simple_chat_history,
llm=llm,
token_counter=token_counter,
state_container=state_container,
emitter=emitter,
turn_index=report_turn_index,
citation_mapping=citation_mapping,
user_identity=user_identity,
)
# Update final_turn_index: base + 1 for the report itself + 1 if reasoning occurred
final_turn_index = report_turn_index + (1 if report_reasoned else 0)
break
truncated_message_history = construct_message_history(
system_prompt=system_prompt,
custom_agent_prompt=None,
simple_chat_history=simple_chat_history,
reminder_message=None,
project_files=None,
available_tokens=available_tokens,
last_n_user_messages=MAX_USER_MESSAGES_FOR_CONTEXT,
)
research_agent_calls: list[ToolCallKickoff] = []
# Use think tool processor for non-reasoning models to convert
# think_tool calls to reasoning content
custom_processor = (
create_think_tool_token_processor() if not is_reasoning_model else None
)
llm_step_result, has_reasoned = run_llm_step(
emitter=emitter,
history=truncated_message_history,
tool_definitions=get_orchestrator_tools(
include_think_tool=not is_reasoning_model
),
tool_choice=ToolChoiceOptions.REQUIRED,
llm=llm,
placement=Placement(
turn_index=orchestrator_start_turn_index + cycle + reasoning_cycles
),
# No citations in this step, it should just pass through all
# tokens directly so initialized as an empty citation processor
citation_processor=DynamicCitationProcessor(),
state_container=state_container,
final_documents=None,
user_identity=user_identity,
custom_token_processor=custom_processor,
is_deep_research=True,
)
if has_reasoned:
reasoning_cycles += 1
tool_calls = llm_step_result.tool_calls or []
if not tool_calls and cycle == 0:
raise RuntimeError(
"Deep Research failed to generate any research tasks for the agents."
orchestrator_prompt = orchestrator_prompt_template.format(
current_datetime=get_current_llm_day_time(full_sentence=False),
current_cycle_count=cycle,
max_cycles=max_orchestrator_cycles,
research_plan=research_plan,
internal_search_research_task_guidance=internal_search_research_task_guidance,
)
if not tool_calls:
# Basically hope that this is an infrequent occurrence and that multiple research
# cycles have already run
logger.warning("No tool calls found, this should not happen.")
report_turn_index = (
orchestrator_start_turn_index + cycle + reasoning_cycles
system_prompt = ChatMessageSimple(
message=orchestrator_prompt,
token_count=orchestration_tokens,
message_type=MessageType.SYSTEM,
)
report_reasoned = generate_final_report(
history=simple_chat_history,
llm=llm,
token_counter=token_counter,
state_container=state_container,
truncated_message_history = construct_message_history(
system_prompt=system_prompt,
custom_agent_prompt=None,
simple_chat_history=simple_chat_history,
reminder_message=None,
project_files=None,
available_tokens=available_tokens,
last_n_user_messages=MAX_USER_MESSAGES_FOR_CONTEXT,
)
# Use think tool processor for non-reasoning models to convert
# think_tool calls to reasoning content
custom_processor = (
create_think_tool_token_processor()
if not is_reasoning_model
else None
)
llm_step_result, has_reasoned = run_llm_step(
emitter=emitter,
turn_index=report_turn_index,
citation_mapping=citation_mapping,
user_identity=user_identity,
)
final_turn_index = report_turn_index + (1 if report_reasoned else 0)
break
special_tool_calls = check_special_tool_calls(tool_calls=tool_calls)
if special_tool_calls.generate_report_tool_call:
report_turn_index = (
special_tool_calls.generate_report_tool_call.placement.turn_index
)
report_reasoned = generate_final_report(
history=simple_chat_history,
history=truncated_message_history,
tool_definitions=get_orchestrator_tools(
include_think_tool=not is_reasoning_model
),
tool_choice=ToolChoiceOptions.REQUIRED,
llm=llm,
token_counter=token_counter,
placement=Placement(
turn_index=orchestrator_start_turn_index
+ cycle
+ reasoning_cycles
),
# No citations in this step, it should just pass through all
# tokens directly so initialized as an empty citation processor
citation_processor=DynamicCitationProcessor(),
state_container=state_container,
emitter=emitter,
turn_index=report_turn_index,
citation_mapping=citation_mapping,
final_documents=None,
user_identity=user_identity,
saved_reasoning=most_recent_reasoning,
custom_token_processor=custom_processor,
is_deep_research=True,
)
final_turn_index = report_turn_index + (1 if report_reasoned else 0)
break
elif special_tool_calls.think_tool_call:
think_tool_call = special_tool_calls.think_tool_call
# Only process the THINK_TOOL and skip all other tool calls
# This will not actually get saved to the db as a tool call but we'll attach it to the tool(s) called after
# it as if it were just a reasoning model doing it. In the chat history, because it happens in 2 steps,
# we will show it as a separate message.
# NOTE: This does not need to increment the reasoning cycles because the custom token processor causes
# the LLM step to handle this
with function_span("think_tool") as span:
span.span_data.input = str(think_tool_call.tool_args)
most_recent_reasoning = state_container.reasoning_tokens
tool_call_message = think_tool_call.to_msg_str()
if has_reasoned:
reasoning_cycles += 1
think_tool_msg = ChatMessageSimple(
message=tool_call_message,
token_count=token_counter(tool_call_message),
message_type=MessageType.TOOL_CALL,
tool_call_id=think_tool_call.tool_call_id,
image_files=None,
tool_calls = llm_step_result.tool_calls or []
if not tool_calls and cycle == 0:
raise RuntimeError(
"Deep Research failed to generate any research tasks for the agents."
)
simple_chat_history.append(think_tool_msg)
think_tool_response_msg = ChatMessageSimple(
message=THINK_TOOL_RESPONSE_MESSAGE,
token_count=THINK_TOOL_RESPONSE_TOKEN_COUNT,
message_type=MessageType.TOOL_CALL_RESPONSE,
tool_call_id=think_tool_call.tool_call_id,
image_files=None,
)
simple_chat_history.append(think_tool_response_msg)
span.span_data.output = THINK_TOOL_RESPONSE_MESSAGE
continue
else:
for tool_call in tool_calls:
if tool_call.tool_name != RESEARCH_AGENT_TOOL_NAME:
logger.warning(f"Unexpected tool call: {tool_call.tool_name}")
continue
research_agent_calls.append(tool_call)
if not research_agent_calls:
logger.warning(
"No research agent tool calls found, this should not happen."
)
if not tool_calls:
# Basically hope that this is an infrequent occurrence and that multiple research
# cycles have already run
logger.warning("No tool calls found, this should not happen.")
report_turn_index = (
orchestrator_start_turn_index + cycle + reasoning_cycles
)
@@ -564,91 +497,177 @@ def run_deep_research_llm_loop(
final_turn_index = report_turn_index + (1 if report_reasoned else 0)
break
if len(research_agent_calls) > 1:
emitter.emit(
Packet(
placement=Placement(
turn_index=research_agent_calls[0].placement.turn_index
),
obj=TopLevelBranching(
num_parallel_branches=len(research_agent_calls)
),
special_tool_calls = check_special_tool_calls(tool_calls=tool_calls)
if special_tool_calls.generate_report_tool_call:
report_turn_index = (
special_tool_calls.generate_report_tool_call.placement.turn_index
)
report_reasoned = generate_final_report(
history=simple_chat_history,
llm=llm,
token_counter=token_counter,
state_container=state_container,
emitter=emitter,
turn_index=report_turn_index,
citation_mapping=citation_mapping,
user_identity=user_identity,
saved_reasoning=most_recent_reasoning,
)
final_turn_index = report_turn_index + (1 if report_reasoned else 0)
break
elif special_tool_calls.think_tool_call:
think_tool_call = special_tool_calls.think_tool_call
# Only process the THINK_TOOL and skip all other tool calls
# This will not actually get saved to the db as a tool call but we'll attach it to the tool(s) called after
# it as if it were just a reasoning model doing it. In the chat history, because it happens in 2 steps,
# we will show it as a separate message.
# NOTE: This does not need to increment the reasoning cycles because the custom token processor causes
# the LLM step to handle this
with function_span("think_tool") as span:
span.span_data.input = str(think_tool_call.tool_args)
most_recent_reasoning = state_container.reasoning_tokens
tool_call_message = think_tool_call.to_msg_str()
think_tool_msg = ChatMessageSimple(
message=tool_call_message,
token_count=token_counter(tool_call_message),
message_type=MessageType.TOOL_CALL,
tool_call_id=think_tool_call.tool_call_id,
image_files=None,
)
)
simple_chat_history.append(think_tool_msg)
research_results = run_research_agent_calls(
# The tool calls here contain the placement information
research_agent_calls=research_agent_calls,
parent_tool_call_ids=[
tool_call.tool_call_id for tool_call in tool_calls
],
tools=allowed_tools,
emitter=emitter,
state_container=state_container,
llm=llm,
is_reasoning_model=is_reasoning_model,
token_counter=token_counter,
citation_mapping=citation_mapping,
user_identity=user_identity,
)
citation_mapping = research_results.citation_mapping
for tab_index, report in enumerate(
research_results.intermediate_reports
):
if report is None:
# The LLM will not see that this research was even attempted, it may try
# something similar again but this is not bad.
logger.error(
f"Research agent call at tab_index {tab_index} failed, skipping"
think_tool_response_msg = ChatMessageSimple(
message=THINK_TOOL_RESPONSE_MESSAGE,
token_count=THINK_TOOL_RESPONSE_TOKEN_COUNT,
message_type=MessageType.TOOL_CALL_RESPONSE,
tool_call_id=think_tool_call.tool_call_id,
image_files=None,
)
continue
simple_chat_history.append(think_tool_response_msg)
span.span_data.output = THINK_TOOL_RESPONSE_MESSAGE
continue
else:
for tool_call in tool_calls:
if tool_call.tool_name != RESEARCH_AGENT_TOOL_NAME:
logger.warning(
f"Unexpected tool call: {tool_call.tool_name}"
)
continue
current_tool_call = research_agent_calls[tab_index]
tool_call_info = ToolCallInfo(
parent_tool_call_id=None,
turn_index=orchestrator_start_turn_index
+ cycle
+ reasoning_cycles,
tab_index=tab_index,
tool_name=current_tool_call.tool_name,
tool_call_id=current_tool_call.tool_call_id,
tool_id=get_tool_by_name(
tool_name=RESEARCH_AGENT_DB_NAME, db_session=db_session
).id,
reasoning_tokens=llm_step_result.reasoning
or most_recent_reasoning,
tool_call_arguments=current_tool_call.tool_args,
tool_call_response=report,
search_docs=None, # Intermediate docs are not saved/shown
generated_images=None,
research_agent_calls.append(tool_call)
if not research_agent_calls:
logger.warning(
"No research agent tool calls found, this should not happen."
)
report_turn_index = (
orchestrator_start_turn_index + cycle + reasoning_cycles
)
report_reasoned = generate_final_report(
history=simple_chat_history,
llm=llm,
token_counter=token_counter,
state_container=state_container,
emitter=emitter,
turn_index=report_turn_index,
citation_mapping=citation_mapping,
user_identity=user_identity,
)
final_turn_index = report_turn_index + (
1 if report_reasoned else 0
)
break
if len(research_agent_calls) > 1:
emitter.emit(
Packet(
placement=Placement(
turn_index=research_agent_calls[
0
].placement.turn_index
),
obj=TopLevelBranching(
num_parallel_branches=len(research_agent_calls)
),
)
)
research_results = run_research_agent_calls(
# The tool calls here contain the placement information
research_agent_calls=research_agent_calls,
parent_tool_call_ids=[
tool_call.tool_call_id for tool_call in tool_calls
],
tools=allowed_tools,
emitter=emitter,
state_container=state_container,
llm=llm,
is_reasoning_model=is_reasoning_model,
token_counter=token_counter,
citation_mapping=citation_mapping,
user_identity=user_identity,
)
state_container.add_tool_call(tool_call_info)
tool_call_message = current_tool_call.to_msg_str()
tool_call_token_count = token_counter(tool_call_message)
citation_mapping = research_results.citation_mapping
tool_call_msg = ChatMessageSimple(
message=tool_call_message,
token_count=tool_call_token_count,
message_type=MessageType.TOOL_CALL,
tool_call_id=current_tool_call.tool_call_id,
image_files=None,
)
simple_chat_history.append(tool_call_msg)
for tab_index, report in enumerate(
research_results.intermediate_reports
):
if report is None:
# The LLM will not see that this research was even attempted, it may try
# something similar again but this is not bad.
logger.error(
f"Research agent call at tab_index {tab_index} failed, skipping"
)
continue
tool_call_response_msg = ChatMessageSimple(
message=report,
token_count=token_counter(report),
message_type=MessageType.TOOL_CALL_RESPONSE,
tool_call_id=current_tool_call.tool_call_id,
image_files=None,
)
simple_chat_history.append(tool_call_response_msg)
current_tool_call = research_agent_calls[tab_index]
tool_call_info = ToolCallInfo(
parent_tool_call_id=None,
turn_index=orchestrator_start_turn_index
+ cycle
+ reasoning_cycles,
tab_index=tab_index,
tool_name=current_tool_call.tool_name,
tool_call_id=current_tool_call.tool_call_id,
tool_id=get_tool_by_name(
tool_name=RESEARCH_AGENT_TOOL_NAME,
db_session=db_session,
).id,
reasoning_tokens=llm_step_result.reasoning
or most_recent_reasoning,
tool_call_arguments=current_tool_call.tool_args,
tool_call_response=report,
search_docs=None, # Intermediate docs are not saved/shown
generated_images=None,
)
state_container.add_tool_call(tool_call_info)
# If it reached this point, it did not call reasoning, so here we wipe it to not save it to multiple turns
most_recent_reasoning = None
tool_call_message = current_tool_call.to_msg_str()
tool_call_token_count = token_counter(tool_call_message)
tool_call_msg = ChatMessageSimple(
message=tool_call_message,
token_count=tool_call_token_count,
message_type=MessageType.TOOL_CALL,
tool_call_id=current_tool_call.tool_call_id,
image_files=None,
)
simple_chat_history.append(tool_call_msg)
tool_call_response_msg = ChatMessageSimple(
message=report,
token_count=token_counter(report),
message_type=MessageType.TOOL_CALL_RESPONSE,
tool_call_id=current_tool_call.tool_call_id,
image_files=None,
)
simple_chat_history.append(tool_call_response_msg)
# If it reached this point, it did not call reasoning, so here we wipe it to not save it to multiple turns
most_recent_reasoning = None
emitter.emit(
Packet(

View File

@@ -1,6 +1,6 @@
GENERATE_PLAN_TOOL_NAME = "generate_plan"
RESEARCH_AGENT_DB_NAME = "ResearchAgent"
RESEARCH_AGENT_IN_CODE_ID = "ResearchAgent"
RESEARCH_AGENT_TOOL_NAME = "research_agent"
RESEARCH_AGENT_TASK_KEY = "task"

View File

@@ -0,0 +1,101 @@
from onyx.configs.app_configs import BLURB_SIZE
from onyx.configs.constants import RETURN_SEPARATOR
from onyx.context.search.models import InferenceChunk
from onyx.context.search.models import InferenceChunkUncleaned
from onyx.indexing.models import DocMetadataAwareIndexChunk


def generate_enriched_content_for_chunk(chunk: DocMetadataAwareIndexChunk) -> str:
    return f"{chunk.title_prefix}{chunk.doc_summary}{chunk.content}{chunk.chunk_context}{chunk.metadata_suffix_keyword}"


def cleanup_content_for_chunks(
    chunks: list[InferenceChunkUncleaned],
) -> list[InferenceChunk]:
    """
    Removes indexing-time content additions from chunks. Inverse of
    generate_enriched_content_for_chunk.

    During indexing, chunks are augmented with additional text to improve search
    quality:
    - Title prepended to content (for better keyword/semantic matching)
    - Metadata suffix appended to content
    - Contextual RAG: doc_summary (beginning) and chunk_context (end)

    This function strips these additions before returning chunks to users,
    restoring the original document content. Cleaning is applied in sequence:
    1. Title removal:
       - Full match: Strips exact title from beginning
       - Partial match: If content starts with title[:BLURB_SIZE], splits on
         RETURN_SEPARATOR to remove title section
    2. Metadata suffix removal:
       - Strips metadata_suffix from end, plus trailing RETURN_SEPARATOR
    3. Contextual RAG removal:
       - Strips doc_summary from beginning (if present)
       - Strips chunk_context from end (if present)

    TODO(andrei): This entire function is not that fantastic, clean it up during
    QA before rolling out OpenSearch.

    Args:
        chunks: Chunks as retrieved from the document index with indexing
            augmentations intact.

    Returns:
        Clean InferenceChunk objects with augmentations removed, containing only
        the original document content that should be shown to users.
    """

    def _remove_title(chunk: InferenceChunkUncleaned) -> str:
        # TODO(andrei): This was ported over from
        # backend/onyx/document_index/vespa/vespa_document_index.py but I don't
        # think this logic is correct. In Vespa at least we set the title field
        # from the output of get_title_for_document_index, which is not
        # necessarily the same data that is prepended to the content; that comes
        # from title_prefix.
        # This was added in
        # https://github.com/onyx-dot-app/onyx/commit/e90c66c1b61c5b7da949652d703f7c906863e6e4#diff-2a2a29d5929de75cdaea77867a397934d9f8b785ce40a861c0d704033e3663ab,
        # see postprocessing.py. At that time the content enrichment logic was
        # also added in that commit, see
        # https://github.com/onyx-dot-app/onyx/commit/e90c66c1b61c5b7da949652d703f7c906863e6e4#diff-d807718aa263a15c1d991a4ab063c360c8419eaad210b4ba70e1e9f47d2aa6d2R77
        # chunker.py.
        if not chunk.title or not chunk.content:
            return chunk.content

        if chunk.content.startswith(chunk.title):
            return chunk.content[len(chunk.title) :].lstrip()

        # BLURB_SIZE is measured in tokens rather than chars, but each token is
        # at least 1 char.
        # If this prefix matches the content, it's assumed the title was prepended
        if chunk.content.startswith(chunk.title[:BLURB_SIZE]):
            return (
                chunk.content.split(RETURN_SEPARATOR, 1)[-1]
                if RETURN_SEPARATOR in chunk.content
                else chunk.content
            )

        return chunk.content

    def _remove_metadata_suffix(chunk: InferenceChunkUncleaned) -> str:
        if not chunk.metadata_suffix:
            return chunk.content
        return chunk.content.removesuffix(chunk.metadata_suffix).rstrip(
            RETURN_SEPARATOR
        )

    def _remove_contextual_rag(chunk: InferenceChunkUncleaned) -> str:
        # remove document summary
        if chunk.doc_summary and chunk.content.startswith(chunk.doc_summary):
            chunk.content = chunk.content[len(chunk.doc_summary) :].lstrip()
        # remove chunk context
        if chunk.chunk_context and chunk.content.endswith(chunk.chunk_context):
            chunk.content = chunk.content[
                : len(chunk.content) - len(chunk.chunk_context)
            ].rstrip()
        return chunk.content

    for chunk in chunks:
        chunk.content = _remove_title(chunk)
        chunk.content = _remove_metadata_suffix(chunk)
        chunk.content = _remove_contextual_rag(chunk)

    return [chunk.to_inference_chunk() for chunk in chunks]
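
To make the inverse relationship concrete, here is a minimal round-trip sketch. `FakeChunk` is a stand-in for the real chunk
models, and the `RETURN_SEPARATOR` value is an assumption for illustration; the cleanup steps mirror the helpers above.

```python
RETURN_SEPARATOR = "\n\r\n"  # assumed stand-in for onyx.configs.constants


class FakeChunk:
    """Stand-in with just the fields the enrichment/cleanup logic touches."""

    def __init__(self) -> None:
        self.title = "Quarterly Report"
        self.title_prefix = "Quarterly Report" + RETURN_SEPARATOR
        self.doc_summary = "Summary of Q3 results. "
        self.content = "Revenue grew 12% quarter over quarter."
        self.chunk_context = " This chunk covers the revenue section."
        self.metadata_suffix = "department: finance"
        self.metadata_suffix_keyword = RETURN_SEPARATOR + "department: finance"


chunk = FakeChunk()
original = chunk.content

# Indexing-time enrichment, as in generate_enriched_content_for_chunk.
chunk.content = (
    f"{chunk.title_prefix}{chunk.doc_summary}{chunk.content}"
    f"{chunk.chunk_context}{chunk.metadata_suffix_keyword}"
)

# Query-time cleanup: title, then metadata suffix, then contextual RAG.
if chunk.content.startswith(chunk.title):
    chunk.content = chunk.content[len(chunk.title) :].lstrip()
chunk.content = chunk.content.removesuffix(chunk.metadata_suffix).rstrip(RETURN_SEPARATOR)
if chunk.content.startswith(chunk.doc_summary):
    chunk.content = chunk.content[len(chunk.doc_summary) :].lstrip()
if chunk.content.endswith(chunk.chunk_context):
    chunk.content = chunk.content[: -len(chunk.chunk_context)].rstrip()

assert chunk.content == original
```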

View File

@@ -167,9 +167,9 @@ class IndexRetrievalFilters(BaseModel):
class SchemaVerifiable(abc.ABC):
"""
Class must implement document index schema verification. For example, verify that all of the
necessary attributes for indexing, querying, filtering, and fields to return from search are
all valid in the schema.
Class must implement document index schema verification. For example, verify
that all of the necessary attributes for indexing, querying, filtering, and
fields to return from search are all valid in the schema.
"""
@abc.abstractmethod
@@ -179,13 +179,18 @@ class SchemaVerifiable(abc.ABC):
embedding_precision: EmbeddingPrecision,
) -> None:
"""
Verify that the document index exists and is consistent with the expectations in the code. For certain search
engines, the schema needs to be created before indexing can happen. This call should create the schema if it
does not exist.
Verifies that the document index exists and is consistent with the
expectations in the code.
Parameters:
- embedding_dim: Vector dimensionality for the vector similarity part of the search
- embedding_precision: Precision of the vector similarity part of the search
For certain search engines, the schema needs to be created before
indexing can happen. This call should create the schema if it does not
exist.
Args:
embedding_dim: Vector dimensionality for the vector similarity part
of the search.
embedding_precision: Precision of the values of the vectors for the
similarity part of the search.
"""
raise NotImplementedError
@@ -238,8 +243,8 @@ class Deletable(abc.ABC):
@abc.abstractmethod
def delete(
self,
# TODO(andrei): Fine for now but this can probably be a batch operation that
# takes in a list of IDs.
# TODO(andrei): Fine for now but this can probably be a batch operation
# that takes in a list of IDs.
document_id: str,
chunk_count: int | None = None,
# TODO(andrei): Shouldn't this also have some acl filtering at minimum?
@@ -283,10 +288,7 @@ class Updatable(abc.ABC):
self,
update_requests: list[MetadataUpdateRequest],
) -> None:
"""
Updates some set of chunks. The document and fields to update are specified in the update
requests. Each update request in the list applies its changes to a list of document ids.
None values mean that the field does not need an update.
"""Updates some set of chunks.
The document and fields to update are specified in the update requests.
Each update request in the list applies its changes to a list of

View File

@@ -0,0 +1,62 @@
# OpenSearch Idiosyncrasies

## How it works at a high level

OpenSearch executes queries in 2 phases, a `Search` phase and a `Fetch` phase. The `Search` phase works by computing the document
scores on each shard separately; a `Fetch` phase then typically grabs all of the relevant fields/data for returning to the user.
There is also an intermediate phase (seemingly built specifically to handle hybrid search queries) which can run in between as a
processor; a sketch of wiring one up follows the references below.

References:
https://docs.opensearch.org/latest/search-plugins/search-pipelines/search-processors/
https://docs.opensearch.org/latest/search-plugins/search-pipelines/normalization-processor/
https://docs.opensearch.org/latest/query-dsl/compound/hybrid/
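
For concreteness, here is a rough sketch of registering a search pipeline with the normalization processor and issuing a hybrid
query through it over the REST API. The endpoint, index name (`docs-index`), field names (`content`, `embedding`), vector size,
and weights are made-up placeholders, not values from the Onyx codebase.

```python
import requests

HOST = "http://localhost:9200"  # placeholder endpoint

# Register a search pipeline whose phase-results processor normalizes and
# combines the per-sub-query scores between the Search and Fetch phases.
requests.put(
    f"{HOST}/_search/pipeline/hybrid-norm-pipeline",
    json={
        "phase_results_processors": [
            {
                "normalization-processor": {
                    "normalization": {"technique": "min_max"},
                    "combination": {
                        "technique": "arithmetic_mean",
                        "parameters": {"weights": [0.3, 0.7]},
                    },
                }
            }
        ]
    },
)

# A hybrid query: each sub-query runs its own independent Search phase, and the
# pipeline above merges their scores before the Fetch phase.
requests.post(
    f"{HOST}/docs-index/_search",
    params={"search_pipeline": "hybrid-norm-pipeline"},
    json={
        "query": {
            "hybrid": {
                "queries": [
                    {"match": {"content": "quarterly revenue growth"}},
                    {"knn": {"embedding": {"vector": [0.1] * 768, "k": 50}}},
                ]
            }
        }
    },
)
```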
## How Hybrid queries work

Hybrid queries are basically parallel queries that each run through their own `Search` phase and do not interact in any way.
They also run across all the shards. It is not entirely clear what happens if a combination pipeline is not specified for them;
perhaps the scores are just summed.

When the normalization processor is applied to keyword/vector hybrid searches, a document that shows up due to a keyword match may
not also have shown up in the vector search, and vice versa. In these situations, the document just receives a 0 score for the
missing query component. OpenSearch does not run another phase to recapture those missing values. The impact of this is that after
normalizing, the missing scores are filled in as 0, but this is effectively a higher standing than if the document had actually
received its true non-zero score.

This may not be immediately obvious, so an explanation is included here. If the document had received a non-zero score instead,
that score must be lower than all of the other scores in the list (otherwise the document would have shown up). Therefore it would
impact the normalization and push the other scores higher, so that it is not only still the lowest score, but now a clearly
differentiated lowest score. This is not strictly the case in a multi-node setup, but the high-level concept approximately holds.
So basically the 0 score is a form of "minimum value clipping"; the numeric sketch below illustrates the effect.
## On time decay and boosting

Embedding model scores do not have a uniform distribution from 0 to 1. The values typically cluster strongly around 0.6 to 0.8,
but this also varies between models and even between queries. It is not safe to assume the scores are pre-normalized, so we also
cannot apply any additive or multiplicative boost to them. I.e. if the results for a query cluster around 0.6 to 0.8 and we give
a 50% penalty to a score, it doesn't bring a result from the top of the range to the 50th percentile; it brings it under the 0.6,
and it is now the worst match. The same logic applies to additive boosting.

So these boosts can only be applied after normalization. Unfortunately, with OpenSearch the normalization processor runs last
and only applies to the results of the completely independent `Search` phase queries. So if a time-based boost (a separate
query which filters on recently updated documents) is added, it would not be able to introduce any new documents to the set
(since the new documents would have no keyword/vector score or would already be present), because the 0 scores on keyword and
vector would make the docs which only came in because of the time filter score very low. It can, however, make some of the
lower-scored documents from the union of all the `Search` phase documents show up higher and potentially not get dropped before
being fetched and returned to the user. But there are other issues with including these:
- There is no way to sort by this field, only a filter, so there is no way to guarantee the best docs are captured, even
  irrespective of the contents. If there are lots of updates, this may miss some of the most recently updated documents.
- There is not a good way to normalize this field; the best option is to clip it on the bottom.
- This would require using min-max normalization, but z-score normalization is better for the other score components because it
  is less sensitive to outliers, better handles distribution drift (min-max assumes stable, meaningful ranges), and is better
  for comparing "unusual-ness" across distributions.

So while it is possible to apply time-based boosting at the normalization stage (or specifically to the keyword score), we have
decided it is better to not apply it during the OpenSearch query. Because of these limitations, Onyx applies further refinements,
boosts, etc. in code, with OpenSearch providing an initial filtering; a sketch of that post-retrieval step follows below. The
impact of time decay and boost should not be so big that we would need orders of magnitude more results back from OpenSearch.
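
A minimal sketch of what such an in-code refinement could look like, assuming z-score normalization of the OpenSearch scores and
an additive, bottom-clipped recency boost. The decay shape, half-life, and boost cap are illustrative placeholders, not the
actual Onyx implementation.

```python
import math
import time
from statistics import mean, pstdev


def z_score(scores: list[float]) -> list[float]:
    mu, sigma = mean(scores), pstdev(scores)
    return [(s - mu) / sigma if sigma else 0.0 for s in scores]


def recency_boost(updated_at: float, half_life_days: float = 180.0, max_boost: float = 0.5) -> float:
    # Additive boost that decays exponentially with document age and bottoms
    # out at 0, so stale docs are never pushed below their content-based score.
    age_days = max(0.0, (time.time() - updated_at) / 86400)
    return max_boost * math.pow(0.5, age_days / half_life_days)


# (doc_id, raw combined score, last-updated epoch) rows back from OpenSearch,
# which serves purely as the initial candidate filter.
hits = [
    ("doc-a", 1.8, time.time() - 10 * 86400),
    ("doc-b", 2.1, time.time() - 700 * 86400),
]
normalized = z_score([score for _, score, _ in hits])
reranked = sorted(
    ((doc_id, z + recency_boost(updated)) for (doc_id, _, updated), z in zip(hits, normalized)),
    key=lambda pair: pair[1],
    reverse=True,
)
print(reranked)  # recent docs get a nudge upward without dominating the ranking
```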
## Other concepts to be aware of

Within the `Search` phase, there are optional steps like Rescore, but these are not useful for the combination/normalization
work that is relevant for hybrid search. Since the Rescore happens prior to normalization, it is not able to provide any
meaningful operations to the query for our usage.

Because the Title is included in the Contents for both embedding and keyword searches, the Title scores are very low relative to
the actual full-contents scoring. The Title is treated as a boost rather than a core scoring component. Time decay works
similarly.

Some files were not shown because too many files have changed in this diff.