Compare commits


4 Commits

Author  SHA1        Message     Date
Weves   98bd71a796  .           2025-12-12 08:20:00 -10:00
Weves   f3462414b7  .           2025-12-11 14:52:24 -10:00
Weves   0897e57d2d  .           2025-12-11 14:51:11 -10:00
Weves   5a4c2bb263  avatars v0  2025-12-11 14:22:14 -10:00
632 changed files with 12378 additions and 16063 deletions

View File

@@ -0,0 +1,33 @@
name: Check Lazy Imports
concurrency:
group: Check-Lazy-Imports-${{ github.workflow }}-${{ github.head_ref || github.event.workflow_run.head_branch || github.run_id }}
cancel-in-progress: true
on:
merge_group:
pull_request:
branches:
- main
- 'release/**'
permissions:
contents: read
jobs:
check-lazy-imports:
runs-on: ubuntu-latest
timeout-minutes: 45
steps:
- name: Checkout code
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
- name: Set up Python
uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # ratchet:actions/setup-python@v6
with:
python-version: '3.11'
- name: Check lazy imports
run: python3 backend/scripts/check_lazy_imports.py
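
Note: the diff only shows the workflow invoking backend/scripts/check_lazy_imports.py, not the script itself. As a rough sketch of what such a checker might do — the module names and rules below are assumptions, not the script's actual logic — assuming it fails the build on top-level imports of heavy packages:

import pathlib
import re
import sys

# Hypothetical deny-list; the real script's rules are not shown in this diff.
HEAVY_MODULES = {"torch", "transformers", "playwright"}

# Matches only column-0 imports, i.e. module-level (non-lazy) imports.
TOP_LEVEL_IMPORT = re.compile(r"^(?:import|from)\s+([A-Za-z_][A-Za-z0-9_]*)")

def check_file(path: pathlib.Path) -> list[str]:
    violations = []
    for lineno, line in enumerate(path.read_text().splitlines(), 1):
        match = TOP_LEVEL_IMPORT.match(line)
        if match and match.group(1) in HEAVY_MODULES:
            violations.append(f"{path}:{lineno}: top-level import of {match.group(1)}")
    return violations

if __name__ == "__main__":
    problems = [
        msg
        for path in pathlib.Path("backend").rglob("*.py")
        if ".venv" not in path.parts
        for msg in check_file(path)
    ]
    print("\n".join(problems))
    sys.exit(1 if problems else 0)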

View File

@@ -89,10 +89,9 @@ jobs:
if: ${{ !startsWith(github.ref_name, 'nightly-latest') && github.event_name != 'workflow_dispatch' }}
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
fetch-depth: 0
- name: Setup uv
uses: astral-sh/setup-uv@1e862dfacbd1d6d858c55d9b792c756523627244 # ratchet:astral-sh/setup-uv@v7.1.4
@@ -112,7 +111,7 @@ jobs:
timeout-minutes: 10
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -141,7 +140,7 @@ jobs:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -199,7 +198,7 @@ jobs:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -307,7 +306,7 @@ jobs:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -373,7 +372,7 @@ jobs:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -486,7 +485,7 @@ jobs:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -543,7 +542,7 @@ jobs:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -651,7 +650,7 @@ jobs:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -715,7 +714,7 @@ jobs:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -908,7 +907,7 @@ jobs:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -998,7 +997,7 @@ jobs:
timeout-minutes: 90
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false

View File

@@ -15,7 +15,7 @@ jobs:
timeout-minutes: 45
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
fetch-depth: 0
persist-credentials: false

View File

@@ -28,7 +28,7 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false

View File

@@ -52,7 +52,7 @@ jobs:
test-dirs: ${{ steps.set-matrix.outputs.test-dirs }}
steps:
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -80,13 +80,12 @@ jobs:
env:
PYTHONPATH: ./backend
MODEL_SERVER_HOST: "disabled"
DISABLE_TELEMETRY: "true"
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -114,7 +113,6 @@ jobs:
run: |
cat <<EOF > deployment/docker_compose/.env
CODE_INTERPRETER_BETA_ENABLED=true
DISABLE_TELEMETRY=true
EOF
- name: Set up Standard Dependencies

View File

@@ -24,7 +24,7 @@ jobs:
# fetch-depth 0 is required for helm/chart-testing-action
steps:
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
fetch-depth: 0
persist-credentials: false

View File

@@ -43,7 +43,7 @@ jobs:
test-dirs: ${{ steps.set-matrix.outputs.test-dirs }}
steps:
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -74,7 +74,7 @@ jobs:
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -129,7 +129,7 @@ jobs:
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -183,7 +183,7 @@ jobs:
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -259,7 +259,7 @@ jobs:
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -274,28 +274,23 @@ jobs:
# NOTE: Use pre-ping/null pool to reduce flakiness due to dropped connections
# NOTE: don't need web server for integration tests
- name: Create .env file for Docker Compose
- name: Start Docker containers
env:
ECR_CACHE: ${{ env.RUNS_ON_ECR_CACHE }}
RUN_ID: ${{ github.run_id }}
run: |
cat <<EOF > deployment/docker_compose/.env
ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true
AUTH_TYPE=basic
POSTGRES_POOL_PRE_PING=true
POSTGRES_USE_NULL_POOL=true
REQUIRE_EMAIL_VERIFICATION=false
DISABLE_TELEMETRY=true
ONYX_BACKEND_IMAGE=${ECR_CACHE}:integration-test-backend-test-${RUN_ID}
ONYX_MODEL_SERVER_IMAGE=${ECR_CACHE}:integration-test-model-server-test-${RUN_ID}
INTEGRATION_TESTS_MODE=true
CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS=0.001
MCP_SERVER_ENABLED=true
EOF
- name: Start Docker containers
run: |
cd deployment/docker_compose
ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \
AUTH_TYPE=basic \
POSTGRES_POOL_PRE_PING=true \
POSTGRES_USE_NULL_POOL=true \
REQUIRE_EMAIL_VERIFICATION=false \
DISABLE_TELEMETRY=true \
ONYX_BACKEND_IMAGE=${ECR_CACHE}:integration-test-backend-test-${RUN_ID} \
ONYX_MODEL_SERVER_IMAGE=${ECR_CACHE}:integration-test-model-server-test-${RUN_ID} \
INTEGRATION_TESTS_MODE=true \
CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS=0.001 \
MCP_SERVER_ENABLED=true \
docker compose -f docker-compose.yml -f docker-compose.dev.yml up \
relational_db \
index \
@@ -441,7 +436,7 @@ jobs:
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false

View File

@@ -16,12 +16,12 @@ jobs:
timeout-minutes: 45
steps:
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
- name: Setup node
uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # ratchet:actions/setup-node@v4
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # ratchet:actions/setup-node@v4
with:
node-version: 22
cache: "npm"

View File

@@ -40,7 +40,7 @@ jobs:
test-dirs: ${{ steps.set-matrix.outputs.test-dirs }}
steps:
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -70,7 +70,7 @@ jobs:
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -124,7 +124,7 @@ jobs:
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -177,7 +177,7 @@ jobs:
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -253,7 +253,7 @@ jobs:
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -268,26 +268,21 @@ jobs:
# NOTE: Use pre-ping/null pool to reduce flakiness due to dropped connections
# NOTE: don't need web server for integration tests
- name: Create .env file for Docker Compose
- name: Start Docker containers
env:
ECR_CACHE: ${{ env.RUNS_ON_ECR_CACHE }}
RUN_ID: ${{ github.run_id }}
run: |
cat <<EOF > deployment/docker_compose/.env
AUTH_TYPE=basic
POSTGRES_POOL_PRE_PING=true
POSTGRES_USE_NULL_POOL=true
REQUIRE_EMAIL_VERIFICATION=false
DISABLE_TELEMETRY=true
ONYX_BACKEND_IMAGE=${ECR_CACHE}:integration-test-backend-test-${RUN_ID}
ONYX_MODEL_SERVER_IMAGE=${ECR_CACHE}:integration-test-model-server-test-${RUN_ID}
INTEGRATION_TESTS_MODE=true
MCP_SERVER_ENABLED=true
EOF
- name: Start Docker containers
run: |
cd deployment/docker_compose
AUTH_TYPE=basic \
POSTGRES_POOL_PRE_PING=true \
POSTGRES_USE_NULL_POOL=true \
REQUIRE_EMAIL_VERIFICATION=false \
DISABLE_TELEMETRY=true \
ONYX_BACKEND_IMAGE=${ECR_CACHE}:integration-test-backend-test-${RUN_ID} \
ONYX_MODEL_SERVER_IMAGE=${ECR_CACHE}:integration-test-model-server-test-${RUN_ID} \
INTEGRATION_TESTS_MODE=true \
MCP_SERVER_ENABLED=true \
docker compose -f docker-compose.yml -f docker-compose.dev.yml up \
relational_db \
index \

View File

@@ -53,7 +53,7 @@ jobs:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -108,7 +108,7 @@ jobs:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -163,7 +163,7 @@ jobs:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -229,13 +229,13 @@ jobs:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
fetch-depth: 0
persist-credentials: false
- name: Setup node
uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # ratchet:actions/setup-node@v4
uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # ratchet:actions/setup-node@v4
with:
node-version: 22
cache: 'npm'
@@ -465,12 +465,12 @@ jobs:
# ]
# steps:
# - name: Checkout code
# uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
# uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
# with:
# fetch-depth: 0
# - name: Setup node
# uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # ratchet:actions/setup-node@v4
# uses: actions/setup-node@2028fbc5c25fe9cf00d9f06a71cc4710d4507903 # ratchet:actions/setup-node@v4
# with:
# node-version: 22

View File

@@ -27,7 +27,7 @@ jobs:
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -40,10 +40,35 @@ jobs:
backend/requirements/model_server.txt
backend/requirements/ee.txt
- name: Generate OpenAPI schema and Python client
- name: Generate OpenAPI schema
shell: bash
working-directory: backend
env:
PYTHONPATH: "."
run: |
python scripts/onyx_openapi_schema.py --filename generated/openapi.json
# needed for pulling openapitools/openapi-generator-cli
# otherwise, we hit the "Unauthenticated users" limit
# https://docs.docker.com/docker-hub/usage/
- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
- name: Generate OpenAPI Python client
shell: bash
run: |
ods openapi all
docker run --rm \
-v "${{ github.workspace }}/backend/generated:/local" \
openapitools/openapi-generator-cli generate \
-i /local/openapi.json \
-g python \
-o /local/onyx_openapi_client \
--package-name onyx_openapi_client \
--skip-validate-spec \
--openapi-normalizer "SIMPLIFY_ONEOF_ANYOF=true,SET_OAS3_NULLABLE=true"
- name: Cache mypy cache
if: ${{ vars.DISABLE_MYPY_CACHE != 'true' }}
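
For context on the first step: scripts/onyx_openapi_schema.py is not shown in this diff, but dumping a FastAPI app's OpenAPI schema to JSON generally looks like the sketch below. The app object here is a stand-in; only the --filename behavior is taken from the workflow above.

import json
import pathlib

from fastapi import FastAPI

app = FastAPI(title="example")  # stand-in for the real Onyx app object

def dump_openapi_schema(app: FastAPI, filename: str) -> None:
    # app.openapi() builds (and caches) the full schema as a plain dict.
    out = pathlib.Path(filename)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(json.dumps(app.openapi(), indent=2))

dump_openapi_schema(app, "generated/openapi.json")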

View File

@@ -133,13 +133,12 @@ jobs:
env:
PYTHONPATH: ./backend
DISABLE_TELEMETRY: "true"
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
@@ -161,20 +160,16 @@ jobs:
hubspot:
- 'backend/onyx/connectors/hubspot/**'
- 'backend/tests/daily/connectors/hubspot/**'
- 'uv.lock'
salesforce:
- 'backend/onyx/connectors/salesforce/**'
- 'backend/tests/daily/connectors/salesforce/**'
- 'uv.lock'
github:
- 'backend/onyx/connectors/github/**'
- 'backend/tests/daily/connectors/github/**'
- 'uv.lock'
file_processing:
- 'backend/onyx/file_processing/**'
- 'uv.lock'
- name: Run Tests (excluding HubSpot, Salesforce, GitHub, and Coda)
- name: Run Tests (excluding HubSpot, Salesforce, and GitHub)
shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
run: |
py.test \
@@ -187,8 +182,7 @@ jobs:
backend/tests/daily/connectors \
--ignore backend/tests/daily/connectors/hubspot \
--ignore backend/tests/daily/connectors/salesforce \
--ignore backend/tests/daily/connectors/github \
--ignore backend/tests/daily/connectors/coda
--ignore backend/tests/daily/connectors/github
- name: Run HubSpot Connector Tests
if: ${{ github.event_name == 'schedule' || steps.changes.outputs.hubspot == 'true' || steps.changes.outputs.file_processing == 'true' }}

View File

@@ -39,7 +39,7 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false

View File

@@ -26,13 +26,15 @@ jobs:
env:
PYTHONPATH: ./backend
REDIS_CLOUD_PYTEST_PASSWORD: ${{ secrets.REDIS_CLOUD_PYTEST_PASSWORD }}
DISABLE_TELEMETRY: "true"
SF_USERNAME: ${{ secrets.SF_USERNAME }}
SF_PASSWORD: ${{ secrets.SF_PASSWORD }}
SF_SECURITY_TOKEN: ${{ secrets.SF_SECURITY_TOKEN }}
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false

View File

@@ -7,8 +7,6 @@ on:
merge_group:
pull_request: null
push:
branches:
- main
tags:
- "v*.*.*"
@@ -41,7 +39,7 @@ jobs:
- uses: j178/prek-action@91fd7d7cf70ae1dee9f4f44e7dfa5d1073fe6623 # ratchet:j178/prek-action@v1
with:
prek-version: '0.2.21'
extra-args: ${{ github.event_name == 'pull_request' && format('--from-ref {0} --to-ref {1}', github.event.pull_request.base.sha, github.event.pull_request.head.sha) || github.event_name == 'merge_group' && format('--from-ref {0} --to-ref {1}', github.event.merge_group.base_sha, github.event.merge_group.head_sha) || github.ref_name == 'main' && '--all-files' || '' }}
extra_args: ${{ github.event_name == 'pull_request' && format('--from-ref {0} --to-ref {1}', github.event.pull_request.base.sha, github.event.pull_request.head.sha) || '' }}
- name: Check Actions
uses: giner/check-actions@28d366c7cbbe235f9624a88aa31a628167eee28c # ratchet:giner/check-actions@v1.0.1
with:

View File

@@ -24,7 +24,7 @@ jobs:
- {goos: "darwin", goarch: "arm64"}
- {goos: "", goarch: ""}
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
- uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
persist-credentials: false
fetch-depth: 0

View File

@@ -14,7 +14,7 @@ jobs:
contents: read
steps:
- name: Checkout main Onyx repo
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
fetch-depth: 0
persist-credentials: false

View File

@@ -18,7 +18,7 @@ jobs:
# see https://github.com/orgs/community/discussions/27028#discussioncomment-3254367 for the workaround we
# implement here which needs an actual user's deploy key
- name: Checkout code
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6
with:
ssh-key: "${{ secrets.DEPLOY_KEY }}"
persist-credentials: true

View File

@@ -17,7 +17,7 @@ jobs:
security-events: write # needed for SARIF uploads
steps:
- name: Checkout repository
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6.0.1
uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # ratchet:actions/checkout@v6.0.0
with:
persist-credentials: false

.gitignore (vendored)
View File

@@ -53,6 +53,3 @@ node_modules
# MCP configs
.playwright-mcp
# plans
plans/

View File

@@ -5,13 +5,8 @@ default_install_hook_types:
- post-rewrite
repos:
- repo: https://github.com/astral-sh/uv-pre-commit
# From: https://github.com/astral-sh/uv-pre-commit/pull/53/commits/d30b4298e4fb63ce8609e29acdbcf4c9018a483c
rev: d30b4298e4fb63ce8609e29acdbcf4c9018a483c
rev: 569ddf04117761eb74cef7afb5143bbb96fcdfbb # frozen: 0.9.15
hooks:
- id: uv-run
name: Check lazy imports
args: ["--with=onyx-devtools", "ods", "check-lazy-imports"]
files: ^backend/(?!\.venv/).*\.py$
- id: uv-sync
args: ["--locked", "--all-extras"]
- id: uv-lock
@@ -19,19 +14,19 @@ repos:
- id: uv-export
name: uv-export default.txt
args: ["--no-emit-project", "--no-default-groups", "--no-hashes", "--extra", "backend", "-o", "backend/requirements/default.txt"]
files: ^(pyproject\.toml|uv\.lock|backend/requirements/.*\.txt)$
files: ^(pyproject\.toml|uv\.lock)$
- id: uv-export
name: uv-export dev.txt
args: ["--no-emit-project", "--no-default-groups", "--no-hashes", "--extra", "dev", "-o", "backend/requirements/dev.txt"]
files: ^(pyproject\.toml|uv\.lock|backend/requirements/.*\.txt)$
files: ^(pyproject\.toml|uv\.lock)$
- id: uv-export
name: uv-export ee.txt
args: ["--no-emit-project", "--no-default-groups", "--no-hashes", "--extra", "ee", "-o", "backend/requirements/ee.txt"]
files: ^(pyproject\.toml|uv\.lock|backend/requirements/.*\.txt)$
files: ^(pyproject\.toml|uv\.lock)$
- id: uv-export
name: uv-export model_server.txt
args: ["--no-emit-project", "--no-default-groups", "--no-hashes", "--extra", "model_server", "-o", "backend/requirements/model_server.txt"]
files: ^(pyproject\.toml|uv\.lock|backend/requirements/.*\.txt)$
files: ^(pyproject\.toml|uv\.lock)$
# NOTE: This takes ~6s on a single, large module which is prohibitively slow.
# - id: uv-run
# name: mypy
@@ -76,7 +71,7 @@ repos:
args: [ '--remove-all-unused-imports', '--remove-unused-variables', '--in-place' , '--recursive']
- repo: https://github.com/golangci/golangci-lint
rev: 9f61b0f53f80672872fced07b6874397c3ed197b # frozen: v2.7.2
rev: e6ebea0145f385056bce15041d3244c0e5e15848 # frozen: v2.7.0
hooks:
- id: golangci-lint
entry: bash -c "find tools/ -name go.mod -print0 | xargs -0 -I{} bash -c 'cd \"$(dirname {})\" && golangci-lint run ./...'"
@@ -112,6 +107,12 @@ repos:
pass_filenames: false
files: \.tf$
- id: check-lazy-imports
name: Check lazy imports
entry: python3 backend/scripts/check_lazy_imports.py
language: system
files: ^backend/(?!\.venv/).*\.py$
- id: typescript-check
name: TypeScript type check
entry: bash -c 'cd web && npm run types:check'

View File

@@ -508,6 +508,7 @@
],
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"stopOnEntry": true,
"presentation": {
"group": "3"
}

View File

@@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
## KEY NOTES
- If you run into any missing python dependency errors, try running your command with `source .venv/bin/activate` \
- If you run into any missing python dependency errors, try running your command with `source backend/.venv/bin/activate` \
to activate the python venv.
- To make tests work, check the `.env` file at the root of the project to find an OpenAI key.
- If using `playwright` to explore the frontend, you can usually log in with username `a@test.com` and password

View File

@@ -7,12 +7,8 @@ Onyx migrations use a generic single-database configuration with an async dbapi.
## To generate new migrations:
From onyx/backend, run:
`alembic revision -m <DESCRIPTION_OF_MIGRATION>`
Note: you cannot use the `--autogenerate` flag as the automatic schema parsing does not work.
Manually populate the upgrade and downgrade in your new migration.
Run from onyx/backend:
`alembic revision --autogenerate -m <DESCRIPTION_OF_MIGRATION>`
More info can be found here: https://alembic.sqlalchemy.org/en/latest/autogenerate.html
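
For reference, a migration file produced by `alembic revision --autogenerate` has roughly the shape below (the identifiers are placeholders), with `upgrade`/`downgrade` pre-populated from the detected schema diff:

"""example migration

Revision ID: abc123def456
Revises: 0123456789ab
Create Date: 2025-01-01 00:00:00.000000
"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "abc123def456"
down_revision = "0123456789ab"
branch_labels = None
depends_on = None

def upgrade() -> None:
    op.add_column("example_table", sa.Column("new_col", sa.String(), nullable=True))

def downgrade() -> None:
    op.drop_column("example_table", "new_col")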

View File

@@ -1,29 +0,0 @@
"""add is_clarification to chat_message
Revision ID: 18b5b2524446
Revises: 87c52ec39f84
Create Date: 2025-01-16
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "18b5b2524446"
down_revision = "87c52ec39f84"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.add_column(
"chat_message",
sa.Column(
"is_clarification", sa.Boolean(), nullable=False, server_default="false"
),
)
def downgrade() -> None:
op.drop_column("chat_message", "is_clarification")

View File

@@ -0,0 +1,37 @@
"""add_task_id_to_avatar_permission_request
Revision ID: 373848adba48
Revises: a1b2c3d4e5f6
Create Date: 2025-12-11 18:41:18.678042
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "373848adba48"
down_revision = "a1b2c3d4e5f6"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.add_column(
"avatar_permission_request",
sa.Column("task_id", sa.String(), nullable=True),
)
op.create_index(
"ix_avatar_permission_request_task_id",
"avatar_permission_request",
["task_id"],
)
def downgrade() -> None:
op.drop_index(
"ix_avatar_permission_request_task_id",
table_name="avatar_permission_request",
)
op.drop_column("avatar_permission_request", "task_id")

View File

@@ -1,62 +0,0 @@
"""update_default_tool_descriptions
Revision ID: a01bf2971c5d
Revises: 87c52ec39f84
Create Date: 2025-12-16 15:21:25.656375
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "a01bf2971c5d"
down_revision = "18b5b2524446"
branch_labels = None
depends_on = None
# new tool descriptions (12/2025)
TOOL_DESCRIPTIONS = {
"SearchTool": "The Search Action allows the agent to search through connected knowledge to help build an answer.",
"ImageGenerationTool": (
"The Image Generation Action allows the agent to use DALL-E 3 or GPT-IMAGE-1 to generate images. "
"The action will be used when the user asks the agent to generate an image."
),
"WebSearchTool": (
"The Web Search Action allows the agent "
"to perform internet searches for up-to-date information."
),
"KnowledgeGraphTool": (
"The Knowledge Graph Search Action allows the agent to search the "
"Knowledge Graph for information. This tool can (for now) only be active in the KG Beta Agent, "
"and it requires the Knowledge Graph to be enabled."
),
"OktaProfileTool": (
"The Okta Profile Action allows the agent to fetch the current user's information from Okta. "
"This may include the user's name, email, phone number, address, and other details such as their "
"manager and direct reports."
),
}
def upgrade() -> None:
conn = op.get_bind()
conn.execute(sa.text("BEGIN"))
try:
for tool_id, description in TOOL_DESCRIPTIONS.items():
conn.execute(
sa.text(
"UPDATE tool SET description = :description WHERE in_code_tool_id = :tool_id"
),
{"description": description, "tool_id": tool_id},
)
conn.execute(sa.text("COMMIT"))
except Exception as e:
conn.execute(sa.text("ROLLBACK"))
raise e
def downgrade() -> None:
pass

View File

@@ -0,0 +1,236 @@
"""Add avatar tables
Revision ID: a1b2c3d4e5f6
Revises: 87c52ec39f84
Create Date: 2025-01-15 10:00:00.000000
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "a1b2c3d4e5f6"
down_revision = "87c52ec39f84"
branch_labels = None
depends_on = None
def upgrade() -> None:
# Create avatar table
op.create_table(
"avatar",
sa.Column("id", sa.Integer(), primary_key=True),
sa.Column(
"user_id",
postgresql.UUID(as_uuid=True),
sa.ForeignKey("user.id", ondelete="CASCADE"),
nullable=False,
unique=True,
),
sa.Column("name", sa.String(), nullable=True),
sa.Column("description", sa.String(), nullable=True),
sa.Column("is_enabled", sa.Boolean(), nullable=False, default=True),
sa.Column(
"default_query_mode",
sa.String(),
nullable=False,
default="owned_documents",
),
sa.Column("allow_accessible_mode", sa.Boolean(), nullable=False, default=True),
sa.Column("auto_approve_rules", postgresql.JSONB(), nullable=True),
sa.Column("show_query_in_request", sa.Boolean(), nullable=False, default=True),
sa.Column("max_requests_per_day", sa.Integer(), nullable=True, default=100),
sa.Column(
"created_at",
sa.DateTime(timezone=True),
server_default=sa.func.now(),
nullable=False,
),
sa.Column(
"updated_at",
sa.DateTime(timezone=True),
server_default=sa.func.now(),
nullable=False,
),
)
# Create avatar_permission_request table
op.create_table(
"avatar_permission_request",
sa.Column("id", sa.Integer(), primary_key=True),
sa.Column(
"avatar_id",
sa.Integer(),
sa.ForeignKey("avatar.id", ondelete="CASCADE"),
nullable=False,
),
sa.Column(
"requester_id",
postgresql.UUID(as_uuid=True),
sa.ForeignKey("user.id", ondelete="CASCADE"),
nullable=False,
),
sa.Column("query_text", sa.Text(), nullable=True),
sa.Column(
"chat_session_id",
postgresql.UUID(as_uuid=True),
sa.ForeignKey("chat_session.id", ondelete="SET NULL"),
nullable=True,
),
sa.Column(
"chat_message_id",
sa.Integer(),
sa.ForeignKey("chat_message.id", ondelete="SET NULL"),
nullable=True,
),
sa.Column("cached_answer", sa.Text(), nullable=True),
sa.Column("cached_search_doc_ids", postgresql.JSONB(), nullable=True),
sa.Column("answer_quality_score", sa.Float(), nullable=True),
sa.Column("status", sa.String(), nullable=False, default="pending"),
sa.Column("denial_reason", sa.String(), nullable=True),
sa.Column(
"created_at",
sa.DateTime(timezone=True),
server_default=sa.func.now(),
nullable=False,
),
sa.Column("expires_at", sa.DateTime(timezone=True), nullable=False),
sa.Column("resolved_at", sa.DateTime(timezone=True), nullable=True),
)
# Create indexes for avatar_permission_request
op.create_index(
"ix_avatar_permission_request_avatar_id",
"avatar_permission_request",
["avatar_id"],
)
op.create_index(
"ix_avatar_permission_request_requester_id",
"avatar_permission_request",
["requester_id"],
)
op.create_index(
"ix_avatar_permission_request_status",
"avatar_permission_request",
["status"],
)
op.create_index(
"ix_avatar_permission_request_avatar_status",
"avatar_permission_request",
["avatar_id", "status"],
)
op.create_index(
"ix_avatar_permission_request_requester_created",
"avatar_permission_request",
["requester_id", "created_at"],
)
# Create avatar_query table
op.create_table(
"avatar_query",
sa.Column("id", sa.Integer(), primary_key=True),
sa.Column(
"avatar_id",
sa.Integer(),
sa.ForeignKey("avatar.id", ondelete="CASCADE"),
nullable=False,
),
sa.Column(
"requester_id",
postgresql.UUID(as_uuid=True),
sa.ForeignKey("user.id", ondelete="CASCADE"),
nullable=False,
),
sa.Column("query_mode", sa.String(), nullable=False),
sa.Column("query_text", sa.Text(), nullable=False),
sa.Column(
"created_at",
sa.DateTime(timezone=True),
server_default=sa.func.now(),
nullable=False,
),
)
# Create indexes for avatar_query
op.create_index(
"ix_avatar_query_avatar_id",
"avatar_query",
["avatar_id"],
)
op.create_index(
"ix_avatar_query_requester_id",
"avatar_query",
["requester_id"],
)
op.create_index(
"ix_avatar_query_rate_limit",
"avatar_query",
["avatar_id", "requester_id", "created_at"],
)
# Create avatars for all existing users
# Using raw SQL to avoid ORM dependencies in migrations
connection = op.get_bind()
connection.execute(
sa.text(
"""
INSERT INTO avatar (
user_id,
is_enabled,
default_query_mode,
allow_accessible_mode,
show_query_in_request,
max_requests_per_day,
created_at,
updated_at
)
SELECT
id,
true,
'OWNED_DOCUMENTS',
true,
true,
100,
NOW(),
NOW()
FROM "user"
WHERE id NOT IN (SELECT user_id FROM avatar)
"""
)
)
def downgrade() -> None:
# Drop avatar_query table and indexes
op.drop_index("ix_avatar_query_rate_limit", table_name="avatar_query")
op.drop_index("ix_avatar_query_requester_id", table_name="avatar_query")
op.drop_index("ix_avatar_query_avatar_id", table_name="avatar_query")
op.drop_table("avatar_query")
# Drop avatar_permission_request table and indexes
op.drop_index(
"ix_avatar_permission_request_requester_created",
table_name="avatar_permission_request",
)
op.drop_index(
"ix_avatar_permission_request_avatar_status",
table_name="avatar_permission_request",
)
op.drop_index(
"ix_avatar_permission_request_status",
table_name="avatar_permission_request",
)
op.drop_index(
"ix_avatar_permission_request_requester_id",
table_name="avatar_permission_request",
)
op.drop_index(
"ix_avatar_permission_request_avatar_id",
table_name="avatar_permission_request",
)
op.drop_table("avatar_permission_request")
# Drop avatar table
op.drop_table("avatar")

View File

@@ -8,7 +8,6 @@ from sqlalchemy import func
from sqlalchemy import Select
from sqlalchemy import select
from sqlalchemy import update
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import Session
from ee.onyx.server.user_group.models import SetCuratorRequest
@@ -363,29 +362,14 @@ def _check_user_group_is_modifiable(user_group: UserGroup) -> None:
def _add_user__user_group_relationships__no_commit(
db_session: Session, user_group_id: int, user_ids: list[UUID]
) -> None:
"""NOTE: does not commit the transaction.
This function is idempotent - it will skip users who are already in the group
to avoid duplicate key violations during concurrent operations or re-syncs.
Uses ON CONFLICT DO NOTHING to keep inserts atomic under concurrency.
"""
if not user_ids:
return
insert_stmt = (
insert(User__UserGroup)
.values(
[
{"user_id": user_id, "user_group_id": user_group_id}
for user_id in user_ids
]
)
.on_conflict_do_nothing(
index_elements=[User__UserGroup.user_group_id, User__UserGroup.user_id]
)
)
db_session.execute(insert_stmt)
) -> list[User__UserGroup]:
"""NOTE: does not commit the transaction."""
relationships = [
User__UserGroup(user_id=user_id, user_group_id=user_group_id)
for user_id in user_ids
]
db_session.add_all(relationships)
return relationships
def _add_user_group__cc_pair_relationships__no_commit(
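
The PostgreSQL upsert pattern in the hunk above is worth isolating; a self-contained sketch of the same ON CONFLICT DO NOTHING insert, with a stand-in table definition mirroring User__UserGroup:

from sqlalchemy import Column, Integer, MetaData, Table
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import Session

metadata = MetaData()

# Stand-in association table; the real model lives in the ORM layer.
user__user_group = Table(
    "user__user_group",
    metadata,
    Column("user_id", Integer, primary_key=True),
    Column("user_group_id", Integer, primary_key=True),
)

def add_memberships(db_session: Session, user_group_id: int, user_ids: list[int]) -> None:
    if not user_ids:
        return
    insert_stmt = (
        insert(user__user_group)
        .values([{"user_id": uid, "user_group_id": user_group_id} for uid in user_ids])
        # Rows that already exist are skipped atomically, so concurrent
        # syncs or re-runs cannot raise duplicate-key errors.
        .on_conflict_do_nothing(index_elements=["user_id", "user_group_id"])
    )
    db_session.execute(insert_stmt)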

View File

@@ -8,10 +8,12 @@ from ee.onyx.server.query_and_chat.models import (
BasicCreateChatMessageWithHistoryRequest,
)
from onyx.auth.users import current_user
from onyx.chat.chat_utils import combine_message_thread
from onyx.chat.chat_utils import create_chat_history_chain
from onyx.chat.models import ChatBasicResponse
from onyx.chat.process_message import gather_stream
from onyx.chat.process_message import stream_chat_message_objects
from onyx.configs.chat_configs import CHAT_TARGET_CHUNK_PERCENTAGE
from onyx.configs.constants import MessageType
from onyx.context.search.models import OptionalSearchSetting
from onyx.context.search.models import RetrievalDetails
@@ -22,6 +24,7 @@ from onyx.db.engine.sql_engine import get_session
from onyx.db.models import User
from onyx.llm.factory import get_llms_for_persona
from onyx.natural_language_processing.utils import get_tokenizer
from onyx.secondary_llm_flows.query_expansion import thread_based_query_rephrase
from onyx.server.query_and_chat.models import CreateChatMessageRequest
from onyx.utils.logger import setup_logger
@@ -165,6 +168,8 @@ def handle_send_message_simple_with_history(
provider_type=llm.config.model_provider,
)
max_history_tokens = int(llm.config.max_input_tokens * CHAT_TARGET_CHUNK_PERCENTAGE)
# Every chat Session begins with an empty root message
root_message = get_or_create_root_message(
chat_session_id=chat_session.id, db_session=db_session
@@ -183,6 +188,17 @@ def handle_send_message_simple_with_history(
)
db_session.commit()
history_str = combine_message_thread(
messages=msg_history,
max_tokens=max_history_tokens,
llm_tokenizer=llm_tokenizer,
)
rephrased_query = req.query_override or thread_based_query_rephrase(
user_query=query,
history_str=history_str,
)
if req.retrieval_options is None and req.search_doc_ids is None:
retrieval_options: RetrievalDetails | None = RetrievalDetails(
run_search=OptionalSearchSetting.ALWAYS,
@@ -200,7 +216,7 @@ def handle_send_message_simple_with_history(
retrieval_options=retrieval_options,
# Simple API does not support reranking, hide complexity from user
rerank_settings=None,
query_override=None,
query_override=rephrased_query,
chunks_above=0,
chunks_below=0,
full_doc=req.full_doc,

View File

@@ -219,7 +219,7 @@ def verify_email_is_invited(email: str) -> None:
raise PermissionError("Email must be specified")
try:
email_info = validate_email(email, check_deliverability=False)
email_info = validate_email(email)
except EmailUndeliverableError:
raise PermissionError("Email is not valid")
@@ -227,9 +227,7 @@ def verify_email_is_invited(email: str) -> None:
try:
# normalized emails are now being inserted into the db
# we can remove this normalization on read after some time has passed
email_info_whitelist = validate_email(
email_whitelist, check_deliverability=False
)
email_info_whitelist = validate_email(email_whitelist)
except EmailNotValidError:
continue
@@ -402,6 +400,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
user = await self.update(user_update, user)
if user_created:
await self._assign_default_pinned_assistants(user, db_session)
await self._create_user_avatar(user, db_session)
remove_user_from_invited_users(user_create.email)
finally:
CURRENT_TENANT_ID_CONTEXTVAR.reset(token)
@@ -436,6 +435,21 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
)
user.pinned_assistants = default_persona_ids
async def _create_user_avatar(self, user: User, db_session: AsyncSession) -> None:
"""Create a default avatar for a newly registered user."""
from onyx.db.avatar import create_avatar_for_user_async
try:
await create_avatar_for_user_async(
user_id=user.id,
db_session=db_session,
name=None, # Will default to user's email in UI
description=None,
)
except Exception as e:
# Log but don't fail user creation if avatar creation fails
logger.warning(f"Failed to create avatar for user {user.id}: {e}")
async def validate_password(self, password: str, _: schemas.UC | models.UP) -> None:
# Validate password according to configurable security policy (defined via environment variables)
if len(password) < PASSWORD_MIN_LENGTH:
@@ -557,6 +571,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
user = await self.user_db.create(user_dict)
await self.user_db.add_oauth_account(user, oauth_account_dict)
await self._assign_default_pinned_assistants(user, db_session)
await self._create_user_avatar(user, db_session)
await self.on_after_register(user, request)
else:
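
On the validate_email change earlier in this file: the check_deliverability flag controls whether the email-validator package performs a DNS (MX) lookup on the domain. A minimal sketch of the difference — the whitelist here is hypothetical:

from email_validator import EmailNotValidError, validate_email

INVITED = {"user@example.com"}  # hypothetical whitelist

def is_invited(email: str) -> bool:
    try:
        # check_deliverability=False skips the DNS/MX lookup, so validation
        # is purely syntactic and works offline; the default (True) also
        # resolves the domain, which can raise EmailUndeliverableError.
        info = validate_email(email, check_deliverability=False)
    except EmailNotValidError:
        return False
    # email-validator >= 2 exposes the normalized address as .normalized
    return info.normalized in INVITED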

View File

@@ -133,5 +133,7 @@ celery_app.autodiscover_tasks(
"onyx.background.celery.tasks.docprocessing",
# Docfetching worker tasks
"onyx.background.celery.tasks.docfetching",
# Avatar query tasks
"onyx.background.celery.tasks.avatar",
]
)

View File

@@ -98,5 +98,6 @@ for bootstep in base_bootsteps:
celery_app.autodiscover_tasks(
[
"onyx.background.celery.tasks.pruning",
"onyx.background.celery.tasks.avatar",
]
)

View File

@@ -315,6 +315,7 @@ for bootstep in base_bootsteps:
celery_app.autodiscover_tasks(
[
"onyx.background.celery.tasks.avatar",
"onyx.background.celery.tasks.connector_deletion",
"onyx.background.celery.tasks.docprocessing",
"onyx.background.celery.tasks.evals",

View File

@@ -0,0 +1,294 @@
"""
Celery tasks for avatar queries.
These tasks handle background processing of avatar queries,
particularly for the "All Accessible Documents" mode which can
be time-consuming and should not block the user.
"""
from celery import shared_task
from celery import Task
from onyx.background.celery.apps.app_base import task_logger
from onyx.configs.constants import OnyxCeleryTask
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import QueryExpansionType
from onyx.context.search.preprocessing.access_filters import (
build_access_filters_for_user,
)
from onyx.context.search.utils import get_query_embedding
from onyx.db.avatar import get_avatar_by_id
from onyx.db.avatar import get_permission_request_by_id
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.enums import AvatarPermissionRequestStatus
from onyx.document_index.factory import get_current_primary_default_document_index
from onyx.llm.factory import get_default_llms
from onyx.llm.factory import get_main_llm_from_tuple
from onyx.llm.message_types import SystemMessage
from onyx.llm.message_types import UserMessageWithText
from onyx.utils.logger import setup_logger
from shared_configs.configs import MULTI_TENANT
from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()
# Time limits for the task (in seconds)
AVATAR_QUERY_SOFT_TIME_LIMIT = 120 # 2 minutes
AVATAR_QUERY_TIME_LIMIT = 150 # 2.5 minutes
# Search/answer generation constants
MIN_RESULT_SCORE = 0.3
MIN_CHUNKS_FOR_ANSWER = 1
AVATAR_ANSWER_SYSTEM_PROMPT = """You are a helpful assistant answering questions based on documents \
owned by or accessible to a specific user (the "avatar").
Your task is to synthesize information from the provided document excerpts and generate a \
clear, accurate answer to the user's question.
Guidelines:
- Base your answer ONLY on the provided document excerpts
- Be concise but thorough
- If the documents don't contain enough information to fully answer the question, acknowledge what \
information is available and what is missing
- Use a professional, helpful tone
- When referencing specific information, indicate which document it came from using [1], [2], etc."""
AVATAR_ANSWER_USER_PROMPT_TEMPLATE = """Based on the following document excerpts from {avatar_name}'s \
documents, please answer this question:
Question: {query}
Document Excerpts:
{context}
Please provide a clear, helpful answer based on the information above."""
@shared_task(
name=OnyxCeleryTask.AVATAR_QUERY_TASK,
soft_time_limit=AVATAR_QUERY_SOFT_TIME_LIMIT,
time_limit=AVATAR_QUERY_TIME_LIMIT,
bind=True,
trail=False,
)
def avatar_query_task(
self: Task,
*,
permission_request_id: int,
tenant_id: str | None = None,
) -> dict:
"""
Background task to execute an avatar query and store the results.
This task is used for "All Accessible Documents" mode queries.
It executes the search, generates an answer, and updates the
permission request with the cached results.
Args:
permission_request_id: The ID of the AvatarPermissionRequest to process
tenant_id: The tenant ID for multi-tenant deployments
Returns:
dict with status and any error message
"""
task_logger.info(
f"Starting avatar query task for permission_request_id={permission_request_id}"
)
try:
with get_session_with_current_tenant() as db_session:
# Get the permission request
request = get_permission_request_by_id(permission_request_id, db_session)
if not request:
task_logger.error(
f"Permission request {permission_request_id} not found"
)
return {"status": "error", "message": "Permission request not found"}
# Verify it's in PROCESSING status
if request.status != AvatarPermissionRequestStatus.PROCESSING:
task_logger.warning(
f"Permission request {permission_request_id} is not in PROCESSING status"
)
return {
"status": "skipped",
"message": f"Request status is {request.status}, not PROCESSING",
}
# Get the avatar
avatar = get_avatar_by_id(request.avatar_id, db_session)
if not avatar:
_mark_request_failed(request, db_session, "Avatar not found")
return {"status": "error", "message": "Avatar not found"}
# Build filters for accessible documents (query as the avatar's user)
user_acl = build_access_filters_for_user(avatar.user, db_session)
filters = IndexFilters(
source_type=None,
document_set=None,
time_cutoff=None,
tags=None,
access_control_list=list(user_acl),
tenant_id=get_current_tenant_id() if MULTI_TENANT else None,
)
# Execute search
query = request.query_text or ""
chunks = _execute_search(query, filters, db_session)
if not _has_good_results(chunks):
# No good results - mark as NO_ANSWER
request.status = AvatarPermissionRequestStatus.NO_ANSWER
request.cached_answer = None
db_session.commit()
task_logger.info(
f"Avatar query {permission_request_id} completed with no results"
)
return {
"status": "no_results",
"message": "No relevant documents found",
}
# Generate answer
answer = _generate_answer(query, chunks, avatar)
cached_doc_ids = [chunk.chunk_id for chunk in chunks[:10]]
# Calculate answer quality score
if chunks and chunks[0].score:
answer_quality = sum(c.score or 0 for c in chunks[:3]) / min(
3, len(chunks)
)
else:
answer_quality = None
# Update the request with results - set to PENDING for owner approval
request.cached_answer = answer
request.cached_search_doc_ids = cached_doc_ids
request.answer_quality_score = answer_quality
request.status = AvatarPermissionRequestStatus.PENDING
db_session.commit()
task_logger.info(
f"Avatar query {permission_request_id} completed successfully"
)
return {"status": "success", "message": "Query completed"}
except Exception as e:
task_logger.error(f"Avatar query task failed: {e}")
# Try to mark the request as failed
try:
with get_session_with_current_tenant() as db_session:
request = get_permission_request_by_id(
permission_request_id, db_session
)
if (
request
and request.status == AvatarPermissionRequestStatus.PROCESSING
):
_mark_request_failed(request, db_session, str(e))
except Exception:
pass
raise
def _execute_search(query: str, filters: IndexFilters, db_session) -> list:
"""Execute a hybrid search with the given filters."""
try:
query_embedding = get_query_embedding(query, db_session)
document_index = get_current_primary_default_document_index(db_session)
chunks = document_index.hybrid_retrieval(
query=query,
query_embedding=query_embedding,
final_keywords=None,
filters=filters,
hybrid_alpha=0.5,
time_decay_multiplier=1.0,
num_to_retrieve=10,
ranking_profile_type=QueryExpansionType.SEMANTIC,
)
return chunks[:10]
except Exception as e:
task_logger.error(f"Search failed: {e}")
return []
def _has_good_results(chunks: list) -> bool:
"""Check if the search results are good enough to proceed."""
if len(chunks) < MIN_CHUNKS_FOR_ANSWER:
return False
for chunk in chunks:
if chunk.score and chunk.score >= MIN_RESULT_SCORE:
return True
return len(chunks) >= MIN_CHUNKS_FOR_ANSWER
def _generate_answer(query: str, chunks: list, avatar) -> str | None:
"""Generate an answer from the retrieved chunks using the LLM."""
if not chunks:
return None
# Build context from chunks
context_parts = []
for i, chunk in enumerate(chunks[:5], 1):
source = chunk.semantic_identifier or chunk.document_id
context_parts.append(f"[{i}] Source: {source}\n{chunk.content}")
context = "\n\n---\n\n".join(context_parts)
avatar_name = avatar.name or avatar.user.email
user_prompt = AVATAR_ANSWER_USER_PROMPT_TEMPLATE.format(
avatar_name=avatar_name,
query=query,
context=context,
)
try:
llms = get_default_llms()
llm = get_main_llm_from_tuple(llms)
system_msg: SystemMessage = {
"role": "system",
"content": AVATAR_ANSWER_SYSTEM_PROMPT,
}
user_msg: UserMessageWithText = {
"role": "user",
"content": user_prompt,
}
response = llm.invoke([system_msg, user_msg])
if response and response.choice and response.choice.message:
content = response.choice.message.content
if content:
return content
return None
except Exception as e:
task_logger.error(f"Failed to generate LLM answer: {e}")
# Fall back to simple summary
summary_parts = []
for i, chunk in enumerate(chunks[:5], 1):
source = chunk.semantic_identifier or chunk.document_id
preview = (
chunk.content[:200] + "..."
if len(chunk.content) > 200
else chunk.content
)
summary_parts.append(f"[{i}] {source}: {preview}")
return "\n\n".join(summary_parts)
def _mark_request_failed(request, db_session, error_message: str) -> None:
"""Mark a request as failed (NO_ANSWER status with error in denial_reason)."""
request.status = AvatarPermissionRequestStatus.NO_ANSWER
request.denial_reason = f"Processing failed: {error_message}"
db_session.commit()

View File

@@ -105,49 +105,52 @@ S, U1, TC, TR, R -- agent calls another tool -> S, U1, TC, TR, TC, TR, R, A1
- Reminder moved to the end
```
## Product considerations
Project files are important for the entire duration of the chat session. If the user has uploaded project files, they are likely very intent on working with
those files. The LLM is much better at referencing documents close to the end of the context window, so project files are kept there for ease of access.
User-uploaded files are considered relevant for that point in time; it is OK if the Agent forgets about them as the chat gets long. If every uploaded file were
constantly moved towards the end of the chat, quality would degrade as these stack up. Even with a single file, there is some cost to making the previous
User Message further away. This tradeoff is accepted for Projects because of the intent of the feature.
Reminders are absolutely necessary to ensure 1-2 specific instructions get followed with a very high probability. A reminder is less detailed than the system prompt
and should be very targeted for it to work reliably and not interfere with the last user message.
## Reasons / Experiments
Custom Agent instructions placed in the system prompt are poorly followed. They also degrade performance of the system, especially when the instructions
are orthogonal (or even possibly contradictory) to the system prompt. For weaker models, this causes strange artifacts in tool calls and final responses
that completely ruin the user experience. Empirically, this way works better across a range of models, especially when the history gets longer.
Having the Custom Agent instructions not move means they fade more as the chat gets long, which is also not OK from a UX perspective.
Different LLMs vary in this but some now have a section that cannot be set via the API layer called the "System Prompt" (OpenAI terminology) which contains
Project files are important for the entire duration of the chat session. If the user has uploaded project files, they are likely very intent on working with
those files. The LLM is much better at referencing documents close to the end of the context window, so project files are kept there for ease of access.
Reminders are absolutely necessary to ensure 1-2 specific instructions get followed with a very high probability. A reminder is less detailed than the system prompt
and should be very targeted for it to work reliably.
User-uploaded files are considered relevant for that point in time; it is OK if the Agent forgets about them as the chat gets long. If every uploaded file were
constantly moved towards the end of the chat, quality would degrade as these stack up. Even with a single file, there is some cost to making the previous
User Message further away. This tradeoff is accepted for Projects because of the intent of the feature.
## Other related pointers
- How messages, files, images are stored can be found in db/models.py
# Appendix (just random tidbits for those interested)
- Reminder messages are placed at the end of the prompt because all model fine tuning approaches cause the LLMs to attend very strongly to the tokens at the very
back of the context closest to generation. This is the only way to get the LLMs to not miss critical information and for the product to be reliable. Specifically
the built-in reminders are around citations and what tools it should call in certain situations.
- LLMs are able to handle changes in topic best at message boundaries. There are special tokens under the hood for this. We also use this property to slice up
the history in the way presented above.
- Different LLMs vary in this but some now have a section that cannot be set via the API layer called the "System Prompt" (OpenAI terminology) which contains
information like the model cutoff date, identity, and some other basic non-changing information. The System prompt described above is in that convention called
the "Developer Prompt". It seems the distribution of the System Prompt, by which I mean the style of wording and terms used can also affect the behavior. This
is different between different models and not necessarily scientific so the system prompt is built from an exploration across different models. It currently
starts with: "You are a highly capable, thoughtful, and precise assistant. Your goal is to deeply understand the user's intent..."
LLMs are able to handle changes in topic best at message boundaries. There are special tokens under the hood for this. We also use this property to slice up
the history in the way presented above.
Reminder messages are placed at the end of the prompt because all model fine tuning approaches cause the LLMs to attend very strongly to the tokens at the very
back of the context closest to generation. This is the only way to get the LLMs to not miss critical information and for the product to be reliable. Specifically
the built-in reminders are around citations and what tools it should call in certain situations.
The document json includes a field for the LLM to cite (it's a single number) to make citations reliable and avoid weird artifacts. It's called "document" so
- The document json includes a field for the LLM to cite (it's a single number) to make citations reliable and avoid weird artifacts. It's called "document" so
that the LLM does not create weird artifacts in reasoning like "I should reference citation_id: 5 for...". It is also strategically placed so that it is easy to
reference. It is followed by a couple short sections like the metadata and title before the long content section. It seems LLMs are still better at local
attention despite having global access.
In a similar concept, LLM instructions in the system prompt are structured specifically so that there are coherent sections for the LLM to attend to. This is
- In a similar vein, LLM instructions in the system prompt are structured specifically so that there are coherent sections for the LLM to attend to. This is
fairly surprising: if there is a line of instructions effectively saying "If you try to use some tools and find that you need more information or
need to call additional tools, you are encouraged to do this", having it in the Tool section of the system prompt makes all the LLMs follow it well, but if it
sits even a paragraph away, say near the beginning of the prompt, it is often ignored. The difference can be as drastic as a 30% versus 90% follow
rate from moving the same statement just a few sentences.
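
A minimal sketch of the two history-handling points above (truncating only at message boundaries, reminder placed last). The `Msg` structure and `build_prompt` helper are illustrative stand-ins under these assumptions, not the actual implementation:

```python
from dataclasses import dataclass


@dataclass
class Msg:
    role: str  # "system" | "user" | "assistant" | "tool"
    text: str
    tokens: int


def build_prompt(system: Msg, history: list[Msg], reminder: Msg, budget: int) -> list[Msg]:
    # Reserve room for the fixed pieces first.
    remaining = budget - system.tokens - reminder.tokens
    kept: list[Msg] = []
    # Walk backwards so the most recent turns survive truncation, and drop
    # whole messages rather than cutting mid-message: models handle topic
    # changes best at message boundaries.
    for msg in reversed(history):
        if msg.tokens > remaining:
            break
        kept.insert(0, msg)
        remaining -= msg.tokens
    # The reminder goes last, closest to generation, where attention is strongest.
    return [system, *kept, reminder]
```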
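For the System vs. Developer Prompt distinction, an OpenAI-style request looks roughly like the sketch below. The payload shape and model name are illustrative, and the exact role name ("developer" vs. "system") varies by provider and model generation:

```python
payload = {
    "model": "some-model",  # hypothetical model name
    "messages": [
        # What this document calls the system prompt is, in OpenAI's newer
        # terminology, the "developer" message; the hidden provider-level
        # System Prompt (identity, cutoff date, etc.) sits above it and
        # cannot be set via the API.
        {
            "role": "developer",
            "content": (
                "You are a highly capable, thoughtful, and precise assistant. "
                "Your goal is to deeply understand the user's intent..."
            ),
        },
        {"role": "user", "content": "Summarize the Q3 report."},
    ],
}
```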
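And a sketch of the document JSON shape: the single-number `document` field comes first so it is easy to cite, followed by short metadata and title sections before the long content. All field names other than `document` are illustrative:

```python
import json

doc_for_llm = {
    # Single number the model cites; named "document" (not "citation_id") so
    # reasoning traces don't produce artifacts like "I should reference
    # citation_id: 5 for...".
    "document": 5,
    # Short sections stay near the id, favoring local attention.
    "metadata": {"source": "confluence", "updated_at": "2025-11-02"},
    "title": "Onboarding Guide",
    # The long content section goes last.
    "content": "...",
}

print(json.dumps(doc_for_llm, indent=2))
```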
## Other related pointers
- How messages, files, and images are stored can be found in backend/onyx/db/models.py; there is also a README.md under that directory that may be helpful.
- Custom Agent prompts are also kept completely separate from the system prompt. Having potentially orthogonal instructions in the system prompt (both the actual
instructions and the writing style) can greatly deteriorate the quality of the responses. There is also a product motivation to keep the custom agent prompt
close to the end of generation so it is strongly followed.

View File

@@ -26,8 +26,6 @@ class ChatStateContainer:
self.answer_tokens: str | None = None
# Store citation mapping for building citation_docs_info during partial saves
self.citation_to_doc: dict[int, SearchDoc] = {}
# True if this turn is a clarification question (deep research flow)
self.is_clarification: bool = False
def add_tool_call(self, tool_call: ToolCallInfo) -> None:
"""Add a tool call to the accumulated state."""
@@ -45,10 +43,6 @@ class ChatStateContainer:
"""Set the citation mapping from citation processor."""
self.citation_to_doc = citation_to_doc
def set_is_clarification(self, is_clarification: bool) -> None:
"""Set whether this turn is a clarification question."""
self.is_clarification = is_clarification
def run_chat_llm_with_state_containers(
func: Callable[..., None],

View File

@@ -477,10 +477,7 @@ def load_chat_file(
# Extract text content if it's a text file type (not an image)
content_text = None
# `FileDescriptor` is often JSON-roundtripped (e.g. JSONB / API), so `type`
# may arrive as a raw string value instead of a `ChatFileType`.
file_type = ChatFileType(file_descriptor["type"])
file_type = file_descriptor["type"]
if file_type.is_text_file():
try:
content_text = content.decode("utf-8")
@@ -711,21 +708,3 @@ def get_custom_agent_prompt(persona: Persona, chat_session: ChatSession) -> str
return chat_session.project.instructions
else:
return None
def is_last_assistant_message_clarification(chat_history: list[ChatMessage]) -> bool:
"""Check if the last assistant message in chat history was a clarification question.
This is used in the deep research flow to determine whether to skip the
clarification step when the user has already responded to a clarification.
Args:
chat_history: List of ChatMessage objects in chronological order
Returns:
True if the last assistant message has is_clarification=True, False otherwise
"""
for message in reversed(chat_history):
if message.message_type == MessageType.ASSISTANT:
return message.is_clarification
return False

View File

@@ -24,9 +24,19 @@ from onyx.configs.constants import MessageType
from onyx.context.search.models import SearchDoc
from onyx.context.search.models import SearchDocsResponse
from onyx.db.models import Persona
from onyx.file_store.models import ChatFileType
from onyx.llm.interfaces import LanguageModelInput
from onyx.llm.interfaces import LLM
from onyx.llm.interfaces import LLMUserIdentity
from onyx.llm.interfaces import ToolChoiceOptions
from onyx.llm.message_types import AssistantMessage
from onyx.llm.message_types import ChatCompletionMessage
from onyx.llm.message_types import ImageContentPart
from onyx.llm.message_types import SystemMessage
from onyx.llm.message_types import TextContentPart
from onyx.llm.message_types import ToolCall
from onyx.llm.message_types import ToolMessage
from onyx.llm.message_types import UserMessageWithParts
from onyx.llm.message_types import UserMessageWithText
from onyx.llm.utils import model_needs_formatting_reenabled
from onyx.prompts.chat_prompts import IMAGE_GEN_REMINDER
from onyx.prompts.chat_prompts import OPEN_URL_REMINDER
@@ -46,6 +56,7 @@ from onyx.tools.tool_implementations.search.search_tool import SearchTool
from onyx.tools.tool_implementations.web_search.web_search_tool import WebSearchTool
from onyx.tools.tool_runner import run_tool_calls
from onyx.tracing.framework.create import trace
from onyx.utils.b64 import get_image_type_from_bytes
from onyx.utils.logger import setup_logger
from shared_configs.contextvars import get_current_tenant_id
@@ -104,23 +115,15 @@ def construct_message_history(
custom_agent_prompt: ChatMessageSimple | None,
simple_chat_history: list[ChatMessageSimple],
reminder_message: ChatMessageSimple | None,
project_files: ExtractedProjectFiles | None,
project_files: ExtractedProjectFiles,
available_tokens: int,
last_n_user_messages: int | None = None,
) -> list[ChatMessageSimple]:
if last_n_user_messages is not None:
if last_n_user_messages <= 0:
raise ValueError(
"filtering chat history by last N user messages must be a value greater than 0"
)
history_token_budget = available_tokens
history_token_budget -= system_prompt.token_count
history_token_budget -= (
custom_agent_prompt.token_count if custom_agent_prompt else 0
)
if project_files:
history_token_budget -= project_files.total_token_count
history_token_budget -= project_files.total_token_count
history_token_budget -= reminder_message.token_count if reminder_message else 0
if history_token_budget < 0:
@@ -131,7 +134,7 @@ def construct_message_history(
result = [system_prompt]
if custom_agent_prompt:
result.append(custom_agent_prompt)
if project_files and project_files.project_file_texts:
if project_files.project_file_texts:
project_message = _create_project_files_message(
project_files, token_counter=None
)
@@ -140,26 +143,6 @@ def construct_message_history(
result.append(reminder_message)
return result
# If last_n_user_messages is set, filter history to only include the last n user messages
if last_n_user_messages is not None:
# Find all user message indices
user_msg_indices = [
i
for i, msg in enumerate(simple_chat_history)
if msg.message_type == MessageType.USER
]
if not user_msg_indices:
raise ValueError("No user message found in simple_chat_history")
# If we have more than n user messages, keep only the last n
if len(user_msg_indices) > last_n_user_messages:
# Find the index of the n-th user message from the end
# For example, if last_n_user_messages=2, we want the 2nd-to-last user message
nth_user_msg_index = user_msg_indices[-(last_n_user_messages)]
# Keep everything from that user message onwards
simple_chat_history = simple_chat_history[nth_user_msg_index:]
# Find the last USER message in the history
# The history may contain tool calls and responses after the last user message
last_user_msg_index = None
@@ -207,7 +190,7 @@ def construct_message_history(
break
# Attach project images to the last user message
if project_files and project_files.project_image_files:
if project_files.project_image_files:
existing_images = last_user_message.image_files or []
last_user_message = ChatMessageSimple(
message=last_user_message.message,
@@ -229,7 +212,7 @@ def construct_message_history(
result.append(custom_agent_prompt)
# 3. Add project files message (inserted before last user message)
if project_files and project_files.project_file_texts:
if project_files.project_file_texts:
project_message = _create_project_files_message(
project_files, token_counter=None
)
@@ -279,6 +262,140 @@ def _create_project_files_message(
)
def translate_history_to_llm_format(
history: list[ChatMessageSimple],
) -> LanguageModelInput:
"""Convert a list of ChatMessageSimple to LanguageModelInput format.
Converts ChatMessageSimple messages to ChatCompletionMessage format,
handling different message types and image files for multimodal support.
"""
messages: list[ChatCompletionMessage] = []
for msg in history:
if msg.message_type == MessageType.SYSTEM:
system_msg: SystemMessage = {
"role": "system",
"content": msg.message,
}
messages.append(system_msg)
elif msg.message_type == MessageType.USER:
# Handle user messages with potential images
if msg.image_files:
# Build content parts: text + images
content_parts: list[TextContentPart | ImageContentPart] = [
{"type": "text", "text": msg.message}
]
# Add image parts
for img_file in msg.image_files:
if img_file.file_type == ChatFileType.IMAGE:
try:
image_type = get_image_type_from_bytes(img_file.content)
base64_data = img_file.to_base64()
image_url = f"data:{image_type};base64,{base64_data}"
image_part: ImageContentPart = {
"type": "image_url",
"image_url": {"url": image_url},
}
content_parts.append(image_part)
except Exception as e:
logger.warning(
f"Failed to process image file {img_file.file_id}: {e}. "
"Skipping image."
)
user_msg_with_parts: UserMessageWithParts = {
"role": "user",
"content": content_parts,
}
messages.append(user_msg_with_parts)
else:
# Simple text-only user message
user_msg_text: UserMessageWithText = {
"role": "user",
"content": msg.message,
}
messages.append(user_msg_text)
elif msg.message_type == MessageType.ASSISTANT:
assistant_msg: AssistantMessage = {
"role": "assistant",
"content": msg.message or None,
}
messages.append(assistant_msg)
elif msg.message_type == MessageType.TOOL_CALL:
# Tool calls are represented as Assistant Messages with tool_calls field
# Try to reconstruct tool call structure if we have tool_call_id
tool_calls: list[ToolCall] = []
if msg.tool_call_id:
try:
# Parse the message content (which should contain function_name and arguments)
tool_call_data = json.loads(msg.message) if msg.message else {}
if (
isinstance(tool_call_data, dict)
and TOOL_CALL_MSG_FUNC_NAME in tool_call_data
):
function_name = tool_call_data.get(
TOOL_CALL_MSG_FUNC_NAME, "unknown"
)
tool_args = tool_call_data.get(TOOL_CALL_MSG_ARGUMENTS, {})
else:
function_name = "unknown"
tool_args = (
tool_call_data if isinstance(tool_call_data, dict) else {}
)
# NOTE: if the model is trained on a different tool call format, this may slightly interfere
# with the future tool calls, if it doesn't look like this. Almost certainly not a big deal.
tool_call: ToolCall = {
"id": msg.tool_call_id,
"type": "function",
"function": {
"name": function_name,
"arguments": json.dumps(tool_args) if tool_args else "{}",
},
}
tool_calls.append(tool_call)
except (json.JSONDecodeError, ValueError) as e:
logger.warning(
f"Failed to parse tool call data for tool_call_id {msg.tool_call_id}: {e}. "
"Including as content-only message."
)
assistant_msg_with_tool: AssistantMessage = {
"role": "assistant",
"content": None, # The tool call is parsed, doesn't need to be duplicated in the content
}
if tool_calls:
assistant_msg_with_tool["tool_calls"] = tool_calls
messages.append(assistant_msg_with_tool)
elif msg.message_type == MessageType.TOOL_CALL_RESPONSE:
if not msg.tool_call_id:
raise ValueError(
f"Tool call response message encountered but tool_call_id is not available. Message: {msg}"
)
tool_msg: ToolMessage = {
"role": "tool",
"content": msg.message,
"tool_call_id": msg.tool_call_id,
}
messages.append(tool_msg)
else:
logger.warning(
f"Unknown message type {msg.message_type} in history. Skipping message."
)
return messages
def run_llm_loop(
emitter: Emitter,
state_container: ChatStateContainer,
@@ -292,7 +409,6 @@ def run_llm_loop(
token_counter: Callable[[str], int],
db_session: Session,
forced_tool_id: int | None = None,
user_identity: LLMUserIdentity | None = None,
) -> None:
with trace("run_llm_loop", metadata={"tenant_id": get_current_tenant_id()}):
# Fix some LiteLLM issues,
@@ -324,7 +440,7 @@ def run_llm_loop(
# Pass the total budget to construct_message_history, which will handle token allocation
available_tokens = llm.config.max_input_tokens
tool_choice: ToolChoiceOptions = ToolChoiceOptions.AUTO
tool_choice: ToolChoiceOptions = "auto"
collected_tool_calls: list[ToolCallInfo] = []
# Initialize gathered_documents with project files if present
gathered_documents: list[SearchDoc] | None = (
@@ -340,7 +456,6 @@ def run_llm_loop(
should_cite_documents: bool = False
ran_image_gen: bool = False
just_ran_web_search: bool = False
has_called_search_tool: bool = False
citation_mapping: dict[int, str] = {} # Maps citation_num -> document_id/URL
current_tool_call_index = (
@@ -354,14 +469,14 @@ def run_llm_loop(
final_tools = [tool for tool in tools if tool.id == forced_tool_id]
if not final_tools:
raise ValueError(f"Tool {forced_tool_id} not found in tools")
tool_choice = ToolChoiceOptions.REQUIRED
tool_choice = "required"
forced_tool_id = None
elif llm_cycle_count == MAX_LLM_CYCLES - 1 or ran_image_gen:
# Last cycle, no tools allowed, just answer!
tool_choice = ToolChoiceOptions.NONE
tool_choice = "none"
final_tools = []
else:
tool_choice = ToolChoiceOptions.AUTO
tool_choice = "auto"
final_tools = tools
# The section below calculates the available tokens for history a bit more accurately
@@ -457,7 +572,6 @@ def run_llm_loop(
# immediately yield the full set of found documents. This gives us the option to show the
# final set of documents immediately if desired.
final_documents=gathered_documents,
user_identity=user_identity,
)
# Consume the generator, emitting packets and capturing the final result
@@ -492,13 +606,8 @@ def run_llm_loop(
user_info=None, # TODO, this is part of memories right now, might want to separate it out
citation_mapping=citation_mapping,
citation_processor=citation_processor,
skip_search_query_expansion=has_called_search_tool,
)
# Track if search tool was called (for skipping query expansion on subsequent calls)
if tool_call.tool_name == SearchTool.NAME:
has_called_search_tool = True
# Build a mapping of tool names to tool objects for getting tool_id
tools_by_name = {tool.name: tool for tool in final_tools}

View File

@@ -15,18 +15,16 @@ from onyx.context.search.models import SearchDoc
from onyx.file_store.models import ChatFileType
from onyx.llm.interfaces import LanguageModelInput
from onyx.llm.interfaces import LLM
from onyx.llm.interfaces import LLMUserIdentity
from onyx.llm.interfaces import ToolChoiceOptions
from onyx.llm.models import AssistantMessage
from onyx.llm.models import ChatCompletionMessage
from onyx.llm.models import FunctionCall
from onyx.llm.models import ImageContentPart
from onyx.llm.models import ImageUrlDetail
from onyx.llm.models import SystemMessage
from onyx.llm.models import TextContentPart
from onyx.llm.models import ToolCall
from onyx.llm.models import ToolMessage
from onyx.llm.models import UserMessage
from onyx.llm.message_types import AssistantMessage
from onyx.llm.message_types import ChatCompletionMessage
from onyx.llm.message_types import ImageContentPart
from onyx.llm.message_types import SystemMessage
from onyx.llm.message_types import TextContentPart
from onyx.llm.message_types import ToolCall
from onyx.llm.message_types import ToolMessage
from onyx.llm.message_types import UserMessageWithParts
from onyx.llm.message_types import UserMessageWithText
from onyx.server.query_and_chat.streaming_models import AgentResponseDelta
from onyx.server.query_and_chat.streaming_models import AgentResponseStart
from onyx.server.query_and_chat.streaming_models import CitationInfo
@@ -67,56 +65,78 @@ def _format_message_history_for_logging(
# Handle sequence of messages
for i, msg in enumerate(message_history):
if isinstance(msg, SystemMessage):
formatted_lines.append(f"Message {i + 1} [system]:")
formatted_lines.append(separator)
formatted_lines.append(f"{msg.content}")
elif isinstance(msg, UserMessage):
formatted_lines.append(f"Message {i + 1} [user]:")
formatted_lines.append(separator)
if isinstance(msg.content, str):
formatted_lines.append(f"{msg.content}")
elif isinstance(msg.content, list):
# Handle multimodal content (text + images)
for part in msg.content:
if isinstance(part, TextContentPart):
formatted_lines.append(f"{part.text}")
elif isinstance(part, ImageContentPart):
url = part.image_url.url
formatted_lines.append(f"[Image: {url[:50]}...]")
elif isinstance(msg, AssistantMessage):
formatted_lines.append(f"Message {i + 1} [assistant]:")
formatted_lines.append(separator)
if msg.content:
formatted_lines.append(f"{msg.content}")
if msg.tool_calls:
formatted_lines.append("Tool calls:")
for tool_call in msg.tool_calls:
tool_call_dict: dict[str, Any] = {
"id": tool_call.id,
"type": tool_call.type,
"function": {
"name": tool_call.function.name,
"arguments": tool_call.function.arguments,
},
}
tool_call_json = json.dumps(tool_call_dict, indent=4)
formatted_lines.append(tool_call_json)
elif isinstance(msg, ToolMessage):
formatted_lines.append(f"Message {i + 1} [tool]:")
formatted_lines.append(separator)
formatted_lines.append(f"Tool call ID: {msg.tool_call_id}")
formatted_lines.append(f"Response: {msg.content}")
else:
# Fallback for unknown message types
# Type guard: ensure msg is a dict-like object (TypedDict)
if not isinstance(msg, dict):
formatted_lines.append(f"Message {i + 1} [unknown]:")
formatted_lines.append(separator)
formatted_lines.append(f"{msg}")
if i < len(message_history) - 1:
formatted_lines.append(separator)
continue
role = msg.get("role", "unknown")
formatted_lines.append(f"Message {i + 1} [{role}]:")
formatted_lines.append(separator)
if role == "system":
content = msg.get("content", "")
if isinstance(content, str):
formatted_lines.append(f"{content}")
elif role == "user":
content = msg.get("content", "")
if isinstance(content, str):
formatted_lines.append(f"{content}")
elif isinstance(content, list):
# Handle multimodal content (text + images)
for part in content:
if isinstance(part, dict):
part_type = part.get("type")
if part_type == "text":
text = part.get("text", "")
if isinstance(text, str):
formatted_lines.append(f"{text}")
elif part_type == "image_url":
image_url_dict = part.get("image_url")
if isinstance(image_url_dict, dict):
url = image_url_dict.get("url", "")
if isinstance(url, str):
formatted_lines.append(f"[Image: {url[:50]}...]")
elif role == "assistant":
content = msg.get("content")
if content and isinstance(content, str):
formatted_lines.append(f"{content}")
tool_calls = msg.get("tool_calls")
if tool_calls and isinstance(tool_calls, list):
formatted_lines.append("Tool calls:")
for tool_call in tool_calls:
if isinstance(tool_call, dict):
tool_call_dict: dict[str, Any] = {}
tool_call_id = tool_call.get("id")
tool_call_type = tool_call.get("type")
function_dict = tool_call.get("function")
if tool_call_id:
tool_call_dict["id"] = tool_call_id
if tool_call_type:
tool_call_dict["type"] = tool_call_type
if isinstance(function_dict, dict):
tool_call_dict["function"] = {
"name": function_dict.get("name", ""),
"arguments": function_dict.get("arguments", ""),
}
tool_call_json = json.dumps(tool_call_dict, indent=4)
formatted_lines.append(tool_call_json)
elif role == "tool":
content = msg.get("content", "")
tool_call_id = msg.get("tool_call_id", "")
if isinstance(content, str) and isinstance(tool_call_id, str):
formatted_lines.append(f"Tool call ID: {tool_call_id}")
formatted_lines.append(f"Response: {content}")
# Add separator before next message (or at end)
if i < len(message_history) - 1:
@@ -197,10 +217,10 @@ def translate_history_to_llm_format(
for msg in history:
if msg.message_type == MessageType.SYSTEM:
system_msg = SystemMessage(
role="system",
content=msg.message,
)
system_msg: SystemMessage = {
"role": "system",
"content": msg.message,
}
messages.append(system_msg)
elif msg.message_type == MessageType.USER:
@@ -208,10 +228,7 @@ def translate_history_to_llm_format(
if msg.image_files:
# Build content parts: text + images
content_parts: list[TextContentPart | ImageContentPart] = [
TextContentPart(
type="text",
text=msg.message,
)
{"type": "text", "text": msg.message}
]
# Add image parts
@@ -222,38 +239,35 @@ def translate_history_to_llm_format(
base64_data = img_file.to_base64()
image_url = f"data:{image_type};base64,{base64_data}"
image_part = ImageContentPart(
type="image_url",
image_url=ImageUrlDetail(
url=image_url,
detail=None,
),
)
image_part: ImageContentPart = {
"type": "image_url",
"image_url": {"url": image_url},
}
content_parts.append(image_part)
except Exception as e:
logger.warning(
f"Failed to process image file {img_file.file_id}: {e}. "
"Skipping image."
)
user_msg = UserMessage(
role="user",
content=content_parts,
)
messages.append(user_msg)
user_msg_with_parts: UserMessageWithParts = {
"role": "user",
"content": content_parts,
}
messages.append(user_msg_with_parts)
else:
# Simple text-only user message
user_msg_text = UserMessage(
role="user",
content=msg.message,
)
user_msg_text: UserMessageWithText = {
"role": "user",
"content": msg.message,
}
messages.append(user_msg_text)
elif msg.message_type == MessageType.ASSISTANT:
assistant_msg = AssistantMessage(
role="assistant",
content=msg.message or None,
tool_calls=None,
)
assistant_msg: AssistantMessage = {
"role": "assistant",
"content": msg.message or None,
}
messages.append(assistant_msg)
elif msg.message_type == MessageType.TOOL_CALL:
@@ -281,14 +295,14 @@ def translate_history_to_llm_format(
# NOTE: if the model is trained on a different tool call format, this may slightly interfere
# with the future tool calls, if it doesn't look like this. Almost certainly not a big deal.
tool_call = ToolCall(
id=msg.tool_call_id,
type="function",
function=FunctionCall(
name=function_name,
arguments=json.dumps(tool_args) if tool_args else "{}",
),
)
tool_call: ToolCall = {
"id": msg.tool_call_id,
"type": "function",
"function": {
"name": function_name,
"arguments": json.dumps(tool_args) if tool_args else "{}",
},
}
tool_calls.append(tool_call)
except (json.JSONDecodeError, ValueError) as e:
logger.warning(
@@ -296,11 +310,12 @@ def translate_history_to_llm_format(
"Including as content-only message."
)
assistant_msg_with_tool = AssistantMessage(
role="assistant",
content=None, # The tool call is parsed, doesn't need to be duplicated in the content
tool_calls=tool_calls if tool_calls else None,
)
assistant_msg_with_tool: AssistantMessage = {
"role": "assistant",
"content": None, # The tool call is parsed, doesn't need to be duplicated in the content
}
if tool_calls:
assistant_msg_with_tool["tool_calls"] = tool_calls
messages.append(assistant_msg_with_tool)
elif msg.message_type == MessageType.TOOL_CALL_RESPONSE:
@@ -309,11 +324,11 @@ def translate_history_to_llm_format(
f"Tool call response message encountered but tool_call_id is not available. Message: {msg}"
)
tool_msg = ToolMessage(
role="tool",
content=msg.message,
tool_call_id=msg.tool_call_id,
)
tool_msg: ToolMessage = {
"role": "tool",
"content": msg.message,
"tool_call_id": msg.tool_call_id,
}
messages.append(tool_msg)
else:
@@ -333,7 +348,6 @@ def run_llm_step(
citation_processor: DynamicCitationProcessor,
state_container: ChatStateContainer,
final_documents: list[SearchDoc] | None = None,
user_identity: LLMUserIdentity | None = None,
) -> Generator[Packet, None, tuple[LlmStepResult, int]]:
# The second return value is for the turn index because reasoning counts on the frontend as a turn
# TODO this is maybe ok but does not align well with the backend logic too well
@@ -366,8 +380,6 @@ def run_llm_step(
tools=tool_definitions,
tool_choice=tool_choice,
structured_response_format=None, # TODO
# reasoning_effort=ReasoningEffort.OFF, # Can set this for dev/testing.
user_identity=user_identity,
):
if packet.usage:
usage = packet.usage
@@ -444,30 +456,27 @@ def run_llm_step(
tool_calls = _extract_tool_call_kickoffs(id_to_tool_call_map)
if tool_calls:
tool_calls_list: list[ToolCall] = [
ToolCall(
id=kickoff.tool_call_id,
type="function",
function=FunctionCall(
name=kickoff.tool_name,
arguments=json.dumps(kickoff.tool_args),
),
)
{
"id": kickoff.tool_call_id,
"type": "function",
"function": {
"name": kickoff.tool_name,
"arguments": json.dumps(kickoff.tool_args),
},
}
for kickoff in tool_calls
]
assistant_msg: AssistantMessage = AssistantMessage(
role="assistant",
content=accumulated_answer if accumulated_answer else None,
tool_calls=tool_calls_list,
)
span_generation.span_data.output = [assistant_msg.model_dump()]
assistant_msg: AssistantMessage = {
"role": "assistant",
"content": accumulated_answer if accumulated_answer else None,
"tool_calls": tool_calls_list,
}
span_generation.span_data.output = [assistant_msg]
elif accumulated_answer:
assistant_msg_no_tools = AssistantMessage(
role="assistant",
content=accumulated_answer,
tool_calls=None,
)
span_generation.span_data.output = [assistant_msg_no_tools.model_dump()]
span_generation.span_data.output = [
{"role": "assistant", "content": accumulated_answer}
]
# Close reasoning block if still open (stream ended with reasoning content)
if reasoning_start:
yield Packet(

View File

@@ -102,11 +102,6 @@ class MessageResponseIDInfo(BaseModel):
class StreamingError(BaseModel):
error: str
stack_trace: str | None = None
error_code: str | None = (
None # e.g., "RATE_LIMIT", "AUTH_ERROR", "TOOL_CALL_FAILED"
)
is_retryable: bool = True # Hint to frontend if retry might help
details: dict | None = None # Additional context (tool name, model name, etc.)
class OnyxAnswer(BaseModel):

View File

@@ -13,7 +13,6 @@ from onyx.chat.chat_state import run_chat_llm_with_state_containers
from onyx.chat.chat_utils import convert_chat_history
from onyx.chat.chat_utils import create_chat_history_chain
from onyx.chat.chat_utils import get_custom_agent_prompt
from onyx.chat.chat_utils import is_last_assistant_message_clarification
from onyx.chat.chat_utils import load_all_chat_files
from onyx.chat.emitter import get_default_emitter
from onyx.chat.llm_loop import run_llm_loop
@@ -40,6 +39,7 @@ from onyx.db.chat import get_chat_session_by_id
from onyx.db.chat import get_or_create_root_message
from onyx.db.chat import reserve_message_id
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.enums import AvatarQueryMode
from onyx.db.memory import get_memories
from onyx.db.models import ChatMessage
from onyx.db.models import User
@@ -51,25 +51,26 @@ from onyx.file_store.models import ChatFileType
from onyx.file_store.models import FileDescriptor
from onyx.file_store.utils import load_in_memory_chat_files
from onyx.file_store.utils import verify_user_files
from onyx.llm.factory import get_default_llms
from onyx.llm.factory import get_llm_token_counter
from onyx.llm.factory import get_llms_for_persona
from onyx.llm.factory import get_tokenizer
from onyx.llm.interfaces import LLM
from onyx.llm.interfaces import LLMUserIdentity
from onyx.llm.utils import litellm_exception_to_error_msg
from onyx.onyxbot.slack.models import SlackContext
from onyx.redis.redis_pool import get_redis_client
from onyx.server.features.avatar.query_service import execute_avatar_query
from onyx.server.query_and_chat.models import CreateChatMessageRequest
from onyx.server.query_and_chat.streaming_models import AgentResponseDelta
from onyx.server.query_and_chat.streaming_models import AgentResponseStart
from onyx.server.query_and_chat.streaming_models import CitationInfo
from onyx.server.query_and_chat.streaming_models import OverallStop
from onyx.server.query_and_chat.streaming_models import Packet
from onyx.server.utils import get_json_line
from onyx.tools.constants import SEARCH_TOOL_ID
from onyx.tools.tool import Tool
from onyx.tools.tool_constructor import construct_tools
from onyx.tools.tool_constructor import CustomToolConfig
from onyx.tools.tool_constructor import SearchToolConfig
from onyx.tools.tool_constructor import SearchToolUsage
from onyx.utils.logger import setup_logger
from onyx.utils.long_term_log import LongTermLogger
from onyx.utils.timing import log_function_time
@@ -83,10 +84,6 @@ ERROR_TYPE_CANCELLED = "cancelled"
class ToolCallException(Exception):
"""Exception raised for errors during tool calls."""
def __init__(self, message: str, tool_name: str | None = None):
super().__init__(message)
self.tool_name = tool_name
def _extract_project_file_texts_and_images(
project_id: int | None,
@@ -214,46 +211,6 @@ def _extract_project_file_texts_and_images(
)
def _get_project_search_availability(
project_id: int | None,
persona_id: int | None,
has_project_file_texts: bool,
forced_tool_ids: list[int] | None,
search_tool_id: int | None,
) -> SearchToolUsage:
"""Determine search tool availability based on project context.
Args:
project_id: The project ID if the user is in a project
persona_id: The persona ID to check if it's the default persona
has_project_file_texts: Whether project files are loaded in context
forced_tool_ids: List of forced tool IDs (may be mutated to remove search tool)
search_tool_id: The search tool ID to check against
Returns:
SearchToolUsage setting indicating how search should be used
"""
# There are cases where the internal search tool should be disabled
# If the user is in a project, it should not use other sources / generic search
# If they are in a project but using a custom agent, it should use the agent setup
# (which means it can use search)
# However if in a project and there are more files than can fit in the context,
# it should use the search tool with the project filter on
# If no files are uploaded, search should remain enabled
search_usage_forcing_setting = SearchToolUsage.AUTO
if project_id:
if bool(persona_id is DEFAULT_PERSONA_ID and has_project_file_texts):
search_usage_forcing_setting = SearchToolUsage.DISABLED
# Remove search tool from forced_tool_ids if it's present
if forced_tool_ids and search_tool_id and search_tool_id in forced_tool_ids:
forced_tool_ids[:] = [
tool_id for tool_id in forced_tool_ids if tool_id != search_tool_id
]
elif forced_tool_ids and search_tool_id and search_tool_id in forced_tool_ids:
search_usage_forcing_setting = SearchToolUsage.ENABLED
return search_usage_forcing_setting
def _initialize_chat_session(
message_text: str,
files: list[FileDescriptor],
@@ -306,6 +263,220 @@ def _initialize_chat_session(
return user_message
def _stream_avatar_query(
avatar_id: int,
query: str,
query_mode: AvatarQueryMode | None,
user: User,
db_session: Session,
chat_session_id: UUID,
parent_message_id: int | None,
) -> AnswerStream:
"""Handle avatar query and yield streaming response packets.
This creates user and assistant messages and yields the avatar response
in the same streaming format as regular chat messages.
"""
# Get a tokenizer for message initialization
llm, _ = get_default_llms()
token_counter = get_tokenizer(
model_name=llm.config.model_name, provider_type=llm.config.model_provider
)
# Initialize chat session to create user message
user_message = _initialize_chat_session(
message_text=query,
files=[],
token_counter=lambda text: len(token_counter.encode(text)),
parent_id=parent_message_id,
user_id=user.id,
chat_session_id=chat_session_id,
db_session=db_session,
use_existing_user_message=False,
)
# Commit user message
db_session.commit()
# Reserve assistant message ID
assistant_message = reserve_message_id(
db_session=db_session,
chat_session_id=chat_session_id,
parent_message=user_message.id,
message_type=MessageType.ASSISTANT,
)
# Yield message IDs first
yield MessageResponseIDInfo(
user_message_id=user_message.id,
reserved_assistant_message_id=assistant_message.id,
)
# Execute avatar query
result = execute_avatar_query(
avatar_id=avatar_id,
query=query,
query_mode=query_mode or AvatarQueryMode.OWNED_DOCUMENTS,
requester=user,
db_session=db_session,
chat_session_id=chat_session_id,
chat_message_id=user_message.id,
)
# Yield start packet
yield Packet(turn_index=0, obj=AgentResponseStart(final_documents=None))
# Build the response message based on status
if result.status == "success" and result.answer:
response_text = result.answer
elif result.status == "pending_permission":
response_text = (
f"Your request has been sent to the avatar owner for approval. "
f"Request ID: #{result.permission_request_id}\n\n"
f"You'll be notified when they respond."
)
elif result.status == "no_results":
response_text = result.message or "No relevant documents found."
elif result.status == "rate_limited":
response_text = (
result.message or "You have exceeded the rate limit for this avatar."
)
elif result.status == "disabled":
response_text = result.message or "This avatar is currently disabled."
else:
response_text = result.message or "An error occurred processing your request."
# Yield the response as delta packets (simulating streaming)
yield Packet(turn_index=0, obj=AgentResponseDelta(content=response_text))
# Yield stop packet to signal end of stream
yield Packet(turn_index=0, obj=OverallStop())
# Update the assistant message with the actual response
assistant_message.message = response_text
assistant_message.token_count = len(response_text.split()) # Simple token count
db_session.commit()
def _stream_broadcast_avatar_query(
avatar_ids: list[int],
query: str,
query_mode: AvatarQueryMode | None,
user: User,
db_session: Session,
chat_session_id: UUID,
parent_message_id: int | None,
) -> AnswerStream:
"""Handle broadcast avatar query - query multiple avatars and aggregate results.
This creates user and assistant messages and yields the aggregated avatar responses
in the same streaming format as regular chat messages.
"""
from onyx.db.avatar import get_avatar_by_id
from onyx.llm.utils import check_number_of_tokens
# Simple token counter for message initialization
def token_counter(text: str) -> int:
return check_number_of_tokens(text)
# Initialize chat session to create user message
user_message = _initialize_chat_session(
message_text=query,
files=[],
token_counter=token_counter,
parent_id=parent_message_id,
user_id=user.id,
chat_session_id=chat_session_id,
db_session=db_session,
use_existing_user_message=False,
)
# Commit user message
db_session.commit()
# Reserve assistant message ID
assistant_message = reserve_message_id(
db_session=db_session,
chat_session_id=chat_session_id,
parent_message=user_message.id,
message_type=MessageType.ASSISTANT,
)
# Yield message IDs first
yield MessageResponseIDInfo(
user_message_id=user_message.id,
reserved_assistant_message_id=assistant_message.id,
)
# Yield start packet
yield Packet(turn_index=0, obj=AgentResponseStart(final_documents=None))
# Execute queries for each avatar and collect results
results: list[tuple[str, str]] = [] # (avatar_name, response)
for avatar_id in avatar_ids:
avatar = get_avatar_by_id(avatar_id, db_session)
if not avatar:
results.append((f"Avatar #{avatar_id}", "Avatar not found"))
continue
avatar_name = (
avatar.name or (avatar.user.email if avatar.user else f"Avatar #{avatar_id}")
)
result = execute_avatar_query(
avatar_id=avatar_id,
query=query,
query_mode=query_mode or AvatarQueryMode.OWNED_DOCUMENTS,
requester=user,
db_session=db_session,
chat_session_id=chat_session_id,
chat_message_id=user_message.id,
)
# Build response for this avatar
if result.status == "success" and result.answer:
results.append((avatar_name, result.answer))
elif result.status == "pending_permission":
results.append(
(
avatar_name,
f"⏳ Permission requested (Request #{result.permission_request_id})",
)
)
elif result.status == "no_results":
results.append((avatar_name, "No relevant documents found"))
elif result.status == "rate_limited":
results.append((avatar_name, "Rate limited"))
elif result.status == "disabled":
results.append((avatar_name, "Avatar disabled"))
else:
results.append((avatar_name, result.message or "Error"))
# Format the aggregated response
response_parts = []
for avatar_name, response in results:
response_parts.append(f"## {avatar_name}\n\n{response}")
response_text = "\n\n---\n\n".join(response_parts)
# If no results at all
if not results:
response_text = "No avatars were queried."
# Yield the response as delta packets
yield Packet(turn_index=0, obj=AgentResponseDelta(content=response_text))
# Yield stop packet to signal end of stream
yield Packet(turn_index=0, obj=OverallStop())
# Update the assistant message with the actual response
assistant_message.message = response_text
assistant_message.token_count = len(response_text.split())
db_session.commit()
def stream_chat_message_objects(
new_msg_req: CreateChatMessageRequest,
user: User | None,
@@ -333,15 +504,45 @@ def stream_chat_message_objects(
tenant_id = get_current_tenant_id()
use_existing_user_message = new_msg_req.use_existing_user_message
llm: LLM | None = None
# Handle avatar queries - route to separate flow
# Single avatar query
if new_msg_req.avatar_id is not None:
if user is None:
yield StreamingError(error="Authentication required for avatar queries")
return
yield from _stream_avatar_query(
avatar_id=new_msg_req.avatar_id,
query=new_msg_req.message,
query_mode=new_msg_req.avatar_query_mode,
user=user,
db_session=db_session,
chat_session_id=new_msg_req.chat_session_id,
parent_message_id=new_msg_req.parent_message_id,
)
return
# Broadcast mode - multiple avatar queries
if new_msg_req.avatar_ids is not None and len(new_msg_req.avatar_ids) > 0:
if user is None:
yield StreamingError(error="Authentication required for avatar queries")
return
yield from _stream_broadcast_avatar_query(
avatar_ids=new_msg_req.avatar_ids,
query=new_msg_req.message,
query_mode=new_msg_req.avatar_query_mode,
user=user,
db_session=db_session,
chat_session_id=new_msg_req.chat_session_id,
parent_message_id=new_msg_req.parent_message_id,
)
return
llm: LLM
try:
user_id = user.id if user is not None else None
llm_user_identifier = (
user.email
if user is not None and getattr(user, "email", None)
else (str(user_id) if user_id else "anonymous_user")
)
chat_session = get_chat_session_by_id(
chat_session_id=new_msg_req.chat_session_id,
@@ -352,9 +553,6 @@ def stream_chat_message_objects(
message_text = new_msg_req.message
chat_session_id = new_msg_req.chat_session_id
user_identity = LLMUserIdentity(
user_id=llm_user_identifier, session_id=str(chat_session_id)
)
parent_id = new_msg_req.parent_message_id
reference_doc_ids = new_msg_req.search_doc_ids
retrieval_options = new_msg_req.retrieval_options
@@ -447,23 +645,19 @@ def stream_chat_message_objects(
db_session=db_session,
)
# Build a mapping of tool_id to tool_name for history reconstruction
all_tools = get_tools(db_session)
tool_id_to_name_map = {tool.id: tool.name for tool in all_tools}
search_tool_id = next(
(tool.id for tool in all_tools if tool.in_code_tool_id == SEARCH_TOOL_ID),
None,
)
# This may also mutate the new_msg_req.forced_tool_ids
# This logic is specifically for projects
search_usage_forcing_setting = _get_project_search_availability(
project_id=chat_session.project_id,
persona_id=persona.id,
has_project_file_texts=bool(extracted_project_files.project_file_texts),
forced_tool_ids=new_msg_req.forced_tool_ids,
search_tool_id=search_tool_id,
# There are cases where the internal search tool should be disabled
# If the user is in a project, it should not use other sources / generic search
# If they are in a project but using a custom agent, it should use the agent setup
# (which means it can use search)
# However if in a project and there are more files than can fit in the context,
# it should use the search tool with the project filter on
disable_internal_search = bool(
chat_session.project_id
and persona.id == DEFAULT_PERSONA_ID
and (
extracted_project_files.project_file_texts
or not extracted_project_files.project_as_filter
)
)
emitter = get_default_emitter()
@@ -492,7 +686,7 @@ def stream_chat_message_objects(
additional_headers=custom_tool_additional_headers,
),
allowed_tool_ids=new_msg_req.allowed_tool_ids,
search_usage_forcing_setting=search_usage_forcing_setting,
disable_internal_search=disable_internal_search,
)
tools: list[Tool] = []
for tool_list in tool_dict.values():
@@ -517,6 +711,10 @@ def stream_chat_message_objects(
reserved_assistant_message_id=assistant_response.id,
)
# Build a mapping of tool_id to tool_name for history reconstruction
all_tools = get_tools(db_session)
tool_id_to_name_map = {tool.id: tool.name for tool in all_tools}
# Convert the chat history into a simple format that is free of any DB objects
# and is easy to parse for the agent loop
simple_chat_history = convert_chat_history(
@@ -547,13 +745,6 @@ def stream_chat_message_objects(
# Note: DB session is not thread safe but nothing else uses it and the
# reference is passed directly so it's ok.
if os.environ.get("ENABLE_DEEP_RESEARCH_LOOP"): # Dev only feature flag for now
if chat_session.project_id:
raise RuntimeError("Deep research is not supported for projects")
# Skip clarification if the last assistant message was a clarification
# (user has already responded to a clarification question)
skip_clarification = is_last_assistant_message_clarification(chat_history)
yield from run_chat_llm_with_state_containers(
run_deep_research_llm_loop,
is_connected=check_is_connected,
@@ -565,8 +756,6 @@ def stream_chat_message_objects(
llm=llm,
token_counter=token_counter,
db_session=db_session,
skip_clarification=skip_clarification,
user_identity=user_identity,
)
else:
yield from run_chat_llm_with_state_containers(
@@ -588,7 +777,6 @@ def stream_chat_message_objects(
if new_msg_req.forced_tool_ids
else None
),
user_identity=user_identity,
)
# Determine if stopped by user
@@ -633,18 +821,13 @@ def stream_chat_message_objects(
tool_calls=state_container.tool_calls,
db_session=db_session,
assistant_message=assistant_response,
is_clarification=state_container.is_clarification,
)
except ValueError as e:
logger.exception("Failed to process chat message.")
error_msg = str(e)
yield StreamingError(
error=error_msg,
error_code="VALIDATION_ERROR",
is_retryable=True,
)
yield StreamingError(error=error_msg)
db_session.rollback()
return
@@ -654,17 +837,9 @@ def stream_chat_message_objects(
stack_trace = traceback.format_exc()
if isinstance(e, ToolCallException):
yield StreamingError(
error=error_msg,
stack_trace=stack_trace,
error_code="TOOL_CALL_FAILED",
is_retryable=True,
details={"tool_name": e.tool_name} if e.tool_name else None,
)
yield StreamingError(error=error_msg, stack_trace=stack_trace)
elif llm:
client_error_msg, error_code, is_retryable = litellm_exception_to_error_msg(
e, llm
)
client_error_msg = litellm_exception_to_error_msg(e, llm)
if llm.config.api_key and len(llm.config.api_key) > 2:
client_error_msg = client_error_msg.replace(
llm.config.api_key, "[REDACTED_API_KEY]"
@@ -673,24 +848,7 @@ def stream_chat_message_objects(
llm.config.api_key, "[REDACTED_API_KEY]"
)
yield StreamingError(
error=client_error_msg,
stack_trace=stack_trace,
error_code=error_code,
is_retryable=is_retryable,
details={
"model": llm.config.model_name,
"provider": llm.config.model_provider,
},
)
else:
# LLM was never initialized - early failure
yield StreamingError(
error="Failed to initialize the chat. Please check your configuration and try again.",
stack_trace=stack_trace,
error_code="INIT_FAILED",
is_retryable=True,
)
yield StreamingError(error=client_error_msg, stack_trace=stack_trace)
db_session.rollback()
return

View File

@@ -148,7 +148,6 @@ def save_chat_turn(
citation_docs_info: list[CitationDocInfo],
db_session: Session,
assistant_message: ChatMessage,
is_clarification: bool = False,
) -> None:
"""
Save a chat turn by populating the assistant_message and creating related entities.
@@ -176,12 +175,10 @@ def save_chat_turn(
citation_docs_info: List of citation document information for building citations mapping
db_session: Database session for persistence
assistant_message: The ChatMessage object to populate (should already exist in DB)
is_clarification: Whether this assistant message is a clarification question (deep research flow)
"""
# 1. Update ChatMessage with message content, reasoning tokens, and token count
assistant_message.message = message_text
assistant_message.reasoning_tokens = reasoning_tokens
assistant_message.is_clarification = is_clarification
# Calculate token count using default tokenizer, when storing, this should not use the LLM
# specific one so we use a system default tokenizer here.

View File

@@ -7,7 +7,6 @@ from shared_configs.contextvars import get_current_tenant_id
# Redis key prefixes for chat session stop signals
PREFIX = "chatsessionstop"
FENCE_PREFIX = f"{PREFIX}_fence"
FENCE_TTL = 24 * 60 * 60 # 24 hours - defensive TTL to prevent memory leaks
def set_fence(chat_session_id: UUID, redis_client: Redis, value: bool) -> None:
@@ -25,7 +24,7 @@ def set_fence(chat_session_id: UUID, redis_client: Redis, value: bool) -> None:
redis_client.delete(fence_key)
return
redis_client.set(fence_key, 0, ex=FENCE_TTL)
redis_client.set(fence_key, 0)
def is_connected(chat_session_id: UUID, redis_client: Redis) -> bool:

View File

@@ -24,12 +24,6 @@ APP_PORT = 8080
# prefix from requests directed towards the API server. In these cases, set this to `/api`
APP_API_PREFIX = os.environ.get("API_PREFIX", "")
# Whether to send user metadata (user_id/email and session_id) to the LLM provider.
# Disabled by default.
SEND_USER_METADATA_TO_LLM_PROVIDER = (
os.environ.get("SEND_USER_METADATA_TO_LLM_PROVIDER", "")
).lower() == "true"
#####
# User Facing Features Configs
#####
@@ -37,6 +31,7 @@ BLURB_SIZE = 128 # Number Encoder Tokens included in the chunk blurb
GENERATIVE_MODEL_ACCESS_CHECK_FREQ = int(
os.environ.get("GENERATIVE_MODEL_ACCESS_CHECK_FREQ") or 86400
) # 1 day
DISABLE_GENERATIVE_AI = os.environ.get("DISABLE_GENERATIVE_AI", "").lower() == "true"
# Controls whether users can use User Knowledge (personal documents) in assistants
DISABLE_USER_KNOWLEDGE = os.environ.get("DISABLE_USER_KNOWLEDGE", "").lower() == "true"

View File

@@ -177,7 +177,6 @@ class DocumentSource(str, Enum):
SLAB = "slab"
PRODUCTBOARD = "productboard"
FILE = "file"
CODA = "coda"
NOTION = "notion"
ZULIP = "zulip"
LINEAR = "linear"
@@ -236,6 +235,10 @@ class NotificationType(str, Enum):
REINDEX = "reindex"
PERSONA_SHARED = "persona_shared"
TRIAL_ENDS_TWO_DAYS = "two_day_trial_ending" # 2 days left in trial
# Avatar permission requests
AVATAR_PERMISSION_REQUEST = "avatar_permission_request"
AVATAR_REQUEST_APPROVED = "avatar_request_approved"
AVATAR_REQUEST_DENIED = "avatar_request_denied"
class BlobType(str, Enum):
@@ -543,6 +546,9 @@ class OnyxCeleryTask:
EVAL_RUN_TASK = "eval_run_task"
# Avatar queries
AVATAR_QUERY_TASK = "avatar_query_task"
EXPORT_QUERY_HISTORY_TASK = "export_query_history_task"
EXPORT_QUERY_HISTORY_CLEANUP_TASK = "export_query_history_cleanup_task"
@@ -597,7 +603,6 @@ DocumentSourceDescription: dict[DocumentSource, str] = {
DocumentSource.SLAB: "slab data",
DocumentSource.PRODUCTBOARD: "productboard data (boards, etc.)",
DocumentSource.FILE: "files",
DocumentSource.CODA: "coda - team workspace with docs, tables, and pages",
DocumentSource.NOTION: "notion data - a workspace that combines note-taking, \
project management, and collaboration tools into a single, customizable platform",
DocumentSource.ZULIP: "zulip data",

View File

@@ -65,10 +65,9 @@ GEN_AI_NUM_RESERVED_OUTPUT_TOKENS = int(
os.environ.get("GEN_AI_NUM_RESERVED_OUTPUT_TOKENS") or 1024
)
# Fallback token limit for models where the max context is unknown
# Set conservatively at 32K to handle most modern models
# Typically, GenAI models nowadays are at least 4K tokens
GEN_AI_MODEL_FALLBACK_MAX_TOKENS = int(
os.environ.get("GEN_AI_MODEL_FALLBACK_MAX_TOKENS") or 32000
os.environ.get("GEN_AI_MODEL_FALLBACK_MAX_TOKENS") or 4096
)
# This is used when computing how much context space is available for documents

View File

@@ -97,31 +97,28 @@ class AsanaAPI:
self, project_gid: str, start_date: str, start_seconds: int
) -> Iterator[AsanaTask]:
project = self.project_api.get_project(project_gid, opts={})
project_name = project.get("name", project_gid)
team = project.get("team") or {}
team_gid = team.get("gid")
if project.get("archived"):
logger.info(f"Skipping archived project: {project_name} ({project_gid})")
return
if not team_gid:
if project["archived"]:
logger.info(f"Skipping archived project: {project['name']} ({project_gid})")
yield from []
if not project["team"] or not project["team"]["gid"]:
logger.info(
f"Skipping project without a team: {project_name} ({project_gid})"
f"Skipping project without a team: {project['name']} ({project_gid})"
)
return
if project.get("privacy_setting") == "private":
if self.team_gid and team_gid != self.team_gid:
yield from []
if project["privacy_setting"] == "private":
if self.team_gid and project["team"]["gid"] != self.team_gid:
logger.info(
f"Skipping private project not in configured team: {project_name} ({project_gid})"
f"Skipping private project not in configured team: {project['name']} ({project_gid})"
)
yield from []
else:
logger.info(
f"Processing private project in configured team: {project['name']} ({project_gid})"
)
return
logger.info(
f"Processing private project in configured team: {project_name} ({project_gid})"
)
simple_start_date = start_date.split(".")[0].split("+")[0]
logger.info(
f"Fetching tasks modified since {simple_start_date} for project: {project_name} ({project_gid})"
f"Fetching tasks modified since {simple_start_date} for project: {project['name']} ({project_gid})"
)
opts = {
@@ -160,7 +157,7 @@ class AsanaAPI:
link=data["permalink_url"],
last_modified=datetime.fromisoformat(data["modified_at"]),
project_gid=project_gid,
project_name=project_name,
project_name=project["name"],
)
yield task
except Exception:

View File

@@ -1,711 +0,0 @@
import os
from collections.abc import Generator
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import cast
from typing import Dict
from typing import List
from typing import Optional
from pydantic import BaseModel
from retry import retry
from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import (
rl_requests,
)
from onyx.connectors.exceptions import ConnectorValidationError
from onyx.connectors.exceptions import CredentialExpiredError
from onyx.connectors.exceptions import UnexpectedValidationError
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TextSection
from onyx.utils.batching import batch_generator
from onyx.utils.logger import setup_logger
_CODA_CALL_TIMEOUT = 30
_CODA_BASE_URL = "https://coda.io/apis/v1"
logger = setup_logger()
class CodaClientRequestFailedError(ConnectionError):
def __init__(self, message: str, status_code: int):
super().__init__(
f"Coda API request failed with status {status_code}: {message}"
)
self.status_code = status_code
class CodaDoc(BaseModel):
id: str
browser_link: str
name: str
created_at: str
updated_at: str
workspace_id: str
workspace_name: str
folder_id: str | None
folder_name: str | None
class CodaPage(BaseModel):
id: str
browser_link: str
name: str
content_type: str
created_at: str
updated_at: str
doc_id: str
class CodaTable(BaseModel):
id: str
name: str
browser_link: str
created_at: str
updated_at: str
doc_id: str
class CodaRow(BaseModel):
id: str
name: Optional[str] = None
index: Optional[int] = None
browser_link: str
created_at: str
updated_at: str
values: Dict[str, Any]
table_id: str
doc_id: str
class CodaApiClient:
def __init__(
self,
bearer_token: str,
) -> None:
self.bearer_token = bearer_token
self.base_url = os.environ.get("CODA_BASE_URL", _CODA_BASE_URL)
def get(
self, endpoint: str, params: Optional[dict[str, str]] = None
) -> dict[str, Any]:
url = self._build_url(endpoint)
headers = self._build_headers()
response = rl_requests.get(
url, headers=headers, params=params, timeout=_CODA_CALL_TIMEOUT
)
try:
json = response.json()
except Exception:
json = {}
if response.status_code >= 300:
error = response.reason
response_error = json.get("error", {}).get("message", "")
if response_error:
error = response_error
raise CodaClientRequestFailedError(error, response.status_code)
return json
def _build_headers(self) -> Dict[str, str]:
return {"Authorization": f"Bearer {self.bearer_token}"}
def _build_url(self, endpoint: str) -> str:
return self.base_url.rstrip("/") + "/" + endpoint.lstrip("/")
class CodaConnector(LoadConnector, PollConnector):
def __init__(
self,
batch_size: int = INDEX_BATCH_SIZE,
index_page_content: bool = True,
workspace_id: str | None = None,
) -> None:
self.batch_size = batch_size
self.index_page_content = index_page_content
self.workspace_id = workspace_id
self._coda_client: CodaApiClient | None = None
@property
def coda_client(self) -> CodaApiClient:
if self._coda_client is None:
raise ConnectorMissingCredentialError("Coda")
return self._coda_client
@retry(tries=3, delay=1, backoff=2)
def _get_doc(self, doc_id: str) -> CodaDoc:
"""Fetch a specific Coda document by its ID."""
logger.debug(f"Fetching Coda doc with ID: {doc_id}")
try:
response = self.coda_client.get(f"docs/{doc_id}")
except CodaClientRequestFailedError as e:
if e.status_code == 404:
raise ConnectorValidationError(f"Failed to fetch doc: {doc_id}") from e
else:
raise
return CodaDoc(
id=response["id"],
browser_link=response["browserLink"],
name=response["name"],
created_at=response["createdAt"],
updated_at=response["updatedAt"],
workspace_id=response["workspace"]["id"],
workspace_name=response["workspace"]["name"],
folder_id=response["folder"]["id"] if response.get("folder") else None,
folder_name=response["folder"]["name"] if response.get("folder") else None,
)
@retry(tries=3, delay=1, backoff=2)
def _get_page(self, doc_id: str, page_id: str) -> CodaPage:
"""Fetch a specific page from a Coda document."""
logger.debug(f"Fetching Coda page with ID: {page_id}")
try:
response = self.coda_client.get(f"docs/{doc_id}/pages/{page_id}")
except CodaClientRequestFailedError as e:
if e.status_code == 404:
raise ConnectorValidationError(
f"Failed to fetch page: {page_id} from doc: {doc_id}"
) from e
else:
raise
return CodaPage(
id=response["id"],
doc_id=doc_id,
browser_link=response["browserLink"],
name=response["name"],
content_type=response["contentType"],
created_at=response["createdAt"],
updated_at=response["updatedAt"],
)
@retry(tries=3, delay=1, backoff=2)
def _get_table(self, doc_id: str, table_id: str) -> CodaTable:
"""Fetch a specific table from a Coda document."""
logger.debug(f"Fetching Coda table with ID: {table_id}")
try:
response = self.coda_client.get(f"docs/{doc_id}/tables/{table_id}")
except CodaClientRequestFailedError as e:
if e.status_code == 404:
raise ConnectorValidationError(
f"Failed to fetch table: {table_id} from doc: {doc_id}"
) from e
else:
raise
return CodaTable(
id=response["id"],
name=response["name"],
browser_link=response["browserLink"],
created_at=response["createdAt"],
updated_at=response["updatedAt"],
doc_id=doc_id,
)
@retry(tries=3, delay=1, backoff=2)
def _get_row(self, doc_id: str, table_id: str, row_id: str) -> CodaRow:
"""Fetch a specific row from a Coda table."""
logger.debug(f"Fetching Coda row with ID: {row_id}")
try:
response = self.coda_client.get(
f"docs/{doc_id}/tables/{table_id}/rows/{row_id}"
)
except CodaClientRequestFailedError as e:
if e.status_code == 404:
raise ConnectorValidationError(
f"Failed to fetch row: {row_id} from table: {table_id} in doc: {doc_id}"
) from e
else:
raise
values = {}
for col_name, col_value in response.get("values", {}).items():
values[col_name] = col_value
return CodaRow(
id=response["id"],
name=response.get("name"),
index=response.get("index"),
browser_link=response["browserLink"],
created_at=response["createdAt"],
updated_at=response["updatedAt"],
values=values,
table_id=table_id,
doc_id=doc_id,
)
@retry(tries=3, delay=1, backoff=2)
def _list_all_docs(
self, endpoint: str = "docs", params: Optional[Dict[str, str]] = None
) -> List[CodaDoc]:
"""List all Coda documents in the workspace."""
logger.debug("Listing documents in Coda")
all_docs: List[CodaDoc] = []
next_page_token: str | None = None
params = params or {}
if self.workspace_id:
params["workspaceId"] = self.workspace_id
while True:
if next_page_token:
params["pageToken"] = next_page_token
try:
response = self.coda_client.get(endpoint, params=params)
except CodaClientRequestFailedError as e:
if e.status_code == 404:
raise ConnectorValidationError("Failed to list docs") from e
else:
raise
items = response.get("items", [])
for item in items:
doc = CodaDoc(
id=item["id"],
browser_link=item["browserLink"],
name=item["name"],
created_at=item["createdAt"],
updated_at=item["updatedAt"],
workspace_id=item["workspace"]["id"],
workspace_name=item["workspace"]["name"],
folder_id=item["folder"]["id"] if item.get("folder") else None,
folder_name=item["folder"]["name"] if item.get("folder") else None,
)
all_docs.append(doc)
next_page_token = response.get("nextPageToken")
if not next_page_token:
break
logger.debug(f"Found {len(all_docs)} docs")
return all_docs
@retry(tries=3, delay=1, backoff=2)
def _list_pages_in_doc(self, doc_id: str) -> List[CodaPage]:
"""List all pages in a Coda document."""
logger.debug(f"Listing pages in Coda doc with ID: {doc_id}")
pages: List[CodaPage] = []
endpoint = f"docs/{doc_id}/pages"
params: Dict[str, str] = {}
next_page_token: str | None = None
while True:
if next_page_token:
params["pageToken"] = next_page_token
try:
response = self.coda_client.get(endpoint, params=params)
except CodaClientRequestFailedError as e:
if e.status_code == 404:
raise ConnectorValidationError(
f"Failed to list pages for doc: {doc_id}"
) from e
else:
raise
items = response.get("items", [])
for item in items:
                # Skip hidden pages; this check can be dropped if hidden pages should be indexed
if item.get("isHidden", False):
continue
pages.append(
CodaPage(
id=item["id"],
browser_link=item["browserLink"],
name=item["name"],
content_type=item["contentType"],
created_at=item["createdAt"],
updated_at=item["updatedAt"],
doc_id=doc_id,
)
)
next_page_token = response.get("nextPageToken")
if not next_page_token:
break
logger.debug(f"Found {len(pages)} pages in doc {doc_id}")
return pages
@retry(tries=3, delay=1, backoff=2)
def _fetch_page_content(self, doc_id: str, page_id: str) -> str:
"""Fetch the content of a Coda page."""
logger.debug(f"Fetching content for page {page_id} in doc {doc_id}")
content_parts = []
next_page_token: str | None = None
params: Dict[str, str] = {}
while True:
if next_page_token:
params["pageToken"] = next_page_token
try:
response = self.coda_client.get(
f"docs/{doc_id}/pages/{page_id}/content", params=params
)
except CodaClientRequestFailedError as e:
if e.status_code == 404:
logger.debug(f"No content available for page {page_id}")
return ""
raise
items = response.get("items", [])
for item in items:
item_content = item.get("itemContent", {})
content_text = item_content.get("content", "")
if content_text:
content_parts.append(content_text)
next_page_token = response.get("nextPageToken")
if not next_page_token:
break
return "\n\n".join(content_parts)
@retry(tries=3, delay=1, backoff=2)
def _list_tables(self, doc_id: str) -> List[CodaTable]:
"""List all tables in a Coda document."""
logger.debug(f"Listing tables in Coda doc with ID: {doc_id}")
tables: List[CodaTable] = []
endpoint = f"docs/{doc_id}/tables"
params: Dict[str, str] = {}
next_page_token: str | None = None
while True:
if next_page_token:
params["pageToken"] = next_page_token
try:
response = self.coda_client.get(endpoint, params=params)
except CodaClientRequestFailedError as e:
if e.status_code == 404:
raise ConnectorValidationError(
f"Failed to list tables for doc: {doc_id}"
) from e
else:
raise
items = response.get("items", [])
for item in items:
tables.append(
CodaTable(
id=item["id"],
browser_link=item["browserLink"],
name=item["name"],
created_at=item["createdAt"],
updated_at=item["updatedAt"],
doc_id=doc_id,
)
)
next_page_token = response.get("nextPageToken")
if not next_page_token:
break
logger.debug(f"Found {len(tables)} tables in doc {doc_id}")
return tables
@retry(tries=3, delay=1, backoff=2)
def _list_rows_and_values(self, doc_id: str, table_id: str) -> List[CodaRow]:
"""List all rows and their values in a table."""
logger.debug(f"Listing rows in Coda table: {table_id} in Coda doc: {doc_id}")
rows: List[CodaRow] = []
endpoint = f"docs/{doc_id}/tables/{table_id}/rows"
params: Dict[str, str] = {"valueFormat": "rich"}
next_page_token: str | None = None
while True:
if next_page_token:
params["pageToken"] = next_page_token
try:
response = self.coda_client.get(endpoint, params=params)
except CodaClientRequestFailedError as e:
if e.status_code == 404:
raise ConnectorValidationError(
f"Failed to list rows for table: {table_id} in doc: {doc_id}"
) from e
else:
raise
items = response.get("items", [])
for item in items:
                values = dict(item.get("values", {}))
rows.append(
CodaRow(
id=item["id"],
name=item["name"],
index=item["index"],
browser_link=item["browserLink"],
created_at=item["createdAt"],
updated_at=item["updatedAt"],
values=values,
table_id=table_id,
doc_id=doc_id,
)
)
next_page_token = response.get("nextPageToken")
if not next_page_token:
break
logger.debug(f"Found {len(rows)} rows in table {table_id}")
return rows
def _convert_page_to_document(self, page: CodaPage, content: str = "") -> Document:
"""Convert a page into a Document."""
page_updated = datetime.fromisoformat(page.updated_at).astimezone(timezone.utc)
text_parts = [page.name, page.browser_link]
if content:
text_parts.append(content)
sections = [TextSection(link=page.browser_link, text="\n\n".join(text_parts))]
return Document(
id=f"coda-page-{page.doc_id}-{page.id}",
sections=cast(list[TextSection | ImageSection], sections),
source=DocumentSource.CODA,
semantic_identifier=page.name or f"Page {page.id}",
doc_updated_at=page_updated,
metadata={
"browser_link": page.browser_link,
"doc_id": page.doc_id,
"content_type": page.content_type,
},
)
def _convert_table_with_rows_to_document(
self, table: CodaTable, rows: List[CodaRow]
) -> Document:
"""Convert a table and its rows into a single Document with multiple sections (one per row)."""
table_updated = datetime.fromisoformat(table.updated_at).astimezone(
timezone.utc
)
sections: List[TextSection] = []
for row in rows:
content_text = " ".join(
str(v) if not isinstance(v, list) else " ".join(map(str, v))
for v in row.values.values()
)
row_name = row.name or f"Row {row.index or row.id}"
text = f"{row_name}: {content_text}" if content_text else row_name
sections.append(TextSection(link=row.browser_link, text=text))
# If no rows, create a single section for the table itself
if not sections:
sections = [
TextSection(link=table.browser_link, text=f"Table: {table.name}")
]
return Document(
id=f"coda-table-{table.doc_id}-{table.id}",
sections=cast(list[TextSection | ImageSection], sections),
source=DocumentSource.CODA,
semantic_identifier=table.name or f"Table {table.id}",
doc_updated_at=table_updated,
metadata={
"browser_link": table.browser_link,
"doc_id": table.doc_id,
"row_count": str(len(rows)),
},
)
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
"""Load and validate Coda credentials."""
self._coda_client = CodaApiClient(bearer_token=credentials["coda_bearer_token"])
try:
self._coda_client.get("docs", params={"limit": "1"})
except CodaClientRequestFailedError as e:
if e.status_code == 401:
                raise ConnectorMissingCredentialError("Invalid Coda API token") from e
raise
return None
def load_from_state(self) -> GenerateDocumentsOutput:
"""Load all documents from Coda workspace."""
def _iter_documents() -> Generator[Document, None, None]:
docs = self._list_all_docs()
logger.info(f"Found {len(docs)} Coda docs to process")
for doc in docs:
logger.debug(f"Processing doc: {doc.name} ({doc.id})")
try:
pages = self._list_pages_in_doc(doc.id)
for page in pages:
content = ""
if self.index_page_content:
try:
content = self._fetch_page_content(doc.id, page.id)
except Exception as e:
logger.warning(
f"Failed to fetch content for page {page.id}: {e}"
)
yield self._convert_page_to_document(page, content)
except ConnectorValidationError as e:
logger.warning(f"Failed to list pages for doc {doc.id}: {e}")
try:
tables = self._list_tables(doc.id)
for table in tables:
try:
rows = self._list_rows_and_values(doc.id, table.id)
yield self._convert_table_with_rows_to_document(table, rows)
except ConnectorValidationError as e:
logger.warning(
f"Failed to list rows for table {table.id}: {e}"
)
yield self._convert_table_with_rows_to_document(table, [])
except ConnectorValidationError as e:
logger.warning(f"Failed to list tables for doc {doc.id}: {e}")
return batch_generator(_iter_documents(), self.batch_size)
def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
"""
Polls the Coda API for documents updated between start and end timestamps.
        Page, table, and row update times determine which documents need to be re-indexed.
"""
def _iter_documents() -> Generator[Document, None, None]:
docs = self._list_all_docs()
logger.info(
f"Polling {len(docs)} Coda docs for updates between {start} and {end}"
)
for doc in docs:
try:
pages = self._list_pages_in_doc(doc.id)
for page in pages:
page_timestamp = (
datetime.fromisoformat(page.updated_at)
.astimezone(timezone.utc)
.timestamp()
)
if start < page_timestamp <= end:
content = ""
if self.index_page_content:
try:
content = self._fetch_page_content(doc.id, page.id)
except Exception as e:
logger.warning(
f"Failed to fetch content for page {page.id}: {e}"
)
yield self._convert_page_to_document(page, content)
except ConnectorValidationError as e:
logger.warning(f"Failed to list pages for doc {doc.id}: {e}")
try:
tables = self._list_tables(doc.id)
for table in tables:
table_timestamp = (
datetime.fromisoformat(table.updated_at)
.astimezone(timezone.utc)
.timestamp()
)
try:
rows = self._list_rows_and_values(doc.id, table.id)
table_or_rows_updated = start < table_timestamp <= end
if not table_or_rows_updated:
for row in rows:
row_timestamp = (
datetime.fromisoformat(row.updated_at)
.astimezone(timezone.utc)
.timestamp()
)
if start < row_timestamp <= end:
table_or_rows_updated = True
break
if table_or_rows_updated:
yield self._convert_table_with_rows_to_document(
table, rows
)
except ConnectorValidationError as e:
logger.warning(
f"Failed to list rows for table {table.id}: {e}"
)
                        if start < table_timestamp <= end:
yield self._convert_table_with_rows_to_document(
table, []
)
except ConnectorValidationError as e:
logger.warning(f"Failed to list tables for doc {doc.id}: {e}")
return batch_generator(_iter_documents(), self.batch_size)
def validate_connector_settings(self) -> None:
"""Validates the Coda connector settings calling the 'whoami' endpoint."""
try:
response = self.coda_client.get("whoami")
logger.info(
f"Coda connector validated for user: {response.get('name', 'Unknown')}"
)
if self.workspace_id:
params = {"workspaceId": self.workspace_id, "limit": "1"}
self.coda_client.get("docs", params=params)
logger.info(f"Validated access to workspace: {self.workspace_id}")
        except CodaClientRequestFailedError as e:
            if e.status_code == 401:
                raise CredentialExpiredError(
                    "Coda credential appears to be invalid or expired (HTTP 401)."
                ) from e
            elif e.status_code == 404:
                raise ConnectorValidationError(
                    "Coda workspace not found or not accessible (HTTP 404). "
                    "Please verify the workspace_id is correct and shared with the integration."
                ) from e
            elif e.status_code == 429:
                raise ConnectorValidationError(
                    "Validation failed due to Coda rate-limits being exceeded (HTTP 429). "
                    "Please try again later."
                ) from e
            else:
                raise UnexpectedValidationError(
                    f"Unexpected Coda HTTP error (status={e.status_code}): {e}"
                ) from e
        except Exception as exc:
            raise UnexpectedValidationError(
                f"Unexpected error during Coda settings validation: {exc}"
            ) from exc
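# A minimal usage sketch (assumptions: CodaConnector() needs no required
# constructor arguments here, and the token below is a placeholder, not a
# real credential):
if __name__ == "__main__":
    connector = CodaConnector()
    connector.load_credentials({"coda_bearer_token": "<coda-api-token>"})
    connector.validate_connector_settings()
    # load_from_state yields batches (lists) of Document objects
    for document_batch in connector.load_from_state():
        for document in document_batch:
            print(document.id, document.semantic_identifier)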

View File

@@ -387,162 +387,124 @@ class ConfluenceConnector(
attachment_docs: list[Document] = []
page_url = ""
try:
for attachment in self.confluence_client.paginated_cql_retrieval(
cql=attachment_query,
expand=",".join(_ATTACHMENT_EXPANSION_FIELDS),
):
media_type: str = attachment.get("metadata", {}).get("mediaType", "")
for attachment in self.confluence_client.paginated_cql_retrieval(
cql=attachment_query,
expand=",".join(_ATTACHMENT_EXPANSION_FIELDS),
):
media_type: str = attachment.get("metadata", {}).get("mediaType", "")
# TODO(rkuo): this check is partially redundant with validate_attachment_filetype
# and checks in convert_attachment_to_content/process_attachment
# but doing the check here avoids an unnecessary download. Due for refactoring.
if not self.allow_images:
if media_type.startswith("image/"):
logger.info(
f"Skipping attachment because allow images is False: {attachment['title']}"
)
continue
if not validate_attachment_filetype(
attachment,
):
# TODO(rkuo): this check is partially redundant with validate_attachment_filetype
# and checks in convert_attachment_to_content/process_attachment
# but doing the check here avoids an unnecessary download. Due for refactoring.
if not self.allow_images:
if media_type.startswith("image/"):
logger.info(
f"Skipping attachment because it is not an accepted file type: {attachment['title']}"
f"Skipping attachment because allow images is False: {attachment['title']}"
)
continue
if not validate_attachment_filetype(
attachment,
):
logger.info(
f"Processing attachment: {attachment['title']} attached to page {page['title']}"
f"Skipping attachment because it is not an accepted file type: {attachment['title']}"
)
# Attachment document id: use the download URL for stable identity
try:
object_url = build_confluence_document_id(
self.wiki_base, attachment["_links"]["download"], self.is_cloud
)
except Exception as e:
logger.warning(
f"Invalid attachment url for id {attachment['id']}, skipping"
)
logger.debug(f"Error building attachment url: {e}")
continue
try:
response = convert_attachment_to_content(
confluence_client=self.confluence_client,
attachment=attachment,
page_id=page["id"],
allow_images=self.allow_images,
)
if response is None:
continue
continue
content_text, file_storage_name = response
sections: list[TextSection | ImageSection] = []
if content_text:
sections.append(TextSection(text=content_text, link=object_url))
elif file_storage_name:
sections.append(
ImageSection(
link=object_url, image_file_id=file_storage_name
)
)
# Build attachment-specific metadata
attachment_metadata: dict[str, str | list[str]] = {}
if "space" in attachment:
attachment_metadata["space"] = attachment["space"].get(
"name", ""
)
labels: list[str] = []
if "metadata" in attachment and "labels" in attachment["metadata"]:
for label in attachment["metadata"]["labels"].get(
"results", []
):
labels.append(label.get("name", ""))
if labels:
attachment_metadata["labels"] = labels
page_url = page_url or build_confluence_document_id(
self.wiki_base, page["_links"]["webui"], self.is_cloud
)
attachment_metadata["parent_page_id"] = page_url
attachment_id = build_confluence_document_id(
self.wiki_base, attachment["_links"]["webui"], self.is_cloud
)
primary_owners: list[BasicExpertInfo] | None = None
if "version" in attachment and "by" in attachment["version"]:
author = attachment["version"]["by"]
display_name = author.get("displayName", "Unknown")
email = author.get("email", "unknown@domain.invalid")
primary_owners = [
BasicExpertInfo(display_name=display_name, email=email)
]
attachment_doc = Document(
id=attachment_id,
sections=sections,
source=DocumentSource.CONFLUENCE,
semantic_identifier=attachment.get("title", object_url),
metadata=attachment_metadata,
doc_updated_at=(
datetime_from_string(attachment["version"]["when"])
if attachment.get("version")
and attachment["version"].get("when")
else None
),
primary_owners=primary_owners,
)
attachment_docs.append(attachment_doc)
except Exception as e:
logger.error(
f"Failed to extract/summarize attachment {attachment['title']}",
exc_info=e,
)
if is_atlassian_date_error(e):
# propagate error to be caught and retried
raise
attachment_failures.append(
ConnectorFailure(
failed_document=DocumentFailure(
document_id=object_url,
document_link=object_url,
),
failure_message=f"Failed to extract/summarize attachment {attachment['title']} for doc {object_url}",
exception=e,
)
)
except HTTPError as e:
# If we get a 403 after all retries, the user likely doesn't have permission
# to access attachments on this page. Log and skip rather than failing the whole job.
if e.response and e.response.status_code == 403:
page_title = page.get("title", "unknown")
page_id = page.get("id", "unknown")
logger.info(
f"Processing attachment: {attachment['title']} attached to page {page['title']}"
)
# Attachment document id: use the download URL for stable identity
try:
object_url = build_confluence_document_id(
self.wiki_base, attachment["_links"]["download"], self.is_cloud
)
except Exception as e:
logger.warning(
f"Permission denied (403) when fetching attachments for page '{page_title}' "
f"(ID: {page_id}). The user may not have permission to query attachments on this page. "
"Skipping attachments for this page."
f"Invalid attachment url for id {attachment['id']}, skipping"
)
# Build the page URL for the failure record
try:
page_url = build_confluence_document_id(
self.wiki_base, page["_links"]["webui"], self.is_cloud
)
except Exception:
page_url = f"page_id:{page_id}"
logger.debug(f"Error building attachment url: {e}")
continue
try:
response = convert_attachment_to_content(
confluence_client=self.confluence_client,
attachment=attachment,
page_id=page["id"],
allow_images=self.allow_images,
)
if response is None:
continue
return [], [
content_text, file_storage_name = response
sections: list[TextSection | ImageSection] = []
if content_text:
sections.append(TextSection(text=content_text, link=object_url))
elif file_storage_name:
sections.append(
ImageSection(link=object_url, image_file_id=file_storage_name)
)
# Build attachment-specific metadata
attachment_metadata: dict[str, str | list[str]] = {}
if "space" in attachment:
attachment_metadata["space"] = attachment["space"].get("name", "")
labels: list[str] = []
if "metadata" in attachment and "labels" in attachment["metadata"]:
for label in attachment["metadata"]["labels"].get("results", []):
labels.append(label.get("name", ""))
if labels:
attachment_metadata["labels"] = labels
page_url = page_url or build_confluence_document_id(
self.wiki_base, page["_links"]["webui"], self.is_cloud
)
attachment_metadata["parent_page_id"] = page_url
attachment_id = build_confluence_document_id(
self.wiki_base, attachment["_links"]["webui"], self.is_cloud
)
primary_owners: list[BasicExpertInfo] | None = None
if "version" in attachment and "by" in attachment["version"]:
author = attachment["version"]["by"]
display_name = author.get("displayName", "Unknown")
email = author.get("email", "unknown@domain.invalid")
primary_owners = [
BasicExpertInfo(display_name=display_name, email=email)
]
attachment_doc = Document(
id=attachment_id,
sections=sections,
source=DocumentSource.CONFLUENCE,
semantic_identifier=attachment.get("title", object_url),
metadata=attachment_metadata,
doc_updated_at=(
datetime_from_string(attachment["version"]["when"])
if attachment.get("version")
and attachment["version"].get("when")
else None
),
primary_owners=primary_owners,
)
attachment_docs.append(attachment_doc)
except Exception as e:
logger.error(
f"Failed to extract/summarize attachment {attachment['title']}",
exc_info=e,
)
if is_atlassian_date_error(e):
# propagate error to be caught and retried
raise
attachment_failures.append(
ConnectorFailure(
failed_document=DocumentFailure(
document_id=page_id,
document_link=page_url,
document_id=object_url,
document_link=object_url,
),
failure_message=f"Permission denied (403) when fetching attachments for page '{page_title}'",
failure_message=f"Failed to extract/summarize attachment {attachment['title']} for doc {object_url}",
exception=e,
)
]
else:
raise
)
return attachment_docs, attachment_failures

View File

@@ -579,18 +579,13 @@ class OnyxConfluence:
while url_suffix:
logger.debug(f"Making confluence call to {url_suffix}")
try:
# Only pass params if they're not already in the URL to avoid duplicate
# params accumulating. Confluence's _links.next already includes these.
params = {}
if "body-format=" not in url_suffix:
params["body-format"] = "atlas_doc_format"
if "expand=" not in url_suffix:
params["expand"] = "body.atlas_doc_format"
raw_response = self.get(
path=url_suffix,
advanced_mode=True,
params=params,
params={
"body-format": "atlas_doc_format",
"expand": "body.atlas_doc_format",
},
)
except Exception as e:
logger.exception(f"Error in confluence call to {url_suffix}")

View File

@@ -1,4 +1,5 @@
import io
import random
from collections.abc import Callable
from datetime import datetime
from typing import Any
@@ -23,6 +24,7 @@ from onyx.connectors.google_utils.resources import get_drive_service
from onyx.connectors.google_utils.resources import get_google_docs_service
from onyx.connectors.google_utils.resources import GoogleDocsService
from onyx.connectors.google_utils.resources import GoogleDriveService
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentFailure
@@ -548,6 +550,11 @@ def _convert_drive_item_to_document(
doc_updated_at=datetime.fromisoformat(
file.get("modifiedTime", "").replace("Z", "+00:00")
),
primary_owners=[
BasicExpertInfo(
email=random.choice(["yuhong@onyx.app", "justin@onyx.app"])
)
],
external_access=external_access,
)
except Exception as e:

View File

@@ -26,6 +26,7 @@ from onyx.utils.logger import setup_logger
HUBSPOT_BASE_URL = "https://app.hubspot.com"
HUBSPOT_API_URL = "https://api.hubapi.com/integrations/v1/me"
# Available HubSpot object types
AVAILABLE_OBJECT_TYPES = {"tickets", "companies", "deals", "contacts"}
HUBSPOT_PAGE_SIZE = 100

View File

@@ -68,10 +68,6 @@ CONNECTOR_CLASS_MAP = {
module_path="onyx.connectors.slab.connector",
class_name="SlabConnector",
),
DocumentSource.CODA: ConnectorMapping(
module_path="onyx.connectors.coda.connector",
class_name="CodaConnector",
),
DocumentSource.NOTION: ConnectorMapping(
module_path="onyx.connectors.notion.connector",
class_name="NotionConnector",

View File

@@ -99,9 +99,7 @@ DEFAULT_HEADERS = {
"image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"
),
"Accept-Language": "en-US,en;q=0.9",
# Brotli decoding has been flaky in brotlicffi/httpx for certain chunked responses;
# stick to gzip/deflate to keep connectivity checks stable.
"Accept-Encoding": "gzip, deflate",
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Sec-Fetch-Dest": "document",

View File

@@ -20,11 +20,6 @@ class OptionalSearchSetting(str, Enum):
AUTO = "auto"
class QueryType(str, Enum):
KEYWORD = "keyword"
SEMANTIC = "semantic"
class SearchType(str, Enum):
KEYWORD = "keyword"
SEMANTIC = "semantic"

View File

@@ -6,6 +6,7 @@ from datetime import timedelta
from datetime import timezone
from typing import Any
from langchain_core.messages import HumanMessage
from pydantic import ValidationError
from onyx.configs.app_configs import MAX_SLACK_QUERY_EXPANSIONS
@@ -13,7 +14,7 @@ from onyx.context.search.federated.models import ChannelMetadata
from onyx.context.search.models import ChunkIndexRequest
from onyx.federated_connectors.slack.models import SlackEntities
from onyx.llm.interfaces import LLM
from onyx.llm.utils import llm_response_to_string
from onyx.llm.utils import message_to_string
from onyx.onyxbot.slack.models import ChannelType
from onyx.prompts.federated_search import SLACK_DATE_EXTRACTION_PROMPT
from onyx.prompts.federated_search import SLACK_QUERY_EXPANSION_PROMPT
@@ -190,7 +191,9 @@ def extract_date_range_from_query(
try:
prompt = SLACK_DATE_EXTRACTION_PROMPT.format(query=query)
response = llm_response_to_string(llm.invoke(prompt))
response = message_to_string(
llm.invoke_langchain([HumanMessage(content=prompt)])
)
response_clean = _parse_llm_code_block_response(response)
@@ -581,7 +584,9 @@ def expand_query_with_llm(query_text: str, llm: LLM) -> list[str]:
)
try:
response = llm_response_to_string(llm.invoke(prompt))
response = message_to_string(
llm.invoke_langchain([HumanMessage(content=prompt)])
)
response_clean = _parse_llm_code_block_response(response)

View File

@@ -129,6 +129,8 @@ class UserFileFilters(BaseModel):
class IndexFilters(BaseFilters, UserFileFilters):
access_control_list: list[str] | None
tenant_id: str | None = None
# Filter documents by primary owner email (for avatar queries)
primary_owner_emails: list[str] | None = None
class ChunkMetric(BaseModel):

View File

@@ -1,7 +1,7 @@
An explanation of how the history of messages, tool calls, and docs are stored in the database:
Messages are grouped by a chat session; a tree structure is used to allow edits and for the
user to switch between branches. Each ChatMessage is either a user message or an assistant message.
It should always alternate between the two. System messages, custom agent prompt injections, and
reminder messages are injected dynamically after the chat session is loaded into memory. The user
and assistant messages are stored in pairs, though it is ok if the user message is stored and the
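A minimal sketch of the branching structure described above (illustrative only; the real
ChatMessage model lives in the database models module and carries many more fields):
from dataclasses import dataclass, field

@dataclass
class MessageNode:
    message_type: str  # alternates "user" / "assistant" along any root-to-leaf path
    text: str
    parent: "MessageNode | None" = None
    children: list["MessageNode"] = field(default_factory=list)

def add_branch(parent: MessageNode, message_type: str, text: str) -> MessageNode:
    # Editing an earlier message adds a sibling under the same parent,
    # creating a new branch the user can switch to later.
    node = MessageNode(message_type, text, parent=parent)
    parent.children.append(node)
    return node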

449
backend/onyx/db/avatar.py Normal file
View File

@@ -0,0 +1,449 @@
"""
Avatar database operations.
This module provides CRUD operations for Avatar, AvatarPermissionRequest,
and AvatarQuery models.
"""
from datetime import datetime
from datetime import timedelta
from uuid import UUID
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import Session
from onyx.db.enums import AvatarPermissionRequestStatus
from onyx.db.enums import AvatarQueryMode
from onyx.db.models import Avatar
from onyx.db.models import AvatarPermissionRequest
from onyx.db.models import AvatarQuery
from onyx.db.models import User
# Default expiration for permission requests (in days)
DEFAULT_REQUEST_EXPIRY_DAYS = 7
# ============================================================================
# Avatar CRUD Operations
# ============================================================================
def create_avatar_for_user(
user_id: UUID,
db_session: Session,
name: str | None = None,
description: str | None = None,
) -> Avatar:
"""Create a new avatar for a user.
Args:
user_id: The ID of the user to create an avatar for
db_session: Database session
name: Optional display name for the avatar
description: Optional description for the avatar
Returns:
The created Avatar instance
"""
avatar = Avatar(
user_id=user_id,
name=name,
description=description,
is_enabled=True,
default_query_mode=AvatarQueryMode.OWNED_DOCUMENTS,
allow_accessible_mode=True,
show_query_in_request=True,
max_requests_per_day=100,
)
db_session.add(avatar)
db_session.flush()
return avatar
async def create_avatar_for_user_async(
user_id: UUID,
db_session: AsyncSession,
name: str | None = None,
description: str | None = None,
) -> Avatar:
"""Create a new avatar for a user (async version).
Args:
user_id: The ID of the user to create an avatar for
db_session: Async database session
name: Optional display name for the avatar
description: Optional description for the avatar
Returns:
The created Avatar instance
"""
avatar = Avatar(
user_id=user_id,
name=name,
description=description,
is_enabled=True,
default_query_mode=AvatarQueryMode.OWNED_DOCUMENTS,
allow_accessible_mode=True,
show_query_in_request=True,
max_requests_per_day=100,
)
db_session.add(avatar)
await db_session.flush()
return avatar
def get_avatar_by_id(avatar_id: int, db_session: Session) -> Avatar | None:
"""Get an avatar by its ID."""
return db_session.query(Avatar).filter(Avatar.id == avatar_id).first()
def get_avatar_by_user_id(user_id: UUID, db_session: Session) -> Avatar | None:
"""Get an avatar by its user ID."""
return db_session.query(Avatar).filter(Avatar.user_id == user_id).first()
def get_all_enabled_avatars(
db_session: Session,
exclude_user_id: UUID | None = None,
) -> list[Avatar]:
"""Get all enabled avatars, optionally excluding a specific user's avatar."""
    query = db_session.query(Avatar).filter(Avatar.is_enabled.is_(True))
if exclude_user_id:
query = query.filter(Avatar.user_id != exclude_user_id)
return query.all()
def update_avatar(
avatar_id: int,
db_session: Session,
name: str | None = None,
description: str | None = None,
is_enabled: bool | None = None,
default_query_mode: AvatarQueryMode | None = None,
allow_accessible_mode: bool | None = None,
auto_approve_rules: dict | None = None,
show_query_in_request: bool | None = None,
max_requests_per_day: int | None = None,
) -> Avatar | None:
"""Update an avatar's settings.
Only non-None values will be updated.
"""
avatar = get_avatar_by_id(avatar_id, db_session)
if not avatar:
return None
if name is not None:
avatar.name = name
if description is not None:
avatar.description = description
if is_enabled is not None:
avatar.is_enabled = is_enabled
if default_query_mode is not None:
avatar.default_query_mode = default_query_mode
if allow_accessible_mode is not None:
avatar.allow_accessible_mode = allow_accessible_mode
if auto_approve_rules is not None:
avatar.auto_approve_rules = auto_approve_rules
if show_query_in_request is not None:
avatar.show_query_in_request = show_query_in_request
if max_requests_per_day is not None:
avatar.max_requests_per_day = max_requests_per_day
db_session.flush()
return avatar
def delete_avatar(avatar_id: int, db_session: Session) -> bool:
"""Delete an avatar by ID."""
avatar = get_avatar_by_id(avatar_id, db_session)
if not avatar:
return False
db_session.delete(avatar)
db_session.flush()
return True
# ============================================================================
# Avatar Permission Request Operations
# ============================================================================
def create_permission_request(
avatar_id: int,
requester_id: UUID,
query_text: str | None,
db_session: Session,
chat_session_id: UUID | None = None,
chat_message_id: int | None = None,
cached_answer: str | None = None,
cached_search_doc_ids: list[int] | None = None,
answer_quality_score: float | None = None,
expires_in_days: int = DEFAULT_REQUEST_EXPIRY_DAYS,
status: AvatarPermissionRequestStatus = AvatarPermissionRequestStatus.PENDING,
task_id: str | None = None,
) -> AvatarPermissionRequest:
"""Create a new permission request.
Args:
avatar_id: The avatar being queried
requester_id: The user making the request
query_text: The query text (may be hidden per privacy settings)
db_session: Database session
chat_session_id: Optional chat session for context
chat_message_id: Optional chat message for context
cached_answer: Pre-computed answer (for sync queries)
cached_search_doc_ids: Document IDs from the search
answer_quality_score: Quality score of the answer
expires_in_days: How long before the request expires
status: Initial status (PENDING for sync, PROCESSING for async)
task_id: Celery task ID for async processing
"""
request = AvatarPermissionRequest(
avatar_id=avatar_id,
requester_id=requester_id,
query_text=query_text,
chat_session_id=chat_session_id,
chat_message_id=chat_message_id,
cached_answer=cached_answer,
cached_search_doc_ids=cached_search_doc_ids,
answer_quality_score=answer_quality_score,
status=status,
task_id=task_id,
expires_at=datetime.utcnow() + timedelta(days=expires_in_days),
)
db_session.add(request)
db_session.flush()
return request
def update_permission_request_task_id(
request_id: int,
task_id: str,
db_session: Session,
) -> AvatarPermissionRequest | None:
"""Update the task_id for a permission request after queuing."""
request = get_permission_request_by_id(request_id, db_session)
if not request:
return None
request.task_id = task_id
db_session.flush()
return request
def get_permission_request_by_id(
request_id: int, db_session: Session
) -> AvatarPermissionRequest | None:
"""Get a permission request by ID."""
return (
db_session.query(AvatarPermissionRequest)
.filter(AvatarPermissionRequest.id == request_id)
.first()
)
def get_pending_requests_for_avatar_owner(
user_id: UUID, db_session: Session
) -> list[AvatarPermissionRequest]:
"""Get all pending permission requests for a user's avatar."""
return (
db_session.query(AvatarPermissionRequest)
.join(Avatar, AvatarPermissionRequest.avatar_id == Avatar.id)
.filter(
Avatar.user_id == user_id,
AvatarPermissionRequest.status == AvatarPermissionRequestStatus.PENDING,
AvatarPermissionRequest.expires_at > datetime.utcnow(),
)
.order_by(AvatarPermissionRequest.created_at.desc())
.all()
)
def get_permission_requests_by_requester(
requester_id: UUID,
db_session: Session,
status: AvatarPermissionRequestStatus | None = None,
) -> list[AvatarPermissionRequest]:
"""Get all permission requests made by a user."""
query = db_session.query(AvatarPermissionRequest).filter(
AvatarPermissionRequest.requester_id == requester_id
)
if status:
query = query.filter(AvatarPermissionRequest.status == status)
return query.order_by(AvatarPermissionRequest.created_at.desc()).all()
def get_permission_requests_by_chat_session(
chat_session_id: UUID,
requester_id: UUID,
db_session: Session,
) -> list[AvatarPermissionRequest]:
"""Get all permission requests for a specific chat session.
Only returns requests made by the specified requester for security.
Returns all statuses so the UI can show pending, approved, and denied requests.
"""
return (
db_session.query(AvatarPermissionRequest)
.filter(
AvatarPermissionRequest.chat_session_id == chat_session_id,
AvatarPermissionRequest.requester_id == requester_id,
)
.order_by(AvatarPermissionRequest.created_at.desc())
.all()
)
def approve_permission_request(
request_id: int, db_session: Session
) -> AvatarPermissionRequest | None:
"""Approve a permission request."""
request = get_permission_request_by_id(request_id, db_session)
if not request or request.status != AvatarPermissionRequestStatus.PENDING:
return None
request.status = AvatarPermissionRequestStatus.APPROVED
request.resolved_at = datetime.utcnow()
db_session.flush()
return request
def deny_permission_request(
request_id: int,
db_session: Session,
denial_reason: str | None = None,
) -> AvatarPermissionRequest | None:
"""Deny a permission request."""
request = get_permission_request_by_id(request_id, db_session)
if not request or request.status != AvatarPermissionRequestStatus.PENDING:
return None
request.status = AvatarPermissionRequestStatus.DENIED
request.denial_reason = denial_reason
request.resolved_at = datetime.utcnow()
db_session.flush()
return request
def expire_old_permission_requests(db_session: Session) -> int:
"""Mark all expired permission requests as expired.
Returns the number of requests that were expired.
"""
expired_count = (
db_session.query(AvatarPermissionRequest)
.filter(
AvatarPermissionRequest.status == AvatarPermissionRequestStatus.PENDING,
AvatarPermissionRequest.expires_at <= datetime.utcnow(),
)
.update(
{
AvatarPermissionRequest.status: AvatarPermissionRequestStatus.EXPIRED,
AvatarPermissionRequest.resolved_at: datetime.utcnow(),
}
)
)
db_session.flush()
return expired_count
# ============================================================================
# Avatar Query Operations (Rate Limiting & Analytics)
# ============================================================================
def log_avatar_query(
avatar_id: int,
requester_id: UUID,
query_mode: AvatarQueryMode,
query_text: str,
db_session: Session,
) -> AvatarQuery:
"""Log an avatar query for rate limiting and analytics."""
query = AvatarQuery(
avatar_id=avatar_id,
requester_id=requester_id,
query_mode=query_mode,
query_text=query_text,
)
db_session.add(query)
db_session.flush()
return query
def get_avatar_query_count_today(
avatar_id: int,
requester_id: UUID,
db_session: Session,
) -> int:
"""Get the number of queries made to an avatar by a user today."""
today_start = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
return (
db_session.query(AvatarQuery)
.filter(
AvatarQuery.avatar_id == avatar_id,
AvatarQuery.requester_id == requester_id,
AvatarQuery.created_at >= today_start,
)
.count()
)
def check_rate_limit(
avatar_id: int,
requester_id: UUID,
db_session: Session,
) -> bool:
"""Check if a requester has exceeded the rate limit for an avatar.
Returns True if the request is allowed, False if rate limited.
"""
avatar = get_avatar_by_id(avatar_id, db_session)
if not avatar or not avatar.max_requests_per_day:
return True
query_count = get_avatar_query_count_today(avatar_id, requester_id, db_session)
return query_count < avatar.max_requests_per_day
# ============================================================================
# Auto-Approval Logic
# ============================================================================
def should_auto_approve(
avatar: Avatar,
requester: User,
) -> bool:
"""Check if a request should be auto-approved based on avatar's rules.
Auto-approve rules format:
{
"user_ids": ["uuid1", "uuid2"],
"group_ids": ["group1", "group2"],
"all_users": false
}
"""
if not avatar.auto_approve_rules:
return False
rules = avatar.auto_approve_rules
# Check if all users are auto-approved
if rules.get("all_users", False):
return True
# Check if requester is in the user whitelist
user_ids = rules.get("user_ids", [])
if str(requester.id) in user_ids:
return True
# Check if requester is in any of the whitelisted groups
# Note: This would need integration with the UserGroup system
# group_ids = rules.get("group_ids", [])
# if group_ids:
# # TODO: Check if user is member of any whitelisted group
# pass
return False
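# A minimal sketch tying the operations above together (illustrative only;
# assumes the caller already has a SQLAlchemy Session and two persisted users):
def _demo_avatar_flow(owner: User, requester: User, db_session: Session) -> None:
    avatar = create_avatar_for_user(owner.id, db_session, name="Owner's avatar")
    # Whitelist the requester so accessible-mode queries can skip manual approval
    avatar.auto_approve_rules = {"all_users": False, "user_ids": [str(requester.id)]}
    if not check_rate_limit(avatar.id, requester.id, db_session):
        return  # requester already hit max_requests_per_day
    log_avatar_query(
        avatar_id=avatar.id,
        requester_id=requester.id,
        query_mode=AvatarQueryMode.ACCESSIBLE_DOCUMENTS,
        query_text="What is the Q3 roadmap?",
        db_session=db_session,
    )
    request = create_permission_request(
        avatar_id=avatar.id,
        requester_id=requester.id,
        query_text="What is the Q3 roadmap?",
        db_session=db_session,
    )
    if should_auto_approve(avatar, requester):
        approve_permission_request(request.id, db_session)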

View File

@@ -194,3 +194,21 @@ class SwitchoverType(str, PyEnum):
REINDEX = "reindex"
ACTIVE_ONLY = "active_only"
INSTANT = "instant"
class AvatarQueryMode(str, PyEnum):
"""Mode for querying an avatar's knowledge."""
OWNED_DOCUMENTS = "owned_documents" # Query only docs where user is primary_owner
ACCESSIBLE_DOCUMENTS = "accessible_documents" # Query all docs user can access
class AvatarPermissionRequestStatus(str, PyEnum):
"""Status of an avatar permission request."""
PENDING = "pending" # Awaiting owner approval (accessible mode)
PROCESSING = "processing" # Query is being executed in background
APPROVED = "approved"
DENIED = "denied"
EXPIRED = "expired"
NO_ANSWER = "no_answer" # Query ran but found nothing useful

View File

@@ -54,6 +54,8 @@ from onyx.configs.constants import FileOrigin
from onyx.configs.constants import MessageType
from onyx.db.enums import (
AccessType,
AvatarPermissionRequestStatus,
AvatarQueryMode,
EmbeddingPrecision,
IndexingMode,
SyncType,
@@ -256,6 +258,13 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
back_populates="user",
cascade="all, delete-orphan",
)
# User's queryable avatar (1:1 relationship)
avatar: Mapped["Avatar | None"] = relationship(
"Avatar",
back_populates="user",
uselist=False,
cascade="all, delete-orphan",
)
@validates("email")
def validate_email(self, key: str, value: str) -> str:
@@ -2141,8 +2150,6 @@ class ChatMessage(Base):
time_sent: Mapped[datetime.datetime] = mapped_column(
DateTime(timezone=True), server_default=func.now()
)
# True if this assistant message is a clarification question (deep research flow)
is_clarification: Mapped[bool] = mapped_column(Boolean, default=False)
# Relationships
chat_session: Mapped[ChatSession] = relationship("ChatSession")
@@ -3913,3 +3920,190 @@ class ExternalGroupPermissionSyncAttempt(Base):
def is_finished(self) -> bool:
return self.status.is_terminal()
"""
Avatar Models
Avatars are queryable mirrors of individual users within Onyx.
"""
class Avatar(Base):
"""User's queryable knowledge avatar - mirrors their document ownership/access."""
__tablename__ = "avatar"
id: Mapped[int] = mapped_column(Integer, primary_key=True)
user_id: Mapped[UUID] = mapped_column(
ForeignKey("user.id", ondelete="CASCADE"), nullable=False, unique=True
)
# Display settings
name: Mapped[str | None] = mapped_column(String, nullable=True)
description: Mapped[str | None] = mapped_column(String, nullable=True)
is_enabled: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)
# Query mode settings
default_query_mode: Mapped[AvatarQueryMode] = mapped_column(
Enum(AvatarQueryMode, native_enum=False),
default=AvatarQueryMode.OWNED_DOCUMENTS,
nullable=False,
)
allow_accessible_mode: Mapped[bool] = mapped_column(
Boolean, default=True, nullable=False
)
# Auto-approval rules: {"user_ids": [...], "group_ids": [...], "all_users": false}
auto_approve_rules: Mapped[dict | None] = mapped_column(
postgresql.JSONB(), nullable=True
)
# Privacy settings
show_query_in_request: Mapped[bool] = mapped_column(
Boolean, default=True, nullable=False
)
# Rate limiting
max_requests_per_day: Mapped[int | None] = mapped_column(
Integer, nullable=True, default=100
)
# Timestamps
created_at: Mapped[datetime.datetime] = mapped_column(
DateTime(timezone=True), server_default=func.now(), nullable=False
)
updated_at: Mapped[datetime.datetime] = mapped_column(
DateTime(timezone=True),
server_default=func.now(),
onupdate=func.now(),
nullable=False,
)
# Relationships
user: Mapped["User"] = relationship("User", back_populates="avatar")
permission_requests: Mapped[list["AvatarPermissionRequest"]] = relationship(
"AvatarPermissionRequest",
back_populates="avatar",
cascade="all, delete-orphan",
)
queries: Mapped[list["AvatarQuery"]] = relationship(
"AvatarQuery",
back_populates="avatar",
cascade="all, delete-orphan",
)
class AvatarPermissionRequest(Base):
"""Tracks permission requests for accessible-mode avatar queries."""
__tablename__ = "avatar_permission_request"
id: Mapped[int] = mapped_column(Integer, primary_key=True)
# The avatar being queried
avatar_id: Mapped[int] = mapped_column(
ForeignKey("avatar.id", ondelete="CASCADE"), nullable=False, index=True
)
# Who is requesting
requester_id: Mapped[UUID] = mapped_column(
ForeignKey("user.id", ondelete="CASCADE"), nullable=False, index=True
)
# The query context
query_text: Mapped[str | None] = mapped_column(
Text, nullable=True
) # May be hidden per privacy settings
chat_session_id: Mapped[UUID | None] = mapped_column(
ForeignKey("chat_session.id", ondelete="SET NULL"), nullable=True
)
chat_message_id: Mapped[int | None] = mapped_column(
ForeignKey("chat_message.id", ondelete="SET NULL"), nullable=True
)
# Cached answer (stored until approval/denial)
cached_answer: Mapped[str | None] = mapped_column(Text, nullable=True)
cached_search_doc_ids: Mapped[list[int] | None] = mapped_column(
postgresql.JSONB(), nullable=True
)
answer_quality_score: Mapped[float | None] = mapped_column(Float, nullable=True)
# Status
status: Mapped[AvatarPermissionRequestStatus] = mapped_column(
Enum(AvatarPermissionRequestStatus, native_enum=False),
default=AvatarPermissionRequestStatus.PENDING,
nullable=False,
index=True,
)
# Background task tracking (for PROCESSING status)
task_id: Mapped[str | None] = mapped_column(String, nullable=True, index=True)
# Response from avatar owner
denial_reason: Mapped[str | None] = mapped_column(String, nullable=True)
# Timestamps
created_at: Mapped[datetime.datetime] = mapped_column(
DateTime(timezone=True), server_default=func.now(), nullable=False
)
expires_at: Mapped[datetime.datetime] = mapped_column(
DateTime(timezone=True), nullable=False
)
resolved_at: Mapped[datetime.datetime | None] = mapped_column(
DateTime(timezone=True), nullable=True
)
# Relationships
avatar: Mapped["Avatar"] = relationship(
"Avatar", back_populates="permission_requests"
)
requester: Mapped["User"] = relationship("User", foreign_keys=[requester_id])
chat_session: Mapped["ChatSession | None"] = relationship("ChatSession")
__table_args__ = (
Index(
"ix_avatar_permission_request_avatar_status",
"avatar_id",
"status",
),
Index(
"ix_avatar_permission_request_requester_created",
"requester_id",
"created_at",
),
)
class AvatarQuery(Base):
"""Tracks avatar queries for rate limiting and analytics."""
__tablename__ = "avatar_query"
id: Mapped[int] = mapped_column(Integer, primary_key=True)
avatar_id: Mapped[int] = mapped_column(
ForeignKey("avatar.id", ondelete="CASCADE"), nullable=False, index=True
)
requester_id: Mapped[UUID] = mapped_column(
ForeignKey("user.id", ondelete="CASCADE"), nullable=False, index=True
)
query_mode: Mapped[AvatarQueryMode] = mapped_column(
Enum(AvatarQueryMode, native_enum=False), nullable=False
)
query_text: Mapped[str] = mapped_column(Text, nullable=False)
created_at: Mapped[datetime.datetime] = mapped_column(
DateTime(timezone=True), server_default=func.now(), nullable=False
)
# Relationships
avatar: Mapped["Avatar"] = relationship("Avatar", back_populates="queries")
requester: Mapped["User"] = relationship("User", foreign_keys=[requester_id])
# Index for rate limiting queries
__table_args__ = (
Index(
"ix_avatar_query_rate_limit",
"avatar_id",
"requester_id",
"created_at",
),
)
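# A minimal sketch of how OWNED_DOCUMENTS mode could feed the new
# IndexFilters.primary_owner_emails field added in this change; the helper
# name and import path below are hypothetical, not part of this diff:
def _avatar_index_filters(avatar: Avatar) -> "IndexFilters":
    from onyx.context.search.models import IndexFilters  # assumed location

    if avatar.default_query_mode == AvatarQueryMode.OWNED_DOCUMENTS:
        # Restrict retrieval to documents the avatar's owner is a primary owner of
        return IndexFilters(
            access_control_list=None,
            primary_owner_emails=[avatar.user.email],
        )
    # ACCESSIBLE_DOCUMENTS: defer to the normal ACL-based filtering
    return IndexFilters(access_control_list=None)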

View File

@@ -416,9 +416,6 @@ def get_persona_snapshots_for_user(
selectinload(Persona.labels),
selectinload(Persona.document_sets),
selectinload(Persona.user),
selectinload(Persona.user_files),
selectinload(Persona.users),
selectinload(Persona.groups),
)
results = db_session.scalars(stmt).all()

View File

@@ -1,47 +1,16 @@
# TODO: Notes for potential extensions and future improvements:
# 1. Allow tools that aren't search specific tools
# 2. Use user provided custom prompts
from collections.abc import Callable
from typing import cast
from sqlalchemy.orm import Session
from onyx.chat.chat_state import ChatStateContainer
from onyx.chat.citation_processor import DynamicCitationProcessor
from onyx.chat.emitter import Emitter
from onyx.chat.llm_loop import construct_message_history
from onyx.chat.llm_step import run_llm_step
from onyx.chat.models import ChatMessageSimple
from onyx.chat.models import LlmStepResult
from onyx.configs.constants import MessageType
from onyx.deep_research.dr_mock_tools import get_clarification_tool_definitions
from onyx.llm.interfaces import LLM
from onyx.llm.interfaces import LLMUserIdentity
from onyx.llm.models import ToolChoiceOptions
from onyx.llm.utils import model_is_reasoning_model
from onyx.prompts.deep_research.orchestration_layer import CLARIFICATION_PROMPT
from onyx.prompts.deep_research.orchestration_layer import ORCHESTRATOR_PROMPT
from onyx.prompts.deep_research.orchestration_layer import ORCHESTRATOR_PROMPT_REASONING
from onyx.prompts.deep_research.orchestration_layer import RESEARCH_PLAN_PROMPT
from onyx.prompts.prompt_utils import get_current_llm_day_time
from onyx.server.query_and_chat.streaming_models import AgentResponseDelta
from onyx.server.query_and_chat.streaming_models import AgentResponseStart
from onyx.server.query_and_chat.streaming_models import DeepResearchPlanDelta
from onyx.server.query_and_chat.streaming_models import DeepResearchPlanStart
from onyx.server.query_and_chat.streaming_models import OverallStop
from onyx.server.query_and_chat.streaming_models import Packet
from onyx.tools.tool import Tool
from onyx.tools.tool_implementations.open_url.open_url_tool import OpenURLTool
from onyx.tools.tool_implementations.search.search_tool import SearchTool
from onyx.tools.tool_implementations.web_search.web_search_tool import WebSearchTool
from onyx.utils.logger import setup_logger
logger = setup_logger()
MAX_USER_MESSAGES_FOR_CONTEXT = 5
MAX_ORCHESTRATOR_CYCLES = 8
def run_deep_research_llm_loop(
emitter: Emitter,
@@ -52,203 +21,8 @@ def run_deep_research_llm_loop(
llm: LLM,
token_counter: Callable[[str], int],
db_session: Session,
skip_clarification: bool = False,
user_identity: LLMUserIdentity | None = None,
) -> None:
# Here for lazy load LiteLLM
from onyx.llm.litellm_singleton.config import initialize_litellm
# An approximate limit. In extreme cases it may still fail but this should allow deep research
# to work in most cases.
if llm.config.max_input_tokens < 25000:
raise RuntimeError(
"Cannot run Deep Research with an LLM that has less than 25,000 max input tokens"
)
initialize_litellm()
available_tokens = llm.config.max_input_tokens
llm_step_result: LlmStepResult | None = None
# Filter tools to only allow web search, internal search, and open URL
allowed_tool_names = {SearchTool.NAME, WebSearchTool.NAME, OpenURLTool.NAME}
[tool for tool in tools if tool.name in allowed_tool_names]
#########################################################
# CLARIFICATION STEP (optional)
#########################################################
if not skip_clarification:
clarification_prompt = CLARIFICATION_PROMPT.format(
current_datetime=get_current_llm_day_time(full_sentence=False)
)
system_prompt = ChatMessageSimple(
message=clarification_prompt,
token_count=300, # Skips the exact token count but has enough leeway
message_type=MessageType.SYSTEM,
)
truncated_message_history = construct_message_history(
system_prompt=system_prompt,
custom_agent_prompt=None,
simple_chat_history=simple_chat_history,
reminder_message=None,
project_files=None,
available_tokens=available_tokens,
last_n_user_messages=MAX_USER_MESSAGES_FOR_CONTEXT,
)
step_generator = run_llm_step(
history=truncated_message_history,
tool_definitions=get_clarification_tool_definitions(),
tool_choice=ToolChoiceOptions.AUTO,
llm=llm,
turn_index=0,
# No citations in this step, it should just pass through all
# tokens directly so initialized as an empty citation processor
citation_processor=DynamicCitationProcessor(),
state_container=state_container,
final_documents=None,
user_identity=user_identity,
)
# Consume the generator, emitting packets and capturing the final result
while True:
try:
packet = next(step_generator)
emitter.emit(packet)
except StopIteration as e:
llm_step_result, _ = e.value
break
# Type narrowing: generator always returns a result, so this can't be None
llm_step_result = cast(LlmStepResult, llm_step_result)
if not llm_step_result.tool_calls:
# Mark this turn as a clarification question
state_container.set_is_clarification(True)
emitter.emit(Packet(turn_index=0, obj=OverallStop(type="stop")))
# If a clarification is asked, we need to end this turn and wait on user input
return
#########################################################
# RESEARCH PLAN STEP
#########################################################
system_prompt = ChatMessageSimple(
message=RESEARCH_PLAN_PROMPT.format(
current_datetime=get_current_llm_day_time(full_sentence=False)
),
token_count=300,
message_type=MessageType.SYSTEM,
)
truncated_message_history = construct_message_history(
system_prompt=system_prompt,
custom_agent_prompt=None,
simple_chat_history=simple_chat_history,
reminder_message=None,
project_files=None,
available_tokens=available_tokens,
last_n_user_messages=MAX_USER_MESSAGES_FOR_CONTEXT,
)
research_plan_generator = run_llm_step(
history=truncated_message_history,
tool_definitions=[],
tool_choice=ToolChoiceOptions.NONE,
llm=llm,
turn_index=0,
# No citations in this step, it should just pass through all
# tokens directly so initialized as an empty citation processor
citation_processor=DynamicCitationProcessor(),
state_container=state_container,
final_documents=None,
user_identity=user_identity,
)
while True:
try:
packet = next(research_plan_generator)
# Translate AgentResponseStart/Delta packets to DeepResearchPlanStart/Delta
if isinstance(packet.obj, AgentResponseStart):
emitter.emit(
Packet(
turn_index=packet.turn_index,
obj=DeepResearchPlanStart(),
)
)
elif isinstance(packet.obj, AgentResponseDelta):
emitter.emit(
Packet(
turn_index=packet.turn_index,
obj=DeepResearchPlanDelta(content=packet.obj.content),
)
)
else:
# Pass through other packet types (e.g., ReasoningStart, ReasoningDelta, etc.)
emitter.emit(packet)
except StopIteration as e:
llm_step_result, _ = e.value
break
llm_step_result = cast(LlmStepResult, llm_step_result)
research_plan = llm_step_result.answer
#########################################################
# RESEARCH EXECUTION STEP
#########################################################
is_reasoning_model = model_is_reasoning_model(
llm.config.model_name, llm.config.model_provider
)
orchestrator_prompt_template = (
ORCHESTRATOR_PROMPT if not is_reasoning_model else ORCHESTRATOR_PROMPT_REASONING
)
token_count_prompt = orchestrator_prompt_template.format(
current_datetime=get_current_llm_day_time(full_sentence=False),
current_cycle_count=1,
max_cycles=MAX_ORCHESTRATOR_CYCLES,
research_plan=research_plan,
)
orchestration_tokens = token_counter(token_count_prompt)
for cycle in range(MAX_ORCHESTRATOR_CYCLES):
orchestrator_prompt = orchestrator_prompt_template.format(
current_datetime=get_current_llm_day_time(full_sentence=False),
current_cycle_count=cycle,
max_cycles=MAX_ORCHESTRATOR_CYCLES,
research_plan=research_plan,
)
system_prompt = ChatMessageSimple(
message=orchestrator_prompt,
token_count=orchestration_tokens,
message_type=MessageType.SYSTEM,
)
truncated_message_history = construct_message_history(
system_prompt=system_prompt,
custom_agent_prompt=None,
simple_chat_history=simple_chat_history,
reminder_message=None,
project_files=None,
available_tokens=available_tokens,
last_n_user_messages=MAX_USER_MESSAGES_FOR_CONTEXT,
)
research_plan_generator = run_llm_step(
history=truncated_message_history,
tool_definitions=[],
tool_choice=ToolChoiceOptions.AUTO,
llm=llm,
turn_index=cycle,
# No citations in this step, it should just pass through all
# tokens directly so initialized as an empty citation processor
citation_processor=DynamicCitationProcessor(),
state_container=state_container,
final_documents=None,
user_identity=user_identity,
)

View File

@@ -1,18 +0,0 @@
GENERATE_PLAN_TOOL_NAME = "generate_plan"
def get_clarification_tool_definitions() -> list[dict]:
return [
{
"type": "function",
"function": {
"name": GENERATE_PLAN_TOOL_NAME,
"description": "No clarification needed, generate a research plan for the user's query.",
"parameters": {
"type": "object",
"properties": {},
"required": [],
},
},
}
]

View File

@@ -1,325 +0,0 @@
import abc
from collections.abc import Iterator
from typing import Any
from pydantic import BaseModel
from onyx.access.models import DocumentAccess
from onyx.context.search.enums import QueryType
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceChunk
from onyx.db.enums import EmbeddingPrecision
from onyx.indexing.models import DocMetadataAwareIndexChunk
from shared_configs.model_server_models import Embedding
# NOTE: "Document" in the naming convention is used to refer to the entire document as represented in Onyx.
# What is actually stored in the index is the document chunks. By the terminology of most search engines / vector
# databases, the individual objects stored are called documents, but in this case it refers to a chunk.
# Outside of searching and update capabilities, the document index must also implement the ability to port all of
# the documents over to a secondary index. This allows for embedding models to be updated and for porting documents
# to happen in the background while the primary index still serves the main traffic.
__all__ = [
# Main interfaces - these are what you should inherit from
"DocumentIndex",
# Data models - used in method signatures
"DocumentInsertionRecord",
"DocumentSectionRequest",
"IndexingMetadata",
"MetadataUpdateRequest",
# Capability mixins - for custom compositions or type checking
"SchemaVerifiable",
"Indexable",
"Deletable",
"Updatable",
"IdRetrievalCapable",
"HybridCapable",
"RandomCapable",
]
class DocumentInsertionRecord(BaseModel):
"""
Result of indexing a document
"""
model_config = {"frozen": True}
document_id: str
already_existed: bool
class DocumentSectionRequest(BaseModel):
"""
Request for a document section or whole document
If no min_chunk_ind is provided it should start at the beginning of the document
If no max_chunk_ind is provided it should go to the end of the document
"""
model_config = {"frozen": True}
document_id: str
min_chunk_ind: int | None = None
max_chunk_ind: int | None = None
class IndexingMetadata(BaseModel):
"""
Information about chunk counts for efficient cleaning / updating of document chunks. A common pattern to ensure
that no chunks are left over is to delete all of the chunks for a document and then re-index the document. This
information allows us to only delete the extra "tail" chunks when the document has gotten shorter.
"""
# The tuple is (old_chunk_cnt, new_chunk_cnt)
doc_id_to_chunk_cnt_diff: dict[str, tuple[int, int]]
class MetadataUpdateRequest(BaseModel):
"""
Updates to the documents that can happen without there being an update to the contents of the document.
"""
document_ids: list[str]
# Passed in to help with potential optimizations of the implementation
doc_id_to_chunk_cnt: dict[str, int]
# For the ones that are None, there is no update required to that field
access: DocumentAccess | None = None
document_sets: set[str] | None = None
boost: float | None = None
hidden: bool | None = None
secondary_index_updated: bool | None = None
project_ids: set[int] | None = None
class SchemaVerifiable(abc.ABC):
"""
Class must implement document index schema verification. For example, verify that all of the
necessary attributes for indexing, querying, filtering, and fields to return from search are
all valid in the schema.
"""
def __init__(
self,
index_name: str,
tenant_id: int | None,
*args: Any,
**kwargs: Any,
) -> None:
super().__init__(*args, **kwargs)
self.index_name = index_name
self.tenant_id = tenant_id
@abc.abstractmethod
def verify_and_create_index_if_necessary(
self,
embedding_dim: int,
embedding_precision: EmbeddingPrecision,
) -> None:
"""
Verify that the document index exists and is consistent with the expectations in the code. For certain search
engines, the schema needs to be created before indexing can happen. This call should create the schema if it
does not exist.
Parameters:
- embedding_dim: Vector dimensionality for the vector similarity part of the search
- embedding_precision: Precision of the vector similarity part of the search
"""
raise NotImplementedError
class Indexable(abc.ABC):
"""
Class must implement the ability to index document chunks
"""
@abc.abstractmethod
def index(
self,
chunks: Iterator[DocMetadataAwareIndexChunk],
indexing_metadata: IndexingMetadata,
) -> set[DocumentInsertionRecord]:
"""
        Takes an iterator of document chunks and indexes them in the document index. This is often a batch operation
including chunks from multiple documents.
NOTE: When a document is reindexed/updated here and has gotten shorter, it is important to delete the extra
chunks at the end to ensure there are no stale chunks in the index.
NOTE: The chunks of a document are never separated into separate index() calls. So there is
no worry of receiving the first 0 through n chunks in one index call and the next n through
m chunks of a document in the next index call.
Parameters:
- chunks: Document chunks with all of the information needed for indexing to the document index.
- indexing_metadata: Information about chunk counts for efficient cleaning / updating
Returns:
            Set of insertion records, one per unique document id (used for deduping chunks
            when updating), each indicating whether the document was newly indexed or
            already existed and was just updated
"""
raise NotImplementedError
class Deletable(abc.ABC):
"""
    Class must implement the ability to delete a document by its unique document id. Note that the document id is the
unique identifier for the document as represented in Onyx, not in the document index.
"""
@abc.abstractmethod
def delete(
self,
db_doc_id: str,
*,
# Passed in in case it helps the efficiency of the delete implementation
chunk_count: int | None,
) -> int:
"""
Given a single document, hard delete all of the chunks for the document from the document index
Parameters:
        - db_doc_id: document id as represented in Onyx
- chunk_count: number of chunks in the document
Returns:
number of chunks deleted
"""
raise NotImplementedError
class Updatable(abc.ABC):
"""
Class must implement the ability to update certain attributes of a document without needing to
update all of the fields. Specifically, needs to be able to update:
- Access Control List
- Document-set membership
- Boost value (learning from feedback mechanism)
    - Whether the document is hidden or not; hidden documents are not returned from search
- Which Projects the document is a part of
"""
@abc.abstractmethod
def update(self, update_requests: list[MetadataUpdateRequest]) -> None:
"""
Updates some set of chunks. The document and fields to update are specified in the update
requests. Each update request in the list applies its changes to a list of document ids.
None values mean that the field does not need an update.
Parameters:
        - update_requests: each request applies the same updates to all of the documents
            with the listed ids. This batching exists for efficiency, since many updates
            happen at the connector level and a single connector can cover many documents
"""
raise NotImplementedError
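# Illustrative sketch (hypothetical ids and values, not part of the interface):
# a bulk permissions-style update that hides two documents and swaps their
# document-set membership in one request. Fields left as None are not touched.
def _example_bulk_hide(document_index: "Updatable") -> None:
    request = MetadataUpdateRequest(
        document_ids=["doc-1", "doc-2"],
        doc_id_to_chunk_cnt={"doc-1": 12, "doc-2": 3},
        document_sets={"engineering"},
        hidden=True,
    )
    document_index.update([request])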
class IdRetrievalCapable(abc.ABC):
"""
Class must implement the ability to retrieve either:
    - All of the chunks of a document, IN ORDER, given a document id. The caller relies on this ordering.
- A specific section (continuous set of chunks) for some document.
"""
@abc.abstractmethod
def id_based_retrieval(
self,
chunk_requests: list[DocumentSectionRequest],
) -> list[InferenceChunk]:
"""
Fetch chunk(s) based on document id
NOTE: This is used to reconstruct a full document or an extended (multi-chunk) section
of a document. Downstream currently assumes that the chunking does not introduce overlaps
between the chunks. If there are overlaps for the chunks, then the reconstructed document
or extended section will have duplicate segments.
NOTE: This should be used after a search call to get more context around returned chunks.
        There are no filters here since the calling code should not be calling this on arbitrary
documents.
Parameters:
- chunk_requests: requests containing the document id and the chunk range to retrieve
Returns:
list of sections from the documents specified
"""
raise NotImplementedError
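# Illustrative sketch (hypothetical document id, not part of the interface):
# fetch an entire document plus a specific five-chunk section in one call.
def _example_reconstruct_document(index: "IdRetrievalCapable") -> list[InferenceChunk]:
    requests = [
        DocumentSectionRequest(document_id="doc-1"),
        DocumentSectionRequest(document_id="doc-1", min_chunk_ind=5, max_chunk_ind=9),
    ]
    return index.id_based_retrieval(requests)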
class HybridCapable(abc.ABC):
"""
Class must implement hybrid (keyword + vector) search functionality
"""
@abc.abstractmethod
def hybrid_retrieval(
self,
query: str,
query_embedding: Embedding,
final_keywords: list[str] | None,
query_type: QueryType,
filters: IndexFilters,
num_to_retrieve: int,
offset: int = 0,
) -> list[InferenceChunk]:
"""
Run hybrid search and return a list of inference chunks.
Parameters:
- query: unmodified user query. This may be needed for getting the matching highlighted
keywords or for logging purposes
- query_embedding: vector representation of the query, must be of the correct
dimensionality for the primary index
- final_keywords: Final keywords to be used from the query, defaults to query if not set
- query_type: Semantic or keyword type query, may use different scoring logic for each
- filters: Filters for things like permissions, source type, time, etc.
- num_to_retrieve: number of highest matching chunks to return
- offset: number of highest matching chunks to skip (kind of like pagination)
Returns:
Score ranked (highest first) list of highest matching chunks
"""
raise NotImplementedError
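# Illustrative sketch (not part of the interface; query text is made up):
# fetch the second page of 25 results by skipping the first 25 via offset.
def _example_second_page(
    index: "HybridCapable",
    query_embedding: Embedding,
    query_type: QueryType,
    filters: IndexFilters,
) -> list[InferenceChunk]:
    return index.hybrid_retrieval(
        query="quarterly revenue",
        query_embedding=query_embedding,  # must match the primary index dimensionality
        final_keywords=None,  # defaults to the query terms
        query_type=query_type,
        filters=filters,
        num_to_retrieve=25,
        offset=25,  # skip the first page
    )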
class RandomCapable(abc.ABC):
"""Class must implement random document retrieval capability.
This currently is just used for porting the documents to a secondary index."""
@abc.abstractmethod
def random_retrieval(
self,
filters: IndexFilters | None = None,
num_to_retrieve: int = 100,
dirty: bool | None = None,
) -> list[InferenceChunk]:
"""Retrieve random chunks matching the filters"""
raise NotImplementedError
class DocumentIndex(
SchemaVerifiable,
Indexable,
Updatable,
Deletable,
HybridCapable,
IdRetrievalCapable,
RandomCapable,
abc.ABC,
):
"""
A valid document index that can plug into all Onyx flows must implement all of these
functionalities.
As a high level summary, document indices need to be able to
- Verify the schema definition is valid
- Index new documents
- Update specific attributes of existing documents
- Delete documents
- Run hybrid search
- Retrieve document or sections of documents based on document id
- Retrieve sets of random documents
"""

View File

@@ -12,6 +12,7 @@ from onyx.document_index.vespa_constants import DOCUMENT_ID
from onyx.document_index.vespa_constants import DOCUMENT_SETS
from onyx.document_index.vespa_constants import HIDDEN
from onyx.document_index.vespa_constants import METADATA_LIST
from onyx.document_index.vespa_constants import PRIMARY_OWNERS
from onyx.document_index.vespa_constants import SOURCE_TYPE
from onyx.document_index.vespa_constants import TENANT_ID
from onyx.document_index.vespa_constants import USER_PROJECT
@@ -165,6 +166,10 @@ def build_vespa_filters(
ACCESS_CONTROL_LIST, filters.access_control_list
)
# Primary owner filter (for avatar queries)
if filters.primary_owner_emails:
filter_str += _build_or_filters(PRIMARY_OWNERS, filters.primary_owner_emails)
# Source type filters
source_strs = (
[s.value for s in filters.source_type] if filters.source_type else None
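# Illustrative sketch only (the real _build_or_filters helper is defined
# elsewhere in this module and its exact output is not shown here). An OR filter
# over a multi-valued Vespa field could plausibly render to YQL like this:
def _or_filter_sketch(field: str, values: list[str]) -> str:
    clauses = " or ".join(f'{field} contains "{v}"' for v in values)
    return f"({clauses}) and "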

View File

@@ -25,17 +25,17 @@ class SlackEntities(BaseModel):
# Direct message filtering
include_dm: bool = Field(
default=True,
default=False,
description="Include user direct messages in search results",
)
include_group_dm: bool = Field(
default=True,
default=False,
description="Include group direct messages (multi-person DMs) in search results",
)
# Private channel filtering
include_private_channels: bool = Field(
default=True,
default=False,
description="Include private channels in search results (user must have access)",
)

View File

@@ -1,19 +1,15 @@
import base64
from io import BytesIO
from langchain_core.messages import BaseMessage
from langchain_core.messages import HumanMessage
from langchain_core.messages import SystemMessage
from PIL import Image
from onyx.configs.app_configs import IMAGE_SUMMARIZATION_SYSTEM_PROMPT
from onyx.configs.app_configs import IMAGE_SUMMARIZATION_USER_PROMPT
from onyx.llm.interfaces import LLM
from onyx.llm.models import ChatCompletionMessage
from onyx.llm.models import ContentPart
from onyx.llm.models import ImageContentPart
from onyx.llm.models import ImageUrlDetail
from onyx.llm.models import SystemMessage
from onyx.llm.models import TextContentPart
from onyx.llm.models import UserMessage
from onyx.llm.utils import llm_response_to_string
from onyx.llm.utils import message_to_string
from onyx.utils.b64 import get_image_type_from_bytes
from onyx.utils.logger import setup_logger
@@ -101,24 +97,22 @@ def _summarize_image(
) -> str:
"""Use default LLM (if it is multimodal) to generate a summary of an image."""
messages: list[ChatCompletionMessage] = []
messages: list[BaseMessage] = []
if system_prompt:
messages.append(SystemMessage(content=system_prompt))
content: list[ContentPart] = []
if query:
content.append(TextContentPart(text=query))
content.append(ImageContentPart(image_url=ImageUrlDetail(url=encoded_image)))
messages.append(
UserMessage(
content=content,
HumanMessage(
content=[
{"type": "text", "text": query},
{"type": "image_url", "image_url": {"url": encoded_image}},
],
),
)
try:
return llm_response_to_string(llm.invoke(messages))
return message_to_string(llm.invoke_langchain(messages))
except Exception as e:
error_msg = f"Summarization failed. Messages: {messages}"

View File

@@ -298,17 +298,17 @@ def verify_user_files(
for file_descriptor in user_files:
# Check if this file descriptor has a user_file_id
if file_descriptor.get("user_file_id"):
if "user_file_id" in file_descriptor and file_descriptor["user_file_id"]:
try:
user_file_ids.append(UUID(file_descriptor["user_file_id"]))
except (ValueError, TypeError):
logger.warning(
f"Invalid user_file_id in file descriptor: {file_descriptor['user_file_id']}"
f"Invalid user_file_id in file descriptor: {file_descriptor.get('user_file_id')}"
)
continue
else:
# This is a project file - use the 'id' field which is the file_id
if file_descriptor.get("id"):
if "id" in file_descriptor and file_descriptor["id"]:
project_file_ids.append(file_descriptor["id"])
# Verify user files (existing logic)

View File

@@ -54,8 +54,8 @@ from onyx.llm.chat_llm import LLMRateLimitError
from onyx.llm.factory import get_default_llm_with_vision
from onyx.llm.factory import get_llm_for_contextual_rag
from onyx.llm.interfaces import LLM
from onyx.llm.utils import llm_response_to_string
from onyx.llm.utils import MAX_CONTEXT_TOKENS
from onyx.llm.utils import message_to_string
from onyx.natural_language_processing.search_nlp_models import (
InformationContentClassificationModel,
)
@@ -542,8 +542,8 @@ def add_document_summaries(
doc_tokens = tokenizer.encode(chunks_by_doc[0].source_document.get_text_content())
doc_content = tokenizer_trim_middle(doc_tokens, trunc_doc_tokens, tokenizer)
summary_prompt = DOCUMENT_SUMMARY_PROMPT.format(document=doc_content)
doc_summary = llm_response_to_string(
llm.invoke(summary_prompt, max_tokens=MAX_CONTEXT_TOKENS)
doc_summary = message_to_string(
llm.invoke_langchain(summary_prompt, max_tokens=MAX_CONTEXT_TOKENS)
)
for chunk in chunks_by_doc:
@@ -583,8 +583,8 @@ def add_chunk_summaries(
if not doc_info:
# This happens if the document is too long AND document summaries are turned off
# In this case we compute a doc summary using the LLM
doc_info = llm_response_to_string(
llm.invoke(
doc_info = message_to_string(
llm.invoke_langchain(
DOCUMENT_SUMMARY_PROMPT.format(document=doc_content),
max_tokens=MAX_CONTEXT_TOKENS,
)
@@ -595,8 +595,8 @@ def add_chunk_summaries(
def assign_context(chunk: DocAwareChunk) -> None:
context_prompt2 = CONTEXTUAL_RAG_PROMPT2.format(chunk=chunk.content)
try:
chunk.chunk_context = llm_response_to_string(
llm.invoke(
chunk.chunk_context = message_to_string(
llm.invoke_langchain(
context_prompt1 + context_prompt2,
max_tokens=MAX_CONTEXT_TOKENS,
)

View File

@@ -80,11 +80,7 @@ class PgRedisKVStore(KeyValueStore):
value = None
try:
self.redis_client.set(
REDIS_KEY_PREFIX + key,
json.dumps(value),
ex=KV_REDIS_KEY_EXPIRATION,
)
self.redis_client.set(REDIS_KEY_PREFIX + key, json.dumps(value))
except Exception as e:
logger.error(f"Failed to set value in Redis for key '{key}': {str(e)}")

View File

@@ -1,5 +1,7 @@
import json
from langchain_core.messages import HumanMessage
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import OnyxCallTypes
from onyx.configs.kg_configs import KG_METADATA_TRACKING_THRESHOLD
@@ -29,7 +31,7 @@ from onyx.kg.utils.formatting_utils import make_relationship_id
from onyx.kg.utils.formatting_utils import make_relationship_type_id
from onyx.kg.vespa.vespa_interactions import get_document_vespa_contents
from onyx.llm.factory import get_default_llms
from onyx.llm.utils import llm_response_to_string
from onyx.llm.utils import message_to_string
from onyx.prompts.kg_prompts import CALL_CHUNK_PREPROCESSING_PROMPT
from onyx.prompts.kg_prompts import CALL_DOCUMENT_CLASSIFICATION_PROMPT
from onyx.prompts.kg_prompts import GENERAL_CHUNK_PREPROCESSING_PROMPT
@@ -416,10 +418,14 @@ def kg_classify_document(
# classify with LLM
primary_llm, _ = get_default_llms()
msg = [HumanMessage(content=prompt)]
try:
raw_classification_result = llm_response_to_string(primary_llm.invoke(prompt))
raw_classification_result = primary_llm.invoke_langchain(msg)
classification_result = (
raw_classification_result.replace("```json", "").replace("```", "").strip()
message_to_string(raw_classification_result)
.replace("```json", "")
.replace("```", "")
.strip()
)
# no json parsing here because of reasoning output
classification_class = classification_result.split("CATEGORY:")[1].strip()
@@ -480,10 +486,12 @@ def kg_deep_extract_chunks(
# extract with LLM
_, fast_llm = get_default_llms()
msg = [HumanMessage(content=prompt)]
try:
raw_extraction_result = llm_response_to_string(fast_llm.invoke(prompt))
raw_extraction_result = fast_llm.invoke_langchain(msg)
cleaned_response = (
raw_extraction_result.replace("{{", "{")
message_to_string(raw_extraction_result)
.replace("{{", "{")
.replace("}}", "}")
.replace("```json\n", "")
.replace("\n```", "")

View File

@@ -1,23 +1,45 @@
import json
import os
import traceback
from collections.abc import Iterator
from collections.abc import Sequence
from typing import Any
from typing import cast
from typing import TYPE_CHECKING
from typing import Union
from httpx import RemoteProtocolError
from langchain.schema.language_model import (
LanguageModelInput as LangChainLanguageModelInput,
)
from langchain_core.messages import AIMessage
from langchain_core.messages import AIMessageChunk
from langchain_core.messages import BaseMessage
from langchain_core.messages import BaseMessageChunk
from langchain_core.messages import ChatMessage
from langchain_core.messages import ChatMessageChunk
from langchain_core.messages import FunctionMessage
from langchain_core.messages import FunctionMessageChunk
from langchain_core.messages import HumanMessage
from langchain_core.messages import HumanMessageChunk
from langchain_core.messages import SystemMessage
from langchain_core.messages import SystemMessageChunk
from langchain_core.messages.tool import ToolCallChunk
from langchain_core.messages.tool import ToolMessage
from langchain_core.prompt_values import PromptValue
from onyx.configs.app_configs import LOG_ONYX_MODEL_INTERACTIONS
from onyx.configs.app_configs import MOCK_LLM_RESPONSE
from onyx.configs.app_configs import SEND_USER_METADATA_TO_LLM_PROVIDER
from onyx.configs.chat_configs import QA_TIMEOUT
from onyx.configs.model_configs import (
DISABLE_LITELLM_STREAMING,
)
from onyx.configs.model_configs import GEN_AI_TEMPERATURE
from onyx.configs.model_configs import LITELLM_EXTRA_BODY
from onyx.llm.interfaces import LanguageModelInput
from onyx.llm.interfaces import LLM
from onyx.llm.interfaces import LLMConfig
from onyx.llm.interfaces import LLMUserIdentity
from onyx.llm.interfaces import ReasoningEffort
from onyx.llm.interfaces import STANDARD_TOOL_CHOICE_OPTIONS
from onyx.llm.interfaces import ToolChoiceOptions
from onyx.llm.llm_provider_options import AZURE_PROVIDER_NAME
from onyx.llm.llm_provider_options import OLLAMA_PROVIDER_NAME
@@ -25,8 +47,6 @@ from onyx.llm.llm_provider_options import VERTEX_CREDENTIALS_FILE_KWARG
from onyx.llm.llm_provider_options import VERTEX_LOCATION_KWARG
from onyx.llm.model_response import ModelResponse
from onyx.llm.model_response import ModelResponseStream
from onyx.llm.models import CLAUDE_REASONING_BUDGET_TOKENS
from onyx.llm.models import OPENAI_REASONING_EFFORT
from onyx.llm.utils import is_true_openai_model
from onyx.llm.utils import model_is_reasoning_model
from onyx.server.utils import mask_string
@@ -37,13 +57,14 @@ from onyx.utils.special_types import JSON_ro
logger = setup_logger()
if TYPE_CHECKING:
from litellm import CustomStreamWrapper
from litellm import CustomStreamWrapper, Message
_LLM_PROMPT_LONG_TERM_LOG_CATEGORY = "llm_prompt"
LEGACY_MAX_TOKENS_KWARG = "max_tokens"
STANDARD_MAX_TOKENS_KWARG = "max_completion_tokens"
MAX_LITELLM_USER_ID_LENGTH = 64
LegacyPromptDict = Sequence[str | list[str] | dict[str, Any] | tuple[str, str]]
class LLMTimeoutError(Exception):
@@ -58,30 +79,199 @@ class LLMRateLimitError(Exception):
"""
def _prompt_to_dicts(prompt: LanguageModelInput) -> list[dict[str, Any]]:
"""Convert Pydantic message models to dictionaries for LiteLLM.
LiteLLM expects messages to be dictionaries (with .get() method),
not Pydantic models. This function serializes the messages.
"""
if isinstance(prompt, str):
return [{"role": "user", "content": prompt}]
return [msg.model_dump(exclude_none=True) for msg in prompt]
def _base_msg_to_role(msg: BaseMessage) -> str:
if isinstance(msg, HumanMessage) or isinstance(msg, HumanMessageChunk):
return "user"
if isinstance(msg, AIMessage) or isinstance(msg, AIMessageChunk):
return "assistant"
if isinstance(msg, SystemMessage) or isinstance(msg, SystemMessageChunk):
return "system"
if isinstance(msg, FunctionMessage) or isinstance(msg, FunctionMessageChunk):
return "function"
return "unknown"
def _prompt_as_json(prompt: LanguageModelInput) -> JSON_ro:
return cast(JSON_ro, _prompt_to_dicts(prompt))
def _truncate_litellm_user_id(user_id: str) -> str:
    if len(user_id) <= MAX_LITELLM_USER_ID_LENGTH:
        return user_id
    logger.warning(
        "LLM user id exceeds %d chars (len=%d); truncating for provider compatibility.",
        MAX_LITELLM_USER_ID_LENGTH,
        len(user_id),
    )
    return user_id[:MAX_LITELLM_USER_ID_LENGTH]
def _convert_litellm_message_to_langchain_message(
    litellm_message: "Message",
) -> BaseMessage:
    from onyx.llm.litellm_singleton import litellm
    # Extracting the basic attributes from the litellm message
    content = litellm_message.content or ""
    role = litellm_message.role
    # Handling function calls and tool calls if present
    tool_calls = (
        cast(
            list[litellm.ChatCompletionMessageToolCall],
            litellm_message.tool_calls,
        )
        if hasattr(litellm_message, "tool_calls")
        else []
    )
    # Create the appropriate langchain message based on the role
    if role == "user":
        return HumanMessage(content=content)
    elif role == "assistant":
        return AIMessage(
            content=content,
            tool_calls=(
                [
                    {
                        "name": tool_call.function.name or "",
                        "args": json.loads(tool_call.function.arguments),
                        "id": tool_call.id,
                    }
                    for tool_call in tool_calls
                ]
                if tool_calls
                else []
            ),
        )
    elif role == "system":
        return SystemMessage(content=content)
    else:
        raise ValueError(f"Unknown role type received: {role}")
def _convert_message_to_dict(message: BaseMessage) -> dict:
"""Adapted from langchain_community.chat_models.litellm._convert_message_to_dict"""
if isinstance(message, ChatMessage):
message_dict = {"role": message.role, "content": message.content}
elif isinstance(message, HumanMessage):
message_dict = {"role": "user", "content": message.content}
elif isinstance(message, AIMessage):
message_dict = {"role": "assistant", "content": message.content}
if message.tool_calls:
message_dict["tool_calls"] = [
{
"id": tool_call.get("id"),
"function": {
"name": tool_call["name"],
"arguments": json.dumps(tool_call["args"]),
},
"type": "function",
"index": tool_call.get("index", 0),
}
for tool_call in message.tool_calls
]
if "function_call" in message.additional_kwargs:
message_dict["function_call"] = message.additional_kwargs["function_call"]
elif isinstance(message, SystemMessage):
message_dict = {"role": "system", "content": message.content}
elif isinstance(message, FunctionMessage):
message_dict = {
"role": "function",
"content": message.content,
"name": message.name,
}
elif isinstance(message, ToolMessage):
message_dict = {
"tool_call_id": message.tool_call_id,
"role": "tool",
"name": message.name or "",
"content": message.content,
}
else:
raise ValueError(f"Got unknown type {message}")
if "name" in message.additional_kwargs:
message_dict["name"] = message.additional_kwargs["name"]
return message_dict
def _convert_delta_to_message_chunk(
_dict: dict[str, Any],
curr_msg: BaseMessage | None,
stop_reason: str | None = None,
) -> BaseMessageChunk:
    """Adapted from langchain_community.chat_models.litellm._convert_delta_to_message_chunk"""
    from litellm.utils import ChatCompletionDeltaToolCall
role = _dict.get("role") or (_base_msg_to_role(curr_msg) if curr_msg else "unknown")
content = _dict.get("content") or ""
additional_kwargs = {}
if _dict.get("function_call"):
additional_kwargs.update({"function_call": dict(_dict["function_call"])})
tool_calls = cast(list[ChatCompletionDeltaToolCall] | None, _dict.get("tool_calls"))
if role == "user":
return HumanMessageChunk(content=content)
# NOTE: if tool calls are present, then it's an assistant.
# In Ollama, the role will be None for tool-calls
elif role == "assistant" or tool_calls:
if tool_calls:
tool_call = tool_calls[0]
tool_name = tool_call.function.name or (curr_msg and curr_msg.name) or ""
idx = tool_call.index
tool_call_chunk = ToolCallChunk(
name=tool_name,
id=tool_call.id,
args=tool_call.function.arguments,
index=idx,
)
return AIMessageChunk(
content=content,
tool_call_chunks=[tool_call_chunk],
additional_kwargs={
"usage_metadata": {"stop": stop_reason},
**additional_kwargs,
},
)
return AIMessageChunk(
content=content,
additional_kwargs={
"usage_metadata": {"stop": stop_reason},
**additional_kwargs,
},
)
elif role == "system":
return SystemMessageChunk(content=content)
elif role == "function":
return FunctionMessageChunk(content=content, name=_dict["name"])
elif role:
return ChatMessageChunk(content=content, role=role)
raise ValueError(f"Unknown role: {role}")
def _prompt_to_dict(
prompt: LanguageModelInput | LangChainLanguageModelInput,
) -> LegacyPromptDict:
# NOTE: this must go first, since it is also a Sequence
if isinstance(prompt, str):
return [_convert_message_to_dict(HumanMessage(content=prompt))]
if isinstance(prompt, (list, Sequence)):
normalized_prompt: list[str | list[str] | dict[str, Any] | tuple[str, str]] = []
for msg in prompt:
if isinstance(msg, BaseMessage):
normalized_prompt.append(_convert_message_to_dict(msg))
elif isinstance(msg, dict):
normalized_prompt.append(dict(msg))
else:
normalized_prompt.append(msg)
return normalized_prompt
if isinstance(prompt, BaseMessage):
return [_convert_message_to_dict(prompt)]
if isinstance(prompt, PromptValue):
return [_convert_message_to_dict(message) for message in prompt.to_messages()]
raise TypeError(f"Unsupported prompt type: {type(prompt)}")
def _prompt_as_json(
prompt: LanguageModelInput | LangChainLanguageModelInput,
*,
is_legacy_langchain: bool,
) -> JSON_ro:
prompt_payload = _prompt_to_dict(prompt) if is_legacy_langchain else prompt
return cast(JSON_ro, prompt_payload)
class LitellmLLM(LLM):
@@ -181,12 +371,18 @@ class LitellmLLM(LLM):
dump["credentials_file"] = mask_string(credentials_file)
return dump
def log_model_configs(self) -> None:
logger.debug(f"Config: {self._safe_model_config()}")
def _record_call(
self,
prompt: LanguageModelInput,
prompt: LanguageModelInput | LangChainLanguageModelInput,
is_legacy_langchain: bool = False,
) -> None:
if self._long_term_logger:
prompt_json = _prompt_as_json(prompt)
prompt_json = _prompt_as_json(
prompt, is_legacy_langchain=is_legacy_langchain
)
self._long_term_logger.record(
{
"prompt": prompt_json,
@@ -197,11 +393,14 @@ class LitellmLLM(LLM):
def _record_result(
self,
prompt: LanguageModelInput,
prompt: LanguageModelInput | LangChainLanguageModelInput,
model_output: BaseMessage,
is_legacy_langchain: bool,
) -> None:
if self._long_term_logger:
prompt_json = _prompt_as_json(prompt)
prompt_json = _prompt_as_json(
prompt, is_legacy_langchain=is_legacy_langchain
)
tool_calls = (
model_output.tool_calls if hasattr(model_output, "tool_calls") else []
)
@@ -217,11 +416,14 @@ class LitellmLLM(LLM):
def _record_error(
self,
prompt: LanguageModelInput,
prompt: LanguageModelInput | LangChainLanguageModelInput,
error: Exception,
is_legacy_langchain: bool,
) -> None:
if self._long_term_logger:
prompt_json = _prompt_as_json(prompt)
prompt_json = _prompt_as_json(
prompt, is_legacy_langchain=is_legacy_langchain
)
self._long_term_logger.record(
{
"prompt": prompt_json,
@@ -238,27 +440,48 @@ class LitellmLLM(LLM):
def _completion(
self,
prompt: LanguageModelInput,
prompt: LanguageModelInput | LangChainLanguageModelInput,
tools: list[dict] | None,
tool_choice: ToolChoiceOptions | None,
stream: bool,
parallel_tool_calls: bool,
reasoning_effort: ReasoningEffort | None = None,
reasoning_effort: str | None = None,
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
user_identity: LLMUserIdentity | None = None,
is_legacy_langchain: bool = False,
) -> Union["ModelResponse", "CustomStreamWrapper"]:
self._record_call(prompt)
# litellm doesn't accept LangChain BaseMessage objects, so we need to convert them
# to a dict representation
processed_prompt: LegacyPromptDict | LanguageModelInput
if is_legacy_langchain:
processed_prompt = _prompt_to_dict(prompt)
else:
processed_prompt = cast(LanguageModelInput, prompt)
# Record the original prompt (not the processed one) for logging
original_prompt = prompt
self._record_call(original_prompt, is_legacy_langchain)
from onyx.llm.litellm_singleton import litellm
from litellm.exceptions import Timeout, RateLimitError
tool_choice_formatted: dict[str, Any] | str | None
if not tools:
tool_choice_formatted = None
elif tool_choice and tool_choice not in STANDARD_TOOL_CHOICE_OPTIONS:
tool_choice_formatted = {
"type": "function",
"function": {"name": tool_choice},
}
else:
tool_choice_formatted = tool_choice
is_reasoning = model_is_reasoning_model(
self.config.model_name, self.config.model_provider
)
# Needed to get reasoning tokens from the model
if (
if not is_legacy_langchain and (
is_true_openai_model(self.config.model_provider, self.config.model_name)
or self.config.model_provider == AZURE_PROVIDER_NAME
):
@@ -266,29 +489,6 @@ class LitellmLLM(LLM):
else:
model_provider = self.config.model_provider
completion_kwargs: dict[str, Any] = self._model_kwargs
if SEND_USER_METADATA_TO_LLM_PROVIDER and user_identity:
completion_kwargs = dict(self._model_kwargs)
if user_identity.user_id:
completion_kwargs["user"] = _truncate_litellm_user_id(
user_identity.user_id
)
if user_identity.session_id:
existing_metadata = completion_kwargs.get("metadata")
metadata: dict[str, Any] | None
if existing_metadata is None:
metadata = {}
elif isinstance(existing_metadata, dict):
metadata = dict(existing_metadata)
else:
metadata = None
if metadata is not None:
metadata["session_id"] = user_identity.session_id
completion_kwargs["metadata"] = metadata
try:
return litellm.completion(
mock_response=MOCK_LLM_RESPONSE,
@@ -302,9 +502,9 @@ class LitellmLLM(LLM):
api_version=self._api_version or None,
custom_llm_provider=self._custom_llm_provider or None,
# actual input
messages=_prompt_to_dicts(prompt),
messages=processed_prompt,
tools=tools,
tool_choice=tool_choice if tools else None,
tool_choice=tool_choice_formatted,
# streaming choice
stream=stream,
# model params
@@ -332,16 +532,8 @@ class LitellmLLM(LLM):
# Anthropic Claude uses `thinking` with budget_tokens for extended thinking
# This applies to Claude models on any provider (anthropic, vertex_ai, bedrock)
**(
{
"thinking": {
"type": "enabled",
"budget_tokens": CLAUDE_REASONING_BUDGET_TOKENS[
reasoning_effort
],
}
}
{"thinking": {"type": "enabled", "budget_tokens": 10000}}
if reasoning_effort
and reasoning_effort != ReasoningEffort.OFF
and is_reasoning
and "claude" in self.config.model_name.lower()
else {}
@@ -349,9 +541,8 @@ class LitellmLLM(LLM):
# OpenAI and other providers use reasoning_effort
# (litellm maps this to thinking_level for Gemini 3 models)
**(
{"reasoning_effort": OPENAI_REASONING_EFFORT[reasoning_effort]}
{"reasoning_effort": reasoning_effort}
if reasoning_effort
and reasoning_effort != ReasoningEffort.OFF
and is_reasoning
and "claude" not in self.config.model_name.lower()
else {}
@@ -362,11 +553,11 @@ class LitellmLLM(LLM):
else {}
),
**({self._max_token_param: max_tokens} if max_tokens else {}),
**completion_kwargs,
**self._model_kwargs,
)
except Exception as e:
self._record_error(prompt, e)
self._record_error(original_prompt, e, is_legacy_langchain)
# for break pointing
if isinstance(e, Timeout):
raise LLMTimeoutError(e)
@@ -396,7 +587,134 @@ class LitellmLLM(LLM):
max_input_tokens=self._max_input_tokens,
)
def invoke(
def _invoke_implementation_langchain(
self,
prompt: LangChainLanguageModelInput,
tools: list[dict] | None = None,
tool_choice: ToolChoiceOptions | None = None,
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
) -> BaseMessage:
from litellm import ModelResponse
if LOG_ONYX_MODEL_INTERACTIONS:
self.log_model_configs()
response = cast(
ModelResponse,
self._completion(
is_legacy_langchain=True,
prompt=prompt,
tools=tools,
tool_choice=tool_choice,
stream=False,
structured_response_format=structured_response_format,
timeout_override=timeout_override,
max_tokens=max_tokens,
parallel_tool_calls=False,
),
)
choice = response.choices[0]
if hasattr(choice, "message"):
output = _convert_litellm_message_to_langchain_message(choice.message)
if output:
self._record_result(prompt, output, is_legacy_langchain=True)
return output
else:
raise ValueError("Unexpected response choice type")
def _stream_implementation_langchain(
self,
prompt: LangChainLanguageModelInput,
tools: list[dict] | None = None,
tool_choice: ToolChoiceOptions | None = None,
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
) -> Iterator[BaseMessage]:
from litellm import CustomStreamWrapper
if LOG_ONYX_MODEL_INTERACTIONS:
self.log_model_configs()
if DISABLE_LITELLM_STREAMING:
yield self.invoke_langchain(
prompt,
tools,
tool_choice,
structured_response_format,
timeout_override,
max_tokens,
)
return
output = None
response = cast(
CustomStreamWrapper,
self._completion(
is_legacy_langchain=True,
prompt=prompt,
tools=tools,
tool_choice=tool_choice,
stream=True,
structured_response_format=structured_response_format,
timeout_override=timeout_override,
max_tokens=max_tokens,
parallel_tool_calls=False,
reasoning_effort="minimal",
),
)
try:
for part in response:
if not part["choices"]:
continue
choice = part["choices"][0]
message_chunk = _convert_delta_to_message_chunk(
choice["delta"],
output,
stop_reason=choice["finish_reason"],
)
if output is None:
output = message_chunk
else:
output += message_chunk
yield message_chunk
except RemoteProtocolError:
raise RuntimeError(
"The AI model failed partway through generation, please try again."
)
if output:
self._record_result(prompt, output, is_legacy_langchain=True)
if LOG_ONYX_MODEL_INTERACTIONS and output:
content = output.content or ""
if isinstance(output, AIMessage):
if content:
log_msg = content
elif output.tool_calls:
log_msg = "Tool Calls: " + str(
[
{
key: value
for key, value in tool_call.items()
if key != "index"
}
for tool_call in output.tool_calls
]
)
else:
log_msg = ""
logger.debug(f"Raw Model Output:\n{log_msg}")
else:
logger.debug(f"Raw Model Output:\n{content}")
def _invoke_implementation(
self,
prompt: LanguageModelInput,
tools: list[dict] | None = None,
@@ -404,13 +722,15 @@ class LitellmLLM(LLM):
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
reasoning_effort: ReasoningEffort | None = None,
user_identity: LLMUserIdentity | None = None,
reasoning_effort: str | None = "medium",
) -> ModelResponse:
from litellm import ModelResponse as LiteLLMModelResponse
from onyx.llm.model_response import from_litellm_model_response
if LOG_ONYX_MODEL_INTERACTIONS:
self.log_model_configs()
response = cast(
LiteLLMModelResponse,
self._completion(
@@ -423,13 +743,12 @@ class LitellmLLM(LLM):
max_tokens=max_tokens,
parallel_tool_calls=True,
reasoning_effort=reasoning_effort,
user_identity=user_identity,
),
)
return from_litellm_model_response(response)
def stream(
def _stream_implementation(
self,
prompt: LanguageModelInput,
tools: list[dict] | None = None,
@@ -437,12 +756,14 @@ class LitellmLLM(LLM):
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
reasoning_effort: ReasoningEffort | None = None,
user_identity: LLMUserIdentity | None = None,
reasoning_effort: str | None = "medium",
) -> Iterator[ModelResponseStream]:
from litellm import CustomStreamWrapper as LiteLLMCustomStreamWrapper
from onyx.llm.model_response import from_litellm_model_response_stream
if LOG_ONYX_MODEL_INTERACTIONS:
self.log_model_configs()
response = cast(
LiteLLMCustomStreamWrapper,
self._completion(
@@ -455,7 +776,6 @@ class LitellmLLM(LLM):
max_tokens=max_tokens,
parallel_tool_calls=True,
reasoning_effort=reasoning_effort,
user_identity=user_identity,
),
)

View File

@@ -0,0 +1,4 @@
class GenAIDisabledException(Exception):
def __init__(self, message: str = "Generative AI has been turned off") -> None:
self.message = message
super().__init__(self.message)
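# Illustrative usage (logger here is assumed): callers that build LLMs can turn
# this exception into a clean user-facing message instead of a stack trace.
#
#     try:
#         primary_llm, fast_llm = get_default_llms()
#     except GenAIDisabledException as e:
#         logger.info(e.message)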

View File

@@ -3,6 +3,7 @@ from collections.abc import Callable
from sqlalchemy.orm import Session
from onyx.chat.models import PersonaOverrideConfig
from onyx.configs.app_configs import DISABLE_GENERATIVE_AI
from onyx.configs.model_configs import GEN_AI_TEMPERATURE
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.llm import can_user_access_llm_provider
@@ -15,6 +16,7 @@ from onyx.db.llm import fetch_user_group_ids
from onyx.db.models import Persona
from onyx.db.models import User
from onyx.llm.chat_llm import LitellmLLM
from onyx.llm.exceptions import GenAIDisabledException
from onyx.llm.interfaces import LLM
from onyx.llm.interfaces import LLMConfig
from onyx.llm.llm_provider_options import OLLAMA_API_KEY_CONFIG_KEY
@@ -200,6 +202,8 @@ def get_default_llm_with_vision(
Returns None if no providers exist or if no provider supports images.
"""
if DISABLE_GENERATIVE_AI:
raise GenAIDisabledException()
def create_vision_llm(provider: LLMProviderView, model: str) -> LLM:
"""Helper to create an LLM if the provider supports image input."""
@@ -317,6 +321,9 @@ def get_default_llms(
additional_headers: dict[str, str] | None = None,
long_term_logger: LongTermLogger | None = None,
) -> tuple[LLM, LLM]:
if DISABLE_GENERATIVE_AI:
raise GenAIDisabledException()
with get_session_with_current_tenant() as db_session:
llm_provider = fetch_default_provider(db_session)

View File

@@ -1,22 +1,30 @@
import abc
from collections.abc import Iterator
from collections.abc import Sequence
from typing import Literal
from typing import Union
from braintrust import traced
from langchain.schema.language_model import (
LanguageModelInput as LangChainLanguageModelInput,
)
from langchain_core.messages import AIMessageChunk
from langchain_core.messages import BaseMessage
from pydantic import BaseModel
from onyx.configs.app_configs import DISABLE_GENERATIVE_AI
from onyx.configs.app_configs import LOG_INDIVIDUAL_MODEL_TOKENS
from onyx.configs.app_configs import LOG_ONYX_MODEL_INTERACTIONS
from onyx.llm.message_types import ChatCompletionMessage
from onyx.llm.model_response import ModelResponse
from onyx.llm.model_response import ModelResponseStream
from onyx.llm.models import LanguageModelInput
from onyx.llm.models import ReasoningEffort
from onyx.llm.models import ToolChoiceOptions
from onyx.utils.logger import setup_logger
logger = setup_logger()
class LLMUserIdentity(BaseModel):
user_id: str | None = None
session_id: str | None = None
STANDARD_TOOL_CHOICE_OPTIONS = ("required", "auto", "none")
ToolChoiceOptions = Union[Literal["required", "auto", "none"], str]
LanguageModelInput = Union[Sequence[ChatCompletionMessage], str]
class LLMConfig(BaseModel):
@@ -33,12 +41,60 @@ class LLMConfig(BaseModel):
model_config = {"protected_namespaces": ()}
def log_prompt(prompt: LangChainLanguageModelInput) -> None:
if isinstance(prompt, list):
for ind, msg in enumerate(prompt):
if isinstance(msg, AIMessageChunk):
if msg.content:
log_msg = msg.content
elif msg.tool_call_chunks:
log_msg = "Tool Calls: " + str(
[
{
key: value
for key, value in tool_call.items()
if key != "index"
}
for tool_call in msg.tool_call_chunks
]
)
else:
log_msg = ""
logger.debug(f"Message {ind}:\n{log_msg}")
else:
logger.debug(f"Message {ind}:\n{msg.content}")
if isinstance(prompt, str):
logger.debug(f"Prompt:\n{prompt}")
class LLM(abc.ABC):
"""Mimics the LangChain LLM / BaseChatModel interfaces to make it easy
to use these implementations to connect to a variety of LLM providers."""
@property
def requires_warm_up(self) -> bool:
"""Is this model running in memory and needs an initial call to warm it up?"""
return False
@property
def requires_api_key(self) -> bool:
return True
@property
@abc.abstractmethod
def config(self) -> LLMConfig:
raise NotImplementedError
@abc.abstractmethod
def log_model_configs(self) -> None:
raise NotImplementedError
def _precall(self, prompt: LangChainLanguageModelInput) -> None:
if DISABLE_GENERATIVE_AI:
raise Exception("Generative AI is disabled")
if LOG_ONYX_MODEL_INTERACTIONS:
log_prompt(prompt)
@traced(name="invoke llm", type="llm")
def invoke(
self,
@@ -48,9 +104,72 @@ class LLM(abc.ABC):
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
reasoning_effort: ReasoningEffort | None = None,
user_identity: LLMUserIdentity | None = None,
) -> "ModelResponse":
return self._invoke_implementation(
prompt,
tools,
tool_choice,
structured_response_format,
timeout_override,
max_tokens,
)
@traced(name="invoke llm", type="llm")
def invoke_langchain(
self,
prompt: LangChainLanguageModelInput,
tools: list[dict] | None = None,
tool_choice: ToolChoiceOptions | None = None,
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
) -> BaseMessage:
self._precall(prompt)
# TODO add a postcall to log model outputs independent of concrete class
# implementation
return self._invoke_implementation_langchain(
prompt,
tools,
tool_choice,
structured_response_format,
timeout_override,
max_tokens,
)
@abc.abstractmethod
def _invoke_implementation(
self,
prompt: LanguageModelInput,
tools: list[dict] | None = None,
tool_choice: ToolChoiceOptions | None = None,
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
) -> "ModelResponse":
raise NotImplementedError
@abc.abstractmethod
def _stream_implementation(
self,
prompt: LanguageModelInput,
tools: list[dict] | None = None,
tool_choice: ToolChoiceOptions | None = None,
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
) -> Iterator[ModelResponseStream]:
raise NotImplementedError
@abc.abstractmethod
def _invoke_implementation_langchain(
self,
prompt: LangChainLanguageModelInput,
tools: list[dict] | None = None,
tool_choice: ToolChoiceOptions | None = None,
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
) -> BaseMessage:
raise NotImplementedError
def stream(
@@ -61,7 +180,54 @@ class LLM(abc.ABC):
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
reasoning_effort: ReasoningEffort | None = None,
user_identity: LLMUserIdentity | None = None,
) -> Iterator[ModelResponseStream]:
return self._stream_implementation(
prompt,
tools,
tool_choice,
structured_response_format,
timeout_override,
max_tokens,
)
def stream_langchain(
self,
prompt: LangChainLanguageModelInput,
tools: list[dict] | None = None,
tool_choice: ToolChoiceOptions | None = None,
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
) -> Iterator[BaseMessage]:
self._precall(prompt)
# TODO add a postcall to log model outputs independent of concrete class
# implementation
messages = self._stream_implementation_langchain(
prompt,
tools,
tool_choice,
structured_response_format,
timeout_override,
max_tokens,
)
tokens = []
for message in messages:
if LOG_INDIVIDUAL_MODEL_TOKENS:
tokens.append(message.content)
yield message
if LOG_INDIVIDUAL_MODEL_TOKENS and tokens:
logger.debug(f"Model Tokens: {tokens}")
@abc.abstractmethod
def _stream_implementation_langchain(
self,
prompt: LangChainLanguageModelInput,
tools: list[dict] | None = None,
tool_choice: ToolChoiceOptions | None = None,
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
) -> Iterator[BaseMessage]:
raise NotImplementedError
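# Illustrative sketch (llm is any concrete implementation; the prompt is made
# up): consuming the legacy langchain streaming path chunk by chunk.
def _example_stream(llm: LLM) -> str:
    parts: list[str] = []
    for message in llm.stream_langchain("Summarize the release notes."):
        if isinstance(message.content, str):
            parts.append(message.content)
    return "".join(parts)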

View File

@@ -606,56 +606,6 @@ def _patch_openai_responses_transform_response() -> None:
LiteLLMResponsesTransformationHandler.transform_response = _patched_transform_response # type: ignore[method-assign]
def _patch_openai_responses_tool_content_type() -> None:
"""
Patches LiteLLMResponsesTransformationHandler._convert_content_str_to_input_text
to use 'input_text' type for tool messages instead of 'output_text'.
The OpenAI Responses API only accepts 'input_text', 'input_image', and 'input_file'
in the function_call_output.output array. The default litellm implementation
incorrectly uses 'output_text' for tool messages, causing 400 Bad Request errors.
See: https://github.com/BerriAI/litellm/issues/17507
This should be removed once litellm releases a fix for this issue.
"""
original_method = (
LiteLLMResponsesTransformationHandler._convert_content_str_to_input_text
)
if (
getattr(
original_method,
"__name__",
"",
)
== "_patched_convert_content_str_to_input_text"
):
return
def _patched_convert_content_str_to_input_text(
self: Any, content: str, role: str
) -> Dict[str, Any]:
"""
Convert string content to the appropriate Responses API format.
For user, system, and tool messages, use 'input_text' type.
For assistant messages, use 'output_text' type.
Tool messages go into function_call_output.output, which only accepts
'input_text', 'input_image', and 'input_file' types.
"""
if role in ("user", "system", "tool"):
return {"type": "input_text", "text": content}
else:
return {"type": "output_text", "text": content}
_patched_convert_content_str_to_input_text.__name__ = (
"_patched_convert_content_str_to_input_text"
)
LiteLLMResponsesTransformationHandler._convert_content_str_to_input_text = _patched_convert_content_str_to_input_text # type: ignore[method-assign]
def apply_monkey_patches() -> None:
"""
Apply all necessary monkey patches to LiteLLM for compatibility.
@@ -665,13 +615,11 @@ def apply_monkey_patches() -> None:
- Patching OllamaChatCompletionResponseIterator.chunk_parser for streaming content
- Patching OpenAiResponsesToChatCompletionStreamIterator.chunk_parser for OpenAI Responses API
- Patching LiteLLMResponsesTransformationHandler.transform_response for non-streaming responses
- Patching LiteLLMResponsesTransformationHandler._convert_content_str_to_input_text for tool content types
"""
_patch_ollama_transform_request()
_patch_ollama_chunk_parser()
_patch_openai_responses_chunk_parser()
_patch_openai_responses_transform_response()
_patch_openai_responses_tool_content_type()
def _extract_reasoning_content(message: dict) -> Tuple[Optional[str], Optional[str]]:

View File

@@ -56,15 +56,6 @@ class WellKnownLLMProviderDescriptor(BaseModel):
OPENAI_PROVIDER_NAME = "openai"
# Curated list of OpenAI models to show by default in the UI
OPENAI_VISIBLE_MODEL_NAMES = {
"gpt-5",
"gpt-5-mini",
"o1",
"o3-mini",
"gpt-4o",
"gpt-4o-mini",
}
BEDROCK_PROVIDER_NAME = "bedrock"
BEDROCK_DEFAULT_MODEL = "anthropic.claude-3-5-sonnet-20241022-v2:0"
@@ -134,12 +125,6 @@ _IGNORABLE_ANTHROPIC_MODELS = {
"claude-instant-1",
"anthropic/claude-3-5-sonnet-20241022",
}
# Curated list of Anthropic models to show by default in the UI
ANTHROPIC_VISIBLE_MODEL_NAMES = {
"claude-opus-4-5",
"claude-sonnet-4-5",
"claude-haiku-4-5",
}
AZURE_PROVIDER_NAME = "azure"
@@ -149,55 +134,6 @@ VERTEX_CREDENTIALS_FILE_KWARG = "vertex_credentials"
VERTEX_LOCATION_KWARG = "vertex_location"
VERTEXAI_DEFAULT_MODEL = "gemini-2.5-flash"
VERTEXAI_DEFAULT_FAST_MODEL = "gemini-2.5-flash-lite"
# Curated list of Vertex AI models to show by default in the UI
VERTEXAI_VISIBLE_MODEL_NAMES = {
"gemini-2.5-flash",
"gemini-2.5-flash-lite",
"gemini-2.5-pro",
}
def is_obsolete_model(model_name: str, provider: str) -> bool:
"""Check if a model is obsolete and should be filtered out.
Filters models that are 2+ major versions behind or deprecated.
This is the single source of truth for obsolete model detection.
"""
model_lower = model_name.lower()
# OpenAI obsolete models
if provider == "openai":
# GPT-3 models are obsolete
if "gpt-3" in model_lower:
return True
# Legacy models
deprecated = {
"text-davinci-003",
"text-davinci-002",
"text-curie-001",
"text-babbage-001",
"text-ada-001",
"davinci",
"curie",
"babbage",
"ada",
}
if model_lower in deprecated:
return True
# Anthropic obsolete models
if provider == "anthropic":
if "claude-2" in model_lower or "claude-instant" in model_lower:
return True
# Vertex AI obsolete models
if provider == "vertex_ai":
if "gemini-1.0" in model_lower:
return True
if "palm" in model_lower or "bison" in model_lower:
return True
return False
def _get_provider_to_models_map() -> dict[str, list[str]]:
@@ -219,43 +155,22 @@ def _get_provider_to_models_map() -> dict[str, list[str]]:
def get_openai_model_names() -> list[str]:
"""Get OpenAI model names dynamically from litellm."""
import re
import litellm
# TODO: remove these lists once we have a comprehensive model configuration page
# The ideal flow should be: fetch all available models --> filter by type
# --> allow user to modify filters and select models based on current context
non_chat_model_terms = {
"embed",
"audio",
"tts",
"whisper",
"dall-e",
"image",
"moderation",
"sora",
"container",
}
deprecated_model_terms = {"babbage", "davinci", "gpt-3.5", "gpt-4-"}
excluded_terms = non_chat_model_terms | deprecated_model_terms
# NOTE: We are explicitly excluding all "timestamped" models
# because they are mostly just noise in the admin configuration panel
# e.g. gpt-4o-2025-07-16, gpt-3.5-turbo-0613, etc.
date_pattern = re.compile(r"-\d{4}")
def is_valid_model(model: str) -> bool:
model_lower = model.lower()
return not any(
ex in model_lower for ex in excluded_terms
) and not date_pattern.search(model)
return sorted(
(
model.removeprefix("openai/")
[
# Strip openai/ prefix if present
model.replace("openai/", "") if model.startswith("openai/") else model
for model in litellm.open_ai_chat_completion_models
if is_valid_model(model)
),
if "embed" not in model.lower()
and "audio" not in model.lower()
and "tts" not in model.lower()
and "whisper" not in model.lower()
and "dall-e" not in model.lower()
and "moderation" not in model.lower()
and "sora" not in model.lower() # video generation
and "container" not in model.lower() # not a model
],
reverse=True,
)
@@ -269,7 +184,6 @@ def get_anthropic_model_names() -> list[str]:
model
for model in litellm.anthropic_models
if model not in _IGNORABLE_ANTHROPIC_MODELS
and not is_obsolete_model(model, ANTHROPIC_PROVIDER_NAME)
],
reverse=True,
)
@@ -315,7 +229,6 @@ def get_vertexai_model_names() -> list[str]:
and "/" not in model # filter out prefixed models like openai/gpt-oss
and "search_api" not in model.lower() # not a model
and "-maas" not in model.lower() # marketplace models
and not is_obsolete_model(model, VERTEXAI_PROVIDER_NAME)
],
reverse=True,
)
@@ -555,30 +468,18 @@ def get_provider_display_name(provider_name: str) -> str:
)
def _get_visible_models_for_provider(provider_name: str) -> set[str]:
"""Get the set of models that should be visible by default for a provider."""
_PROVIDER_TO_VISIBLE_MODELS: dict[str, set[str]] = {
OPENAI_PROVIDER_NAME: OPENAI_VISIBLE_MODEL_NAMES,
ANTHROPIC_PROVIDER_NAME: ANTHROPIC_VISIBLE_MODEL_NAMES,
VERTEXAI_PROVIDER_NAME: VERTEXAI_VISIBLE_MODEL_NAMES,
}
return _PROVIDER_TO_VISIBLE_MODELS.get(provider_name, set())
def fetch_model_configurations_for_provider(
provider_name: str,
) -> list[ModelConfigurationView]:
"""Fetch model configurations for a static provider (OpenAI, Anthropic, Vertex AI).
Looks up max_input_tokens from LiteLLM's model_cost. If not found, stores None
and the runtime will use the fallback (32000).
Models in the curated visible lists (OPENAI_VISIBLE_MODEL_NAMES, etc.) are
marked as is_visible=True by default.
and the runtime will use the fallback (4096).
"""
from onyx.llm.utils import get_max_input_tokens
visible_models = _get_visible_models_for_provider(provider_name)
# No models are marked visible by default - the default model logic
# in the frontend/backend will handle making default models visible.
configs = []
for model_name in fetch_models_for_provider(provider_name):
max_input_tokens = get_max_input_tokens(
@@ -589,7 +490,7 @@ def fetch_model_configurations_for_provider(
configs.append(
ModelConfigurationView(
name=model_name,
is_visible=model_name in visible_models,
is_visible=False,
max_input_tokens=max_input_tokens,
supports_image_input=model_supports_image_input(
model_name=model_name,

View File

@@ -0,0 +1,70 @@
from typing import Literal
from typing import NotRequired
from typing_extensions import TypedDict
# Content part structures for multimodal messages
class TextContentPart(TypedDict):
type: Literal["text"]
text: str
class ImageUrlDetail(TypedDict):
url: str
detail: NotRequired[Literal["auto", "low", "high"]]
class ImageContentPart(TypedDict):
type: Literal["image_url"]
image_url: ImageUrlDetail
ContentPart = TextContentPart | ImageContentPart
# Tool call structures
class FunctionCall(TypedDict):
name: str
arguments: str
class ToolCall(TypedDict):
id: str
type: Literal["function"]
function: FunctionCall
# Message types
class SystemMessage(TypedDict):
role: Literal["system"]
content: str
class UserMessageWithText(TypedDict):
role: Literal["user"]
content: str
class UserMessageWithParts(TypedDict):
role: Literal["user"]
content: list[ContentPart]
UserMessage = UserMessageWithText | UserMessageWithParts
class AssistantMessage(TypedDict):
role: Literal["assistant"]
content: NotRequired[str | None]
tool_calls: NotRequired[list[ToolCall]]
class ToolMessage(TypedDict):
role: Literal["tool"]
content: str
tool_call_id: str
# Union type for all OpenAI Chat Completions messages
ChatCompletionMessage = SystemMessage | UserMessage | AssistantMessage | ToolMessage
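# Illustrative sketch (urls and text made up): a multimodal exchange expressed
# with these TypedDicts. Type checkers validate the shapes statically; at
# runtime the values are plain dicts, which is what routers like LiteLLM expect.
_example_messages: list[ChatCompletionMessage] = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
        ],
    },
    {"role": "assistant", "content": "A diagram of the indexing pipeline."},
]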

View File

@@ -2621,28 +2621,6 @@
"model_vendor": "openai",
"model_version": "2025-10-06"
},
"gpt-5.2-pro-2025-12-11": {
"display_name": "GPT-5.2 Pro",
"model_vendor": "openai",
"model_version": "2025-12-11"
},
"gpt-5.2-pro": {
"display_name": "GPT-5.2 Pro",
"model_vendor": "openai"
},
"gpt-5.2-chat-latest": {
"display_name": "GPT 5.2 Chat",
"model_vendor": "openai"
},
"gpt-5.2-2025-12-11": {
"display_name": "GPT 5.2",
"model_vendor": "openai",
"model_version": "2025-12-11"
},
"gpt-5.2": {
"display_name": "GPT 5.2",
"model_vendor": "openai"
},
"gpt-5.1": {
"display_name": "GPT 5.1",
"model_vendor": "openai"

View File

@@ -1,104 +0,0 @@
from enum import Enum
from typing import Literal
from pydantic import BaseModel
class ToolChoiceOptions(str, Enum):
REQUIRED = "required"
AUTO = "auto"
NONE = "none"
class ReasoningEffort(str, Enum):
"""Reasoning effort levels for models that support extended thinking.
Different providers map these values differently:
- OpenAI: Uses "low", "medium", "high" directly for reasoning_effort. Recently added "none" for 5 series
which is like "minimal"
- Claude: Uses budget_tokens with different values for each level
- Gemini: Uses "none", "low", "medium", "high" for thinking_budget (via litellm mapping)
"""
OFF = "off"
LOW = "low"
MEDIUM = "medium"
HIGH = "high"
# Budget tokens for Claude extended thinking at each reasoning effort level
CLAUDE_REASONING_BUDGET_TOKENS: dict[ReasoningEffort, int] = {
ReasoningEffort.OFF: 0,
ReasoningEffort.LOW: 1000,
ReasoningEffort.MEDIUM: 5000,
ReasoningEffort.HIGH: 10000,
}
# OpenAI reasoning effort mapping (direct string values)
OPENAI_REASONING_EFFORT: dict[ReasoningEffort, str] = {
ReasoningEffort.OFF: "none", # this only works for the 5 series though
ReasoningEffort.LOW: "low",
ReasoningEffort.MEDIUM: "medium",
ReasoningEffort.HIGH: "high",
}
# Content part structures for multimodal messages
# The classes in this mirror the OpenAI Chat Completions message types and work well with routers like LiteLLM
class TextContentPart(BaseModel):
type: Literal["text"] = "text"
text: str
class ImageUrlDetail(BaseModel):
url: str
detail: Literal["auto", "low", "high"] | None = None
class ImageContentPart(BaseModel):
type: Literal["image_url"] = "image_url"
image_url: ImageUrlDetail
ContentPart = TextContentPart | ImageContentPart
# Tool call structures
class FunctionCall(BaseModel):
name: str
arguments: str
class ToolCall(BaseModel):
type: Literal["function"] = "function"
id: str
function: FunctionCall
# Message types
class SystemMessage(BaseModel):
role: Literal["system"] = "system"
content: str
class UserMessage(BaseModel):
role: Literal["user"] = "user"
content: str | list[ContentPart]
class AssistantMessage(BaseModel):
role: Literal["assistant"] = "assistant"
content: str | None = None
tool_calls: list[ToolCall] | None = None
class ToolMessage(BaseModel):
role: Literal["tool"] = "tool"
content: str
tool_call_id: str
# Union type for all OpenAI Chat Completions messages
ChatCompletionMessage = SystemMessage | UserMessage | AssistantMessage | ToolMessage
# Allows for passing in a string directly. This is provided for convenience and is wrapped as a UserMessage.
LanguageModelInput = list[ChatCompletionMessage] | str

View File

@@ -6,6 +6,10 @@ from typing import Any
from typing import cast
from typing import TYPE_CHECKING
from langchain.schema.messages import AIMessage
from langchain.schema.messages import BaseMessage
from langchain.schema.messages import HumanMessage
from langchain.schema.messages import SystemMessage
from sqlalchemy import select
from onyx.configs.app_configs import LITELLM_CUSTOM_ERROR_MESSAGE_MAPPINGS
@@ -19,7 +23,6 @@ from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.models import LLMProvider
from onyx.db.models import ModelConfiguration
from onyx.llm.interfaces import LLM
from onyx.llm.model_response import ModelResponse
from onyx.prompts.contextual_retrieval import CONTEXTUAL_RAG_TOKEN_ESTIMATE
from onyx.prompts.contextual_retrieval import DOCUMENT_SUMMARY_TOKEN_ESTIMATE
from onyx.utils.logger import setup_logger
@@ -85,15 +88,7 @@ def litellm_exception_to_error_msg(
custom_error_msg_mappings: (
dict[str, str] | None
) = LITELLM_CUSTOM_ERROR_MESSAGE_MAPPINGS,
) -> tuple[str, str, bool]:
"""Convert a LiteLLM exception to a user-friendly error message with classification.
Returns:
tuple: (error_message, error_code, is_retryable)
- error_message: User-friendly error description
- error_code: Categorized error code for frontend display
- is_retryable: Whether the user should try again
"""
) -> str:
from litellm.exceptions import BadRequestError
from litellm.exceptions import AuthenticationError
from litellm.exceptions import PermissionDeniedError
@@ -110,37 +105,25 @@ def litellm_exception_to_error_msg(
core_exception = _unwrap_nested_exception(e)
error_msg = str(core_exception)
error_code = "UNKNOWN_ERROR"
is_retryable = True
if custom_error_msg_mappings:
for error_msg_pattern, custom_error_msg in custom_error_msg_mappings.items():
if error_msg_pattern in error_msg:
return custom_error_msg, "CUSTOM_ERROR", True
return custom_error_msg
if isinstance(core_exception, BadRequestError):
error_msg = "Bad request: The server couldn't process your request. Please check your input."
error_code = "BAD_REQUEST"
is_retryable = True
elif isinstance(core_exception, AuthenticationError):
error_msg = "Authentication failed: Please check your API key and credentials."
error_code = "AUTH_ERROR"
is_retryable = False
elif isinstance(core_exception, PermissionDeniedError):
error_msg = (
"Permission denied: You don't have the necessary permissions for this operation. "
"Permission denied: You don't have the necessary permissions for this operation."
"Ensure you have access to this model."
)
error_code = "PERMISSION_DENIED"
is_retryable = False
elif isinstance(core_exception, NotFoundError):
error_msg = "Resource not found: The requested resource doesn't exist."
error_code = "NOT_FOUND"
is_retryable = False
elif isinstance(core_exception, UnprocessableEntityError):
error_msg = "Unprocessable entity: The server couldn't process your request due to semantic errors."
error_code = "UNPROCESSABLE_ENTITY"
is_retryable = True
elif isinstance(core_exception, RateLimitError):
provider_name = (
llm.config.model_provider
@@ -171,8 +154,6 @@ def litellm_exception_to_error_msg(
if upstream_detail
else f"{provider_name} rate limit exceeded: Please slow down your requests and try again later."
)
error_code = "RATE_LIMIT"
is_retryable = True
elif isinstance(core_exception, ServiceUnavailableError):
provider_name = (
llm.config.model_provider
@@ -190,8 +171,6 @@ def litellm_exception_to_error_msg(
else:
# Generic 503 Service Unavailable
error_msg = f"{provider_name} service error: {str(core_exception)}"
error_code = "SERVICE_UNAVAILABLE"
is_retryable = True
elif isinstance(core_exception, ContextWindowExceededError):
error_msg = (
"Context window exceeded: Your input is too long for the model to process."
@@ -202,51 +181,58 @@ def litellm_exception_to_error_msg(
model_name=llm.config.model_name,
model_provider=llm.config.model_provider,
)
error_msg += f" Your invoked model ({llm.config.model_name}) has a maximum context size of {max_context}."
error_msg += f"Your invoked model ({llm.config.model_name}) has a maximum context size of {max_context}"
except Exception:
logger.warning(
"Unable to get maximum input token for LiteLLM exception handling"
"Unable to get maximum input token for LiteLLM excpetion handling"
)
error_code = "CONTEXT_TOO_LONG"
is_retryable = False
elif isinstance(core_exception, ContentPolicyViolationError):
error_msg = "Content policy violation: Your request violates the content policy. Please revise your input."
error_code = "CONTENT_POLICY"
is_retryable = False
elif isinstance(core_exception, APIConnectionError):
error_msg = "API connection error: Failed to connect to the API. Please check your internet connection."
error_code = "CONNECTION_ERROR"
is_retryable = True
elif isinstance(core_exception, BudgetExceededError):
error_msg = (
"Budget exceeded: You've exceeded your allocated budget for API usage."
)
error_code = "BUDGET_EXCEEDED"
is_retryable = False
elif isinstance(core_exception, Timeout):
error_msg = "Request timed out: The operation took too long to complete. Please try again."
error_code = "CONNECTION_ERROR"
is_retryable = True
elif isinstance(core_exception, APIError):
error_msg = (
"API error: An error occurred while communicating with the API. "
f"Details: {str(core_exception)}"
)
error_code = "API_ERROR"
is_retryable = True
elif not fallback_to_error_msg:
error_msg = "An unexpected error occurred while processing your request. Please try again later."
error_code = "UNKNOWN_ERROR"
is_retryable = True
return error_msg, error_code, is_retryable
return error_msg
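For illustration, a hedged sketch of how a caller might consume the simplified return value; the leading (exception, llm) parameters are assumed from the function body above, not confirmed by this diff:

try:
    output = llm.invoke_langchain(filled_llm_prompt)
except Exception as e:
    # Previously this returned (message, code, retryable); now only the message.
    user_facing_msg = litellm_exception_to_error_msg(e, llm=llm)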
def llm_response_to_string(message: ModelResponse) -> str:
if not isinstance(message.choice.message.content, str):
def dict_based_prompt_to_langchain_prompt(
messages: list[dict[str, str]],
) -> list[BaseMessage]:
prompt: list[BaseMessage] = []
for message in messages:
role = message.get("role")
content = message.get("content")
if not role:
raise ValueError(f"Message missing `role`: {message}")
if not content:
raise ValueError(f"Message missing `content`: {message}")
elif role == "user":
prompt.append(HumanMessage(content=content))
elif role == "system":
prompt.append(SystemMessage(content=content))
elif role == "assistant":
prompt.append(AIMessage(content=content))
else:
raise ValueError(f"Unknown role: {role}")
return prompt
def message_to_string(message: BaseMessage) -> str:
if not isinstance(message.content, str):
raise RuntimeError("LLM message not in expected format.")
return message.choice.message.content
return message.content
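A minimal usage sketch of the two helpers above (contents illustrative):

raw_messages = [
    {"role": "system", "content": "Answer briefly."},
    {"role": "user", "content": "What is the capital of France?"},
]
# Produces [SystemMessage(...), HumanMessage(...)] in LangChain's schema.
filled_llm_prompt = dict_based_prompt_to_langchain_prompt(raw_messages)
# message_to_string() then extracts the plain-text content of the response:
# answer = message_to_string(llm.invoke_langchain(filled_llm_prompt))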
def check_number_of_tokens(
@@ -269,7 +255,7 @@ def test_llm(llm: LLM) -> str | None:
error_msg = None
for _ in range(2):
try:
llm.invoke("Do not respond")
llm.invoke_langchain("Do not respond")
return None
except Exception as e:
error_msg = str(e)
@@ -475,10 +461,10 @@ def llm_max_input_tokens(
if "max_tokens" in model_obj:
return model_obj["max_tokens"]
logger.warning(
f"No max tokens found for '{model_name}'. "
f"Falling back to {GEN_AI_MODEL_FALLBACK_MAX_TOKENS} tokens."
)
# logger.warning(
# f"No max tokens found for '{model_name}'. "
# f"Falling back to {GEN_AI_MODEL_FALLBACK_MAX_TOKENS} tokens."
# )
return GEN_AI_MODEL_FALLBACK_MAX_TOKENS
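For reference, a rough sketch of the litellm cost-map lookup this function performs; the model name and the exact keys present are illustrative:

import litellm

model_obj = litellm.model_cost.get("gpt-4o", {})
# Entries typically carry "max_input_tokens" and/or "max_tokens"; when neither
# is present, callers fall back to GEN_AI_MODEL_FALLBACK_MAX_TOKENS.
max_tokens = model_obj.get("max_input_tokens") or model_obj.get("max_tokens")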
@@ -553,11 +539,11 @@ def get_max_input_tokens_from_llm_provider(
1. Use max_input_tokens from model_configuration (populated from source APIs
like OpenRouter, Ollama, or our Bedrock mapping)
2. Look up in litellm.model_cost dictionary
3. Fall back to GEN_AI_MODEL_FALLBACK_MAX_TOKENS (32000)
3. Fall back to GEN_AI_MODEL_FALLBACK_MAX_TOKENS (4096)
Most dynamic providers (OpenRouter, Ollama) provide context_length via their
APIs. Bedrock doesn't expose this, so we parse from model ID suffix (:200k)
or use BEDROCK_MODEL_TOKEN_LIMITS mapping. The 32000 fallback is only hit for
or use BEDROCK_MODEL_TOKEN_LIMITS mapping. The 4096 fallback is only hit for
unknown models not in any of these sources.
"""
max_input_tokens = None
@@ -584,7 +570,7 @@ def get_bedrock_token_limit(model_id: str) -> int:
1. Parse from model ID suffix (e.g., ":200k" → 200000)
2. Check LiteLLM's model_cost dictionary
3. Fall back to our hardcoded BEDROCK_MODEL_TOKEN_LIMITS mapping
4. Default to 32000 if not found anywhere
4. Default to 4096 if not found anywhere
"""
from onyx.llm.constants import BEDROCK_MODEL_TOKEN_LIMITS
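The ":200k" suffix rule from the docstring, sketched as standalone code (not the repository's exact implementation):

import re

def parse_context_suffix(model_id: str) -> int | None:
    # e.g. "anthropic.claude-3-5-sonnet:200k" -> 200000
    match = re.search(r":(\d+)k$", model_id)
    return int(match.group(1)) * 1000 if match else None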
@@ -686,7 +672,7 @@ def model_is_reasoning_model(model_name: str, model_provider: str) -> bool:
# Fallback: try using litellm.supports_reasoning() for newer models
try:
logger.debug("Falling back to `litellm.supports_reasoning`")
# logger.debug("Falling back to `litellm.supports_reasoning`")
full_model_name = (
f"{model_provider}/{model_name}"
if model_provider not in model_name

View File

@@ -38,6 +38,7 @@ from onyx.configs.app_configs import APP_HOST
from onyx.configs.app_configs import APP_PORT
from onyx.configs.app_configs import AUTH_RATE_LIMITING_ENABLED
from onyx.configs.app_configs import AUTH_TYPE
from onyx.configs.app_configs import DISABLE_GENERATIVE_AI
from onyx.configs.app_configs import LOG_ENDPOINT_LATENCY
from onyx.configs.app_configs import OAUTH_CLIENT_ID
from onyx.configs.app_configs import OAUTH_CLIENT_SECRET
@@ -63,6 +64,11 @@ from onyx.server.documents.connector import router as connector_router
from onyx.server.documents.credential import router as credential_router
from onyx.server.documents.document import router as document_router
from onyx.server.documents.standard_oauth import router as standard_oauth_router
from onyx.server.features.avatar.api import router as avatar_router
from onyx.server.features.avatar.permission_api import (
router as avatar_permission_router,
)
from onyx.server.features.avatar.query_api import router as avatar_query_router
from onyx.server.features.default_assistant.api import (
router as default_assistant_router,
)
@@ -270,6 +276,9 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
if OAUTH_CLIENT_ID and OAUTH_CLIENT_SECRET:
logger.notice("Both OAuth Client ID and Secret are configured.")
if DISABLE_GENERATIVE_AI:
logger.notice("Generative AI Q&A disabled")
# Initialize tracing if credentials are provided
setup_braintrust_if_creds_available()
setup_langfuse_if_creds_available()
@@ -385,6 +394,9 @@ def get_application(lifespan_override: Lifespan | None = None) -> FastAPI:
include_router_with_global_prefix_prepended(application, admin_agents_router)
include_router_with_global_prefix_prepended(application, default_assistant_router)
include_router_with_global_prefix_prepended(application, notification_router)
include_router_with_global_prefix_prepended(application, avatar_router)
include_router_with_global_prefix_prepended(application, avatar_permission_router)
include_router_with_global_prefix_prepended(application, avatar_query_router)
include_router_with_global_prefix_prepended(application, tool_router)
include_router_with_global_prefix_prepended(application, admin_tool_router)
include_router_with_global_prefix_prepended(application, oauth_config_router)

View File

@@ -16,6 +16,7 @@ from slack_sdk.models.blocks.basic_components import MarkdownTextObject
from slack_sdk.models.blocks.block_elements import ImageElement
from onyx.chat.models import ChatBasicResponse
from onyx.configs.app_configs import DISABLE_GENERATIVE_AI
from onyx.configs.app_configs import WEB_DOMAIN
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import SearchFeedbackType
@@ -254,7 +255,9 @@ def _build_documents_blocks(
message_id: int | None,
num_docs_to_display: int = ONYX_BOT_NUM_DOCS_TO_DISPLAY,
) -> list[Block]:
header_text = "Reference Documents"
header_text = (
"Retrieved Documents" if DISABLE_GENERATIVE_AI else "Reference Documents"
)
seen_docs_identifiers = set()
section_blocks: list[Block] = [HeaderBlock(text=header_text)]
included_docs = 0

View File

@@ -34,8 +34,10 @@ from onyx.configs.onyxbot_configs import (
from onyx.connectors.slack.utils import SlackTextCleaner
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.users import get_user_by_email
from onyx.llm.exceptions import GenAIDisabledException
from onyx.llm.factory import get_default_llms
from onyx.llm.utils import llm_response_to_string
from onyx.llm.utils import dict_based_prompt_to_langchain_prompt
from onyx.llm.utils import message_to_string
from onyx.onyxbot.slack.constants import FeedbackVisibility
from onyx.onyxbot.slack.models import ChannelType
from onyx.onyxbot.slack.models import ThreadMessage
@@ -141,9 +143,24 @@ def check_message_limit() -> bool:
def rephrase_slack_message(msg: str) -> str:
llm, _ = get_default_llms(timeout=5)
prompt = SLACK_LANGUAGE_REPHRASE_PROMPT.format(query=msg)
model_output = llm_response_to_string(llm.invoke(prompt))
def _get_rephrase_message() -> list[dict[str, str]]:
messages = [
{
"role": "user",
"content": SLACK_LANGUAGE_REPHRASE_PROMPT.format(query=msg),
},
]
return messages
try:
llm, _ = get_default_llms(timeout=5)
except GenAIDisabledException:
logger.warning("Unable to rephrase Slack user message, Gen AI disabled")
return msg
messages = _get_rephrase_message()
filled_llm_prompt = dict_based_prompt_to_langchain_prompt(messages)
model_output = message_to_string(llm.invoke_langchain(filled_llm_prompt))
logger.debug(model_output)
return model_output
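Illustrative input/output for the rewritten helper (the model's answer is hypothetical):

# rephrase_slack_message("hey can u pull up the q3 numbers")
# -> "Can you show me the Q3 financial results?"
# If Gen AI is disabled, the original message is returned unchanged.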

View File

@@ -5,7 +5,7 @@ TOOL_SECTION_HEADER = "\n\n# Tools\n"
# This section is included if there are search type tools, currently internal_search and web_search
TOOL_DESCRIPTION_SEARCH_GUIDANCE = """
For questions that can be fully answered from existing knowledge which is unlikely to change, answer the user directly without using any tools. When there is ambiguity, default to searching to get more context.
For knowledge that you already have and that is unlikely to change, answer the user directly without using any tools.
When using any search type tool, do not make any assumptions and stay as faithful to the user's query as possible. Between internal and web search, think about whether the user's query is likely better answered by team internal sources or online web pages. \
For queries that are short phrases, ambiguous/unclear, or keyword heavy, prioritize internal search. If ambiguous, prioritize internal search.
@@ -21,7 +21,6 @@ Use the `internal_search` tool to search connected applications for information.
- Niche/Specific information: information that is likely not found in public sources, things specific to a project or product, team, process, etc.
- Keyword Queries: queries that are heavily keyword based are often internal document search queries.
- Ambiguity: questions about something that is not widely known or understood.
Never provide more than 3 queries at once to `internal_search`.
"""

View File

@@ -30,7 +30,6 @@ class RedisConnectorDelete:
PREFIX = "connectordeletion"
FENCE_PREFIX = f"{PREFIX}_fence" # "connectordeletion_fence"
FENCE_TTL = 7 * 24 * 60 * 60 # 7 days - defensive TTL to prevent memory leaks
TASKSET_PREFIX = f"{PREFIX}_taskset" # "connectordeletion_taskset"
# used to signal the overall workflow is still active
@@ -79,7 +78,7 @@ class RedisConnectorDelete:
self.redis.delete(self.fence_key)
return
self.redis.set(self.fence_key, payload.model_dump_json(), ex=self.FENCE_TTL)
self.redis.set(self.fence_key, payload.model_dump_json())
self.redis.sadd(OnyxRedisConstants.ACTIVE_FENCES, self.fence_key)
def set_active(self) -> None:
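For context on the removed ex= argument above, a minimal redis-py sketch (key and value are illustrative); the same change recurs in the sync, prune, and stop helpers below:

import redis

r = redis.Redis()
# Old behavior: the fence key auto-expired after the defensive 7-day TTL.
r.set("connectordeletion_fence_42", "{}", ex=7 * 24 * 60 * 60)
# New behavior: the key persists until explicitly deleted.
r.set("connectordeletion_fence_42", "{}")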

View File

@@ -43,7 +43,6 @@ class RedisConnectorPermissionSync:
PREFIX = "connectordocpermissionsync"
FENCE_PREFIX = f"{PREFIX}_fence"
FENCE_TTL = 7 * 24 * 60 * 60 # 7 days - defensive TTL to prevent memory leaks
# phase 1 - generator task and progress signals
GENERATORTASK_PREFIX = f"{PREFIX}+generator" # connectordocpermissionsync+generator
@@ -127,7 +126,7 @@ class RedisConnectorPermissionSync:
self.redis.delete(self.fence_key)
return
self.redis.set(self.fence_key, payload.model_dump_json(), ex=self.FENCE_TTL)
self.redis.set(self.fence_key, payload.model_dump_json())
self.redis.sadd(OnyxRedisConstants.ACTIVE_FENCES, self.fence_key)
def set_active(self) -> None:
@@ -163,7 +162,7 @@ class RedisConnectorPermissionSync:
self.redis.delete(self.generator_complete_key)
return
self.redis.set(self.generator_complete_key, payload, ex=self.FENCE_TTL)
self.redis.set(self.generator_complete_key, payload)
def update_db(
self,

View File

@@ -25,7 +25,6 @@ class RedisConnectorExternalGroupSync:
PREFIX = "connectorexternalgroupsync"
FENCE_PREFIX = f"{PREFIX}_fence"
FENCE_TTL = 7 * 24 * 60 * 60 # 7 days - defensive TTL to prevent memory leaks
# phase 1 - generator task and progress signals
GENERATORTASK_PREFIX = f"{PREFIX}+generator" # connectorexternalgroupsync+generator
@@ -111,7 +110,7 @@ class RedisConnectorExternalGroupSync:
self.redis.delete(self.fence_key)
return
self.redis.set(self.fence_key, payload.model_dump_json(), ex=self.FENCE_TTL)
self.redis.set(self.fence_key, payload.model_dump_json())
self.redis.sadd(OnyxRedisConstants.ACTIVE_FENCES, self.fence_key)
def set_active(self) -> None:
@@ -148,7 +147,7 @@ class RedisConnectorExternalGroupSync:
self.redis.delete(self.generator_complete_key)
return
self.redis.set(self.generator_complete_key, payload, ex=self.FENCE_TTL)
self.redis.set(self.generator_complete_key, payload)
def generate_tasks(
self,

View File

@@ -33,7 +33,6 @@ class RedisConnectorPrune:
PREFIX = "connectorpruning"
FENCE_PREFIX = f"{PREFIX}_fence"
FENCE_TTL = 7 * 24 * 60 * 60 # 7 days - defensive TTL to prevent memory leaks
# phase 1 - generator task and progress signals
GENERATORTASK_PREFIX = f"{PREFIX}+generator" # connectorpruning+generator
@@ -116,7 +115,7 @@ class RedisConnectorPrune:
self.redis.delete(self.fence_key)
return
self.redis.set(self.fence_key, payload.model_dump_json(), ex=self.FENCE_TTL)
self.redis.set(self.fence_key, payload.model_dump_json())
self.redis.sadd(OnyxRedisConstants.ACTIVE_FENCES, self.fence_key)
def set_active(self) -> None:
@@ -149,7 +148,7 @@ class RedisConnectorPrune:
self.redis.delete(self.generator_complete_key)
return
self.redis.set(self.generator_complete_key, payload, ex=self.FENCE_TTL)
self.redis.set(self.generator_complete_key, payload)
def generate_tasks(
self,

View File

@@ -7,7 +7,6 @@ class RedisConnectorStop:
PREFIX = "connectorstop"
FENCE_PREFIX = f"{PREFIX}_fence"
FENCE_TTL = 7 * 24 * 60 * 60 # 7 days - defensive TTL to prevent memory leaks
# if this timeout is exceeded, the caller may decide to take more
# drastic measures
@@ -31,7 +30,7 @@ class RedisConnectorStop:
self.redis.delete(self.fence_key)
return
self.redis.set(self.fence_key, 0, ex=self.FENCE_TTL)
self.redis.set(self.fence_key, 0)
@property
def timed_out(self) -> bool:

View File

@@ -21,7 +21,6 @@ from onyx.redis.redis_object_helper import RedisObjectHelper
class RedisDocumentSet(RedisObjectHelper):
PREFIX = "documentset"
FENCE_PREFIX = PREFIX + "_fence"
FENCE_TTL = 7 * 24 * 60 * 60 # 7 days - defensive TTL to prevent memory leaks
TASKSET_PREFIX = PREFIX + "_taskset"
def __init__(self, tenant_id: str, id: int) -> None:
@@ -37,7 +36,7 @@ class RedisDocumentSet(RedisObjectHelper):
self.redis.delete(self.fence_key)
return
self.redis.set(self.fence_key, payload, ex=self.FENCE_TTL)
self.redis.set(self.fence_key, payload)
self.redis.sadd(OnyxRedisConstants.ACTIVE_FENCES, self.fence_key)
@property

View File

@@ -22,7 +22,6 @@ from onyx.utils.variable_functionality import global_version
class RedisUserGroup(RedisObjectHelper):
PREFIX = "usergroup"
FENCE_PREFIX = PREFIX + "_fence"
FENCE_TTL = 7 * 24 * 60 * 60 # 7 days - defensive TTL to prevent memory leaks
TASKSET_PREFIX = PREFIX + "_taskset"
def __init__(self, tenant_id: str, id: int) -> None:
@@ -41,7 +40,7 @@ class RedisUserGroup(RedisObjectHelper):
self.redis.delete(self.fence_key)
return
self.redis.set(self.fence_key, payload, ex=self.FENCE_TTL)
self.redis.set(self.fence_key, payload)
self.redis.sadd(OnyxRedisConstants.ACTIVE_FENCES, self.fence_key)
@property

View File

@@ -1,5 +1,7 @@
from onyx.llm.exceptions import GenAIDisabledException
from onyx.llm.factory import get_default_llms
from onyx.llm.utils import llm_response_to_string
from onyx.llm.utils import dict_based_prompt_to_langchain_prompt
from onyx.llm.utils import message_to_string
from onyx.prompts.answer_validation import ANSWER_VALIDITY_PROMPT
from onyx.utils.logger import setup_logger
from onyx.utils.timing import log_function_time
@@ -12,15 +14,46 @@ def get_answer_validity(
query: str,
answer: str,
) -> bool:
def _get_answer_validation_messages(
query: str, answer: str
) -> list[dict[str, str]]:
# The COT block below is unused; keeping it for reference. Chain of Thought here significantly increases the time to
# answer, and we can get most of the way there by just having the model evaluate each individual condition with
# a single True/False.
# cot_block = (
# f"{THOUGHT_PAT} Use this as a scratchpad to write out in a step by step manner your reasoning "
# f"about EACH criterion to ensure that your conclusion is correct. "
# f"Be brief when evaluating each condition.\n"
# f"{FINAL_ANSWER_PAT} Valid or Invalid"
# )
messages = [
{
"role": "user",
"content": ANSWER_VALIDITY_PROMPT.format(
user_query=query, llm_answer=answer
),
},
]
return messages
def _extract_validity(model_output: str) -> bool:
if model_output.strip().strip("```").strip().split()[-1].lower() == "invalid":
return False
return True # If something is wrong, let's not toss away the answer
llm, _ = get_default_llms()
try:
llm, _ = get_default_llms()
except GenAIDisabledException:
return True
prompt = ANSWER_VALIDITY_PROMPT.format(user_query=query, llm_answer=answer)
model_output = llm_response_to_string(llm.invoke(prompt))
if not answer:
return False
messages = _get_answer_validation_messages(query, answer)
filled_llm_prompt = dict_based_prompt_to_langchain_prompt(messages)
model_output = message_to_string(llm.invoke_langchain(filled_llm_prompt))
logger.debug(model_output)
validity = _extract_validity(model_output)
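The extraction rule above, illustrated (inputs are made up):

assert _extract_validity("Valid") is True
assert _extract_validity("```\nInvalid\n```") is False
# Unparseable output is treated as valid so a good answer is never discarded.
assert _extract_validity("some unexpected output") is True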

View File

@@ -3,7 +3,8 @@ from onyx.configs.chat_configs import LANGUAGE_CHAT_NAMING_HINT
from onyx.db.models import ChatMessage
from onyx.db.search_settings import get_multilingual_expansion
from onyx.llm.interfaces import LLM
from onyx.llm.utils import llm_response_to_string
from onyx.llm.utils import dict_based_prompt_to_langchain_prompt
from onyx.llm.utils import message_to_string
from onyx.prompts.chat_prompts import CHAT_NAMING
from onyx.utils.logger import setup_logger
@@ -25,13 +26,19 @@ def get_renamed_conversation_name(
else ""
)
prompt = CHAT_NAMING.format(
language_hint_or_empty=language_hint, chat_history=history_str
)
prompt_msgs = [
{
"role": "user",
"content": CHAT_NAMING.format(
language_hint_or_empty=language_hint, chat_history=history_str
),
},
]
new_name_raw = llm_response_to_string(llm.invoke(prompt))
filled_llm_prompt = dict_based_prompt_to_langchain_prompt(prompt_msgs)
new_name_raw = message_to_string(llm.invoke_langchain(filled_llm_prompt))
new_name = new_name_raw.strip().strip('"')
new_name = new_name_raw.strip().strip(' "')
logger.debug(f"New Session Name: {new_name}")

View File

@@ -2,7 +2,8 @@ from collections.abc import Callable
from onyx.configs.chat_configs import DISABLE_LLM_DOC_RELEVANCE
from onyx.llm.interfaces import LLM
from onyx.llm.utils import llm_response_to_string
from onyx.llm.utils import dict_based_prompt_to_langchain_prompt
from onyx.llm.utils import message_to_string
from onyx.prompts.llm_chunk_filter import NONUSEFUL_PAT
from onyx.prompts.llm_chunk_filter import SECTION_FILTER_PROMPT
from onyx.utils.logger import setup_logger
@@ -25,13 +26,20 @@ def llm_eval_section(
metadata_str += f"{key} - {value_str}\n"
return metadata_str
metadata_str = _get_metadata_str(metadata) if metadata else ""
prompt = SECTION_FILTER_PROMPT.format(
title=title.replace("\n", " "),
chunk_text=section_content,
user_query=query,
optional_metadata=metadata_str,
)
def _get_usefulness_messages() -> list[dict[str, str]]:
metadata_str = _get_metadata_str(metadata) if metadata else ""
messages = [
{
"role": "user",
"content": SECTION_FILTER_PROMPT.format(
title=title.replace("\n", " "),
chunk_text=section_content,
user_query=query,
optional_metadata=metadata_str,
),
},
]
return messages
def _extract_usefulness(model_output: str) -> bool:
"""Default useful if the LLM doesn't match pattern exactly
@@ -40,7 +48,9 @@ def llm_eval_section(
return False
return True
model_output = llm_response_to_string(llm.invoke(prompt))
messages = _get_usefulness_messages()
filled_llm_prompt = dict_based_prompt_to_langchain_prompt(messages)
model_output = message_to_string(llm.invoke_langchain(filled_llm_prompt))
# NOTE(rkuo): all this does is print "Yes useful" or "Not useful"
# disabling because it's spammy; restore and give more context if this is needed

View File

@@ -5,7 +5,7 @@ from onyx.context.search.models import ContextExpansionType
from onyx.context.search.models import InferenceChunk
from onyx.context.search.models import InferenceSection
from onyx.llm.interfaces import LLM
from onyx.llm.models import ReasoningEffort
from onyx.llm.message_types import UserMessage
from onyx.prompts.search_prompts import DOCUMENT_CONTEXT_SELECTION_PROMPT
from onyx.prompts.search_prompts import DOCUMENT_SELECTION_PROMPT
from onyx.tools.tool_implementations.search.constants import (
@@ -116,12 +116,19 @@ def classify_section_relevance(
user_query=user_query,
)
user_msg: UserMessage = {
"role": "user",
"content": prompt_text,
}
messages = [user_msg]
# Default to MAIN_SECTION_ONLY
default_classification = ContextExpansionType.MAIN_SECTION_ONLY
# Call LLM for classification
try:
response = llm.invoke(prompt=prompt_text, reasoning_effort=ReasoningEffort.OFF)
response = llm.invoke(prompt=messages)
llm_response = response.choice.message.content
if not llm_response:
@@ -253,9 +260,16 @@ def select_sections_for_expansion(
user_query=user_query,
)
user_msg: UserMessage = {
"role": "user",
"content": prompt_text,
}
messages = [user_msg]
# Call LLM for selection
try:
response = llm.invoke(prompt=prompt_text, reasoning_effort=ReasoningEffort.OFF)
response = llm.invoke(prompt=messages)
llm_response = response.choice.message.content
if not llm_response:

View File

@@ -1,14 +1,15 @@
from collections.abc import Callable
from onyx.configs.constants import MessageType
from onyx.llm.exceptions import GenAIDisabledException
from onyx.llm.factory import get_default_llms
from onyx.llm.interfaces import LLM
from onyx.llm.models import AssistantMessage
from onyx.llm.models import ChatCompletionMessage
from onyx.llm.models import ReasoningEffort
from onyx.llm.models import SystemMessage
from onyx.llm.models import UserMessage
from onyx.llm.utils import llm_response_to_string
from onyx.llm.message_types import AssistantMessage
from onyx.llm.message_types import ChatCompletionMessage
from onyx.llm.message_types import SystemMessage
from onyx.llm.message_types import UserMessage
from onyx.llm.utils import dict_based_prompt_to_langchain_prompt
from onyx.llm.utils import message_to_string
from onyx.prompts.miscellaneous_prompts import LANGUAGE_REPHRASE_PROMPT
from onyx.prompts.prompt_utils import get_current_llm_day_time
from onyx.prompts.search_prompts import KEYWORD_REPHRASE_SYSTEM_PROMPT
@@ -18,6 +19,7 @@ from onyx.prompts.search_prompts import SEMANTIC_QUERY_REPHRASE_SYSTEM_PROMPT
from onyx.prompts.search_prompts import SEMANTIC_QUERY_REPHRASE_USER_PROMPT
from onyx.tools.models import ChatMinimalTextMessage
from onyx.utils.logger import setup_logger
from onyx.utils.text_processing import count_punctuation
from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
logger = setup_logger()
@@ -59,10 +61,16 @@ def _build_message_history(
for msg in history:
if msg.message_type == MessageType.USER:
user_msg = UserMessage(content=msg.message)
user_msg: UserMessage = {
"role": "user",
"content": msg.message,
}
messages.append(user_msg)
elif msg.message_type == MessageType.ASSISTANT:
assistant_msg = AssistantMessage(content=msg.message)
assistant_msg: AssistantMessage = {
"role": "assistant",
"content": msg.message,
}
messages.append(assistant_msg)
return messages
@@ -116,26 +124,29 @@ def semantic_query_rephrase(
)
# Build system message with current date
system_msg = SystemMessage(
content=SEMANTIC_QUERY_REPHRASE_SYSTEM_PROMPT.format(
system_msg: SystemMessage = {
"role": "system",
"content": SEMANTIC_QUERY_REPHRASE_SYSTEM_PROMPT.format(
current_date=current_datetime_str
)
)
),
}
# Convert chat history to message format (excluding the last user message and everything after it)
messages: list[ChatCompletionMessage] = [system_msg]
messages.extend(_build_message_history(history[:last_user_message_idx]))
# Add the last message as the user prompt with instructions
final_user_msg = UserMessage(
content=SEMANTIC_QUERY_REPHRASE_USER_PROMPT.format(
additional_context=additional_context, user_query=user_query
)
)
final_user_msg: UserMessage = {
"role": "user",
"content": SEMANTIC_QUERY_REPHRASE_USER_PROMPT.format(
additional_context=additional_context,
user_query=user_query,
),
}
messages.append(final_user_msg)
# Call LLM and return result
response = llm.invoke(prompt=messages, reasoning_effort=ReasoningEffort.OFF)
response = llm.invoke(prompt=messages)
final_query = response.choice.message.content
@@ -195,24 +206,29 @@ def keyword_query_expansion(
)
# Build system message with current date
system_msg = SystemMessage(
content=KEYWORD_REPHRASE_SYSTEM_PROMPT.format(current_date=current_datetime_str)
)
system_msg: SystemMessage = {
"role": "system",
"content": KEYWORD_REPHRASE_SYSTEM_PROMPT.format(
current_date=current_datetime_str
),
}
# Convert chat history to message format (excluding the last user message and everything after it)
messages: list[ChatCompletionMessage] = [system_msg]
messages.extend(_build_message_history(history[:last_user_message_idx]))
# Add the last message as the user prompt with instructions
final_user_msg = UserMessage(
content=KEYWORD_REPHRASE_USER_PROMPT.format(
additional_context=additional_context, user_query=user_query
)
)
final_user_msg: UserMessage = {
"role": "user",
"content": KEYWORD_REPHRASE_USER_PROMPT.format(
additional_context=additional_context,
user_query=user_query,
),
}
messages.append(final_user_msg)
# Call LLM and return result
response = llm.invoke(prompt=messages, reasoning_effort=ReasoningEffort.OFF)
response = llm.invoke(prompt=messages)
content = response.choice.message.content
# Parse the response - each line is a separate keyword query
@@ -224,12 +240,29 @@ def keyword_query_expansion(
def llm_multilingual_query_expansion(query: str, language: str) -> str:
_, fast_llm = get_default_llms(timeout=5)
def _get_rephrase_messages() -> list[dict[str, str]]:
messages = [
{
"role": "user",
"content": LANGUAGE_REPHRASE_PROMPT.format(
query=query, target_language=language
),
},
]
prompt = LANGUAGE_REPHRASE_PROMPT.format(query=query, target_language=language)
model_output = llm_response_to_string(
fast_llm.invoke(prompt, reasoning_effort=ReasoningEffort.OFF)
)
return messages
try:
_, fast_llm = get_default_llms(timeout=5)
except GenAIDisabledException:
logger.warning(
"Unable to perform multilingual query expansion, Gen AI disabled"
)
return query
messages = _get_rephrase_messages()
filled_llm_prompt = dict_based_prompt_to_langchain_prompt(messages)
model_output = message_to_string(fast_llm.invoke_langchain(filled_llm_prompt))
logger.debug(model_output)
return model_output
@@ -255,3 +288,75 @@ def multilingual_query_expansion(
llm_multilingual_query_expansion(query, language) for language in languages
]
return query_rephrases
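Illustrative only, assuming the (query, languages) parameters implied by the comprehension above; the translations are hypothetical model outputs:

# multilingual_query_expansion("quarterly revenue", ["French", "German"])
# -> ["revenus trimestriels", "Quartalsumsatz"]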
# The stuff below is old and should be retired
OLD_HISTORY_QUERY_REPHRASE = """
Given the following conversation and a follow up input, rephrase the follow up into a SHORT, \
standalone query (which captures any relevant context from previous messages) for a vectorstore.
IMPORTANT: EDIT THE QUERY TO BE AS CONCISE AS POSSIBLE. Respond with a short, compressed phrase \
with mainly keywords instead of a complete sentence.
If there is a clear change in topic, disregard the previous messages.
Strip out any information that is not relevant for the retrieval task.
If the follow up message is an error or code snippet, repeat the same input back EXACTLY.
Chat History:
--------------
{chat_history}
--------------
Follow Up Input: {question}
Standalone question (Respond with only the short combined query):
""".strip()
def get_contextual_rephrase_messages(
question: str,
history_str: str,
prompt_template: str = OLD_HISTORY_QUERY_REPHRASE,
) -> list[dict[str, str]]:
messages = [
{
"role": "user",
"content": prompt_template.format(
question=question, chat_history=history_str
),
},
]
return messages
def thread_based_query_rephrase(
user_query: str,
history_str: str,
llm: LLM | None = None,
size_heuristic: int = 200,
punctuation_heuristic: int = 10,
) -> str:
if not history_str:
return user_query
if len(user_query) >= size_heuristic:
return user_query
if count_punctuation(user_query) >= punctuation_heuristic:
return user_query
if llm is None:
try:
llm, _ = get_default_llms()
except GenAIDisabledException:
# If Generative AI is turned off, just return the original query
return user_query
prompt_msgs = get_contextual_rephrase_messages(
question=user_query, history_str=history_str
)
filled_llm_prompt = dict_based_prompt_to_langchain_prompt(prompt_msgs)
rephrased_query = message_to_string(llm.invoke_langchain(filled_llm_prompt))
logger.debug(f"Rephrased combined query: {rephrased_query}")
return rephrased_query
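A hedged usage sketch of the heuristics above; the thread history and result are invented:

# Short, low-punctuation follow-ups get rephrased against the thread history;
# long or punctuation-heavy inputs are returned as-is.
rephrased = thread_based_query_rephrase(
    user_query="what about for EU customers?",
    history_str="user: How many US customers do we have?\nassistant: About 1,200.",
)
# e.g. -> "EU customer count"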

Some files were not shown because too many files have changed in this diff.