Compare commits


1 Commit

Author: Richard Kuo (Onyx)
SHA1: 54e61611c5
Message: prototype for surfacing docs without a query
Date: 2025-03-27 16:52:31 -07:00
387 changed files with 10706 additions and 28389 deletions

View File

@@ -1,209 +0,0 @@
name: Run MIT Integration Tests v2
concurrency:
group: Run-MIT-Integration-Tests-${{ github.workflow }}-${{ github.head_ref || github.event.workflow_run.head_branch || github.run_id }}
cancel-in-progress: true
on:
merge_group:
pull_request:
branches:
- main
- "release/**"
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
jobs:
integration-tests-mit:
# See https://runs-on.com/runners/linux/
runs-on: [runs-on, runner=32cpu-linux-x64, "run-id=${{ github.run_id }}"]
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
# tag every docker image with "test" so that we can spin up the correct set
# of images during testing
# We don't need to build the Web Docker image since it's not yet used
# in the integration tests. We have a separate action to verify that it builds
# successfully.
- name: Pull Web Docker image
run: |
docker pull onyxdotapp/onyx-web-server:latest
docker tag onyxdotapp/onyx-web-server:latest onyxdotapp/onyx-web-server:test
# we use the runs-on cache for docker builds
# in conjunction with runs-on runners, it has better speed and unlimited caching
# https://runs-on.com/caching/s3-cache-for-github-actions/
# https://runs-on.com/caching/docker/
# https://github.com/moby/buildkit#s3-cache-experimental
# images are built and run locally for testing purposes. Not pushed.
- name: Build Backend Docker image
uses: ./.github/actions/custom-build-and-push
with:
context: ./backend
file: ./backend/Dockerfile
platforms: linux/amd64
tags: onyxdotapp/onyx-backend:test
push: false
load: true
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
- name: Build Model Server Docker image
uses: ./.github/actions/custom-build-and-push
with:
context: ./backend
file: ./backend/Dockerfile.model_server
platforms: linux/amd64
tags: onyxdotapp/onyx-model-server:test
push: false
load: true
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
- name: Build integration test Docker image
uses: ./.github/actions/custom-build-and-push
with:
context: ./backend
file: ./backend/tests/integration/Dockerfile
platforms: linux/amd64
tags: onyxdotapp/onyx-integration:test
push: false
load: true
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
# NOTE: Use pre-ping/null pool to reduce flakiness due to dropped connections
- name: Start Docker containers
run: |
cd deployment/docker_compose
AUTH_TYPE=basic \
POSTGRES_POOL_PRE_PING=true \
POSTGRES_USE_NULL_POOL=true \
REQUIRE_EMAIL_VERIFICATION=false \
DISABLE_TELEMETRY=true \
IMAGE_TAG=test \
INTEGRATION_TESTS_MODE=true \
docker compose -f docker-compose.dev.yml -p onyx-stack up -d
id: start_docker
- name: Wait for service to be ready
run: |
echo "Starting wait-for-service script..."
docker logs -f onyx-stack-api_server-1 &
start_time=$(date +%s)
timeout=300 # 5 minutes in seconds
while true; do
current_time=$(date +%s)
elapsed_time=$((current_time - start_time))
if [ $elapsed_time -ge $timeout ]; then
echo "Timeout reached. Service did not become ready in 5 minutes."
exit 1
fi
# Use curl with error handling to ignore specific exit code 56
response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/health || echo "curl_error")
if [ "$response" = "200" ]; then
echo "Service is ready!"
break
elif [ "$response" = "curl_error" ]; then
echo "Curl encountered an error, possibly exit code 56. Continuing to retry..."
else
echo "Service not ready yet (HTTP status $response). Retrying in 5 seconds..."
fi
sleep 5
done
echo "Finished waiting for service."
- name: Start Mock Services
run: |
cd backend/tests/integration/mock_services
docker compose -f docker-compose.mock-it-services.yml \
-p mock-it-services-stack up -d
# NOTE: Use pre-ping/null pool to reduce flakiness due to dropped connections
- name: Run Standard Integration Tests
run: |
echo "Running integration tests..."
docker run --rm --network onyx-stack_default \
--name test-runner \
-e POSTGRES_HOST=relational_db \
-e POSTGRES_USER=postgres \
-e POSTGRES_PASSWORD=password \
-e POSTGRES_DB=postgres \
-e POSTGRES_POOL_PRE_PING=true \
-e POSTGRES_USE_NULL_POOL=true \
-e VESPA_HOST=index \
-e REDIS_HOST=cache \
-e API_SERVER_HOST=api_server \
-e OPENAI_API_KEY=${OPENAI_API_KEY} \
-e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN} \
-e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
-e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
-e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
-e TEST_WEB_HOSTNAME=test-runner \
-e MOCK_CONNECTOR_SERVER_HOST=mock_connector_server \
-e MOCK_CONNECTOR_SERVER_PORT=8001 \
onyxdotapp/onyx-integration:test \
/app/tests/integration/tests \
/app/tests/integration/connector_job_tests
continue-on-error: true
id: run_tests
- name: Check test results
run: |
if [ ${{ steps.run_tests.outcome }} == 'failure' ]; then
echo "Integration tests failed. Exiting with error."
exit 1
else
echo "All integration tests passed successfully."
fi
# ------------------------------------------------------------
# Always gather logs BEFORE "down":
- name: Dump API server logs
if: always()
run: |
cd deployment/docker_compose
docker compose -f docker-compose.dev.yml -p onyx-stack logs --no-color api_server > $GITHUB_WORKSPACE/api_server.log || true
- name: Dump all-container logs (optional)
if: always()
run: |
cd deployment/docker_compose
docker compose -f docker-compose.dev.yml -p onyx-stack logs --no-color > $GITHUB_WORKSPACE/docker-compose.log || true
- name: Upload logs
if: always()
uses: actions/upload-artifact@v4
with:
name: docker-all-logs
path: ${{ github.workspace }}/docker-compose.log
# ------------------------------------------------------------
- name: Stop Docker containers
if: always()
run: |
cd deployment/docker_compose
docker compose -f docker-compose.dev.yml -p onyx-stack down -v
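The workflow above sets `POSTGRES_POOL_PRE_PING=true` and `POSTGRES_USE_NULL_POOL=true` to reduce flakiness from dropped database connections. As a rough illustration only (the `build_engine` helper and the env-var parsing below are assumptions, not the Onyx backend code), those flags typically map onto SQLAlchemy engine options like this:

```
import os
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
from sqlalchemy.pool import NullPool


def build_engine(url: str) -> Engine:
    # Read both flags from the environment, mirroring how the workflow passes them in.
    pre_ping = os.environ.get("POSTGRES_POOL_PRE_PING", "false").lower() == "true"
    use_null_pool = os.environ.get("POSTGRES_USE_NULL_POOL", "false").lower() == "true"

    kwargs: dict = {"pool_pre_ping": pre_ping}  # ping a pooled connection before reusing it
    if use_null_pool:
        kwargs["poolclass"] = NullPool  # no pooling: fresh connection per checkout, so nothing goes stale
    return create_engine(url, **kwargs)


# Example (values taken from the test environment above):
# engine = build_engine("postgresql+psycopg2://postgres:password@relational_db:5432/postgres")
```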

View File

@@ -23,10 +23,6 @@ env:
# Jira
JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
GONG_ACCESS_KEY: ${{ secrets.GONG_ACCESS_KEY }}
GONG_ACCESS_KEY_SECRET: ${{ secrets.GONG_ACCESS_KEY_SECRET }}
# Google
GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR }}
GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1 }}

View File

@@ -6,419 +6,396 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"compounds": [
{
// Dummy entry used to label the group
"name": "--- Compound ---",
"configurations": ["--- Individual ---"],
"presentation": {
"group": "1"
}
},
{
"name": "Run All Onyx Services",
"configurations": [
"Web Server",
"Model Server",
"API Server",
"Slack Bot",
"Celery primary",
"Celery light",
"Celery heavy",
"Celery indexing",
"Celery user files indexing",
"Celery beat",
"Celery monitoring"
],
"presentation": {
"group": "1"
}
},
{
"name": "Web / Model / API",
"configurations": ["Web Server", "Model Server", "API Server"],
"presentation": {
"group": "1"
}
},
{
"name": "Celery (all)",
"configurations": [
"Celery primary",
"Celery light",
"Celery heavy",
"Celery indexing",
"Celery user files indexing",
"Celery beat",
"Celery monitoring"
],
"presentation": {
"group": "1"
}
}
{
// Dummy entry used to label the group
"name": "--- Compound ---",
"configurations": [
"--- Individual ---"
],
"presentation": {
"group": "1",
}
},
{
"name": "Run All Onyx Services",
"configurations": [
"Web Server",
"Model Server",
"API Server",
"Slack Bot",
"Celery primary",
"Celery light",
"Celery heavy",
"Celery indexing",
"Celery beat",
"Celery monitoring",
],
"presentation": {
"group": "1",
}
},
{
"name": "Web / Model / API",
"configurations": [
"Web Server",
"Model Server",
"API Server",
],
"presentation": {
"group": "1",
}
},
{
"name": "Celery (all)",
"configurations": [
"Celery primary",
"Celery light",
"Celery heavy",
"Celery indexing",
"Celery beat",
"Celery monitoring",
],
"presentation": {
"group": "1",
}
}
],
"configurations": [
{
// Dummy entry used to label the group
"name": "--- Individual ---",
"type": "node",
"request": "launch",
"presentation": {
"group": "2",
"order": 0
}
},
{
"name": "Web Server",
"type": "node",
"request": "launch",
"cwd": "${workspaceRoot}/web",
"runtimeExecutable": "npm",
"envFile": "${workspaceFolder}/.vscode/.env",
"runtimeArgs": ["run", "dev"],
"presentation": {
"group": "2"
{
// Dummy entry used to label the group
"name": "--- Individual ---",
"type": "node",
"request": "launch",
"presentation": {
"group": "2",
"order": 0
}
},
{
"name": "Web Server",
"type": "node",
"request": "launch",
"cwd": "${workspaceRoot}/web",
"runtimeExecutable": "npm",
"envFile": "${workspaceFolder}/.vscode/.env",
"runtimeArgs": [
"run", "dev"
],
"presentation": {
"group": "2",
},
"console": "integratedTerminal",
"consoleTitle": "Web Server Console"
},
"console": "integratedTerminal",
"consoleTitle": "Web Server Console"
},
{
"name": "Model Server",
"consoleName": "Model Server",
"type": "debugpy",
"request": "launch",
"module": "uvicorn",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1"
{
"name": "Model Server",
"consoleName": "Model Server",
"type": "debugpy",
"request": "launch",
"module": "uvicorn",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1"
},
"args": [
"model_server.main:app",
"--reload",
"--port",
"9000"
],
"presentation": {
"group": "2",
},
"consoleTitle": "Model Server Console"
},
"args": ["model_server.main:app", "--reload", "--port", "9000"],
"presentation": {
"group": "2"
{
"name": "API Server",
"consoleName": "API Server",
"type": "debugpy",
"request": "launch",
"module": "uvicorn",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_DANSWER_MODEL_INTERACTIONS": "True",
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1"
},
"args": [
"onyx.main:app",
"--reload",
"--port",
"8080"
],
"presentation": {
"group": "2",
},
"consoleTitle": "API Server Console"
},
"consoleTitle": "Model Server Console"
},
{
"name": "API Server",
"consoleName": "API Server",
"type": "debugpy",
"request": "launch",
"module": "uvicorn",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_DANSWER_MODEL_INTERACTIONS": "True",
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1"
// For the listener to access the Slack API,
// DANSWER_BOT_SLACK_APP_TOKEN & DANSWER_BOT_SLACK_BOT_TOKEN need to be set in .env file located in the root of the project
{
"name": "Slack Bot",
"consoleName": "Slack Bot",
"type": "debugpy",
"request": "launch",
"program": "onyx/onyxbot/slack/listener.py",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"presentation": {
"group": "2",
},
"consoleTitle": "Slack Bot Console"
},
"args": ["onyx.main:app", "--reload", "--port", "8080"],
"presentation": {
"group": "2"
{
"name": "Celery primary",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "INFO",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"args": [
"-A",
"onyx.background.celery.versioned_apps.primary",
"worker",
"--pool=threads",
"--concurrency=4",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=primary@%n",
"-Q",
"celery",
],
"presentation": {
"group": "2",
},
"consoleTitle": "Celery primary Console"
},
"consoleTitle": "API Server Console"
},
// For the listener to access the Slack API,
// DANSWER_BOT_SLACK_APP_TOKEN & DANSWER_BOT_SLACK_BOT_TOKEN need to be set in .env file located in the root of the project
{
"name": "Slack Bot",
"consoleName": "Slack Bot",
"type": "debugpy",
"request": "launch",
"program": "onyx/onyxbot/slack/listener.py",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
{
"name": "Celery light",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "INFO",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"args": [
"-A",
"onyx.background.celery.versioned_apps.light",
"worker",
"--pool=threads",
"--concurrency=64",
"--prefetch-multiplier=8",
"--loglevel=INFO",
"--hostname=light@%n",
"-Q",
"vespa_metadata_sync,connector_deletion,doc_permissions_upsert,checkpoint_cleanup",
],
"presentation": {
"group": "2",
},
"consoleTitle": "Celery light Console"
},
"presentation": {
"group": "2"
{
"name": "Celery heavy",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "INFO",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"args": [
"-A",
"onyx.background.celery.versioned_apps.heavy",
"worker",
"--pool=threads",
"--concurrency=4",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=heavy@%n",
"-Q",
"connector_pruning,connector_doc_permissions_sync,connector_external_group_sync",
],
"presentation": {
"group": "2",
},
"consoleTitle": "Celery heavy Console"
},
"consoleTitle": "Slack Bot Console"
},
{
"name": "Celery primary",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "INFO",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
{
"name": "Celery indexing",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"ENABLE_MULTIPASS_INDEXING": "false",
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"args": [
"-A",
"onyx.background.celery.versioned_apps.indexing",
"worker",
"--pool=threads",
"--concurrency=1",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=indexing@%n",
"-Q",
"connector_indexing",
],
"presentation": {
"group": "2",
},
"consoleTitle": "Celery indexing Console"
},
"args": [
"-A",
"onyx.background.celery.versioned_apps.primary",
"worker",
"--pool=threads",
"--concurrency=4",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=primary@%n",
"-Q",
"celery"
],
"presentation": {
"group": "2"
{
"name": "Celery monitoring",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {},
"args": [
"-A",
"onyx.background.celery.versioned_apps.monitoring",
"worker",
"--pool=solo",
"--concurrency=1",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=monitoring@%n",
"-Q",
"monitoring",
],
"presentation": {
"group": "2",
},
"consoleTitle": "Celery monitoring Console"
},
"consoleTitle": "Celery primary Console"
},
{
"name": "Celery light",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "INFO",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
{
"name": "Celery beat",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"args": [
"-A",
"onyx.background.celery.versioned_apps.beat",
"beat",
"--loglevel=INFO",
],
"presentation": {
"group": "2",
},
"consoleTitle": "Celery beat Console"
},
"args": [
"-A",
"onyx.background.celery.versioned_apps.light",
"worker",
"--pool=threads",
"--concurrency=64",
"--prefetch-multiplier=8",
"--loglevel=INFO",
"--hostname=light@%n",
"-Q",
"vespa_metadata_sync,connector_deletion,doc_permissions_upsert"
],
"presentation": {
"group": "2"
{
"name": "Pytest",
"consoleName": "Pytest",
"type": "debugpy",
"request": "launch",
"module": "pytest",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"args": [
"-v"
// Specify a specific module/test to run or provide nothing to run all tests
//"tests/unit/onyx/llm/answering/test_prune_and_merge.py"
],
"presentation": {
"group": "2",
},
"consoleTitle": "Pytest Console"
},
"consoleTitle": "Celery light Console"
},
{
"name": "Celery heavy",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "INFO",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
{
// Dummy entry used to label the group
"name": "--- Tasks ---",
"type": "node",
"request": "launch",
"presentation": {
"group": "3",
"order": 0
}
},
{
"name": "Clear and Restart External Volumes and Containers",
"type": "node",
"request": "launch",
"runtimeExecutable": "bash",
"runtimeArgs": ["${workspaceFolder}/backend/scripts/restart_containers.sh"],
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"stopOnEntry": true,
"presentation": {
"group": "3",
},
},
"args": [
"-A",
"onyx.background.celery.versioned_apps.heavy",
"worker",
"--pool=threads",
"--concurrency=4",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=heavy@%n",
"-Q",
"connector_pruning,connector_doc_permissions_sync,connector_external_group_sync"
],
"presentation": {
"group": "2"
{
// Celery jobs launched through a single background script (legacy)
// Recommend using the "Celery (all)" compound launch instead.
"name": "Background Jobs",
"consoleName": "Background Jobs",
"type": "debugpy",
"request": "launch",
"program": "scripts/dev_run_background_jobs.py",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_DANSWER_MODEL_INTERACTIONS": "True",
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
},
"consoleTitle": "Celery heavy Console"
},
{
"name": "Celery indexing",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"ENABLE_MULTIPASS_INDEXING": "false",
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
{
"name": "Install Python Requirements",
"type": "node",
"request": "launch",
"runtimeExecutable": "bash",
"runtimeArgs": [
"-c",
"pip install -r backend/requirements/default.txt && pip install -r backend/requirements/dev.txt && pip install -r backend/requirements/ee.txt && pip install -r backend/requirements/model_server.txt"
],
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"presentation": {
"group": "3"
}
},
"args": [
"-A",
"onyx.background.celery.versioned_apps.indexing",
"worker",
"--pool=threads",
"--concurrency=1",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=indexing@%n",
"-Q",
"connector_indexing"
],
"presentation": {
"group": "2"
},
"consoleTitle": "Celery indexing Console"
},
{
"name": "Celery monitoring",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {},
"args": [
"-A",
"onyx.background.celery.versioned_apps.monitoring",
"worker",
"--pool=solo",
"--concurrency=1",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=monitoring@%n",
"-Q",
"monitoring"
],
"presentation": {
"group": "2"
},
"consoleTitle": "Celery monitoring Console"
},
{
"name": "Celery beat",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"args": [
"-A",
"onyx.background.celery.versioned_apps.beat",
"beat",
"--loglevel=INFO"
],
"presentation": {
"group": "2"
},
"consoleTitle": "Celery beat Console"
},
{
"name": "Celery user files indexing",
"type": "debugpy",
"request": "launch",
"module": "celery",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"args": [
"-A",
"onyx.background.celery.versioned_apps.indexing",
"worker",
"--pool=threads",
"--concurrency=1",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=user_files_indexing@%n",
"-Q",
"user_files_indexing"
],
"presentation": {
"group": "2"
},
"consoleTitle": "Celery user files indexing Console"
},
{
"name": "Pytest",
"consoleName": "Pytest",
"type": "debugpy",
"request": "launch",
"module": "pytest",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"args": [
"-v"
// Specify a specific module/test to run or provide nothing to run all tests
//"tests/unit/onyx/llm/answering/test_prune_and_merge.py"
],
"presentation": {
"group": "2"
},
"consoleTitle": "Pytest Console"
},
{
// Dummy entry used to label the group
"name": "--- Tasks ---",
"type": "node",
"request": "launch",
"presentation": {
"group": "3",
"order": 0
}
},
{
"name": "Clear and Restart External Volumes and Containers",
"type": "node",
"request": "launch",
"runtimeExecutable": "bash",
"runtimeArgs": [
"${workspaceFolder}/backend/scripts/restart_containers.sh"
],
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"stopOnEntry": true,
"presentation": {
"group": "3"
}
},
{
// Celery jobs launched through a single background script (legacy)
// Recommend using the "Celery (all)" compound launch instead.
"name": "Background Jobs",
"consoleName": "Background Jobs",
"type": "debugpy",
"request": "launch",
"program": "scripts/dev_run_background_jobs.py",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"LOG_DANSWER_MODEL_INTERACTIONS": "True",
"LOG_LEVEL": "DEBUG",
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
}
},
{
"name": "Install Python Requirements",
"type": "node",
"request": "launch",
"runtimeExecutable": "bash",
"runtimeArgs": [
"-c",
"pip install -r backend/requirements/default.txt && pip install -r backend/requirements/dev.txt && pip install -r backend/requirements/ee.txt && pip install -r backend/requirements/model_server.txt"
],
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"presentation": {
"group": "3"
}
},
{
"name": "Debug React Web App in Chrome",
"type": "chrome",
"request": "launch",
"url": "http://localhost:3000",
"webRoot": "${workspaceFolder}/web"
}
]
}
}

View File

@@ -30,26 +30,30 @@ Keep knowledge and access controls sync-ed across over 40 connectors like Google
Create custom AI agents with unique prompts, knowledge, and actions that the agents can take.
Onyx can be deployed securely anywhere and for any scale - on a laptop, on-premise, or to cloud.
<h3>Feature Highlights</h3>
**Deep research over your team's knowledge:**
https://private-user-images.githubusercontent.com/32520769/414509312-48392e83-95d0-4fb5-8650-a396e05e0a32.mp4?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3Mzk5Mjg2MzYsIm5iZiI6MTczOTkyODMzNiwicGF0aCI6Ii8zMjUyMDc2OS80MTQ1MDkzMTItNDgzOTJlODMtOTVkMC00ZmI1LTg2NTAtYTM5NmUwNWUwYTMyLm1wND9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNTAyMTklMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjUwMjE5VDAxMjUzNlomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPWFhMzk5Njg2Y2Y5YjFmNDNiYTQ2YzM5ZTg5YWJiYTU2NWMyY2YwNmUyODE2NWUxMDRiMWQxZWJmODI4YTA0MTUmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0In0.a9D8A0sgKE9AoaoE-mfFbJ6_OKYeqaf7TZ4Han2JfW8
**Use Onyx as a secure AI Chat with any LLM:**
![Onyx Chat Silent Demo](https://github.com/onyx-dot-app/onyx/releases/download/v0.21.1/OnyxChatSilentDemo.gif)
**Easily set up connectors to your apps:**
![Onyx Connector Silent Demo](https://github.com/onyx-dot-app/onyx/releases/download/v0.21.1/OnyxConnectorSilentDemo.gif)
**Access Onyx where your team already works:**
![Onyx Bot Demo](https://github.com/onyx-dot-app/onyx/releases/download/v0.21.1/OnyxBot.png)
## Deployment
**To try it out for free and get started in seconds, check out [Onyx Cloud](https://cloud.onyx.app/signup)**.
Onyx can also be run locally (even on a laptop) or deployed on a virtual machine with a single
@@ -58,23 +62,23 @@ Onyx can also be run locally (even on a laptop) or deployed on a virtual machine
We also have built-in support for high-availability/scalable deployment on Kubernetes.
References [here](https://github.com/onyx-dot-app/onyx/tree/main/deployment).
## 🔍 Other Notable Benefits of Onyx
- Custom deep learning models for indexing and inference, available only through Onyx, that learn from user feedback.
- Flexible security features like SSO (OIDC/SAML/OAuth2), RBAC, encryption of credentials, etc.
- Knowledge curation features like document-sets, query history, usage analytics, etc.
- Scalable deployment options tested up to many tens of thousands of users and hundreds of millions of documents.
## 🚧 Roadmap
- New methods in information retrieval (StructRAG, LightGraphRAG, etc.)
- Personalized Search
- Organizational understanding and ability to locate and suggest experts from your team.
- Code Search
- SQL and Structured Query Language
## 🔌 Connectors
Keep knowledge and access in sync across 40+ connectors:
- Google Drive
@@ -95,65 +99,19 @@ Keep knowledge and access up to sync across 40+ connectors:
See the full list [here](https://docs.onyx.app/connectors).
## 📚 Licensing
There are two editions of Onyx:
- Onyx Community Edition (CE) is available freely under the MIT Expat license. Simply follow the Deployment guide above.
- Onyx Enterprise Edition (EE) includes extra features that are primarily useful for larger organizations.
For feature details, check out [our website](https://www.onyx.app/pricing).
To try the Onyx Enterprise Edition:
1. Check out [Onyx Cloud](https://cloud.onyx.app/signup).
2. For self-hosting the Enterprise Edition, contact us at [founders@onyx.app](mailto:founders@onyx.app) or book a call with us on our [Cal](https://cal.com/team/onyx/founders).
## 💡 Contributing
Looking to contribute? Please check out the [Contribution Guide](CONTRIBUTING.md) for more details.
# YC Company Twitter Scraper
A script that scrapes YC company pages and extracts Twitter/X.com links.
## Requirements
- Python 3.7+
- Playwright
## Installation
1. Install the required packages:
```
pip install -r requirements.txt
```
2. Install Playwright browsers:
```
playwright install
```
## Usage
Run the script with default settings:
```
python scrape_yc_twitter.py
```
This will scrape the YC companies from recent batches (W23, S23, S24, F24, S22, W22) and save the Twitter links to `twitter_links.txt`.
### Custom URL and Output
```
python scrape_yc_twitter.py --url "https://www.ycombinator.com/companies?batch=W24" --output "w24_twitter.txt"
```
## How it works
1. Navigates to the specified YC companies page
2. Scrolls down to load all company cards
3. Extracts links to individual company pages
4. Visits each company page and extracts Twitter/X.com links
5. Saves the results to a text file
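
A minimal sketch of that flow using Playwright's sync API is shown below. The CSS selector for company links and the `scrape_yc_twitter` helper are illustrative assumptions, not the actual `scrape_yc_twitter.py` implementation:

```
from playwright.sync_api import sync_playwright


def scrape_yc_twitter(url: str, output_path: str) -> None:
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url)

        # Scroll until no new company cards load
        previous_height = 0
        while True:
            page.mouse.wheel(0, 4000)
            page.wait_for_timeout(1000)
            height = page.evaluate("document.body.scrollHeight")
            if height == previous_height:
                break
            previous_height = height

        # Collect links to individual company pages (selector is an assumption)
        company_links = page.eval_on_selector_all(
            "a[href^='/companies/']", "els => els.map(e => e.href)"
        )

        # Visit each company page and keep Twitter/X.com links
        twitter_links = []
        for link in company_links:
            page.goto(link)
            hrefs = page.eval_on_selector_all("a", "els => els.map(e => e.href)")
            twitter_links.extend(h for h in hrefs if "twitter.com" in h or "x.com" in h)

        browser.close()

    with open(output_path, "w") as f:
        f.write("\n".join(sorted(set(twitter_links))))


if __name__ == "__main__":
    scrape_yc_twitter(
        "https://www.ycombinator.com/companies?batch=W24", "w24_twitter.txt"
    )
```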

View File

@@ -1,45 +0,0 @@
# YC Company Twitter Scraper
A script that scrapes YC company pages and extracts Twitter/X.com links.
## Requirements
- Python 3.7+
- Playwright
## Installation
1. Install the required packages:
```
pip install -r requirements.txt
```
2. Install Playwright browsers:
```
playwright install
```
## Usage
Run the script with default settings:
```
python scrape_yc_twitter.py
```
This will scrape the YC companies from recent batches (W23, S23, S24, F24, S22, W22) and save the Twitter links to `twitter_links.txt`.
### Custom URL and Output
```
python scrape_yc_twitter.py --url "https://www.ycombinator.com/companies?batch=W24" --output "w24_twitter.txt"
```
## How it works
1. Navigates to the specified YC companies page
2. Scrolls down to load all company cards
3. Extracts links to individual company pages
4. Visits each company page and extracts Twitter/X.com links
5. Saves the results to a text file

View File

@@ -46,7 +46,6 @@ WORKDIR /app
# Utils used by model server
COPY ./onyx/utils/logger.py /app/onyx/utils/logger.py
COPY ./onyx/utils/middleware.py /app/onyx/utils/middleware.py
# Place to fetch version information
COPY ./onyx/__init__.py /app/onyx/__init__.py

View File

@@ -1,50 +0,0 @@
"""update prompt length
Revision ID: 4794bc13e484
Revises: f7505c5b0284
Create Date: 2025-04-02 11:26:36.180328
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "4794bc13e484"
down_revision = "f7505c5b0284"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.alter_column(
"prompt",
"system_prompt",
existing_type=sa.TEXT(),
type_=sa.String(length=5000000),
existing_nullable=False,
)
op.alter_column(
"prompt",
"task_prompt",
existing_type=sa.TEXT(),
type_=sa.String(length=5000000),
existing_nullable=False,
)
def downgrade() -> None:
op.alter_column(
"prompt",
"system_prompt",
existing_type=sa.String(length=5000000),
type_=sa.TEXT(),
existing_nullable=False,
)
op.alter_column(
"prompt",
"task_prompt",
existing_type=sa.String(length=5000000),
type_=sa.TEXT(),
existing_nullable=False,
)

View File

@@ -1,117 +0,0 @@
"""duplicated no-harm user file migration
Revision ID: 6a804aeb4830
Revises: 8e1ac4f39a9f
Create Date: 2025-04-01 07:26:10.539362
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy import inspect
import datetime
# revision identifiers, used by Alembic.
revision = "6a804aeb4830"
down_revision = "8e1ac4f39a9f"
branch_labels = None
depends_on = None
def upgrade() -> None:
# Check if user_file table already exists
conn = op.get_bind()
inspector = inspect(conn)
if not inspector.has_table("user_file"):
# Create user_folder table without parent_id
op.create_table(
"user_folder",
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
sa.Column("name", sa.String(length=255), nullable=True),
sa.Column("description", sa.String(length=255), nullable=True),
sa.Column("display_priority", sa.Integer(), nullable=True, default=0),
sa.Column(
"created_at", sa.DateTime(timezone=True), server_default=sa.func.now()
),
)
# Create user_file table with folder_id instead of parent_folder_id
op.create_table(
"user_file",
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
sa.Column(
"folder_id",
sa.Integer(),
sa.ForeignKey("user_folder.id"),
nullable=True,
),
sa.Column("link_url", sa.String(), nullable=True),
sa.Column("token_count", sa.Integer(), nullable=True),
sa.Column("file_type", sa.String(), nullable=True),
sa.Column("file_id", sa.String(length=255), nullable=False),
sa.Column("document_id", sa.String(length=255), nullable=False),
sa.Column("name", sa.String(length=255), nullable=False),
sa.Column(
"created_at",
sa.DateTime(),
default=datetime.datetime.utcnow,
),
sa.Column(
"cc_pair_id",
sa.Integer(),
sa.ForeignKey("connector_credential_pair.id"),
nullable=True,
unique=True,
),
)
# Create persona__user_file table
op.create_table(
"persona__user_file",
sa.Column(
"persona_id",
sa.Integer(),
sa.ForeignKey("persona.id"),
primary_key=True,
),
sa.Column(
"user_file_id",
sa.Integer(),
sa.ForeignKey("user_file.id"),
primary_key=True,
),
)
# Create persona__user_folder table
op.create_table(
"persona__user_folder",
sa.Column(
"persona_id",
sa.Integer(),
sa.ForeignKey("persona.id"),
primary_key=True,
),
sa.Column(
"user_folder_id",
sa.Integer(),
sa.ForeignKey("user_folder.id"),
primary_key=True,
),
)
op.add_column(
"connector_credential_pair",
sa.Column("is_user_file", sa.Boolean(), nullable=True, default=False),
)
# Update existing records to have is_user_file=False instead of NULL
op.execute(
"UPDATE connector_credential_pair SET is_user_file = FALSE WHERE is_user_file IS NULL"
)
def downgrade() -> None:
pass

View File

@@ -1,50 +0,0 @@
"""enable contextual retrieval
Revision ID: 8e1ac4f39a9f
Revises: 9aadf32dfeb4
Create Date: 2024-12-20 13:29:09.918661
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "8e1ac4f39a9f"
down_revision = "9aadf32dfeb4"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.add_column(
"search_settings",
sa.Column(
"enable_contextual_rag",
sa.Boolean(),
nullable=False,
server_default="false",
),
)
op.add_column(
"search_settings",
sa.Column(
"contextual_rag_llm_name",
sa.String(),
nullable=True,
),
)
op.add_column(
"search_settings",
sa.Column(
"contextual_rag_llm_provider",
sa.String(),
nullable=True,
),
)
def downgrade() -> None:
op.drop_column("search_settings", "enable_contextual_rag")
op.drop_column("search_settings", "contextual_rag_llm_name")
op.drop_column("search_settings", "contextual_rag_llm_provider")

View File

@@ -1,113 +0,0 @@
"""add user files
Revision ID: 9aadf32dfeb4
Revises: 3781a5eb12cb
Create Date: 2025-01-26 16:08:21.551022
"""
import sqlalchemy as sa
import datetime
from alembic import op
# revision identifiers, used by Alembic.
revision = "9aadf32dfeb4"
down_revision = "3781a5eb12cb"
branch_labels = None
depends_on = None
def upgrade() -> None:
# Create user_folder table without parent_id
op.create_table(
"user_folder",
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
sa.Column("name", sa.String(length=255), nullable=True),
sa.Column("description", sa.String(length=255), nullable=True),
sa.Column("display_priority", sa.Integer(), nullable=True, default=0),
sa.Column(
"created_at", sa.DateTime(timezone=True), server_default=sa.func.now()
),
)
# Create user_file table with folder_id instead of parent_folder_id
op.create_table(
"user_file",
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
sa.Column(
"folder_id",
sa.Integer(),
sa.ForeignKey("user_folder.id"),
nullable=True,
),
sa.Column("link_url", sa.String(), nullable=True),
sa.Column("token_count", sa.Integer(), nullable=True),
sa.Column("file_type", sa.String(), nullable=True),
sa.Column("file_id", sa.String(length=255), nullable=False),
sa.Column("document_id", sa.String(length=255), nullable=False),
sa.Column("name", sa.String(length=255), nullable=False),
sa.Column(
"created_at",
sa.DateTime(),
default=datetime.datetime.utcnow,
),
sa.Column(
"cc_pair_id",
sa.Integer(),
sa.ForeignKey("connector_credential_pair.id"),
nullable=True,
unique=True,
),
)
# Create persona__user_file table
op.create_table(
"persona__user_file",
sa.Column(
"persona_id", sa.Integer(), sa.ForeignKey("persona.id"), primary_key=True
),
sa.Column(
"user_file_id",
sa.Integer(),
sa.ForeignKey("user_file.id"),
primary_key=True,
),
)
# Create persona__user_folder table
op.create_table(
"persona__user_folder",
sa.Column(
"persona_id", sa.Integer(), sa.ForeignKey("persona.id"), primary_key=True
),
sa.Column(
"user_folder_id",
sa.Integer(),
sa.ForeignKey("user_folder.id"),
primary_key=True,
),
)
op.add_column(
"connector_credential_pair",
sa.Column("is_user_file", sa.Boolean(), nullable=True, default=False),
)
# Update existing records to have is_user_file=False instead of NULL
op.execute(
"UPDATE connector_credential_pair SET is_user_file = FALSE WHERE is_user_file IS NULL"
)
def downgrade() -> None:
# Drop the persona__user_folder table
op.drop_table("persona__user_folder")
# Drop the persona__user_file table
op.drop_table("persona__user_file")
# Drop the user_file table
op.drop_table("user_file")
# Drop the user_folder table
op.drop_table("user_folder")
op.drop_column("connector_credential_pair", "is_user_file")

View File

@@ -1,50 +0,0 @@
"""add prompt length limit
Revision ID: f71470ba9274
Revises: 6a804aeb4830
Create Date: 2025-04-01 15:07:14.977435
"""
# revision identifiers, used by Alembic.
revision = "f71470ba9274"
down_revision = "6a804aeb4830"
branch_labels = None
depends_on = None
def upgrade() -> None:
# op.alter_column(
# "prompt",
# "system_prompt",
# existing_type=sa.TEXT(),
# type_=sa.String(length=8000),
# existing_nullable=False,
# )
# op.alter_column(
# "prompt",
# "task_prompt",
# existing_type=sa.TEXT(),
# type_=sa.String(length=8000),
# existing_nullable=False,
# )
pass
def downgrade() -> None:
# op.alter_column(
# "prompt",
# "system_prompt",
# existing_type=sa.String(length=8000),
# type_=sa.TEXT(),
# existing_nullable=False,
# )
# op.alter_column(
# "prompt",
# "task_prompt",
# existing_type=sa.String(length=8000),
# type_=sa.TEXT(),
# existing_nullable=False,
# )
pass

View File

@@ -1,77 +0,0 @@
"""updated constraints for ccpairs
Revision ID: f7505c5b0284
Revises: f71470ba9274
Create Date: 2025-04-01 17:50:42.504818
"""
from alembic import op
# revision identifiers, used by Alembic.
revision = "f7505c5b0284"
down_revision = "f71470ba9274"
branch_labels = None
depends_on = None
def upgrade() -> None:
# 1) Drop the old foreign-key constraints
op.drop_constraint(
"document_by_connector_credential_pair_connector_id_fkey",
"document_by_connector_credential_pair",
type_="foreignkey",
)
op.drop_constraint(
"document_by_connector_credential_pair_credential_id_fkey",
"document_by_connector_credential_pair",
type_="foreignkey",
)
# 2) Re-add them with ondelete='CASCADE'
op.create_foreign_key(
"document_by_connector_credential_pair_connector_id_fkey",
source_table="document_by_connector_credential_pair",
referent_table="connector",
local_cols=["connector_id"],
remote_cols=["id"],
ondelete="CASCADE",
)
op.create_foreign_key(
"document_by_connector_credential_pair_credential_id_fkey",
source_table="document_by_connector_credential_pair",
referent_table="credential",
local_cols=["credential_id"],
remote_cols=["id"],
ondelete="CASCADE",
)
def downgrade() -> None:
# Reverse the changes for rollback
op.drop_constraint(
"document_by_connector_credential_pair_connector_id_fkey",
"document_by_connector_credential_pair",
type_="foreignkey",
)
op.drop_constraint(
"document_by_connector_credential_pair_credential_id_fkey",
"document_by_connector_credential_pair",
type_="foreignkey",
)
# Recreate without CASCADE
op.create_foreign_key(
"document_by_connector_credential_pair_connector_id_fkey",
"document_by_connector_credential_pair",
"connector",
["connector_id"],
["id"],
)
op.create_foreign_key(
"document_by_connector_credential_pair_credential_id_fkey",
"document_by_connector_credential_pair",
"credential",
["credential_id"],
["id"],
)

View File

@@ -93,12 +93,12 @@ def _get_access_for_documents(
)
# To avoid collisions of group namings between connectors, they need to be prefixed
access_map[document_id] = DocumentAccess.build(
user_emails=list(non_ee_access.user_emails),
user_groups=user_group_info.get(document_id, []),
access_map[document_id] = DocumentAccess(
user_emails=non_ee_access.user_emails,
user_groups=set(user_group_info.get(document_id, [])),
is_public=is_public_anywhere,
external_user_emails=list(ext_u_emails),
external_user_group_ids=list(ext_u_groups),
external_user_emails=ext_u_emails,
external_user_group_ids=ext_u_groups,
)
return access_map

View File

@@ -2,6 +2,7 @@ from ee.onyx.server.query_and_chat.models import OneShotQAResponse
from onyx.chat.models import AllCitations
from onyx.chat.models import LLMRelevanceFilterResponse
from onyx.chat.models import OnyxAnswerPiece
from onyx.chat.models import OnyxContexts
from onyx.chat.models import QADocsResponse
from onyx.chat.models import StreamingError
from onyx.chat.process_message import ChatPacketStream
@@ -31,6 +32,8 @@ def gather_stream_for_answer_api(
response.llm_selected_doc_indices = packet.llm_selected_doc_indices
elif isinstance(packet, AllCitations):
response.citations = packet.citations
elif isinstance(packet, OnyxContexts):
response.contexts = packet
if answer:
response.answer = answer

View File

@@ -159,9 +159,6 @@ def _get_space_permissions(
# Stores the permissions for each space
space_permissions_by_space_key[space_key] = space_permissions
logger.info(
f"Found space permissions for space '{space_key}': {space_permissions}"
)
return space_permissions_by_space_key

View File

@@ -55,7 +55,7 @@ def _post_query_chunk_censoring(
# if user is None, permissions are not enforced
return chunks
final_chunk_dict: dict[str, InferenceChunk] = {}
chunks_to_keep = []
chunks_to_process: dict[DocumentSource, list[InferenceChunk]] = {}
sources_to_censor = _get_all_censoring_enabled_sources()
@@ -64,7 +64,7 @@ def _post_query_chunk_censoring(
if chunk.source_type in sources_to_censor:
chunks_to_process.setdefault(chunk.source_type, []).append(chunk)
else:
final_chunk_dict[chunk.unique_id] = chunk
chunks_to_keep.append(chunk)
# For each source, filter out the chunks using the permission
# check function for that source
@@ -79,16 +79,6 @@ def _post_query_chunk_censoring(
f" chunks for this source and continuing: {e}"
)
continue
chunks_to_keep.extend(censored_chunks)
for censored_chunk in censored_chunks:
final_chunk_dict[censored_chunk.unique_id] = censored_chunk
# IMPORTANT: make sure to retain the same ordering as the original `chunks` passed in
final_chunk_list: list[InferenceChunk] = []
for chunk in chunks:
# only if the chunk is in the final censored chunks, add it to the final list
# if it is missing, that means it was intentionally left out
if chunk.unique_id in final_chunk_dict:
final_chunk_list.append(final_chunk_dict[chunk.unique_id])
return final_chunk_list
return chunks_to_keep

View File

@@ -51,14 +51,13 @@ def _get_objects_access_for_user_email_from_salesforce(
# This is cached in the function so the first query takes an extra 0.1-0.3 seconds
# but subsequent queries by the same user are essentially instant
start_time = time.monotonic()
start_time = time.time()
user_id = get_salesforce_user_id_from_email(salesforce_client, user_email)
end_time = time.monotonic()
end_time = time.time()
logger.info(
f"Time taken to get Salesforce user ID: {end_time - start_time} seconds"
)
if user_id is None:
logger.warning(f"User '{user_email}' not found in Salesforce")
return None
# This is the only query that is not cached in the function
@@ -66,7 +65,6 @@ def _get_objects_access_for_user_email_from_salesforce(
object_id_to_access = get_objects_access_for_user_id(
salesforce_client, user_id, list(object_ids)
)
logger.debug(f"Object ID to access: {object_id_to_access}")
return object_id_to_access

View File

@@ -1,6 +1,10 @@
from simple_salesforce import Salesforce
from sqlalchemy.orm import Session
from onyx.connectors.salesforce.sqlite_functions import get_user_id_by_email
from onyx.connectors.salesforce.sqlite_functions import init_db
from onyx.connectors.salesforce.sqlite_functions import NULL_ID_STRING
from onyx.connectors.salesforce.sqlite_functions import update_email_to_id_table
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
from onyx.db.document import get_cc_pairs_for_document
from onyx.utils.logger import setup_logger
@@ -24,8 +28,6 @@ def get_any_salesforce_client_for_doc_id(
E.g. there are 2 different credential sets for 2 different salesforce cc_pairs
but only one has the permissions to access the permissions needed for the query.
"""
# NOTE: this global seems very very bad
global _ANY_SALESFORCE_CLIENT
if _ANY_SALESFORCE_CLIENT is None:
cc_pairs = get_cc_pairs_for_document(db_session, doc_id)
@@ -40,18 +42,11 @@ def get_any_salesforce_client_for_doc_id(
def _query_salesforce_user_id(sf_client: Salesforce, user_email: str) -> str | None:
query = f"SELECT Id FROM User WHERE Username = '{user_email}' AND IsActive = true"
query = f"SELECT Id FROM User WHERE Email = '{user_email}'"
result = sf_client.query(query)
if len(result["records"]) > 0:
return result["records"][0]["Id"]
# try emails
query = f"SELECT Id FROM User WHERE Email = '{user_email}' AND IsActive = true"
result = sf_client.query(query)
if len(result["records"]) > 0:
return result["records"][0]["Id"]
return None
if len(result["records"]) == 0:
return None
return result["records"][0]["Id"]
# This contains only the user_ids that we have found in Salesforce.
@@ -82,21 +77,35 @@ def get_salesforce_user_id_from_email(
salesforce database. (Around 0.1-0.3 seconds)
If it's cached or stored in the local salesforce database, it's fast (<0.001 seconds).
"""
# NOTE: this global seems bad
global _CACHED_SF_EMAIL_TO_ID_MAP
if user_email in _CACHED_SF_EMAIL_TO_ID_MAP:
if _CACHED_SF_EMAIL_TO_ID_MAP[user_email] is not None:
return _CACHED_SF_EMAIL_TO_ID_MAP[user_email]
# some caching via sqlite existed here before ... check history if interested
# ...query Salesforce and store the result in the database
user_id = _query_salesforce_user_id(sf_client, user_email)
db_exists = True
try:
# Check if the user is already in the database
user_id = get_user_id_by_email(user_email)
except Exception:
init_db()
try:
user_id = get_user_id_by_email(user_email)
except Exception as e:
logger.error(f"Error checking if user is in database: {e}")
user_id = None
db_exists = False
# If no entry is found in the database (indicated by user_id being None)...
if user_id is None:
# ...query Salesforce and store the result in the database
user_id = _query_salesforce_user_id(sf_client, user_email)
if db_exists:
update_email_to_id_table(user_email, user_id)
return user_id
elif user_id is None:
return None
elif user_id == NULL_ID_STRING:
return None
# If the found user_id is real, cache it
_CACHED_SF_EMAIL_TO_ID_MAP[user_email] = user_id
return user_id

View File

@@ -5,14 +5,12 @@ from slack_sdk import WebClient
from ee.onyx.external_permissions.slack.utils import fetch_user_id_to_email_map
from onyx.access.models import DocExternalAccess
from onyx.access.models import ExternalAccess
from onyx.connectors.credentials_provider import OnyxDBCredentialsProvider
from onyx.connectors.slack.connector import get_channels
from onyx.connectors.slack.connector import make_paginated_slack_api_call_w_retries
from onyx.connectors.slack.connector import SlackConnector
from onyx.db.models import ConnectorCredentialPair
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()
@@ -103,12 +101,7 @@ def _get_slack_document_access(
callback: IndexingHeartbeatInterface | None,
) -> Generator[DocExternalAccess, None, None]:
slack_connector = SlackConnector(**cc_pair.connector.connector_specific_config)
# Use credentials provider instead of directly loading credentials
provider = OnyxDBCredentialsProvider(
get_current_tenant_id(), "slack", cc_pair.credential.id
)
slack_connector.set_credentials_provider(provider)
slack_connector.load_credentials(cc_pair.credential.credential_json)
slim_doc_generator = slack_connector.retrieve_all_slim_documents(callback=callback)

View File

@@ -51,7 +51,6 @@ def _get_slack_group_members_email(
def slack_group_sync(
tenant_id: str,
cc_pair: ConnectorCredentialPair,
) -> list[ExternalUserGroup]:
slack_client = WebClient(

View File

@@ -15,7 +15,6 @@ from ee.onyx.external_permissions.post_query_censoring import (
DOC_SOURCE_TO_CHUNK_CENSORING_FUNCTION,
)
from ee.onyx.external_permissions.slack.doc_sync import slack_doc_sync
from ee.onyx.external_permissions.slack.group_sync import slack_group_sync
from onyx.access.models import DocExternalAccess
from onyx.configs.constants import DocumentSource
from onyx.db.models import ConnectorCredentialPair
@@ -57,7 +56,6 @@ DOC_PERMISSIONS_FUNC_MAP: dict[DocumentSource, DocSyncFuncType] = {
GROUP_PERMISSIONS_FUNC_MAP: dict[DocumentSource, GroupSyncFuncType] = {
DocumentSource.GOOGLE_DRIVE: gdrive_group_sync,
DocumentSource.CONFLUENCE: confluence_group_sync,
DocumentSource.SLACK: slack_group_sync,
}

View File

@@ -44,7 +44,7 @@ async def _get_tenant_id_from_request(
Attempt to extract tenant_id from:
1) The API key header
2) The Redis-based token (stored in Cookie: fastapiusersauth)
3) The anonymous user cookie
3) Reset token cookie
Fallback: POSTGRES_DEFAULT_SCHEMA
"""
# Check for API key
@@ -52,55 +52,41 @@ async def _get_tenant_id_from_request(
if tenant_id is not None:
return tenant_id
# Check for anonymous user cookie
anonymous_user_cookie = request.cookies.get(ANONYMOUS_USER_COOKIE_NAME)
if anonymous_user_cookie:
try:
anonymous_user_data = decode_anonymous_user_jwt_token(anonymous_user_cookie)
return anonymous_user_data.get("tenant_id", POSTGRES_DEFAULT_SCHEMA)
except Exception as e:
logger.error(f"Error decoding anonymous user cookie: {str(e)}")
# Continue and attempt to authenticate
try:
# Look up token data in Redis
token_data = await retrieve_auth_token_data_from_redis(request)
if token_data:
tenant_id_from_payload = token_data.get(
"tenant_id", POSTGRES_DEFAULT_SCHEMA
if not token_data:
logger.debug(
"Token data not found or expired in Redis, defaulting to POSTGRES_DEFAULT_SCHEMA"
)
# Return POSTGRES_DEFAULT_SCHEMA, so non-authenticated requests are sent to the default schema
# The CURRENT_TENANT_ID_CONTEXTVAR is initialized with POSTGRES_DEFAULT_SCHEMA,
# so we maintain consistency by returning it here when no valid tenant is found.
return POSTGRES_DEFAULT_SCHEMA
tenant_id = (
str(tenant_id_from_payload)
if tenant_id_from_payload is not None
else None
)
tenant_id_from_payload = token_data.get("tenant_id", POSTGRES_DEFAULT_SCHEMA)
if tenant_id and not is_valid_schema_name(tenant_id):
raise HTTPException(status_code=400, detail="Invalid tenant ID format")
# Check for anonymous user cookie
anonymous_user_cookie = request.cookies.get(ANONYMOUS_USER_COOKIE_NAME)
if anonymous_user_cookie:
try:
anonymous_user_data = decode_anonymous_user_jwt_token(
anonymous_user_cookie
)
tenant_id = anonymous_user_data.get(
"tenant_id", POSTGRES_DEFAULT_SCHEMA
)
if not tenant_id or not is_valid_schema_name(tenant_id):
raise HTTPException(
status_code=400, detail="Invalid tenant ID format"
)
return tenant_id
except Exception as e:
logger.error(f"Error decoding anonymous user cookie: {str(e)}")
# Continue and attempt to authenticate
logger.debug(
"Token data not found or expired in Redis, defaulting to POSTGRES_DEFAULT_SCHEMA"
# Since token_data.get() can return None, ensure we have a string
tenant_id = (
str(tenant_id_from_payload)
if tenant_id_from_payload is not None
else POSTGRES_DEFAULT_SCHEMA
)
# Return POSTGRES_DEFAULT_SCHEMA, so non-authenticated requests are sent to the default schema
# The CURRENT_TENANT_ID_CONTEXTVAR is initialized with POSTGRES_DEFAULT_SCHEMA,
# so we maintain consistency by returning it here when no valid tenant is found.
return POSTGRES_DEFAULT_SCHEMA
if not is_valid_schema_name(tenant_id):
raise HTTPException(status_code=400, detail="Invalid tenant ID format")
except Exception as e:
logger.error(f"Unexpected error in _get_tenant_id_from_request: {str(e)}")

View File

@@ -14,6 +14,7 @@ from ee.onyx.server.query_and_chat.models import (
BasicCreateChatMessageWithHistoryRequest,
)
from ee.onyx.server.query_and_chat.models import ChatBasicResponse
from ee.onyx.server.query_and_chat.models import SimpleDoc
from onyx.auth.users import current_user
from onyx.chat.chat_utils import combine_message_thread
from onyx.chat.chat_utils import create_chat_chain
@@ -55,6 +56,25 @@ logger = setup_logger()
router = APIRouter(prefix="/chat")
def _translate_doc_response_to_simple_doc(
doc_response: QADocsResponse,
) -> list[SimpleDoc]:
return [
SimpleDoc(
id=doc.document_id,
semantic_identifier=doc.semantic_identifier,
link=doc.link,
blurb=doc.blurb,
match_highlights=[
highlight for highlight in doc.match_highlights if highlight
],
source_type=doc.source_type,
metadata=doc.metadata,
)
for doc in doc_response.top_documents
]
def _get_final_context_doc_indices(
final_context_docs: list[LlmDoc] | None,
top_docs: list[SavedSearchDoc] | None,
@@ -91,6 +111,9 @@ def _convert_packet_stream_to_response(
elif isinstance(packet, QADocsResponse):
response.top_documents = packet.top_documents
# TODO: deprecate `simple_search_docs`
response.simple_search_docs = _translate_doc_response_to_simple_doc(packet)
# This is a no-op if agent_sub_questions hasn't already been filled
if packet.level is not None and packet.level_question_num is not None:
id = (packet.level, packet.level_question_num)

View File

@@ -8,6 +8,7 @@ from pydantic import model_validator
from ee.onyx.server.manage.models import StandardAnswer
from onyx.chat.models import CitationInfo
from onyx.chat.models import OnyxContexts
from onyx.chat.models import PersonaOverrideConfig
from onyx.chat.models import QADocsResponse
from onyx.chat.models import SubQuestionIdentifier
@@ -163,6 +164,8 @@ class ChatBasicResponse(BaseModel):
cited_documents: dict[int, str] | None = None
# FOR BACKWARDS COMPATIBILITY
# TODO: deprecate both of these
simple_search_docs: list[SimpleDoc] | None = None
llm_chunks_indices: list[int] | None = None
# agentic fields
@@ -217,3 +220,4 @@ class OneShotQAResponse(BaseModel):
llm_selected_doc_indices: list[int] | None = None
error_msg: str | None = None
chat_message_id: int | None = None
contexts: OnyxContexts | None = None

View File

@@ -36,6 +36,9 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()
router = APIRouter(prefix="/auth/saml")
# Define non-authenticated user roles that should be re-created during SAML login
NON_AUTHENTICATED_ROLES = {UserRole.SLACK_USER, UserRole.EXT_PERM_USER}
async def upsert_saml_user(email: str) -> User:
logger.debug(f"Attempting to upsert SAML user with email: {email}")
@@ -51,7 +54,7 @@ async def upsert_saml_user(email: str) -> User:
try:
user = await user_manager.get_by_email(email)
# If user has a non-authenticated role, treat as non-existent
if not user.role.is_web_login():
if user.role in NON_AUTHENTICATED_ROLES:
raise exceptions.UserNotExists()
return user
except exceptions.UserNotExists:

View File

@@ -94,7 +94,6 @@ async def get_or_provision_tenant(
# Notify control plane if we have created / assigned a new tenant
if not DEV_MODE:
await notify_control_plane(tenant_id, email, referral_source)
return tenant_id
except Exception as e:
@@ -506,11 +505,8 @@ async def setup_tenant(tenant_id: str) -> None:
try:
token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
# Run Alembic migrations in a way that isolates it from the current event loop
# Create a new event loop for this synchronous operation
loop = asyncio.get_event_loop()
# Use run_in_executor which properly isolates the thread execution
await loop.run_in_executor(None, lambda: run_alembic_migrations(tenant_id))
# Run Alembic migrations
await asyncio.to_thread(run_alembic_migrations, tenant_id)
# Configure the tenant with default settings
with get_session_with_tenant(tenant_id=tenant_id) as db_session:

Binary file not shown.

View File

@@ -1,4 +1,3 @@
import logging
import os
import shutil
from collections.abc import AsyncGenerator
@@ -9,7 +8,6 @@ import sentry_sdk
import torch
import uvicorn
from fastapi import FastAPI
from prometheus_fastapi_instrumentator import Instrumentator
from sentry_sdk.integrations.fastapi import FastApiIntegration
from sentry_sdk.integrations.starlette import StarletteIntegration
from transformers import logging as transformer_logging # type:ignore
@@ -22,8 +20,6 @@ from model_server.management_endpoints import router as management_router
from model_server.utils import get_gpu_type
from onyx import __version__
from onyx.utils.logger import setup_logger
from onyx.utils.logger import setup_uvicorn_logger
from onyx.utils.middleware import add_onyx_request_id_middleware
from shared_configs.configs import INDEXING_ONLY
from shared_configs.configs import MIN_THREADS_ML_MODELS
from shared_configs.configs import MODEL_SERVER_ALLOWED_HOST
@@ -40,12 +36,6 @@ transformer_logging.set_verbosity_error()
logger = setup_logger()
file_handlers = [
h for h in logger.logger.handlers if isinstance(h, logging.FileHandler)
]
setup_uvicorn_logger(shared_file_handlers=file_handlers)
def _move_files_recursively(source: Path, dest: Path, overwrite: bool = False) -> None:
"""
@@ -122,15 +112,6 @@ def get_model_app() -> FastAPI:
application.include_router(encoders_router)
application.include_router(custom_models_router)
request_id_prefix = "INF"
if INDEXING_ONLY:
request_id_prefix = "IDX"
add_onyx_request_id_middleware(application, request_id_prefix, logger)
# Initialize and instrument the app
Instrumentator().instrument(application).expose(application)
return application


@@ -18,7 +18,7 @@ def _get_access_for_document(
document_id=document_id,
)
doc_access = DocumentAccess.build(
return DocumentAccess.build(
user_emails=info[1] if info and info[1] else [],
user_groups=[],
external_user_emails=[],
@@ -26,8 +26,6 @@ def _get_access_for_document(
is_public=info[2] if info else False,
)
return doc_access
def get_access_for_document(
document_id: str,
@@ -40,12 +38,12 @@ def get_access_for_document(
def get_null_document_access() -> DocumentAccess:
return DocumentAccess.build(
user_emails=[],
user_groups=[],
return DocumentAccess(
user_emails=set(),
user_groups=set(),
is_public=False,
external_user_emails=[],
external_user_group_ids=[],
external_user_emails=set(),
external_user_group_ids=set(),
)
@@ -57,18 +55,19 @@ def _get_access_for_documents(
db_session=db_session,
document_ids=document_ids,
)
doc_access = {}
for document_id, user_emails, is_public in document_access_info:
doc_access[document_id] = DocumentAccess.build(
user_emails=[email for email in user_emails if email],
doc_access = {
document_id: DocumentAccess(
user_emails=set([email for email in user_emails if email]),
# MIT version will wipe all groups and external groups on update
user_groups=[],
user_groups=set(),
is_public=is_public,
external_user_emails=[],
external_user_group_ids=[],
external_user_emails=set(),
external_user_group_ids=set(),
)
for document_id, user_emails, is_public in document_access_info
}
# Sometimes the document has not been indexed by the indexing job yet, in those cases
# Sometimes the document has not be indexed by the indexing job yet, in those cases
# the document does not exist and so we use least permissive. Specifically the EE version
# checks the MIT version permissions and creates a superset. This ensures that this flow
# does not fail even if the Document has not yet been indexed.


@@ -15,22 +15,6 @@ class ExternalAccess:
# Whether the document is public in the external system or Onyx
is_public: bool
def __str__(self) -> str:
"""Prevent extremely long logs"""
def truncate_set(s: set[str], max_len: int = 100) -> str:
s_str = str(s)
if len(s_str) > max_len:
return f"{s_str[:max_len]}... ({len(s)} items)"
return s_str
return (
f"ExternalAccess("
f"external_user_emails={truncate_set(self.external_user_emails)}, "
f"external_user_group_ids={truncate_set(self.external_user_group_ids)}, "
f"is_public={self.is_public})"
)
@dataclass(frozen=True)
class DocExternalAccess:
@@ -72,45 +56,33 @@ class DocExternalAccess:
)
@dataclass(frozen=True, init=False)
@dataclass(frozen=True)
class DocumentAccess(ExternalAccess):
# User emails for Onyx users, None indicates admin
user_emails: set[str | None]
# Names of user groups associated with this document
user_groups: set[str]
external_user_emails: set[str]
external_user_group_ids: set[str]
is_public: bool
def __init__(self) -> None:
raise TypeError(
"Use `DocumentAccess.build(...)` instead of creating an instance directly."
)
def to_acl(self) -> set[str]:
# the acl's emitted by this function are prefixed by type
# to get the native objects, access the member variables directly
acl_set: set[str] = set()
for user_email in self.user_emails:
if user_email:
acl_set.add(prefix_user_email(user_email))
for group_name in self.user_groups:
acl_set.add(prefix_user_group(group_name))
for external_user_email in self.external_user_emails:
acl_set.add(prefix_user_email(external_user_email))
for external_group_id in self.external_user_group_ids:
acl_set.add(prefix_external_group(external_group_id))
if self.is_public:
acl_set.add(PUBLIC_DOC_PAT)
return acl_set
return set(
[
prefix_user_email(user_email)
for user_email in self.user_emails
if user_email
]
+ [prefix_user_group(group_name) for group_name in self.user_groups]
+ [
prefix_user_email(user_email)
for user_email in self.external_user_emails
]
+ [
# The group names are already prefixed by the source type
# This adds an additional prefix of "external_group:"
prefix_external_group(group_name)
for group_name in self.external_user_group_ids
]
+ ([PUBLIC_DOC_PAT] if self.is_public else [])
)
@classmethod
def build(
@@ -121,32 +93,29 @@ class DocumentAccess(ExternalAccess):
external_user_group_ids: list[str],
is_public: bool,
) -> "DocumentAccess":
"""Don't prefix incoming data wth acl type, prefix on read from to_acl!"""
obj = object.__new__(cls)
object.__setattr__(
obj, "user_emails", {user_email for user_email in user_emails if user_email}
return cls(
external_user_emails={
prefix_user_email(external_email)
for external_email in external_user_emails
},
external_user_group_ids={
prefix_external_group(external_group_id)
for external_group_id in external_user_group_ids
},
user_emails={
prefix_user_email(user_email)
for user_email in user_emails
if user_email
},
user_groups=set(user_groups),
is_public=is_public,
)
object.__setattr__(obj, "user_groups", set(user_groups))
object.__setattr__(
obj,
"external_user_emails",
{external_email for external_email in external_user_emails},
)
object.__setattr__(
obj,
"external_user_group_ids",
{external_group_id for external_group_id in external_user_group_ids},
)
object.__setattr__(obj, "is_public", is_public)
return obj
default_public_access = DocumentAccess.build(
external_user_emails=[],
external_user_group_ids=[],
user_emails=[],
user_groups=[],
default_public_access = DocumentAccess(
external_user_emails=set(),
external_user_group_ids=set(),
user_emails=set(),
user_groups=set(),
is_public=True,
)
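A simplified sketch of the prefix-on-build idea behind the new DocumentAccess: prefixes are applied once when the object is constructed, so the ACL is just a union. The helper names and uniform prefixing here are illustrative, not a copy of the real onyx.access module:

# Simplified, illustrative access object; not the real DocumentAccess class.
from dataclasses import dataclass

PUBLIC_DOC_PAT = "PUBLIC"


def prefix_user_email(email: str) -> str:
    return f"user_email:{email}"


def prefix_user_group(group: str) -> str:
    return f"group:{group}"


def prefix_external_group(group_id: str) -> str:
    return f"external_group:{group_id}"


@dataclass(frozen=True)
class Access:
    user_emails: frozenset[str]
    user_groups: frozenset[str]
    external_user_emails: frozenset[str]
    external_user_group_ids: frozenset[str]
    is_public: bool

    @classmethod
    def build(
        cls,
        user_emails: list[str | None],
        user_groups: list[str],
        external_user_emails: list[str],
        external_user_group_ids: list[str],
        is_public: bool,
    ) -> "Access":
        # Prefixes are applied once, on construction, so to_acl() is a plain union.
        return cls(
            user_emails=frozenset(prefix_user_email(e) for e in user_emails if e),
            user_groups=frozenset(prefix_user_group(g) for g in user_groups),
            external_user_emails=frozenset(prefix_user_email(e) for e in external_user_emails),
            external_user_group_ids=frozenset(prefix_external_group(g) for g in external_user_group_ids),
            is_public=is_public,
        )

    def to_acl(self) -> set[str]:
        acl = set(self.user_emails) | set(self.user_groups)
        acl |= set(self.external_user_emails) | set(self.external_user_group_ids)
        if self.is_public:
            acl.add(PUBLIC_DOC_PAT)
        return acl


access = Access.build(["a@x.com", None], ["eng"], [], [], is_public=True)
print(sorted(access.to_acl()))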


@@ -7,6 +7,7 @@ from langgraph.types import StreamWriter
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
from onyx.chat.models import LlmDoc
from onyx.chat.models import OnyxContext
from onyx.chat.stream_processing.answer_response_handler import AnswerResponseHandler
from onyx.chat.stream_processing.answer_response_handler import CitationResponseHandler
from onyx.chat.stream_processing.answer_response_handler import (
@@ -23,7 +24,7 @@ def process_llm_stream(
should_stream_answer: bool,
writer: StreamWriter,
final_search_results: list[LlmDoc] | None = None,
displayed_search_results: list[LlmDoc] | None = None,
displayed_search_results: list[OnyxContext] | list[LlmDoc] | None = None,
) -> AIMessageChunk:
tool_call_chunk = AIMessageChunk(content="")


@@ -1,62 +0,0 @@
from collections.abc import Hashable
from typing import cast
from langchain_core.runnables.config import RunnableConfig
from langgraph.types import Send
from onyx.agents.agent_search.dc_search_analysis.states import ObjectInformationInput
from onyx.agents.agent_search.dc_search_analysis.states import (
ObjectResearchInformationUpdate,
)
from onyx.agents.agent_search.dc_search_analysis.states import ObjectSourceInput
from onyx.agents.agent_search.dc_search_analysis.states import (
SearchSourcesObjectsUpdate,
)
from onyx.agents.agent_search.models import GraphConfig
def parallel_object_source_research_edge(
state: SearchSourcesObjectsUpdate, config: RunnableConfig
) -> list[Send | Hashable]:
"""
LangGraph edge to parallelize the research for an individual object and source
"""
search_objects = state.analysis_objects
search_sources = state.analysis_sources
object_source_combinations = [
(object, source) for object in search_objects for source in search_sources
]
return [
Send(
"research_object_source",
ObjectSourceInput(
object_source_combination=object_source_combination,
log_messages=[],
),
)
for object_source_combination in object_source_combinations
]
def parallel_object_research_consolidation_edge(
state: ObjectResearchInformationUpdate, config: RunnableConfig
) -> list[Send | Hashable]:
"""
LangGraph edge to parallelize the consolidation of research information for each object
"""
cast(GraphConfig, config["metadata"]["config"])
object_research_information_results = state.object_research_information_results
return [
Send(
"consolidate_object_research",
ObjectInformationInput(
object_information=object_information,
log_messages=[],
),
)
for object_information in object_research_information_results
]
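The deleted edges above use LangGraph's Send to fan work out across object/source combinations. A generic, runnable map-reduce sketch of that pattern (made-up node and state names, assumes a recent langgraph release):

# Generic fan-out sketch with langgraph's Send; requires `pip install langgraph`.
import operator
from typing import Annotated, TypedDict

from langgraph.graph import END, START, StateGraph
from langgraph.types import Send


class State(TypedDict):
    items: list[str]
    results: Annotated[list[str], operator.add]


def plan(state: State) -> dict:
    return {}


def fan_out(state: State) -> list[Send]:
    # One Send per item: each spawns a parallel "work" invocation with its own input.
    return [Send("work", {"items": [item], "results": []}) for item in state["items"]]


def work(state: State) -> dict:
    return {"results": [f"processed {state['items'][0]}"]}


graph = StateGraph(State)
graph.add_node("plan", plan)
graph.add_node("work", work)
graph.add_edge(START, "plan")
graph.add_conditional_edges("plan", fan_out, ["work"])
graph.add_edge("work", END)

app = graph.compile()
print(app.invoke({"items": ["a", "b"], "results": []}))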


@@ -1,103 +0,0 @@
from langgraph.graph import END
from langgraph.graph import START
from langgraph.graph import StateGraph
from onyx.agents.agent_search.dc_search_analysis.edges import (
parallel_object_research_consolidation_edge,
)
from onyx.agents.agent_search.dc_search_analysis.edges import (
parallel_object_source_research_edge,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a1_search_objects import (
search_objects,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a2_research_object_source import (
research_object_source,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a3_structure_research_by_object import (
structure_research_by_object,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a4_consolidate_object_research import (
consolidate_object_research,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a5_consolidate_research import (
consolidate_research,
)
from onyx.agents.agent_search.dc_search_analysis.states import MainInput
from onyx.agents.agent_search.dc_search_analysis.states import MainState
from onyx.utils.logger import setup_logger
logger = setup_logger()
test_mode = False
def divide_and_conquer_graph_builder(test_mode: bool = False) -> StateGraph:
"""
LangGraph graph builder for the knowledge graph search process.
"""
graph = StateGraph(
state_schema=MainState,
input=MainInput,
)
### Add nodes ###
graph.add_node(
"search_objects",
search_objects,
)
graph.add_node(
"structure_research_by_source",
structure_research_by_object,
)
graph.add_node(
"research_object_source",
research_object_source,
)
graph.add_node(
"consolidate_object_research",
consolidate_object_research,
)
graph.add_node(
"consolidate_research",
consolidate_research,
)
### Add edges ###
graph.add_edge(start_key=START, end_key="search_objects")
graph.add_conditional_edges(
source="search_objects",
path=parallel_object_source_research_edge,
path_map=["research_object_source"],
)
graph.add_edge(
start_key="research_object_source",
end_key="structure_research_by_source",
)
graph.add_conditional_edges(
source="structure_research_by_source",
path=parallel_object_research_consolidation_edge,
path_map=["consolidate_object_research"],
)
graph.add_edge(
start_key="consolidate_object_research",
end_key="consolidate_research",
)
graph.add_edge(
start_key="consolidate_research",
end_key=END,
)
return graph


@@ -1,159 +0,0 @@
from typing import cast
from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnableConfig
from langgraph.types import StreamWriter
from onyx.agents.agent_search.dc_search_analysis.ops import extract_section
from onyx.agents.agent_search.dc_search_analysis.ops import research
from onyx.agents.agent_search.dc_search_analysis.states import MainState
from onyx.agents.agent_search.dc_search_analysis.states import (
SearchSourcesObjectsUpdate,
)
from onyx.agents.agent_search.models import GraphConfig
from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
trim_prompt_piece,
)
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
from onyx.chat.models import AgentAnswerPiece
from onyx.configs.constants import DocumentSource
from onyx.prompts.agents.dc_prompts import DC_OBJECT_NO_BASE_DATA_EXTRACTION_PROMPT
from onyx.prompts.agents.dc_prompts import DC_OBJECT_SEPARATOR
from onyx.prompts.agents.dc_prompts import DC_OBJECT_WITH_BASE_DATA_EXTRACTION_PROMPT
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import run_with_timeout
logger = setup_logger()
def search_objects(
state: MainState, config: RunnableConfig, writer: StreamWriter = lambda _: None
) -> SearchSourcesObjectsUpdate:
"""
LangGraph node to start the agentic search process.
"""
graph_config = cast(GraphConfig, config["metadata"]["config"])
question = graph_config.inputs.search_request.query
search_tool = graph_config.tooling.search_tool
if search_tool is None or graph_config.inputs.search_request.persona is None:
raise ValueError("Search tool and persona must be provided for DivCon search")
try:
instructions = graph_config.inputs.search_request.persona.prompts[
0
].system_prompt
agent_1_instructions = extract_section(
instructions, "Agent Step 1:", "Agent Step 2:"
)
if agent_1_instructions is None:
raise ValueError("Agent 1 instructions not found")
agent_1_base_data = extract_section(instructions, "|Start Data|", "|End Data|")
agent_1_task = extract_section(
agent_1_instructions, "Task:", "Independent Research Sources:"
)
if agent_1_task is None:
raise ValueError("Agent 1 task not found")
agent_1_independent_sources_str = extract_section(
agent_1_instructions, "Independent Research Sources:", "Output Objective:"
)
if agent_1_independent_sources_str is None:
raise ValueError("Agent 1 Independent Research Sources not found")
document_sources = [
DocumentSource(x.strip().lower())
for x in agent_1_independent_sources_str.split(DC_OBJECT_SEPARATOR)
]
agent_1_output_objective = extract_section(
agent_1_instructions, "Output Objective:"
)
if agent_1_output_objective is None:
raise ValueError("Agent 1 output objective not found")
except Exception as e:
raise ValueError(
f"Agent 1 instructions not found or not formatted correctly: {e}"
)
# Extract objects
if agent_1_base_data is None:
# Retrieve chunks for objects
retrieved_docs = research(question, search_tool)[:10]
document_texts_list = []
for doc_num, doc in enumerate(retrieved_docs):
chunk_text = "Document " + str(doc_num) + ":\n" + doc.content
document_texts_list.append(chunk_text)
document_texts = "\n\n".join(document_texts_list)
dc_object_extraction_prompt = DC_OBJECT_NO_BASE_DATA_EXTRACTION_PROMPT.format(
question=question,
task=agent_1_task,
document_text=document_texts,
objects_of_interest=agent_1_output_objective,
)
else:
dc_object_extraction_prompt = DC_OBJECT_WITH_BASE_DATA_EXTRACTION_PROMPT.format(
question=question,
task=agent_1_task,
base_data=agent_1_base_data,
objects_of_interest=agent_1_output_objective,
)
msg = [
HumanMessage(
content=trim_prompt_piece(
config=graph_config.tooling.primary_llm.config,
prompt_piece=dc_object_extraction_prompt,
reserved_str="",
),
)
]
primary_llm = graph_config.tooling.primary_llm
# Grader
try:
llm_response = run_with_timeout(
30,
primary_llm.invoke,
prompt=msg,
timeout_override=30,
max_tokens=300,
)
cleaned_response = (
str(llm_response.content)
.replace("```json\n", "")
.replace("\n```", "")
.replace("\n", "")
)
cleaned_response = cleaned_response.split("OBJECTS:")[1]
object_list = [x.strip() for x in cleaned_response.split(";")]
except Exception as e:
raise ValueError(f"Error in search_objects: {e}")
write_custom_event(
"initial_agent_answer",
AgentAnswerPiece(
answer_piece=" Researching the individual objects for each source type... ",
level=0,
level_question_num=0,
answer_type="agent_level_answer",
),
writer,
)
return SearchSourcesObjectsUpdate(
analysis_objects=object_list,
analysis_sources=document_sources,
log_messages=["Agent 1 Task done"],
)
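The LLM response parsing above strips code fences and splits on an "OBJECTS:" marker. A small sketch of that step with an explicit guard for a missing marker (the original indexes split(...)[1] directly and relies on the surrounding try/except):

# Sketch of the response-parsing step with a guard for a missing "OBJECTS:" marker.
def parse_object_list(raw: str) -> list[str]:
    cleaned = raw.replace("```json\n", "").replace("\n```", "").replace("\n", "")
    if "OBJECTS:" not in cleaned:
        raise ValueError("LLM response did not contain an OBJECTS: section")
    objects_part = cleaned.split("OBJECTS:", 1)[1]
    return [item.strip() for item in objects_part.split(";") if item.strip()]


print(parse_object_list("OBJECTS: Acme Corp; Globex; Initech"))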


@@ -1,185 +0,0 @@
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from typing import cast
from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnableConfig
from langgraph.types import StreamWriter
from onyx.agents.agent_search.dc_search_analysis.ops import extract_section
from onyx.agents.agent_search.dc_search_analysis.ops import research
from onyx.agents.agent_search.dc_search_analysis.states import ObjectSourceInput
from onyx.agents.agent_search.dc_search_analysis.states import (
ObjectSourceResearchUpdate,
)
from onyx.agents.agent_search.models import GraphConfig
from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
trim_prompt_piece,
)
from onyx.prompts.agents.dc_prompts import DC_OBJECT_SOURCE_RESEARCH_PROMPT
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import run_with_timeout
logger = setup_logger()
def research_object_source(
state: ObjectSourceInput,
config: RunnableConfig,
writer: StreamWriter = lambda _: None,
) -> ObjectSourceResearchUpdate:
"""
LangGraph node to research an individual object against a single source.
"""
datetime.now()
graph_config = cast(GraphConfig, config["metadata"]["config"])
graph_config.inputs.search_request.query
search_tool = graph_config.tooling.search_tool
question = graph_config.inputs.search_request.query
object, document_source = state.object_source_combination
if search_tool is None or graph_config.inputs.search_request.persona is None:
raise ValueError("Search tool and persona must be provided for DivCon search")
try:
instructions = graph_config.inputs.search_request.persona.prompts[
0
].system_prompt
agent_2_instructions = extract_section(
instructions, "Agent Step 2:", "Agent Step 3:"
)
if agent_2_instructions is None:
raise ValueError("Agent 2 instructions not found")
agent_2_task = extract_section(
agent_2_instructions, "Task:", "Independent Research Sources:"
)
if agent_2_task is None:
raise ValueError("Agent 2 task not found")
agent_2_time_cutoff = extract_section(
agent_2_instructions, "Time Cutoff:", "Research Topics:"
)
agent_2_research_topics = extract_section(
agent_2_instructions, "Research Topics:", "Output Objective"
)
agent_2_output_objective = extract_section(
agent_2_instructions, "Output Objective:"
)
if agent_2_output_objective is None:
raise ValueError("Agent 2 output objective not found")
except Exception as e:
raise ValueError(
f"Agent 2 instructions not found or not formatted correctly: {e}"
)
# Populate prompt
# Retrieve chunks for objects
if agent_2_time_cutoff is not None and agent_2_time_cutoff.strip() != "":
if agent_2_time_cutoff.strip().endswith("d"):
try:
days = int(agent_2_time_cutoff.strip()[:-1])
agent_2_source_start_time = datetime.now(timezone.utc) - timedelta(
days=days
)
except ValueError:
raise ValueError(
f"Invalid time cutoff format: {agent_2_time_cutoff}. Expected format: '<number>d'"
)
else:
raise ValueError(
f"Invalid time cutoff format: {agent_2_time_cutoff}. Expected format: '<number>d'"
)
else:
agent_2_source_start_time = None
document_sources = [document_source] if document_source else None
if len(question.strip()) > 0:
research_area = f"{question} for {object}"
elif agent_2_research_topics and len(agent_2_research_topics.strip()) > 0:
research_area = f"{agent_2_research_topics} for {object}"
else:
research_area = object
retrieved_docs = research(
question=research_area,
search_tool=search_tool,
document_sources=document_sources,
time_cutoff=agent_2_source_start_time,
)
# Generate document text
document_texts_list = []
for doc_num, doc in enumerate(retrieved_docs):
chunk_text = "Document " + str(doc_num) + ":\n" + doc.content
document_texts_list.append(chunk_text)
document_texts = "\n\n".join(document_texts_list)
# Built prompt
today = datetime.now().strftime("%A, %Y-%m-%d")
dc_object_source_research_prompt = (
DC_OBJECT_SOURCE_RESEARCH_PROMPT.format(
today=today,
question=question,
task=agent_2_task,
document_text=document_texts,
format=agent_2_output_objective,
)
.replace("---object---", object)
.replace("---source---", document_source.value)
)
# Run LLM
msg = [
HumanMessage(
content=trim_prompt_piece(
config=graph_config.tooling.primary_llm.config,
prompt_piece=dc_object_source_research_prompt,
reserved_str="",
),
)
]
# fast_llm = graph_config.tooling.fast_llm
primary_llm = graph_config.tooling.primary_llm
llm = primary_llm
# Grader
try:
llm_response = run_with_timeout(
30,
llm.invoke,
prompt=msg,
timeout_override=30,
max_tokens=300,
)
cleaned_response = str(llm_response.content).replace("```json\n", "")
cleaned_response = cleaned_response.split("RESEARCH RESULTS:")[1]
object_research_results = {
"object": object,
"source": document_source.value,
"research_result": cleaned_response,
}
except Exception as e:
raise ValueError(f"Error in research_object_source: {e}")
logger.debug("DivCon Step A2 - Object Source Research - completed for an object")
return ObjectSourceResearchUpdate(
object_source_research_results=[object_research_results],
log_messages=["Agent Step 2 done for one object"],
)


@@ -1,68 +0,0 @@
from collections import defaultdict
from datetime import datetime
from typing import cast
from typing import Dict
from typing import List
from langchain_core.runnables import RunnableConfig
from langgraph.types import StreamWriter
from onyx.agents.agent_search.dc_search_analysis.states import MainState
from onyx.agents.agent_search.dc_search_analysis.states import (
ObjectResearchInformationUpdate,
)
from onyx.agents.agent_search.models import GraphConfig
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
from onyx.chat.models import AgentAnswerPiece
from onyx.utils.logger import setup_logger
logger = setup_logger()
def structure_research_by_object(
state: MainState, config: RunnableConfig, writer: StreamWriter = lambda _: None
) -> ObjectResearchInformationUpdate:
"""
LangGraph node to group the per-source research results by object.
"""
datetime.now()
graph_config = cast(GraphConfig, config["metadata"]["config"])
graph_config.inputs.search_request.query
write_custom_event(
"initial_agent_answer",
AgentAnswerPiece(
answer_piece=" consolidating the information across source types for each object...",
level=0,
level_question_num=0,
answer_type="agent_level_answer",
),
writer,
)
object_source_research_results = state.object_source_research_results
object_research_information_results: List[Dict[str, str]] = []
object_research_information_results_list: Dict[str, List[str]] = defaultdict(list)
for object_source_research in object_source_research_results:
object = object_source_research["object"]
source = object_source_research["source"]
research_result = object_source_research["research_result"]
object_research_information_results_list[object].append(
f"Source: {source}\n{research_result}"
)
for object, information in object_research_information_results_list.items():
object_research_information_results.append(
{"object": object, "information": "\n".join(information)}
)
logger.debug("DivCon Step A3 - Object Research Information Structuring - completed")
return ObjectResearchInformationUpdate(
object_research_information_results=object_research_information_results,
log_messages=["A3 - Object Research Information structured"],
)


@@ -1,107 +0,0 @@
from typing import cast
from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnableConfig
from langgraph.types import StreamWriter
from onyx.agents.agent_search.dc_search_analysis.ops import extract_section
from onyx.agents.agent_search.dc_search_analysis.states import ObjectInformationInput
from onyx.agents.agent_search.dc_search_analysis.states import ObjectResearchUpdate
from onyx.agents.agent_search.models import GraphConfig
from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
trim_prompt_piece,
)
from onyx.prompts.agents.dc_prompts import DC_OBJECT_CONSOLIDATION_PROMPT
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import run_with_timeout
logger = setup_logger()
def consolidate_object_research(
state: ObjectInformationInput,
config: RunnableConfig,
writer: StreamWriter = lambda _: None,
) -> ObjectResearchUpdate:
"""
LangGraph node to consolidate the research information for a single object.
"""
graph_config = cast(GraphConfig, config["metadata"]["config"])
graph_config.inputs.search_request.query
search_tool = graph_config.tooling.search_tool
question = graph_config.inputs.search_request.query
if search_tool is None or graph_config.inputs.search_request.persona is None:
raise ValueError("Search tool and persona must be provided for DivCon search")
instructions = graph_config.inputs.search_request.persona.prompts[0].system_prompt
agent_4_instructions = extract_section(
instructions, "Agent Step 4:", "Agent Step 5:"
)
if agent_4_instructions is None:
raise ValueError("Agent 4 instructions not found")
agent_4_output_objective = extract_section(
agent_4_instructions, "Output Objective:"
)
if agent_4_output_objective is None:
raise ValueError("Agent 4 output objective not found")
object_information = state.object_information
object = object_information["object"]
information = object_information["information"]
# Create a prompt for the object consolidation
dc_object_consolidation_prompt = DC_OBJECT_CONSOLIDATION_PROMPT.format(
question=question,
object=object,
information=information,
format=agent_4_output_objective,
)
# Run LLM
msg = [
HumanMessage(
content=trim_prompt_piece(
config=graph_config.tooling.primary_llm.config,
prompt_piece=dc_object_consolidation_prompt,
reserved_str="",
),
)
]
graph_config.tooling.primary_llm
# fast_llm = graph_config.tooling.fast_llm
primary_llm = graph_config.tooling.primary_llm
llm = primary_llm
# Grader
try:
llm_response = run_with_timeout(
30,
llm.invoke,
prompt=msg,
timeout_override=30,
max_tokens=300,
)
cleaned_response = str(llm_response.content).replace("```json\n", "")
consolidated_information = cleaned_response.split("INFORMATION:")[1]
except Exception as e:
raise ValueError(f"Error in consolidate_object_research: {e}")
object_research_results = {
"object": object,
"research_result": consolidated_information,
}
logger.debug(
"DivCon Step A4 - Object Research Consolidation - completed for an object"
)
return ObjectResearchUpdate(
object_research_results=[object_research_results],
log_messages=["Agent Source Consilidation done"],
)


@@ -1,164 +0,0 @@
from datetime import datetime
from typing import cast
from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnableConfig
from langgraph.types import StreamWriter
from onyx.agents.agent_search.dc_search_analysis.ops import extract_section
from onyx.agents.agent_search.dc_search_analysis.states import MainState
from onyx.agents.agent_search.dc_search_analysis.states import ResearchUpdate
from onyx.agents.agent_search.models import GraphConfig
from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
trim_prompt_piece,
)
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
from onyx.chat.models import AgentAnswerPiece
from onyx.prompts.agents.dc_prompts import DC_FORMATTING_NO_BASE_DATA_PROMPT
from onyx.prompts.agents.dc_prompts import DC_FORMATTING_WITH_BASE_DATA_PROMPT
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import run_with_timeout
logger = setup_logger()
def consolidate_research(
state: MainState, config: RunnableConfig, writer: StreamWriter = lambda _: None
) -> ResearchUpdate:
"""
LangGraph node to consolidate the object research results and stream the final answer.
"""
datetime.now()
graph_config = cast(GraphConfig, config["metadata"]["config"])
graph_config.inputs.search_request.query
search_tool = graph_config.tooling.search_tool
write_custom_event(
"initial_agent_answer",
AgentAnswerPiece(
answer_piece=" generating the answer\n\n\n",
level=0,
level_question_num=0,
answer_type="agent_level_answer",
),
writer,
)
if search_tool is None or graph_config.inputs.search_request.persona is None:
raise ValueError("Search tool and persona must be provided for DivCon search")
# Populate prompt
instructions = graph_config.inputs.search_request.persona.prompts[0].system_prompt
try:
agent_5_instructions = extract_section(
instructions, "Agent Step 5:", "Agent End"
)
if agent_5_instructions is None:
raise ValueError("Agent 5 instructions not found")
agent_5_base_data = extract_section(instructions, "|Start Data|", "|End Data|")
agent_5_task = extract_section(
agent_5_instructions, "Task:", "Independent Research Sources:"
)
if agent_5_task is None:
raise ValueError("Agent 5 task not found")
agent_5_output_objective = extract_section(
agent_5_instructions, "Output Objective:"
)
if agent_5_output_objective is None:
raise ValueError("Agent 5 output objective not found")
except ValueError as e:
raise ValueError(
f"Instructions for Agent Step 5 were not properly formatted: {e}"
)
research_result_list = []
if agent_5_task.strip() == "*concatenate*":
object_research_results = state.object_research_results
for object_research_result in object_research_results:
object = object_research_result["object"]
research_result = object_research_result["research_result"]
research_result_list.append(f"Object: {object}\n\n{research_result}")
research_results = "\n\n".join(research_result_list)
else:
raise NotImplementedError("Only '*concatenate*' is currently supported")
# Create a prompt for the object consolidation
if agent_5_base_data is None:
dc_formatting_prompt = DC_FORMATTING_NO_BASE_DATA_PROMPT.format(
text=research_results,
format=agent_5_output_objective,
)
else:
dc_formatting_prompt = DC_FORMATTING_WITH_BASE_DATA_PROMPT.format(
base_data=agent_5_base_data,
text=research_results,
format=agent_5_output_objective,
)
# Run LLM
msg = [
HumanMessage(
content=trim_prompt_piece(
config=graph_config.tooling.primary_llm.config,
prompt_piece=dc_formatting_prompt,
reserved_str="",
),
)
]
dispatch_timings: list[float] = []
primary_model = graph_config.tooling.primary_llm
def stream_initial_answer() -> list[str]:
response: list[str] = []
for message in primary_model.stream(msg, timeout_override=30, max_tokens=None):
# TODO: in principle, the answer here COULD contain images, but we don't support that yet
content = message.content
if not isinstance(content, str):
raise ValueError(
f"Expected content to be a string, but got {type(content)}"
)
start_stream_token = datetime.now()
write_custom_event(
"initial_agent_answer",
AgentAnswerPiece(
answer_piece=content,
level=0,
level_question_num=0,
answer_type="agent_level_answer",
),
writer,
)
end_stream_token = datetime.now()
dispatch_timings.append(
(end_stream_token - start_stream_token).microseconds
)
response.append(content)
return response
try:
_ = run_with_timeout(
60,
stream_initial_answer,
)
except Exception as e:
raise ValueError(f"Error in consolidate_research: {e}")
logger.debug("DivCon Step A5 - Final Generation - completed")
return ResearchUpdate(
research_results=research_results,
log_messages=["Agent Source Consilidation done"],
)


@@ -1,61 +0,0 @@
from datetime import datetime
from typing import cast
from onyx.chat.models import LlmDoc
from onyx.configs.constants import DocumentSource
from onyx.context.search.models import InferenceSection
from onyx.db.engine import get_session_with_current_tenant
from onyx.tools.models import SearchToolOverrideKwargs
from onyx.tools.tool_implementations.search.search_tool import (
FINAL_CONTEXT_DOCUMENTS_ID,
)
from onyx.tools.tool_implementations.search.search_tool import SearchTool
def research(
question: str,
search_tool: SearchTool,
document_sources: list[DocumentSource] | None = None,
time_cutoff: datetime | None = None,
) -> list[LlmDoc]:
# new db session to avoid concurrency issues
callback_container: list[list[InferenceSection]] = []
retrieved_docs: list[LlmDoc] = []
with get_session_with_current_tenant() as db_session:
for tool_response in search_tool.run(
query=question,
override_kwargs=SearchToolOverrideKwargs(
force_no_rerank=False,
alternate_db_session=db_session,
retrieved_sections_callback=callback_container.append,
skip_query_analysis=True,
document_sources=document_sources,
time_cutoff=time_cutoff,
),
):
# get retrieved docs to send to the rest of the graph
if tool_response.id == FINAL_CONTEXT_DOCUMENTS_ID:
retrieved_docs = cast(list[LlmDoc], tool_response.response)[:10]
break
return retrieved_docs
def extract_section(
text: str, start_marker: str, end_marker: str | None = None
) -> str | None:
"""Extract text between markers, returning None if markers not found"""
parts = text.split(start_marker)
if len(parts) == 1:
return None
after_start = parts[1].strip()
if not end_marker:
return after_start
extract = after_start.split(end_marker)[0]
return extract.strip()
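A usage sketch for the extract_section helper above, restated so it runs standalone; the prompt text is made up:

# extract_section restated verbatim so this snippet is self-contained.
def extract_section(text: str, start_marker: str, end_marker: str | None = None) -> str | None:
    parts = text.split(start_marker)
    if len(parts) == 1:
        return None
    after_start = parts[1].strip()
    if not end_marker:
        return after_start
    return after_start.split(end_marker)[0].strip()


persona_prompt = "Agent Step 1: Task: List the accounts. Output Objective: one name per line."
print(extract_section(persona_prompt, "Task:", "Output Objective:"))  # "List the accounts."
print(extract_section(persona_prompt, "Missing Marker:"))             # None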


@@ -1,72 +0,0 @@
from operator import add
from typing import Annotated
from typing import Dict
from typing import TypedDict
from pydantic import BaseModel
from onyx.agents.agent_search.core_state import CoreState
from onyx.agents.agent_search.orchestration.states import ToolCallUpdate
from onyx.agents.agent_search.orchestration.states import ToolChoiceInput
from onyx.agents.agent_search.orchestration.states import ToolChoiceUpdate
from onyx.configs.constants import DocumentSource
### States ###
class LoggerUpdate(BaseModel):
log_messages: Annotated[list[str], add] = []
class SearchSourcesObjectsUpdate(LoggerUpdate):
analysis_objects: list[str] = []
analysis_sources: list[DocumentSource] = []
class ObjectSourceInput(LoggerUpdate):
object_source_combination: tuple[str, DocumentSource]
class ObjectSourceResearchUpdate(LoggerUpdate):
object_source_research_results: Annotated[list[Dict[str, str]], add] = []
class ObjectInformationInput(LoggerUpdate):
object_information: Dict[str, str]
class ObjectResearchInformationUpdate(LoggerUpdate):
object_research_information_results: Annotated[list[Dict[str, str]], add] = []
class ObjectResearchUpdate(LoggerUpdate):
object_research_results: Annotated[list[Dict[str, str]], add] = []
class ResearchUpdate(LoggerUpdate):
research_results: str | None = None
## Graph Input State
class MainInput(CoreState):
pass
## Graph State
class MainState(
# This includes the core state
MainInput,
ToolChoiceInput,
ToolCallUpdate,
ToolChoiceUpdate,
SearchSourcesObjectsUpdate,
ObjectSourceResearchUpdate,
ObjectResearchInformationUpdate,
ObjectResearchUpdate,
ResearchUpdate,
):
pass
## Graph Output State - presently not used
class MainOutput(TypedDict):
log_messages: list[str]
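The Annotated[list[str], add] fields above are LangGraph reducers: partial updates from parallel branches are concatenated rather than overwritten. A toy illustration of that merge semantics (not LangGraph's internal implementation):

# Toy illustration of the `add` reducer applied to log_messages updates.
from operator import add

from pydantic import BaseModel


class LoggerUpdate(BaseModel):
    log_messages: list[str] = []


def merge_log_messages(current: list[str], update: list[str]) -> list[str]:
    # The reducer named in Annotated[..., add] is applied like this on each update.
    return add(current, update)


state = merge_log_messages([], LoggerUpdate(log_messages=["Agent 1 Task done"]).log_messages)
state = merge_log_messages(state, LoggerUpdate(log_messages=["Agent Step 2 done for one object"]).log_messages)
print(state)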


@@ -156,6 +156,7 @@ def generate_initial_answer(
for tool_response in yield_search_responses(
query=question,
get_retrieved_sections=lambda: answer_generation_documents.context_documents,
get_reranked_sections=lambda: answer_generation_documents.streaming_documents,
get_final_context_sections=lambda: answer_generation_documents.context_documents,
search_query_info=query_info,
get_section_relevance=lambda: relevance_list,


@@ -183,6 +183,7 @@ def generate_validate_refined_answer(
for tool_response in yield_search_responses(
query=question,
get_retrieved_sections=lambda: answer_generation_documents.context_documents,
get_reranked_sections=lambda: answer_generation_documents.streaming_documents,
get_final_context_sections=lambda: answer_generation_documents.context_documents,
search_query_info=query_info,
get_section_relevance=lambda: relevance_list,


@@ -57,6 +57,7 @@ def format_results(
for tool_response in yield_search_responses(
query=state.question,
get_retrieved_sections=lambda: reranked_documents,
get_reranked_sections=lambda: state.retrieved_documents,
get_final_context_sections=lambda: reranked_documents,
search_query_info=query_info,
get_section_relevance=lambda: relevance_list,


@@ -13,7 +13,9 @@ from onyx.tools.tool_implementations.search.search_tool import (
SEARCH_RESPONSE_SUMMARY_ID,
)
from onyx.tools.tool_implementations.search.search_tool import SearchResponseSummary
from onyx.tools.tool_implementations.search.search_utils import section_to_llm_doc
from onyx.tools.tool_implementations.search.search_utils import (
context_from_inference_section,
)
from onyx.tools.tool_implementations.search_like_tool_utils import (
FINAL_CONTEXT_DOCUMENTS_ID,
)
@@ -57,7 +59,9 @@ def basic_use_tool_response(
search_response_summary = cast(SearchResponseSummary, yield_item.response)
for section in search_response_summary.top_sections:
if section.center_chunk.document_id not in initial_search_results:
initial_search_results.append(section_to_llm_doc(section))
initial_search_results.append(
context_from_inference_section(section)
)
new_tool_call_chunk = AIMessageChunk(content="")
if not agent_config.behavior.skip_gen_ai_answer_generation:


@@ -8,10 +8,6 @@ from langgraph.graph.state import CompiledStateGraph
from onyx.agents.agent_search.basic.graph_builder import basic_graph_builder
from onyx.agents.agent_search.basic.states import BasicInput
from onyx.agents.agent_search.dc_search_analysis.graph_builder import (
divide_and_conquer_graph_builder,
)
from onyx.agents.agent_search.dc_search_analysis.states import MainInput as DCMainInput
from onyx.agents.agent_search.deep_search.main.graph_builder import (
main_graph_builder as main_graph_builder_a,
)
@@ -86,7 +82,7 @@ def _parse_agent_event(
def manage_sync_streaming(
compiled_graph: CompiledStateGraph,
config: GraphConfig,
graph_input: BasicInput | MainInput | DCMainInput,
graph_input: BasicInput | MainInput,
) -> Iterable[StreamEvent]:
message_id = config.persistence.message_id if config.persistence else None
for event in compiled_graph.stream(
@@ -100,7 +96,7 @@ def manage_sync_streaming(
def run_graph(
compiled_graph: CompiledStateGraph,
config: GraphConfig,
input: BasicInput | MainInput | DCMainInput,
input: BasicInput | MainInput,
) -> AnswerStream:
config.behavior.perform_initial_search_decomposition = (
INITIAL_SEARCH_DECOMPOSITION_ENABLED
@@ -150,16 +146,6 @@ def run_basic_graph(
return run_graph(compiled_graph, config, input)
def run_dc_graph(
config: GraphConfig,
) -> AnswerStream:
graph = divide_and_conquer_graph_builder()
compiled_graph = graph.compile()
input = DCMainInput(log_messages=[])
config.inputs.search_request.query = config.inputs.search_request.query.strip()
return run_graph(compiled_graph, config, input)
if __name__ == "__main__":
for _ in range(1):
query_start_time = datetime.now()


@@ -180,35 +180,3 @@ def binary_string_test_after_answer_separator(
relevant_text = text.split(f"{separator}")[-1]
return binary_string_test(relevant_text, positive_value)
def build_dc_search_prompt(
question: str,
original_question: str,
docs: list[InferenceSection],
persona_specification: str,
config: LLMConfig,
) -> list[SystemMessage | HumanMessage | AIMessage | ToolMessage]:
system_message = SystemMessage(
content=persona_specification,
)
date_str = build_date_time_string()
docs_str = format_docs(docs)
docs_str = trim_prompt_piece(
config,
docs_str,
SUB_QUESTION_RAG_PROMPT + question + original_question + date_str,
)
human_message = HumanMessage(
content=SUB_QUESTION_RAG_PROMPT.format(
question=question,
original_question=original_question,
context=docs_str,
date_prompt=date_str,
)
)
return [system_message, human_message]


@@ -321,10 +321,8 @@ def dispatch_separated(
sep: str = DISPATCH_SEP_CHAR,
) -> list[BaseMessage_Content]:
num = 1
accumulated_tokens = ""
streamed_tokens: list[BaseMessage_Content] = []
for token in tokens:
accumulated_tokens += cast(str, token.content)
content = cast(str, token.content)
if sep in content:
sub_question_parts = content.split(sep)
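dispatch_separated splits streamed token content on a separator character and advances the sub-question number as separators appear. A simplified stand-in for that logic; the separator value here is illustrative, not necessarily the real DISPATCH_SEP_CHAR:

# Simplified stand-in for the separator-splitting idea in dispatch_separated.
from collections.abc import Iterator

DISPATCH_SEP_CHAR = "\x1d"  # illustrative separator constant


def split_dispatch(tokens: Iterator[str]) -> list[tuple[int, str]]:
    num = 1
    dispatched: list[tuple[int, str]] = []
    for content in tokens:
        if DISPATCH_SEP_CHAR in content:
            parts = content.split(DISPATCH_SEP_CHAR)
            # text before a separator belongs to the current number,
            # text after it to the next one
            for i, part in enumerate(parts):
                if part:
                    dispatched.append((num + i, part))
            num += len(parts) - 1
        elif content:
            dispatched.append((num, content))
    return dispatched


print(split_dispatch(iter(["What is", " X?\x1dWhat is Y?"])))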


@@ -23,7 +23,6 @@ from onyx.utils.url import add_url_params
from onyx.utils.variable_functionality import fetch_versioned_implementation
from shared_configs.configs import MULTI_TENANT
HTML_EMAIL_TEMPLATE = """\
<!DOCTYPE html>
<html lang="en">


@@ -56,7 +56,6 @@ from httpx_oauth.oauth2 import OAuth2Token
from pydantic import BaseModel
from sqlalchemy.ext.asyncio import AsyncSession
from ee.onyx.configs.app_configs import ANONYMOUS_USER_COOKIE_NAME
from onyx.auth.api_key import get_hashed_api_key_from_request
from onyx.auth.email_utils import send_forgot_password_email
from onyx.auth.email_utils import send_user_verification_email
@@ -361,6 +360,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
reason="Password must contain at least one special character from the following set: "
f"{PASSWORD_SPECIAL_CHARS}."
)
return
async def oauth_callback(
@@ -514,25 +514,6 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
return user
async def on_after_login(
self,
user: User,
request: Optional[Request] = None,
response: Optional[Response] = None,
) -> None:
try:
if response and request and ANONYMOUS_USER_COOKIE_NAME in request.cookies:
response.delete_cookie(
ANONYMOUS_USER_COOKIE_NAME,
# Ensure cookie deletion doesn't override other cookies by setting the same path/domain
path="/",
domain=None,
secure=WEB_DOMAIN.startswith("https"),
)
logger.debug(f"Deleted anonymous user cookie for user {user.email}")
except Exception:
logger.exception("Error deleting anonymous user cookie")
async def on_after_register(
self, user: User, request: Optional[Request] = None
) -> None:
@@ -1322,7 +1303,6 @@ def get_oauth_router(
# Login user
response = await backend.login(strategy, user)
await user_manager.on_after_login(user, request, response)
# Prepare redirect response
if tenant_id is None:
# Use URL utility to add parameters
@@ -1332,14 +1312,9 @@ def get_oauth_router(
# No parameters to add
redirect_response = RedirectResponse(next_url, status_code=302)
# Copy headers from auth response to redirect response, with special handling for Set-Cookie
# Copy headers and other attributes from 'response' to 'redirect_response'
for header_name, header_value in response.headers.items():
# FastAPI can have multiple Set-Cookie headers as a list
if header_name.lower() == "set-cookie" and isinstance(header_value, list):
for cookie_value in header_value:
redirect_response.headers.append(header_name, cookie_value)
else:
redirect_response.headers[header_name] = header_value
redirect_response.headers[header_name] = header_value
if hasattr(response, "body"):
redirect_response.body = response.body


@@ -1,6 +1,5 @@
import logging
import multiprocessing
import os
import time
from typing import Any
from typing import cast
@@ -306,7 +305,7 @@ def wait_for_db(sender: Any, **kwargs: Any) -> None:
def on_secondary_worker_init(sender: Any, **kwargs: Any) -> None:
logger.info(f"Running as a secondary celery worker: pid={os.getpid()}")
logger.info("Running as a secondary celery worker.")
# Set up variables for waiting on primary worker
WAIT_INTERVAL = 5


@@ -1,7 +0,0 @@
from celery import Celery
import onyx.background.celery.apps.app_base as app_base
celery_app = Celery(__name__)
celery_app.config_from_object("onyx.background.celery.configs.client")
celery_app.Task = app_base.TenantAwareTask # type: ignore [misc]


@@ -111,7 +111,6 @@ celery_app.autodiscover_tasks(
"onyx.background.celery.tasks.vespa",
"onyx.background.celery.tasks.connector_deletion",
"onyx.background.celery.tasks.doc_permission_syncing",
"onyx.background.celery.tasks.user_file_folder_sync",
"onyx.background.celery.tasks.indexing",
"onyx.background.celery.tasks.tenant_provisioning",
]


@@ -1,5 +1,4 @@
import logging
import os
from typing import Any
from typing import cast
@@ -96,7 +95,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
app_base.wait_for_db(sender, **kwargs)
app_base.wait_for_vespa_or_shutdown(sender, **kwargs)
logger.info(f"Running as the primary celery worker: pid={os.getpid()}")
logger.info("Running as the primary celery worker.")
# Less startup checks in multi-tenant case
if MULTI_TENANT:
@@ -175,9 +174,6 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
f"search_settings={attempt.search_settings_id}"
)
logger.warning(failure_reason)
logger.exception(
f"Marking attempt {attempt.id} as canceled due to validation error 2"
)
mark_attempt_canceled(attempt.id, db_session, failure_reason)
@@ -289,6 +285,5 @@ celery_app.autodiscover_tasks(
"onyx.background.celery.tasks.shared",
"onyx.background.celery.tasks.vespa",
"onyx.background.celery.tasks.llm_model_update",
"onyx.background.celery.tasks.user_file_folder_sync",
]
)


@@ -1,16 +0,0 @@
import onyx.background.celery.configs.base as shared_config
broker_url = shared_config.broker_url
broker_connection_retry_on_startup = shared_config.broker_connection_retry_on_startup
broker_pool_limit = shared_config.broker_pool_limit
broker_transport_options = shared_config.broker_transport_options
redis_socket_keepalive = shared_config.redis_socket_keepalive
redis_retry_on_timeout = shared_config.redis_retry_on_timeout
redis_backend_health_check_interval = shared_config.redis_backend_health_check_interval
result_backend = shared_config.result_backend
result_expires = shared_config.result_expires # 86400 seconds is the default
task_default_priority = shared_config.task_default_priority
task_acks_late = shared_config.task_acks_late


@@ -64,15 +64,6 @@ beat_task_templates.extend(
"expires": BEAT_EXPIRES_DEFAULT,
},
},
{
"name": "check-for-user-file-folder-sync",
"task": OnyxCeleryTask.CHECK_FOR_USER_FILE_FOLDER_SYNC,
"schedule": timedelta(seconds=30),
"options": {
"priority": OnyxCeleryPriority.MEDIUM,
"expires": BEAT_EXPIRES_DEFAULT,
},
},
{
"name": "check-for-pruning",
"task": OnyxCeleryTask.CHECK_FOR_PRUNING,


@@ -389,8 +389,6 @@ def monitor_connector_deletion_taskset(
db_session=db_session,
cc_pair_id=cc_pair_id,
)
credential_id_to_delete: int | None = None
connector_id_to_delete: int | None = None
if not cc_pair:
task_logger.warning(
f"Connector deletion - cc_pair not found: cc_pair={cc_pair_id}"
@@ -445,35 +443,26 @@ def monitor_connector_deletion_taskset(
db_session=db_session,
)
# Store IDs before potentially expiring cc_pair
connector_id_to_delete = cc_pair.connector_id
credential_id_to_delete = cc_pair.credential_id
# Explicitly delete document by connector credential pair records before deleting the connector
# This is needed because connector_id is a primary key in that table and cascading deletes won't work
delete_all_documents_by_connector_credential_pair__no_commit(
db_session=db_session,
connector_id=connector_id_to_delete,
credential_id=credential_id_to_delete,
connector_id=cc_pair.connector_id,
credential_id=cc_pair.credential_id,
)
# Flush to ensure document deletion happens before connector deletion
db_session.flush()
# Expire the cc_pair to ensure SQLAlchemy doesn't try to manage its state
# related to the deleted DocumentByConnectorCredentialPair during commit
db_session.expire(cc_pair)
# finally, delete the cc-pair
delete_connector_credential_pair__no_commit(
db_session=db_session,
connector_id=connector_id_to_delete,
credential_id=credential_id_to_delete,
connector_id=cc_pair.connector_id,
credential_id=cc_pair.credential_id,
)
# if there are no credentials left, delete the connector
connector = fetch_connector_by_id(
db_session=db_session,
connector_id=connector_id_to_delete,
connector_id=cc_pair.connector_id,
)
if not connector or not len(connector.credentials):
task_logger.info(
@@ -506,15 +495,15 @@ def monitor_connector_deletion_taskset(
task_logger.exception(
f"Connector deletion exceptioned: "
f"cc_pair={cc_pair_id} connector={connector_id_to_delete} credential={credential_id_to_delete}"
f"cc_pair={cc_pair_id} connector={cc_pair.connector_id} credential={cc_pair.credential_id}"
)
raise e
task_logger.info(
f"Connector deletion succeeded: "
f"cc_pair={cc_pair_id} "
f"connector={connector_id_to_delete} "
f"credential={credential_id_to_delete} "
f"connector={cc_pair.connector_id} "
f"credential={cc_pair.credential_id} "
f"docs_deleted={fence_data.num_tasks}"
)
@@ -564,7 +553,7 @@ def validate_connector_deletion_fences(
def validate_connector_deletion_fence(
tenant_id: str,
key_bytes: bytes,
queued_upsert_tasks: set[str],
queued_tasks: set[str],
r: Redis,
) -> None:
"""Checks for the error condition where an indexing fence is set but the associated celery tasks don't exist.
@@ -651,7 +640,7 @@ def validate_connector_deletion_fence(
member_bytes = cast(bytes, member)
member_str = member_bytes.decode("utf-8")
if member_str in queued_upsert_tasks:
if member_str in queued_tasks:
continue
tasks_not_in_celery += 1


@@ -886,8 +886,11 @@ def monitor_ccpair_permissions_taskset(
record_type=RecordType.PERMISSION_SYNC_PROGRESS,
data={
"cc_pair_id": cc_pair_id,
"total_docs_synced": initial if initial is not None else 0,
"remaining_docs_to_sync": remaining,
"id": payload.id if payload else None,
"total_docs": initial if initial is not None else 0,
"remaining_docs": remaining,
"synced_docs": (initial - remaining) if initial is not None else 0,
"is_complete": remaining == 0,
},
tenant_id=tenant_id,
)
@@ -903,13 +906,6 @@ def monitor_ccpair_permissions_taskset(
f"num_synced={initial}"
)
# Add telemetry for permission syncing complete
optional_telemetry(
record_type=RecordType.PERMISSION_SYNC_COMPLETE,
data={"cc_pair_id": cc_pair_id},
tenant_id=tenant_id,
)
update_sync_record_status(
db_session=db_session,
entity_id=cc_pair_id,


@@ -365,7 +365,6 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
Occasionally does some validation of existing state to clear up error conditions"""
time_start = time.monotonic()
task_logger.warning("check_for_indexing - Starting")
tasks_created = 0
locked = False
@@ -434,9 +433,7 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
lock_beat.reacquire()
cc_pair_ids: list[int] = []
with get_session_with_current_tenant() as db_session:
cc_pairs = fetch_connector_credential_pairs(
db_session, include_user_files=True
)
cc_pairs = fetch_connector_credential_pairs(db_session)
for cc_pair_entry in cc_pairs:
cc_pair_ids.append(cc_pair_entry.id)
@@ -455,18 +452,12 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
not search_settings_instance.status.is_current()
and not search_settings_instance.background_reindex_enabled
):
task_logger.warning("SKIPPING DUE TO NON-LIVE SEARCH SETTINGS")
continue
redis_connector_index = redis_connector.new_index(
search_settings_instance.id
)
if redis_connector_index.fenced:
task_logger.info(
f"check_for_indexing - Skipping fenced connector: "
f"cc_pair={cc_pair_id} search_settings={search_settings_instance.id}"
)
continue
cc_pair = get_connector_credential_pair_from_id(
@@ -474,9 +465,6 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
cc_pair_id=cc_pair_id,
)
if not cc_pair:
task_logger.warning(
f"check_for_indexing - CC pair not found: cc_pair={cc_pair_id}"
)
continue
last_attempt = get_last_attempt_for_cc_pair(
@@ -490,20 +478,7 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
secondary_index_building=len(search_settings_list) > 1,
db_session=db_session,
):
task_logger.info(
f"check_for_indexing - Not indexing cc_pair_id: {cc_pair_id} "
f"search_settings={search_settings_instance.id}, "
f"last_attempt={last_attempt.id if last_attempt else None}, "
f"secondary_index_building={len(search_settings_list) > 1}"
)
continue
else:
task_logger.info(
f"check_for_indexing - Will index cc_pair_id: {cc_pair_id} "
f"search_settings={search_settings_instance.id}, "
f"last_attempt={last_attempt.id if last_attempt else None}, "
f"secondary_index_building={len(search_settings_list) > 1}"
)
reindex = False
if search_settings_instance.status.is_current():
@@ -542,12 +517,6 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
f"search_settings={search_settings_instance.id}"
)
tasks_created += 1
else:
task_logger.info(
f"Failed to create indexing task: "
f"cc_pair={cc_pair.id} "
f"search_settings={search_settings_instance.id}"
)
lock_beat.reacquire()
@@ -1180,9 +1149,6 @@ def connector_indexing_proxy_task(
if result.status == IndexingWatchdogTerminalStatus.TERMINATED_BY_SIGNAL:
try:
with get_session_with_current_tenant() as db_session:
logger.exception(
f"Marking attempt {index_attempt_id} as canceled due to termination signal"
)
mark_attempt_canceled(
index_attempt_id,
db_session,


@@ -371,7 +371,6 @@ def should_index(
# don't kick off indexing for `NOT_APPLICABLE` sources
if connector.source == DocumentSource.NOT_APPLICABLE:
print(f"Not indexing cc_pair={cc_pair.id}: NOT_APPLICABLE source")
return False
# User can still manually create single indexing attempts via the UI for the
@@ -381,9 +380,6 @@ def should_index(
search_settings_instance.status == IndexModelStatus.PRESENT
and secondary_index_building
):
print(
f"Not indexing cc_pair={cc_pair.id}: DISABLE_INDEX_UPDATE_ON_SWAP is True and secondary index building"
)
return False
# When switching over models, always index at least once
@@ -392,31 +388,19 @@ def should_index(
# No new index if the last index attempt succeeded
# Once is enough. The model will never be able to swap otherwise.
if last_index.status == IndexingStatus.SUCCESS:
print(
f"Not indexing cc_pair={cc_pair.id}: FUTURE model with successful last index attempt={last_index.id}"
)
return False
# No new index if the last index attempt is waiting to start
if last_index.status == IndexingStatus.NOT_STARTED:
print(
f"Not indexing cc_pair={cc_pair.id}: FUTURE model with NOT_STARTED last index attempt={last_index.id}"
)
return False
# No new index if the last index attempt is running
if last_index.status == IndexingStatus.IN_PROGRESS:
print(
f"Not indexing cc_pair={cc_pair.id}: FUTURE model with IN_PROGRESS last index attempt={last_index.id}"
)
return False
else:
if (
connector.id == 0 or connector.source == DocumentSource.INGESTION_API
): # Ingestion API
print(
f"Not indexing cc_pair={cc_pair.id}: FUTURE model with Ingestion API source"
)
return False
return True
@@ -428,9 +412,6 @@ def should_index(
or connector.id == 0
or connector.source == DocumentSource.INGESTION_API
):
print(
f"Not indexing cc_pair={cc_pair.id}: Connector is paused or is Ingestion API"
)
return False
if search_settings_instance.status.is_current():
@@ -443,16 +424,11 @@ def should_index(
return True
if connector.refresh_freq is None:
print(f"Not indexing cc_pair={cc_pair.id}: refresh_freq is None")
return False
current_db_time = get_db_current_time(db_session)
time_since_index = current_db_time - last_index.time_updated
if time_since_index.total_seconds() < connector.refresh_freq:
print(
f"Not indexing cc_pair={cc_pair.id}: Last index attempt={last_index.id} "
f"too recent ({time_since_index.total_seconds()}s < {connector.refresh_freq}s)"
)
return False
return True
@@ -532,13 +508,6 @@ def try_creating_indexing_task(
custom_task_id = redis_connector_index.generate_generator_task_id()
# Determine which queue to use based on whether this is a user file
queue = (
OnyxCeleryQueues.USER_FILES_INDEXING
if cc_pair.is_user_file
else OnyxCeleryQueues.CONNECTOR_INDEXING
)
# when the task is sent, we have yet to finish setting up the fence
# therefore, the task must contain code that blocks until the fence is ready
result = celery_app.send_task(
@@ -549,7 +518,7 @@ def try_creating_indexing_task(
search_settings_id=search_settings.id,
tenant_id=tenant_id,
),
queue=queue,
queue=OnyxCeleryQueues.CONNECTOR_INDEXING,
task_id=custom_task_id,
priority=OnyxCeleryPriority.MEDIUM,
)
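try_creating_indexing_task dispatches by name with an explicit queue, a deterministic task id, and a priority. A generic Celery sketch of that call shape; the app, task name, and broker URL are placeholders, and a reachable broker is assumed:

# Generic Celery dispatch sketch: explicit queue, deterministic task id, priority.
from celery import Celery

celery_app = Celery("worker", broker="redis://localhost:6379/0")

custom_task_id = "connector_indexing_generator:42:7"  # e.g. cc_pair/search_settings ids

result = celery_app.send_task(
    "tasks.connector_indexing_proxy",   # looked up by name on the worker side
    kwargs={"cc_pair_id": 42, "search_settings_id": 7, "tenant_id": "public"},
    queue="connector_indexing",
    task_id=custom_task_id,
    priority=5,
)
print(result.id)  # equals custom_task_id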


@@ -6,7 +6,6 @@ from tenacity import wait_random_exponential
from onyx.document_index.interfaces import DocumentIndex
from onyx.document_index.interfaces import VespaDocumentFields
from onyx.document_index.interfaces import VespaDocumentUserFields
class RetryDocumentIndex:
@@ -53,13 +52,11 @@ class RetryDocumentIndex:
*,
tenant_id: str,
chunk_count: int | None,
fields: VespaDocumentFields | None,
user_fields: VespaDocumentUserFields | None,
fields: VespaDocumentFields,
) -> int:
return self.index.update_single(
doc_id,
tenant_id=tenant_id,
chunk_count=chunk_count,
fields=fields,
user_fields=user_fields,
)
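RetryDocumentIndex wraps index calls with tenacity's exponential backoff. A sketch of that wrapper shape against a fake flaky index; the retry policy values are assumptions, not the ones Onyx uses:

# Sketch of a retrying wrapper in the spirit of RetryDocumentIndex.
from tenacity import retry, stop_after_attempt, wait_random_exponential


class FlakyIndex:
    def __init__(self) -> None:
        self.calls = 0

    def update_single(self, doc_id: str, *, fields: dict) -> int:
        self.calls += 1
        if self.calls < 3:
            raise ConnectionError("transient index error")
        return 1  # chunks updated


class RetryIndex:
    def __init__(self, index: FlakyIndex) -> None:
        self.index = index

    @retry(wait=wait_random_exponential(multiplier=1, max=10), stop=stop_after_attempt(5))
    def update_single(self, doc_id: str, *, fields: dict) -> int:
        return self.index.update_single(doc_id, fields=fields)


print(RetryIndex(FlakyIndex()).update_single("doc-1", fields={"boost": 2}))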


@@ -164,7 +164,6 @@ def document_by_cc_pair_cleanup_task(
tenant_id=tenant_id,
chunk_count=doc.chunk_count,
fields=fields,
user_fields=None,
)
# there are still other cc_pair references to the doc, so just resync to Vespa


@@ -1,266 +0,0 @@
import time
from typing import List
from celery import shared_task
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from tenacity import RetryError
from onyx.background.celery.apps.app_base import task_logger
from onyx.background.celery.tasks.shared.RetryDocumentIndex import RetryDocumentIndex
from onyx.background.celery.tasks.shared.tasks import LIGHT_SOFT_TIME_LIMIT
from onyx.background.celery.tasks.shared.tasks import LIGHT_TIME_LIMIT
from onyx.background.celery.tasks.shared.tasks import OnyxCeleryTaskCompletionStatus
from onyx.configs.app_configs import JOB_TIMEOUT
from onyx.configs.constants import CELERY_USER_FILE_FOLDER_SYNC_BEAT_LOCK_TIMEOUT
from onyx.configs.constants import OnyxCeleryTask
from onyx.configs.constants import OnyxRedisLocks
from onyx.db.connector_credential_pair import (
get_connector_credential_pairs_with_user_files,
)
from onyx.db.document import get_document
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.models import ConnectorCredentialPair
from onyx.db.models import Document
from onyx.db.models import DocumentByConnectorCredentialPair
from onyx.db.search_settings import get_active_search_settings
from onyx.db.user_documents import fetch_user_files_for_documents
from onyx.db.user_documents import fetch_user_folders_for_documents
from onyx.document_index.factory import get_default_document_index
from onyx.document_index.interfaces import VespaDocumentUserFields
from onyx.httpx.httpx_pool import HttpxPool
from onyx.redis.redis_pool import get_redis_client
from onyx.utils.logger import setup_logger
logger = setup_logger()
@shared_task(
name=OnyxCeleryTask.CHECK_FOR_USER_FILE_FOLDER_SYNC,
ignore_result=True,
soft_time_limit=JOB_TIMEOUT,
trail=False,
bind=True,
)
def check_for_user_file_folder_sync(self: Task, *, tenant_id: str) -> bool | None:
"""Runs periodically to check for documents that need user file folder metadata updates.
This task fetches all connector credential pairs with user files, gets the documents
associated with them, and updates the user file and folder metadata in Vespa.
"""
time_start = time.monotonic()
r = get_redis_client()
lock_beat: RedisLock = r.lock(
OnyxRedisLocks.CHECK_USER_FILE_FOLDER_SYNC_BEAT_LOCK,
timeout=CELERY_USER_FILE_FOLDER_SYNC_BEAT_LOCK_TIMEOUT,
)
# these tasks should never overlap
if not lock_beat.acquire(blocking=False):
return None
try:
with get_session_with_current_tenant() as db_session:
# Get all connector credential pairs that have user files
cc_pairs = get_connector_credential_pairs_with_user_files(db_session)
if not cc_pairs:
task_logger.info("No connector credential pairs with user files found")
return True
# Get all documents associated with these cc_pairs
document_ids = get_documents_for_cc_pairs(cc_pairs, db_session)
if not document_ids:
task_logger.info(
"No documents found for connector credential pairs with user files"
)
return True
# Fetch current user file and folder IDs for these documents
doc_id_to_user_file_id = fetch_user_files_for_documents(
document_ids=document_ids, db_session=db_session
)
doc_id_to_user_folder_id = fetch_user_folders_for_documents(
document_ids=document_ids, db_session=db_session
)
# Update Vespa metadata for each document
for doc_id in document_ids:
user_file_id = doc_id_to_user_file_id.get(doc_id)
user_folder_id = doc_id_to_user_folder_id.get(doc_id)
if user_file_id is not None or user_folder_id is not None:
# Schedule a task to update the document metadata
update_user_file_folder_metadata.apply_async(
args=(doc_id,), # Use tuple instead of list for args
kwargs={
"tenant_id": tenant_id,
"user_file_id": user_file_id,
"user_folder_id": user_folder_id,
},
queue="vespa_metadata_sync",
)
task_logger.info(
f"Scheduled metadata updates for {len(document_ids)} documents. "
f"Elapsed time: {time.monotonic() - time_start:.2f}s"
)
return True
except Exception as e:
task_logger.exception(f"Error in check_for_user_file_folder_sync: {e}")
return False
finally:
lock_beat.release()
def get_documents_for_cc_pairs(
cc_pairs: List[ConnectorCredentialPair], db_session: Session
) -> List[str]:
"""Get all document IDs associated with the given connector credential pairs."""
if not cc_pairs:
return []
cc_pair_ids = [cc_pair.id for cc_pair in cc_pairs]
# Query to get document IDs from DocumentByConnectorCredentialPair
# Note: DocumentByConnectorCredentialPair uses connector_id and credential_id, not cc_pair_id
doc_cc_pairs = (
db_session.query(Document.id)
.join(
DocumentByConnectorCredentialPair,
Document.id == DocumentByConnectorCredentialPair.id,
)
.filter(
db_session.query(ConnectorCredentialPair)
.filter(
ConnectorCredentialPair.id.in_(cc_pair_ids),
ConnectorCredentialPair.connector_id
== DocumentByConnectorCredentialPair.connector_id,
ConnectorCredentialPair.credential_id
== DocumentByConnectorCredentialPair.credential_id,
)
.exists()
)
.all()
)
return [doc_id for (doc_id,) in doc_cc_pairs]
@shared_task(
name=OnyxCeleryTask.UPDATE_USER_FILE_FOLDER_METADATA,
bind=True,
soft_time_limit=LIGHT_SOFT_TIME_LIMIT,
time_limit=LIGHT_TIME_LIMIT,
max_retries=3,
)
def update_user_file_folder_metadata(
self: Task,
document_id: str,
*,
tenant_id: str,
user_file_id: int | None,
user_folder_id: int | None,
) -> bool:
"""Updates the user file and folder metadata for a document in Vespa."""
start = time.monotonic()
completion_status = OnyxCeleryTaskCompletionStatus.UNDEFINED
try:
with get_session_with_current_tenant() as db_session:
active_search_settings = get_active_search_settings(db_session)
doc_index = get_default_document_index(
search_settings=active_search_settings.primary,
secondary_search_settings=active_search_settings.secondary,
httpx_client=HttpxPool.get("vespa"),
)
retry_index = RetryDocumentIndex(doc_index)
doc = get_document(document_id, db_session)
if not doc:
elapsed = time.monotonic() - start
task_logger.info(
f"doc={document_id} "
f"action=no_operation "
f"elapsed={elapsed:.2f}"
)
completion_status = OnyxCeleryTaskCompletionStatus.SKIPPED
return False
# Create user fields object with file and folder IDs
user_fields = VespaDocumentUserFields(
user_file_id=str(user_file_id) if user_file_id is not None else None,
user_folder_id=str(user_folder_id)
if user_folder_id is not None
else None,
)
# Update Vespa. OK if doc doesn't exist. Raises exception otherwise.
chunks_affected = retry_index.update_single(
document_id,
tenant_id=tenant_id,
chunk_count=doc.chunk_count,
fields=None, # We're only updating user fields
user_fields=user_fields,
)
elapsed = time.monotonic() - start
task_logger.info(
f"doc={document_id} "
f"action=user_file_folder_sync "
f"user_file_id={user_file_id} "
f"user_folder_id={user_folder_id} "
f"chunks={chunks_affected} "
f"elapsed={elapsed:.2f}"
)
completion_status = OnyxCeleryTaskCompletionStatus.SUCCEEDED
return True
except SoftTimeLimitExceeded:
task_logger.info(f"SoftTimeLimitExceeded exception. doc={document_id}")
completion_status = OnyxCeleryTaskCompletionStatus.SOFT_TIME_LIMIT
except Exception as ex:
e: Exception | None = None
while True:
if isinstance(ex, RetryError):
task_logger.warning(
f"Tenacity retry failed: num_attempts={ex.last_attempt.attempt_number}"
)
# only set the inner exception if it is of type Exception
e_temp = ex.last_attempt.exception()
if isinstance(e_temp, Exception):
e = e_temp
else:
e = ex
task_logger.exception(
f"update_user_file_folder_metadata exceptioned: doc={document_id}"
)
completion_status = OnyxCeleryTaskCompletionStatus.RETRYABLE_EXCEPTION
if (
self.max_retries is not None
and self.request.retries >= self.max_retries
):
completion_status = (
OnyxCeleryTaskCompletionStatus.NON_RETRYABLE_EXCEPTION
)
# Exponential backoff from 2^4 to 2^6 ... i.e. 16, 32, 64
countdown = 2 ** (self.request.retries + 4)
self.retry(exc=e, countdown=countdown) # this will raise a celery exception
break # we won't hit this, but it looks weird not to have it
finally:
task_logger.info(
f"update_user_file_folder_metadata completed: status={completion_status.value} doc={document_id}"
)
return False
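For reference, the check_for_user_file_folder_sync task above follows a standard guard shape: take the Redis lock non-blocking so overlapping beat runs exit immediately, do the work, and always release the lock in a finally block. A minimal sketch of that shape with a hypothetical lock name and a placeholder work function (not the real Onyx helpers):

from redis import Redis
from redis.lock import Lock as RedisLock


def do_periodic_work() -> None:
    """Placeholder for the real DB queries and per-document task fan-out."""


def run_guarded_beat_task(r: Redis) -> bool | None:
    # Hypothetical lock name and timeout; the real task uses OnyxRedisLocks constants.
    lock: RedisLock = r.lock("example:beat_lock", timeout=120)

    # Non-blocking acquire: if another beat run still holds the lock, skip this cycle.
    if not lock.acquire(blocking=False):
        return None

    try:
        do_periodic_work()
        return True
    except Exception:
        return False
    finally:
        lock.release()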

View File

@@ -80,8 +80,7 @@ def check_for_vespa_sync_task(self: Task, *, tenant_id: str) -> bool | None:
"""Runs periodically to check if any document needs syncing.
Generates sets of tasks for Celery if syncing is needed."""
# Useful for debugging timing issues with reacquisitions.
# TODO: remove once more generalized logging is in place
# Useful for debugging timing issues with reacquisitions. TODO: remove once more generalized logging is in place
task_logger.info("check_for_vespa_sync_task started")
time_start = time.monotonic()
@@ -573,7 +572,6 @@ def vespa_metadata_sync_task(self: Task, document_id: str, *, tenant_id: str) ->
tenant_id=tenant_id,
chunk_count=doc.chunk_count,
fields=fields,
user_fields=None,
)
# update db last. Worst case = we crash right before this and

View File

@@ -1,20 +0,0 @@
"""Factory stub for running celery worker / celery beat.
This code is different from the primary/beat stubs because there is no EE version to
fetch. Port over the code in those files if we add an EE version of this worker.
This is an app stub purely for sending tasks as a client.
"""
from celery import Celery
from onyx.utils.variable_functionality import set_is_ee_based_on_env_variable
set_is_ee_based_on_env_variable()
def get_app() -> Celery:
from onyx.background.celery.apps.client import celery_app
return celery_app
app = get_app()
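Since this stub only builds a client-side app, work is enqueued by task name through send_task rather than by importing task functions, mirroring the send_task call shown earlier. A minimal usage sketch with a placeholder broker URL, task name, and kwargs (none of these are documented Onyx values):

from celery import Celery

# Stand-in for the module-level `app = get_app()` above; the broker URL is a placeholder.
client_app = Celery("client", broker="redis://localhost:6379/0")

# send_task enqueues by name without needing the task implementation importable here,
# which is exactly what a client-only app is for.
result = client_app.send_task(
    "example_task_name",               # hypothetical task name
    kwargs={"tenant_id": "tenant_1"},  # illustrative kwargs
    queue="connector_indexing",
    priority=5,
)
print(result.id)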

View File

@@ -56,6 +56,7 @@ from onyx.indexing.indexing_pipeline import build_indexing_pipeline
from onyx.natural_language_processing.search_nlp_models import (
InformationContentClassificationModel,
)
from onyx.redis.redis_connector import RedisConnector
from onyx.utils.logger import setup_logger
from onyx.utils.logger import TaskAttemptSingleton
from onyx.utils.telemetry import create_milestone_and_report
@@ -273,6 +274,7 @@ def _run_indexing(
"Search settings must be set for indexing. This should not be possible."
)
# search_settings = index_attempt_start.search_settings
db_connector = index_attempt_start.connector_credential_pair.connector
db_credential = index_attempt_start.connector_credential_pair.credential
ctx = RunIndexingContext(
@@ -577,8 +579,11 @@ def _run_indexing(
data={
"index_attempt_id": index_attempt_id,
"cc_pair_id": ctx.cc_pair_id,
"current_docs_indexed": document_count,
"current_chunks_indexed": chunk_count,
"connector_id": ctx.connector_id,
"credential_id": ctx.credential_id,
"total_docs_indexed": document_count,
"total_chunks": chunk_count,
"batch_num": batch_num,
"source": ctx.source.value,
},
tenant_id=tenant_id,
@@ -599,15 +604,26 @@ def _run_indexing(
checkpoint=checkpoint,
)
# Add telemetry for completed indexing
redis_connector = RedisConnector(tenant_id, ctx.cc_pair_id)
redis_connector_index = redis_connector.new_index(
index_attempt_start.search_settings_id
)
final_progress = redis_connector_index.get_progress() or 0
optional_telemetry(
record_type=RecordType.INDEXING_COMPLETE,
data={
"index_attempt_id": index_attempt_id,
"cc_pair_id": ctx.cc_pair_id,
"connector_id": ctx.connector_id,
"credential_id": ctx.credential_id,
"total_docs_indexed": document_count,
"total_chunks": chunk_count,
"batch_count": batch_num,
"time_elapsed_seconds": time.monotonic() - start_time,
"source": ctx.source.value,
"redis_progress": final_progress,
},
tenant_id=tenant_id,
)
@@ -622,9 +638,6 @@ def _run_indexing(
# and mark the CCPair as invalid. This prevents the connector from being
# used in the future until the credentials are updated.
with get_session_with_current_tenant() as db_session_temp:
logger.exception(
f"Marking attempt {index_attempt_id} as canceled due to validation error."
)
mark_attempt_canceled(
index_attempt_id,
db_session_temp,
@@ -671,9 +684,6 @@ def _run_indexing(
elif isinstance(e, ConnectorStopSignal):
with get_session_with_current_tenant() as db_session_temp:
logger.exception(
f"Marking attempt {index_attempt_id} as canceled due to stop signal."
)
mark_attempt_canceled(
index_attempt_id,
db_session_temp,
@@ -736,7 +746,6 @@ def _run_indexing(
f"Connector succeeded: "
f"docs={document_count} chunks={chunk_count} elapsed={elapsed_time:.2f}s"
)
else:
mark_attempt_partially_succeeded(index_attempt_id, db_session_temp)
logger.info(

View File

@@ -10,7 +10,6 @@ from onyx.agents.agent_search.models import GraphPersistence
from onyx.agents.agent_search.models import GraphSearchConfig
from onyx.agents.agent_search.models import GraphTooling
from onyx.agents.agent_search.run_graph import run_basic_graph
from onyx.agents.agent_search.run_graph import run_dc_graph
from onyx.agents.agent_search.run_graph import run_main_graph
from onyx.chat.models import AgentAnswerPiece
from onyx.chat.models import AnswerPacket
@@ -143,18 +142,11 @@ class Answer:
yield from self._processed_stream
return
if self.graph_config.behavior.use_agentic_search:
run_langgraph = run_main_graph
elif (
self.graph_config.inputs.search_request.persona
and self.graph_config.inputs.search_request.persona.description.startswith(
"DivCon Beta Agent"
)
):
run_langgraph = run_dc_graph
else:
run_langgraph = run_basic_graph
run_langgraph = (
run_main_graph
if self.graph_config.behavior.use_agentic_search
else run_basic_graph
)
stream = run_langgraph(
self.graph_config,
)

View File

@@ -127,10 +127,6 @@ class StreamStopInfo(SubQuestionIdentifier):
return data
class UserKnowledgeFilePacket(BaseModel):
user_files: list[FileDescriptor]
class LLMRelevanceFilterResponse(BaseModel):
llm_selected_doc_indices: list[int]
@@ -198,6 +194,17 @@ class StreamingError(BaseModel):
stack_trace: str | None = None
class OnyxContext(BaseModel):
content: str
document_id: str
semantic_identifier: str
blurb: str
class OnyxContexts(BaseModel):
contexts: list[OnyxContext]
class OnyxAnswer(BaseModel):
answer: str | None
@@ -263,6 +270,7 @@ class PersonaOverrideConfig(BaseModel):
AnswerQuestionPossibleReturn = (
OnyxAnswerPiece
| CitationInfo
| OnyxContexts
| FileChatDisplay
| CustomToolResponse
| StreamingError

View File

@@ -29,6 +29,7 @@ from onyx.chat.models import LLMRelevanceFilterResponse
from onyx.chat.models import MessageResponseIDInfo
from onyx.chat.models import MessageSpecificCitations
from onyx.chat.models import OnyxAnswerPiece
from onyx.chat.models import OnyxContexts
from onyx.chat.models import PromptConfig
from onyx.chat.models import QADocsResponse
from onyx.chat.models import RefinedAnswerImprovement
@@ -36,14 +37,12 @@ from onyx.chat.models import StreamingError
from onyx.chat.models import StreamStopInfo
from onyx.chat.models import StreamStopReason
from onyx.chat.models import SubQuestionKey
from onyx.chat.models import UserKnowledgeFilePacket
from onyx.chat.prompt_builder.answer_prompt_builder import AnswerPromptBuilder
from onyx.chat.prompt_builder.answer_prompt_builder import default_build_system_message
from onyx.chat.prompt_builder.answer_prompt_builder import default_build_user_message
from onyx.configs.chat_configs import CHAT_TARGET_CHUNK_PERCENTAGE
from onyx.configs.chat_configs import DISABLE_LLM_CHOOSE_SEARCH
from onyx.configs.chat_configs import MAX_CHUNKS_FED_TO_CHAT
from onyx.configs.chat_configs import SELECTED_SECTIONS_MAX_WINDOW_PERCENTAGE
from onyx.configs.constants import AGENT_SEARCH_INITIAL_KEY
from onyx.configs.constants import BASIC_KEY
from onyx.configs.constants import MessageType
@@ -53,7 +52,6 @@ from onyx.context.search.enums import LLMEvaluationType
from onyx.context.search.enums import OptionalSearchSetting
from onyx.context.search.enums import QueryFlow
from onyx.context.search.enums import SearchType
from onyx.context.search.models import BaseFilters
from onyx.context.search.models import InferenceSection
from onyx.context.search.models import RetrievalDetails
from onyx.context.search.models import SearchRequest
@@ -67,7 +65,6 @@ from onyx.context.search.utils import relevant_sections_to_indices
from onyx.db.chat import attach_files_to_chat_message
from onyx.db.chat import create_db_search_doc
from onyx.db.chat import create_new_chat_message
from onyx.db.chat import create_search_doc_from_user_file
from onyx.db.chat import get_chat_message
from onyx.db.chat import get_chat_session_by_id
from onyx.db.chat import get_db_search_doc_by_id
@@ -84,16 +81,12 @@ from onyx.db.milestone import update_user_assistant_milestone
from onyx.db.models import SearchDoc as DbSearchDoc
from onyx.db.models import ToolCall
from onyx.db.models import User
from onyx.db.models import UserFile
from onyx.db.persona import get_persona_by_id
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.factory import get_default_document_index
from onyx.file_store.models import ChatFileType
from onyx.file_store.models import FileDescriptor
from onyx.file_store.models import InMemoryChatFile
from onyx.file_store.utils import load_all_chat_files
from onyx.file_store.utils import load_all_user_file_files
from onyx.file_store.utils import load_all_user_files
from onyx.file_store.utils import save_files
from onyx.llm.exceptions import GenAIDisabledException
from onyx.llm.factory import get_llms_for_persona
@@ -106,7 +99,6 @@ from onyx.server.query_and_chat.models import ChatMessageDetail
from onyx.server.query_and_chat.models import CreateChatMessageRequest
from onyx.server.utils import get_json_line
from onyx.tools.force import ForceUseTool
from onyx.tools.models import SearchToolOverrideKwargs
from onyx.tools.models import ToolResponse
from onyx.tools.tool import Tool
from onyx.tools.tool_constructor import construct_tools
@@ -139,6 +131,7 @@ from onyx.tools.tool_implementations.internet_search.internet_search_tool import
from onyx.tools.tool_implementations.search.search_tool import (
FINAL_CONTEXT_DOCUMENTS_ID,
)
from onyx.tools.tool_implementations.search.search_tool import SEARCH_DOC_CONTENT_ID
from onyx.tools.tool_implementations.search.search_tool import (
SEARCH_RESPONSE_SUMMARY_ID,
)
@@ -184,14 +177,11 @@ def _handle_search_tool_response_summary(
db_session: Session,
selected_search_docs: list[DbSearchDoc] | None,
dedupe_docs: bool = False,
user_files: list[UserFile] | None = None,
loaded_user_files: list[InMemoryChatFile] | None = None,
) -> tuple[QADocsResponse, list[DbSearchDoc], list[int] | None]:
response_summary = cast(SearchResponseSummary, packet.response)
is_extended = isinstance(packet, ExtendedToolResponse)
dropped_inds = None
if not selected_search_docs:
top_docs = chunks_or_sections_to_search_docs(response_summary.top_sections)
@@ -205,31 +195,9 @@ def _handle_search_tool_response_summary(
create_db_search_doc(server_search_doc=doc, db_session=db_session)
for doc in deduped_docs
]
else:
reference_db_search_docs = selected_search_docs
doc_ids = {doc.id for doc in reference_db_search_docs}
if user_files is not None:
for user_file in user_files:
if user_file.id not in doc_ids:
associated_chat_file = None
if loaded_user_files is not None:
associated_chat_file = next(
(
file
for file in loaded_user_files
if file.file_id == str(user_file.file_id)
),
None,
)
# Use create_search_doc_from_user_file to properly add the document to the database
if associated_chat_file is not None:
db_doc = create_search_doc_from_user_file(
user_file, associated_chat_file, db_session
)
reference_db_search_docs.append(db_doc)
response_docs = [
translate_db_search_doc_to_server_search_doc(db_search_doc)
for db_search_doc in reference_db_search_docs
@@ -287,10 +255,7 @@ def _handle_internet_search_tool_response_summary(
def _get_force_search_settings(
new_msg_req: CreateChatMessageRequest,
tools: list[Tool],
user_file_ids: list[int],
user_folder_ids: list[int],
new_msg_req: CreateChatMessageRequest, tools: list[Tool]
) -> ForceUseTool:
internet_search_available = any(
isinstance(tool, InternetSearchTool) for tool in tools
@@ -298,11 +263,8 @@ def _get_force_search_settings(
search_tool_available = any(isinstance(tool, SearchTool) for tool in tools)
if not internet_search_available and not search_tool_available:
if new_msg_req.force_user_file_search:
return ForceUseTool(force_use=True, tool_name=SearchTool._NAME)
else:
# Does not matter much which tool is set here as force is false and neither tool is available
return ForceUseTool(force_use=False, tool_name=SearchTool._NAME)
# Does not matter much which tool is set here as force is false and neither tool is available
return ForceUseTool(force_use=False, tool_name=SearchTool._NAME)
tool_name = SearchTool._NAME if search_tool_available else InternetSearchTool._NAME
# Currently, the internet search tool does not support query override
@@ -312,25 +274,12 @@ def _get_force_search_settings(
else None
)
# Create override_kwargs for the search tool if user_file_ids are provided
override_kwargs = None
if (user_file_ids or user_folder_ids) and tool_name == SearchTool._NAME:
override_kwargs = SearchToolOverrideKwargs(
force_no_rerank=False,
alternate_db_session=None,
retrieved_sections_callback=None,
skip_query_analysis=False,
user_file_ids=user_file_ids,
user_folder_ids=user_folder_ids,
)
if new_msg_req.file_descriptors:
# If user has uploaded files they're using, don't run any of the search tools
return ForceUseTool(force_use=False, tool_name=tool_name)
should_force_search = any(
[
new_msg_req.force_user_file_search,
new_msg_req.retrieval_options
and new_msg_req.retrieval_options.run_search
== OptionalSearchSetting.ALWAYS,
@@ -343,22 +292,15 @@ def _get_force_search_settings(
if should_force_search:
# If we are using selected docs, just put something here so the Tool doesn't need to build its own args via an LLM call
args = {"query": new_msg_req.message} if new_msg_req.search_doc_ids else args
return ForceUseTool(force_use=True, tool_name=tool_name, args=args)
return ForceUseTool(
force_use=True,
tool_name=tool_name,
args=args,
override_kwargs=override_kwargs,
)
return ForceUseTool(
force_use=False, tool_name=tool_name, args=args, override_kwargs=override_kwargs
)
return ForceUseTool(force_use=False, tool_name=tool_name, args=args)
ChatPacket = (
StreamingError
| QADocsResponse
| OnyxContexts
| LLMRelevanceFilterResponse
| FinalUsedContextDocsResponse
| ChatMessageDetail
@@ -372,7 +314,6 @@ ChatPacket = (
| AgenticMessageResponseIDInfo
| StreamStopInfo
| AgentSearchPacket
| UserKnowledgeFilePacket
)
ChatPacketStream = Iterator[ChatPacket]
@@ -418,10 +359,6 @@ def stream_chat_message_objects(
llm: LLM
try:
# Move these variables inside the try block
file_id_to_user_file = {}
ordered_user_files = None
user_id = user.id if user is not None else None
chat_session = get_chat_session_by_id(
@@ -601,70 +538,6 @@ def stream_chat_message_objects(
)
req_file_ids = [f["id"] for f in new_msg_req.file_descriptors]
latest_query_files = [file for file in files if file.file_id in req_file_ids]
user_file_ids = new_msg_req.user_file_ids or []
user_folder_ids = new_msg_req.user_folder_ids or []
if persona.user_files:
for file in persona.user_files:
user_file_ids.append(file.id)
if persona.user_folders:
for folder in persona.user_folders:
user_folder_ids.append(folder.id)
# Initialize flag for user file search
use_search_for_user_files = False
user_files: list[InMemoryChatFile] | None = None
search_for_ordering_only = False
user_file_files: list[UserFile] | None = None
if user_file_ids or user_folder_ids:
# Load user files
user_files = load_all_user_files(
user_file_ids or [],
user_folder_ids or [],
db_session,
)
user_file_files = load_all_user_file_files(
user_file_ids or [],
user_folder_ids or [],
db_session,
)
# Store mapping of file_id to file for later reordering
if user_files:
file_id_to_user_file = {file.file_id: file for file in user_files}
# Calculate token count for the files
from onyx.db.user_documents import calculate_user_files_token_count
from onyx.chat.prompt_builder.citations_prompt import (
compute_max_document_tokens_for_persona,
)
total_tokens = calculate_user_files_token_count(
user_file_ids or [],
user_folder_ids or [],
db_session,
)
# Calculate available tokens for documents based on prompt, user input, etc.
available_tokens = compute_max_document_tokens_for_persona(
db_session=db_session,
persona=persona,
actual_user_input=message_text, # Use the actual user message
)
logger.debug(
f"Total file tokens: {total_tokens}, Available tokens: {available_tokens}"
)
# ALWAYS use search for user files, but track if we need it for context or just ordering
use_search_for_user_files = True
# If files are small enough for context, we'll just use search for ordering
search_for_ordering_only = total_tokens <= available_tokens
if search_for_ordering_only:
# Add original user files to context since they fit
if user_files:
latest_query_files.extend(user_files)
if user_message:
attach_files_to_chat_message(
@@ -693,13 +566,8 @@ def stream_chat_message_objects(
doc_identifiers=identifier_tuples,
document_index=document_index,
)
# Add a maximum context size in the case of user-selected docs to prevent
# slight inaccuracies in context window size pruning from causing
# the entire query to fail
document_pruning_config = DocumentPruningConfig(
is_manually_selected_docs=True,
max_window_percentage=SELECTED_SECTIONS_MAX_WINDOW_PERCENTAGE,
is_manually_selected_docs=True
)
# In case the search doc is deleted, just don't include it
@@ -812,10 +680,8 @@ def stream_chat_message_objects(
prompt_config=prompt_config,
db_session=db_session,
user=user,
user_knowledge_present=bool(user_files or user_folder_ids),
llm=llm,
fast_llm=fast_llm,
use_file_search=new_msg_req.force_user_file_search,
search_tool_config=SearchToolConfig(
answer_style_config=answer_style_config,
document_pruning_config=document_pruning_config,
@@ -845,138 +711,17 @@ def stream_chat_message_objects(
for tool_list in tool_dict.values():
tools.extend(tool_list)
force_use_tool = _get_force_search_settings(
new_msg_req, tools, user_file_ids, user_folder_ids
)
# Set force_use if user files exceed token limit
if use_search_for_user_files:
try:
# Check if search tool is available in the tools list
search_tool_available = any(
isinstance(tool, SearchTool) for tool in tools
)
# If no search tool is available, add one
if not search_tool_available:
logger.info("No search tool available, creating one for user files")
# Create a basic search tool config
search_tool_config = SearchToolConfig(
answer_style_config=answer_style_config,
document_pruning_config=document_pruning_config,
retrieval_options=retrieval_options or RetrievalDetails(),
)
# Create and add the search tool
search_tool = SearchTool(
db_session=db_session,
user=user,
persona=persona,
retrieval_options=search_tool_config.retrieval_options,
prompt_config=prompt_config,
llm=llm,
fast_llm=fast_llm,
pruning_config=search_tool_config.document_pruning_config,
answer_style_config=search_tool_config.answer_style_config,
evaluation_type=(
LLMEvaluationType.BASIC
if persona.llm_relevance_filter
else LLMEvaluationType.SKIP
),
bypass_acl=bypass_acl,
)
# Add the search tool to the tools list
tools.append(search_tool)
logger.info(
"Added search tool for user files that exceed token limit"
)
# Now set force_use_tool.force_use to True
force_use_tool.force_use = True
force_use_tool.tool_name = SearchTool._NAME
# Set query argument if not already set
if not force_use_tool.args:
force_use_tool.args = {"query": final_msg.message}
# Pass the user file IDs to the search tool
if user_file_ids or user_folder_ids:
# Create a BaseFilters object with user_file_ids
if not retrieval_options:
retrieval_options = RetrievalDetails()
if not retrieval_options.filters:
retrieval_options.filters = BaseFilters()
# Set user file and folder IDs in the filters
retrieval_options.filters.user_file_ids = user_file_ids
retrieval_options.filters.user_folder_ids = user_folder_ids
# Create override kwargs for the search tool
override_kwargs = SearchToolOverrideKwargs(
force_no_rerank=search_for_ordering_only, # Skip reranking for ordering-only
alternate_db_session=None,
retrieved_sections_callback=None,
skip_query_analysis=search_for_ordering_only, # Skip query analysis for ordering-only
user_file_ids=user_file_ids,
user_folder_ids=user_folder_ids,
ordering_only=search_for_ordering_only, # Set ordering_only flag for fast path
)
# Set the override kwargs in the force_use_tool
force_use_tool.override_kwargs = override_kwargs
if search_for_ordering_only:
logger.info(
"Fast path: Configured search tool with optimized settings for ordering-only"
)
logger.info(
"Fast path: Skipping reranking and query analysis for ordering-only mode"
)
logger.info(
f"Using {len(user_file_ids or [])} files and {len(user_folder_ids or [])} folders"
)
else:
logger.info(
"Configured search tool to use ",
f"{len(user_file_ids or [])} files and {len(user_folder_ids or [])} folders",
)
except Exception as e:
logger.exception(
f"Error configuring search tool for user files: {str(e)}"
)
use_search_for_user_files = False
# TODO: unify message history with single message history
message_history = [
PreviousMessage.from_chat_message(msg, files) for msg in history_msgs
]
if not use_search_for_user_files and user_files:
yield UserKnowledgeFilePacket(
user_files=[
FileDescriptor(
id=str(file.file_id), type=ChatFileType.USER_KNOWLEDGE
)
for file in user_files
]
)
if search_for_ordering_only:
logger.info(
"Performance: Forcing LLMEvaluationType.SKIP to prevent chunk evaluation for ordering-only search"
)
search_request = SearchRequest(
query=final_msg.message,
evaluation_type=(
LLMEvaluationType.SKIP
if search_for_ordering_only
else (
LLMEvaluationType.BASIC
if persona.llm_relevance_filter
else LLMEvaluationType.SKIP
)
LLMEvaluationType.BASIC
if persona.llm_relevance_filter
else LLMEvaluationType.SKIP
),
human_selected_filters=(
retrieval_options.filters if retrieval_options else None
@@ -995,6 +740,7 @@ def stream_chat_message_objects(
),
)
force_use_tool = _get_force_search_settings(new_msg_req, tools)
prompt_builder = AnswerPromptBuilder(
user_message=default_build_user_message(
user_query=final_msg.message,
@@ -1063,22 +809,8 @@ def stream_chat_message_objects(
info = info_by_subq[
SubQuestionKey(level=level, question_num=level_question_num)
]
# Skip LLM relevance processing entirely for ordering-only mode
if search_for_ordering_only and packet.id == SECTION_RELEVANCE_LIST_ID:
logger.info(
"Fast path: Completely bypassing section relevance processing for ordering-only mode"
)
# Skip this packet entirely since it would trigger LLM processing
continue
# TODO: don't need to dedupe here when we do it in agent flow
if packet.id == SEARCH_RESPONSE_SUMMARY_ID:
if search_for_ordering_only:
logger.info(
"Fast path: Skipping document deduplication for ordering-only mode"
)
(
info.qa_docs_response,
info.reference_db_search_docs,
@@ -1088,91 +820,16 @@ def stream_chat_message_objects(
db_session=db_session,
selected_search_docs=selected_db_search_docs,
# Deduping happens at the last step to avoid harming quality by dropping content early on
# Skip deduping completely for ordering-only mode to save time
dedupe_docs=(
False
if search_for_ordering_only
else (
retrieval_options.dedupe_docs
if retrieval_options
else False
)
retrieval_options.dedupe_docs
if retrieval_options
else False
),
user_files=user_file_files if search_for_ordering_only else [],
loaded_user_files=user_files
if search_for_ordering_only
else [],
)
# If we're using search just for ordering user files
if (
search_for_ordering_only
and user_files
and info.qa_docs_response
):
logger.info(
f"ORDERING: Processing search results for ordering {len(user_files)} user files"
)
import time
ordering_start = time.time()
# Extract document order from search results
doc_order = []
for doc in info.qa_docs_response.top_documents:
doc_id = doc.document_id
if str(doc_id).startswith("USER_FILE_CONNECTOR__"):
file_id = doc_id.replace("USER_FILE_CONNECTOR__", "")
if file_id in file_id_to_user_file:
doc_order.append(file_id)
logger.info(
f"ORDERING: Found {len(doc_order)} files from search results"
)
# Add any files that weren't in search results at the end
missing_files = [
f_id
for f_id in file_id_to_user_file.keys()
if f_id not in doc_order
]
missing_files.extend(doc_order)
doc_order = missing_files
logger.info(
f"ORDERING: Added {len(missing_files)} missing files to the end"
)
# Reorder user files based on search results
ordered_user_files = [
file_id_to_user_file[f_id]
for f_id in doc_order
if f_id in file_id_to_user_file
]
time.time() - ordering_start
yield UserKnowledgeFilePacket(
user_files=[
FileDescriptor(
id=str(file.file_id),
type=ChatFileType.USER_KNOWLEDGE,
)
for file in ordered_user_files
]
)
yield info.qa_docs_response
elif packet.id == SECTION_RELEVANCE_LIST_ID:
relevance_sections = packet.response
if search_for_ordering_only:
logger.info(
"Performance: Skipping relevance filtering for ordering-only mode"
)
continue
if info.reference_db_search_docs is None:
logger.warning(
"No reference docs found for relevance filtering"
@@ -1262,6 +919,8 @@ def stream_chat_message_objects(
response=custom_tool_response.tool_result,
tool_name=custom_tool_response.tool_name,
)
elif packet.id == SEARCH_DOC_CONTENT_ID and include_contexts:
yield cast(OnyxContexts, packet.response)
elif isinstance(packet, StreamStopInfo):
if packet.stop_reason == StreamStopReason.FINISHED:
@@ -1282,7 +941,7 @@ def stream_chat_message_objects(
]
info.tool_result = packet
yield cast(ChatPacket, packet)
logger.debug("Reached end of stream")
except ValueError as e:
logger.exception("Failed to process chat message.")
@@ -1364,16 +1023,10 @@ def stream_chat_message_objects(
error=ERROR_TYPE_CANCELLED if answer.is_cancelled() else None,
tool_call=(
ToolCall(
tool_id=tool_name_to_tool_id.get(info.tool_result.tool_name, 0)
if info.tool_result
else None,
tool_name=info.tool_result.tool_name if info.tool_result else None,
tool_arguments=info.tool_result.tool_args
if info.tool_result
else None,
tool_result=info.tool_result.tool_result
if info.tool_result
else None,
tool_id=tool_name_to_tool_id[info.tool_result.tool_name],
tool_name=info.tool_result.tool_name,
tool_arguments=info.tool_result.tool_args,
tool_result=info.tool_result.tool_result,
)
if info.tool_result
else None
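The ordering-only path earlier in this file maps search hits back to uploaded files by stripping the USER_FILE_CONNECTOR__ prefix from each document id, then reorders the user files to match the search ranking, with the stated intent of putting files the search did not return at the end. A standalone sketch of that mapping (the file type is a simplified stand-in, and the append-at-end behavior follows the comment's intent):

from dataclasses import dataclass


@dataclass
class UserFileStub:
    """Simplified stand-in for the in-memory user file objects used above."""
    file_id: str


def order_user_files_by_search(
    top_document_ids: list[str],
    file_id_to_user_file: dict[str, UserFileStub],
) -> list[UserFileStub]:
    prefix = "USER_FILE_CONNECTOR__"

    # File ids in the order the search ranked them.
    ranked_ids = [
        doc_id[len(prefix):]
        for doc_id in top_document_ids
        if doc_id.startswith(prefix) and doc_id[len(prefix):] in file_id_to_user_file
    ]

    # Files the search did not return go at the end, per the stated intent above.
    missing_ids = [f_id for f_id in file_id_to_user_file if f_id not in ranked_ids]

    return [file_id_to_user_file[f_id] for f_id in ranked_ids + missing_ids]


files = {"a": UserFileStub("a"), "b": UserFileStub("b"), "c": UserFileStub("c")}
hits = ["USER_FILE_CONNECTOR__b", "SOME_OTHER_DOC", "USER_FILE_CONNECTOR__a"]
print([f.file_id for f in order_user_files_by_search(hits, files)])  # ['b', 'a', 'c']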

View File

@@ -19,7 +19,6 @@ def translate_onyx_msg_to_langchain(
# attached. Just ignore them for now.
if not isinstance(msg, ChatMessage):
files = msg.files
content = build_content_with_imgs(
msg.message, files, message_type=msg.message_type, exclude_images=exclude_images
)

View File

@@ -153,8 +153,6 @@ def _apply_pruning(
# remove docs that are explicitly marked as not for QA
sections = _remove_sections_to_ignore(sections=sections)
section_idx_token_count: dict[int, int] = {}
final_section_ind = None
total_tokens = 0
for ind, section in enumerate(sections):
@@ -204,20 +202,10 @@ def _apply_pruning(
section_token_count = DOC_EMBEDDING_CONTEXT_SIZE
total_tokens += section_token_count
section_idx_token_count[ind] = section_token_count
if total_tokens > token_limit:
final_section_ind = ind
break
try:
logger.debug(f"Number of documents after pruning: {ind + 1}")
logger.debug("Number of tokens per document (pruned):")
for x, y in section_idx_token_count.items():
logger.debug(f"{x + 1}: {y}")
except Exception as e:
logger.error(f"Error logging prune statistics: {e}")
if final_section_ind is not None:
if is_manually_selected_docs or use_sections:
if final_section_ind != len(sections) - 1:
@@ -312,14 +300,7 @@ def prune_sections(
)
def _merge_doc_chunks(chunks: list[InferenceChunk]) -> tuple[InferenceSection, int]:
assert (
len(set([chunk.document_id for chunk in chunks])) == 1
), "One distinct document must be passed into merge_doc_chunks"
ADJACENT_CHUNK_SEP = "\n"
DISTANT_CHUNK_SEP = "\n\n...\n\n"
def _merge_doc_chunks(chunks: list[InferenceChunk]) -> InferenceSection:
# Assuming there are no duplicates by this point
sorted_chunks = sorted(chunks, key=lambda x: x.chunk_id)
@@ -327,48 +308,33 @@ def _merge_doc_chunks(chunks: list[InferenceChunk]) -> tuple[InferenceSection, i
chunks, key=lambda x: x.score if x.score is not None else float("-inf")
)
added_chars = 0
merged_content = []
for i, chunk in enumerate(sorted_chunks):
if i > 0:
prev_chunk_id = sorted_chunks[i - 1].chunk_id
sep = (
ADJACENT_CHUNK_SEP
if chunk.chunk_id == prev_chunk_id + 1
else DISTANT_CHUNK_SEP
)
merged_content.append(sep)
added_chars += len(sep)
if chunk.chunk_id == prev_chunk_id + 1:
merged_content.append("\n")
else:
merged_content.append("\n\n...\n\n")
merged_content.append(chunk.content)
combined_content = "".join(merged_content)
return (
InferenceSection(
center_chunk=center_chunk,
chunks=sorted_chunks,
combined_content=combined_content,
),
added_chars,
return InferenceSection(
center_chunk=center_chunk,
chunks=sorted_chunks,
combined_content=combined_content,
)
def _merge_sections(sections: list[InferenceSection]) -> list[InferenceSection]:
docs_map: dict[str, dict[int, InferenceChunk]] = defaultdict(dict)
doc_order: dict[str, int] = {}
combined_section_lengths: dict[str, int] = defaultdict(lambda: 0)
# chunk de-duping and doc ordering
for index, section in enumerate(sections):
if section.center_chunk.document_id not in doc_order:
doc_order[section.center_chunk.document_id] = index
combined_section_lengths[section.center_chunk.document_id] += len(
section.combined_content
)
chunks_map = docs_map[section.center_chunk.document_id]
for chunk in [section.center_chunk] + section.chunks:
chunks_map = docs_map[section.center_chunk.document_id]
existing_chunk = chunks_map.get(chunk.chunk_id)
if (
existing_chunk is None
@@ -379,22 +345,8 @@ def _merge_sections(sections: list[InferenceSection]) -> list[InferenceSection]:
chunks_map[chunk.chunk_id] = chunk
new_sections = []
for doc_id, section_chunks in docs_map.items():
section_chunks_list = list(section_chunks.values())
merged_section, added_chars = _merge_doc_chunks(chunks=section_chunks_list)
previous_length = combined_section_lengths[doc_id] + added_chars
# After merging, ensure the content respects the pruning done earlier. Each
# combined section is restricted to the sum of the lengths of the sections
# from the pruning step. Technically the correct approach would be to prune based
# on tokens AGAIN, but this is a good approximation and worth not adding the
# tokenization overhead. This could also be fixed if we added a way of removing
# chunks from sections in the pruning step; at the moment this issue largely
# exists because we only trim the final section's combined_content.
merged_section.combined_content = merged_section.combined_content[
:previous_length
]
new_sections.append(merged_section)
for section_chunks in docs_map.values():
new_sections.append(_merge_doc_chunks(chunks=list(section_chunks.values())))
# Sort by highest score, then by original document order
# It is now 1 large section per doc, the center chunk being the one with the highest score
@@ -406,26 +358,6 @@ def _merge_sections(sections: list[InferenceSection]) -> list[InferenceSection]:
reverse=True,
)
try:
num_original_sections = len(sections)
num_original_document_ids = len(
set([section.center_chunk.document_id for section in sections])
)
num_merged_sections = len(new_sections)
num_merged_document_ids = len(
set([section.center_chunk.document_id for section in new_sections])
)
logger.debug(
f"Merged {num_original_sections} sections from {num_original_document_ids} documents "
f"into {num_merged_sections} new sections in {num_merged_document_ids} documents"
)
logger.debug("Number of chunks per document (new ranking):")
for x, y in enumerate(new_sections):
logger.debug(f"{x + 1}: {len(y.chunks)}")
except Exception as e:
logger.error(f"Error logging merge statistics: {e}")
return new_sections
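_merge_doc_chunks above joins one document's deduplicated chunks in chunk-id order, separating adjacent chunks with a single newline and non-adjacent chunks with an ellipsis marker. A small standalone illustration of that separator rule, using a simplified stand-in for InferenceChunk:

from dataclasses import dataclass


@dataclass
class ChunkStub:
    """Simplified stand-in for InferenceChunk: just an id and its text."""
    chunk_id: int
    content: str


ADJACENT_CHUNK_SEP = "\n"
DISTANT_CHUNK_SEP = "\n\n...\n\n"


def merge_chunk_contents(chunks: list[ChunkStub]) -> str:
    sorted_chunks = sorted(chunks, key=lambda c: c.chunk_id)
    parts: list[str] = []
    for i, chunk in enumerate(sorted_chunks):
        if i > 0:
            prev_id = sorted_chunks[i - 1].chunk_id
            # Adjacent chunks flow together; a gap gets an ellipsis marker.
            parts.append(ADJACENT_CHUNK_SEP if chunk.chunk_id == prev_id + 1 else DISTANT_CHUNK_SEP)
        parts.append(chunk.content)
    return "".join(parts)


# Chunks 1 and 2 are adjacent; chunk 5 follows a gap.
print(merge_chunk_contents([ChunkStub(1, "A"), ChunkStub(2, "B"), ChunkStub(5, "C")]))
# -> "A\nB\n\n...\n\nC"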

View File

@@ -3,6 +3,7 @@ from collections.abc import Sequence
from pydantic import BaseModel
from onyx.chat.models import LlmDoc
from onyx.chat.models import OnyxContext
from onyx.context.search.models import InferenceChunk
@@ -11,7 +12,7 @@ class DocumentIdOrderMapping(BaseModel):
def map_document_id_order(
chunks: Sequence[InferenceChunk | LlmDoc], one_indexed: bool = True
chunks: Sequence[InferenceChunk | LlmDoc | OnyxContext], one_indexed: bool = True
) -> DocumentIdOrderMapping:
order_mapping = {}
current = 1 if one_indexed else 0

View File

@@ -180,10 +180,6 @@ def get_tool_call_for_non_tool_calling_llm_impl(
if tool_args is None:
raise RuntimeError(f"Tool '{tool.name}' did not return args")
# If we have override_kwargs, add them to the tool_args
if force_use_tool.override_kwargs is not None:
tool_args["override_kwargs"] = force_use_tool.override_kwargs
return (tool, tool_args)
else:
tool_options = check_which_tools_should_run_for_non_tool_calling_llm(

View File

@@ -170,7 +170,7 @@ POSTGRES_USER = os.environ.get("POSTGRES_USER") or "postgres"
POSTGRES_PASSWORD = urllib.parse.quote_plus(
os.environ.get("POSTGRES_PASSWORD") or "password"
)
POSTGRES_HOST = os.environ.get("POSTGRES_HOST") or "127.0.0.1"
POSTGRES_HOST = os.environ.get("POSTGRES_HOST") or "localhost"
POSTGRES_PORT = os.environ.get("POSTGRES_PORT") or "5432"
POSTGRES_DB = os.environ.get("POSTGRES_DB") or "postgres"
AWS_REGION_NAME = os.environ.get("AWS_REGION_NAME") or "us-east-2"
@@ -437,7 +437,7 @@ LINEAR_CLIENT_ID = os.getenv("LINEAR_CLIENT_ID")
LINEAR_CLIENT_SECRET = os.getenv("LINEAR_CLIENT_SECRET")
# Slack specific configs
SLACK_NUM_THREADS = int(os.getenv("SLACK_NUM_THREADS") or 8)
SLACK_NUM_THREADS = int(os.getenv("SLACK_NUM_THREADS") or 2)
DASK_JOB_CLIENT_ENABLED = (
os.environ.get("DASK_JOB_CLIENT_ENABLED", "").lower() == "true"
@@ -495,11 +495,6 @@ NUM_SECONDARY_INDEXING_WORKERS = int(
ENABLE_MULTIPASS_INDEXING = (
os.environ.get("ENABLE_MULTIPASS_INDEXING", "").lower() == "true"
)
# Enable contextual retrieval
ENABLE_CONTEXTUAL_RAG = os.environ.get("ENABLE_CONTEXTUAL_RAG", "").lower() == "true"
DEFAULT_CONTEXTUAL_RAG_LLM_NAME = "gpt-4o-mini"
DEFAULT_CONTEXTUAL_RAG_LLM_PROVIDER = "DevEnvPresetOpenAI"
# Finer grained chunking for more detail retention
# Slightly larger since the sentence aware split is a max cutoff so most minichunks will be under MINI_CHUNK_SIZE
# tokens. But we need it to be at least as big as 1/4th chunk size to avoid having a tiny mini-chunk at the end
@@ -541,17 +536,6 @@ MAX_FILE_SIZE_BYTES = int(
os.environ.get("MAX_FILE_SIZE_BYTES") or 2 * 1024 * 1024 * 1024
) # 2GB in bytes
# Use document summary for contextual rag
USE_DOCUMENT_SUMMARY = os.environ.get("USE_DOCUMENT_SUMMARY", "true").lower() == "true"
# Use chunk summary for contextual rag
USE_CHUNK_SUMMARY = os.environ.get("USE_CHUNK_SUMMARY", "true").lower() == "true"
# Average summary embeddings for contextual rag (not yet implemented)
AVERAGE_SUMMARY_EMBEDDINGS = (
os.environ.get("AVERAGE_SUMMARY_EMBEDDINGS", "false").lower() == "true"
)
MAX_TOKENS_FOR_FULL_INCLUSION = 4096
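A quick note on the convention used throughout this config module: defaults are applied with `or` rather than a `get(..., default)` fallback so that an env var which is set but empty still falls back, and boolean flags are parsed by comparing the lowercased value to "true". A short demonstration (values here are only for illustration):

import os

# Simulate a variable that is set but empty.
os.environ["SLACK_NUM_THREADS"] = ""

# A plain default does not help, because the key exists:
print(repr(os.environ.get("SLACK_NUM_THREADS", "2")))   # ''

# The `or` form used above falls back even for empty values:
print(int(os.environ.get("SLACK_NUM_THREADS") or 2))    # 2

# Boolean flags: anything other than a case-insensitive "true" (including unset) is False.
print(os.environ.get("ENABLE_MULTIPASS_INDEXING", "").lower() == "true")  # False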
#####
# Miscellaneous
#####

View File

@@ -3,7 +3,7 @@ import os
INPUT_PROMPT_YAML = "./onyx/seeding/input_prompts.yaml"
PROMPTS_YAML = "./onyx/seeding/prompts.yaml"
PERSONAS_YAML = "./onyx/seeding/personas.yaml"
USER_FOLDERS_YAML = "./onyx/seeding/user_folders.yaml"
NUM_RETURNED_HITS = 50
# Used for LLM filtering and reranking
# We want this to be approximately the number of results we want to show on the first page
@@ -16,9 +16,6 @@ MAX_CHUNKS_FED_TO_CHAT = float(os.environ.get("MAX_CHUNKS_FED_TO_CHAT") or 10.0)
# ~3k input, half for docs, half for chat history + prompts
CHAT_TARGET_CHUNK_PERCENTAGE = 512 * 3 / 3072
# Maximum percentage of the context window to fill with selected sections
SELECTED_SECTIONS_MAX_WINDOW_PERCENTAGE = 0.8
# 1 / (1 + DOC_TIME_DECAY * doc-age-in-years), set to 0 to have no decay
# Capped in Vespa at 0.5
DOC_TIME_DECAY = float(

View File

@@ -102,8 +102,6 @@ CELERY_GENERIC_BEAT_LOCK_TIMEOUT = 120
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT = 120
CELERY_USER_FILE_FOLDER_SYNC_BEAT_LOCK_TIMEOUT = 120
CELERY_PRIMARY_WORKER_LOCK_TIMEOUT = 120
@@ -271,7 +269,6 @@ class FileOrigin(str, Enum):
CONNECTOR = "connector"
GENERATED_REPORT = "generated_report"
INDEXING_CHECKPOINT = "indexing_checkpoint"
PLAINTEXT_CACHE = "plaintext_cache"
OTHER = "other"
@@ -312,7 +309,6 @@ class OnyxCeleryQueues:
# Indexing queue
CONNECTOR_INDEXING = "connector_indexing"
USER_FILES_INDEXING = "user_files_indexing"
# Monitoring queue
MONITORING = "monitoring"
@@ -331,7 +327,6 @@ class OnyxRedisLocks:
CHECK_CONNECTOR_EXTERNAL_GROUP_SYNC_BEAT_LOCK = (
"da_lock:check_connector_external_group_sync_beat"
)
CHECK_USER_FILE_FOLDER_SYNC_BEAT_LOCK = "da_lock:check_user_file_folder_sync_beat"
MONITOR_BACKGROUND_PROCESSES_LOCK = "da_lock:monitor_background_processes"
CHECK_AVAILABLE_TENANTS_LOCK = "da_lock:check_available_tenants"
PRE_PROVISION_TENANT_LOCK = "da_lock:pre_provision_tenant"
@@ -402,7 +397,6 @@ class OnyxCeleryTask:
# Tenant pre-provisioning
PRE_PROVISION_TENANT = f"{ONYX_CLOUD_CELERY_TASK_PREFIX}_pre_provision_tenant"
UPDATE_USER_FILE_FOLDER_METADATA = "update_user_file_folder_metadata"
CHECK_FOR_CONNECTOR_DELETION = "check_for_connector_deletion_task"
CHECK_FOR_VESPA_SYNC_TASK = "check_for_vespa_sync_task"
@@ -411,7 +405,6 @@ class OnyxCeleryTask:
CHECK_FOR_DOC_PERMISSIONS_SYNC = "check_for_doc_permissions_sync"
CHECK_FOR_EXTERNAL_GROUP_SYNC = "check_for_external_group_sync"
CHECK_FOR_LLM_MODEL_UPDATE = "check_for_llm_model_update"
CHECK_FOR_USER_FILE_FOLDER_SYNC = "check_for_user_file_folder_sync"
# Connector checkpoint cleanup
CHECK_FOR_CHECKPOINT_CLEANUP = "check_for_checkpoint_cleanup"

View File

@@ -13,7 +13,6 @@ from typing import TYPE_CHECKING
from typing import TypeVar
from urllib.parse import parse_qs
from urllib.parse import quote
from urllib.parse import urljoin
from urllib.parse import urlparse
import requests
@@ -343,14 +342,9 @@ def build_confluence_document_id(
Returns:
str: The document id
"""
# NOTE: urljoin is tricky and will drop the last segment of the base if it doesn't
# end with "/" because it believes that makes it a file.
final_url = base_url.rstrip("/") + "/"
if is_cloud and not final_url.endswith("/wiki/"):
final_url = urljoin(final_url, "wiki") + "/"
final_url = urljoin(final_url, content_url.lstrip("/"))
return final_url
if is_cloud and not base_url.endswith("/wiki"):
base_url += "/wiki"
return f"{base_url}{content_url}"
def datetime_from_string(datetime_string: str) -> datetime:
@@ -460,19 +454,6 @@ def _handle_http_error(e: requests.HTTPError, attempt: int) -> int:
logger.warning("HTTPError with `None` as response or as headers")
raise e
# Confluence Server returns 403 when rate limited
if e.response.status_code == 403:
FORBIDDEN_MAX_RETRY_ATTEMPTS = 7
FORBIDDEN_RETRY_DELAY = 10
if attempt < FORBIDDEN_MAX_RETRY_ATTEMPTS:
logger.warning(
"403 error. This sometimes happens when we hit "
f"Confluence rate limits. Retrying in {FORBIDDEN_RETRY_DELAY} seconds..."
)
return FORBIDDEN_RETRY_DELAY
raise e
if (
e.response.status_code != 429
and RATE_LIMIT_MESSAGE_LOWERCASE not in e.response.text.lower()

View File

@@ -45,8 +45,6 @@ _FIREFLIES_API_QUERY = """
}
"""
ONE_MINUTE = 60
def _create_doc_from_transcript(transcript: dict) -> Document | None:
sections: List[TextSection] = []
@@ -108,8 +106,6 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None:
)
# If not all transcripts are being indexed, try using a more-recently-generated
# API key.
class FirefliesConnector(PollConnector, LoadConnector):
def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None:
self.batch_size = batch_size
@@ -195,9 +191,6 @@ class FirefliesConnector(PollConnector, LoadConnector):
def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
# add some leeway to account for any timezone funkiness and/or bad handling
# of start time on the Fireflies side
start = max(0, start - ONE_MINUTE)
start_datetime = datetime.fromtimestamp(start, tz=timezone.utc).strftime(
"%Y-%m-%dT%H:%M:%S.000Z"
)

View File

@@ -276,26 +276,7 @@ class GithubConnector(CheckpointConnector[GithubConnectorCheckpoint]):
return checkpoint
assert checkpoint.cached_repo is not None, "No repo saved in checkpoint"
# Try to access the requester - different PyGithub versions may use different attribute names
try:
# Try direct access to a known attribute name first
if hasattr(self.github_client, "_requester"):
requester = self.github_client._requester
elif hasattr(self.github_client, "_Github__requester"):
requester = self.github_client._Github__requester
else:
# If we can't find the requester attribute, we need to fall back to recreating the repo
raise AttributeError("Could not find requester attribute")
repo = checkpoint.cached_repo.to_Repository(requester)
except Exception as e:
# If all else fails, re-fetch the repo directly
logger.warning(
f"Failed to deserialize repository: {e}. Attempting to re-fetch."
)
repo_id = checkpoint.cached_repo.id
repo = self.github_client.get_repo(repo_id)
repo = checkpoint.cached_repo.to_Repository(self.github_client.requester)
if self.include_prs and checkpoint.stage == GithubConnectorStage.PRS:
logger.info(f"Fetching PRs for repo: {repo.name}")

View File

@@ -1,5 +1,4 @@
import base64
import time
from collections.abc import Generator
from datetime import datetime
from datetime import timedelta
@@ -8,8 +7,6 @@ from typing import Any
from typing import cast
import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from onyx.configs.app_configs import GONG_CONNECTOR_START_TIME
@@ -24,14 +21,13 @@ from onyx.connectors.models import Document
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
logger = setup_logger()
GONG_BASE_URL = "https://us-34014.api.gong.io"
class GongConnector(LoadConnector, PollConnector):
BASE_URL = "https://api.gong.io"
MAX_CALL_DETAILS_ATTEMPTS = 6
CALL_DETAILS_DELAY = 30 # in seconds
def __init__(
self,
workspaces: list[str] | None = None,
@@ -45,23 +41,15 @@ class GongConnector(LoadConnector, PollConnector):
self.auth_token_basic: str | None = None
self.hide_user_info = hide_user_info
retry_strategy = Retry(
total=5,
backoff_factor=2,
status_forcelist=[429, 500, 502, 503, 504],
)
def _get_auth_header(self) -> dict[str, str]:
if self.auth_token_basic is None:
raise ConnectorMissingCredentialError("Gong")
session = requests.Session()
session.mount(GongConnector.BASE_URL, HTTPAdapter(max_retries=retry_strategy))
self._session = session
@staticmethod
def make_url(endpoint: str) -> str:
url = f"{GongConnector.BASE_URL}{endpoint}"
return url
return {"Authorization": f"Basic {self.auth_token_basic}"}
def _get_workspace_id_map(self) -> dict[str, str]:
response = self._session.get(GongConnector.make_url("/v2/workspaces"))
url = f"{GONG_BASE_URL}/v2/workspaces"
response = requests.get(url, headers=self._get_auth_header())
response.raise_for_status()
workspaces_details = response.json().get("workspaces")
@@ -78,6 +66,7 @@ class GongConnector(LoadConnector, PollConnector):
def _get_transcript_batches(
self, start_datetime: str | None = None, end_datetime: str | None = None
) -> Generator[list[dict[str, Any]], None, None]:
url = f"{GONG_BASE_URL}/v2/calls/transcript"
body: dict[str, dict] = {"filter": {}}
if start_datetime:
body["filter"]["fromDateTime"] = start_datetime
@@ -105,8 +94,8 @@ class GongConnector(LoadConnector, PollConnector):
del body["filter"]["workspaceId"]
while True:
response = self._session.post(
GongConnector.make_url("/v2/calls/transcript"), json=body
response = requests.post(
url, headers=self._get_auth_header(), json=body
)
# If no calls in the range, just break out
if response.status_code == 404:
@@ -136,14 +125,14 @@ class GongConnector(LoadConnector, PollConnector):
yield transcripts
def _get_call_details_by_ids(self, call_ids: list[str]) -> dict:
url = f"{GONG_BASE_URL}/v2/calls/extensive"
body = {
"filter": {"callIds": call_ids},
"contentSelector": {"exposedFields": {"parties": True}},
}
response = self._session.post(
GongConnector.make_url("/v2/calls/extensive"), json=body
)
response = requests.post(url, headers=self._get_auth_header(), json=body)
response.raise_for_status()
calls = response.json().get("calls")
@@ -176,74 +165,24 @@ class GongConnector(LoadConnector, PollConnector):
def _fetch_calls(
self, start_datetime: str | None = None, end_datetime: str | None = None
) -> GenerateDocumentsOutput:
num_calls = 0
for transcript_batch in self._get_transcript_batches(
start_datetime, end_datetime
):
doc_batch: list[Document] = []
transcript_call_ids = cast(
call_ids = cast(
list[str],
[t.get("callId") for t in transcript_batch if t.get("callId")],
)
call_details_map = self._get_call_details_by_ids(call_ids)
call_details_map: dict[str, Any] = {}
# There's a likely race condition in the API where a transcript will have a
# call id but the call to v2/calls/extensive will not return all of the IDs;
# retrying with exponential backoff has been observed to mitigate this
# in ~2 minutes
current_attempt = 0
while True:
current_attempt += 1
call_details_map = self._get_call_details_by_ids(transcript_call_ids)
if set(transcript_call_ids) == set(call_details_map.keys()):
# we got all the id's we were expecting ... break and continue
break
# we are missing some IDs. Log and retry with exponential backoff
missing_call_ids = set(transcript_call_ids) - set(
call_details_map.keys()
)
logger.warning(
f"_get_call_details_by_ids is missing call id's: "
f"current_attempt={current_attempt} "
f"missing_call_ids={missing_call_ids}"
)
if current_attempt >= self.MAX_CALL_DETAILS_ATTEMPTS:
raise RuntimeError(
f"Attempt count exceeded for _get_call_details_by_ids: "
f"missing_call_ids={missing_call_ids} "
f"max_attempts={self.MAX_CALL_DETAILS_ATTEMPTS}"
)
wait_seconds = self.CALL_DETAILS_DELAY * pow(2, current_attempt - 1)
logger.warning(
f"_get_call_details_by_ids waiting to retry: "
f"wait={wait_seconds}s "
f"current_attempt={current_attempt} "
f"next_attempt={current_attempt+1} "
f"max_attempts={self.MAX_CALL_DETAILS_ATTEMPTS}"
)
time.sleep(wait_seconds)
# now we can iterate per call/transcript
for transcript in transcript_batch:
call_id = transcript.get("callId")
if not call_id or call_id not in call_details_map:
# NOTE(rkuo): seeing odd behavior where call_ids from the transcript
# don't have call details. adding error debugging logs to trace.
logger.error(
f"Couldn't get call information for Call ID: {call_id}"
)
if call_id:
logger.error(
f"Call debug info: call_id={call_id} "
f"call_ids={transcript_call_ids} "
f"call_details_map={call_details_map.keys()}"
)
if not self.continue_on_fail:
raise RuntimeError(
f"Couldn't get call information for Call ID: {call_id}"
@@ -256,8 +195,7 @@ class GongConnector(LoadConnector, PollConnector):
call_time_str = call_metadata["started"]
call_title = call_metadata["title"]
logger.info(
f"{num_calls+1}: Indexing Gong call id {call_id} "
f"from {call_time_str.split('T', 1)[0]}: {call_title}"
f"Indexing Gong call from {call_time_str.split('T', 1)[0]}: {call_title}"
)
call_parties = cast(list[dict] | None, call_details.get("parties"))
@@ -316,13 +254,8 @@ class GongConnector(LoadConnector, PollConnector):
metadata={"client": call_metadata.get("system")},
)
)
num_calls += 1
yield doc_batch
logger.info(f"_fetch_calls finished: num_calls={num_calls}")
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
combined = (
f'{credentials["gong_access_key"]}:{credentials["gong_access_key_secret"]}'
@@ -330,13 +263,6 @@ class GongConnector(LoadConnector, PollConnector):
self.auth_token_basic = base64.b64encode(combined.encode("utf-8")).decode(
"utf-8"
)
if self.auth_token_basic is None:
raise ConnectorMissingCredentialError("Gong")
self._session.headers.update(
{"Authorization": f"Basic {self.auth_token_basic}"}
)
return None
def load_from_state(self) -> GenerateDocumentsOutput:
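The session setup earlier in this file mounts an HTTPAdapter with a urllib3 Retry policy so every call under the Gong base URL automatically retries 429s and 5xx responses with exponential backoff. A minimal sketch of that wiring with a placeholder base URL:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

BASE_URL = "https://api.example.com"  # placeholder; the connector uses its own Gong base URL

retry_strategy = Retry(
    total=5,                                     # retry up to 5 times
    backoff_factor=2,                            # exponential backoff between attempts
    status_forcelist=[429, 500, 502, 503, 504],
)

session = requests.Session()
# Mounting on the base URL applies the retrying adapter to every request under it.
session.mount(BASE_URL, HTTPAdapter(max_retries=retry_strategy))

# Requests made through `session` now retry transient failures automatically, e.g.:
# session.get(f"{BASE_URL}/v2/workspaces", headers={"Authorization": "Basic ..."})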

View File

@@ -28,9 +28,7 @@ from onyx.connectors.google_drive.doc_conversion import (
)
from onyx.connectors.google_drive.file_retrieval import crawl_folders_for_files
from onyx.connectors.google_drive.file_retrieval import get_all_files_for_oauth
from onyx.connectors.google_drive.file_retrieval import (
get_all_files_in_my_drive_and_shared,
)
from onyx.connectors.google_drive.file_retrieval import get_all_files_in_my_drive
from onyx.connectors.google_drive.file_retrieval import get_files_in_shared_drive
from onyx.connectors.google_drive.file_retrieval import get_root_folder_id
from onyx.connectors.google_drive.models import DriveRetrievalStage
@@ -88,18 +86,13 @@ def _extract_ids_from_urls(urls: list[str]) -> list[str]:
def _convert_single_file(
creds: Any,
primary_admin_email: str,
allow_images: bool,
size_threshold: int,
retriever_email: str,
file: dict[str, Any],
) -> Document | ConnectorFailure | None:
# We used to always get the user email from the file owners when available,
# but this was causing issues with shared folders where the owner was not included in the service account
# now we use the email of the account that successfully listed the file. Leaving this in case we end up
# wanting to retry with file owners and/or admin email at some point.
# user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email
user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email
user_email = retriever_email
# Only construct these services when needed
user_drive_service = lazy_eval(
lambda: get_drive_service(creds, user_email=user_email)
@@ -445,9 +438,6 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
logger.warning(
f"User '{user_email}' does not have access to the drive APIs."
)
# mark this user as done so we don't try to retrieve anything for them
# again
curr_stage.stage = DriveRetrievalStage.DONE
return
raise
@@ -460,11 +450,10 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
logger.info(f"Getting all files in my drive as '{user_email}'")
yield from add_retrieval_info(
get_all_files_in_my_drive_and_shared(
get_all_files_in_my_drive(
service=drive_service,
update_traversed_ids_func=self._update_traversed_parent_ids,
is_slim=is_slim,
include_shared_with_me=self.include_files_shared_with_me,
start=curr_stage.completed_until if resuming else start,
end=end,
),
@@ -584,25 +573,6 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
drive_ids_to_retrieve, checkpoint
)
# only process emails that we haven't already completed retrieval for
non_completed_org_emails = [
user_email
for user_email, stage in checkpoint.completion_map.items()
if stage != DriveRetrievalStage.DONE
]
# don't process too many emails before returning a checkpoint. This is
# to resolve the case where there are a ton of emails that don't have access
# to the drive APIs. Without this, we could loop through these emails for
# more than 3 hours, causing a timeout and stalling progress.
email_batch_takes_us_to_completion = True
MAX_EMAILS_TO_PROCESS_BEFORE_CHECKPOINTING = 50
if len(non_completed_org_emails) > MAX_EMAILS_TO_PROCESS_BEFORE_CHECKPOINTING:
non_completed_org_emails = non_completed_org_emails[
:MAX_EMAILS_TO_PROCESS_BEFORE_CHECKPOINTING
]
email_batch_takes_us_to_completion = False
user_retrieval_gens = [
self._impersonate_user_for_retrieval(
email,
@@ -613,14 +583,10 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
start,
end,
)
for email in non_completed_org_emails
for email in all_org_emails
]
yield from parallel_yield(user_retrieval_gens, max_workers=MAX_DRIVE_WORKERS)
# if there are more emails to process, don't mark as complete
if not email_batch_takes_us_to_completion:
return
remaining_folders = (
drive_ids_to_retrieve | folder_ids_to_retrieve
) - self._retrieved_ids
@@ -950,28 +916,20 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
convert_func = partial(
_convert_single_file,
self.creds,
self.primary_admin_email,
self.allow_images,
self.size_threshold,
)
# Fetch files in batches
batches_complete = 0
files_batch: list[RetrievedDriveFile] = []
files_batch: list[GoogleDriveFileType] = []
def _yield_batch(
files_batch: list[RetrievedDriveFile],
files_batch: list[GoogleDriveFileType],
) -> Iterator[Document | ConnectorFailure]:
nonlocal batches_complete
# Process the batch using run_functions_tuples_in_parallel
func_with_args = [
(
convert_func,
(
file.user_email,
file.drive_file,
),
)
for file in files_batch
]
func_with_args = [(convert_func, (file,)) for file in files_batch]
results = cast(
list[Document | ConnectorFailure | None],
run_functions_tuples_in_parallel(func_with_args, max_workers=8),
@@ -1009,7 +967,7 @@ class GoogleDriveConnector(SlimConnector, CheckpointConnector[GoogleDriveCheckpo
)
continue
files_batch.append(retrieved_file)
files_batch.append(retrieved_file.drive_file)
if len(files_batch) < self.batch_size:
continue


@@ -30,7 +30,6 @@ from onyx.file_processing.file_validation import is_valid_image_type
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.llm.interfaces import LLM
from onyx.utils.lazy import lazy_eval
from onyx.utils.logger import setup_logger
logger = setup_logger()
@@ -77,26 +76,6 @@ def is_gdrive_image_mime_type(mime_type: str) -> bool:
return is_valid_image_type(mime_type)
def download_request(service: GoogleDriveService, file_id: str) -> bytes:
"""
Download the file from Google Drive.
"""
# Download the raw bytes of the file via the Drive files().get_media endpoint
request = service.files().get_media(fileId=file_id)
response_bytes = io.BytesIO()
downloader = MediaIoBaseDownload(response_bytes, request)
done = False
while not done:
_, done = downloader.next_chunk()
response = response_bytes.getvalue()
if not response:
logger.warning(f"Failed to download {file_id}")
return bytes()
return response
def _download_and_extract_sections_basic(
file: dict[str, str],
service: GoogleDriveService,
@@ -108,17 +87,35 @@ def _download_and_extract_sections_basic(
mime_type = file["mimeType"]
link = file.get("webViewLink", "")
# skip images if not explicitly enabled
if not allow_images and is_gdrive_image_mime_type(mime_type):
return []
try:
# skip images if not explicitly enabled
if not allow_images and is_gdrive_image_mime_type(mime_type):
return []
# For Google Docs, Sheets, and Slides, export as plain text
if mime_type in GOOGLE_MIME_TYPES_TO_EXPORT:
export_mime_type = GOOGLE_MIME_TYPES_TO_EXPORT[mime_type]
# Use the correct API call for exporting files
request = service.files().export_media(
fileId=file_id, mimeType=export_mime_type
)
# For Google Docs, Sheets, and Slides, export as plain text
if mime_type in GOOGLE_MIME_TYPES_TO_EXPORT:
export_mime_type = GOOGLE_MIME_TYPES_TO_EXPORT[mime_type]
# Use the correct API call for exporting files
request = service.files().export_media(
fileId=file_id, mimeType=export_mime_type
)
response_bytes = io.BytesIO()
downloader = MediaIoBaseDownload(response_bytes, request)
done = False
while not done:
_, done = downloader.next_chunk()
response = response_bytes.getvalue()
if not response:
logger.warning(f"Failed to export {file_name} as {export_mime_type}")
return []
text = response.decode("utf-8")
return [TextSection(link=link, text=text)]
# For other file types, download the file
# Use the correct API call for downloading files
request = service.files().get_media(fileId=file_id)
response_bytes = io.BytesIO()
downloader = MediaIoBaseDownload(response_bytes, request)
done = False
@@ -127,97 +124,88 @@ def _download_and_extract_sections_basic(
response = response_bytes.getvalue()
if not response:
logger.warning(f"Failed to export {file_name} as {export_mime_type}")
logger.warning(f"Failed to download {file_name}")
return []
text = response.decode("utf-8")
return [TextSection(link=link, text=text)]
# Process based on mime type
if mime_type == "text/plain":
text = response.decode("utf-8")
return [TextSection(link=link, text=text)]
# For other file types, download the file
# Use the correct API call for downloading files
response_call = lazy_eval(lambda: download_request(service, file_id))
elif (
mime_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
text, _ = docx_to_text_and_images(io.BytesIO(response))
return [TextSection(link=link, text=text)]
# Process based on mime type
if mime_type == "text/plain":
text = response_call().decode("utf-8")
return [TextSection(link=link, text=text)]
elif (
mime_type
== "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
):
text = xlsx_to_text(io.BytesIO(response))
return [TextSection(link=link, text=text)]
elif (
mime_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
text, _ = docx_to_text_and_images(io.BytesIO(response_call()))
return [TextSection(link=link, text=text)]
elif (
mime_type
== "application/vnd.openxmlformats-officedocument.presentationml.presentation"
):
text = pptx_to_text(io.BytesIO(response))
return [TextSection(link=link, text=text)]
elif (
mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
):
text = xlsx_to_text(io.BytesIO(response_call()))
return [TextSection(link=link, text=text)]
elif (
mime_type
== "application/vnd.openxmlformats-officedocument.presentationml.presentation"
):
text = pptx_to_text(io.BytesIO(response_call()))
return [TextSection(link=link, text=text)]
elif is_gdrive_image_mime_type(mime_type):
# For images, store them for later processing
sections: list[TextSection | ImageSection] = []
try:
with get_session_with_current_tenant() as db_session:
section, embedded_id = store_image_and_create_section(
db_session=db_session,
image_data=response_call(),
file_name=file_id,
display_name=file_name,
media_type=mime_type,
file_origin=FileOrigin.CONNECTOR,
link=link,
)
sections.append(section)
except Exception as e:
logger.error(f"Failed to process image {file_name}: {e}")
return sections
elif mime_type == "application/pdf":
text, _pdf_meta, images = read_pdf_file(io.BytesIO(response_call()))
pdf_sections: list[TextSection | ImageSection] = [
TextSection(link=link, text=text)
]
# Process embedded images in the PDF
try:
with get_session_with_current_tenant() as db_session:
for idx, (img_data, img_name) in enumerate(images):
elif is_gdrive_image_mime_type(mime_type):
# For images, store them for later processing
sections: list[TextSection | ImageSection] = []
try:
with get_session_with_current_tenant() as db_session:
section, embedded_id = store_image_and_create_section(
db_session=db_session,
image_data=img_data,
file_name=f"{file_id}_img_{idx}",
display_name=img_name or f"{file_name} - image {idx}",
image_data=response,
file_name=file_id,
display_name=file_name,
media_type=mime_type,
file_origin=FileOrigin.CONNECTOR,
link=link,
)
pdf_sections.append(section)
except Exception as e:
logger.error(f"Failed to process PDF images in {file_name}: {e}")
return pdf_sections
sections.append(section)
except Exception as e:
logger.error(f"Failed to process image {file_name}: {e}")
return sections
else:
# For unsupported file types, try to extract text
if mime_type in [
"application/vnd.google-apps.video",
"application/vnd.google-apps.audio",
"application/zip",
]:
return []
# For unsupported file types, try to extract text
try:
text = extract_file_text(io.BytesIO(response_call()), file_name)
return [TextSection(link=link, text=text)]
except Exception as e:
logger.warning(f"Failed to extract text from {file_name}: {e}")
return []
elif mime_type == "application/pdf":
text, _pdf_meta, images = read_pdf_file(io.BytesIO(response))
pdf_sections: list[TextSection | ImageSection] = [
TextSection(link=link, text=text)
]
# Process embedded images in the PDF
try:
with get_session_with_current_tenant() as db_session:
for idx, (img_data, img_name) in enumerate(images):
section, embedded_id = store_image_and_create_section(
db_session=db_session,
image_data=img_data,
file_name=f"{file_id}_img_{idx}",
display_name=img_name or f"{file_name} - image {idx}",
file_origin=FileOrigin.CONNECTOR,
)
pdf_sections.append(section)
except Exception as e:
logger.error(f"Failed to process PDF images in {file_name}: {e}")
return pdf_sections
else:
# For unsupported file types, try to extract text
try:
text = extract_file_text(io.BytesIO(response), file_name)
return [TextSection(link=link, text=text)]
except Exception as e:
logger.warning(f"Failed to extract text from {file_name}: {e}")
return []
except Exception as e:
logger.error(f"Error processing file {file_name}: {e}")
return []
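# The lazy_eval helper used above (imported from onyx.utils.lazy) presumably wraps a zero-arg
# callable so the Drive download runs only when a branch first needs the bytes, and at most once.
# A minimal sketch under that assumption, not the library's actual implementation:
from typing import Callable, TypeVar

T = TypeVar("T")

def lazy_eval(func: Callable[[], T]) -> Callable[[], T]:
    # Defer the wrapped call until first use and cache the result for later calls.
    cache: dict[str, T] = {}

    def wrapper() -> T:
        if "value" not in cache:
            cache["value"] = func()
        return cache["value"]

    return wrapper

# Usage mirroring the code above:
#   response_call = lazy_eval(lambda: download_request(service, file_id))
#   text = response_call().decode("utf-8")  # the download happens here, once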
def convert_drive_item_to_document(


@@ -214,11 +214,10 @@ def get_files_in_shared_drive(
yield file
def get_all_files_in_my_drive_and_shared(
def get_all_files_in_my_drive(
service: GoogleDriveService,
update_traversed_ids_func: Callable,
is_slim: bool,
include_shared_with_me: bool,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[GoogleDriveFileType]:
@@ -230,8 +229,7 @@ def get_all_files_in_my_drive_and_shared(
# Get all folders being queried and add them to the traversed set
folder_query = f"mimeType = '{DRIVE_FOLDER_TYPE}'"
folder_query += " and trashed = false"
if not include_shared_with_me:
folder_query += " and 'me' in owners"
folder_query += " and 'me' in owners"
found_folders = False
for file in execute_paginated_retrieval(
retrieval_function=service.files().list,
@@ -248,8 +246,7 @@ def get_all_files_in_my_drive_and_shared(
# Then get the files
file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'"
file_query += " and trashed = false"
if not include_shared_with_me:
file_query += " and 'me' in owners"
file_query += " and 'me' in owners"
file_query += _generate_time_range_filter(start, end)
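# For illustration (hedged): with the folder/file filters above plus a time range, the query
# handed to service.files().list ends up looking roughly like the following; the exact
# modifiedTime clause depends on _generate_time_range_filter and is assumed here:
#   "mimeType != 'application/vnd.google-apps.folder' and trashed = false and 'me' in owners"
#   " and modifiedTime >= '2024-01-01T00:00:00' and modifiedTime <= '2024-02-01T00:00:00'"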
yield from execute_paginated_retrieval(
retrieval_function=service.files().list,


@@ -75,7 +75,7 @@ class HighspotClient:
self.key = key
self.secret = secret
self.base_url = base_url.rstrip("/") + "/"
self.base_url = base_url
self.timeout = timeout
# Set up session with retry logic


@@ -20,8 +20,7 @@ from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import ACCEPTED_DOCUMENT_FILE_EXTENSIONS
from onyx.file_processing.extract_file_text import ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS
from onyx.file_processing.extract_file_text import ALL_ACCEPTED_FILE_EXTENSIONS
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
@@ -85,21 +84,14 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
Populate the spot ID map with all available spots.
Keys are stored as lowercase for case-insensitive lookups.
"""
try:
spots = self.client.get_spots()
for spot in spots:
if "title" in spot and "id" in spot:
spot_name = spot["title"]
self._spot_id_map[spot_name.lower()] = spot["id"]
spots = self.client.get_spots()
for spot in spots:
if "title" in spot and "id" in spot:
spot_name = spot["title"]
self._spot_id_map[spot_name.lower()] = spot["id"]
self._all_spots_fetched = True
logger.info(f"Retrieved {len(self._spot_id_map)} spots from Highspot")
except HighspotClientError as e:
logger.error(f"Error retrieving spots from Highspot: {str(e)}")
raise
except Exception as e:
logger.error(f"Unexpected error retrieving spots from Highspot: {str(e)}")
raise
self._all_spots_fetched = True
logger.info(f"Retrieved {len(self._spot_id_map)} spots from Highspot")
def _get_all_spot_names(self) -> List[str]:
"""
@@ -159,142 +151,116 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
Batches of Document objects
"""
doc_batch: list[Document] = []
try:
# If no spots specified, get all spots
spot_names_to_process = self.spot_names
if not spot_names_to_process:
spot_names_to_process = self._get_all_spot_names()
if not spot_names_to_process:
logger.warning("No spots found in Highspot")
raise ValueError("No spots found in Highspot")
logger.info(
f"No spots specified, using all {len(spot_names_to_process)} available spots"
)
for spot_name in spot_names_to_process:
try:
spot_id = self._get_spot_id_from_name(spot_name)
if spot_id is None:
logger.warning(f"Spot ID not found for spot {spot_name}")
continue
offset = 0
has_more = True
# If no spots specified, get all spots
spot_names_to_process = self.spot_names
if not spot_names_to_process:
spot_names_to_process = self._get_all_spot_names()
logger.info(
f"No spots specified, using all {len(spot_names_to_process)} available spots"
)
while has_more:
logger.info(
f"Retrieving items from spot {spot_name}, offset {offset}"
)
response = self.client.get_spot_items(
spot_id=spot_id, offset=offset, page_size=self.batch_size
)
items = response.get("collection", [])
logger.info(f"Received Items: {items}")
if not items:
has_more = False
continue
for spot_name in spot_names_to_process:
try:
spot_id = self._get_spot_id_from_name(spot_name)
if spot_id is None:
logger.warning(f"Spot ID not found for spot {spot_name}")
continue
offset = 0
has_more = True
for item in items:
try:
item_id = item.get("id")
if not item_id:
logger.warning("Item without ID found, skipping")
continue
item_details = self.client.get_item(item_id)
if not item_details:
logger.warning(
f"Item {item_id} details not found, skipping"
)
continue
# Apply time filter if specified
if start or end:
updated_at = item_details.get("date_updated")
if updated_at:
# Convert to datetime for comparison
try:
updated_time = datetime.fromisoformat(
updated_at.replace("Z", "+00:00")
)
if (
start
and updated_time.timestamp() < start
) or (
end and updated_time.timestamp() > end
):
continue
except (ValueError, TypeError):
# Skip if date cannot be parsed
logger.warning(
f"Invalid date format for item {item_id}: {updated_at}"
)
continue
content = self._get_item_content(item_details)
title = item_details.get("title", "")
doc_batch.append(
Document(
id=f"HIGHSPOT_{item_id}",
sections=[
TextSection(
link=item_details.get(
"url",
f"https://www.highspot.com/items/{item_id}",
),
text=content,
)
],
source=DocumentSource.HIGHSPOT,
semantic_identifier=title,
metadata={
"spot_name": spot_name,
"type": item_details.get(
"content_type", ""
),
"created_at": item_details.get(
"date_added", ""
),
"author": item_details.get("author", ""),
"language": item_details.get(
"language", ""
),
"can_download": str(
item_details.get("can_download", False)
),
},
doc_updated_at=item_details.get("date_updated"),
)
)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
except HighspotClientError as e:
item_id = "ID" if not item_id else item_id
logger.error(
f"Error retrieving item {item_id}: {str(e)}"
)
except Exception as e:
item_id = "ID" if not item_id else item_id
logger.error(
f"Unexpected error for item {item_id}: {str(e)}"
)
has_more = len(items) >= self.batch_size
offset += self.batch_size
except (HighspotClientError, ValueError) as e:
logger.error(f"Error processing spot {spot_name}: {str(e)}")
except Exception as e:
logger.error(
f"Unexpected error processing spot {spot_name}: {str(e)}"
while has_more:
logger.info(
f"Retrieving items from spot {spot_name}, offset {offset}"
)
response = self.client.get_spot_items(
spot_id=spot_id, offset=offset, page_size=self.batch_size
)
items = response.get("collection", [])
logger.info(f"Received Items: {items}")
if not items:
has_more = False
continue
except Exception as e:
logger.error(f"Error in Highspot connector: {str(e)}")
raise
for item in items:
try:
item_id = item.get("id")
if not item_id:
logger.warning("Item without ID found, skipping")
continue
item_details = self.client.get_item(item_id)
if not item_details:
logger.warning(
f"Item {item_id} details not found, skipping"
)
continue
# Apply time filter if specified
if start or end:
updated_at = item_details.get("date_updated")
if updated_at:
# Convert to datetime for comparison
try:
updated_time = datetime.fromisoformat(
updated_at.replace("Z", "+00:00")
)
if (
start and updated_time.timestamp() < start
) or (end and updated_time.timestamp() > end):
continue
except (ValueError, TypeError):
# Skip if date cannot be parsed
logger.warning(
f"Invalid date format for item {item_id}: {updated_at}"
)
continue
content = self._get_item_content(item_details)
title = item_details.get("title", "")
doc_batch.append(
Document(
id=f"HIGHSPOT_{item_id}",
sections=[
TextSection(
link=item_details.get(
"url",
f"https://www.highspot.com/items/{item_id}",
),
text=content,
)
],
source=DocumentSource.HIGHSPOT,
semantic_identifier=title,
metadata={
"spot_name": spot_name,
"type": item_details.get("content_type", ""),
"created_at": item_details.get(
"date_added", ""
),
"author": item_details.get("author", ""),
"language": item_details.get("language", ""),
"can_download": str(
item_details.get("can_download", False)
),
},
doc_updated_at=item_details.get("date_updated"),
)
)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
except HighspotClientError as e:
item_id = "ID" if not item_id else item_id
logger.error(f"Error retrieving item {item_id}: {str(e)}")
has_more = len(items) >= self.batch_size
offset += self.batch_size
except (HighspotClientError, ValueError) as e:
logger.error(f"Error processing spot {spot_name}: {str(e)}")
if doc_batch:
yield doc_batch
@@ -320,9 +286,7 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
# Extract title and description once at the beginning
title, description = self._extract_title_and_description(item_details)
default_content = f"{title}\n{description}"
logger.info(
f"Processing item {item_id} with extension {file_extension} and file name {content_name}"
)
logger.info(f"Processing item {item_id} with extension {file_extension}")
try:
if content_type == "WebLink":
@@ -334,39 +298,30 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
elif (
is_valid_format
and (
file_extension in ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS
or file_extension in ACCEPTED_DOCUMENT_FILE_EXTENSIONS
)
and file_extension in ALL_ACCEPTED_FILE_EXTENSIONS
and can_download
):
# For documents, try to get the text content
if not item_id: # Ensure item_id is defined
return default_content
content_response = self.client.get_item_content(item_id)
# Process and extract text from binary content based on type
if content_response:
text_content = extract_file_text(
BytesIO(content_response), content_name, False
BytesIO(content_response), content_name
)
return text_content if text_content else default_content
return text_content
return default_content
else:
return default_content
except HighspotClientError as e:
error_context = f"item {item_id}" if item_id else "(item id not found)"
# Use item_id safely in the warning message
error_context = f"item {item_id}" if item_id else "item"
logger.warning(f"Could not retrieve content for {error_context}: {str(e)}")
return default_content
except ValueError as e:
error_context = f"item {item_id}" if item_id else "(item id not found)"
logger.error(f"Value error for {error_context}: {str(e)}")
return default_content
except Exception as e:
error_context = f"item {item_id}" if item_id else "(item id not found)"
logger.error(
f"Unexpected error retrieving content for {error_context}: {str(e)}"
)
return default_content
return ""
def _extract_title_and_description(
self, item_details: Dict[str, Any]
@@ -403,63 +358,55 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
Batches of SlimDocument objects
"""
slim_doc_batch: list[SlimDocument] = []
try:
# If no spots specified, get all spots
spot_names_to_process = self.spot_names
if not spot_names_to_process:
spot_names_to_process = self._get_all_spot_names()
if not spot_names_to_process:
logger.warning("No spots found in Highspot")
raise ValueError("No spots found in Highspot")
logger.info(
f"No spots specified, using all {len(spot_names_to_process)} available spots for slim documents"
)
for spot_name in spot_names_to_process:
try:
spot_id = self._get_spot_id_from_name(spot_name)
offset = 0
has_more = True
# If no spots specified, get all spots
spot_names_to_process = self.spot_names
if not spot_names_to_process:
spot_names_to_process = self._get_all_spot_names()
logger.info(
f"No spots specified, using all {len(spot_names_to_process)} available spots for slim documents"
)
while has_more:
logger.info(
f"Retrieving slim documents from spot {spot_name}, offset {offset}"
)
response = self.client.get_spot_items(
spot_id=spot_id, offset=offset, page_size=self.batch_size
)
for spot_name in spot_names_to_process:
try:
spot_id = self._get_spot_id_from_name(spot_name)
offset = 0
has_more = True
items = response.get("collection", [])
if not items:
has_more = False
continue
for item in items:
item_id = item.get("id")
if not item_id:
continue
slim_doc_batch.append(
SlimDocument(id=f"HIGHSPOT_{item_id}")
)
if len(slim_doc_batch) >= _SLIM_BATCH_SIZE:
yield slim_doc_batch
slim_doc_batch = []
has_more = len(items) >= self.batch_size
offset += self.batch_size
except (HighspotClientError, ValueError) as e:
logger.error(
f"Error retrieving slim documents from spot {spot_name}: {str(e)}"
while has_more:
logger.info(
f"Retrieving slim documents from spot {spot_name}, offset {offset}"
)
response = self.client.get_spot_items(
spot_id=spot_id, offset=offset, page_size=self.batch_size
)
if slim_doc_batch:
yield slim_doc_batch
except Exception as e:
logger.error(f"Error in Highspot Slim Connector: {str(e)}")
raise
items = response.get("collection", [])
if not items:
has_more = False
continue
for item in items:
item_id = item.get("id")
if not item_id:
continue
slim_doc_batch.append(SlimDocument(id=f"HIGHSPOT_{item_id}"))
if len(slim_doc_batch) >= _SLIM_BATCH_SIZE:
yield slim_doc_batch
slim_doc_batch = []
has_more = len(items) >= self.batch_size
offset += self.batch_size
except (HighspotClientError, ValueError) as e:
logger.error(
f"Error retrieving slim documents from spot {spot_name}: {str(e)}"
)
if slim_doc_batch:
yield slim_doc_batch
def validate_credentials(self) -> bool:
"""


@@ -1,4 +1,3 @@
import sys
from datetime import datetime
from enum import Enum
from typing import Any
@@ -41,9 +40,6 @@ class TextSection(Section):
text: str
link: str | None = None
def __sizeof__(self) -> int:
return sys.getsizeof(self.text) + sys.getsizeof(self.link)
class ImageSection(Section):
"""Section containing an image reference"""
@@ -51,9 +47,6 @@ class ImageSection(Section):
image_file_name: str
link: str | None = None
def __sizeof__(self) -> int:
return sys.getsizeof(self.image_file_name) + sys.getsizeof(self.link)
class BasicExpertInfo(BaseModel):
"""Basic Information for the owner of a document, any of the fields can be left as None
@@ -117,14 +110,6 @@ class BasicExpertInfo(BaseModel):
)
)
def __sizeof__(self) -> int:
size = sys.getsizeof(self.display_name)
size += sys.getsizeof(self.first_name)
size += sys.getsizeof(self.middle_initial)
size += sys.getsizeof(self.last_name)
size += sys.getsizeof(self.email)
return size
class DocumentBase(BaseModel):
"""Used for Onyx ingestion api, the ID is inferred before use if not provided"""
@@ -178,35 +163,6 @@ class DocumentBase(BaseModel):
attributes.append(k + INDEX_SEPARATOR + v)
return attributes
def __sizeof__(self) -> int:
size = sys.getsizeof(self.id)
for section in self.sections:
size += sys.getsizeof(section)
size += sys.getsizeof(self.source)
size += sys.getsizeof(self.semantic_identifier)
size += sys.getsizeof(self.doc_updated_at)
size += sys.getsizeof(self.chunk_count)
if self.primary_owners is not None:
for primary_owner in self.primary_owners:
size += sys.getsizeof(primary_owner)
else:
size += sys.getsizeof(self.primary_owners)
if self.secondary_owners is not None:
for secondary_owner in self.secondary_owners:
size += sys.getsizeof(secondary_owner)
else:
size += sys.getsizeof(self.secondary_owners)
size += sys.getsizeof(self.title)
size += sys.getsizeof(self.from_ingestion_api)
size += sys.getsizeof(self.additional_info)
return size
def get_text_content(self) -> str:
return " ".join([section.text for section in self.sections if section.text])
class Document(DocumentBase):
"""Used for Onyx ingestion api, the ID is required"""
@@ -235,12 +191,6 @@ class Document(DocumentBase):
from_ingestion_api=base.from_ingestion_api,
)
def __sizeof__(self) -> int:
size = super().__sizeof__()
size += sys.getsizeof(self.id)
size += sys.getsizeof(self.source)
return size
class IndexingDocument(Document):
"""Document with processed sections for indexing"""


@@ -1,9 +1,4 @@
import gc
import os
import sys
import tempfile
from collections import defaultdict
from pathlib import Path
from typing import Any
from simple_salesforce import Salesforce
@@ -26,13 +21,9 @@ from onyx.connectors.salesforce.salesforce_calls import get_all_children_of_sf_t
from onyx.connectors.salesforce.sqlite_functions import get_affected_parent_ids_by_type
from onyx.connectors.salesforce.sqlite_functions import get_record
from onyx.connectors.salesforce.sqlite_functions import init_db
from onyx.connectors.salesforce.sqlite_functions import sqlite_log_stats
from onyx.connectors.salesforce.sqlite_functions import update_sf_db_with_csv
from onyx.connectors.salesforce.utils import BASE_DATA_PATH
from onyx.connectors.salesforce.utils import get_sqlite_db_path
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
from shared_configs.configs import MULTI_TENANT
logger = setup_logger()
@@ -41,8 +32,6 @@ _DEFAULT_PARENT_OBJECT_TYPES = ["Account"]
class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
MAX_BATCH_BYTES = 1024 * 1024
def __init__(
self,
batch_size: int = INDEX_BATCH_SIZE,
@@ -75,45 +64,22 @@ class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
raise ConnectorMissingCredentialError("Salesforce")
return self._sf_client
@staticmethod
def reconstruct_object_types(directory: str) -> dict[str, list[str] | None]:
"""
Scans the given directory for all CSV files and reconstructs the available object types.
Assumes filenames are formatted as "ObjectType.filename.csv" or "ObjectType.csv".
Args:
directory (str): The path to the directory containing CSV files.
Returns:
dict[str, list[str]]: A dictionary mapping object types to lists of file paths.
"""
object_types = defaultdict(list)
for filename in os.listdir(directory):
if filename.endswith(".csv"):
parts = filename.split(".", 1) # Split on the first period
object_type = parts[0] # Take the first part as the object type
object_types[object_type].append(os.path.join(directory, filename))
return dict(object_types)
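# Illustrative round trip (filenames are made up): _download_object_csvs renames a bulk-download
# file such as "7f3a.csv" to "Account.7f3a.csv", and reconstruct_object_types then recovers
# {"Account": ["<directory>/Account.7f3a.csv"]} by splitting each filename on the first ".".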
@staticmethod
def _download_object_csvs(
directory: str,
parent_object_list: list[str],
sf_client: Salesforce,
def _fetch_from_salesforce(
self,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> None:
all_object_types: set[str] = set(parent_object_list)
) -> GenerateDocumentsOutput:
init_db()
all_object_types: set[str] = set(self.parent_object_list)
logger.info(
f"Parent object types: num={len(parent_object_list)} list={parent_object_list}"
)
logger.info(f"Starting with {len(self.parent_object_list)} parent object types")
logger.debug(f"Parent object types: {self.parent_object_list}")
# This takes like 20 seconds
for parent_object_type in parent_object_list:
child_types = get_all_children_of_sf_type(sf_client, parent_object_type)
for parent_object_type in self.parent_object_list:
child_types = get_all_children_of_sf_type(
self.sf_client, parent_object_type
)
all_object_types.update(child_types)
logger.debug(
f"Found {len(child_types)} child types for {parent_object_type}"
@@ -122,53 +88,20 @@ class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
# Always want to make sure user is grabbed for permissioning purposes
all_object_types.add("User")
logger.info(
f"All object types: num={len(all_object_types)} list={all_object_types}"
)
# gc.collect()
logger.info(f"Found total of {len(all_object_types)} object types to fetch")
logger.debug(f"All object types: {all_object_types}")
# checkpoint - we've found all object types, now time to fetch the data
logger.info("Fetching CSVs for all object types")
logger.info("Starting to fetch CSVs for all object types")
# This takes like 30 minutes first time and <2 minutes for updates
object_type_to_csv_path = fetch_all_csvs_in_parallel(
sf_client=sf_client,
sf_client=self.sf_client,
object_types=all_object_types,
start=start,
end=end,
target_dir=directory,
)
# print useful information
num_csvs = 0
num_bytes = 0
for object_type, csv_paths in object_type_to_csv_path.items():
if not csv_paths:
continue
for csv_path in csv_paths:
if not csv_path:
continue
file_path = Path(csv_path)
file_size = file_path.stat().st_size
num_csvs += 1
num_bytes += file_size
logger.info(
f"CSV info: object_type={object_type} path={csv_path} bytes={file_size}"
)
logger.info(f"CSV info total: total_csvs={num_csvs} total_bytes={num_bytes}")
@staticmethod
def _load_csvs_to_db(csv_directory: str, db_directory: str) -> set[str]:
updated_ids: set[str] = set()
object_type_to_csv_path = SalesforceConnector.reconstruct_object_types(
csv_directory
)
# This takes like 10 seconds
# This is for testing the rest of the functionality if data has
# already been fetched and put in sqlite
@@ -187,16 +120,10 @@ class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
# If path is None, it means it failed to fetch the csv
if csv_paths is None:
continue
# Go through each csv path and use it to update the db
for csv_path in csv_paths:
logger.debug(
f"Processing CSV: object_type={object_type} "
f"csv={csv_path} "
f"len={Path(csv_path).stat().st_size}"
)
logger.debug(f"Updating {object_type} with {csv_path}")
new_ids = update_sf_db_with_csv(
db_directory,
object_type=object_type,
csv_download_path=csv_path,
)
@@ -205,127 +132,49 @@ class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
f"Added {len(new_ids)} new/updated records for {object_type}"
)
os.remove(csv_path)
return updated_ids
def _fetch_from_salesforce(
self,
temp_dir: str,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> GenerateDocumentsOutput:
logger.info("_fetch_from_salesforce starting.")
if not self._sf_client:
raise RuntimeError("self._sf_client is None!")
init_db(temp_dir)
sqlite_log_stats(temp_dir)
# Step 1 - download
SalesforceConnector._download_object_csvs(
temp_dir, self.parent_object_list, self._sf_client, start, end
)
gc.collect()
# Step 2 - load CSV's to sqlite
updated_ids = SalesforceConnector._load_csvs_to_db(temp_dir, temp_dir)
gc.collect()
logger.info(f"Found {len(updated_ids)} total updated records")
logger.info(
f"Starting to process parent objects of types: {self.parent_object_list}"
)
# Step 3 - extract and index docs
batches_processed = 0
docs_processed = 0
docs_to_yield: list[Document] = []
docs_to_yield_bytes = 0
docs_processed = 0
# Takes 15-20 seconds per batch
for parent_type, parent_id_batch in get_affected_parent_ids_by_type(
temp_dir,
updated_ids=list(updated_ids),
parent_types=self.parent_object_list,
):
batches_processed += 1
logger.info(
f"Processing batch: index={batches_processed} "
f"object_type={parent_type} "
f"len={len(parent_id_batch)} "
f"processed={docs_processed} "
f"remaining={len(updated_ids) - docs_processed}"
f"Processing batch of {len(parent_id_batch)} {parent_type} objects"
)
for parent_id in parent_id_batch:
if not (parent_object := get_record(temp_dir, parent_id, parent_type)):
if not (parent_object := get_record(parent_id, parent_type)):
logger.warning(
f"Failed to get parent object {parent_id} for {parent_type}"
)
continue
doc = convert_sf_object_to_doc(
temp_dir,
sf_object=parent_object,
sf_instance=self.sf_client.sf_instance,
docs_to_yield.append(
convert_sf_object_to_doc(
sf_object=parent_object,
sf_instance=self.sf_client.sf_instance,
)
)
doc_sizeof = sys.getsizeof(doc)
docs_to_yield_bytes += doc_sizeof
docs_to_yield.append(doc)
docs_processed += 1
# memory usage is sensitive to the input length, so we're yielding immediately
# if the batch exceeds a certain byte length
if (
len(docs_to_yield) >= self.batch_size
or docs_to_yield_bytes > SalesforceConnector.MAX_BATCH_BYTES
):
if len(docs_to_yield) >= self.batch_size:
yield docs_to_yield
docs_to_yield = []
docs_to_yield_bytes = 0
# observed a memory leak / size issue with the account table if we don't gc.collect here.
gc.collect()
yield docs_to_yield
logger.info(
f"Final processing stats: "
f"processed={docs_processed} "
f"remaining={len(updated_ids) - docs_processed}"
)
def load_from_state(self) -> GenerateDocumentsOutput:
if MULTI_TENANT:
# if multi tenant, we cannot expect the sqlite db to be cached/present
with tempfile.TemporaryDirectory() as temp_dir:
return self._fetch_from_salesforce(temp_dir)
# nuke the db since we're starting from scratch
sqlite_db_path = get_sqlite_db_path(BASE_DATA_PATH)
if os.path.exists(sqlite_db_path):
logger.info(f"load_from_state: Removing db at {sqlite_db_path}.")
os.remove(sqlite_db_path)
return self._fetch_from_salesforce(BASE_DATA_PATH)
return self._fetch_from_salesforce()
def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
if MULTI_TENANT:
# if multi tenant, we cannot expect the sqlite db to be cached/present
with tempfile.TemporaryDirectory() as temp_dir:
return self._fetch_from_salesforce(temp_dir, start=start, end=end)
if start == 0:
# nuke the db if we're starting from scratch
sqlite_db_path = get_sqlite_db_path(BASE_DATA_PATH)
if os.path.exists(sqlite_db_path):
logger.info(
f"poll_source: Starting at time 0, removing db at {sqlite_db_path}."
)
os.remove(sqlite_db_path)
return self._fetch_from_salesforce(BASE_DATA_PATH)
return self._fetch_from_salesforce(start=start, end=end)
def retrieve_all_slim_documents(
self,
@@ -360,7 +209,7 @@ if __name__ == "__main__":
"sf_security_token": os.environ["SF_SECURITY_TOKEN"],
}
)
start_time = time.monotonic()
start_time = time.time()
doc_count = 0
section_count = 0
text_count = 0
@@ -372,7 +221,7 @@ if __name__ == "__main__":
for section in doc.sections:
if isinstance(section, TextSection) and section.text is not None:
text_count += len(section.text)
end_time = time.monotonic()
end_time = time.time()
print(f"Doc count: {doc_count}")
print(f"Section count: {section_count}")


@@ -124,14 +124,13 @@ def _extract_section(salesforce_object: SalesforceObject, base_url: str) -> Text
def _extract_primary_owners(
directory: str,
sf_object: SalesforceObject,
) -> list[BasicExpertInfo] | None:
object_dict = sf_object.data
if not (last_modified_by_id := object_dict.get("LastModifiedById")):
logger.warning(f"No LastModifiedById found for {sf_object.id}")
return None
if not (last_modified_by := get_record(directory, last_modified_by_id)):
if not (last_modified_by := get_record(last_modified_by_id)):
logger.warning(f"No LastModifiedBy found for {last_modified_by_id}")
return None
@@ -160,7 +159,6 @@ def _extract_primary_owners(
def convert_sf_object_to_doc(
directory: str,
sf_object: SalesforceObject,
sf_instance: str,
) -> Document:
@@ -172,8 +170,8 @@ def convert_sf_object_to_doc(
extracted_semantic_identifier = object_dict.get("Name", "Unknown Object")
sections = [_extract_section(sf_object, base_url)]
for id in get_child_ids(directory, sf_object.id):
if not (child_object := get_record(directory, id)):
for id in get_child_ids(sf_object.id):
if not (child_object := get_record(id)):
continue
sections.append(_extract_section(child_object, base_url))
@@ -183,7 +181,7 @@ def convert_sf_object_to_doc(
source=DocumentSource.SALESFORCE,
semantic_identifier=extracted_semantic_identifier,
doc_updated_at=extracted_doc_updated_at,
primary_owners=_extract_primary_owners(directory, sf_object),
primary_owners=_extract_primary_owners(sf_object),
metadata={},
)
return doc


@@ -11,12 +11,13 @@ from simple_salesforce.bulk2 import SFBulk2Type
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.salesforce.sqlite_functions import has_at_least_one_object_of_type
from onyx.connectors.salesforce.utils import get_object_type_path
from onyx.utils.logger import setup_logger
logger = setup_logger()
def _build_last_modified_time_filter_for_salesforce(
def _build_time_filter_for_salesforce(
start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
) -> str:
if start is None or end is None:
@@ -29,19 +30,6 @@ def _build_last_modified_time_filter_for_salesforce(
)
def _build_created_date_time_filter_for_salesforce(
start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
) -> str:
if start is None or end is None:
return ""
start_datetime = datetime.fromtimestamp(start, UTC)
end_datetime = datetime.fromtimestamp(end, UTC)
return (
f" WHERE CreatedDate > {start_datetime.isoformat()} "
f"AND CreatedDate < {end_datetime.isoformat()}"
)
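# Worked example (UTC): for start=1700000000 and end=1700086400 the created-date filter above
# returns
#   " WHERE CreatedDate > 2023-11-14T22:13:20+00:00 AND CreatedDate < 2023-11-15T22:13:20+00:00"
# since datetime.fromtimestamp(ts, UTC).isoformat() yields the offset-aware form shown; the
# LastModifiedDate variant presumably mirrors this with LastModifiedDate in place of CreatedDate.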
def _get_sf_type_object_json(sf_client: Salesforce, type_name: str) -> Any:
sf_object = SFType(type_name, sf_client.session_id, sf_client.sf_instance)
return sf_object.describe()
@@ -121,6 +109,23 @@ def _check_if_object_type_is_empty(
return True
def _check_for_existing_csvs(sf_type: str) -> list[str] | None:
# Check if the csv already exists
if os.path.exists(get_object_type_path(sf_type)):
existing_csvs = [
os.path.join(get_object_type_path(sf_type), f)
for f in os.listdir(get_object_type_path(sf_type))
if f.endswith(".csv")
]
# If the csv already exists, return the path
# This is likely due to a previous run that failed
# after downloading the csv but before the data was
# written to the db
if existing_csvs:
return existing_csvs
return None
def _build_bulk_query(sf_client: Salesforce, sf_type: str, time_filter: str) -> str:
queryable_fields = _get_all_queryable_fields_of_sf_type(sf_client, sf_type)
query = f"SELECT {', '.join(queryable_fields)} FROM {sf_type}{time_filter}"
@@ -128,15 +133,16 @@ def _build_bulk_query(sf_client: Salesforce, sf_type: str, time_filter: str) ->
def _bulk_retrieve_from_salesforce(
sf_client: Salesforce, sf_type: str, time_filter: str, target_dir: str
sf_client: Salesforce,
sf_type: str,
time_filter: str,
) -> tuple[str, list[str] | None]:
"""Returns a tuple of
1. the salesforce object type
2. the list of CSV's
"""
if not _check_if_object_type_is_empty(sf_client, sf_type, time_filter):
return sf_type, None
if existing_csvs := _check_for_existing_csvs(sf_type):
return sf_type, existing_csvs
query = _build_bulk_query(sf_client, sf_type, time_filter)
bulk_2_handler = SFBulk2Handler(
@@ -153,33 +159,20 @@ def _bulk_retrieve_from_salesforce(
)
logger.info(f"Downloading {sf_type}")
logger.debug(f"Query: {query}")
logger.info(f"Query: {query}")
try:
# This downloads the file to a file in the target path with a random name
results = bulk_2_type.download(
query=query,
path=target_dir,
path=get_object_type_path(sf_type),
max_records=1000000,
)
# prepend each downloaded csv with the object type (delimiter = '.')
all_download_paths: list[str] = []
for result in results:
original_file_path = result["file"]
directory, filename = os.path.split(original_file_path)
new_filename = f"{sf_type}.{filename}"
new_file_path = os.path.join(directory, new_filename)
os.rename(original_file_path, new_file_path)
all_download_paths.append(new_file_path)
all_download_paths = [result["file"] for result in results]
logger.info(f"Downloaded {sf_type} to {all_download_paths}")
return sf_type, all_download_paths
except Exception as e:
logger.error(
f"Failed to download salesforce csv for object type {sf_type}: {e}"
)
logger.warning(f"Query that raised an exception for object type {sf_type}: {query}")
logger.info(f"Failed to download salesforce csv for object type {sf_type}: {e}")
return sf_type, None
@@ -188,35 +181,12 @@ def fetch_all_csvs_in_parallel(
object_types: set[str],
start: SecondsSinceUnixEpoch | None,
end: SecondsSinceUnixEpoch | None,
target_dir: str,
) -> dict[str, list[str] | None]:
"""
Fetches all the csvs in parallel for the given object types
Returns a dict of (sf_type, full_download_path)
"""
# these types don't query properly and need looking at
# problem_types: set[str] = {
# "ContentDocumentLink",
# "RecordActionHistory",
# "PendingOrderSummary",
# "UnifiedActivityRelation",
# }
# these types don't have a LastModifiedDate field and instead use CreatedDate
created_date_types: set[str] = {
"AccountHistory",
"AccountTag",
"EntitySubscription",
}
last_modified_time_filter = _build_last_modified_time_filter_for_salesforce(
start, end
)
created_date_time_filter = _build_created_date_time_filter_for_salesforce(
start, end
)
time_filter = _build_time_filter_for_salesforce(start, end)
time_filter_for_each_object_type = {}
# We do this outside of the thread pool executor because this requires
# a database connection and we don't want to block the thread pool
@@ -225,11 +195,8 @@ def fetch_all_csvs_in_parallel(
"""Only add time filter if there is at least one object of the type
in the database. We aren't worried about partially completed object update runs
because this occurs after we check for existing csvs which covers this case"""
if has_at_least_one_object_of_type(target_dir, sf_type):
if sf_type in created_date_types:
time_filter_for_each_object_type[sf_type] = created_date_time_filter
else:
time_filter_for_each_object_type[sf_type] = last_modified_time_filter
if has_at_least_one_object_of_type(sf_type):
time_filter_for_each_object_type[sf_type] = time_filter
else:
time_filter_for_each_object_type[sf_type] = ""
@@ -240,7 +207,6 @@ def fetch_all_csvs_in_parallel(
sf_client=sf_client,
sf_type=object_type,
time_filter=time_filter_for_each_object_type[object_type],
target_dir=target_dir,
),
object_types,
)


@@ -2,10 +2,8 @@ import csv
import json
import os
import sqlite3
import time
from collections.abc import Iterator
from contextlib import contextmanager
from pathlib import Path
from onyx.connectors.salesforce.utils import get_sqlite_db_path
from onyx.connectors.salesforce.utils import SalesforceObject
@@ -18,7 +16,6 @@ logger = setup_logger()
@contextmanager
def get_db_connection(
directory: str,
isolation_level: str | None = None,
) -> Iterator[sqlite3.Connection]:
"""Get a database connection with proper isolation level and error handling.
@@ -28,7 +25,7 @@ def get_db_connection(
can be "IMMEDIATE" or "EXCLUSIVE" for more strict isolation.
"""
# 60 second timeout for locks
conn = sqlite3.connect(get_sqlite_db_path(directory), timeout=60.0)
conn = sqlite3.connect(get_sqlite_db_path(), timeout=60.0)
if isolation_level is not None:
conn.isolation_level = isolation_level
@@ -41,41 +38,17 @@ def get_db_connection(
conn.close()
def sqlite_log_stats(directory: str) -> None:
with get_db_connection(directory, "EXCLUSIVE") as conn:
cache_pages = conn.execute("PRAGMA cache_size").fetchone()[0]
page_size = conn.execute("PRAGMA page_size").fetchone()[0]
if cache_pages >= 0:
cache_bytes = cache_pages * page_size
else:
cache_bytes = abs(cache_pages * 1024)
logger.info(
f"SQLite stats: sqlite_version={sqlite3.sqlite_version} "
f"cache_pages={cache_pages} "
f"page_size={page_size} "
f"cache_bytes={cache_bytes}"
)
def init_db(directory: str) -> None:
def init_db() -> None:
"""Initialize the SQLite database with required tables if they don't exist."""
# Create database directory if it doesn't exist
start = time.monotonic()
os.makedirs(os.path.dirname(get_sqlite_db_path()), exist_ok=True)
os.makedirs(os.path.dirname(get_sqlite_db_path(directory)), exist_ok=True)
with get_db_connection(directory, "EXCLUSIVE") as conn:
with get_db_connection("EXCLUSIVE") as conn:
cursor = conn.cursor()
db_exists = os.path.exists(get_sqlite_db_path(directory))
if db_exists:
file_path = Path(get_sqlite_db_path(directory))
file_size = file_path.stat().st_size
logger.info(f"init_db - found existing sqlite db: len={file_size}")
else:
# why is this only if the db doesn't exist?
db_exists = os.path.exists(get_sqlite_db_path())
if not db_exists:
# Enable WAL mode for better concurrent access and write performance
cursor.execute("PRAGMA journal_mode=WAL")
cursor.execute("PRAGMA synchronous=NORMAL")
@@ -170,31 +143,16 @@ def init_db(directory: str) -> None:
""",
)
elapsed = time.monotonic() - start
logger.info(f"init_db - create tables and indices: elapsed={elapsed:.2f}")
# Analyze tables to help query planner
# NOTE(rkuo): skip ANALYZE - it takes too long and we likely don't have
# complicated queries that need this
# start = time.monotonic()
# cursor.execute("ANALYZE relationships")
# cursor.execute("ANALYZE salesforce_objects")
# cursor.execute("ANALYZE relationship_types")
# cursor.execute("ANALYZE user_email_map")
# elapsed = time.monotonic() - start
# logger.info(f"init_db - analyze: elapsed={elapsed:.2f}")
cursor.execute("ANALYZE relationships")
cursor.execute("ANALYZE salesforce_objects")
cursor.execute("ANALYZE relationship_types")
cursor.execute("ANALYZE user_email_map")
# If database already existed but user_email_map needs to be populated
start = time.monotonic()
cursor.execute("SELECT COUNT(*) FROM user_email_map")
elapsed = time.monotonic() - start
logger.info(f"init_db - count user_email_map: elapsed={elapsed:.2f}")
start = time.monotonic()
if cursor.fetchone()[0] == 0:
_update_user_email_map(conn)
elapsed = time.monotonic() - start
logger.info(f"init_db - update_user_email_map: elapsed={elapsed:.2f}")
conn.commit()
@@ -282,15 +240,15 @@ def _update_user_email_map(conn: sqlite3.Connection) -> None:
def update_sf_db_with_csv(
directory: str,
object_type: str,
csv_download_path: str,
delete_csv_after_use: bool = True,
) -> list[str]:
"""Update the SF DB with a CSV file using SQLite storage."""
updated_ids = []
# Use IMMEDIATE to get a write lock at the start of the transaction
with get_db_connection(directory, "IMMEDIATE") as conn:
with get_db_connection("IMMEDIATE") as conn:
cursor = conn.cursor()
with open(csv_download_path, "r", newline="", encoding="utf-8") as f:
@@ -337,12 +295,17 @@ def update_sf_db_with_csv(
conn.commit()
if delete_csv_after_use:
# Remove the csv file after it has been used
# to successfully update the db
os.remove(csv_download_path)
return updated_ids
def get_child_ids(directory: str, parent_id: str) -> set[str]:
def get_child_ids(parent_id: str) -> set[str]:
"""Get all child IDs for a given parent ID."""
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
# Force index usage with INDEXED BY
@@ -354,9 +317,9 @@ def get_child_ids(directory: str, parent_id: str) -> set[str]:
return child_ids
def get_type_from_id(directory: str, object_id: str) -> str | None:
def get_type_from_id(object_id: str) -> str | None:
"""Get the type of an object from its ID."""
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT object_type FROM salesforce_objects WHERE id = ?", (object_id,)
@@ -369,15 +332,15 @@ def get_type_from_id(directory: str, object_id: str) -> str | None:
def get_record(
directory: str, object_id: str, object_type: str | None = None
object_id: str, object_type: str | None = None
) -> SalesforceObject | None:
"""Retrieve the record and return it as a SalesforceObject."""
if object_type is None:
object_type = get_type_from_id(directory, object_id)
object_type = get_type_from_id(object_id)
if not object_type:
return None
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT data FROM salesforce_objects WHERE id = ?", (object_id,))
result = cursor.fetchone()
@@ -389,9 +352,9 @@ def get_record(
return SalesforceObject(id=object_id, type=object_type, data=data)
def find_ids_by_type(directory: str, object_type: str) -> list[str]:
def find_ids_by_type(object_type: str) -> list[str]:
"""Find all object IDs for rows of the specified type."""
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT id FROM salesforce_objects WHERE object_type = ?", (object_type,)
@@ -400,7 +363,6 @@ def find_ids_by_type(directory: str, object_type: str) -> list[str]:
def get_affected_parent_ids_by_type(
directory: str,
updated_ids: list[str],
parent_types: list[str],
batch_size: int = 500,
@@ -412,7 +374,7 @@ def get_affected_parent_ids_by_type(
updated_ids_batches = batch_list(updated_ids, batch_size)
updated_parent_ids: set[str] = set()
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
for batch_ids in updated_ids_batches:
@@ -457,7 +419,7 @@ def get_affected_parent_ids_by_type(
yield parent_type, new_affected_ids
def has_at_least_one_object_of_type(directory: str, object_type: str) -> bool:
def has_at_least_one_object_of_type(object_type: str) -> bool:
"""Check if there is at least one object of the specified type in the database.
Args:
@@ -466,7 +428,7 @@ def has_at_least_one_object_of_type(directory: str, object_type: str) -> bool:
Returns:
bool: True if at least one object exists, False otherwise
"""
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT COUNT(*) FROM salesforce_objects WHERE object_type = ?",
@@ -481,7 +443,7 @@ def has_at_least_one_object_of_type(directory: str, object_type: str) -> bool:
NULL_ID_STRING = "N/A"
def get_user_id_by_email(directory: str, email: str) -> str | None:
def get_user_id_by_email(email: str) -> str | None:
"""Get the Salesforce User ID for a given email address.
Args:
@@ -492,7 +454,7 @@ def get_user_id_by_email(directory: str, email: str) -> str | None:
- was_found: True if the email exists in the table, False if not found
- user_id: The Salesforce User ID if exists, None otherwise
"""
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT user_id FROM user_email_map WHERE email = ?", (email,))
result = cursor.fetchone()
@@ -501,10 +463,10 @@ def get_user_id_by_email(directory: str, email: str) -> str | None:
return result[0]
def update_email_to_id_table(directory: str, email: str, id: str | None) -> None:
def update_email_to_id_table(email: str, id: str | None) -> None:
"""Update the email to ID map table with a new email and ID."""
id_to_use = id or NULL_ID_STRING
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"INSERT OR REPLACE INTO user_email_map (email, user_id) VALUES (?, ?)",


@@ -30,9 +30,9 @@ class SalesforceObject:
BASE_DATA_PATH = os.path.join(os.path.dirname(__file__), "data")
def get_sqlite_db_path(directory: str) -> str:
def get_sqlite_db_path() -> str:
"""Get the path to the sqlite db file."""
return os.path.join(directory, "salesforce_db.sqlite")
return os.path.join(BASE_DATA_PATH, "salesforce_db.sqlite")
def get_object_type_path(object_type: str) -> str:


@@ -14,8 +14,6 @@ from typing import cast
from pydantic import BaseModel
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
from slack_sdk.http_retry import ConnectionErrorRetryHandler
from slack_sdk.http_retry import RetryHandler
from typing_extensions import override
from onyx.configs.app_configs import ENABLE_EXPENSIVE_EXPERT_CALLS
@@ -28,8 +26,6 @@ from onyx.connectors.exceptions import InsufficientPermissionsError
from onyx.connectors.exceptions import UnexpectedValidationError
from onyx.connectors.interfaces import CheckpointConnector
from onyx.connectors.interfaces import CheckpointOutput
from onyx.connectors.interfaces import CredentialsConnector
from onyx.connectors.interfaces import CredentialsProviderInterface
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
@@ -42,16 +38,15 @@ from onyx.connectors.models import DocumentFailure
from onyx.connectors.models import EntityFailure
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.connectors.slack.onyx_retry_handler import OnyxRedisSlackRetryHandler
from onyx.connectors.slack.utils import expert_info_from_slack_id
from onyx.connectors.slack.utils import get_message_link
from onyx.connectors.slack.utils import make_paginated_slack_api_call_w_retries
from onyx.connectors.slack.utils import make_slack_api_call_w_retries
from onyx.connectors.slack.utils import SlackTextCleaner
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.redis.redis_pool import get_redis_client
from onyx.utils.logger import setup_logger
logger = setup_logger()
_SLACK_LIMIT = 900
@@ -255,9 +250,7 @@ _DISALLOWED_MSG_SUBTYPES = {
def default_msg_filter(message: MessageType) -> bool:
# Don't keep messages from bots
if message.get("bot_id") or message.get("app_id"):
bot_profile_name = message.get("bot_profile", {}).get("name")
print(f"bot_profile_name: {bot_profile_name}")
if bot_profile_name == "DanswerBot Testing":
if message.get("bot_profile", {}).get("name") == "OnyxConnector":
return False
return True
@@ -500,13 +493,9 @@ def _process_message(
)
class SlackConnector(
SlimConnector, CredentialsConnector, CheckpointConnector[SlackCheckpoint]
):
class SlackConnector(SlimConnector, CheckpointConnector[SlackCheckpoint]):
FAST_TIMEOUT = 1
MAX_RETRIES = 7 # arbitrarily selected
def __init__(
self,
channels: list[str] | None = None,
@@ -525,49 +514,16 @@ class SlackConnector(
# just used for efficiency
self.text_cleaner: SlackTextCleaner | None = None
self.user_cache: dict[str, BasicExpertInfo | None] = {}
self.credentials_provider: CredentialsProviderInterface | None = None
self.credential_prefix: str | None = None
self.delay_lock: str | None = None # the redis key for the shared lock
self.delay_key: str | None = None # the redis key for the shared delay
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
raise NotImplementedError("Use set_credentials_provider with this connector.")
def set_credentials_provider(
self, credentials_provider: CredentialsProviderInterface
) -> None:
credentials = credentials_provider.get_credentials()
tenant_id = credentials_provider.get_tenant_id()
self.redis = get_redis_client(tenant_id=tenant_id)
self.credential_prefix = (
f"connector:slack:credential_{credentials_provider.get_provider_key()}"
)
self.delay_lock = f"{self.credential_prefix}:delay_lock"
self.delay_key = f"{self.credential_prefix}:delay"
# NOTE: slack has a built in RateLimitErrorRetryHandler, but it isn't designed
# for concurrent workers. We've extended it with OnyxRedisSlackRetryHandler.
connection_error_retry_handler = ConnectionErrorRetryHandler()
onyx_rate_limit_error_retry_handler = OnyxRedisSlackRetryHandler(
max_retry_count=self.MAX_RETRIES,
delay_lock=self.delay_lock,
delay_key=self.delay_key,
r=self.redis,
)
custom_retry_handlers: list[RetryHandler] = [
connection_error_retry_handler,
onyx_rate_limit_error_retry_handler,
]
bot_token = credentials["slack_bot_token"]
self.client = WebClient(token=bot_token, retry_handlers=custom_retry_handlers)
self.client = WebClient(token=bot_token)
# use for requests that must return quickly (e.g. realtime flows where user is waiting)
self.fast_client = WebClient(
token=bot_token, timeout=SlackConnector.FAST_TIMEOUT
)
self.text_cleaner = SlackTextCleaner(client=self.client)
self.credentials_provider = credentials_provider
return None
def retrieve_all_slim_documents(
self,
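For context on the retry wiring shown in this hunk: slack_sdk's WebClient accepts a list of RetryHandler objects through its retry_handlers argument, and a second client with a short timeout serves latency-sensitive calls. A minimal, self-contained sketch of that wiring, using a placeholder bot token and slack_sdk's stock rate-limit handler in place of the Redis-backed one:

from slack_sdk import WebClient
from slack_sdk.http_retry import ConnectionErrorRetryHandler
from slack_sdk.http_retry import RetryHandler
from slack_sdk.http_retry.builtin_handlers import RateLimitErrorRetryHandler

# "xoxb-placeholder" stands in for the real bot token pulled from credentials.
BOT_TOKEN = "xoxb-placeholder"

retry_handlers: list[RetryHandler] = [
    ConnectionErrorRetryHandler(),                   # retry transient connection errors
    RateLimitErrorRetryHandler(max_retry_count=7),   # slack_sdk's built-in 429 handler
]

client = WebClient(token=BOT_TOKEN, retry_handlers=retry_handlers)
# Separate client with a short timeout for calls where a user is waiting,
# mirroring SlackConnector.FAST_TIMEOUT above.
fast_client = WebClient(token=BOT_TOKEN, timeout=1)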

View File

@@ -1,159 +0,0 @@
import math
import random
import time
from typing import cast
from typing import Optional
from redis import Redis
from redis.lock import Lock as RedisLock
from slack_sdk.http_retry.handler import RetryHandler
from slack_sdk.http_retry.request import HttpRequest
from slack_sdk.http_retry.response import HttpResponse
from slack_sdk.http_retry.state import RetryState
from onyx.utils.logger import setup_logger
logger = setup_logger()
class OnyxRedisSlackRetryHandler(RetryHandler):
"""
This class uses Redis to share a rate limit among multiple threads.
Threads that encounter a rate limit will observe the shared delay, increment the
shared delay with the retry value, and use the new shared value as a wait interval.
This has the effect of serializing calls when a rate limit is hit, which is what
needs to happen if the server punishes us with additional limiting when we make
a call too early. We believe this is what Slack is doing based on empirical
observation, meaning we see indefinite hangs if we're too aggressive.
Another way to do this is just to do exponential backoff. Might be easier?
Adapted from slack's RateLimitErrorRetryHandler.
"""
LOCK_TTL = 60 # used to serialize access to the retry TTL
LOCK_BLOCKING_TIMEOUT = 60 # how long to wait for the lock
"""RetryHandler that does retries for rate limited errors."""
def __init__(
self,
max_retry_count: int,
delay_lock: str,
delay_key: str,
r: Redis,
):
"""
delay_lock: the redis key to use with RedisLock (to synchronize access to delay_key)
delay_key: the redis key containing a shared TTL
"""
super().__init__(max_retry_count=max_retry_count)
self._redis: Redis = r
self._delay_lock = delay_lock
self._delay_key = delay_key
def _can_retry(
self,
*,
state: RetryState,
request: HttpRequest,
response: Optional[HttpResponse] = None,
error: Optional[Exception] = None,
) -> bool:
return response is not None and response.status_code == 429
def prepare_for_next_attempt(
self,
*,
state: RetryState,
request: HttpRequest,
response: Optional[HttpResponse] = None,
error: Optional[Exception] = None,
) -> None:
"""It seems this function is responsible for the wait to retry ... aka we
actually sleep in this function."""
retry_after_value: list[str] | None = None
retry_after_header_name: Optional[str] = None
duration_s: float = 1.0 # seconds
if response is None:
# NOTE(rkuo): this logic comes from RateLimitErrorRetryHandler.
# This reads oddly, as if the caller itself could raise the exception.
# We don't have the luxury of changing this.
if error:
raise error
return
state.next_attempt_requested = True # this signals the caller to retry
# calculate wait duration based on retry-after + some jitter
for k in response.headers.keys():
if k.lower() == "retry-after":
retry_after_header_name = k
break
try:
if retry_after_header_name is None:
# This situation usually does not arise. Just in case.
raise ValueError(
"OnyxRedisSlackRetryHandler.prepare_for_next_attempt: retry-after header name is None"
)
retry_after_value = response.headers.get(retry_after_header_name)
if not retry_after_value:
raise ValueError(
"OnyxRedisSlackRetryHandler.prepare_for_next_attempt: retry-after header value is None"
)
retry_after_value_int = int(
retry_after_value[0]
) # will raise ValueError if somehow we can't convert to int
jitter = retry_after_value_int * 0.25 * random.random()
duration_s = math.ceil(retry_after_value_int + jitter)
except ValueError:
duration_s += random.random()
# lock and extend the ttl
lock: RedisLock = self._redis.lock(
self._delay_lock,
timeout=OnyxRedisSlackRetryHandler.LOCK_TTL,
thread_local=False,
)
acquired = lock.acquire(
blocking_timeout=OnyxRedisSlackRetryHandler.LOCK_BLOCKING_TIMEOUT / 2
)
ttl_ms: int | None = None
try:
if acquired:
# if we can get the lock, then read and extend the ttl
ttl_ms = cast(int, self._redis.pttl(self._delay_key))
if ttl_ms < 0: # negative values are error status codes ... see docs
ttl_ms = 0
ttl_ms_new = ttl_ms + int(duration_s * 1000.0)
self._redis.set(self._delay_key, "1", px=ttl_ms_new)
else:
# if we can't get the lock, just go ahead.
# TODO: if we know our actual parallelism, multiplying by that
# would be a pretty good idea
ttl_ms_new = int(duration_s * 1000.0)
finally:
if acquired:
lock.release()
logger.warning(
f"OnyxRedisSlackRetryHandler.prepare_for_next_attempt wait: "
f"retry-after={retry_after_value} "
f"shared_delay_ms={ttl_ms} new_shared_delay_ms={ttl_ms_new}"
)
# TODO: would be good to take an event var and sleep in short increments to
# allow for a clean exit / exception
time.sleep(ttl_ms_new / 1000.0)
state.increment_current_attempt()
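Two redis-py details make the read-and-extend step above work: pttl() returns a negative status code (-2 for a missing key, -1 for no expiry) rather than raising, and set(..., px=...) rewrites the key with a fresh TTL, so the TTL itself carries the shared delay. A standalone sketch of that step, assuming a reachable local Redis and illustrative key names:

import time
from redis import Redis
from redis.lock import Lock as RedisLock

r = Redis()  # assumes a local Redis at the default host/port
DELAY_LOCK = "demo:slack:delay_lock"   # illustrative key names, not the connector's
DELAY_KEY = "demo:slack:delay"

def extend_shared_delay(duration_s: float) -> float:
    """Add duration_s to the shared delay and return the total wait in seconds."""
    lock: RedisLock = r.lock(DELAY_LOCK, timeout=60, thread_local=False)
    acquired = lock.acquire(blocking_timeout=30)
    try:
        if acquired:
            ttl_ms = r.pttl(DELAY_KEY)
            if ttl_ms < 0:          # -2 = key missing, -1 = no expiry; treat as zero
                ttl_ms = 0
            ttl_ms_new = ttl_ms + int(duration_s * 1000)
            r.set(DELAY_KEY, "1", px=ttl_ms_new)   # the value is a placeholder; the TTL is the data
        else:
            ttl_ms_new = int(duration_s * 1000)    # couldn't coordinate; wait our own share
    finally:
        if acquired:
            lock.release()
    return ttl_ms_new / 1000.0

# e.g. after a 429 with "Retry-After: 30", each worker extends the shared delay, then waits its share
wait_s = extend_shared_delay(30.0)
time.sleep(wait_s)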

View File

@@ -1,4 +1,5 @@
import re
import time
from collections.abc import Callable
from collections.abc import Generator
from functools import lru_cache
@@ -63,72 +64,71 @@ def _make_slack_api_call_paginated(
return paginated_call
# NOTE(rkuo): we may not need this any more if the integrated retry handlers work as
# expected. Do we want to keep this around?
def make_slack_api_rate_limited(
call: Callable[..., SlackResponse], max_retries: int = 7
) -> Callable[..., SlackResponse]:
"""Wraps calls to slack API so that they automatically handle rate limiting"""
# def make_slack_api_rate_limited(
# call: Callable[..., SlackResponse], max_retries: int = 7
# ) -> Callable[..., SlackResponse]:
# """Wraps calls to slack API so that they automatically handle rate limiting"""
@wraps(call)
def rate_limited_call(**kwargs: Any) -> SlackResponse:
last_exception = None
# @wraps(call)
# def rate_limited_call(**kwargs: Any) -> SlackResponse:
# last_exception = None
for _ in range(max_retries):
try:
# Make the API call
response = call(**kwargs)
# for _ in range(max_retries):
# try:
# # Make the API call
# response = call(**kwargs)
# Check for errors in the response, will raise `SlackApiError`
# if anything went wrong
response.validate()
return response
# # Check for errors in the response, will raise `SlackApiError`
# # if anything went wrong
# response.validate()
# return response
except SlackApiError as e:
last_exception = e
try:
error = e.response["error"]
except KeyError:
error = "unknown error"
# except SlackApiError as e:
# last_exception = e
# try:
# error = e.response["error"]
# except KeyError:
# error = "unknown error"
if error == "ratelimited":
# Handle rate limiting: get the 'Retry-After' header value and sleep for that duration
retry_after = int(e.response.headers.get("Retry-After", 1))
logger.info(
f"Slack call rate limited, retrying after {retry_after} seconds. Exception: {e}"
)
time.sleep(retry_after)
elif error in ["already_reacted", "no_reaction", "internal_error"]:
# Log internal_error and return the response instead of failing
logger.warning(
f"Slack call encountered '{error}', skipping and continuing..."
)
return e.response
else:
# Raise the error for non-transient errors
raise
# if error == "ratelimited":
# # Handle rate limiting: get the 'Retry-After' header value and sleep for that duration
# retry_after = int(e.response.headers.get("Retry-After", 1))
# logger.info(
# f"Slack call rate limited, retrying after {retry_after} seconds. Exception: {e}"
# )
# time.sleep(retry_after)
# elif error in ["already_reacted", "no_reaction", "internal_error"]:
# # Log internal_error and return the response instead of failing
# logger.warning(
# f"Slack call encountered '{error}', skipping and continuing..."
# )
# return e.response
# else:
# # Raise the error for non-transient errors
# raise
# If the code reaches this point, all retries have been exhausted
msg = f"Max retries ({max_retries}) exceeded"
if last_exception:
raise Exception(msg) from last_exception
else:
raise Exception(msg)
# # If the code reaches this point, all retries have been exhausted
# msg = f"Max retries ({max_retries}) exceeded"
# if last_exception:
# raise Exception(msg) from last_exception
# else:
# raise Exception(msg)
# return rate_limited_call
return rate_limited_call
def make_slack_api_call_w_retries(
call: Callable[..., SlackResponse], **kwargs: Any
) -> SlackResponse:
return basic_retry_wrapper(call)(**kwargs)
return basic_retry_wrapper(make_slack_api_rate_limited(call))(**kwargs)
def make_paginated_slack_api_call_w_retries(
call: Callable[..., SlackResponse], **kwargs: Any
) -> Generator[dict[str, Any], None, None]:
return _make_slack_api_call_paginated(basic_retry_wrapper(call))(**kwargs)
return _make_slack_api_call_paginated(
basic_retry_wrapper(make_slack_api_rate_limited(call))
)(**kwargs)
def expert_info_from_slack_id(
@@ -142,7 +142,7 @@ def expert_info_from_slack_id(
if user_id in user_cache:
return user_cache[user_id]
response = client.users_info(user=user_id)
response = make_slack_api_rate_limited(client.users_info)(user=user_id)
if not response["ok"]:
user_cache[user_id] = None
@@ -175,7 +175,9 @@ class SlackTextCleaner:
def _get_slack_name(self, user_id: str) -> str:
if user_id not in self._id_to_name_map:
try:
response = self._client.users_info(user=user_id)
response = make_slack_api_rate_limited(self._client.users_info)(
user=user_id
)
# prefer display name if set, since that is what is shown in Slack
self._id_to_name_map[user_id] = (
response["user"]["profile"]["display_name"]

View File

@@ -60,7 +60,7 @@ class SearchSettingsCreationRequest(InferenceSettings, IndexingSetting):
inference_settings = InferenceSettings.from_db_model(search_settings)
indexing_setting = IndexingSetting.from_db_model(search_settings)
return cls(**inference_settings.model_dump(), **indexing_setting.model_dump())
return cls(**inference_settings.dict(), **indexing_setting.dict())
class SavedSearchSettings(InferenceSettings, IndexingSetting):
@@ -80,9 +80,6 @@ class SavedSearchSettings(InferenceSettings, IndexingSetting):
reduced_dimension=search_settings.reduced_dimension,
# Whether switching to this model requires re-indexing
background_reindex_enabled=search_settings.background_reindex_enabled,
enable_contextual_rag=search_settings.enable_contextual_rag,
contextual_rag_llm_name=search_settings.contextual_rag_llm_name,
contextual_rag_llm_provider=search_settings.contextual_rag_llm_provider,
# Reranking Details
rerank_model_name=search_settings.rerank_model_name,
rerank_provider_type=search_settings.rerank_provider_type,
@@ -105,8 +102,6 @@ class BaseFilters(BaseModel):
document_set: list[str] | None = None
time_cutoff: datetime | None = None
tags: list[Tag] | None = None
user_file_ids: list[int] | None = None
user_folder_ids: list[int] | None = None
class IndexFilters(BaseFilters):
@@ -223,8 +218,6 @@ class InferenceChunk(BaseChunk):
# to specify that a set of words should be highlighted. For example:
# ["<hi>the</hi> <hi>answer</hi> is 42", "he couldn't find an <hi>answer</hi>"]
match_highlights: list[str]
doc_summary: str
chunk_context: str
# when the doc was last updated
updated_at: datetime | None
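The single-line change near the top of this file swaps Pydantic v1's .dict() for v2's .model_dump(); both flatten a model into a plain dict so two models' fields can be splatted into a combined class. A generic sketch of that pattern with illustrative stand-in models (not the ones in this file):

from pydantic import BaseModel

class InferencePart(BaseModel):            # stand-in for the inference-settings half
    rerank_model_name: str | None = None

class IndexingPart(BaseModel):             # stand-in for the indexing-settings half
    model_name: str = "example-embedding-model"

class Combined(InferencePart, IndexingPart):
    @classmethod
    def merge(cls, inference: InferencePart, indexing: IndexingPart) -> "Combined":
        # model_dump() is the Pydantic v2 name for the deprecated .dict()
        return cls(**inference.model_dump(), **indexing.model_dump())

combined = Combined.merge(InferencePart(), IndexingPart())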

View File

@@ -5,13 +5,11 @@ from typing import cast
from sqlalchemy.orm import Session
from onyx.chat.models import ContextualPruningConfig
from onyx.chat.models import PromptConfig
from onyx.chat.models import SectionRelevancePiece
from onyx.chat.prune_and_merge import _merge_sections
from onyx.chat.prune_and_merge import ChunkRange
from onyx.chat.prune_and_merge import merge_chunk_intervals
from onyx.chat.prune_and_merge import prune_and_merge_sections
from onyx.configs.chat_configs import DISABLE_LLM_DOC_RELEVANCE
from onyx.context.search.enums import LLMEvaluationType
from onyx.context.search.enums import QueryFlow
@@ -63,7 +61,6 @@ class SearchPipeline:
| None = None,
rerank_metrics_callback: Callable[[RerankMetricsContainer], None] | None = None,
prompt_config: PromptConfig | None = None,
contextual_pruning_config: ContextualPruningConfig | None = None,
):
# NOTE: The Search Request contains a lot of fields that are overrides, many of them can be None
# and typically are None. The preprocessing will fetch default values to replace these empty overrides.
@@ -80,9 +77,6 @@ class SearchPipeline:
self.search_settings = get_current_search_settings(db_session)
self.document_index = get_default_document_index(self.search_settings, None)
self.prompt_config: PromptConfig | None = prompt_config
self.contextual_pruning_config: ContextualPruningConfig | None = (
contextual_pruning_config
)
# Preprocessing steps generate this
self._search_query: SearchQuery | None = None
@@ -164,47 +158,6 @@ class SearchPipeline:
return cast(list[InferenceChunk], self._retrieved_chunks)
def get_ordering_only_chunks(
self,
query: str,
user_file_ids: list[int] | None = None,
user_folder_ids: list[int] | None = None,
) -> list[InferenceChunk]:
"""Optimized method that only retrieves chunks for ordering purposes.
Skips all extra processing and uses minimal configuration to speed up retrieval.
"""
logger.info("Fast path: Using optimized chunk retrieval for ordering-only mode")
# Create minimal filters with just user file/folder IDs
filters = IndexFilters(
user_file_ids=user_file_ids or [],
user_folder_ids=user_folder_ids or [],
access_control_list=None,
)
# Use a simplified query that skips all unnecessary processing
minimal_query = SearchQuery(
query=query,
search_type=SearchType.SEMANTIC,
filters=filters,
# Set minimal options needed for retrieval
evaluation_type=LLMEvaluationType.SKIP,
recency_bias_multiplier=1.0,
chunks_above=0, # No need for surrounding context
chunks_below=0, # No need for surrounding context
processed_keywords=[], # Empty list instead of None
rerank_settings=None,
hybrid_alpha=0.0,
max_llm_filter_sections=0,
)
# Retrieve chunks using the minimal configuration
return retrieve_chunks(
query=minimal_query,
document_index=self.document_index,
db_session=self.db_session,
)
@log_function_time(print_only=True)
def _get_sections(self) -> list[InferenceSection]:
"""Returns an expanded section from each of the chunks.
@@ -227,7 +180,7 @@ class SearchPipeline:
# If ee is enabled, censor the chunk sections based on user access
# Otherwise, return the retrieved chunks
censored_chunks: list[InferenceChunk] = fetch_ee_implementation_or_noop(
censored_chunks = fetch_ee_implementation_or_noop(
"onyx.external_permissions.post_query_censoring",
"_post_query_chunk_censoring",
retrieved_chunks,
@@ -386,12 +339,6 @@ class SearchPipeline:
self._retrieved_sections = self._get_sections()
return self._retrieved_sections
@property
def merged_retrieved_sections(self) -> list[InferenceSection]:
"""Should be used to display in the UI in order to prevent displaying
multiple sections for the same document as separate "documents"."""
return _merge_sections(sections=self.retrieved_sections)
@property
def reranked_sections(self) -> list[InferenceSection]:
"""Reranking is always done at the chunk level since section merging could create arbitrarily
@@ -426,26 +373,7 @@ class SearchPipeline:
if self._final_context_sections is not None:
return self._final_context_sections
if (
self.contextual_pruning_config is not None
and self.prompt_config is not None
):
self._final_context_sections = prune_and_merge_sections(
sections=self.reranked_sections,
section_relevance_list=None,
prompt_config=self.prompt_config,
llm_config=self.llm.config,
question=self.search_query.query,
contextual_pruning_config=self.contextual_pruning_config,
)
else:
logger.error(
"Contextual pruning or prompt config not set, using default merge"
)
self._final_context_sections = _merge_sections(
sections=self.reranked_sections
)
self._final_context_sections = _merge_sections(sections=self.reranked_sections)
return self._final_context_sections
@property
@@ -457,10 +385,6 @@ class SearchPipeline:
self.search_query.evaluation_type == LLMEvaluationType.SKIP
or DISABLE_LLM_DOC_RELEVANCE
):
if self.search_query.evaluation_type == LLMEvaluationType.SKIP:
logger.info(
"Fast path: Skipping section relevance evaluation for ordering-only mode"
)
return None
if self.search_query.evaluation_type == LLMEvaluationType.UNSPECIFIED:
@@ -491,10 +415,6 @@ class SearchPipeline:
raise ValueError(
"Basic search evaluation operation called while DISABLE_LLM_DOC_RELEVANCE is enabled."
)
# NOTE: final_context_sections must be accessed before accessing self._postprocessing_generator
# since the property sets the generator. DO NOT REMOVE.
_ = self.final_context_sections
self._section_relevance = next(
cast(
Iterator[list[SectionRelevancePiece]],
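The imports at the top of this file's hunks pull in ChunkRange and merge_chunk_intervals, whose bodies are not part of this diff; the step they imply is collapsing overlapping chunk-index windows so each expanded document section is fetched once. A generic sketch of that interval merge, with field names that are assumptions rather than the project's actual types:

from dataclasses import dataclass

@dataclass
class ChunkWindow:
    start: int  # first chunk index in the window
    end: int    # last chunk index in the window (inclusive)

def merge_windows(windows: list[ChunkWindow]) -> list[ChunkWindow]:
    """Collapse overlapping or adjacent chunk windows into maximal ranges."""
    merged: list[ChunkWindow] = []
    for w in sorted(windows, key=lambda x: x.start):
        if merged and w.start <= merged[-1].end + 1:
            merged[-1].end = max(merged[-1].end, w.end)
        else:
            merged.append(ChunkWindow(w.start, w.end))
    return merged

assert merge_windows(
    [ChunkWindow(0, 2), ChunkWindow(2, 4), ChunkWindow(7, 8)]
) == [ChunkWindow(0, 4), ChunkWindow(7, 8)]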

View File

@@ -11,7 +11,6 @@ from langchain_core.messages import SystemMessage
from onyx.chat.models import SectionRelevancePiece
from onyx.configs.app_configs import BLURB_SIZE
from onyx.configs.app_configs import IMAGE_ANALYSIS_SYSTEM_PROMPT
from onyx.configs.chat_configs import DISABLE_LLM_DOC_RELEVANCE
from onyx.configs.constants import RETURN_SEPARATOR
from onyx.configs.llm_configs import get_search_time_image_analysis_enabled
from onyx.configs.model_configs import CROSS_ENCODER_RANGE_MAX
@@ -197,21 +196,9 @@ def cleanup_chunks(chunks: list[InferenceChunkUncleaned]) -> list[InferenceChunk
RETURN_SEPARATOR
)
def _remove_contextual_rag(chunk: InferenceChunkUncleaned) -> str:
# remove document summary
if chunk.content.startswith(chunk.doc_summary):
chunk.content = chunk.content[len(chunk.doc_summary) :].lstrip()
# remove chunk context
if chunk.content.endswith(chunk.chunk_context):
chunk.content = chunk.content[
: len(chunk.content) - len(chunk.chunk_context)
].rstrip()
return chunk.content
for chunk in chunks:
chunk.content = _remove_title(chunk)
chunk.content = _remove_metadata_suffix(chunk)
chunk.content = _remove_contextual_rag(chunk)
return [chunk.to_inference_chunk() for chunk in chunks]
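The _remove_contextual_rag helper in the hunk above peels a document-summary prefix and a chunk-context suffix off chunk content. A standalone, runnable sketch of the same string operation, using plain strings instead of InferenceChunkUncleaned:

def strip_contextual_rag(content: str, doc_summary: str, chunk_context: str) -> str:
    """Remove a leading document summary and a trailing chunk context, if present."""
    if doc_summary and content.startswith(doc_summary):
        content = content[len(doc_summary):].lstrip()
    if chunk_context and content.endswith(chunk_context):
        content = content[: len(content) - len(chunk_context)].rstrip()
    return content

assert strip_contextual_rag("SUMMARY body text CONTEXT", "SUMMARY", "CONTEXT") == "body text"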
@@ -367,21 +354,6 @@ def filter_sections(
Returns a list of the unique chunk IDs that were marked as relevant
"""
# Log evaluation type to help with debugging
logger.info(f"filter_sections called with evaluation_type={query.evaluation_type}")
# Fast path: immediately return empty list for SKIP evaluation type (ordering-only mode)
if query.evaluation_type == LLMEvaluationType.SKIP:
return []
# Additional safeguard: Log a warning if this function is ever called with SKIP evaluation type
# This should never happen if our fast paths are working correctly
if query.evaluation_type == LLMEvaluationType.SKIP:
logger.warning(
"WARNING: filter_sections called with SKIP evaluation_type. This should never happen!"
)
return []
sections_to_filter = sections_to_filter[: query.max_llm_filter_sections]
contents = [
@@ -414,16 +386,6 @@ def search_postprocessing(
llm: LLM,
rerank_metrics_callback: Callable[[RerankMetricsContainer], None] | None = None,
) -> Iterator[list[InferenceSection] | list[SectionRelevancePiece]]:
# Fast path for ordering-only: detect it by checking if evaluation_type is SKIP
if search_query.evaluation_type == LLMEvaluationType.SKIP:
logger.info(
"Fast path: Detected ordering-only mode, bypassing all post-processing"
)
# Immediately yield the sections without any processing and an empty relevance list
yield retrieved_sections
yield cast(list[SectionRelevancePiece], [])
return
post_processing_tasks: list[FunctionCall] = []
if not retrieved_sections:
@@ -460,14 +422,10 @@ def search_postprocessing(
sections_yielded = True
llm_filter_task_id = None
# Only add LLM filtering if not in SKIP mode and if LLM doc relevance is not disabled
if (
search_query.evaluation_type not in [LLMEvaluationType.SKIP]
and not DISABLE_LLM_DOC_RELEVANCE
and search_query.evaluation_type
in [LLMEvaluationType.BASIC, LLMEvaluationType.UNSPECIFIED]
):
logger.info("Adding LLM filtering task for document relevance evaluation")
if search_query.evaluation_type in [
LLMEvaluationType.BASIC,
LLMEvaluationType.UNSPECIFIED,
]:
post_processing_tasks.append(
FunctionCall(
filter_sections,
@@ -479,10 +437,6 @@ def search_postprocessing(
)
)
llm_filter_task_id = post_processing_tasks[-1].result_id
elif search_query.evaluation_type == LLMEvaluationType.SKIP:
logger.info("Fast path: Skipping LLM filtering task for ordering-only mode")
elif DISABLE_LLM_DOC_RELEVANCE:
logger.info("Skipping LLM filtering task because LLM doc relevance is disabled")
post_processing_results = (
run_functions_in_parallel(post_processing_tasks)

View File

@@ -165,18 +165,7 @@ def retrieval_preprocessing(
user_acl_filters = (
None if bypass_acl else build_access_filters_for_user(user, db_session)
)
user_file_ids = preset_filters.user_file_ids or []
user_folder_ids = preset_filters.user_folder_ids or []
if persona and persona.user_files:
user_file_ids = user_file_ids + [
file.id
for file in persona.user_files
if file.id not in (preset_filters.user_file_ids or [])
]
final_filters = IndexFilters(
user_file_ids=user_file_ids,
user_folder_ids=user_folder_ids,
source_type=preset_filters.source_type or predicted_source_filters,
document_set=preset_filters.document_set,
time_cutoff=time_filter or predicted_time_cutoff,

View File

@@ -26,7 +26,6 @@ from onyx.agents.agent_search.shared_graph_utils.models import (
from onyx.auth.schemas import UserRole
from onyx.chat.models import DocumentRelevance
from onyx.configs.chat_configs import HARD_DELETE_CHATS
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import MessageType
from onyx.context.search.models import InferenceSection
from onyx.context.search.models import RetrievalDocs
@@ -45,11 +44,9 @@ from onyx.db.models import SearchDoc
from onyx.db.models import SearchDoc as DBSearchDoc
from onyx.db.models import ToolCall
from onyx.db.models import User
from onyx.db.models import UserFile
from onyx.db.persona import get_best_persona_id_for_user
from onyx.db.pg_file_store import delete_lobj_by_name
from onyx.file_store.models import FileDescriptor
from onyx.file_store.models import InMemoryChatFile
from onyx.llm.override_models import LLMOverride
from onyx.llm.override_models import PromptOverride
from onyx.server.query_and_chat.models import ChatMessageDetail
@@ -857,87 +854,6 @@ def get_db_search_doc_by_id(doc_id: int, db_session: Session) -> DBSearchDoc | N
return search_doc
def create_search_doc_from_user_file(
db_user_file: UserFile, associated_chat_file: InMemoryChatFile, db_session: Session
) -> SearchDoc:
"""Create a SearchDoc in the database from a UserFile and return it.
This ensures proper ID generation by SQLAlchemy and prevents duplicate key errors.
"""
blurb = ""
if associated_chat_file and associated_chat_file.content:
try:
# Try to decode as UTF-8, but handle errors gracefully
content_sample = associated_chat_file.content[:100]
# Remove null bytes which can cause SQL errors
content_sample = content_sample.replace(b"\x00", b"")
blurb = content_sample.decode("utf-8", errors="replace")
except Exception:
# If decoding fails completely, provide a generic description
blurb = f"[Binary file: {db_user_file.name}]"
db_search_doc = SearchDoc(
document_id=db_user_file.document_id,
chunk_ind=0, # Default to 0 for user files
semantic_id=db_user_file.name,
link=db_user_file.link_url,
blurb=blurb,
source_type=DocumentSource.FILE, # Assuming internal source for user files
boost=0, # Default boost
hidden=False, # Default visibility
doc_metadata={}, # Empty metadata
score=0.0, # Default score of 0.0 instead of None
is_relevant=None, # No relevance initially
relevance_explanation=None, # No explanation initially
match_highlights=[], # No highlights initially
updated_at=db_user_file.created_at, # Use created_at as updated_at
primary_owners=[], # Empty list instead of None
secondary_owners=[], # Empty list instead of None
is_internet=False, # Not from internet
)
db_session.add(db_search_doc)
db_session.flush() # Get the ID but don't commit yet
return db_search_doc
def translate_db_user_file_to_search_doc(
db_user_file: UserFile, associated_chat_file: InMemoryChatFile
) -> SearchDoc:
blurb = ""
if associated_chat_file and associated_chat_file.content:
try:
# Try to decode as UTF-8, but handle errors gracefully
content_sample = associated_chat_file.content[:100]
# Remove null bytes which can cause SQL errors
content_sample = content_sample.replace(b"\x00", b"")
blurb = content_sample.decode("utf-8", errors="replace")
except Exception:
# If decoding fails completely, provide a generic description
blurb = f"[Binary file: {db_user_file.name}]"
return SearchDoc(
# Don't set ID - let SQLAlchemy auto-generate it
document_id=db_user_file.document_id,
chunk_ind=0, # Default to 0 for user files
semantic_id=db_user_file.name,
link=db_user_file.link_url,
blurb=blurb,
source_type=DocumentSource.FILE, # Assuming internal source for user files
boost=0, # Default boost
hidden=False, # Default visibility
doc_metadata={}, # Empty metadata
score=0.0, # Default score of 0.0 instead of None
is_relevant=None, # No relevance initially
relevance_explanation=None, # No explanation initially
match_highlights=[], # No highlights initially
updated_at=db_user_file.created_at, # Use created_at as updated_at
primary_owners=[], # Empty list instead of None
secondary_owners=[], # Empty list instead of None
is_internet=False, # Not from internet
)
def translate_db_search_doc_to_server_search_doc(
db_search_doc: SearchDoc,
remove_doc_content: bool = False,

View File

@@ -27,7 +27,6 @@ from onyx.db.models import IndexModelStatus
from onyx.db.models import SearchSettings
from onyx.db.models import User
from onyx.db.models import User__UserGroup
from onyx.db.models import UserFile
from onyx.db.models import UserGroup__ConnectorCredentialPair
from onyx.db.models import UserRole
from onyx.server.models import StatusResponse
@@ -107,13 +106,11 @@ def get_connector_credential_pairs_for_user(
eager_load_connector: bool = False,
eager_load_credential: bool = False,
eager_load_user: bool = False,
include_user_files: bool = False,
) -> list[ConnectorCredentialPair]:
if eager_load_user:
assert (
eager_load_credential
), "eager_load_credential must be True if eager_load_user is True"
stmt = select(ConnectorCredentialPair).distinct()
if eager_load_connector:
@@ -129,9 +126,6 @@ def get_connector_credential_pairs_for_user(
if ids:
stmt = stmt.where(ConnectorCredentialPair.id.in_(ids))
if not include_user_files:
stmt = stmt.where(ConnectorCredentialPair.is_user_file != True) # noqa: E712
return list(db_session.scalars(stmt).unique().all())
@@ -159,16 +153,14 @@ def get_connector_credential_pairs_for_user_parallel(
def get_connector_credential_pairs(
db_session: Session, ids: list[int] | None = None, include_user_files: bool = False
db_session: Session,
ids: list[int] | None = None,
) -> list[ConnectorCredentialPair]:
stmt = select(ConnectorCredentialPair).distinct()
if ids:
stmt = stmt.where(ConnectorCredentialPair.id.in_(ids))
if not include_user_files:
stmt = stmt.where(ConnectorCredentialPair.is_user_file != True) # noqa: E712
return list(db_session.scalars(stmt).all())
@@ -215,15 +207,12 @@ def get_connector_credential_pair_for_user(
connector_id: int,
credential_id: int,
user: User | None,
include_user_files: bool = False,
get_editable: bool = True,
) -> ConnectorCredentialPair | None:
stmt = select(ConnectorCredentialPair)
stmt = _add_user_filters(stmt, user, get_editable)
stmt = stmt.where(ConnectorCredentialPair.connector_id == connector_id)
stmt = stmt.where(ConnectorCredentialPair.credential_id == credential_id)
if not include_user_files:
stmt = stmt.where(ConnectorCredentialPair.is_user_file != True) # noqa: E712
result = db_session.execute(stmt)
return result.scalar_one_or_none()
@@ -332,9 +321,6 @@ def _update_connector_credential_pair(
cc_pair.total_docs_indexed += net_docs
if status is not None:
cc_pair.status = status
if cc_pair.is_user_file:
cc_pair.status = ConnectorCredentialPairStatus.PAUSED
db_session.commit()
@@ -460,7 +446,6 @@ def add_credential_to_connector(
initial_status: ConnectorCredentialPairStatus = ConnectorCredentialPairStatus.ACTIVE,
last_successful_index_time: datetime | None = None,
seeding_flow: bool = False,
is_user_file: bool = False,
) -> StatusResponse:
connector = fetch_connector_by_id(connector_id, db_session)
@@ -526,7 +511,6 @@ def add_credential_to_connector(
access_type=access_type,
auto_sync_options=auto_sync_options,
last_successful_index_time=last_successful_index_time,
is_user_file=is_user_file,
)
db_session.add(association)
db_session.flush() # make sure the association has an id
@@ -603,29 +587,14 @@ def remove_credential_from_connector(
def fetch_connector_credential_pairs(
db_session: Session,
include_user_files: bool = False,
) -> list[ConnectorCredentialPair]:
stmt = select(ConnectorCredentialPair)
if not include_user_files:
stmt = stmt.where(ConnectorCredentialPair.is_user_file != True) # noqa: E712
return list(db_session.scalars(stmt).unique().all())
return db_session.query(ConnectorCredentialPair).all()
def resync_cc_pair(
cc_pair: ConnectorCredentialPair,
search_settings_id: int,
db_session: Session,
) -> None:
"""
Updates state stored in the connector_credential_pair table based on the
latest index attempt for the given search settings.
Args:
cc_pair: ConnectorCredentialPair to resync
search_settings_id: SearchSettings to use for resync
db_session: Database session
"""
def find_latest_index_attempt(
connector_id: int,
credential_id: int,
@@ -638,10 +607,11 @@ def resync_cc_pair(
ConnectorCredentialPair,
IndexAttempt.connector_credential_pair_id == ConnectorCredentialPair.id,
)
.join(SearchSettings, IndexAttempt.search_settings_id == SearchSettings.id)
.filter(
ConnectorCredentialPair.connector_id == connector_id,
ConnectorCredentialPair.credential_id == credential_id,
IndexAttempt.search_settings_id == search_settings_id,
SearchSettings.status == IndexModelStatus.PRESENT,
)
)
@@ -664,23 +634,3 @@ def resync_cc_pair(
)
db_session.commit()
def get_connector_credential_pairs_with_user_files(
db_session: Session,
) -> list[ConnectorCredentialPair]:
"""
Get all connector credential pairs that have associated user files.
Args:
db_session: Database session
Returns:
List of ConnectorCredentialPair objects that have user files
"""
return (
db_session.query(ConnectorCredentialPair)
.join(UserFile, UserFile.cc_pair_id == ConnectorCredentialPair.id)
.distinct()
.all()
)
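The recurring toggle across this file's hunks is an include_user_files flag that, when false, filters out rows flagged is_user_file. A condensed sketch of that pattern, assuming ConnectorCredentialPair is importable from onyx.db.models as in the imports above:

from sqlalchemy import select
from sqlalchemy.orm import Session

from onyx.db.models import ConnectorCredentialPair

def fetch_cc_pairs(
    db_session: Session,
    ids: list[int] | None = None,
    include_user_files: bool = False,
) -> list[ConnectorCredentialPair]:
    stmt = select(ConnectorCredentialPair).distinct()
    if ids:
        stmt = stmt.where(ConnectorCredentialPair.id.in_(ids))
    if not include_user_files:
        # user-file pairs are managed separately, so hide them by default
        stmt = stmt.where(ConnectorCredentialPair.is_user_file != True)  # noqa: E712
    return list(db_session.scalars(stmt).unique().all())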

View File

@@ -43,8 +43,6 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()
ONE_HOUR_IN_SECONDS = 60 * 60
def check_docs_exist(db_session: Session) -> bool:
stmt = select(exists(DbDocument))
@@ -609,46 +607,6 @@ def delete_documents_complete__no_commit(
delete_documents__no_commit(db_session, document_ids)
def delete_all_documents_for_connector_credential_pair(
db_session: Session,
connector_id: int,
credential_id: int,
timeout: int = ONE_HOUR_IN_SECONDS,
) -> None:
"""Delete all documents for a given connector credential pair.
This will delete all documents and their associated data (chunks, feedback, tags, etc.)
NOTE: a bit inefficient, but it's not a big deal since this is done rarely - only during
an index swap. If we wanted to make this more efficient, we could use a single delete
statement + cascade.
"""
batch_size = 1000
start_time = time.monotonic()
while True:
# Get document IDs in batches
stmt = (
select(DocumentByConnectorCredentialPair.id)
.where(
DocumentByConnectorCredentialPair.connector_id == connector_id,
DocumentByConnectorCredentialPair.credential_id == credential_id,
)
.limit(batch_size)
)
document_ids = db_session.scalars(stmt).all()
if not document_ids:
break
delete_documents_complete__no_commit(
db_session=db_session, document_ids=list(document_ids)
)
db_session.commit()
if time.monotonic() - start_time > timeout:
raise RuntimeError("Timeout reached while deleting documents")
def acquire_document_locks(db_session: Session, document_ids: list[str]) -> bool:
"""Acquire locks for the specified documents. Ideally this shouldn't be
called with large list of document_ids (an exception could be made if the

Some files were not shown because too many files have changed in this diff.