mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-02-17 07:45:47 +00:00
Compare commits
1 Commits
sharepoint
...
double_ini
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b3953b2c2f |
2
.github/CODEOWNERS
vendored
2
.github/CODEOWNERS
vendored
@@ -1,3 +1 @@
|
||||
* @onyx-dot-app/onyx-core-team
|
||||
# Helm charts Owners
|
||||
/helm/ @justin-tahara
|
||||
|
||||
26
.github/actions/custom-build-and-push/action.yml
vendored
26
.github/actions/custom-build-and-push/action.yml
vendored
@@ -25,26 +25,12 @@ inputs:
|
||||
tags:
|
||||
description: 'Image tags'
|
||||
required: true
|
||||
no-cache:
|
||||
description: 'Read from cache'
|
||||
required: false
|
||||
default: 'false'
|
||||
cache-from:
|
||||
description: 'Cache sources'
|
||||
required: false
|
||||
cache-to:
|
||||
description: 'Cache destinations'
|
||||
required: false
|
||||
outputs:
|
||||
description: 'Output destinations'
|
||||
required: false
|
||||
provenance:
|
||||
description: 'Generate provenance attestation'
|
||||
required: false
|
||||
default: 'false'
|
||||
build-args:
|
||||
description: 'Build arguments'
|
||||
required: false
|
||||
retry-wait-time:
|
||||
description: 'Time to wait before attempt 2 in seconds'
|
||||
required: false
|
||||
@@ -69,12 +55,8 @@ runs:
|
||||
push: ${{ inputs.push }}
|
||||
load: ${{ inputs.load }}
|
||||
tags: ${{ inputs.tags }}
|
||||
no-cache: ${{ inputs.no-cache }}
|
||||
cache-from: ${{ inputs.cache-from }}
|
||||
cache-to: ${{ inputs.cache-to }}
|
||||
outputs: ${{ inputs.outputs }}
|
||||
provenance: ${{ inputs.provenance }}
|
||||
build-args: ${{ inputs.build-args }}
|
||||
|
||||
- name: Wait before attempt 2
|
||||
if: steps.buildx1.outcome != 'success'
|
||||
@@ -95,12 +77,8 @@ runs:
|
||||
push: ${{ inputs.push }}
|
||||
load: ${{ inputs.load }}
|
||||
tags: ${{ inputs.tags }}
|
||||
no-cache: ${{ inputs.no-cache }}
|
||||
cache-from: ${{ inputs.cache-from }}
|
||||
cache-to: ${{ inputs.cache-to }}
|
||||
outputs: ${{ inputs.outputs }}
|
||||
provenance: ${{ inputs.provenance }}
|
||||
build-args: ${{ inputs.build-args }}
|
||||
|
||||
- name: Wait before attempt 3
|
||||
if: steps.buildx1.outcome != 'success' && steps.buildx2.outcome != 'success'
|
||||
@@ -121,12 +99,8 @@ runs:
|
||||
push: ${{ inputs.push }}
|
||||
load: ${{ inputs.load }}
|
||||
tags: ${{ inputs.tags }}
|
||||
no-cache: ${{ inputs.no-cache }}
|
||||
cache-from: ${{ inputs.cache-from }}
|
||||
cache-to: ${{ inputs.cache-to }}
|
||||
outputs: ${{ inputs.outputs }}
|
||||
provenance: ${{ inputs.provenance }}
|
||||
build-args: ${{ inputs.build-args }}
|
||||
|
||||
- name: Report failure
|
||||
if: steps.buildx1.outcome != 'success' && steps.buildx2.outcome != 'success' && steps.buildx3.outcome != 'success'
|
||||
|
||||
5
.github/pull_request_template.md
vendored
5
.github/pull_request_template.md
vendored
@@ -6,6 +6,9 @@
|
||||
|
||||
[Describe the tests you ran to verify your changes]
|
||||
|
||||
## Additional Options
|
||||
## Backporting (check the box to trigger backport action)
|
||||
|
||||
Note: You have to check that the action passes, otherwise resolve the conflicts manually and tag the patches.
|
||||
|
||||
- [ ] This PR should be backported (make sure to check that the backport attempt succeeds)
|
||||
- [ ] [Optional] Override Linear Check
|
||||
|
||||
24
.github/workflows/check-lazy-imports.yml
vendored
24
.github/workflows/check-lazy-imports.yml
vendored
@@ -1,24 +0,0 @@
|
||||
name: Check Lazy Imports
|
||||
|
||||
on:
|
||||
merge_group:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
- 'release/**'
|
||||
|
||||
jobs:
|
||||
check-lazy-imports:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Check lazy imports
|
||||
run: python3 backend/scripts/check_lazy_imports.py
|
||||
@@ -7,47 +7,18 @@ on:
|
||||
|
||||
env:
|
||||
REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'onyxdotapp/onyx-backend-cloud' || 'onyxdotapp/onyx-backend' }}
|
||||
DEPLOYMENT: ${{ contains(github.ref_name, 'cloud') && 'cloud' || 'standalone' }}
|
||||
|
||||
# don't tag cloud images with "latest"
|
||||
LATEST_TAG: ${{ contains(github.ref_name, 'latest') && !contains(github.ref_name, 'cloud') }}
|
||||
LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
|
||||
|
||||
jobs:
|
||||
build-and-push:
|
||||
# TODO: investigate a matrix build like the web container
|
||||
# See https://runs-on.com/runners/linux/
|
||||
runs-on:
|
||||
- runs-on
|
||||
- runner=${{ matrix.platform == 'linux/amd64' && '8cpu-linux-x64' || '8cpu-linux-arm64' }}
|
||||
- run-id=${{ github.run_id }}
|
||||
- tag=platform-${{ matrix.platform }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
platform:
|
||||
- linux/amd64
|
||||
- linux/arm64
|
||||
|
||||
runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"]
|
||||
|
||||
steps:
|
||||
- name: Prepare
|
||||
run: |
|
||||
platform=${{ matrix.platform }}
|
||||
echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
|
||||
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY_IMAGE }}
|
||||
flavor: |
|
||||
latest=false
|
||||
tags: |
|
||||
type=raw,value=${{ github.ref_name }}
|
||||
type=raw,value=${{ env.LATEST_TAG == 'true' && 'latest' || '' }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
@@ -63,104 +34,30 @@ jobs:
|
||||
sudo apt-get install -y build-essential
|
||||
|
||||
- name: Backend Image Docker Build and Push
|
||||
id: build
|
||||
uses: docker/build-push-action@v6
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
platforms: ${{ matrix.platform }}
|
||||
platforms: linux/amd64,linux/arm64
|
||||
push: true
|
||||
tags: |
|
||||
${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
|
||||
${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
|
||||
build-args: |
|
||||
ONYX_VERSION=${{ github.ref_name }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true
|
||||
cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/backend-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
|
||||
cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/backend-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
|
||||
|
||||
- name: Export digest
|
||||
run: |
|
||||
mkdir -p /tmp/digests
|
||||
digest="${{ steps.build.outputs.digest }}"
|
||||
touch "/tmp/digests/${digest#sha256:}"
|
||||
|
||||
- name: Upload digest
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: backend-digests-${{ env.PLATFORM_PAIR }}-${{ github.run_id }}
|
||||
path: /tmp/digests/*
|
||||
if-no-files-found: error
|
||||
retention-days: 1
|
||||
|
||||
merge:
|
||||
runs-on: ubuntu-latest
|
||||
needs:
|
||||
- build-and-push
|
||||
steps:
|
||||
# Needed for trivyignore
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Download digests
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: /tmp/digests
|
||||
pattern: backend-digests-*-${{ github.run_id }}
|
||||
merge-multiple: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Docker meta
|
||||
id: meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY_IMAGE }}
|
||||
flavor: |
|
||||
latest=false
|
||||
tags: |
|
||||
type=raw,value=${{ github.ref_name }}
|
||||
type=raw,value=${{ env.LATEST_TAG == 'true' && 'latest' || '' }}
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Create manifest list and push
|
||||
working-directory: /tmp/digests
|
||||
run: |
|
||||
docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
|
||||
$(printf '${{ env.REGISTRY_IMAGE }}@sha256:%s ' *)
|
||||
|
||||
- name: Inspect image
|
||||
run: |
|
||||
docker buildx imagetools inspect ${{ env.REGISTRY_IMAGE }}:${{ steps.meta.outputs.version }}
|
||||
|
||||
# trivy has their own rate limiting issues causing this action to flake
|
||||
# we worked around it by hardcoding to different db repos in env
|
||||
# can re-enable when they figure it out
|
||||
# https://github.com/aquasecurity/trivy/discussions/7538
|
||||
# https://github.com/aquasecurity/trivy-action/issues/389
|
||||
# Security: Using pinned digest (0.65.0@sha256:a22415a38938a56c379387a8163fcb0ce38b10ace73e593475d3658d578b2436)
|
||||
# Security: No Docker socket mount needed for remote registry scanning
|
||||
- name: Run Trivy vulnerability scanner
|
||||
uses: nick-fields/retry@v3
|
||||
uses: aquasecurity/trivy-action@master
|
||||
env:
|
||||
TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
|
||||
TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
|
||||
with:
|
||||
timeout_minutes: 30
|
||||
max_attempts: 3
|
||||
retry_wait_seconds: 10
|
||||
command: |
|
||||
docker run --rm -v $HOME/.cache/trivy:/root/.cache/trivy \
|
||||
-v ${{ github.workspace }}/backend/.trivyignore:/tmp/.trivyignore:ro \
|
||||
-e TRIVY_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-db:2" \
|
||||
-e TRIVY_JAVA_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-java-db:1" \
|
||||
-e TRIVY_USERNAME="${{ secrets.DOCKER_USERNAME }}" \
|
||||
-e TRIVY_PASSWORD="${{ secrets.DOCKER_TOKEN }}" \
|
||||
aquasec/trivy@sha256:a22415a38938a56c379387a8163fcb0ce38b10ace73e593475d3658d578b2436 \
|
||||
image \
|
||||
--skip-version-check \
|
||||
--timeout 20m \
|
||||
--severity CRITICAL,HIGH \
|
||||
--ignorefile /tmp/.trivyignore \
|
||||
docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
|
||||
# To run locally: trivy image --severity HIGH,CRITICAL onyxdotapp/onyx-backend
|
||||
image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
|
||||
severity: "CRITICAL,HIGH"
|
||||
trivyignores: ./backend/.trivyignore
|
||||
|
||||
@@ -4,12 +4,12 @@ name: Build and Push Cloud Web Image on Tag
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- "*cloud*"
|
||||
- "*"
|
||||
|
||||
env:
|
||||
REGISTRY_IMAGE: onyxdotapp/onyx-web-server-cloud
|
||||
DEPLOYMENT: cloud
|
||||
|
||||
LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on:
|
||||
@@ -38,10 +38,9 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY_IMAGE }}
|
||||
flavor: |
|
||||
latest=false
|
||||
tags: |
|
||||
type=raw,value=${{ github.ref_name }}
|
||||
type=raw,value=${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
|
||||
type=raw,value=${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
@@ -54,7 +53,7 @@ jobs:
|
||||
|
||||
- name: Build and push by digest
|
||||
id: build
|
||||
uses: docker/build-push-action@v6
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./web
|
||||
file: ./web/Dockerfile
|
||||
@@ -71,12 +70,10 @@ jobs:
|
||||
NEXT_PUBLIC_FORGOT_PASSWORD_ENABLED=true
|
||||
NEXT_PUBLIC_INCLUDE_ERROR_POPUP_SUPPORT_LINK=true
|
||||
NODE_OPTIONS=--max-old-space-size=8192
|
||||
# needed due to weird interactions with the builds for different platforms
|
||||
no-cache: true
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true
|
||||
cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/cloudweb-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
|
||||
cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/cloudweb-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
|
||||
# no-cache needed due to weird interactions with the builds for different platforms
|
||||
# NOTE(rkuo): this may not be true any more with the proper cache prefixing by architecture - currently testing with it off
|
||||
|
||||
- name: Export digest
|
||||
run: |
|
||||
@@ -87,7 +84,7 @@ jobs:
|
||||
- name: Upload digest
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: cloudweb-digests-${{ env.PLATFORM_PAIR }}-${{ github.run_id }}
|
||||
name: digests-${{ env.PLATFORM_PAIR }}
|
||||
path: /tmp/digests/*
|
||||
if-no-files-found: error
|
||||
retention-days: 1
|
||||
@@ -101,7 +98,7 @@ jobs:
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: /tmp/digests
|
||||
pattern: cloudweb-digests-*-${{ github.run_id }}
|
||||
pattern: digests-*
|
||||
merge-multiple: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
@@ -112,10 +109,6 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY_IMAGE }}
|
||||
flavor: |
|
||||
latest=false
|
||||
tags: |
|
||||
type=raw,value=${{ github.ref_name }}
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
@@ -139,20 +132,10 @@ jobs:
|
||||
# https://github.com/aquasecurity/trivy/discussions/7538
|
||||
# https://github.com/aquasecurity/trivy-action/issues/389
|
||||
- name: Run Trivy vulnerability scanner
|
||||
uses: nick-fields/retry@v3
|
||||
uses: aquasecurity/trivy-action@master
|
||||
env:
|
||||
TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
|
||||
TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
|
||||
with:
|
||||
timeout_minutes: 30
|
||||
max_attempts: 3
|
||||
retry_wait_seconds: 10
|
||||
command: |
|
||||
docker run --rm -v $HOME/.cache/trivy:/root/.cache/trivy \
|
||||
-e TRIVY_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-db:2" \
|
||||
-e TRIVY_JAVA_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-java-db:1" \
|
||||
-e TRIVY_USERNAME="${{ secrets.DOCKER_USERNAME }}" \
|
||||
-e TRIVY_PASSWORD="${{ secrets.DOCKER_TOKEN }}" \
|
||||
aquasec/trivy@sha256:a22415a38938a56c379387a8163fcb0ce38b10ace73e593475d3658d578b2436 \
|
||||
image \
|
||||
--skip-version-check \
|
||||
--timeout 20m \
|
||||
--severity CRITICAL,HIGH \
|
||||
docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
|
||||
image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
|
||||
severity: "CRITICAL,HIGH"
|
||||
|
||||
@@ -7,13 +7,10 @@ on:
|
||||
|
||||
env:
|
||||
REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'onyxdotapp/onyx-model-server-cloud' || 'onyxdotapp/onyx-model-server' }}
|
||||
LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
|
||||
DOCKER_BUILDKIT: 1
|
||||
BUILDKIT_PROGRESS: plain
|
||||
DEPLOYMENT: ${{ contains(github.ref_name, 'cloud') && 'cloud' || 'standalone' }}
|
||||
|
||||
# don't tag cloud images with "latest"
|
||||
LATEST_TAG: ${{ contains(github.ref_name, 'latest') && !contains(github.ref_name, 'cloud') }}
|
||||
|
||||
jobs:
|
||||
|
||||
# Bypassing this for now as the idea of not building is glitching
|
||||
@@ -54,8 +51,6 @@ jobs:
|
||||
if: needs.check_model_server_changes.outputs.changed == 'true'
|
||||
runs-on:
|
||||
[runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}-amd64"]
|
||||
env:
|
||||
PLATFORM_PAIR: linux-amd64
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
@@ -80,7 +75,7 @@ jobs:
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Build and Push AMD64
|
||||
uses: docker/build-push-action@v6
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile.model_server
|
||||
@@ -91,17 +86,12 @@ jobs:
|
||||
DANSWER_VERSION=${{ github.ref_name }}
|
||||
outputs: type=registry
|
||||
provenance: false
|
||||
cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
|
||||
cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
|
||||
# no-cache: true
|
||||
|
||||
build-arm64:
|
||||
needs: [check_model_server_changes]
|
||||
if: needs.check_model_server_changes.outputs.changed == 'true'
|
||||
runs-on:
|
||||
[runs-on, runner=8cpu-linux-arm64, "run-id=${{ github.run_id }}-arm64"]
|
||||
env:
|
||||
PLATFORM_PAIR: linux-arm64
|
||||
[runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}-arm64"]
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
@@ -126,7 +116,7 @@ jobs:
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Build and Push ARM64
|
||||
uses: docker/build-push-action@v6
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile.model_server
|
||||
@@ -137,8 +127,6 @@ jobs:
|
||||
DANSWER_VERSION=${{ github.ref_name }}
|
||||
outputs: type=registry
|
||||
provenance: false
|
||||
cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
|
||||
cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/model-server-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
|
||||
|
||||
merge-and-scan:
|
||||
needs: [build-amd64, build-arm64, check_model_server_changes]
|
||||
@@ -164,20 +152,11 @@ jobs:
|
||||
fi
|
||||
|
||||
- name: Run Trivy vulnerability scanner
|
||||
uses: nick-fields/retry@v3
|
||||
uses: aquasecurity/trivy-action@master
|
||||
env:
|
||||
TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
|
||||
TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
|
||||
with:
|
||||
timeout_minutes: 30
|
||||
max_attempts: 3
|
||||
retry_wait_seconds: 10
|
||||
command: |
|
||||
docker run --rm -v $HOME/.cache/trivy:/root/.cache/trivy \
|
||||
-e TRIVY_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-db:2" \
|
||||
-e TRIVY_JAVA_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-java-db:1" \
|
||||
-e TRIVY_USERNAME="${{ secrets.DOCKER_USERNAME }}" \
|
||||
-e TRIVY_PASSWORD="${{ secrets.DOCKER_TOKEN }}" \
|
||||
aquasec/trivy@sha256:a22415a38938a56c379387a8163fcb0ce38b10ace73e593475d3658d578b2436 \
|
||||
image \
|
||||
--skip-version-check \
|
||||
--timeout 20m \
|
||||
--severity CRITICAL,HIGH \
|
||||
docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
|
||||
image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
|
||||
severity: "CRITICAL,HIGH"
|
||||
timeout: "10m"
|
||||
|
||||
@@ -8,25 +8,9 @@ on:
|
||||
env:
|
||||
REGISTRY_IMAGE: onyxdotapp/onyx-web-server
|
||||
LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}
|
||||
DEPLOYMENT: standalone
|
||||
|
||||
jobs:
|
||||
precheck:
|
||||
runs-on: [runs-on, runner=2cpu-linux-x64, "run-id=${{ github.run_id }}"]
|
||||
outputs:
|
||||
should-run: ${{ steps.set-output.outputs.should-run }}
|
||||
steps:
|
||||
- name: Check if tag contains "cloud"
|
||||
id: set-output
|
||||
run: |
|
||||
if [[ "${{ github.ref_name }}" == *cloud* ]]; then
|
||||
echo "should-run=false" >> "$GITHUB_OUTPUT"
|
||||
else
|
||||
echo "should-run=true" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
build:
|
||||
needs: precheck
|
||||
if: needs.precheck.outputs.should-run == 'true'
|
||||
runs-on:
|
||||
- runs-on
|
||||
- runner=${{ matrix.platform == 'linux/amd64' && '8cpu-linux-x64' || '8cpu-linux-arm64' }}
|
||||
@@ -53,11 +37,9 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY_IMAGE }}
|
||||
flavor: |
|
||||
latest=false
|
||||
tags: |
|
||||
type=raw,value=${{ github.ref_name }}
|
||||
type=raw,value=${{ env.LATEST_TAG == 'true' && 'latest' || '' }}
|
||||
type=raw,value=${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
|
||||
type=raw,value=${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
@@ -70,7 +52,7 @@ jobs:
|
||||
|
||||
- name: Build and push by digest
|
||||
id: build
|
||||
uses: docker/build-push-action@v6
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: ./web
|
||||
file: ./web/Dockerfile
|
||||
@@ -80,13 +62,11 @@ jobs:
|
||||
ONYX_VERSION=${{ github.ref_name }}
|
||||
NODE_OPTIONS=--max-old-space-size=8192
|
||||
|
||||
# needed due to weird interactions with the builds for different platforms
|
||||
no-cache: true
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true
|
||||
cache-from: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/web-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
|
||||
cache-to: type=s3,prefix=cache/${{ github.repository }}/${{ env.DEPLOYMENT }}/web-${{ env.PLATFORM_PAIR }}/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
|
||||
# no-cache needed due to weird interactions with the builds for different platforms
|
||||
# NOTE(rkuo): this may not be true any more with the proper cache prefixing by architecture - currently testing with it off
|
||||
|
||||
|
||||
- name: Export digest
|
||||
run: |
|
||||
mkdir -p /tmp/digests
|
||||
@@ -96,22 +76,21 @@ jobs:
|
||||
- name: Upload digest
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: web-digests-${{ env.PLATFORM_PAIR }}-${{ github.run_id }}
|
||||
name: digests-${{ env.PLATFORM_PAIR }}
|
||||
path: /tmp/digests/*
|
||||
if-no-files-found: error
|
||||
retention-days: 1
|
||||
|
||||
merge:
|
||||
runs-on: ubuntu-latest
|
||||
needs:
|
||||
- build
|
||||
if: needs.precheck.outputs.should-run == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Download digests
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: /tmp/digests
|
||||
pattern: web-digests-*-${{ github.run_id }}
|
||||
pattern: digests-*
|
||||
merge-multiple: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
@@ -122,11 +101,6 @@ jobs:
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ${{ env.REGISTRY_IMAGE }}
|
||||
flavor: |
|
||||
latest=false
|
||||
tags: |
|
||||
type=raw,value=${{ github.ref_name }}
|
||||
type=raw,value=${{ env.LATEST_TAG == 'true' && 'latest' || '' }}
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
@@ -150,20 +124,10 @@ jobs:
|
||||
# https://github.com/aquasecurity/trivy/discussions/7538
|
||||
# https://github.com/aquasecurity/trivy-action/issues/389
|
||||
- name: Run Trivy vulnerability scanner
|
||||
uses: nick-fields/retry@v3
|
||||
uses: aquasecurity/trivy-action@master
|
||||
env:
|
||||
TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
|
||||
TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
|
||||
with:
|
||||
timeout_minutes: 30
|
||||
max_attempts: 3
|
||||
retry_wait_seconds: 10
|
||||
command: |
|
||||
docker run --rm -v $HOME/.cache/trivy:/root/.cache/trivy \
|
||||
-e TRIVY_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-db:2" \
|
||||
-e TRIVY_JAVA_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-java-db:1" \
|
||||
-e TRIVY_USERNAME="${{ secrets.DOCKER_USERNAME }}" \
|
||||
-e TRIVY_PASSWORD="${{ secrets.DOCKER_TOKEN }}" \
|
||||
aquasec/trivy@sha256:a22415a38938a56c379387a8163fcb0ce38b10ace73e593475d3658d578b2436 \
|
||||
image \
|
||||
--skip-version-check \
|
||||
--timeout 20m \
|
||||
--severity CRITICAL,HIGH \
|
||||
docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
|
||||
image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
|
||||
severity: "CRITICAL,HIGH"
|
||||
|
||||
50
.github/workflows/helm-chart-releases.yml
vendored
50
.github/workflows/helm-chart-releases.yml
vendored
@@ -1,50 +0,0 @@
|
||||
name: Release Onyx Helm Charts
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
permissions: write-all
|
||||
|
||||
jobs:
|
||||
release:
|
||||
permissions:
|
||||
contents: write
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install Helm CLI
|
||||
uses: azure/setup-helm@v4
|
||||
with:
|
||||
version: v3.12.1
|
||||
|
||||
- name: Add required Helm repositories
|
||||
run: |
|
||||
helm repo add bitnami https://charts.bitnami.com/bitnami
|
||||
helm repo add onyx-vespa https://onyx-dot-app.github.io/vespa-helm-charts
|
||||
helm repo add keda https://kedacore.github.io/charts
|
||||
helm repo update
|
||||
|
||||
- name: Build chart dependencies
|
||||
run: |
|
||||
set -euo pipefail
|
||||
for chart_dir in deployment/helm/charts/*; do
|
||||
if [ -f "$chart_dir/Chart.yaml" ]; then
|
||||
echo "Building dependencies for $chart_dir"
|
||||
helm dependency build "$chart_dir"
|
||||
fi
|
||||
done
|
||||
|
||||
- name: Publish Helm charts to gh-pages
|
||||
uses: stefanprodan/helm-gh-pages@v1.7.0
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
charts_dir: deployment/helm/charts
|
||||
branch: gh-pages
|
||||
commit_username: ${{ github.actor }}
|
||||
commit_email: ${{ github.actor }}@users.noreply.github.com
|
||||
124
.github/workflows/pr-backport-autotrigger.yml
vendored
Normal file
124
.github/workflows/pr-backport-autotrigger.yml
vendored
Normal file
@@ -0,0 +1,124 @@
|
||||
name: Backport on Merge
|
||||
|
||||
# Note this workflow does not trigger the builds, be sure to manually tag the branches to trigger the builds
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types: [closed] # Later we check for merge so only PRs that go in can get backported
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
actions: write
|
||||
|
||||
jobs:
|
||||
backport:
|
||||
if: github.event.pull_request.merged == true
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.YUHONG_GH_ACTIONS }}
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ssh-key: "${{ secrets.RKUO_DEPLOY_KEY }}"
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Git user
|
||||
run: |
|
||||
git config user.name "Richard Kuo [bot]"
|
||||
git config user.email "rkuo[bot]@onyx.app"
|
||||
git fetch --prune
|
||||
|
||||
- name: Check for Backport Checkbox
|
||||
id: checkbox-check
|
||||
run: |
|
||||
PR_BODY="${{ github.event.pull_request.body }}"
|
||||
if [[ "$PR_BODY" == *"[x] This PR should be backported"* ]]; then
|
||||
echo "backport=true" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "backport=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: List and sort release branches
|
||||
id: list-branches
|
||||
run: |
|
||||
git fetch --all --tags
|
||||
BRANCHES=$(git for-each-ref --format='%(refname:short)' refs/remotes/origin/release/* | sed 's|origin/release/||' | sort -Vr)
|
||||
BETA=$(echo "$BRANCHES" | head -n 1)
|
||||
STABLE=$(echo "$BRANCHES" | head -n 2 | tail -n 1)
|
||||
echo "beta=release/$BETA" >> $GITHUB_OUTPUT
|
||||
echo "stable=release/$STABLE" >> $GITHUB_OUTPUT
|
||||
# Fetch latest tags for beta and stable
|
||||
LATEST_BETA_TAG=$(git tag -l "v[0-9]*.[0-9]*.[0-9]*-beta.[0-9]*" | grep -E "^v[0-9]+\.[0-9]+\.[0-9]+-beta\.[0-9]+$" | grep -v -- "-cloud" | sort -Vr | head -n 1)
|
||||
LATEST_STABLE_TAG=$(git tag -l "v[0-9]*.[0-9]*.[0-9]*" | grep -E "^v[0-9]+\.[0-9]+\.[0-9]+$" | sort -Vr | head -n 1)
|
||||
|
||||
# Handle case where no beta tags exist
|
||||
if [[ -z "$LATEST_BETA_TAG" ]]; then
|
||||
NEW_BETA_TAG="v1.0.0-beta.1"
|
||||
else
|
||||
NEW_BETA_TAG=$(echo $LATEST_BETA_TAG | awk -F '[.-]' '{print $1 "." $2 "." $3 "-beta." ($NF+1)}')
|
||||
fi
|
||||
|
||||
# Increment latest stable tag
|
||||
NEW_STABLE_TAG=$(echo $LATEST_STABLE_TAG | awk -F '.' '{print $1 "." $2 "." ($3+1)}')
|
||||
echo "latest_beta_tag=$LATEST_BETA_TAG" >> $GITHUB_OUTPUT
|
||||
echo "latest_stable_tag=$LATEST_STABLE_TAG" >> $GITHUB_OUTPUT
|
||||
echo "new_beta_tag=$NEW_BETA_TAG" >> $GITHUB_OUTPUT
|
||||
echo "new_stable_tag=$NEW_STABLE_TAG" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Echo branch and tag information
|
||||
run: |
|
||||
echo "Beta branch: ${{ steps.list-branches.outputs.beta }}"
|
||||
echo "Stable branch: ${{ steps.list-branches.outputs.stable }}"
|
||||
echo "Latest beta tag: ${{ steps.list-branches.outputs.latest_beta_tag }}"
|
||||
echo "Latest stable tag: ${{ steps.list-branches.outputs.latest_stable_tag }}"
|
||||
echo "New beta tag: ${{ steps.list-branches.outputs.new_beta_tag }}"
|
||||
echo "New stable tag: ${{ steps.list-branches.outputs.new_stable_tag }}"
|
||||
|
||||
- name: Trigger Backport
|
||||
if: steps.checkbox-check.outputs.backport == 'true'
|
||||
run: |
|
||||
set -e
|
||||
echo "Backporting to beta ${{ steps.list-branches.outputs.beta }} and stable ${{ steps.list-branches.outputs.stable }}"
|
||||
|
||||
# Echo the merge commit SHA
|
||||
echo "Merge commit SHA: ${{ github.event.pull_request.merge_commit_sha }}"
|
||||
|
||||
# Fetch all history for all branches and tags
|
||||
git fetch --prune
|
||||
|
||||
# Reset and prepare the beta branch
|
||||
git checkout ${{ steps.list-branches.outputs.beta }}
|
||||
echo "Last 5 commits on beta branch:"
|
||||
git log -n 5 --pretty=format:"%H"
|
||||
echo "" # Newline for formatting
|
||||
|
||||
# Cherry-pick the merge commit from the merged PR
|
||||
git cherry-pick -m 1 ${{ github.event.pull_request.merge_commit_sha }} || {
|
||||
echo "Cherry-pick to beta failed due to conflicts."
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Create new beta branch/tag
|
||||
git tag ${{ steps.list-branches.outputs.new_beta_tag }}
|
||||
# Push the changes and tag to the beta branch using PAT
|
||||
git push origin ${{ steps.list-branches.outputs.beta }}
|
||||
git push origin ${{ steps.list-branches.outputs.new_beta_tag }}
|
||||
|
||||
# Reset and prepare the stable branch
|
||||
git checkout ${{ steps.list-branches.outputs.stable }}
|
||||
echo "Last 5 commits on stable branch:"
|
||||
git log -n 5 --pretty=format:"%H"
|
||||
echo "" # Newline for formatting
|
||||
|
||||
# Cherry-pick the merge commit from the merged PR
|
||||
git cherry-pick -m 1 ${{ github.event.pull_request.merge_commit_sha }} || {
|
||||
echo "Cherry-pick to stable failed due to conflicts."
|
||||
exit 1
|
||||
}
|
||||
|
||||
# Create new stable branch/tag
|
||||
git tag ${{ steps.list-branches.outputs.new_stable_tag }}
|
||||
# Push the changes and tag to the stable branch using PAT
|
||||
git push origin ${{ steps.list-branches.outputs.stable }}
|
||||
git push origin ${{ steps.list-branches.outputs.new_stable_tag }}
|
||||
@@ -1,110 +0,0 @@
|
||||
name: External Dependency Unit Tests
|
||||
|
||||
on:
|
||||
merge_group:
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
env:
|
||||
# AWS
|
||||
S3_AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }}
|
||||
S3_AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }}
|
||||
|
||||
# MinIO
|
||||
S3_ENDPOINT_URL: "http://localhost:9004"
|
||||
|
||||
# Confluence
|
||||
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
|
||||
CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
|
||||
CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
|
||||
CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
|
||||
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
|
||||
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
|
||||
|
||||
# LLMs
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
|
||||
|
||||
jobs:
|
||||
discover-test-dirs:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
test-dirs: ${{ steps.set-matrix.outputs.test-dirs }}
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Discover test directories
|
||||
id: set-matrix
|
||||
run: |
|
||||
# Find all subdirectories in backend/tests/external_dependency_unit
|
||||
dirs=$(find backend/tests/external_dependency_unit -mindepth 1 -maxdepth 1 -type d -exec basename {} \; | sort | jq -R -s -c 'split("\n")[:-1]')
|
||||
echo "test-dirs=$dirs" >> $GITHUB_OUTPUT
|
||||
|
||||
external-dependency-unit-tests:
|
||||
needs: discover-test-dirs
|
||||
# Use larger runner with more resources for Vespa
|
||||
runs-on: [runs-on, runner=16cpu-linux-x64, "run-id=${{ github.run_id }}"]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
test-dir: ${{ fromJson(needs.discover-test-dirs.outputs.test-dirs) }}
|
||||
|
||||
env:
|
||||
PYTHONPATH: ./backend
|
||||
MODEL_SERVER_HOST: "disabled"
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
cache: "pip"
|
||||
cache-dependency-path: |
|
||||
backend/requirements/default.txt
|
||||
backend/requirements/dev.txt
|
||||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
|
||||
playwright install chromium
|
||||
playwright install-deps chromium
|
||||
|
||||
- name: Set up Standard Dependencies
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose up -d minio relational_db cache index
|
||||
|
||||
- name: Wait for services
|
||||
run: |
|
||||
echo "Waiting for services to be ready..."
|
||||
sleep 30
|
||||
|
||||
# Wait for Vespa specifically
|
||||
echo "Waiting for Vespa to be ready..."
|
||||
timeout 300 bash -c 'until curl -f -s http://localhost:8081/ApplicationStatus > /dev/null 2>&1; do echo "Vespa not ready, waiting..."; sleep 10; done' || echo "Vespa timeout - continuing anyway"
|
||||
|
||||
echo "Services should be ready now"
|
||||
|
||||
- name: Run migrations
|
||||
run: |
|
||||
cd backend
|
||||
# Run migrations to head
|
||||
alembic upgrade head
|
||||
alembic heads --verbose
|
||||
|
||||
- name: Run Tests for ${{ matrix.test-dir }}
|
||||
shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
|
||||
run: |
|
||||
py.test \
|
||||
--durations=8 \
|
||||
-o junit_family=xunit2 \
|
||||
-xv \
|
||||
--ff \
|
||||
backend/tests/external_dependency_unit/${{ matrix.test-dir }}
|
||||
154
.github/workflows/pr-helm-chart-testing.yml
vendored
154
.github/workflows/pr-helm-chart-testing.yml
vendored
@@ -37,11 +37,6 @@ jobs:
|
||||
echo "changed=true" >> "$GITHUB_OUTPUT"
|
||||
fi
|
||||
|
||||
# uncomment to force run chart-testing
|
||||
# - name: Force run chart-testing (list-changed)
|
||||
# id: list-changed
|
||||
# run: echo "changed=true" >> $GITHUB_OUTPUT
|
||||
|
||||
# lint all charts if any changes were detected
|
||||
- name: Run chart-testing (lint)
|
||||
if: steps.list-changed.outputs.changed == 'true'
|
||||
@@ -53,154 +48,9 @@ jobs:
|
||||
if: steps.list-changed.outputs.changed == 'true'
|
||||
uses: helm/kind-action@v1.12.0
|
||||
|
||||
- name: Pre-install cluster status check
|
||||
- name: Run chart-testing (install)
|
||||
if: steps.list-changed.outputs.changed == 'true'
|
||||
run: |
|
||||
echo "=== Pre-install Cluster Status ==="
|
||||
kubectl get nodes -o wide
|
||||
kubectl get pods --all-namespaces
|
||||
kubectl get storageclass
|
||||
|
||||
- name: Add Helm repositories and update
|
||||
if: steps.list-changed.outputs.changed == 'true'
|
||||
run: |
|
||||
echo "=== Adding Helm repositories ==="
|
||||
helm repo add bitnami https://charts.bitnami.com/bitnami
|
||||
helm repo add vespa https://onyx-dot-app.github.io/vespa-helm-charts
|
||||
helm repo update
|
||||
|
||||
- name: Pre-pull critical images
|
||||
if: steps.list-changed.outputs.changed == 'true'
|
||||
run: |
|
||||
echo "=== Pre-pulling critical images to avoid timeout ==="
|
||||
# Get kind cluster name
|
||||
KIND_CLUSTER=$(kubectl config current-context | sed 's/kind-//')
|
||||
echo "Kind cluster: $KIND_CLUSTER"
|
||||
|
||||
# Pre-pull images that are likely to be used
|
||||
echo "Pre-pulling PostgreSQL image..."
|
||||
docker pull postgres:15-alpine || echo "Failed to pull postgres:15-alpine"
|
||||
kind load docker-image postgres:15-alpine --name $KIND_CLUSTER || echo "Failed to load postgres image"
|
||||
|
||||
echo "Pre-pulling Redis image..."
|
||||
docker pull redis:7-alpine || echo "Failed to pull redis:7-alpine"
|
||||
kind load docker-image redis:7-alpine --name $KIND_CLUSTER || echo "Failed to load redis image"
|
||||
|
||||
echo "Pre-pulling Onyx images..."
|
||||
docker pull docker.io/onyxdotapp/onyx-web-server:latest || echo "Failed to pull onyx web server"
|
||||
docker pull docker.io/onyxdotapp/onyx-backend:latest || echo "Failed to pull onyx backend"
|
||||
kind load docker-image docker.io/onyxdotapp/onyx-web-server:latest --name $KIND_CLUSTER || echo "Failed to load onyx web server"
|
||||
kind load docker-image docker.io/onyxdotapp/onyx-backend:latest --name $KIND_CLUSTER || echo "Failed to load onyx backend"
|
||||
|
||||
echo "=== Images loaded into Kind cluster ==="
|
||||
docker exec $KIND_CLUSTER-control-plane crictl images | grep -E "(postgres|redis|onyx)" || echo "Some images may still be loading..."
|
||||
|
||||
- name: Validate chart dependencies
|
||||
if: steps.list-changed.outputs.changed == 'true'
|
||||
run: |
|
||||
echo "=== Validating chart dependencies ==="
|
||||
cd deployment/helm/charts/onyx
|
||||
helm dependency update
|
||||
helm lint .
|
||||
|
||||
- name: Run chart-testing (install) with enhanced monitoring
|
||||
timeout-minutes: 25
|
||||
if: steps.list-changed.outputs.changed == 'true'
|
||||
run: |
|
||||
echo "=== Starting chart installation with monitoring ==="
|
||||
|
||||
# Function to monitor cluster state
|
||||
monitor_cluster() {
|
||||
while true; do
|
||||
echo "=== Cluster Status Check at $(date) ==="
|
||||
# Only show non-running pods to reduce noise
|
||||
NON_RUNNING_PODS=$(kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers 2>/dev/null | wc -l)
|
||||
if [ "$NON_RUNNING_PODS" -gt 0 ]; then
|
||||
echo "Non-running pods:"
|
||||
kubectl get pods --all-namespaces --field-selector=status.phase!=Running,status.phase!=Succeeded
|
||||
else
|
||||
echo "All pods running successfully"
|
||||
fi
|
||||
# Only show recent events if there are issues
|
||||
RECENT_EVENTS=$(kubectl get events --sort-by=.lastTimestamp --all-namespaces --field-selector=type!=Normal 2>/dev/null | tail -5)
|
||||
if [ -n "$RECENT_EVENTS" ]; then
|
||||
echo "Recent warnings/errors:"
|
||||
echo "$RECENT_EVENTS"
|
||||
fi
|
||||
sleep 60
|
||||
done
|
||||
}
|
||||
|
||||
# Start monitoring in background
|
||||
monitor_cluster &
|
||||
MONITOR_PID=$!
|
||||
|
||||
# Set up cleanup
|
||||
cleanup() {
|
||||
echo "=== Cleaning up monitoring process ==="
|
||||
kill $MONITOR_PID 2>/dev/null || true
|
||||
echo "=== Final cluster state ==="
|
||||
kubectl get pods --all-namespaces
|
||||
kubectl get events --all-namespaces --sort-by=.lastTimestamp | tail -20
|
||||
}
|
||||
|
||||
# Trap cleanup on exit
|
||||
trap cleanup EXIT
|
||||
|
||||
# Run the actual installation with detailed logging
|
||||
echo "=== Starting ct install ==="
|
||||
ct install --all \
|
||||
--helm-extra-set-args="\
|
||||
--set=nginx.enabled=false \
|
||||
--set=minio.enabled=false \
|
||||
--set=vespa.enabled=false \
|
||||
--set=slackbot.enabled=false \
|
||||
--set=postgresql.enabled=true \
|
||||
--set=postgresql.primary.persistence.enabled=false \
|
||||
--set=redis.enabled=true \
|
||||
--set=webserver.replicaCount=1 \
|
||||
--set=api.replicaCount=0 \
|
||||
--set=inferenceCapability.replicaCount=0 \
|
||||
--set=indexCapability.replicaCount=0 \
|
||||
--set=celery_beat.replicaCount=0 \
|
||||
--set=celery_worker_heavy.replicaCount=0 \
|
||||
--set=celery_worker_docfetching.replicaCount=0 \
|
||||
--set=celery_worker_docprocessing.replicaCount=0 \
|
||||
--set=celery_worker_light.replicaCount=0 \
|
||||
--set=celery_worker_monitoring.replicaCount=0 \
|
||||
--set=celery_worker_primary.replicaCount=0 \
|
||||
--set=celery_worker_user_files_indexing.replicaCount=0" \
|
||||
--helm-extra-args="--timeout 900s --debug" \
|
||||
--debug --config ct.yaml
|
||||
|
||||
echo "=== Installation completed successfully ==="
|
||||
kubectl get pods --all-namespaces
|
||||
|
||||
- name: Post-install verification
|
||||
if: steps.list-changed.outputs.changed == 'true'
|
||||
run: |
|
||||
echo "=== Post-install verification ==="
|
||||
kubectl get pods --all-namespaces
|
||||
kubectl get services --all-namespaces
|
||||
# Only show issues if they exist
|
||||
kubectl describe pods --all-namespaces | grep -A 5 -B 2 "Failed\|Error\|Warning" || echo "No pod issues found"
|
||||
|
||||
- name: Cleanup on failure
|
||||
if: failure() && steps.list-changed.outputs.changed == 'true'
|
||||
run: |
|
||||
echo "=== Cleanup on failure ==="
|
||||
echo "=== Final cluster state ==="
|
||||
kubectl get pods --all-namespaces
|
||||
kubectl get events --all-namespaces --sort-by=.lastTimestamp | tail -10
|
||||
|
||||
echo "=== Pod descriptions for debugging ==="
|
||||
kubectl describe pods --all-namespaces | grep -A 10 -B 3 "Failed\|Error\|Warning\|Pending" || echo "No problematic pods found"
|
||||
|
||||
echo "=== Recent logs for debugging ==="
|
||||
kubectl logs --all-namespaces --tail=50 | grep -i "error\|timeout\|failed\|pull" || echo "No error logs found"
|
||||
|
||||
echo "=== Helm releases ==="
|
||||
helm list --all-namespaces
|
||||
run: ct install --all --helm-extra-set-args="--set=nginx.enabled=false" --debug --config ct.yaml
|
||||
# the following would install only changed charts, but we only have one chart so
|
||||
# don't worry about that for now
|
||||
# run: ct install --target-branch ${{ github.event.repository.default_branch }}
|
||||
|
||||
578
.github/workflows/pr-integration-tests.yml
vendored
578
.github/workflows/pr-integration-tests.yml
vendored
@@ -11,245 +11,142 @@ on:
|
||||
- "release/**"
|
||||
|
||||
env:
|
||||
# Private Registry Configuration
|
||||
PRIVATE_REGISTRY: experimental-registry.blacksmith.sh:5000
|
||||
PRIVATE_REGISTRY_USERNAME: ${{ secrets.PRIVATE_REGISTRY_USERNAME }}
|
||||
PRIVATE_REGISTRY_PASSWORD: ${{ secrets.PRIVATE_REGISTRY_PASSWORD }}
|
||||
|
||||
# Test Environment Variables
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
|
||||
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
|
||||
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
|
||||
JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
|
||||
JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
|
||||
JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
|
||||
PERM_SYNC_SHAREPOINT_CLIENT_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_CLIENT_ID }}
|
||||
PERM_SYNC_SHAREPOINT_PRIVATE_KEY: ${{ secrets.PERM_SYNC_SHAREPOINT_PRIVATE_KEY }}
|
||||
PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD: ${{ secrets.PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD }}
|
||||
PERM_SYNC_SHAREPOINT_DIRECTORY_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_DIRECTORY_ID }}
|
||||
|
||||
jobs:
|
||||
discover-test-dirs:
|
||||
runs-on: blacksmith-2vcpu-ubuntu-2404-arm
|
||||
outputs:
|
||||
test-dirs: ${{ steps.set-matrix.outputs.test-dirs }}
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Discover test directories
|
||||
id: set-matrix
|
||||
run: |
|
||||
# Find all leaf-level directories in both test directories
|
||||
tests_dirs=$(find backend/tests/integration/tests -mindepth 1 -maxdepth 1 -type d ! -name "__pycache__" -exec basename {} \; | sort)
|
||||
connector_dirs=$(find backend/tests/integration/connector_job_tests -mindepth 1 -maxdepth 1 -type d ! -name "__pycache__" -exec basename {} \; | sort)
|
||||
|
||||
# Create JSON array with directory info
|
||||
all_dirs=""
|
||||
for dir in $tests_dirs; do
|
||||
all_dirs="$all_dirs{\"path\":\"tests/$dir\",\"name\":\"tests-$dir\"},"
|
||||
done
|
||||
for dir in $connector_dirs; do
|
||||
all_dirs="$all_dirs{\"path\":\"connector_job_tests/$dir\",\"name\":\"connector-$dir\"},"
|
||||
done
|
||||
|
||||
# Remove trailing comma and wrap in array
|
||||
all_dirs="[${all_dirs%,}]"
|
||||
echo "test-dirs=$all_dirs" >> $GITHUB_OUTPUT
|
||||
|
||||
prepare-build:
|
||||
runs-on: blacksmith-2vcpu-ubuntu-2404-arm
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
cache: "pip"
|
||||
cache-dependency-path: |
|
||||
backend/requirements/default.txt
|
||||
backend/requirements/dev.txt
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
|
||||
|
||||
- name: Generate OpenAPI schema
|
||||
working-directory: ./backend
|
||||
env:
|
||||
PYTHONPATH: "."
|
||||
run: |
|
||||
python scripts/onyx_openapi_schema.py --filename generated/openapi.json
|
||||
|
||||
- name: Generate OpenAPI Python client
|
||||
working-directory: ./backend
|
||||
run: |
|
||||
docker run --rm \
|
||||
-v "${{ github.workspace }}/backend/generated:/local" \
|
||||
openapitools/openapi-generator-cli generate \
|
||||
-i /local/openapi.json \
|
||||
-g python \
|
||||
-o /local/onyx_openapi_client \
|
||||
--package-name onyx_openapi_client \
|
||||
--skip-validate-spec \
|
||||
--openapi-normalizer "SIMPLIFY_ONEOF_ANYOF=true,SET_OAS3_NULLABLE=true"
|
||||
|
||||
- name: Upload OpenAPI artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: openapi-artifacts
|
||||
path: backend/generated/
|
||||
|
||||
build-backend-image:
|
||||
runs-on: blacksmith-16vcpu-ubuntu-2404-arm
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Login to Private Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.PRIVATE_REGISTRY }}
|
||||
username: ${{ env.PRIVATE_REGISTRY_USERNAME }}
|
||||
password: ${{ env.PRIVATE_REGISTRY_PASSWORD }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: useblacksmith/setup-docker-builder@v1
|
||||
|
||||
- name: Build and push Backend Docker image
|
||||
uses: useblacksmith/build-push-action@v2
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
platforms: linux/arm64
|
||||
tags: ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-backend:test-${{ github.run_id }}
|
||||
push: true
|
||||
outputs: type=registry
|
||||
no-cache: true
|
||||
|
||||
build-model-server-image:
|
||||
runs-on: blacksmith-16vcpu-ubuntu-2404-arm
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Login to Private Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.PRIVATE_REGISTRY }}
|
||||
username: ${{ env.PRIVATE_REGISTRY_USERNAME }}
|
||||
password: ${{ env.PRIVATE_REGISTRY_PASSWORD }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: useblacksmith/setup-docker-builder@v1
|
||||
|
||||
- name: Build and push Model Server Docker image
|
||||
uses: useblacksmith/build-push-action@v2
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile.model_server
|
||||
platforms: linux/arm64
|
||||
tags: ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-model-server:test-${{ github.run_id }}
|
||||
push: true
|
||||
outputs: type=registry
|
||||
provenance: false
|
||||
no-cache: true
|
||||
|
||||
build-integration-image:
|
||||
needs: prepare-build
|
||||
runs-on: blacksmith-16vcpu-ubuntu-2404-arm
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Login to Private Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.PRIVATE_REGISTRY }}
|
||||
username: ${{ env.PRIVATE_REGISTRY_USERNAME }}
|
||||
password: ${{ env.PRIVATE_REGISTRY_PASSWORD }}
|
||||
|
||||
- name: Download OpenAPI artifacts
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: openapi-artifacts
|
||||
path: backend/generated/
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: useblacksmith/setup-docker-builder@v1
|
||||
|
||||
- name: Build and push integration test Docker image
|
||||
uses: useblacksmith/build-push-action@v2
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/tests/integration/Dockerfile
|
||||
platforms: linux/arm64
|
||||
tags: ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-integration:test-${{ github.run_id }}
|
||||
push: true
|
||||
outputs: type=registry
|
||||
no-cache: true
|
||||
|
||||
integration-tests:
|
||||
needs:
|
||||
[
|
||||
discover-test-dirs,
|
||||
build-backend-image,
|
||||
build-model-server-image,
|
||||
build-integration-image,
|
||||
]
|
||||
runs-on: blacksmith-8vcpu-ubuntu-2404-arm
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
test-dir: ${{ fromJson(needs.discover-test-dirs.outputs.test-dirs) }}
|
||||
|
||||
# See https://runs-on.com/runners/linux/
|
||||
runs-on: [runs-on, runner=32cpu-linux-x64, "run-id=${{ github.run_id }}"]
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Login to Private Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.PRIVATE_REGISTRY }}
|
||||
username: ${{ env.PRIVATE_REGISTRY_USERNAME }}
|
||||
password: ${{ env.PRIVATE_REGISTRY_PASSWORD }}
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling Vespa, Redis, Postgres, and Minio images
|
||||
# otherwise, we hit the "Unauthenticated users" limit
|
||||
# https://docs.docker.com/docker-hub/usage/
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Pull Docker images
|
||||
# tag every docker image with "test" so that we can spin up the correct set
|
||||
# of images during testing
|
||||
|
||||
# We don't need to build the Web Docker image since it's not yet used
|
||||
# in the integration tests. We have a separate action to verify that it builds
|
||||
# successfully.
|
||||
- name: Pull Web Docker image
|
||||
run: |
|
||||
# Pull all images from registry in parallel
|
||||
echo "Pulling Docker images in parallel..."
|
||||
# Pull images from private registry
|
||||
(docker pull --platform linux/arm64 ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-backend:test-${{ github.run_id }}) &
|
||||
(docker pull --platform linux/arm64 ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-model-server:test-${{ github.run_id }}) &
|
||||
(docker pull --platform linux/arm64 ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-integration:test-${{ github.run_id }}) &
|
||||
docker pull onyxdotapp/onyx-web-server:latest
|
||||
docker tag onyxdotapp/onyx-web-server:latest onyxdotapp/onyx-web-server:test
|
||||
|
||||
# Wait for all background jobs to complete
|
||||
wait
|
||||
echo "All Docker images pulled successfully"
|
||||
# we use the runs-on cache for docker builds
|
||||
# in conjunction with runs-on runners, it has better speed and unlimited caching
|
||||
# https://runs-on.com/caching/s3-cache-for-github-actions/
|
||||
# https://runs-on.com/caching/docker/
|
||||
# https://github.com/moby/buildkit#s3-cache-experimental
|
||||
|
||||
# Re-tag to remove registry prefix for docker-compose
|
||||
docker tag ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-backend:test-${{ github.run_id }} onyxdotapp/onyx-backend:test
|
||||
docker tag ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-model-server:test-${{ github.run_id }} onyxdotapp/onyx-model-server:test
|
||||
docker tag ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-integration:test-${{ github.run_id }} onyxdotapp/onyx-integration:test
|
||||
# images are built and run locally for testing purposes. Not pushed.
|
||||
- name: Build Backend Docker image
|
||||
uses: ./.github/actions/custom-build-and-push
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
platforms: linux/amd64
|
||||
tags: onyxdotapp/onyx-backend:test
|
||||
push: false
|
||||
load: true
|
||||
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
|
||||
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
|
||||
|
||||
- name: Build Model Server Docker image
|
||||
uses: ./.github/actions/custom-build-and-push
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile.model_server
|
||||
platforms: linux/amd64
|
||||
tags: onyxdotapp/onyx-model-server:test
|
||||
push: false
|
||||
load: true
|
||||
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
|
||||
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
|
||||
|
||||
- name: Build integration test Docker image
|
||||
uses: ./.github/actions/custom-build-and-push
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/tests/integration/Dockerfile
|
||||
platforms: linux/amd64
|
||||
tags: onyxdotapp/onyx-integration:test
|
||||
push: false
|
||||
load: true
|
||||
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
|
||||
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
|
||||
|
||||
# Start containers for multi-tenant tests
|
||||
- name: Start Docker containers for multi-tenant tests
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \
|
||||
MULTI_TENANT=true \
|
||||
AUTH_TYPE=cloud \
|
||||
REQUIRE_EMAIL_VERIFICATION=false \
|
||||
DISABLE_TELEMETRY=true \
|
||||
IMAGE_TAG=test \
|
||||
DEV_MODE=true \
|
||||
docker compose -f docker-compose.multitenant-dev.yml -p onyx-stack up -d
|
||||
id: start_docker_multi_tenant
|
||||
|
||||
# In practice, `cloud` Auth type would require OAUTH credentials to be set.
|
||||
- name: Run Multi-Tenant Integration Tests
|
||||
run: |
|
||||
echo "Waiting for 3 minutes to ensure API server is ready..."
|
||||
sleep 180
|
||||
echo "Running integration tests..."
|
||||
docker run --rm --network onyx-stack_default \
|
||||
--name test-runner \
|
||||
-e POSTGRES_HOST=relational_db \
|
||||
-e POSTGRES_USER=postgres \
|
||||
-e POSTGRES_PASSWORD=password \
|
||||
-e POSTGRES_DB=postgres \
|
||||
-e POSTGRES_USE_NULL_POOL=true \
|
||||
-e VESPA_HOST=index \
|
||||
-e REDIS_HOST=cache \
|
||||
-e API_SERVER_HOST=api_server \
|
||||
-e OPENAI_API_KEY=${OPENAI_API_KEY} \
|
||||
-e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN} \
|
||||
-e TEST_WEB_HOSTNAME=test-runner \
|
||||
-e AUTH_TYPE=cloud \
|
||||
-e MULTI_TENANT=true \
|
||||
-e REQUIRE_EMAIL_VERIFICATION=false \
|
||||
-e DISABLE_TELEMETRY=true \
|
||||
-e IMAGE_TAG=test \
|
||||
-e DEV_MODE=true \
|
||||
onyxdotapp/onyx-integration:test \
|
||||
/app/tests/integration/multitenant_tests
|
||||
continue-on-error: true
|
||||
id: run_multitenant_tests
|
||||
|
||||
- name: Check multi-tenant test results
|
||||
run: |
|
||||
if [ ${{ steps.run_multitenant_tests.outcome }} == 'failure' ]; then
|
||||
echo "Multi-tenant integration tests failed. Exiting with error."
|
||||
exit 1
|
||||
else
|
||||
echo "All multi-tenant integration tests passed successfully."
|
||||
fi
|
||||
|
||||
- name: Stop multi-tenant Docker containers
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose -f docker-compose.multitenant-dev.yml -p onyx-stack down -v
|
||||
|
||||
# NOTE: Use pre-ping/null pool to reduce flakiness due to dropped connections
|
||||
# NOTE: don't need web server for integration tests
|
||||
- name: Start Docker containers
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
@@ -261,24 +158,14 @@ jobs:
|
||||
DISABLE_TELEMETRY=true \
|
||||
IMAGE_TAG=test \
|
||||
INTEGRATION_TESTS_MODE=true \
|
||||
CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS=0.001 \
|
||||
docker compose up \
|
||||
relational_db \
|
||||
index \
|
||||
cache \
|
||||
minio \
|
||||
api_server \
|
||||
inference_model_server \
|
||||
indexing_model_server \
|
||||
background \
|
||||
-d
|
||||
docker compose -f docker-compose.dev.yml -p onyx-stack up -d
|
||||
id: start_docker
|
||||
|
||||
- name: Wait for service to be ready
|
||||
run: |
|
||||
echo "Starting wait-for-service script..."
|
||||
|
||||
docker logs -f onyx-api_server-1 &
|
||||
docker logs -f onyx-stack-api_server-1 &
|
||||
|
||||
start_time=$(date +%s)
|
||||
timeout=300 # 5 minutes in seconds
|
||||
@@ -314,44 +201,43 @@ jobs:
|
||||
docker compose -f docker-compose.mock-it-services.yml \
|
||||
-p mock-it-services-stack up -d
|
||||
|
||||
- name: Run Integration Tests for ${{ matrix.test-dir.name }}
|
||||
uses: nick-fields/retry@v3
|
||||
with:
|
||||
timeout_minutes: 20
|
||||
max_attempts: 3
|
||||
retry_wait_seconds: 10
|
||||
command: |
|
||||
echo "Running integration tests for ${{ matrix.test-dir.path }}..."
|
||||
docker run --rm --network onyx_default \
|
||||
--name test-runner \
|
||||
-e POSTGRES_HOST=relational_db \
|
||||
-e POSTGRES_USER=postgres \
|
||||
-e POSTGRES_PASSWORD=password \
|
||||
-e POSTGRES_DB=postgres \
|
||||
-e DB_READONLY_USER=db_readonly_user \
|
||||
-e DB_READONLY_PASSWORD=password \
|
||||
-e POSTGRES_POOL_PRE_PING=true \
|
||||
-e POSTGRES_USE_NULL_POOL=true \
|
||||
-e VESPA_HOST=index \
|
||||
-e REDIS_HOST=cache \
|
||||
-e API_SERVER_HOST=api_server \
|
||||
-e OPENAI_API_KEY=${OPENAI_API_KEY} \
|
||||
-e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN} \
|
||||
-e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
|
||||
-e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
|
||||
-e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
|
||||
-e JIRA_BASE_URL=${JIRA_BASE_URL} \
|
||||
-e JIRA_USER_EMAIL=${JIRA_USER_EMAIL} \
|
||||
-e JIRA_API_TOKEN=${JIRA_API_TOKEN} \
|
||||
-e PERM_SYNC_SHAREPOINT_CLIENT_ID=${PERM_SYNC_SHAREPOINT_CLIENT_ID} \
|
||||
-e PERM_SYNC_SHAREPOINT_PRIVATE_KEY="${PERM_SYNC_SHAREPOINT_PRIVATE_KEY}" \
|
||||
-e PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD=${PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD} \
|
||||
-e PERM_SYNC_SHAREPOINT_DIRECTORY_ID=${PERM_SYNC_SHAREPOINT_DIRECTORY_ID} \
|
||||
-e TEST_WEB_HOSTNAME=test-runner \
|
||||
-e MOCK_CONNECTOR_SERVER_HOST=mock_connector_server \
|
||||
-e MOCK_CONNECTOR_SERVER_PORT=8001 \
|
||||
onyxdotapp/onyx-integration:test \
|
||||
/app/tests/integration/${{ matrix.test-dir.path }}
|
||||
# NOTE: Use pre-ping/null to reduce flakiness due to dropped connections
|
||||
- name: Run Standard Integration Tests
|
||||
run: |
|
||||
echo "Running integration tests..."
|
||||
docker run --rm --network onyx-stack_default \
|
||||
--name test-runner \
|
||||
-e POSTGRES_HOST=relational_db \
|
||||
-e POSTGRES_USER=postgres \
|
||||
-e POSTGRES_PASSWORD=password \
|
||||
-e POSTGRES_DB=postgres \
|
||||
-e POSTGRES_POOL_PRE_PING=true \
|
||||
-e POSTGRES_USE_NULL_POOL=true \
|
||||
-e VESPA_HOST=index \
|
||||
-e REDIS_HOST=cache \
|
||||
-e API_SERVER_HOST=api_server \
|
||||
-e OPENAI_API_KEY=${OPENAI_API_KEY} \
|
||||
-e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN} \
|
||||
-e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
|
||||
-e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
|
||||
-e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
|
||||
-e TEST_WEB_HOSTNAME=test-runner \
|
||||
-e MOCK_CONNECTOR_SERVER_HOST=mock_connector_server \
|
||||
-e MOCK_CONNECTOR_SERVER_PORT=8001 \
|
||||
onyxdotapp/onyx-integration:test \
|
||||
/app/tests/integration/tests \
|
||||
/app/tests/integration/connector_job_tests
|
||||
continue-on-error: true
|
||||
id: run_tests
|
||||
|
||||
- name: Check test results
|
||||
run: |
|
||||
if [ ${{ steps.run_tests.outcome }} == 'failure' ]; then
|
||||
echo "Integration tests failed. Exiting with error."
|
||||
exit 1
|
||||
else
|
||||
echo "All integration tests passed successfully."
|
||||
fi
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Always gather logs BEFORE "down":
|
||||
@@ -359,19 +245,19 @@ jobs:
|
||||
if: always()
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose logs --no-color api_server > $GITHUB_WORKSPACE/api_server.log || true
|
||||
docker compose -f docker-compose.dev.yml -p onyx-stack logs --no-color api_server > $GITHUB_WORKSPACE/api_server.log || true
|
||||
|
||||
- name: Dump all-container logs (optional)
|
||||
if: always()
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose logs --no-color > $GITHUB_WORKSPACE/docker-compose.log || true
|
||||
docker compose -f docker-compose.dev.yml -p onyx-stack logs --no-color > $GITHUB_WORKSPACE/docker-compose.log || true
|
||||
|
||||
- name: Upload logs
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: docker-all-logs-${{ matrix.test-dir.name }}
|
||||
name: docker-all-logs
|
||||
path: ${{ github.workspace }}/docker-compose.log
|
||||
# ------------------------------------------------------------
|
||||
|
||||
@@ -379,158 +265,4 @@ jobs:
|
||||
if: always()
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose down -v
|
||||
|
||||
|
||||
multitenant-tests:
|
||||
needs:
|
||||
[
|
||||
build-backend-image,
|
||||
build-model-server-image,
|
||||
build-integration-image,
|
||||
]
|
||||
runs-on: blacksmith-8vcpu-ubuntu-2404-arm
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Login to Private Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.PRIVATE_REGISTRY }}
|
||||
username: ${{ env.PRIVATE_REGISTRY_USERNAME }}
|
||||
password: ${{ env.PRIVATE_REGISTRY_PASSWORD }}
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Pull Docker images
|
||||
run: |
|
||||
(docker pull --platform linux/arm64 ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-backend:test-${{ github.run_id }}) &
|
||||
(docker pull --platform linux/arm64 ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-model-server:test-${{ github.run_id }}) &
|
||||
(docker pull --platform linux/arm64 ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-integration:test-${{ github.run_id }}) &
|
||||
wait
|
||||
docker tag ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-backend:test-${{ github.run_id }} onyxdotapp/onyx-backend:test
|
||||
docker tag ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-model-server:test-${{ github.run_id }} onyxdotapp/onyx-model-server:test
|
||||
docker tag ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-integration:test-${{ github.run_id }} onyxdotapp/onyx-integration:test
|
||||
|
||||
- name: Start Docker containers for multi-tenant tests
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \
|
||||
MULTI_TENANT=true \
|
||||
AUTH_TYPE=cloud \
|
||||
REQUIRE_EMAIL_VERIFICATION=false \
|
||||
DISABLE_TELEMETRY=true \
|
||||
IMAGE_TAG=test \
|
||||
DEV_MODE=true \
|
||||
docker compose -f docker-compose.multitenant-dev.yml up \
|
||||
relational_db \
|
||||
index \
|
||||
cache \
|
||||
minio \
|
||||
api_server \
|
||||
inference_model_server \
|
||||
indexing_model_server \
|
||||
background \
|
||||
-d
|
||||
id: start_docker_multi_tenant
|
||||
|
||||
- name: Wait for service to be ready (multi-tenant)
|
||||
run: |
|
||||
echo "Starting wait-for-service script for multi-tenant..."
|
||||
docker logs -f onyx-api_server-1 &
|
||||
start_time=$(date +%s)
|
||||
timeout=300
|
||||
while true; do
|
||||
current_time=$(date +%s)
|
||||
elapsed_time=$((current_time - start_time))
|
||||
if [ $elapsed_time -ge $timeout ]; then
|
||||
echo "Timeout reached. Service did not become ready in 5 minutes."
|
||||
exit 1
|
||||
fi
|
||||
response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/health || echo "curl_error")
|
||||
if [ "$response" = "200" ]; then
|
||||
echo "Service is ready!"
|
||||
break
|
||||
elif [ "$response" = "curl_error" ]; then
|
||||
echo "Curl encountered an error; retrying..."
|
||||
else
|
||||
echo "Service not ready yet (HTTP $response). Retrying in 5 seconds..."
|
||||
fi
|
||||
sleep 5
|
||||
done
|
||||
echo "Finished waiting for service."
|
||||
|
||||
- name: Run Multi-Tenant Integration Tests
|
||||
run: |
|
||||
echo "Running multi-tenant integration tests..."
|
||||
docker run --rm --network onyx_default \
|
||||
--name test-runner \
|
||||
-e POSTGRES_HOST=relational_db \
|
||||
-e POSTGRES_USER=postgres \
|
||||
-e POSTGRES_PASSWORD=password \
|
||||
-e DB_READONLY_USER=db_readonly_user \
|
||||
-e DB_READONLY_PASSWORD=password \
|
||||
-e POSTGRES_DB=postgres \
|
||||
-e POSTGRES_USE_NULL_POOL=true \
|
||||
-e VESPA_HOST=index \
|
||||
-e REDIS_HOST=cache \
|
||||
-e API_SERVER_HOST=api_server \
|
||||
-e OPENAI_API_KEY=${OPENAI_API_KEY} \
|
||||
-e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN} \
|
||||
-e TEST_WEB_HOSTNAME=test-runner \
|
||||
-e AUTH_TYPE=cloud \
|
||||
-e MULTI_TENANT=true \
|
||||
-e SKIP_RESET=true \
|
||||
-e REQUIRE_EMAIL_VERIFICATION=false \
|
||||
-e DISABLE_TELEMETRY=true \
|
||||
-e IMAGE_TAG=test \
|
||||
-e DEV_MODE=true \
|
||||
onyxdotapp/onyx-integration:test \
|
||||
/app/tests/integration/multitenant_tests
|
||||
|
||||
- name: Dump API server logs (multi-tenant)
|
||||
if: always()
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose -f docker-compose.multitenant-dev.yml logs --no-color api_server > $GITHUB_WORKSPACE/api_server_multitenant.log || true
|
||||
|
||||
- name: Dump all-container logs (multi-tenant)
|
||||
if: always()
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose -f docker-compose.multitenant-dev.yml logs --no-color > $GITHUB_WORKSPACE/docker-compose-multitenant.log || true
|
||||
|
||||
- name: Upload logs (multi-tenant)
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: docker-all-logs-multitenant
|
||||
path: ${{ github.workspace }}/docker-compose-multitenant.log
|
||||
|
||||
- name: Stop multi-tenant Docker containers
|
||||
if: always()
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose -f docker-compose.multitenant-dev.yml down -v
|
||||
|
||||
required:
|
||||
runs-on: blacksmith-2vcpu-ubuntu-2404-arm
|
||||
needs: [integration-tests, multitenant-tests]
|
||||
if: ${{ always() }}
|
||||
steps:
|
||||
- uses: actions/github-script@v7
|
||||
with:
|
||||
script: |
|
||||
const needs = ${{ toJSON(needs) }};
|
||||
const failed = Object.values(needs).some(n => n.result !== 'success');
|
||||
if (failed) {
|
||||
core.setFailed('One or more upstream jobs failed or were cancelled.');
|
||||
} else {
|
||||
core.notice('All required jobs succeeded.');
|
||||
}
|
||||
docker compose -f docker-compose.dev.yml -p onyx-stack down -v
|
||||
|
||||
38
.github/workflows/pr-labeler.yml
vendored
38
.github/workflows/pr-labeler.yml
vendored
@@ -1,38 +0,0 @@
|
||||
name: PR Labeler
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
branches:
|
||||
- main
|
||||
types:
|
||||
- opened
|
||||
- reopened
|
||||
- synchronize
|
||||
- edited
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
validate_pr_title:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check PR title for Conventional Commits
|
||||
env:
|
||||
PR_TITLE: ${{ github.event.pull_request.title }}
|
||||
run: |
|
||||
echo "PR Title: $PR_TITLE"
|
||||
if [[ ! "$PR_TITLE" =~ ^(feat|fix|docs|test|ci|refactor|perf|chore|revert|build)(\(.+\))?:\ .+ ]]; then
|
||||
echo "::error::❌ Your PR title does not follow the Conventional Commits format.
|
||||
This check ensures that all pull requests use clear, consistent titles that help automate changelogs and improve project history.
|
||||
|
||||
Please update your PR title to follow the Conventional Commits style.
|
||||
Here is a link to a blog explaining the reason why we've included the Conventional Commits style into our PR titles: https://xfuture-blog.com/working-with-conventional-commits
|
||||
|
||||
**Here are some examples of valid PR titles:**
|
||||
- feat: add user authentication
|
||||
- fix(login): handle null password error
|
||||
- docs(readme): update installation instructions"
|
||||
exit 1
|
||||
fi
|
||||
387
.github/workflows/pr-mit-integration-tests.yml
vendored
387
.github/workflows/pr-mit-integration-tests.yml
vendored
@@ -5,249 +5,90 @@ concurrency:
|
||||
|
||||
on:
|
||||
merge_group:
|
||||
types: [checks_requested]
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
- "release/**"
|
||||
|
||||
env:
|
||||
# Private Registry Configuration
|
||||
PRIVATE_REGISTRY: experimental-registry.blacksmith.sh:5000
|
||||
PRIVATE_REGISTRY_USERNAME: ${{ secrets.PRIVATE_REGISTRY_USERNAME }}
|
||||
PRIVATE_REGISTRY_PASSWORD: ${{ secrets.PRIVATE_REGISTRY_PASSWORD }}
|
||||
|
||||
# Test Environment Variables
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
|
||||
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
|
||||
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
|
||||
JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
|
||||
JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
|
||||
JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
|
||||
PERM_SYNC_SHAREPOINT_CLIENT_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_CLIENT_ID }}
|
||||
PERM_SYNC_SHAREPOINT_PRIVATE_KEY: ${{ secrets.PERM_SYNC_SHAREPOINT_PRIVATE_KEY }}
|
||||
PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD: ${{ secrets.PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD }}
|
||||
PERM_SYNC_SHAREPOINT_DIRECTORY_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_DIRECTORY_ID }}
|
||||
|
||||
jobs:
|
||||
discover-test-dirs:
|
||||
runs-on: blacksmith-2vcpu-ubuntu-2404-arm
|
||||
outputs:
|
||||
test-dirs: ${{ steps.set-matrix.outputs.test-dirs }}
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Discover test directories
|
||||
id: set-matrix
|
||||
run: |
|
||||
# Find all leaf-level directories in both test directories
|
||||
tests_dirs=$(find backend/tests/integration/tests -mindepth 1 -maxdepth 1 -type d ! -name "__pycache__" -exec basename {} \; | sort)
|
||||
connector_dirs=$(find backend/tests/integration/connector_job_tests -mindepth 1 -maxdepth 1 -type d ! -name "__pycache__" -exec basename {} \; | sort)
|
||||
|
||||
# Create JSON array with directory info
|
||||
all_dirs=""
|
||||
for dir in $tests_dirs; do
|
||||
all_dirs="$all_dirs{\"path\":\"tests/$dir\",\"name\":\"tests-$dir\"},"
|
||||
done
|
||||
for dir in $connector_dirs; do
|
||||
all_dirs="$all_dirs{\"path\":\"connector_job_tests/$dir\",\"name\":\"connector-$dir\"},"
|
||||
done
|
||||
|
||||
# Remove trailing comma and wrap in array
|
||||
all_dirs="[${all_dirs%,}]"
|
||||
echo "test-dirs=$all_dirs" >> $GITHUB_OUTPUT
|
||||
|
||||
prepare-build:
|
||||
runs-on: blacksmith-2vcpu-ubuntu-2404-arm
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
cache: "pip"
|
||||
cache-dependency-path: |
|
||||
backend/requirements/default.txt
|
||||
backend/requirements/dev.txt
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
|
||||
|
||||
- name: Generate OpenAPI schema
|
||||
working-directory: ./backend
|
||||
env:
|
||||
PYTHONPATH: "."
|
||||
run: |
|
||||
python scripts/onyx_openapi_schema.py --filename generated/openapi.json
|
||||
|
||||
- name: Generate OpenAPI Python client
|
||||
working-directory: ./backend
|
||||
run: |
|
||||
docker run --rm \
|
||||
-v "${{ github.workspace }}/backend/generated:/local" \
|
||||
openapitools/openapi-generator-cli generate \
|
||||
-i /local/openapi.json \
|
||||
-g python \
|
||||
-o /local/onyx_openapi_client \
|
||||
--package-name onyx_openapi_client \
|
||||
--skip-validate-spec \
|
||||
--openapi-normalizer "SIMPLIFY_ONEOF_ANYOF=true,SET_OAS3_NULLABLE=true"
|
||||
|
||||
- name: Upload OpenAPI artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: openapi-artifacts
|
||||
path: backend/generated/
|
||||
|
||||
build-backend-image:
|
||||
runs-on: blacksmith-16vcpu-ubuntu-2404-arm
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Login to Private Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.PRIVATE_REGISTRY }}
|
||||
username: ${{ env.PRIVATE_REGISTRY_USERNAME }}
|
||||
password: ${{ env.PRIVATE_REGISTRY_PASSWORD }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: useblacksmith/setup-docker-builder@v1
|
||||
|
||||
- name: Build and push Backend Docker image
|
||||
uses: useblacksmith/build-push-action@v2
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
platforms: linux/arm64
|
||||
tags: ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-backend:test-${{ github.run_id }}
|
||||
push: true
|
||||
outputs: type=registry
|
||||
no-cache: true
|
||||
|
||||
build-model-server-image:
|
||||
runs-on: blacksmith-16vcpu-ubuntu-2404-arm
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Login to Private Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.PRIVATE_REGISTRY }}
|
||||
username: ${{ env.PRIVATE_REGISTRY_USERNAME }}
|
||||
password: ${{ env.PRIVATE_REGISTRY_PASSWORD }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: useblacksmith/setup-docker-builder@v1
|
||||
|
||||
- name: Build and push Model Server Docker image
|
||||
uses: useblacksmith/build-push-action@v2
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile.model_server
|
||||
platforms: linux/arm64
|
||||
tags: ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-model-server:test-${{ github.run_id }}
|
||||
push: true
|
||||
outputs: type=registry
|
||||
provenance: false
|
||||
no-cache: true
|
||||
|
||||
build-integration-image:
|
||||
needs: prepare-build
|
||||
runs-on: blacksmith-16vcpu-ubuntu-2404-arm
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Login to Private Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.PRIVATE_REGISTRY }}
|
||||
username: ${{ env.PRIVATE_REGISTRY_USERNAME }}
|
||||
password: ${{ env.PRIVATE_REGISTRY_PASSWORD }}
|
||||
|
||||
- name: Download OpenAPI artifacts
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: openapi-artifacts
|
||||
path: backend/generated/
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: useblacksmith/setup-docker-builder@v1
|
||||
|
||||
- name: Build and push integration test Docker image
|
||||
uses: useblacksmith/build-push-action@v2
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/tests/integration/Dockerfile
|
||||
platforms: linux/arm64
|
||||
tags: ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-integration:test-${{ github.run_id }}
|
||||
push: true
|
||||
outputs: type=registry
|
||||
no-cache: true
|
||||
|
||||
integration-tests-mit:
|
||||
needs:
|
||||
[
|
||||
discover-test-dirs,
|
||||
build-backend-image,
|
||||
build-model-server-image,
|
||||
build-integration-image,
|
||||
]
|
||||
# See https://docs.blacksmith.sh/blacksmith-runners/overview
|
||||
runs-on: blacksmith-8vcpu-ubuntu-2404-arm
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
test-dir: ${{ fromJson(needs.discover-test-dirs.outputs.test-dirs) }}
|
||||
|
||||
# See https://runs-on.com/runners/linux/
|
||||
runs-on: [runs-on, runner=32cpu-linux-x64, "run-id=${{ github.run_id }}"]
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Login to Private Registry
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ${{ env.PRIVATE_REGISTRY }}
|
||||
username: ${{ env.PRIVATE_REGISTRY_USERNAME }}
|
||||
password: ${{ env.PRIVATE_REGISTRY_PASSWORD }}
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling Vespa, Redis, Postgres, and Minio images
|
||||
# otherwise, we hit the "Unauthenticated users" limit
|
||||
# https://docs.docker.com/docker-hub/usage/
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Pull Docker images
|
||||
# tag every docker image with "test" so that we can spin up the correct set
|
||||
# of images during testing
|
||||
|
||||
# We don't need to build the Web Docker image since it's not yet used
|
||||
# in the integration tests. We have a separate action to verify that it builds
|
||||
# successfully.
|
||||
- name: Pull Web Docker image
|
||||
run: |
|
||||
# Pull all images from registry in parallel
|
||||
echo "Pulling Docker images in parallel..."
|
||||
# Pull images from private registry
|
||||
(docker pull --platform linux/arm64 ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-backend:test-${{ github.run_id }}) &
|
||||
(docker pull --platform linux/arm64 ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-model-server:test-${{ github.run_id }}) &
|
||||
(docker pull --platform linux/arm64 ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-integration:test-${{ github.run_id }}) &
|
||||
docker pull onyxdotapp/onyx-web-server:latest
|
||||
docker tag onyxdotapp/onyx-web-server:latest onyxdotapp/onyx-web-server:test
|
||||
|
||||
# Wait for all background jobs to complete
|
||||
wait
|
||||
echo "All Docker images pulled successfully"
|
||||
# we use the runs-on cache for docker builds
|
||||
# in conjunction with runs-on runners, it has better speed and unlimited caching
|
||||
# https://runs-on.com/caching/s3-cache-for-github-actions/
|
||||
# https://runs-on.com/caching/docker/
|
||||
# https://github.com/moby/buildkit#s3-cache-experimental
|
||||
|
||||
# Re-tag to remove registry prefix for docker-compose
|
||||
docker tag ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-backend:test-${{ github.run_id }} onyxdotapp/onyx-backend:test
|
||||
docker tag ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-model-server:test-${{ github.run_id }} onyxdotapp/onyx-model-server:test
|
||||
docker tag ${{ env.PRIVATE_REGISTRY }}/integration-test-onyx-integration:test-${{ github.run_id }} onyxdotapp/onyx-integration:test
|
||||
# images are built and run locally for testing purposes. Not pushed.
|
||||
- name: Build Backend Docker image
|
||||
uses: ./.github/actions/custom-build-and-push
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
platforms: linux/amd64
|
||||
tags: onyxdotapp/onyx-backend:test
|
||||
push: false
|
||||
load: true
|
||||
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
|
||||
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
|
||||
|
||||
- name: Build Model Server Docker image
|
||||
uses: ./.github/actions/custom-build-and-push
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile.model_server
|
||||
platforms: linux/amd64
|
||||
tags: onyxdotapp/onyx-model-server:test
|
||||
push: false
|
||||
load: true
|
||||
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
|
||||
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
|
||||
|
||||
- name: Build integration test Docker image
|
||||
uses: ./.github/actions/custom-build-and-push
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/tests/integration/Dockerfile
|
||||
platforms: linux/amd64
|
||||
tags: onyxdotapp/onyx-integration:test
|
||||
push: false
|
||||
load: true
|
||||
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
|
||||
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/integration/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
|
||||
|
||||
# NOTE: Use pre-ping/null pool to reduce flakiness due to dropped connections
|
||||
# NOTE: don't need web server for integration tests
|
||||
- name: Start Docker containers
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
@@ -258,23 +99,14 @@ jobs:
|
||||
DISABLE_TELEMETRY=true \
|
||||
IMAGE_TAG=test \
|
||||
INTEGRATION_TESTS_MODE=true \
|
||||
docker compose up \
|
||||
relational_db \
|
||||
index \
|
||||
cache \
|
||||
minio \
|
||||
api_server \
|
||||
inference_model_server \
|
||||
indexing_model_server \
|
||||
background \
|
||||
-d
|
||||
docker compose -f docker-compose.dev.yml -p onyx-stack up -d
|
||||
id: start_docker
|
||||
|
||||
- name: Wait for service to be ready
|
||||
run: |
|
||||
echo "Starting wait-for-service script..."
|
||||
|
||||
docker logs -f onyx-api_server-1 &
|
||||
docker logs -f onyx-stack-api_server-1 &
|
||||
|
||||
start_time=$(date +%s)
|
||||
timeout=300 # 5 minutes in seconds
|
||||
@@ -311,44 +143,42 @@ jobs:
|
||||
-p mock-it-services-stack up -d
|
||||
|
||||
# NOTE: Use pre-ping/null to reduce flakiness due to dropped connections
|
||||
- name: Run Integration Tests for ${{ matrix.test-dir.name }}
|
||||
uses: nick-fields/retry@v3
|
||||
with:
|
||||
timeout_minutes: 20
|
||||
max_attempts: 3
|
||||
retry_wait_seconds: 10
|
||||
command: |
|
||||
echo "Running integration tests for ${{ matrix.test-dir.path }}..."
|
||||
docker run --rm --network onyx_default \
|
||||
--name test-runner \
|
||||
-e POSTGRES_HOST=relational_db \
|
||||
-e POSTGRES_USER=postgres \
|
||||
-e POSTGRES_PASSWORD=password \
|
||||
-e POSTGRES_DB=postgres \
|
||||
-e DB_READONLY_USER=db_readonly_user \
|
||||
-e DB_READONLY_PASSWORD=password \
|
||||
-e POSTGRES_POOL_PRE_PING=true \
|
||||
-e POSTGRES_USE_NULL_POOL=true \
|
||||
-e VESPA_HOST=index \
|
||||
-e REDIS_HOST=cache \
|
||||
-e API_SERVER_HOST=api_server \
|
||||
-e OPENAI_API_KEY=${OPENAI_API_KEY} \
|
||||
-e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN} \
|
||||
-e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
|
||||
-e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
|
||||
-e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
|
||||
-e JIRA_BASE_URL=${JIRA_BASE_URL} \
|
||||
-e JIRA_USER_EMAIL=${JIRA_USER_EMAIL} \
|
||||
-e JIRA_API_TOKEN=${JIRA_API_TOKEN} \
|
||||
-e PERM_SYNC_SHAREPOINT_CLIENT_ID=${PERM_SYNC_SHAREPOINT_CLIENT_ID} \
|
||||
-e PERM_SYNC_SHAREPOINT_PRIVATE_KEY="${PERM_SYNC_SHAREPOINT_PRIVATE_KEY}" \
|
||||
-e PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD=${PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD} \
|
||||
-e PERM_SYNC_SHAREPOINT_DIRECTORY_ID=${PERM_SYNC_SHAREPOINT_DIRECTORY_ID} \
|
||||
-e TEST_WEB_HOSTNAME=test-runner \
|
||||
-e MOCK_CONNECTOR_SERVER_HOST=mock_connector_server \
|
||||
-e MOCK_CONNECTOR_SERVER_PORT=8001 \
|
||||
onyxdotapp/onyx-integration:test \
|
||||
/app/tests/integration/${{ matrix.test-dir.path }}
|
||||
- name: Run Standard Integration Tests
|
||||
run: |
|
||||
echo "Running integration tests..."
|
||||
docker run --rm --network onyx-stack_default \
|
||||
--name test-runner \
|
||||
-e POSTGRES_HOST=relational_db \
|
||||
-e POSTGRES_USER=postgres \
|
||||
-e POSTGRES_PASSWORD=password \
|
||||
-e POSTGRES_DB=postgres \
|
||||
-e POSTGRES_POOL_PRE_PING=true \
|
||||
-e POSTGRES_USE_NULL_POOL=true \
|
||||
-e VESPA_HOST=index \
|
||||
-e REDIS_HOST=cache \
|
||||
-e API_SERVER_HOST=api_server \
|
||||
-e OPENAI_API_KEY=${OPENAI_API_KEY} \
|
||||
-e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN} \
|
||||
-e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
|
||||
-e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
|
||||
-e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
|
||||
-e TEST_WEB_HOSTNAME=test-runner \
|
||||
-e MOCK_CONNECTOR_SERVER_HOST=mock_connector_server \
|
||||
-e MOCK_CONNECTOR_SERVER_PORT=8001 \
|
||||
onyxdotapp/onyx-integration:test \
|
||||
/app/tests/integration/tests \
|
||||
/app/tests/integration/connector_job_tests
|
||||
continue-on-error: true
|
||||
id: run_tests
|
||||
|
||||
- name: Check test results
|
||||
run: |
|
||||
if [ ${{ steps.run_tests.outcome }} == 'failure' ]; then
|
||||
echo "Integration tests failed. Exiting with error."
|
||||
exit 1
|
||||
else
|
||||
echo "All integration tests passed successfully."
|
||||
fi
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# Always gather logs BEFORE "down":
|
||||
@@ -356,19 +186,19 @@ jobs:
|
||||
if: always()
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose logs --no-color api_server > $GITHUB_WORKSPACE/api_server.log || true
|
||||
docker compose -f docker-compose.dev.yml -p onyx-stack logs --no-color api_server > $GITHUB_WORKSPACE/api_server.log || true
|
||||
|
||||
- name: Dump all-container logs (optional)
|
||||
if: always()
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose logs --no-color > $GITHUB_WORKSPACE/docker-compose.log || true
|
||||
docker compose -f docker-compose.dev.yml -p onyx-stack logs --no-color > $GITHUB_WORKSPACE/docker-compose.log || true
|
||||
|
||||
- name: Upload logs
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: docker-all-logs-${{ matrix.test-dir.name }}
|
||||
name: docker-all-logs
|
||||
path: ${{ github.workspace }}/docker-compose.log
|
||||
# ------------------------------------------------------------
|
||||
|
||||
@@ -376,21 +206,4 @@ jobs:
|
||||
if: always()
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose down -v
|
||||
|
||||
|
||||
required:
|
||||
runs-on: blacksmith-2vcpu-ubuntu-2404-arm
|
||||
needs: [integration-tests-mit]
|
||||
if: ${{ always() }}
|
||||
steps:
|
||||
- uses: actions/github-script@v7
|
||||
with:
|
||||
script: |
|
||||
const needs = ${{ toJSON(needs) }};
|
||||
const failed = Object.values(needs).some(n => n.result !== 'success');
|
||||
if (failed) {
|
||||
core.setFailed('One or more upstream jobs failed or were cancelled.');
|
||||
} else {
|
||||
core.notice('All required jobs succeeded.');
|
||||
}
|
||||
docker compose -f docker-compose.dev.yml -p onyx-stack down -v
|
||||
|
||||
239
.github/workflows/pr-playwright-tests.yml
vendored
239
.github/workflows/pr-playwright-tests.yml
vendored
@@ -6,165 +6,43 @@ concurrency:
|
||||
on: push
|
||||
|
||||
env:
|
||||
# AWS ECR Configuration
|
||||
AWS_REGION: ${{ secrets.AWS_REGION || 'us-west-2' }}
|
||||
ECR_REGISTRY: ${{ secrets.ECR_REGISTRY }}
|
||||
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_ECR }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_ECR }}
|
||||
BUILDX_NO_DEFAULT_ATTESTATIONS: 1
|
||||
|
||||
# Test Environment Variables
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
GEN_AI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
EXA_API_KEY: ${{ secrets.EXA_API_KEY }}
|
||||
|
||||
# for federated slack tests
|
||||
SLACK_CLIENT_ID: ${{ secrets.SLACK_CLIENT_ID }}
|
||||
SLACK_CLIENT_SECRET: ${{ secrets.SLACK_CLIENT_SECRET }}
|
||||
|
||||
MOCK_LLM_RESPONSE: true
|
||||
|
||||
jobs:
|
||||
build-web-image:
|
||||
runs-on: blacksmith-8vcpu-ubuntu-2404-arm
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-access-key-id: ${{ env.AWS_ACCESS_KEY_ID }}
|
||||
aws-secret-access-key: ${{ env.AWS_SECRET_ACCESS_KEY }}
|
||||
aws-region: ${{ env.AWS_REGION }}
|
||||
|
||||
- name: Login to Amazon ECR
|
||||
id: login-ecr
|
||||
uses: aws-actions/amazon-ecr-login@v2
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: useblacksmith/setup-docker-builder@v1
|
||||
|
||||
- name: Build and push Web Docker image
|
||||
uses: useblacksmith/build-push-action@v2
|
||||
with:
|
||||
context: ./web
|
||||
file: ./web/Dockerfile
|
||||
platforms: linux/arm64
|
||||
tags: ${{ env.ECR_REGISTRY }}/integration-test-onyx-web-server:playwright-test-${{ github.run_id }}
|
||||
provenance: false
|
||||
sbom: false
|
||||
push: true
|
||||
|
||||
build-backend-image:
|
||||
runs-on: blacksmith-8vcpu-ubuntu-2404-arm
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-access-key-id: ${{ env.AWS_ACCESS_KEY_ID }}
|
||||
aws-secret-access-key: ${{ env.AWS_SECRET_ACCESS_KEY }}
|
||||
aws-region: ${{ env.AWS_REGION }}
|
||||
|
||||
- name: Login to Amazon ECR
|
||||
id: login-ecr
|
||||
uses: aws-actions/amazon-ecr-login@v2
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: useblacksmith/setup-docker-builder@v1
|
||||
|
||||
- name: Build and push Backend Docker image
|
||||
uses: useblacksmith/build-push-action@v2
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
platforms: linux/arm64
|
||||
tags: ${{ env.ECR_REGISTRY }}/integration-test-onyx-backend:playwright-test-${{ github.run_id }}
|
||||
provenance: false
|
||||
sbom: false
|
||||
push: true
|
||||
|
||||
build-model-server-image:
|
||||
runs-on: blacksmith-8vcpu-ubuntu-2404-arm
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-access-key-id: ${{ env.AWS_ACCESS_KEY_ID }}
|
||||
aws-secret-access-key: ${{ env.AWS_SECRET_ACCESS_KEY }}
|
||||
aws-region: ${{ env.AWS_REGION }}
|
||||
|
||||
- name: Login to Amazon ECR
|
||||
id: login-ecr
|
||||
uses: aws-actions/amazon-ecr-login@v2
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: useblacksmith/setup-docker-builder@v1
|
||||
|
||||
- name: Build and push Model Server Docker image
|
||||
uses: useblacksmith/build-push-action@v2
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile.model_server
|
||||
platforms: linux/arm64
|
||||
tags: ${{ env.ECR_REGISTRY }}/integration-test-onyx-model-server:playwright-test-${{ github.run_id }}
|
||||
provenance: false
|
||||
sbom: false
|
||||
push: true
|
||||
|
||||
playwright-tests:
|
||||
needs: [build-web-image, build-backend-image, build-model-server-image]
|
||||
name: Playwright Tests
|
||||
runs-on: blacksmith-8vcpu-ubuntu-2404-arm
|
||||
|
||||
# See https://runs-on.com/runners/linux/
|
||||
runs-on:
|
||||
[
|
||||
runs-on,
|
||||
runner=32cpu-linux-x64,
|
||||
disk=large,
|
||||
"run-id=${{ github.run_id }}",
|
||||
]
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Configure AWS credentials
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
aws-access-key-id: ${{ env.AWS_ACCESS_KEY_ID }}
|
||||
aws-secret-access-key: ${{ env.AWS_SECRET_ACCESS_KEY }}
|
||||
aws-region: ${{ env.AWS_REGION }}
|
||||
|
||||
- name: Login to Amazon ECR
|
||||
id: login-ecr
|
||||
uses: aws-actions/amazon-ecr-login@v2
|
||||
|
||||
# needed for pulling Vespa, Redis, Postgres, and Minio images
|
||||
# otherwise, we hit the "Unauthenticated users" limit
|
||||
# https://docs.docker.com/docker-hub/usage/
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Pull Docker images
|
||||
run: |
|
||||
# Pull all images from ECR in parallel
|
||||
echo "Pulling Docker images in parallel..."
|
||||
(docker pull ${{ env.ECR_REGISTRY }}/integration-test-onyx-web-server:playwright-test-${{ github.run_id }}) &
|
||||
(docker pull ${{ env.ECR_REGISTRY }}/integration-test-onyx-backend:playwright-test-${{ github.run_id }}) &
|
||||
(docker pull ${{ env.ECR_REGISTRY }}/integration-test-onyx-model-server:playwright-test-${{ github.run_id }}) &
|
||||
|
||||
# Wait for all background jobs to complete
|
||||
wait
|
||||
echo "All Docker images pulled successfully"
|
||||
|
||||
# Re-tag with expected names for docker-compose
|
||||
docker tag ${{ env.ECR_REGISTRY }}/integration-test-onyx-web-server:playwright-test-${{ github.run_id }} onyxdotapp/onyx-web-server:test
|
||||
docker tag ${{ env.ECR_REGISTRY }}/integration-test-onyx-backend:playwright-test-${{ github.run_id }} onyxdotapp/onyx-backend:test
|
||||
docker tag ${{ env.ECR_REGISTRY }}/integration-test-onyx-model-server:playwright-test-${{ github.run_id }} onyxdotapp/onyx-model-server:test
|
||||
python-version: "3.11"
|
||||
cache: "pip"
|
||||
cache-dependency-path: |
|
||||
backend/requirements/default.txt
|
||||
backend/requirements/dev.txt
|
||||
backend/requirements/model_server.txt
|
||||
- run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt
|
||||
|
||||
- name: Setup node
|
||||
uses: actions/setup-node@v4
|
||||
@@ -179,24 +57,79 @@ jobs:
|
||||
working-directory: ./web
|
||||
run: npx playwright install --with-deps
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
# tag every docker image with "test" so that we can spin up the correct set
|
||||
# of images during testing
|
||||
|
||||
# we use the runs-on cache for docker builds
|
||||
# in conjunction with runs-on runners, it has better speed and unlimited caching
|
||||
# https://runs-on.com/caching/s3-cache-for-github-actions/
|
||||
# https://runs-on.com/caching/docker/
|
||||
# https://github.com/moby/buildkit#s3-cache-experimental
|
||||
|
||||
# images are built and run locally for testing purposes. Not pushed.
|
||||
|
||||
- name: Build Web Docker image
|
||||
uses: ./.github/actions/custom-build-and-push
|
||||
with:
|
||||
context: ./web
|
||||
file: ./web/Dockerfile
|
||||
platforms: linux/amd64
|
||||
tags: onyxdotapp/onyx-web-server:test
|
||||
push: false
|
||||
load: true
|
||||
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/web-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
|
||||
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/web-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
|
||||
|
||||
- name: Build Backend Docker image
|
||||
uses: ./.github/actions/custom-build-and-push
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
platforms: linux/amd64
|
||||
tags: onyxdotapp/onyx-backend:test
|
||||
push: false
|
||||
load: true
|
||||
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
|
||||
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
|
||||
|
||||
- name: Build Model Server Docker image
|
||||
uses: ./.github/actions/custom-build-and-push
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile.model_server
|
||||
platforms: linux/amd64
|
||||
tags: onyxdotapp/onyx-model-server:test
|
||||
push: false
|
||||
load: true
|
||||
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
|
||||
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
|
||||
|
||||
- name: Start Docker containers
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \
|
||||
AUTH_TYPE=basic \
|
||||
GEN_AI_API_KEY=${{ env.OPENAI_API_KEY }} \
|
||||
EXA_API_KEY=${{ env.EXA_API_KEY }} \
|
||||
GEN_AI_API_KEY=${{ secrets.OPENAI_API_KEY }} \
|
||||
REQUIRE_EMAIL_VERIFICATION=false \
|
||||
DISABLE_TELEMETRY=true \
|
||||
IMAGE_TAG=test \
|
||||
docker compose up -d
|
||||
docker compose -f docker-compose.dev.yml -p danswer-stack up -d
|
||||
id: start_docker
|
||||
|
||||
- name: Wait for service to be ready
|
||||
run: |
|
||||
echo "Starting wait-for-service script..."
|
||||
|
||||
docker logs -f onyx-api_server-1 &
|
||||
docker logs -f danswer-stack-api_server-1 &
|
||||
|
||||
start_time=$(date +%s)
|
||||
timeout=300 # 5 minutes in seconds
|
||||
@@ -226,6 +159,12 @@ jobs:
|
||||
done
|
||||
echo "Finished waiting for service."
|
||||
|
||||
- name: Run pytest playwright test init
|
||||
working-directory: ./backend
|
||||
env:
|
||||
PYTEST_IGNORE_SKIP: true
|
||||
run: pytest -s tests/integration/tests/playwright/test_playwright.py
|
||||
|
||||
- name: Run Playwright tests
|
||||
working-directory: ./web
|
||||
run: npx playwright test
|
||||
@@ -244,7 +183,7 @@ jobs:
|
||||
if: success() || failure()
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose logs > docker-compose.log
|
||||
docker compose -f docker-compose.dev.yml -p danswer-stack logs > docker-compose.log
|
||||
mv docker-compose.log ${{ github.workspace }}/docker-compose.log
|
||||
|
||||
- name: Upload logs
|
||||
@@ -257,7 +196,7 @@ jobs:
|
||||
- name: Stop Docker containers
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose down -v
|
||||
docker compose -f docker-compose.dev.yml -p danswer-stack down -v
|
||||
|
||||
# NOTE: Chromatic UI diff testing is currently disabled.
|
||||
# We are using Playwright for local and CI testing without visual regression checks.
|
||||
|
||||
25
.github/workflows/pr-python-checks.yml
vendored
25
.github/workflows/pr-python-checks.yml
vendored
@@ -31,31 +31,16 @@ jobs:
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt
|
||||
|
||||
- name: Generate OpenAPI schema
|
||||
working-directory: ./backend
|
||||
env:
|
||||
PYTHONPATH: "."
|
||||
run: |
|
||||
python scripts/onyx_openapi_schema.py --filename generated/openapi.json
|
||||
|
||||
- name: Generate OpenAPI Python client
|
||||
working-directory: ./backend
|
||||
run: |
|
||||
docker run --rm \
|
||||
-v "${{ github.workspace }}/backend/generated:/local" \
|
||||
openapitools/openapi-generator-cli generate \
|
||||
-i /local/openapi.json \
|
||||
-g python \
|
||||
-o /local/onyx_openapi_client \
|
||||
--package-name onyx_openapi_client \
|
||||
--skip-validate-spec \
|
||||
--openapi-normalizer "SIMPLIFY_ONEOF_ANYOF=true,SET_OAS3_NULLABLE=true"
|
||||
|
||||
- name: Run MyPy
|
||||
run: |
|
||||
cd backend
|
||||
mypy .
|
||||
|
||||
- name: Run ruff
|
||||
run: |
|
||||
cd backend
|
||||
ruff .
|
||||
|
||||
- name: Check import order with reorder-python-imports
|
||||
run: |
|
||||
cd backend
|
||||
|
||||
46
.github/workflows/pr-python-connector-tests.yml
vendored
46
.github/workflows/pr-python-connector-tests.yml
vendored
@@ -12,21 +12,18 @@ env:
|
||||
# AWS
|
||||
AWS_ACCESS_KEY_ID_DAILY_CONNECTOR_TESTS: ${{ secrets.AWS_ACCESS_KEY_ID_DAILY_CONNECTOR_TESTS }}
|
||||
AWS_SECRET_ACCESS_KEY_DAILY_CONNECTOR_TESTS: ${{ secrets.AWS_SECRET_ACCESS_KEY_DAILY_CONNECTOR_TESTS }}
|
||||
|
||||
|
||||
# Confluence
|
||||
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
|
||||
CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
|
||||
CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
|
||||
CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
|
||||
CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
|
||||
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
|
||||
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
|
||||
|
||||
# Jira
|
||||
JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
|
||||
JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
|
||||
JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
|
||||
|
||||
# Gong
|
||||
GONG_ACCESS_KEY: ${{ secrets.GONG_ACCESS_KEY }}
|
||||
GONG_ACCESS_KEY_SECRET: ${{ secrets.GONG_ACCESS_KEY_SECRET }}
|
||||
|
||||
@@ -36,66 +33,37 @@ env:
|
||||
GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR }}
|
||||
GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR }}
|
||||
GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR }}
|
||||
|
||||
# Slab
|
||||
SLAB_BOT_TOKEN: ${{ secrets.SLAB_BOT_TOKEN }}
|
||||
|
||||
# Zendesk
|
||||
ZENDESK_SUBDOMAIN: ${{ secrets.ZENDESK_SUBDOMAIN }}
|
||||
ZENDESK_EMAIL: ${{ secrets.ZENDESK_EMAIL }}
|
||||
ZENDESK_TOKEN: ${{ secrets.ZENDESK_TOKEN }}
|
||||
|
||||
# Salesforce
|
||||
SF_USERNAME: ${{ secrets.SF_USERNAME }}
|
||||
SF_PASSWORD: ${{ secrets.SF_PASSWORD }}
|
||||
SF_SECURITY_TOKEN: ${{ secrets.SF_SECURITY_TOKEN }}
|
||||
|
||||
# Hubspot
|
||||
HUBSPOT_ACCESS_TOKEN: ${{ secrets.HUBSPOT_ACCESS_TOKEN }}
|
||||
|
||||
# IMAP
|
||||
IMAP_HOST: ${{ secrets.IMAP_HOST }}
|
||||
IMAP_USERNAME: ${{ secrets.IMAP_USERNAME }}
|
||||
IMAP_PASSWORD: ${{ secrets.IMAP_PASSWORD }}
|
||||
IMAP_MAILBOXES: ${{ secrets.IMAP_MAILBOXES }}
|
||||
|
||||
# Airtable
|
||||
AIRTABLE_TEST_BASE_ID: ${{ secrets.AIRTABLE_TEST_BASE_ID }}
|
||||
AIRTABLE_TEST_TABLE_ID: ${{ secrets.AIRTABLE_TEST_TABLE_ID }}
|
||||
AIRTABLE_TEST_TABLE_NAME: ${{ secrets.AIRTABLE_TEST_TABLE_NAME }}
|
||||
AIRTABLE_ACCESS_TOKEN: ${{ secrets.AIRTABLE_ACCESS_TOKEN }}
|
||||
|
||||
# Sharepoint
|
||||
SHAREPOINT_CLIENT_ID: ${{ secrets.SHAREPOINT_CLIENT_ID }}
|
||||
SHAREPOINT_CLIENT_SECRET: ${{ secrets.SHAREPOINT_CLIENT_SECRET }}
|
||||
SHAREPOINT_CLIENT_DIRECTORY_ID: ${{ secrets.SHAREPOINT_CLIENT_DIRECTORY_ID }}
|
||||
SHAREPOINT_SITE: ${{ secrets.SHAREPOINT_SITE }}
|
||||
|
||||
# Github
|
||||
ACCESS_TOKEN_GITHUB: ${{ secrets.ACCESS_TOKEN_GITHUB }}
|
||||
|
||||
# Gitlab
|
||||
GITLAB_ACCESS_TOKEN: ${{ secrets.GITLAB_ACCESS_TOKEN }}
|
||||
|
||||
# Gitbook
|
||||
GITBOOK_SPACE_ID: ${{ secrets.GITBOOK_SPACE_ID }}
|
||||
GITBOOK_API_KEY: ${{ secrets.GITBOOK_API_KEY }}
|
||||
|
||||
# Notion
|
||||
NOTION_INTEGRATION_TOKEN: ${{ secrets.NOTION_INTEGRATION_TOKEN }}
|
||||
|
||||
# Highspot
|
||||
HIGHSPOT_KEY: ${{ secrets.HIGHSPOT_KEY }}
|
||||
HIGHSPOT_SECRET: ${{ secrets.HIGHSPOT_SECRET }}
|
||||
|
||||
# Slack
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
# Teams
|
||||
TEAMS_APPLICATION_ID: ${{ secrets.TEAMS_APPLICATION_ID }}
|
||||
TEAMS_DIRECTORY_ID: ${{ secrets.TEAMS_DIRECTORY_ID }}
|
||||
TEAMS_SECRET: ${{ secrets.TEAMS_SECRET }}
|
||||
|
||||
jobs:
|
||||
connectors-check:
|
||||
# See https://runs-on.com/runners/linux/
|
||||
@@ -127,15 +95,7 @@ jobs:
|
||||
|
||||
- name: Run Tests
|
||||
shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
|
||||
run: |
|
||||
py.test \
|
||||
-n 8 \
|
||||
--dist loadfile \
|
||||
--durations=8 \
|
||||
-o junit_family=xunit2 \
|
||||
-xv \
|
||||
--ff \
|
||||
backend/tests/daily/connectors
|
||||
run: py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/connectors
|
||||
|
||||
- name: Alert on Failure
|
||||
if: failure() && github.event_name == 'schedule'
|
||||
|
||||
6
.github/workflows/pr-python-model-tests.yml
vendored
6
.github/workflows/pr-python-model-tests.yml
vendored
@@ -77,7 +77,7 @@ jobs:
|
||||
REQUIRE_EMAIL_VERIFICATION=false \
|
||||
DISABLE_TELEMETRY=true \
|
||||
IMAGE_TAG=test \
|
||||
docker compose -f docker-compose.model-server-test.yml up -d indexing_model_server
|
||||
docker compose -f docker-compose.model-server-test.yml -p onyx-stack up -d indexing_model_server
|
||||
id: start_docker
|
||||
|
||||
- name: Wait for service to be ready
|
||||
@@ -132,7 +132,7 @@ jobs:
|
||||
if: always()
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose -f docker-compose.model-server-test.yml logs --no-color > $GITHUB_WORKSPACE/docker-compose.log || true
|
||||
docker compose -f docker-compose.model-server-test.yml -p onyx-stack logs --no-color > $GITHUB_WORKSPACE/docker-compose.log || true
|
||||
|
||||
- name: Upload logs
|
||||
if: always()
|
||||
@@ -145,5 +145,5 @@ jobs:
|
||||
if: always()
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose -f docker-compose.model-server-test.yml down -v
|
||||
docker compose -f docker-compose.model-server-test.yml -p onyx-stack down -v
|
||||
|
||||
|
||||
5
.github/workflows/pr-python-tests.yml
vendored
5
.github/workflows/pr-python-tests.yml
vendored
@@ -15,9 +15,6 @@ jobs:
|
||||
env:
|
||||
PYTHONPATH: ./backend
|
||||
REDIS_CLOUD_PYTEST_PASSWORD: ${{ secrets.REDIS_CLOUD_PYTEST_PASSWORD }}
|
||||
SF_USERNAME: ${{ secrets.SF_USERNAME }}
|
||||
SF_PASSWORD: ${{ secrets.SF_PASSWORD }}
|
||||
SF_SECURITY_TOKEN: ${{ secrets.SF_SECURITY_TOKEN }}
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
@@ -31,14 +28,12 @@ jobs:
|
||||
cache-dependency-path: |
|
||||
backend/requirements/default.txt
|
||||
backend/requirements/dev.txt
|
||||
backend/requirements/model_server.txt
|
||||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt
|
||||
|
||||
- name: Run Tests
|
||||
shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
|
||||
|
||||
17
.gitignore
vendored
17
.gitignore
vendored
@@ -14,29 +14,12 @@
|
||||
/web/test-results/
|
||||
backend/onyx/agent_search/main/test_data.json
|
||||
backend/tests/regression/answer_quality/test_data.json
|
||||
backend/tests/regression/search_quality/eval-*
|
||||
backend/tests/regression/search_quality/search_eval_config.yaml
|
||||
backend/tests/regression/search_quality/*.json
|
||||
backend/onyx/evals/data/
|
||||
*.log
|
||||
|
||||
# secret files
|
||||
.env
|
||||
jira_test_env
|
||||
settings.json
|
||||
|
||||
# others
|
||||
/deployment/data/nginx/app.conf
|
||||
*.sw?
|
||||
/backend/tests/regression/answer_quality/search_test_config.yaml
|
||||
*.egg-info
|
||||
|
||||
# Local .terraform directories
|
||||
**/.terraform/*
|
||||
|
||||
# Local .tfstate files
|
||||
*.tfstate
|
||||
*.tfstate.*
|
||||
|
||||
# Local .terraform.lock.hcl file
|
||||
.terraform.lock.hcl
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
{
|
||||
"mcpServers": {
|
||||
"onyx-mcp": {
|
||||
"type": "http",
|
||||
"url": "http://localhost:8000/mcp"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -37,15 +37,6 @@ repos:
|
||||
additional_dependencies:
|
||||
- prettier
|
||||
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: check-lazy-imports
|
||||
name: Check lazy imports are not directly imported
|
||||
entry: python3 backend/scripts/check_lazy_imports.py
|
||||
language: system
|
||||
files: ^backend/(?!\.venv/).*\.py$
|
||||
pass_filenames: false
|
||||
|
||||
# We would like to have a mypy pre-commit hook, but due to the fact that
|
||||
# pre-commit runs in it's own isolated environment, we would need to install
|
||||
# and keep in sync all dependencies so mypy has access to the appropriate type
|
||||
|
||||
28
.vscode/env_template.txt
vendored
28
.vscode/env_template.txt
vendored
@@ -10,7 +10,7 @@ SKIP_WARM_UP=True
|
||||
|
||||
# Always keep these on for Dev
|
||||
# Logs all model prompts to stdout
|
||||
LOG_ONYX_MODEL_INTERACTIONS=True
|
||||
LOG_DANSWER_MODEL_INTERACTIONS=True
|
||||
# More verbose logging
|
||||
LOG_LEVEL=debug
|
||||
|
||||
@@ -23,9 +23,6 @@ DISABLE_LLM_DOC_RELEVANCE=False
|
||||
# Useful if you want to toggle auth on/off (google_oauth/OIDC specifically)
|
||||
OAUTH_CLIENT_ID=<REPLACE THIS>
|
||||
OAUTH_CLIENT_SECRET=<REPLACE THIS>
|
||||
OPENID_CONFIG_URL=<REPLACE THIS>
|
||||
SAML_CONF_DIR=/<ABSOLUTE PATH TO ONYX>/onyx/backend/ee/onyx/configs/saml_config
|
||||
|
||||
# Generally not useful for dev, we don't generally want to set up an SMTP server for dev
|
||||
REQUIRE_EMAIL_VERIFICATION=False
|
||||
|
||||
@@ -39,8 +36,8 @@ FAST_GEN_AI_MODEL_VERSION=gpt-4o
|
||||
|
||||
# For Danswer Slack Bot, overrides the UI values so no need to set this up via UI every time
|
||||
# Only needed if using DanswerBot
|
||||
#ONYX_BOT_SLACK_APP_TOKEN=<REPLACE THIS>
|
||||
#ONYX_BOT_SLACK_BOT_TOKEN=<REPLACE THIS>
|
||||
#DANSWER_BOT_SLACK_APP_TOKEN=<REPLACE THIS>
|
||||
#DANSWER_BOT_SLACK_BOT_TOKEN=<REPLACE THIS>
|
||||
|
||||
|
||||
# Python stuff
|
||||
@@ -48,8 +45,8 @@ PYTHONPATH=../backend
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
|
||||
# Internet Search
|
||||
EXA_API_KEY=<REPLACE THIS>
|
||||
# Internet Search
|
||||
BING_API_KEY=<REPLACE THIS>
|
||||
|
||||
|
||||
# Enable the full set of Danswer Enterprise Edition features
|
||||
@@ -61,18 +58,3 @@ AGENT_RETRIEVAL_STATS=False # Note: This setting will incur substantial re-ran
|
||||
AGENT_RERANKING_STATS=True
|
||||
AGENT_MAX_QUERY_RETRIEVAL_RESULTS=20
|
||||
AGENT_RERANKING_MAX_QUERY_RETRIEVAL_RESULTS=20
|
||||
|
||||
# S3 File Store Configuration (MinIO for local development)
|
||||
S3_ENDPOINT_URL=http://localhost:9004
|
||||
S3_FILE_STORE_BUCKET_NAME=onyx-file-store-bucket
|
||||
S3_AWS_ACCESS_KEY_ID=minioadmin
|
||||
S3_AWS_SECRET_ACCESS_KEY=minioadmin
|
||||
|
||||
# Show extra/uncommon connectors
|
||||
SHOW_EXTRA_CONNECTORS=True
|
||||
|
||||
# Local langsmith tracing
|
||||
LANGSMITH_TRACING="true"
|
||||
LANGSMITH_ENDPOINT="https://api.smith.langchain.com"
|
||||
LANGSMITH_API_KEY=<REPLACE_THIS>
|
||||
LANGSMITH_PROJECT=<REPLACE_THIS>
|
||||
185
.vscode/launch.template.jsonc
vendored
185
.vscode/launch.template.jsonc
vendored
@@ -24,23 +24,21 @@
|
||||
"Celery primary",
|
||||
"Celery light",
|
||||
"Celery heavy",
|
||||
"Celery docfetching",
|
||||
"Celery docprocessing",
|
||||
"Celery indexing",
|
||||
"Celery user files indexing",
|
||||
"Celery beat",
|
||||
"Celery monitoring"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "1"
|
||||
},
|
||||
"stopAll": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Web / Model / API",
|
||||
"configurations": ["Web Server", "Model Server", "API Server"],
|
||||
"presentation": {
|
||||
"group": "1"
|
||||
},
|
||||
"stopAll": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Celery (all)",
|
||||
@@ -48,15 +46,14 @@
|
||||
"Celery primary",
|
||||
"Celery light",
|
||||
"Celery heavy",
|
||||
"Celery docfetching",
|
||||
"Celery docprocessing",
|
||||
"Celery indexing",
|
||||
"Celery user files indexing",
|
||||
"Celery beat",
|
||||
"Celery monitoring"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "1"
|
||||
},
|
||||
"stopAll": true
|
||||
}
|
||||
}
|
||||
],
|
||||
"configurations": [
|
||||
@@ -111,7 +108,7 @@
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_ONYX_MODEL_INTERACTIONS": "True",
|
||||
"LOG_DANSWER_MODEL_INTERACTIONS": "True",
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1"
|
||||
},
|
||||
@@ -122,7 +119,7 @@
|
||||
"consoleTitle": "API Server Console"
|
||||
},
|
||||
// For the listener to access the Slack API,
|
||||
// ONYX_BOT_SLACK_APP_TOKEN & ONYX_BOT_SLACK_BOT_TOKEN need to be set in .env file located in the root of the project
|
||||
// DANSWER_BOT_SLACK_APP_TOKEN & DANSWER_BOT_SLACK_BOT_TOKEN need to be set in .env file located in the root of the project
|
||||
{
|
||||
"name": "Slack Bot",
|
||||
"consoleName": "Slack Bot",
|
||||
@@ -192,7 +189,7 @@
|
||||
"--loglevel=INFO",
|
||||
"--hostname=light@%n",
|
||||
"-Q",
|
||||
"vespa_metadata_sync,connector_deletion,doc_permissions_upsert,index_attempt_cleanup"
|
||||
"vespa_metadata_sync,connector_deletion,doc_permissions_upsert"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
@@ -229,66 +226,35 @@
|
||||
"consoleTitle": "Celery heavy Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery docfetching",
|
||||
"name": "Celery indexing",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
"ENABLE_MULTIPASS_INDEXING": "false",
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.docfetching",
|
||||
"worker",
|
||||
"--pool=threads",
|
||||
"--concurrency=1",
|
||||
"--prefetch-multiplier=1",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=docfetching@%n",
|
||||
"-Q",
|
||||
"connector_doc_fetching,user_files_indexing"
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.indexing",
|
||||
"worker",
|
||||
"--pool=threads",
|
||||
"--concurrency=1",
|
||||
"--prefetch-multiplier=1",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=indexing@%n",
|
||||
"-Q",
|
||||
"connector_indexing"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery docfetching Console",
|
||||
"justMyCode": false
|
||||
},
|
||||
{
|
||||
"name": "Celery docprocessing",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"ENABLE_MULTIPASS_INDEXING": "false",
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.docprocessing",
|
||||
"worker",
|
||||
"--pool=threads",
|
||||
"--concurrency=6",
|
||||
"--prefetch-multiplier=1",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=docprocessing@%n",
|
||||
"-Q",
|
||||
"docprocessing"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery docprocessing Console",
|
||||
"justMyCode": false
|
||||
},
|
||||
"consoleTitle": "Celery indexing Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery monitoring",
|
||||
"type": "debugpy",
|
||||
@@ -337,6 +303,35 @@
|
||||
},
|
||||
"consoleTitle": "Celery beat Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery user files indexing",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.indexing",
|
||||
"worker",
|
||||
"--pool=threads",
|
||||
"--concurrency=1",
|
||||
"--prefetch-multiplier=1",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=user_files_indexing@%n",
|
||||
"-Q",
|
||||
"user_files_indexing"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery user files indexing Console"
|
||||
},
|
||||
{
|
||||
"name": "Pytest",
|
||||
"consoleName": "Pytest",
|
||||
@@ -385,28 +380,6 @@
|
||||
"group": "3"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Eval CLI",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "${workspaceFolder}/backend/onyx/evals/eval_cli.py",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"console": "integratedTerminal",
|
||||
"justMyCode": false,
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"presentation": {
|
||||
"group": "3"
|
||||
},
|
||||
"env": {
|
||||
"LOG_LEVEL": "INFO",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"--verbose"
|
||||
],
|
||||
"consoleTitle": "Eval CLI Console"
|
||||
},
|
||||
{
|
||||
// Celery jobs launched through a single background script (legacy)
|
||||
// Recommend using the "Celery (all)" compound launch instead.
|
||||
@@ -418,7 +391,7 @@
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_ONYX_MODEL_INTERACTIONS": "True",
|
||||
"LOG_DANSWER_MODEL_INTERACTIONS": "True",
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
@@ -439,46 +412,6 @@
|
||||
"group": "3"
|
||||
}
|
||||
},
|
||||
{
|
||||
// script to generate the openapi schema
|
||||
"name": "Onyx OpenAPI Schema Generator",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "scripts/onyx_openapi_schema.py",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.env",
|
||||
"env": {
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"--filename",
|
||||
"generated/openapi.json"
|
||||
]
|
||||
},
|
||||
{
|
||||
// script to debug multi tenant db issues
|
||||
"name": "Onyx DB Manager (Top Chunks)",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "scripts/debugging/onyx_db.py",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.env",
|
||||
"env": {
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"--password",
|
||||
"your_password_here",
|
||||
"--port",
|
||||
"5433",
|
||||
"--report",
|
||||
"top-chunks",
|
||||
"--filename",
|
||||
"generated/tenants_by_num_docs.csv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Debug React Web App in Chrome",
|
||||
"type": "chrome",
|
||||
@@ -488,4 +421,4 @@
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
101
.vscode/tasks.template.jsonc
vendored
101
.vscode/tasks.template.jsonc
vendored
@@ -1,101 +0,0 @@
|
||||
{
|
||||
"version": "2.0.0",
|
||||
"tasks": [
|
||||
{
|
||||
"type": "austin",
|
||||
"label": "Profile celery beat",
|
||||
"envFile": "${workspaceFolder}/.env",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/backend"
|
||||
},
|
||||
"command": [
|
||||
"sudo",
|
||||
"-E"
|
||||
],
|
||||
"args": [
|
||||
"celery",
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.beat",
|
||||
"beat",
|
||||
"--loglevel=INFO"
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "shell",
|
||||
"label": "Generate Onyx OpenAPI Python client",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.env",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/backend"
|
||||
},
|
||||
"command": [
|
||||
"openapi-generator"
|
||||
],
|
||||
"args": [
|
||||
"generate",
|
||||
"-i",
|
||||
"generated/openapi.json",
|
||||
"-g",
|
||||
"python",
|
||||
"-o",
|
||||
"generated/onyx_openapi_client",
|
||||
"--package-name",
|
||||
"onyx_openapi_client",
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "shell",
|
||||
"label": "Generate Typescript Fetch client (openapi-generator)",
|
||||
"envFile": "${workspaceFolder}/.env",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}"
|
||||
},
|
||||
"command": [
|
||||
"openapi-generator"
|
||||
],
|
||||
"args": [
|
||||
"generate",
|
||||
"-i",
|
||||
"backend/generated/openapi.json",
|
||||
"-g",
|
||||
"typescript-fetch",
|
||||
"-o",
|
||||
"${workspaceFolder}/web/src/lib/generated/onyx_api",
|
||||
"--additional-properties=disallowAdditionalPropertiesIfNotPresent=false,legacyDiscriminatorBehavior=false,supportsES6=true",
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "shell",
|
||||
"label": "Generate TypeScript Client (openapi-ts)",
|
||||
"envFile": "${workspaceFolder}/.env",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/web"
|
||||
},
|
||||
"command": [
|
||||
"npx"
|
||||
],
|
||||
"args": [
|
||||
"openapi-typescript",
|
||||
"../backend/generated/openapi.json",
|
||||
"--output",
|
||||
"./src/lib/generated/onyx-schema.ts",
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "shell",
|
||||
"label": "Generate TypeScript Client (orval)",
|
||||
"envFile": "${workspaceFolder}/.env",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/web"
|
||||
},
|
||||
"command": [
|
||||
"npx"
|
||||
],
|
||||
"args": [
|
||||
"orval",
|
||||
"--config",
|
||||
"orval.config.js",
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
295
AGENTS.md
295
AGENTS.md
@@ -1,295 +0,0 @@
|
||||
# AGENTS.md
|
||||
|
||||
This file provides guidance to Codex when working with code in this repository.
|
||||
|
||||
## KEY NOTES
|
||||
|
||||
- If you run into any missing python dependency errors, try running your command with `source backend/.venv/bin/activate` \
|
||||
to assume the python venv.
|
||||
- To make tests work, check the `.env` file at the root of the project to find an OpenAI key.
|
||||
- If using `playwright` to explore the frontend, you can usually log in with username `a@test.com` and password
|
||||
`a`. The app can be accessed at `http://localhost:3000`.
|
||||
- You should assume that all Onyx services are running. To verify, you can check the `backend/log` directory to
|
||||
make sure we see logs coming out from the relevant service.
|
||||
- To connect to the Postgres database, use: `docker exec -it onyx-relational_db-1 psql -U postgres -c "<SQL>"`
|
||||
- When making calls to the backend, always go through the frontend. E.g. make a call to `http://localhost:3000/api/persona` not `http://localhost:8080/api/persona`
|
||||
- Put ALL db operations under the `backend/onyx/db` / `backend/ee/onyx/db` directories. Don't run queries
|
||||
outside of those directories.
|
||||
|
||||
## Project Overview
|
||||
|
||||
**Onyx** (formerly Danswer) is an open-source Gen-AI and Enterprise Search platform that connects to company documents, apps, and people. It features a modular architecture with both Community Edition (MIT licensed) and Enterprise Edition offerings.
|
||||
|
||||
|
||||
### Background Workers (Celery)
|
||||
|
||||
Onyx uses Celery for asynchronous task processing with multiple specialized workers:
|
||||
|
||||
#### Worker Types
|
||||
|
||||
1. **Primary Worker** (`celery_app.py`)
|
||||
- Coordinates core background tasks and system-wide operations
|
||||
- Handles connector management, document sync, pruning, and periodic checks
|
||||
- Runs with 4 threads concurrency
|
||||
- Tasks: connector deletion, vespa sync, pruning, LLM model updates, user file sync
|
||||
|
||||
2. **Docfetching Worker** (`docfetching`)
|
||||
- Fetches documents from external data sources (connectors)
|
||||
- Spawns docprocessing tasks for each document batch
|
||||
- Implements watchdog monitoring for stuck connectors
|
||||
- Configurable concurrency (default from env)
|
||||
|
||||
3. **Docprocessing Worker** (`docprocessing`)
|
||||
- Processes fetched documents through the indexing pipeline:
|
||||
- Upserts documents to PostgreSQL
|
||||
- Chunks documents and adds contextual information
|
||||
- Embeds chunks via model server
|
||||
- Writes chunks to Vespa vector database
|
||||
- Updates document metadata
|
||||
- Configurable concurrency (default from env)
|
||||
|
||||
4. **Light Worker** (`light`)
|
||||
- Handles lightweight, fast operations
|
||||
- Tasks: vespa operations, document permissions sync, external group sync
|
||||
- Higher concurrency for quick tasks
|
||||
|
||||
5. **Heavy Worker** (`heavy`)
|
||||
- Handles resource-intensive operations
|
||||
- Primary task: document pruning operations
|
||||
- Runs with 4 threads concurrency
|
||||
|
||||
6. **KG Processing Worker** (`kg_processing`)
|
||||
- Handles Knowledge Graph processing and clustering
|
||||
- Builds relationships between documents
|
||||
- Runs clustering algorithms
|
||||
- Configurable concurrency
|
||||
|
||||
7. **Monitoring Worker** (`monitoring`)
|
||||
- System health monitoring and metrics collection
|
||||
- Monitors Celery queues, process memory, and system status
|
||||
- Single thread (monitoring doesn't need parallelism)
|
||||
- Cloud-specific monitoring tasks
|
||||
|
||||
8. **Beat Worker** (`beat`)
|
||||
- Celery's scheduler for periodic tasks
|
||||
- Uses DynamicTenantScheduler for multi-tenant support
|
||||
- Schedules tasks like:
|
||||
- Indexing checks (every 15 seconds)
|
||||
- Connector deletion checks (every 20 seconds)
|
||||
- Vespa sync checks (every 20 seconds)
|
||||
- Pruning checks (every 20 seconds)
|
||||
- KG processing (every 60 seconds)
|
||||
- Monitoring tasks (every 5 minutes)
|
||||
- Cleanup tasks (hourly)
|
||||
|
||||
#### Key Features
|
||||
|
||||
- **Thread-based Workers**: All workers use thread pools (not processes) for stability
|
||||
- **Tenant Awareness**: Multi-tenant support with per-tenant task isolation. There is a
|
||||
middleware layer that automatically finds the appropriate tenant ID when sending tasks
|
||||
via Celery Beat.
|
||||
- **Task Prioritization**: High, Medium, Low priority queues
|
||||
- **Monitoring**: Built-in heartbeat and liveness checking
|
||||
- **Failure Handling**: Automatic retry and failure recovery mechanisms
|
||||
- **Redis Coordination**: Inter-process communication via Redis
|
||||
- **PostgreSQL State**: Task state and metadata stored in PostgreSQL
|
||||
|
||||
|
||||
#### Important Notes
|
||||
|
||||
**Defining Tasks**:
|
||||
- Always use `@shared_task` rather than `@celery_app`
|
||||
- Put tasks under `background/celery/tasks/` or `ee/background/celery/tasks`
|
||||
|
||||
**Defining APIs**:
|
||||
When creating new FastAPI APIs, do NOT use the `response_model` field. Instead, just type the
|
||||
function.
|
||||
|
||||
**Testing Updates**:
|
||||
If you make any updates to a celery worker and you want to test these changes, you will need
|
||||
to ask me to restart the celery worker. There is no auto-restart on code-change mechanism.
|
||||
|
||||
### Code Quality
|
||||
```bash
|
||||
# Install and run pre-commit hooks
|
||||
pre-commit install
|
||||
pre-commit run --all-files
|
||||
```
|
||||
|
||||
NOTE: Always make sure everything is strictly typed (both in Python and Typescript).
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
### Technology Stack
|
||||
- **Backend**: Python 3.11, FastAPI, SQLAlchemy, Alembic, Celery
|
||||
- **Frontend**: Next.js 15+, React 18, TypeScript, Tailwind CSS
|
||||
- **Database**: PostgreSQL with Redis caching
|
||||
- **Search**: Vespa vector database
|
||||
- **Auth**: OAuth2, SAML, multi-provider support
|
||||
- **AI/ML**: LangChain, LiteLLM, multiple embedding models
|
||||
|
||||
### Directory Structure
|
||||
|
||||
```
|
||||
backend/
|
||||
├── onyx/
|
||||
│ ├── auth/ # Authentication & authorization
|
||||
│ ├── chat/ # Chat functionality & LLM interactions
|
||||
│ ├── connectors/ # Data source connectors
|
||||
│ ├── db/ # Database models & operations
|
||||
│ ├── document_index/ # Vespa integration
|
||||
│ ├── federated_connectors/ # External search connectors
|
||||
│ ├── llm/ # LLM provider integrations
|
||||
│ └── server/ # API endpoints & routers
|
||||
├── ee/ # Enterprise Edition features
|
||||
├── alembic/ # Database migrations
|
||||
└── tests/ # Test suites
|
||||
|
||||
web/
|
||||
├── src/app/ # Next.js app router pages
|
||||
├── src/components/ # Reusable React components
|
||||
└── src/lib/ # Utilities & business logic
|
||||
```
|
||||
|
||||
## Database & Migrations
|
||||
|
||||
### Running Migrations
|
||||
```bash
|
||||
# Standard migrations
|
||||
alembic upgrade head
|
||||
|
||||
# Multi-tenant (Enterprise)
|
||||
alembic -n schema_private upgrade head
|
||||
```
|
||||
|
||||
### Creating Migrations
|
||||
```bash
|
||||
# Auto-generate migration
|
||||
alembic revision --autogenerate -m "description"
|
||||
|
||||
# Multi-tenant migration
|
||||
alembic -n schema_private revision --autogenerate -m "description"
|
||||
```
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
There are 4 main types of tests within Onyx:
|
||||
|
||||
### Unit Tests
|
||||
These should not assume any Onyx/external services are available to be called.
|
||||
Interactions with the outside world should be mocked using `unittest.mock`. Generally, only
|
||||
write these for complex, isolated modules e.g. `citation_processing.py`.
|
||||
|
||||
To run them:
|
||||
|
||||
```bash
|
||||
python -m dotenv -f .vscode/.env run -- pytest -xv backend/tests/unit
|
||||
```
|
||||
|
||||
### External Dependency Unit Tests
|
||||
These tests assume that all external dependencies of Onyx are available and callable (e.g. Postgres, Redis,
|
||||
MinIO/S3, Vespa are running + OpenAI can be called + any request to the internet is fine + etc.).
|
||||
|
||||
However, the actual Onyx containers are not running and with these tests we call the function to test directly.
|
||||
We can also mock components/calls at will.
|
||||
|
||||
The goal with these tests are to minimize mocking while giving some flexibility to mock things that are flakey,
|
||||
need strictly controlled behavior, or need to have their internal behavior validated (e.g. verify a function is called
|
||||
with certain args, something that would be impossible with proper integration tests).
|
||||
|
||||
A great example of this type of test is `backend/tests/external_dependency_unit/connectors/confluence/test_confluence_group_sync.py`.
|
||||
|
||||
To run them:
|
||||
|
||||
```bash
|
||||
python -m dotenv -f .vscode/.env run -- pytest backend/tests/external_dependency_unit
|
||||
```
|
||||
|
||||
### Integration Tests
|
||||
Standard integration tests. Every test in `backend/tests/integration` runs against a real Onyx deployment. We cannot
|
||||
mock anything in these tests. Prefer writing integration tests (or External Dependency Unit Tests if mocking/internal
|
||||
verification is necessary) over any other type of test.
|
||||
|
||||
Tests are parallelized at a directory level.
|
||||
|
||||
When writing integration tests, make sure to check the root `conftest.py` for useful fixtures + the `backend/tests/integration/common_utils` directory for utilities. Prefer (if one exists), calling the appropriate Manager
|
||||
class in the utils over directly calling the APIs with a library like `requests`. Prefer using fixtures rather than
|
||||
calling the utilities directly (e.g. do NOT create admin users with
|
||||
`admin_user = UserManager.create(name="admin_user")`, instead use the `admin_user` fixture).
|
||||
|
||||
A great example of this type of test is `backend/tests/integration/dev_apis/test_simple_chat_api.py`.
|
||||
|
||||
To run them:
|
||||
|
||||
```bash
|
||||
python -m dotenv -f .vscode/.env run -- pytest backend/tests/integration
|
||||
```
|
||||
|
||||
### Playwright (E2E) Tests
|
||||
These tests are an even more complete version of the Integration Tests mentioned above. Has all services of Onyx
|
||||
running, *including* the Web Server.
|
||||
|
||||
Use these tests for anything that requires significant frontend <-> backend coordination.
|
||||
|
||||
Tests are located at `web/tests/e2e`. Tests are written in TypeScript.
|
||||
|
||||
To run them:
|
||||
|
||||
```bash
|
||||
npx playwright test <TEST_NAME>
|
||||
```
|
||||
|
||||
|
||||
## Logs
|
||||
|
||||
When (1) writing integration tests or (2) doing live tests (e.g. curl / playwright) you can get access
|
||||
to logs via the `backend/log/<service_name>_debug.log` file. All Onyx services (api_server, web_server, celery_X)
|
||||
will be tailing their logs to this file.
|
||||
|
||||
|
||||
## Security Considerations
|
||||
|
||||
- Never commit API keys or secrets to repository
|
||||
- Use encrypted credential storage for connector credentials
|
||||
- Follow RBAC patterns for new features
|
||||
- Implement proper input validation with Pydantic models
|
||||
- Use parameterized queries to prevent SQL injection
|
||||
|
||||
## AI/LLM Integration
|
||||
|
||||
- Multiple LLM providers supported via LiteLLM
|
||||
- Configurable models per feature (chat, search, embeddings)
|
||||
- Streaming support for real-time responses
|
||||
- Token management and rate limiting
|
||||
- Custom prompts and agent actions
|
||||
|
||||
## UI/UX Patterns
|
||||
|
||||
- Tailwind CSS with design system in `web/src/components/ui/`
|
||||
- Radix UI and Headless UI for accessible components
|
||||
- SWR for data fetching and caching
|
||||
- Form validation with react-hook-form
|
||||
- Error handling with popup notifications
|
||||
|
||||
## Creating a Plan
|
||||
When creating a plan in the `plans` directory, make sure to include at least these elements:
|
||||
|
||||
**Issues to Address**
|
||||
What the change is meant to do.
|
||||
|
||||
**Important Notes**
|
||||
Things you come across in your research that are important to the implementation.
|
||||
|
||||
**Implementation strategy**
|
||||
How you are going to make the changes happen. High level approach.
|
||||
|
||||
**Tests**
|
||||
What unit (use rarely), external dependency unit, integration, and playwright tests you plan to write to
|
||||
verify the correct behavior. Don't overtest. Usually, a given change only needs one type of test.
|
||||
|
||||
Do NOT include these: *Timeline*, *Rollback plan*
|
||||
|
||||
This is a minimal list - feel free to include more. Do NOT write code as part of your plan.
|
||||
Keep it high level. You can reference certain files or functions though.
|
||||
|
||||
Before writing your plan, make sure to do research. Explore the relevant sections in the codebase.
|
||||
295
CLAUDE.md
295
CLAUDE.md
@@ -1,295 +0,0 @@
|
||||
# CLAUDE.md
|
||||
|
||||
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||
|
||||
## KEY NOTES
|
||||
|
||||
- If you run into any missing python dependency errors, try running your command with `source backend/.venv/bin/activate` \
|
||||
to assume the python venv.
|
||||
- To make tests work, check the `.env` file at the root of the project to find an OpenAI key.
|
||||
- If using `playwright` to explore the frontend, you can usually log in with username `a@test.com` and password
|
||||
`a`. The app can be accessed at `http://localhost:3000`.
|
||||
- You should assume that all Onyx services are running. To verify, you can check the `backend/log` directory to
|
||||
make sure we see logs coming out from the relevant service.
|
||||
- To connect to the Postgres database, use: `docker exec -it onyx-relational_db-1 psql -U postgres -c "<SQL>"`
|
||||
- When making calls to the backend, always go through the frontend. E.g. make a call to `http://localhost:3000/api/persona` not `http://localhost:8080/api/persona`
|
||||
- Put ALL db operations under the `backend/onyx/db` / `backend/ee/onyx/db` directories. Don't run queries
|
||||
outside of those directories.
|
||||
|
||||
## Project Overview
|
||||
|
||||
**Onyx** (formerly Danswer) is an open-source Gen-AI and Enterprise Search platform that connects to company documents, apps, and people. It features a modular architecture with both Community Edition (MIT licensed) and Enterprise Edition offerings.
|
||||
|
||||
|
||||
### Background Workers (Celery)
|
||||
|
||||
Onyx uses Celery for asynchronous task processing with multiple specialized workers:
|
||||
|
||||
#### Worker Types
|
||||
|
||||
1. **Primary Worker** (`celery_app.py`)
|
||||
- Coordinates core background tasks and system-wide operations
|
||||
- Handles connector management, document sync, pruning, and periodic checks
|
||||
- Runs with 4 threads concurrency
|
||||
- Tasks: connector deletion, vespa sync, pruning, LLM model updates, user file sync
|
||||
|
||||
2. **Docfetching Worker** (`docfetching`)
|
||||
- Fetches documents from external data sources (connectors)
|
||||
- Spawns docprocessing tasks for each document batch
|
||||
- Implements watchdog monitoring for stuck connectors
|
||||
- Configurable concurrency (default from env)
|
||||
|
||||
3. **Docprocessing Worker** (`docprocessing`)
|
||||
- Processes fetched documents through the indexing pipeline:
|
||||
- Upserts documents to PostgreSQL
|
||||
- Chunks documents and adds contextual information
|
||||
- Embeds chunks via model server
|
||||
- Writes chunks to Vespa vector database
|
||||
- Updates document metadata
|
||||
- Configurable concurrency (default from env)
|
||||
|
||||
4. **Light Worker** (`light`)
|
||||
- Handles lightweight, fast operations
|
||||
- Tasks: vespa operations, document permissions sync, external group sync
|
||||
- Higher concurrency for quick tasks
|
||||
|
||||
5. **Heavy Worker** (`heavy`)
|
||||
- Handles resource-intensive operations
|
||||
- Primary task: document pruning operations
|
||||
- Runs with 4 threads concurrency
|
||||
|
||||
6. **KG Processing Worker** (`kg_processing`)
|
||||
- Handles Knowledge Graph processing and clustering
|
||||
- Builds relationships between documents
|
||||
- Runs clustering algorithms
|
||||
- Configurable concurrency
|
||||
|
||||
7. **Monitoring Worker** (`monitoring`)
|
||||
- System health monitoring and metrics collection
|
||||
- Monitors Celery queues, process memory, and system status
|
||||
- Single thread (monitoring doesn't need parallelism)
|
||||
- Cloud-specific monitoring tasks
|
||||
|
||||
8. **Beat Worker** (`beat`)
|
||||
- Celery's scheduler for periodic tasks
|
||||
- Uses DynamicTenantScheduler for multi-tenant support
|
||||
- Schedules tasks like:
|
||||
- Indexing checks (every 15 seconds)
|
||||
- Connector deletion checks (every 20 seconds)
|
||||
- Vespa sync checks (every 20 seconds)
|
||||
- Pruning checks (every 20 seconds)
|
||||
- KG processing (every 60 seconds)
|
||||
- Monitoring tasks (every 5 minutes)
|
||||
- Cleanup tasks (hourly)
|
||||
|
||||
#### Key Features
|
||||
|
||||
- **Thread-based Workers**: All workers use thread pools (not processes) for stability
|
||||
- **Tenant Awareness**: Multi-tenant support with per-tenant task isolation. There is a
|
||||
middleware layer that automatically finds the appropriate tenant ID when sending tasks
|
||||
via Celery Beat.
|
||||
- **Task Prioritization**: High, Medium, Low priority queues
|
||||
- **Monitoring**: Built-in heartbeat and liveness checking
|
||||
- **Failure Handling**: Automatic retry and failure recovery mechanisms
|
||||
- **Redis Coordination**: Inter-process communication via Redis
|
||||
- **PostgreSQL State**: Task state and metadata stored in PostgreSQL
|
||||
|
||||
|
||||
#### Important Notes
|
||||
|
||||
**Defining Tasks**:
|
||||
- Always use `@shared_task` rather than `@celery_app`
|
||||
- Put tasks under `background/celery/tasks/` or `ee/background/celery/tasks`
|
||||
|
||||
**Defining APIs**:
|
||||
When creating new FastAPI APIs, do NOT use the `response_model` field. Instead, just type the
|
||||
function.
|
||||
|
||||
**Testing Updates**:
|
||||
If you make any updates to a celery worker and you want to test these changes, you will need
|
||||
to ask me to restart the celery worker. There is no auto-restart on code-change mechanism.
|
||||
|
||||
### Code Quality
|
||||
```bash
|
||||
# Install and run pre-commit hooks
|
||||
pre-commit install
|
||||
pre-commit run --all-files
|
||||
```
|
||||
|
||||
NOTE: Always make sure everything is strictly typed (both in Python and Typescript).
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
### Technology Stack
|
||||
- **Backend**: Python 3.11, FastAPI, SQLAlchemy, Alembic, Celery
|
||||
- **Frontend**: Next.js 15+, React 18, TypeScript, Tailwind CSS
|
||||
- **Database**: PostgreSQL with Redis caching
|
||||
- **Search**: Vespa vector database
|
||||
- **Auth**: OAuth2, SAML, multi-provider support
|
||||
- **AI/ML**: LangChain, LiteLLM, multiple embedding models
|
||||
|
||||
### Directory Structure
|
||||
|
||||
```
|
||||
backend/
|
||||
├── onyx/
|
||||
│ ├── auth/ # Authentication & authorization
|
||||
│ ├── chat/ # Chat functionality & LLM interactions
|
||||
│ ├── connectors/ # Data source connectors
|
||||
│ ├── db/ # Database models & operations
|
||||
│ ├── document_index/ # Vespa integration
|
||||
│ ├── federated_connectors/ # External search connectors
|
||||
│ ├── llm/ # LLM provider integrations
|
||||
│ └── server/ # API endpoints & routers
|
||||
├── ee/ # Enterprise Edition features
|
||||
├── alembic/ # Database migrations
|
||||
└── tests/ # Test suites
|
||||
|
||||
web/
|
||||
├── src/app/ # Next.js app router pages
|
||||
├── src/components/ # Reusable React components
|
||||
└── src/lib/ # Utilities & business logic
|
||||
```
|
||||
|
||||
## Database & Migrations
|
||||
|
||||
### Running Migrations
|
||||
```bash
|
||||
# Standard migrations
|
||||
alembic upgrade head
|
||||
|
||||
# Multi-tenant (Enterprise)
|
||||
alembic -n schema_private upgrade head
|
||||
```
|
||||
|
||||
### Creating Migrations
|
||||
```bash
|
||||
# Auto-generate migration
|
||||
alembic revision --autogenerate -m "description"
|
||||
|
||||
# Multi-tenant migration
|
||||
alembic -n schema_private revision --autogenerate -m "description"
|
||||
```
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
There are 4 main types of tests within Onyx:
|
||||
|
||||
### Unit Tests
|
||||
These should not assume any Onyx/external services are available to be called.
|
||||
Interactions with the outside world should be mocked using `unittest.mock`. Generally, only
|
||||
write these for complex, isolated modules e.g. `citation_processing.py`.
|
||||
|
||||
To run them:
|
||||
|
||||
```bash
|
||||
python -m dotenv -f .vscode/.env run -- pytest -xv backend/tests/unit
|
||||
```
|
||||
|
||||
### External Dependency Unit Tests
|
||||
These tests assume that all external dependencies of Onyx are available and callable (e.g. Postgres, Redis,
|
||||
MinIO/S3, Vespa are running + OpenAI can be called + any request to the internet is fine + etc.).
|
||||
|
||||
However, the actual Onyx containers are not running and with these tests we call the function to test directly.
|
||||
We can also mock components/calls at will.
|
||||
|
||||
The goal with these tests are to minimize mocking while giving some flexibility to mock things that are flakey,
|
||||
need strictly controlled behavior, or need to have their internal behavior validated (e.g. verify a function is called
|
||||
with certain args, something that would be impossible with proper integration tests).
|
||||
|
||||
A great example of this type of test is `backend/tests/external_dependency_unit/connectors/confluence/test_confluence_group_sync.py`.
|
||||
|
||||
To run them:
|
||||
|
||||
```bash
|
||||
python -m dotenv -f .vscode/.env run -- pytest backend/tests/external_dependency_unit
|
||||
```
|
||||
|
||||
### Integration Tests
|
||||
Standard integration tests. Every test in `backend/tests/integration` runs against a real Onyx deployment. We cannot
|
||||
mock anything in these tests. Prefer writing integration tests (or External Dependency Unit Tests if mocking/internal
|
||||
verification is necessary) over any other type of test.
|
||||
|
||||
Tests are parallelized at a directory level.
|
||||
|
||||
When writing integration tests, make sure to check the root `conftest.py` for useful fixtures + the `backend/tests/integration/common_utils` directory for utilities. Prefer (if one exists), calling the appropriate Manager
|
||||
class in the utils over directly calling the APIs with a library like `requests`. Prefer using fixtures rather than
|
||||
calling the utilities directly (e.g. do NOT create admin users with
|
||||
`admin_user = UserManager.create(name="admin_user")`, instead use the `admin_user` fixture).
|
||||
|
||||
A great example of this type of test is `backend/tests/integration/dev_apis/test_simple_chat_api.py`.
|
||||
|
||||
To run them:
|
||||
|
||||
```bash
|
||||
python -m dotenv -f .vscode/.env run -- pytest backend/tests/integration
|
||||
```
|
||||
|
||||
### Playwright (E2E) Tests
|
||||
These tests are an even more complete version of the Integration Tests mentioned above. Has all services of Onyx
|
||||
running, *including* the Web Server.
|
||||
|
||||
Use these tests for anything that requires significant frontend <-> backend coordination.
|
||||
|
||||
Tests are located at `web/tests/e2e`. Tests are written in TypeScript.
|
||||
|
||||
To run them:
|
||||
|
||||
```bash
|
||||
npx playwright test <TEST_NAME>
|
||||
```
|
||||
|
||||
|
||||
## Logs
|
||||
|
||||
When (1) writing integration tests or (2) doing live tests (e.g. curl / playwright) you can get access
|
||||
to logs via the `backend/log/<service_name>_debug.log` file. All Onyx services (api_server, web_server, celery_X)
|
||||
will be tailing their logs to this file.
|
||||
|
||||
|
||||
## Security Considerations
|
||||
|
||||
- Never commit API keys or secrets to repository
|
||||
- Use encrypted credential storage for connector credentials
|
||||
- Follow RBAC patterns for new features
|
||||
- Implement proper input validation with Pydantic models
|
||||
- Use parameterized queries to prevent SQL injection
|
||||
|
||||
## AI/LLM Integration
|
||||
|
||||
- Multiple LLM providers supported via LiteLLM
|
||||
- Configurable models per feature (chat, search, embeddings)
|
||||
- Streaming support for real-time responses
|
||||
- Token management and rate limiting
|
||||
- Custom prompts and agent actions
|
||||
|
||||
## UI/UX Patterns
|
||||
|
||||
- Tailwind CSS with design system in `web/src/components/ui/`
|
||||
- Radix UI and Headless UI for accessible components
|
||||
- SWR for data fetching and caching
|
||||
- Form validation with react-hook-form
|
||||
- Error handling with popup notifications
|
||||
|
||||
## Creating a Plan
|
||||
When creating a plan in the `plans` directory, make sure to include at least these elements:
|
||||
|
||||
**Issues to Address**
|
||||
What the change is meant to do.
|
||||
|
||||
**Important Notes**
|
||||
Things you come across in your research that are important to the implementation.
|
||||
|
||||
**Implementation strategy**
|
||||
How you are going to make the changes happen. High level approach.
|
||||
|
||||
**Tests**
|
||||
What unit (use rarely), external dependency unit, integration, and playwright tests you plan to write to
|
||||
verify the correct behavior. Don't overtest. Usually, a given change only needs one type of test.
|
||||
|
||||
Do NOT include these: *Timeline*, *Rollback plan*
|
||||
|
||||
This is a minimal list - feel free to include more. Do NOT write code as part of your plan.
|
||||
Keep it high level. You can reference certain files or functions though.
|
||||
|
||||
Before writing your plan, make sure to do research. Explore the relevant sections in the codebase.
|
||||
@@ -1,4 +1,4 @@
|
||||
<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/CONTRIBUTING.md"} -->
|
||||
<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/CONTRIBUTING.md"} -->
|
||||
|
||||
# Contributing to Onyx
|
||||
|
||||
@@ -12,8 +12,8 @@ As an open source project in a rapidly changing space, we welcome all contributi
|
||||
|
||||
The [GitHub Issues](https://github.com/onyx-dot-app/onyx/issues) page is a great place to start for contribution ideas.
|
||||
|
||||
To ensure that your contribution is aligned with the project's direction, please reach out to any maintainer on the Onyx team
|
||||
via [Slack](https://join.slack.com/t/onyx-dot-app/shared_invite/zt-34lu4m7xg-TsKGO6h8PDvR5W27zTdyhA) /
|
||||
To ensure that your contribution is aligned with the project's direction, please reach out to Hagen (or any other maintainer) on the Onyx team
|
||||
via [Slack](https://join.slack.com/t/onyx-dot-app/shared_invite/zt-2twesxdr6-5iQitKZQpgq~hYIZ~dv3KA) /
|
||||
[Discord](https://discord.gg/TDJ59cGV2X) or [email](mailto:founders@onyx.app).
|
||||
|
||||
Issues that have been explicitly approved by the maintainers (aligned with the direction of the project)
|
||||
@@ -28,7 +28,7 @@ Your input is vital to making sure that Onyx moves in the right direction.
|
||||
Before starting on implementation, please raise a GitHub issue.
|
||||
|
||||
Also, always feel free to message the founders (Chris Weaver / Yuhong Sun) on
|
||||
[Slack](https://join.slack.com/t/onyx-dot-app/shared_invite/zt-34lu4m7xg-TsKGO6h8PDvR5W27zTdyhA) /
|
||||
[Slack](https://join.slack.com/t/onyx-dot-app/shared_invite/zt-2twesxdr6-5iQitKZQpgq~hYIZ~dv3KA) /
|
||||
[Discord](https://discord.gg/TDJ59cGV2X) directly about anything at all.
|
||||
|
||||
### Contributing Code
|
||||
@@ -59,7 +59,6 @@ Onyx being a fully functional app, relies on some external software, specificall
|
||||
- [Postgres](https://www.postgresql.org/) (Relational DB)
|
||||
- [Vespa](https://vespa.ai/) (Vector DB/Search Engine)
|
||||
- [Redis](https://redis.io/) (Cache)
|
||||
- [MinIO](https://min.io/) (File Store)
|
||||
- [Nginx](https://nginx.org/) (Not needed for development flows generally)
|
||||
|
||||
> **Note:**
|
||||
@@ -84,6 +83,10 @@ python -m venv .venv
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
> **Note:**
|
||||
> This virtual environment MUST NOT be set up WITHIN the onyx directory if you plan on using mypy within certain IDEs.
|
||||
> For simplicity, we recommend setting up the virtual environment outside of the onyx directory.
|
||||
|
||||
_For Windows, activate the virtual environment using Command Prompt:_
|
||||
|
||||
```bash
|
||||
@@ -99,10 +102,10 @@ If using PowerShell, the command slightly differs:
|
||||
Install the required python dependencies:
|
||||
|
||||
```bash
|
||||
pip install -r backend/requirements/default.txt
|
||||
pip install -r backend/requirements/dev.txt
|
||||
pip install -r backend/requirements/ee.txt
|
||||
pip install -r backend/requirements/model_server.txt
|
||||
pip install -r onyx/backend/requirements/default.txt
|
||||
pip install -r onyx/backend/requirements/dev.txt
|
||||
pip install -r onyx/backend/requirements/ee.txt
|
||||
pip install -r onyx/backend/requirements/model_server.txt
|
||||
```
|
||||
|
||||
Install Playwright for Python (headless browser required by the Web Connector)
|
||||
@@ -168,10 +171,10 @@ Otherwise, you can follow the instructions below to run the application for deve
|
||||
|
||||
You will need Docker installed to run these containers.
|
||||
|
||||
First navigate to `onyx/deployment/docker_compose`, then start up Postgres/Vespa/Redis/MinIO with:
|
||||
First navigate to `onyx/deployment/docker_compose`, then start up Postgres/Vespa/Redis with:
|
||||
|
||||
```bash
|
||||
docker compose up -d index relational_db cache minio
|
||||
docker compose -f docker-compose.dev.yml -p onyx-stack up -d index relational_db cache
|
||||
```
|
||||
|
||||
(index refers to Vespa, relational_db refers to Postgres, and cache refers to Redis)
|
||||
@@ -253,7 +256,7 @@ You can run the full Onyx application stack from pre-built images including all
|
||||
Navigate to `onyx/deployment/docker_compose` and run:
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
docker compose -f docker-compose.dev.yml -p onyx-stack up -d
|
||||
```
|
||||
|
||||
After Docker pulls and starts these containers, navigate to `http://localhost:3000` to use Onyx.
|
||||
@@ -261,7 +264,7 @@ After Docker pulls and starts these containers, navigate to `http://localhost:30
|
||||
If you want to make changes to Onyx and run those changes in Docker, you can also build a local version of the Onyx container images that incorporates your changes like so:
|
||||
|
||||
```bash
|
||||
docker compose up -d --build
|
||||
docker compose -f docker-compose.dev.yml -p onyx-stack up -d --build
|
||||
```
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ This guide explains how to set up and use VSCode's debugging capabilities with t
|
||||
## Initial Setup
|
||||
|
||||
1. **Environment Setup**:
|
||||
- Copy `.vscode/env_template.txt` to `.vscode/.env`
|
||||
- Copy `.vscode/.env.template` to `.vscode/.env`
|
||||
- Fill in the necessary environment variables in `.vscode/.env`
|
||||
2. **launch.json**:
|
||||
- Copy `.vscode/launch.template.jsonc` to `.vscode/launch.json`
|
||||
@@ -17,9 +17,10 @@ Before starting, make sure the Docker Daemon is running.
|
||||
1. Open the Debug view in VSCode (Cmd+Shift+D on macOS)
|
||||
2. From the dropdown at the top, select "Clear and Restart External Volumes and Containers" and press the green play button
|
||||
3. From the dropdown at the top, select "Run All Onyx Services" and press the green play button
|
||||
4. Now, you can navigate to onyx in your browser (default is http://localhost:3000) and start using the app
|
||||
5. You can set breakpoints by clicking to the left of line numbers to help debug while the app is running
|
||||
6. Use the debug toolbar to step through code, inspect variables, etc.
|
||||
4. CD into web, run "npm i" followed by npm run dev.
|
||||
5. Now, you can navigate to onyx in your browser (default is http://localhost:3000) and start using the app
|
||||
6. You can set breakpoints by clicking to the left of line numbers to help debug while the app is running
|
||||
7. Use the debug toolbar to step through code, inspect variables, etc.
|
||||
|
||||
## Features
|
||||
|
||||
|
||||
134
README.md
134
README.md
@@ -1,103 +1,117 @@
|
||||
<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/README.md"} -->
|
||||
|
||||
<a name="readme-top"></a>
|
||||
|
||||
<h2 align="center">
|
||||
<a href="https://www.onyx.app/"> <img width="50%" src="https://github.com/onyx-dot-app/onyx/blob/logo/OnyxLogoCropped.jpg?raw=true)" /></a>
|
||||
<a href="https://www.onyx.app/"> <img width="50%" src="https://github.com/onyx-dot-app/onyx/blob/logo/OnyxLogoCropped.jpg?raw=true)" /></a>
|
||||
</h2>
|
||||
|
||||
<p align="center">Open Source AI Platform</p>
|
||||
<p align="center">
|
||||
<p align="center">Open Source Gen-AI + Enterprise Search.</p>
|
||||
|
||||
<p align="center">
|
||||
<a href="https://discord.gg/TDJ59cGV2X" target="_blank">
|
||||
<img src="https://img.shields.io/badge/discord-join-blue.svg?logo=discord&logoColor=white" alt="Discord">
|
||||
</a>
|
||||
<a href="https://docs.onyx.app/" target="_blank">
|
||||
<img src="https://img.shields.io/badge/docs-view-blue" alt="Documentation">
|
||||
</a>
|
||||
<a href="https://docs.onyx.app/" target="_blank">
|
||||
<img src="https://img.shields.io/website?url=https://www.onyx.app&up_message=visit&up_color=blue" alt="Documentation">
|
||||
</a>
|
||||
<a href="https://github.com/onyx-dot-app/onyx/blob/main/LICENSE" target="_blank">
|
||||
<img src="https://img.shields.io/static/v1?label=license&message=MIT&color=blue" alt="License">
|
||||
</a>
|
||||
<a href="https://docs.onyx.app/" target="_blank">
|
||||
<img src="https://img.shields.io/badge/docs-view-blue" alt="Documentation">
|
||||
</a>
|
||||
<a href="https://join.slack.com/t/onyx-dot-app/shared_invite/zt-2twesxdr6-5iQitKZQpgq~hYIZ~dv3KA" target="_blank">
|
||||
<img src="https://img.shields.io/badge/slack-join-blue.svg?logo=slack" alt="Slack">
|
||||
</a>
|
||||
<a href="https://discord.gg/TDJ59cGV2X" target="_blank">
|
||||
<img src="https://img.shields.io/badge/discord-join-blue.svg?logo=discord&logoColor=white" alt="Discord">
|
||||
</a>
|
||||
<a href="https://github.com/onyx-dot-app/onyx/blob/main/README.md" target="_blank">
|
||||
<img src="https://img.shields.io/static/v1?label=license&message=MIT&color=blue" alt="License">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<strong>[Onyx](https://www.onyx.app/)</strong> (formerly Danswer) is the AI platform connected to your company's docs, apps, and people.
|
||||
Onyx provides a feature rich Chat interface and plugs into any LLM of your choice.
|
||||
Keep knowledge and access controls sync-ed across over 40 connectors like Google Drive, Slack, Confluence, Salesforce, etc.
|
||||
Create custom AI agents with unique prompts, knowledge, and actions that the agents can take.
|
||||
Onyx can be deployed securely anywhere and for any scale - on a laptop, on-premise, or to cloud.
|
||||
|
||||
|
||||
**[Onyx](https://www.onyx.app/)** is a feature-rich, self-hostable Chat UI that works with any LLM. It is easy to deploy and can run in a completely airgapped environment.
|
||||
<h3>Feature Highlights</h3>
|
||||
|
||||
Onyx comes loaded with advanced features like Agents, Web Search, RAG, MCP, Deep Research, Connectors to 40+ knowledge sources, and more.
|
||||
**Deep research over your team's knowledge:**
|
||||
|
||||
> [!TIP]
|
||||
> Run Onyx with one command (or see deployment section below):
|
||||
> ```
|
||||
> curl -fsSL https://raw.githubusercontent.com/onyx-dot-app/onyx/main/deployment/docker_compose/install.sh > install.sh && chmod +x install.sh && ./install.sh
|
||||
> ```
|
||||
https://private-user-images.githubusercontent.com/32520769/414509312-48392e83-95d0-4fb5-8650-a396e05e0a32.mp4?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3Mzk5Mjg2MzYsIm5iZiI6MTczOTkyODMzNiwicGF0aCI6Ii8zMjUyMDc2OS80MTQ1MDkzMTItNDgzOTJlODMtOTVkMC00ZmI1LTg2NTAtYTM5NmUwNWUwYTMyLm1wND9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNTAyMTklMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjUwMjE5VDAxMjUzNlomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPWFhMzk5Njg2Y2Y5YjFmNDNiYTQ2YzM5ZTg5YWJiYTU2NWMyY2YwNmUyODE2NWUxMDRiMWQxZWJmODI4YTA0MTUmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0In0.a9D8A0sgKE9AoaoE-mfFbJ6_OKYeqaf7TZ4Han2JfW8
|
||||
|
||||
****
|
||||
|
||||
**Use Onyx as a secure AI Chat with any LLM:**
|
||||
|
||||

|
||||
|
||||
|
||||
**Easily set up connectors to your apps:**
|
||||
|
||||
## ⭐ Features
|
||||
- **🤖 Custom Agents:** Build AI Agents with unique instructions, knowledge and actions.
|
||||
- **🌍 Web Search:** Browse the web with Google PSE, Exa, and Serper as well as an in-house scraper or Firecrawl.
|
||||
- **🔍 RAG:** Best in class hybrid-search + knowledge graph for uploaded files and ingested documents from connectors.
|
||||
- **🔄 Connectors:** Pull knowledge, metadata, and access information from over 40 applications.
|
||||
- **🔬 Deep Research:** Get in depth answers with an agentic multi-step search.
|
||||
- **▶️ Actions & MCP:** Give AI Agents the ability to interact with external systems.
|
||||
- **💻 Code Interpreter:** Execute code to analyze data, render graphs and create files.
|
||||
- **🎨 Image Generation:** Generate images based on user prompts.
|
||||
- **👥 Collaboration:** Chat sharing, feedback gathering, user management, usage analytics, and more.
|
||||
|
||||
Onyx works with all LLMs (like OpenAI, Anthropic, Gemini, etc.) and self-hosted LLMs (like Ollama, vLLM, etc.)
|
||||
|
||||
To learn more about the features, check out our [documentation](https://docs.onyx.app/welcome)!
|
||||

|
||||
|
||||
|
||||
**Access Onyx where your team already works:**
|
||||
|
||||
## 🚀 Deployment
|
||||
Onyx supports deployments in Docker, Kubernetes, Terraform, along with guides for major cloud providers.
|
||||
|
||||
See guides below:
|
||||
- [Docker](https://docs.onyx.app/deployment/local/docker) or [Quickstart](https://docs.onyx.app/deployment/getting_started/quickstart) (best for most users)
|
||||
- [Kubernetes](https://docs.onyx.app/deployment/local/kubernetes) (best for large teams)
|
||||
- [Terraform](https://docs.onyx.app/deployment/local/terraform) (best for teams already using Terraform)
|
||||
- Cloud specific guides (best if specifically using [AWS EKS](https://docs.onyx.app/deployment/cloud/aws/eks), [Azure VMs](https://docs.onyx.app/deployment/cloud/azure), etc.)
|
||||
|
||||
> [!TIP]
|
||||
> **To try Onyx for free without deploying, check out [Onyx Cloud](https://cloud.onyx.app/signup)**.
|
||||

|
||||
|
||||
|
||||
## Deployment
|
||||
**To try it out for free and get started in seconds, check out [Onyx Cloud](https://cloud.onyx.app/signup)**.
|
||||
|
||||
## 🔍 Other Notable Benefits
|
||||
Onyx is built for teams of all sizes, from individual users to the largest global enterprises.
|
||||
Onyx can also be run locally (even on a laptop) or deployed on a virtual machine with a single
|
||||
`docker compose` command. Checkout our [docs](https://docs.onyx.app/quickstart) to learn more.
|
||||
|
||||
- **Enterprise Search**: far more than simple RAG, Onyx has custom indexing and retrieval that remains performant and accurate for scales of up to tens of millions of documents.
|
||||
- **Security**: SSO (OIDC/SAML/OAuth2), RBAC, encryption of credentials, etc.
|
||||
- **Management UI**: different user roles such as basic, curator, and admin.
|
||||
- **Document Permissioning**: mirrors user access from external apps for RAG use cases.
|
||||
We also have built-in support for high-availability/scalable deployment on Kubernetes.
|
||||
References [here](https://github.com/onyx-dot-app/onyx/tree/main/deployment).
|
||||
|
||||
|
||||
## 🔍 Other Notable Benefits of Onyx
|
||||
- Custom deep learning models for indexing and inference time, only through Onyx + learning from user feedback.
|
||||
- Flexible security features like SSO (OIDC/SAML/OAuth2), RBAC, encryption of credentials, etc.
|
||||
- Knowledge curation features like document-sets, query history, usage analytics, etc.
|
||||
- Scalable deployment options tested up to many tens of thousands users and hundreds of millions of documents.
|
||||
|
||||
|
||||
## 🚧 Roadmap
|
||||
To see ongoing and upcoming projects, check out our [roadmap](https://github.com/orgs/onyx-dot-app/projects/2)!
|
||||
- New methods in information retrieval (StructRAG, LightGraphRAG, etc.)
|
||||
- Personalized Search
|
||||
- Organizational understanding and ability to locate and suggest experts from your team.
|
||||
- Code Search
|
||||
- SQL and Structured Query Language
|
||||
|
||||
|
||||
## 🔌 Connectors
|
||||
Keep knowledge and access up to sync across 40+ connectors:
|
||||
|
||||
- Google Drive
|
||||
- Confluence
|
||||
- Slack
|
||||
- Gmail
|
||||
- Salesforce
|
||||
- Microsoft Sharepoint
|
||||
- Github
|
||||
- Jira
|
||||
- Zendesk
|
||||
- Gong
|
||||
- Microsoft Teams
|
||||
- Dropbox
|
||||
- Local Files
|
||||
- Websites
|
||||
- And more ...
|
||||
|
||||
See the full list [here](https://docs.onyx.app/connectors).
|
||||
|
||||
|
||||
## 📚 Licensing
|
||||
There are two editions of Onyx:
|
||||
|
||||
- Onyx Community Edition (CE) is available freely under the MIT license.
|
||||
- Onyx Community Edition (CE) is available freely under the MIT Expat license. Simply follow the Deployment guide above.
|
||||
- Onyx Enterprise Edition (EE) includes extra features that are primarily useful for larger organizations.
|
||||
For feature details, check out [our website](https://www.onyx.app/pricing).
|
||||
|
||||
|
||||
|
||||
## 👪 Community
|
||||
Join our open source community on **[Discord](https://discord.gg/TDJ59cGV2X)**!
|
||||
|
||||
To try the Onyx Enterprise Edition:
|
||||
1. Checkout [Onyx Cloud](https://cloud.onyx.app/signup).
|
||||
2. For self-hosting the Enterprise Edition, contact us at [founders@onyx.app](mailto:founders@onyx.app) or book a call with us on our [Cal](https://cal.com/team/onyx/founders).
|
||||
|
||||
|
||||
## 💡 Contributing
|
||||
Looking to contribute? Please check out the [Contribution Guide](CONTRIBUTING.md) for more details.
|
||||
|
||||
|
||||
4
backend/.gitignore
vendored
4
backend/.gitignore
vendored
@@ -9,6 +9,4 @@ api_keys.py
|
||||
vespa-app.zip
|
||||
dynamic_config_storage/
|
||||
celerybeat-schedule*
|
||||
onyx/connectors/salesforce/data/
|
||||
.test.env
|
||||
/generated
|
||||
onyx/connectors/salesforce/data/
|
||||
@@ -12,8 +12,7 @@ ARG ONYX_VERSION=0.0.0-dev
|
||||
# DO_NOT_TRACK is used to disable telemetry for Unstructured
|
||||
ENV ONYX_VERSION=${ONYX_VERSION} \
|
||||
DANSWER_RUNNING_IN_DOCKER="true" \
|
||||
DO_NOT_TRACK="true" \
|
||||
PLAYWRIGHT_BROWSERS_PATH="/app/.cache/ms-playwright"
|
||||
DO_NOT_TRACK="true"
|
||||
|
||||
|
||||
RUN echo "ONYX_VERSION: ${ONYX_VERSION}"
|
||||
@@ -78,9 +77,6 @@ RUN apt-get update && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
rm -f /usr/local/lib/python3.11/site-packages/tornado/test/test.key
|
||||
|
||||
# Install postgresql-client for easy manual tests
|
||||
# Install it here to avoid it being cleaned up above
|
||||
RUN apt-get update && apt-get install -y postgresql-client
|
||||
|
||||
# Pre-downloading models for setups with limited egress
|
||||
RUN python -c "from tokenizers import Tokenizer; \
|
||||
@@ -89,7 +85,7 @@ Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"
|
||||
# Pre-downloading NLTK for setups with limited egress
|
||||
RUN python -c "import nltk; \
|
||||
nltk.download('stopwords', quiet=True); \
|
||||
nltk.download('punkt_tab', quiet=True);"
|
||||
nltk.download('punkt', quiet=True);"
|
||||
# nltk.download('wordnet', quiet=True); introduce this back if lemmatization is needed
|
||||
|
||||
# Set up application files
|
||||
@@ -117,14 +113,6 @@ COPY ./assets /app/assets
|
||||
|
||||
ENV PYTHONPATH=/app
|
||||
|
||||
# Create non-root user for security best practices
|
||||
RUN groupadd -g 1001 onyx && \
|
||||
useradd -u 1001 -g onyx -m -s /bin/bash onyx && \
|
||||
chown -R onyx:onyx /app && \
|
||||
mkdir -p /var/log/onyx && \
|
||||
chmod 755 /var/log/onyx && \
|
||||
chown onyx:onyx /var/log/onyx
|
||||
|
||||
# Default command which does nothing
|
||||
# This container is used by api server and background which specify their own CMD
|
||||
CMD ["tail", "-f", "/dev/null"]
|
||||
|
||||
@@ -9,36 +9,11 @@ visit https://github.com/onyx-dot-app/onyx."
|
||||
# Default ONYX_VERSION, typically overriden during builds by GitHub Actions.
|
||||
ARG ONYX_VERSION=0.0.0-dev
|
||||
ENV ONYX_VERSION=${ONYX_VERSION} \
|
||||
DANSWER_RUNNING_IN_DOCKER="true" \
|
||||
HF_HOME=/app/.cache/huggingface
|
||||
DANSWER_RUNNING_IN_DOCKER="true"
|
||||
|
||||
|
||||
RUN echo "ONYX_VERSION: ${ONYX_VERSION}"
|
||||
|
||||
# Create non-root user for security best practices
|
||||
RUN mkdir -p /app && \
|
||||
groupadd -g 1001 onyx && \
|
||||
useradd -u 1001 -g onyx -m -s /bin/bash onyx && \
|
||||
chown -R onyx:onyx /app && \
|
||||
mkdir -p /var/log/onyx && \
|
||||
chmod 755 /var/log/onyx && \
|
||||
chown onyx:onyx /var/log/onyx
|
||||
|
||||
# --- add toolchain needed for Rust/Python builds (fastuuid) ---
|
||||
ENV RUSTUP_HOME=/usr/local/rustup \
|
||||
CARGO_HOME=/usr/local/cargo \
|
||||
PATH=/usr/local/cargo/bin:$PATH
|
||||
|
||||
RUN set -eux; \
|
||||
apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
pkg-config \
|
||||
curl \
|
||||
ca-certificates \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
# Install latest stable Rust (supports Cargo.lock v4)
|
||||
&& curl -sSf https://sh.rustup.rs | sh -s -- -y --profile minimal --default-toolchain stable \
|
||||
&& rustc --version && cargo --version
|
||||
|
||||
COPY ./requirements/model_server.txt /tmp/requirements.txt
|
||||
RUN pip install --no-cache-dir --upgrade \
|
||||
--retries 5 \
|
||||
@@ -63,11 +38,9 @@ snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
|
||||
from sentence_transformers import SentenceTransformer; \
|
||||
SentenceTransformer(model_name_or_path='nomic-ai/nomic-embed-text-v1', trust_remote_code=True);"
|
||||
|
||||
# In case the user has volumes mounted to /app/.cache/huggingface that they've downloaded while
|
||||
# running Onyx, move the current contents of the cache folder to a temporary location to ensure
|
||||
# it's preserved in order to combine with the user's cache contents
|
||||
RUN mv /app/.cache/huggingface /app/.cache/temp_huggingface && \
|
||||
chown -R onyx:onyx /app
|
||||
# In case the user has volumes mounted to /root/.cache/huggingface that they've downloaded while
|
||||
# running Onyx, don't overwrite it with the built in cache folder
|
||||
RUN mv /root/.cache/huggingface /root/.cache/temp_huggingface
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
<!-- ONYX_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/backend/alembic/README.md"} -->
|
||||
<!-- DANSWER_METADATA={"link": "https://github.com/onyx-dot-app/onyx/blob/main/backend/alembic/README.md"} -->
|
||||
|
||||
# Alembic DB Migrations
|
||||
|
||||
@@ -20,44 +20,3 @@ To run all un-applied migrations:
|
||||
To undo migrations:
|
||||
`alembic downgrade -X`
|
||||
where X is the number of migrations you want to undo from the current state
|
||||
|
||||
### Multi-tenant migrations
|
||||
|
||||
For multi-tenant deployments, you can use additional options:
|
||||
|
||||
**Upgrade all tenants:**
|
||||
```bash
|
||||
alembic -x upgrade_all_tenants=true upgrade head
|
||||
```
|
||||
|
||||
**Upgrade specific schemas:**
|
||||
```bash
|
||||
# Single schema
|
||||
alembic -x schemas=tenant_12345678-1234-1234-1234-123456789012 upgrade head
|
||||
|
||||
# Multiple schemas (comma-separated)
|
||||
alembic -x schemas=tenant_12345678-1234-1234-1234-123456789012,public,another_tenant upgrade head
|
||||
```
|
||||
|
||||
**Upgrade tenants within an alphabetical range:**
|
||||
```bash
|
||||
# Upgrade tenants 100-200 when sorted alphabetically (positions 100 to 200)
|
||||
alembic -x upgrade_all_tenants=true -x tenant_range_start=100 -x tenant_range_end=200 upgrade head
|
||||
|
||||
# Upgrade tenants starting from position 1000 alphabetically
|
||||
alembic -x upgrade_all_tenants=true -x tenant_range_start=1000 upgrade head
|
||||
|
||||
# Upgrade first 500 tenants alphabetically
|
||||
alembic -x upgrade_all_tenants=true -x tenant_range_end=500 upgrade head
|
||||
```
|
||||
|
||||
**Continue on error (for batch operations):**
|
||||
```bash
|
||||
alembic -x upgrade_all_tenants=true -x continue=true upgrade head
|
||||
```
|
||||
|
||||
The tenant range filtering works by:
|
||||
1. Sorting tenant IDs alphabetically
|
||||
2. Using 1-based position numbers (1st, 2nd, 3rd tenant, etc.)
|
||||
3. Filtering to the specified range of positions
|
||||
4. Non-tenant schemas (like 'public') are always included
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
from typing import Any, Literal
|
||||
from onyx.db.engine.iam_auth import get_iam_auth_token
|
||||
from onyx.db.engine import get_iam_auth_token
|
||||
from onyx.configs.app_configs import USE_IAM_AUTH
|
||||
from onyx.configs.app_configs import POSTGRES_HOST
|
||||
from onyx.configs.app_configs import POSTGRES_PORT
|
||||
from onyx.configs.app_configs import POSTGRES_USER
|
||||
from onyx.configs.app_configs import AWS_REGION_NAME
|
||||
from onyx.db.engine.sql_engine import build_connection_string
|
||||
from onyx.db.engine.tenant_utils import get_all_tenant_ids
|
||||
from onyx.db.engine import build_connection_string
|
||||
from onyx.db.engine import get_all_tenant_ids
|
||||
from sqlalchemy import event
|
||||
from sqlalchemy import pool
|
||||
from sqlalchemy import text
|
||||
@@ -21,14 +21,9 @@ from alembic import context
|
||||
from sqlalchemy.ext.asyncio import create_async_engine
|
||||
from sqlalchemy.sql.schema import SchemaItem
|
||||
from onyx.configs.constants import SSL_CERT_FILE
|
||||
from shared_configs.configs import (
|
||||
MULTI_TENANT,
|
||||
POSTGRES_DEFAULT_SCHEMA,
|
||||
TENANT_ID_PREFIX,
|
||||
)
|
||||
from shared_configs.configs import MULTI_TENANT, POSTGRES_DEFAULT_SCHEMA
|
||||
from onyx.db.models import Base
|
||||
from celery.backends.database.session import ResultModelBase # type: ignore
|
||||
from onyx.db.engine.sql_engine import SqlEngine
|
||||
|
||||
# Make sure in alembic.ini [logger_root] level=INFO is set or most logging will be
|
||||
# hidden! (defaults to level=WARN)
|
||||
@@ -73,67 +68,15 @@ def include_object(
|
||||
return True
|
||||
|
||||
|
||||
def filter_tenants_by_range(
|
||||
tenant_ids: list[str], start_range: int | None = None, end_range: int | None = None
|
||||
) -> list[str]:
|
||||
"""
|
||||
Filter tenant IDs by alphabetical position range.
|
||||
|
||||
Args:
|
||||
tenant_ids: List of tenant IDs to filter
|
||||
start_range: Starting position in alphabetically sorted list (1-based, inclusive)
|
||||
end_range: Ending position in alphabetically sorted list (1-based, inclusive)
|
||||
|
||||
Returns:
|
||||
Filtered list of tenant IDs in their original order
|
||||
"""
|
||||
if start_range is None and end_range is None:
|
||||
return tenant_ids
|
||||
|
||||
# Separate tenant IDs from non-tenant schemas
|
||||
tenant_schemas = [tid for tid in tenant_ids if tid.startswith(TENANT_ID_PREFIX)]
|
||||
non_tenant_schemas = [
|
||||
tid for tid in tenant_ids if not tid.startswith(TENANT_ID_PREFIX)
|
||||
]
|
||||
|
||||
# Sort tenant schemas alphabetically.
|
||||
# NOTE: can cause missed schemas if a schema is created in between workers
|
||||
# fetching of all tenant IDs. We accept this risk for now. Just re-running
|
||||
# the migration will fix the issue.
|
||||
sorted_tenant_schemas = sorted(tenant_schemas)
|
||||
|
||||
# Apply range filtering (0-based indexing)
|
||||
start_idx = start_range if start_range is not None else 0
|
||||
end_idx = end_range if end_range is not None else len(sorted_tenant_schemas)
|
||||
|
||||
# Ensure indices are within bounds
|
||||
start_idx = max(0, start_idx)
|
||||
end_idx = min(len(sorted_tenant_schemas), end_idx)
|
||||
|
||||
# Get the filtered tenant schemas
|
||||
filtered_tenant_schemas = sorted_tenant_schemas[start_idx:end_idx]
|
||||
|
||||
# Combine with non-tenant schemas and preserve original order
|
||||
filtered_tenants = []
|
||||
for tenant_id in tenant_ids:
|
||||
if tenant_id in filtered_tenant_schemas or tenant_id in non_tenant_schemas:
|
||||
filtered_tenants.append(tenant_id)
|
||||
|
||||
return filtered_tenants
|
||||
|
||||
|
||||
def get_schema_options() -> (
|
||||
tuple[bool, bool, bool, int | None, int | None, list[str] | None]
|
||||
):
|
||||
def get_schema_options() -> tuple[str, bool, bool, bool]:
|
||||
x_args_raw = context.get_x_argument()
|
||||
x_args = {}
|
||||
for arg in x_args_raw:
|
||||
if "=" in arg:
|
||||
key, value = arg.split("=", 1)
|
||||
x_args[key.strip()] = value.strip()
|
||||
else:
|
||||
raise ValueError(f"Invalid argument: {arg}")
|
||||
|
||||
for pair in arg.split(","):
|
||||
if "=" in pair:
|
||||
key, value = pair.split("=", 1)
|
||||
x_args[key.strip()] = value.strip()
|
||||
schema_name = x_args.get("schema", POSTGRES_DEFAULT_SCHEMA)
|
||||
create_schema = x_args.get("create_schema", "true").lower() == "true"
|
||||
upgrade_all_tenants = x_args.get("upgrade_all_tenants", "false").lower() == "true"
|
||||
|
||||
@@ -141,81 +84,17 @@ def get_schema_options() -> (
|
||||
# only applies to online migrations
|
||||
continue_on_error = x_args.get("continue", "false").lower() == "true"
|
||||
|
||||
# Tenant range filtering
|
||||
tenant_range_start = None
|
||||
tenant_range_end = None
|
||||
|
||||
if "tenant_range_start" in x_args:
|
||||
try:
|
||||
tenant_range_start = int(x_args["tenant_range_start"])
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Invalid tenant_range_start value: {x_args['tenant_range_start']}. Must be an integer."
|
||||
)
|
||||
|
||||
if "tenant_range_end" in x_args:
|
||||
try:
|
||||
tenant_range_end = int(x_args["tenant_range_end"])
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Invalid tenant_range_end value: {x_args['tenant_range_end']}. Must be an integer."
|
||||
)
|
||||
|
||||
# Validate range
|
||||
if tenant_range_start is not None and tenant_range_end is not None:
|
||||
if tenant_range_start > tenant_range_end:
|
||||
raise ValueError(
|
||||
f"tenant_range_start ({tenant_range_start}) cannot be greater than tenant_range_end ({tenant_range_end})"
|
||||
)
|
||||
|
||||
# Specific schema names filtering (replaces both schema_name and the old tenant_ids approach)
|
||||
schemas = None
|
||||
if "schemas" in x_args:
|
||||
schema_names_str = x_args["schemas"].strip()
|
||||
if schema_names_str:
|
||||
# Split by comma and strip whitespace
|
||||
schemas = [
|
||||
name.strip() for name in schema_names_str.split(",") if name.strip()
|
||||
]
|
||||
if schemas:
|
||||
logger.info(f"Specific schema names specified: {schemas}")
|
||||
|
||||
# Validate that only one method is used at a time
|
||||
range_filtering = tenant_range_start is not None or tenant_range_end is not None
|
||||
specific_filtering = schemas is not None and len(schemas) > 0
|
||||
|
||||
if range_filtering and specific_filtering:
|
||||
if (
|
||||
MULTI_TENANT
|
||||
and schema_name == POSTGRES_DEFAULT_SCHEMA
|
||||
and not upgrade_all_tenants
|
||||
):
|
||||
raise ValueError(
|
||||
"Cannot use both tenant range filtering (tenant_range_start/tenant_range_end) "
|
||||
"and specific schema filtering (schemas) at the same time. "
|
||||
"Please use only one filtering method."
|
||||
"Cannot run default migrations in public schema when multi-tenancy is enabled. "
|
||||
"Please specify a tenant-specific schema."
|
||||
)
|
||||
|
||||
if upgrade_all_tenants and specific_filtering:
|
||||
raise ValueError(
|
||||
"Cannot use both upgrade_all_tenants=true and schemas at the same time. "
|
||||
"Use either upgrade_all_tenants=true for all tenants, or schemas for specific schemas."
|
||||
)
|
||||
|
||||
# If any filtering parameters are specified, we're not doing the default single schema migration
|
||||
if range_filtering:
|
||||
upgrade_all_tenants = True
|
||||
|
||||
# Validate multi-tenant requirements
|
||||
if MULTI_TENANT and not upgrade_all_tenants and not specific_filtering:
|
||||
raise ValueError(
|
||||
"In multi-tenant mode, you must specify either upgrade_all_tenants=true "
|
||||
"or provide schemas. Cannot run default migration."
|
||||
)
|
||||
|
||||
return (
|
||||
create_schema,
|
||||
upgrade_all_tenants,
|
||||
continue_on_error,
|
||||
tenant_range_start,
|
||||
tenant_range_end,
|
||||
schemas,
|
||||
)
|
||||
return schema_name, create_schema, upgrade_all_tenants, continue_on_error
|
||||
|
||||
|
||||
def do_run_migrations(
|
||||
@@ -262,20 +141,12 @@ def provide_iam_token_for_alembic(
|
||||
|
||||
async def run_async_migrations() -> None:
|
||||
(
|
||||
schema_name,
|
||||
create_schema,
|
||||
upgrade_all_tenants,
|
||||
continue_on_error,
|
||||
tenant_range_start,
|
||||
tenant_range_end,
|
||||
schemas,
|
||||
) = get_schema_options()
|
||||
|
||||
if not schemas and not MULTI_TENANT:
|
||||
schemas = [POSTGRES_DEFAULT_SCHEMA]
|
||||
|
||||
# without init_engine, subsequent engine calls fail hard intentionally
|
||||
SqlEngine.init_engine(pool_size=20, max_overflow=5)
|
||||
|
||||
engine = create_async_engine(
|
||||
build_connection_string(),
|
||||
poolclass=pool.NullPool,
|
||||
@@ -289,50 +160,12 @@ async def run_async_migrations() -> None:
|
||||
) -> None:
|
||||
provide_iam_token_for_alembic(dialect, conn_rec, cargs, cparams)
|
||||
|
||||
if schemas:
|
||||
# Use specific schema names directly without fetching all tenants
|
||||
logger.info(f"Migrating specific schema names: {schemas}")
|
||||
|
||||
i_schema = 0
|
||||
num_schemas = len(schemas)
|
||||
for schema in schemas:
|
||||
i_schema += 1
|
||||
logger.info(
|
||||
f"Migrating schema: index={i_schema} num_schemas={num_schemas} schema={schema}"
|
||||
)
|
||||
try:
|
||||
async with engine.connect() as connection:
|
||||
await connection.run_sync(
|
||||
do_run_migrations,
|
||||
schema_name=schema,
|
||||
create_schema=create_schema,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error migrating schema {schema}: {e}")
|
||||
if not continue_on_error:
|
||||
logger.error("--continue=true is not set, raising exception!")
|
||||
raise
|
||||
|
||||
logger.warning("--continue=true is set, continuing to next schema.")
|
||||
|
||||
elif upgrade_all_tenants:
|
||||
if upgrade_all_tenants:
|
||||
tenant_schemas = get_all_tenant_ids()
|
||||
|
||||
filtered_tenant_schemas = filter_tenants_by_range(
|
||||
tenant_schemas, tenant_range_start, tenant_range_end
|
||||
)
|
||||
|
||||
if tenant_range_start is not None or tenant_range_end is not None:
|
||||
logger.info(
|
||||
f"Filtering tenants by range: start={tenant_range_start}, end={tenant_range_end}"
|
||||
)
|
||||
logger.info(
|
||||
f"Total tenants: {len(tenant_schemas)}, Filtered tenants: {len(filtered_tenant_schemas)}"
|
||||
)
|
||||
|
||||
i_tenant = 0
|
||||
num_tenants = len(filtered_tenant_schemas)
|
||||
for schema in filtered_tenant_schemas:
|
||||
num_tenants = len(tenant_schemas)
|
||||
for schema in tenant_schemas:
|
||||
i_tenant += 1
|
||||
logger.info(
|
||||
f"Migrating schema: index={i_tenant} num_tenants={num_tenants} schema={schema}"
|
||||
@@ -347,70 +180,36 @@ async def run_async_migrations() -> None:
|
||||
except Exception as e:
|
||||
logger.error(f"Error migrating schema {schema}: {e}")
|
||||
if not continue_on_error:
|
||||
logger.error("--continue=true is not set, raising exception!")
|
||||
logger.error("--continue is not set, raising exception!")
|
||||
raise
|
||||
|
||||
logger.warning("--continue=true is set, continuing to next schema.")
|
||||
logger.warning("--continue is set, continuing to next schema.")
|
||||
|
||||
else:
|
||||
# This should not happen in the new design since we require either
|
||||
# upgrade_all_tenants=true or schemas in multi-tenant mode
|
||||
# and for non-multi-tenant mode, we should use schemas with the default schema
|
||||
raise ValueError(
|
||||
"No migration target specified. Use either upgrade_all_tenants=true for all tenants "
|
||||
"or schemas for specific schemas."
|
||||
)
|
||||
try:
|
||||
logger.info(f"Migrating schema: {schema_name}")
|
||||
async with engine.connect() as connection:
|
||||
await connection.run_sync(
|
||||
do_run_migrations,
|
||||
schema_name=schema_name,
|
||||
create_schema=create_schema,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error migrating schema {schema_name}: {e}")
|
||||
raise
|
||||
|
||||
await engine.dispose()
|
||||
|
||||
|
||||
def run_migrations_offline() -> None:
|
||||
"""
|
||||
NOTE(rkuo): This generates a sql script that can be used to migrate the database ...
|
||||
instead of migrating the db live via an open connection
|
||||
|
||||
Not clear on when this would be used by us or if it even works.
|
||||
|
||||
If it is offline, then why are there calls to the db engine?
|
||||
|
||||
This doesn't really get used when we migrate in the cloud."""
|
||||
"""This doesn't really get used when we migrate in the cloud."""
|
||||
|
||||
logger.info("run_migrations_offline starting.")
|
||||
|
||||
# without init_engine, subsequent engine calls fail hard intentionally
|
||||
SqlEngine.init_engine(pool_size=20, max_overflow=5)
|
||||
|
||||
(
|
||||
create_schema,
|
||||
upgrade_all_tenants,
|
||||
continue_on_error,
|
||||
tenant_range_start,
|
||||
tenant_range_end,
|
||||
schemas,
|
||||
) = get_schema_options()
|
||||
schema_name, _, upgrade_all_tenants, continue_on_error = get_schema_options()
|
||||
url = build_connection_string()
|
||||
|
||||
if schemas:
|
||||
# Use specific schema names directly without fetching all tenants
|
||||
logger.info(f"Migrating specific schema names: {schemas}")
|
||||
|
||||
for schema in schemas:
|
||||
logger.info(f"Migrating schema: {schema}")
|
||||
context.configure(
|
||||
url=url,
|
||||
target_metadata=target_metadata, # type: ignore
|
||||
literal_binds=True,
|
||||
include_object=include_object,
|
||||
version_table_schema=schema,
|
||||
include_schemas=True,
|
||||
script_location=config.get_main_option("script_location"),
|
||||
dialect_opts={"paramstyle": "named"},
|
||||
)
|
||||
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
|
||||
elif upgrade_all_tenants:
|
||||
if upgrade_all_tenants:
|
||||
engine = create_async_engine(url)
|
||||
|
||||
if USE_IAM_AUTH:
|
||||
@@ -424,19 +223,7 @@ def run_migrations_offline() -> None:
|
||||
tenant_schemas = get_all_tenant_ids()
|
||||
engine.sync_engine.dispose()
|
||||
|
||||
filtered_tenant_schemas = filter_tenants_by_range(
|
||||
tenant_schemas, tenant_range_start, tenant_range_end
|
||||
)
|
||||
|
||||
if tenant_range_start is not None or tenant_range_end is not None:
|
||||
logger.info(
|
||||
f"Filtering tenants by range: start={tenant_range_start}, end={tenant_range_end}"
|
||||
)
|
||||
logger.info(
|
||||
f"Total tenants: {len(tenant_schemas)}, Filtered tenants: {len(filtered_tenant_schemas)}"
|
||||
)
|
||||
|
||||
for schema in filtered_tenant_schemas:
|
||||
for schema in tenant_schemas:
|
||||
logger.info(f"Migrating schema: {schema}")
|
||||
context.configure(
|
||||
url=url,
|
||||
@@ -452,12 +239,21 @@ def run_migrations_offline() -> None:
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
else:
|
||||
# This should not happen in the new design
|
||||
raise ValueError(
|
||||
"No migration target specified. Use either upgrade_all_tenants=true for all tenants "
|
||||
"or schemas for specific schemas."
|
||||
logger.info(f"Migrating schema: {schema_name}")
|
||||
context.configure(
|
||||
url=url,
|
||||
target_metadata=target_metadata, # type: ignore
|
||||
literal_binds=True,
|
||||
include_object=include_object,
|
||||
version_table_schema=schema_name,
|
||||
include_schemas=True,
|
||||
script_location=config.get_main_option("script_location"),
|
||||
dialect_opts={"paramstyle": "named"},
|
||||
)
|
||||
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
|
||||
|
||||
def run_migrations_online() -> None:
|
||||
logger.info("run_migrations_online starting.")
|
||||
|
||||
@@ -1,121 +0,0 @@
|
||||
"""rework-kg-config
|
||||
|
||||
Revision ID: 03bf8be6b53a
|
||||
Revises: 65bc6e0f8500
|
||||
Create Date: 2025-06-16 10:52:34.815335
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from sqlalchemy.dialects import postgresql
|
||||
from sqlalchemy import text
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "03bf8be6b53a"
|
||||
down_revision = "65bc6e0f8500"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# get current config
|
||||
current_configs = (
|
||||
op.get_bind()
|
||||
.execute(text("SELECT kg_variable_name, kg_variable_values FROM kg_config"))
|
||||
.all()
|
||||
)
|
||||
current_config_dict = {
|
||||
config.kg_variable_name: (
|
||||
config.kg_variable_values[0]
|
||||
if config.kg_variable_name
|
||||
not in ("KG_VENDOR_DOMAINS", "KG_IGNORE_EMAIL_DOMAINS")
|
||||
else config.kg_variable_values
|
||||
)
|
||||
for config in current_configs
|
||||
if config.kg_variable_values
|
||||
}
|
||||
|
||||
# not using the KGConfigSettings model here in case it changes in the future
|
||||
kg_config_settings = json.dumps(
|
||||
{
|
||||
"KG_EXPOSED": current_config_dict.get("KG_EXPOSED", False),
|
||||
"KG_ENABLED": current_config_dict.get("KG_ENABLED", False),
|
||||
"KG_VENDOR": current_config_dict.get("KG_VENDOR", None),
|
||||
"KG_VENDOR_DOMAINS": current_config_dict.get("KG_VENDOR_DOMAINS", []),
|
||||
"KG_IGNORE_EMAIL_DOMAINS": current_config_dict.get(
|
||||
"KG_IGNORE_EMAIL_DOMAINS", []
|
||||
),
|
||||
"KG_COVERAGE_START": current_config_dict.get(
|
||||
"KG_COVERAGE_START",
|
||||
(datetime.now() - timedelta(days=90)).strftime("%Y-%m-%d"),
|
||||
),
|
||||
"KG_MAX_COVERAGE_DAYS": current_config_dict.get("KG_MAX_COVERAGE_DAYS", 90),
|
||||
"KG_MAX_PARENT_RECURSION_DEPTH": current_config_dict.get(
|
||||
"KG_MAX_PARENT_RECURSION_DEPTH", 2
|
||||
),
|
||||
"KG_BETA_PERSONA_ID": current_config_dict.get("KG_BETA_PERSONA_ID", None),
|
||||
}
|
||||
)
|
||||
op.execute(
|
||||
f"INSERT INTO key_value_store (key, value) VALUES ('kg_config', '{kg_config_settings}')"
|
||||
)
|
||||
|
||||
# drop kg config table
|
||||
op.drop_table("kg_config")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# get current config
|
||||
current_config_dict = {
|
||||
"KG_EXPOSED": False,
|
||||
"KG_ENABLED": False,
|
||||
"KG_VENDOR": [],
|
||||
"KG_VENDOR_DOMAINS": [],
|
||||
"KG_IGNORE_EMAIL_DOMAINS": [],
|
||||
"KG_COVERAGE_START": (datetime.now() - timedelta(days=90)).strftime("%Y-%m-%d"),
|
||||
"KG_MAX_COVERAGE_DAYS": 90,
|
||||
"KG_MAX_PARENT_RECURSION_DEPTH": 2,
|
||||
}
|
||||
current_configs = (
|
||||
op.get_bind()
|
||||
.execute(text("SELECT value FROM key_value_store WHERE key = 'kg_config'"))
|
||||
.one_or_none()
|
||||
)
|
||||
if current_configs is not None:
|
||||
current_config_dict.update(current_configs[0])
|
||||
insert_values = [
|
||||
{
|
||||
"kg_variable_name": name,
|
||||
"kg_variable_values": (
|
||||
[str(val).lower() if isinstance(val, bool) else str(val)]
|
||||
if not isinstance(val, list)
|
||||
else val
|
||||
),
|
||||
}
|
||||
for name, val in current_config_dict.items()
|
||||
]
|
||||
|
||||
op.create_table(
|
||||
"kg_config",
|
||||
sa.Column("id", sa.Integer(), primary_key=True, nullable=False, index=True),
|
||||
sa.Column("kg_variable_name", sa.String(), nullable=False, index=True),
|
||||
sa.Column("kg_variable_values", postgresql.ARRAY(sa.String()), nullable=False),
|
||||
sa.UniqueConstraint("kg_variable_name", name="uq_kg_config_variable_name"),
|
||||
)
|
||||
op.bulk_insert(
|
||||
sa.table(
|
||||
"kg_config",
|
||||
sa.column("kg_variable_name", sa.String),
|
||||
sa.column("kg_variable_values", postgresql.ARRAY(sa.String)),
|
||||
),
|
||||
insert_values,
|
||||
)
|
||||
|
||||
op.execute("DELETE FROM key_value_store WHERE key = 'kg_config'")
|
||||
@@ -1,72 +0,0 @@
|
||||
"""add federated connector tables
|
||||
|
||||
Revision ID: 0816326d83aa
|
||||
Revises: 12635f6655b7
|
||||
Create Date: 2025-06-29 14:09:45.109518
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "0816326d83aa"
|
||||
down_revision = "12635f6655b7"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Create federated_connector table
|
||||
op.create_table(
|
||||
"federated_connector",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column("source", sa.String(), nullable=False),
|
||||
sa.Column("credentials", sa.LargeBinary(), nullable=False),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
|
||||
# Create federated_connector_oauth_token table
|
||||
op.create_table(
|
||||
"federated_connector_oauth_token",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column("federated_connector_id", sa.Integer(), nullable=False),
|
||||
sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=False),
|
||||
sa.Column("token", sa.LargeBinary(), nullable=False),
|
||||
sa.Column("expires_at", sa.DateTime(), nullable=True),
|
||||
sa.ForeignKeyConstraint(
|
||||
["federated_connector_id"], ["federated_connector.id"], ondelete="CASCADE"
|
||||
),
|
||||
sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
|
||||
# Create federated_connector__document_set table
|
||||
op.create_table(
|
||||
"federated_connector__document_set",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column("federated_connector_id", sa.Integer(), nullable=False),
|
||||
sa.Column("document_set_id", sa.Integer(), nullable=False),
|
||||
sa.Column("entities", postgresql.JSONB(), nullable=False),
|
||||
sa.ForeignKeyConstraint(
|
||||
["federated_connector_id"], ["federated_connector.id"], ondelete="CASCADE"
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["document_set_id"], ["document_set.id"], ondelete="CASCADE"
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
sa.UniqueConstraint(
|
||||
"federated_connector_id",
|
||||
"document_set_id",
|
||||
name="uq_federated_connector_document_set",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop tables in reverse order due to foreign key dependencies
|
||||
op.drop_table("federated_connector__document_set")
|
||||
op.drop_table("federated_connector_oauth_token")
|
||||
op.drop_table("federated_connector")
|
||||
@@ -1,596 +0,0 @@
|
||||
"""drive-canonical-ids
|
||||
|
||||
Revision ID: 12635f6655b7
|
||||
Revises: 58c50ef19f08
|
||||
Create Date: 2025-06-20 14:44:54.241159
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from httpx import HTTPStatusError
|
||||
import httpx
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.db.search_settings import SearchSettings
|
||||
from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
|
||||
from onyx.document_index.vespa.shared_utils.utils import (
|
||||
replace_invalid_doc_id_characters,
|
||||
)
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
|
||||
from onyx.utils.logger import setup_logger
|
||||
import os
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "12635f6655b7"
|
||||
down_revision = "58c50ef19f08"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
SKIP_CANON_DRIVE_IDS = os.environ.get("SKIP_CANON_DRIVE_IDS", "true").lower() == "true"
|
||||
|
||||
|
||||
def active_search_settings() -> tuple[SearchSettings, SearchSettings | None]:
|
||||
result = op.get_bind().execute(
|
||||
sa.text(
|
||||
"""
|
||||
SELECT * FROM search_settings WHERE status = 'PRESENT' ORDER BY id DESC LIMIT 1
|
||||
"""
|
||||
)
|
||||
)
|
||||
search_settings_fetch = result.fetchall()
|
||||
search_settings = (
|
||||
SearchSettings(**search_settings_fetch[0]._asdict())
|
||||
if search_settings_fetch
|
||||
else None
|
||||
)
|
||||
|
||||
result2 = op.get_bind().execute(
|
||||
sa.text(
|
||||
"""
|
||||
SELECT * FROM search_settings WHERE status = 'FUTURE' ORDER BY id DESC LIMIT 1
|
||||
"""
|
||||
)
|
||||
)
|
||||
search_settings_future_fetch = result2.fetchall()
|
||||
search_settings_future = (
|
||||
SearchSettings(**search_settings_future_fetch[0]._asdict())
|
||||
if search_settings_future_fetch
|
||||
else None
|
||||
)
|
||||
|
||||
if not isinstance(search_settings, SearchSettings):
|
||||
raise RuntimeError(
|
||||
"current search settings is of type " + str(type(search_settings))
|
||||
)
|
||||
if (
|
||||
not isinstance(search_settings_future, SearchSettings)
|
||||
and search_settings_future is not None
|
||||
):
|
||||
raise RuntimeError(
|
||||
"future search settings is of type " + str(type(search_settings_future))
|
||||
)
|
||||
|
||||
return search_settings, search_settings_future
|
||||
|
||||
|
||||
def normalize_google_drive_url(url: str) -> str:
|
||||
"""Remove query parameters from Google Drive URLs to create canonical document IDs.
|
||||
NOTE: copied from drive doc_conversion.py
|
||||
"""
|
||||
parsed_url = urlparse(url)
|
||||
parsed_url = parsed_url._replace(query="")
|
||||
spl_path = parsed_url.path.split("/")
|
||||
if spl_path and (spl_path[-1] in ["edit", "view", "preview"]):
|
||||
spl_path.pop()
|
||||
parsed_url = parsed_url._replace(path="/".join(spl_path))
|
||||
# Remove query parameters and reconstruct URL
|
||||
return urlunparse(parsed_url)
|
||||
|
||||
|
||||
def get_google_drive_documents_from_database() -> list[dict]:
|
||||
"""Get all Google Drive documents from the database."""
|
||||
bind = op.get_bind()
|
||||
result = bind.execute(
|
||||
sa.text(
|
||||
"""
|
||||
SELECT d.id
|
||||
FROM document d
|
||||
JOIN document_by_connector_credential_pair dcc ON d.id = dcc.id
|
||||
JOIN connector_credential_pair cc ON dcc.connector_id = cc.connector_id
|
||||
AND dcc.credential_id = cc.credential_id
|
||||
JOIN connector c ON cc.connector_id = c.id
|
||||
WHERE c.source = 'GOOGLE_DRIVE'
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
documents = []
|
||||
for row in result:
|
||||
documents.append({"document_id": row.id})
|
||||
|
||||
return documents
|
||||
|
||||
|
||||
def update_document_id_in_database(
|
||||
old_doc_id: str, new_doc_id: str, index_name: str
|
||||
) -> None:
|
||||
"""Update document IDs in all relevant database tables using copy-and-swap approach."""
|
||||
bind = op.get_bind()
|
||||
|
||||
# print(f"Updating database tables for document {old_doc_id} -> {new_doc_id}")
|
||||
|
||||
# Check if new document ID already exists
|
||||
result = bind.execute(
|
||||
sa.text("SELECT COUNT(*) FROM document WHERE id = :new_id"),
|
||||
{"new_id": new_doc_id},
|
||||
)
|
||||
row = result.fetchone()
|
||||
if row and row[0] > 0:
|
||||
# print(f"Document with ID {new_doc_id} already exists, deleting old one")
|
||||
delete_document_from_db(old_doc_id, index_name)
|
||||
return
|
||||
|
||||
# Step 1: Create a new document row with the new ID (copy all fields from old row)
|
||||
# Use a conservative approach to handle columns that might not exist in all installations
|
||||
try:
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"""
|
||||
INSERT INTO document (id, from_ingestion_api, boost, hidden, semantic_id,
|
||||
link, doc_updated_at, primary_owners, secondary_owners,
|
||||
external_user_emails, external_user_group_ids, is_public,
|
||||
chunk_count, last_modified, last_synced, kg_stage, kg_processing_time)
|
||||
SELECT :new_id, from_ingestion_api, boost, hidden, semantic_id,
|
||||
link, doc_updated_at, primary_owners, secondary_owners,
|
||||
external_user_emails, external_user_group_ids, is_public,
|
||||
chunk_count, last_modified, last_synced, kg_stage, kg_processing_time
|
||||
FROM document
|
||||
WHERE id = :old_id
|
||||
"""
|
||||
),
|
||||
{"new_id": new_doc_id, "old_id": old_doc_id},
|
||||
)
|
||||
# print(f"Successfully updated database tables for document {old_doc_id} -> {new_doc_id}")
|
||||
except Exception as e:
|
||||
# If the full INSERT fails, try a more basic version with only core columns
|
||||
logger.warning(f"Full INSERT failed, trying basic version: {e}")
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"""
|
||||
INSERT INTO document (id, from_ingestion_api, boost, hidden, semantic_id,
|
||||
link, doc_updated_at, primary_owners, secondary_owners)
|
||||
SELECT :new_id, from_ingestion_api, boost, hidden, semantic_id,
|
||||
link, doc_updated_at, primary_owners, secondary_owners
|
||||
FROM document
|
||||
WHERE id = :old_id
|
||||
"""
|
||||
),
|
||||
{"new_id": new_doc_id, "old_id": old_doc_id},
|
||||
)
|
||||
|
||||
# Step 2: Update all foreign key references to point to the new ID
|
||||
|
||||
# Update document_by_connector_credential_pair table
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"UPDATE document_by_connector_credential_pair SET id = :new_id WHERE id = :old_id"
|
||||
),
|
||||
{"new_id": new_doc_id, "old_id": old_doc_id},
|
||||
)
|
||||
# print(f"Successfully updated document_by_connector_credential_pair table for document {old_doc_id} -> {new_doc_id}")
|
||||
|
||||
# Update search_doc table (stores search results for chat replay)
|
||||
# This is critical for agent functionality
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"UPDATE search_doc SET document_id = :new_id WHERE document_id = :old_id"
|
||||
),
|
||||
{"new_id": new_doc_id, "old_id": old_doc_id},
|
||||
)
|
||||
# print(f"Successfully updated search_doc table for document {old_doc_id} -> {new_doc_id}")
|
||||
# Update document_retrieval_feedback table (user feedback on documents)
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"UPDATE document_retrieval_feedback SET document_id = :new_id WHERE document_id = :old_id"
|
||||
),
|
||||
{"new_id": new_doc_id, "old_id": old_doc_id},
|
||||
)
|
||||
# print(f"Successfully updated document_retrieval_feedback table for document {old_doc_id} -> {new_doc_id}")
|
||||
# Update document__tag table (document-tag relationships)
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"UPDATE document__tag SET document_id = :new_id WHERE document_id = :old_id"
|
||||
),
|
||||
{"new_id": new_doc_id, "old_id": old_doc_id},
|
||||
)
|
||||
# print(f"Successfully updated document__tag table for document {old_doc_id} -> {new_doc_id}")
|
||||
# Update user_file table (user uploaded files linked to documents)
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"UPDATE user_file SET document_id = :new_id WHERE document_id = :old_id"
|
||||
),
|
||||
{"new_id": new_doc_id, "old_id": old_doc_id},
|
||||
)
|
||||
# print(f"Successfully updated user_file table for document {old_doc_id} -> {new_doc_id}")
|
||||
# Update KG and chunk_stats tables (these may not exist in all installations)
|
||||
try:
|
||||
# Update kg_entity table
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"UPDATE kg_entity SET document_id = :new_id WHERE document_id = :old_id"
|
||||
),
|
||||
{"new_id": new_doc_id, "old_id": old_doc_id},
|
||||
)
|
||||
# print(f"Successfully updated kg_entity table for document {old_doc_id} -> {new_doc_id}")
|
||||
# Update kg_entity_extraction_staging table
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"UPDATE kg_entity_extraction_staging SET document_id = :new_id WHERE document_id = :old_id"
|
||||
),
|
||||
{"new_id": new_doc_id, "old_id": old_doc_id},
|
||||
)
|
||||
# print(f"Successfully updated kg_entity_extraction_staging table for document {old_doc_id} -> {new_doc_id}")
|
||||
# Update kg_relationship table
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"UPDATE kg_relationship SET source_document = :new_id WHERE source_document = :old_id"
|
||||
),
|
||||
{"new_id": new_doc_id, "old_id": old_doc_id},
|
||||
)
|
||||
# print(f"Successfully updated kg_relationship table for document {old_doc_id} -> {new_doc_id}")
|
||||
# Update kg_relationship_extraction_staging table
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"UPDATE kg_relationship_extraction_staging SET source_document = :new_id WHERE source_document = :old_id"
|
||||
),
|
||||
{"new_id": new_doc_id, "old_id": old_doc_id},
|
||||
)
|
||||
# print(f"Successfully updated kg_relationship_extraction_staging table for document {old_doc_id} -> {new_doc_id}")
|
||||
# Update chunk_stats table
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"UPDATE chunk_stats SET document_id = :new_id WHERE document_id = :old_id"
|
||||
),
|
||||
{"new_id": new_doc_id, "old_id": old_doc_id},
|
||||
)
|
||||
# print(f"Successfully updated chunk_stats table for document {old_doc_id} -> {new_doc_id}")
|
||||
# Update chunk_stats ID field which includes document_id
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE chunk_stats
|
||||
SET id = REPLACE(id, :old_id, :new_id)
|
||||
WHERE id LIKE :old_id_pattern
|
||||
"""
|
||||
),
|
||||
{
|
||||
"new_id": new_doc_id,
|
||||
"old_id": old_doc_id,
|
||||
"old_id_pattern": f"{old_doc_id}__%",
|
||||
},
|
||||
)
|
||||
# print(f"Successfully updated chunk_stats ID field for document {old_doc_id} -> {new_doc_id}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Some KG/chunk tables may not exist or failed to update: {e}")
|
||||
|
||||
# Step 3: Delete the old document row (this should now be safe since all FKs point to new row)
|
||||
bind.execute(
|
||||
sa.text("DELETE FROM document WHERE id = :old_id"), {"old_id": old_doc_id}
|
||||
)
|
||||
# print(f"Successfully deleted document {old_doc_id} from database")
|
||||
|
||||
|
||||
def _visit_chunks(
|
||||
*,
|
||||
http_client: httpx.Client,
|
||||
index_name: str,
|
||||
selection: str,
|
||||
continuation: str | None = None,
|
||||
) -> tuple[list[dict], str | None]:
|
||||
"""Helper that calls the /document/v1 visit API once and returns (docs, next_token)."""
|
||||
|
||||
# Use the same URL as the document API, but with visit-specific params
|
||||
base_url = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)
|
||||
|
||||
params: dict[str, str] = {
|
||||
"selection": selection,
|
||||
"wantedDocumentCount": "1000",
|
||||
}
|
||||
if continuation:
|
||||
params["continuation"] = continuation
|
||||
|
||||
# print(f"Visiting chunks for selection '{selection}' with params {params}")
|
||||
resp = http_client.get(base_url, params=params, timeout=None)
|
||||
# print(f"Visited chunks for document {selection}")
|
||||
resp.raise_for_status()
|
||||
|
||||
payload = resp.json()
|
||||
return payload.get("documents", []), payload.get("continuation")
|
||||
|
||||
|
||||
def delete_document_chunks_from_vespa(index_name: str, doc_id: str) -> None:
|
||||
"""Delete all chunks for *doc_id* from Vespa using continuation-token paging (no offset)."""
|
||||
|
||||
total_deleted = 0
|
||||
# Use exact match instead of contains - Document Selector Language doesn't support contains
|
||||
selection = f'{index_name}.document_id=="{doc_id}"'
|
||||
|
||||
with get_vespa_http_client() as http_client:
|
||||
continuation: str | None = None
|
||||
while True:
|
||||
docs, continuation = _visit_chunks(
|
||||
http_client=http_client,
|
||||
index_name=index_name,
|
||||
selection=selection,
|
||||
continuation=continuation,
|
||||
)
|
||||
|
||||
if not docs:
|
||||
break
|
||||
|
||||
for doc in docs:
|
||||
vespa_full_id = doc.get("id")
|
||||
if not vespa_full_id:
|
||||
continue
|
||||
|
||||
vespa_doc_uuid = vespa_full_id.split("::")[-1]
|
||||
delete_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_doc_uuid}"
|
||||
|
||||
try:
|
||||
resp = http_client.delete(delete_url)
|
||||
resp.raise_for_status()
|
||||
total_deleted += 1
|
||||
except Exception as e:
|
||||
print(f"Failed to delete chunk {vespa_doc_uuid}: {e}")
|
||||
|
||||
if not continuation:
|
||||
break
|
||||
|
||||
|
||||
def update_document_id_in_vespa(
|
||||
index_name: str, old_doc_id: str, new_doc_id: str
|
||||
) -> None:
|
||||
"""Update all chunks' document_id field from *old_doc_id* to *new_doc_id* using continuation paging."""
|
||||
|
||||
clean_new_doc_id = replace_invalid_doc_id_characters(new_doc_id)
|
||||
|
||||
# Use exact match instead of contains - Document Selector Language doesn't support contains
|
||||
selection = f'{index_name}.document_id=="{old_doc_id}"'
|
||||
|
||||
with get_vespa_http_client() as http_client:
|
||||
continuation: str | None = None
|
||||
while True:
|
||||
# print(f"Visiting chunks for document {old_doc_id} -> {new_doc_id}")
|
||||
docs, continuation = _visit_chunks(
|
||||
http_client=http_client,
|
||||
index_name=index_name,
|
||||
selection=selection,
|
||||
continuation=continuation,
|
||||
)
|
||||
|
||||
if not docs:
|
||||
break
|
||||
|
||||
for doc in docs:
|
||||
vespa_full_id = doc.get("id")
|
||||
if not vespa_full_id:
|
||||
continue
|
||||
|
||||
vespa_doc_uuid = vespa_full_id.split("::")[-1]
|
||||
vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_doc_uuid}"
|
||||
|
||||
update_request = {
|
||||
"fields": {"document_id": {"assign": clean_new_doc_id}}
|
||||
}
|
||||
|
||||
try:
|
||||
resp = http_client.put(vespa_url, json=update_request)
|
||||
resp.raise_for_status()
|
||||
except Exception as e:
|
||||
print(f"Failed to update chunk {vespa_doc_uuid}: {e}")
|
||||
raise
|
||||
|
||||
if not continuation:
|
||||
break
|
||||
|
||||
|
||||
def delete_document_from_db(current_doc_id: str, index_name: str) -> None:
|
||||
# Delete all foreign key references first, then delete the document
|
||||
try:
|
||||
bind = op.get_bind()
|
||||
|
||||
# Delete from agent-related tables first (order matters due to foreign keys)
|
||||
# Delete from agent__sub_query__search_doc first since it references search_doc
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"""
|
||||
DELETE FROM agent__sub_query__search_doc
|
||||
WHERE search_doc_id IN (
|
||||
SELECT id FROM search_doc WHERE document_id = :doc_id
|
||||
)
|
||||
"""
|
||||
),
|
||||
{"doc_id": current_doc_id},
|
||||
)
|
||||
|
||||
# Delete from chat_message__search_doc
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"""
|
||||
DELETE FROM chat_message__search_doc
|
||||
WHERE search_doc_id IN (
|
||||
SELECT id FROM search_doc WHERE document_id = :doc_id
|
||||
)
|
||||
"""
|
||||
),
|
||||
{"doc_id": current_doc_id},
|
||||
)
|
||||
|
||||
# Now we can safely delete from search_doc
|
||||
bind.execute(
|
||||
sa.text("DELETE FROM search_doc WHERE document_id = :doc_id"),
|
||||
{"doc_id": current_doc_id},
|
||||
)
|
||||
|
||||
# Delete from document_by_connector_credential_pair
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"DELETE FROM document_by_connector_credential_pair WHERE id = :doc_id"
|
||||
),
|
||||
{"doc_id": current_doc_id},
|
||||
)
|
||||
|
||||
# Delete from other tables that reference this document
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"DELETE FROM document_retrieval_feedback WHERE document_id = :doc_id"
|
||||
),
|
||||
{"doc_id": current_doc_id},
|
||||
)
|
||||
|
||||
bind.execute(
|
||||
sa.text("DELETE FROM document__tag WHERE document_id = :doc_id"),
|
||||
{"doc_id": current_doc_id},
|
||||
)
|
||||
|
||||
bind.execute(
|
||||
sa.text("DELETE FROM user_file WHERE document_id = :doc_id"),
|
||||
{"doc_id": current_doc_id},
|
||||
)
|
||||
|
||||
# Delete from KG tables if they exist
|
||||
try:
|
||||
bind.execute(
|
||||
sa.text("DELETE FROM kg_entity WHERE document_id = :doc_id"),
|
||||
{"doc_id": current_doc_id},
|
||||
)
|
||||
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"DELETE FROM kg_entity_extraction_staging WHERE document_id = :doc_id"
|
||||
),
|
||||
{"doc_id": current_doc_id},
|
||||
)
|
||||
|
||||
bind.execute(
|
||||
sa.text("DELETE FROM kg_relationship WHERE source_document = :doc_id"),
|
||||
{"doc_id": current_doc_id},
|
||||
)
|
||||
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"DELETE FROM kg_relationship_extraction_staging WHERE source_document = :doc_id"
|
||||
),
|
||||
{"doc_id": current_doc_id},
|
||||
)
|
||||
|
||||
bind.execute(
|
||||
sa.text("DELETE FROM chunk_stats WHERE document_id = :doc_id"),
|
||||
{"doc_id": current_doc_id},
|
||||
)
|
||||
|
||||
bind.execute(
|
||||
sa.text("DELETE FROM chunk_stats WHERE id LIKE :doc_id_pattern"),
|
||||
{"doc_id_pattern": f"{current_doc_id}__%"},
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Some KG/chunk tables may not exist or failed to delete from: {e}"
|
||||
)
|
||||
|
||||
# Finally delete the document itself
|
||||
bind.execute(
|
||||
sa.text("DELETE FROM document WHERE id = :doc_id"),
|
||||
{"doc_id": current_doc_id},
|
||||
)
|
||||
|
||||
# Delete chunks from vespa
|
||||
delete_document_chunks_from_vespa(index_name, current_doc_id)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Failed to delete duplicate document {current_doc_id}: {e}")
|
||||
# Continue with other documents instead of failing the entire migration
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
if SKIP_CANON_DRIVE_IDS:
|
||||
return
|
||||
current_search_settings, future_search_settings = active_search_settings()
|
||||
document_index = get_default_document_index(
|
||||
current_search_settings,
|
||||
future_search_settings,
|
||||
)
|
||||
|
||||
# Get the index name
|
||||
if hasattr(document_index, "index_name"):
|
||||
index_name = document_index.index_name
|
||||
else:
|
||||
# Default index name if we can't get it from the document_index
|
||||
index_name = "danswer_index"
|
||||
|
||||
# Get all Google Drive documents from the database (this is faster and more reliable)
|
||||
gdrive_documents = get_google_drive_documents_from_database()
|
||||
|
||||
if not gdrive_documents:
|
||||
return
|
||||
|
||||
# Track normalized document IDs to detect duplicates
|
||||
all_normalized_doc_ids = set()
|
||||
updated_count = 0
|
||||
|
||||
for doc_info in gdrive_documents:
|
||||
current_doc_id = doc_info["document_id"]
|
||||
normalized_doc_id = normalize_google_drive_url(current_doc_id)
|
||||
|
||||
print(f"Processing document {current_doc_id} -> {normalized_doc_id}")
|
||||
# Check for duplicates
|
||||
if normalized_doc_id in all_normalized_doc_ids:
|
||||
# print(f"Deleting duplicate document {current_doc_id}")
|
||||
delete_document_from_db(current_doc_id, index_name)
|
||||
continue
|
||||
|
||||
all_normalized_doc_ids.add(normalized_doc_id)
|
||||
|
||||
# If the document ID already doesn't have query parameters, skip it
|
||||
if current_doc_id == normalized_doc_id:
|
||||
# print(f"Skipping document {current_doc_id} -> {normalized_doc_id} because it already has no query parameters")
|
||||
continue
|
||||
|
||||
try:
|
||||
# Update both database and Vespa in order
|
||||
# Database first to ensure consistency
|
||||
update_document_id_in_database(
|
||||
current_doc_id, normalized_doc_id, index_name
|
||||
)
|
||||
|
||||
# For Vespa, we can now use the original document IDs since we're using contains matching
|
||||
update_document_id_in_vespa(index_name, current_doc_id, normalized_doc_id)
|
||||
updated_count += 1
|
||||
# print(f"Finished updating document {current_doc_id} -> {normalized_doc_id}")
|
||||
except Exception as e:
|
||||
print(f"Failed to update document {current_doc_id}: {e}")
|
||||
|
||||
if isinstance(e, HTTPStatusError):
|
||||
print(f"HTTPStatusError: {e}")
|
||||
print(f"Response: {e.response.text}")
|
||||
print(f"Status: {e.response.status_code}")
|
||||
print(f"Headers: {e.response.headers}")
|
||||
print(f"Request: {e.request.url}")
|
||||
print(f"Request headers: {e.request.headers}")
|
||||
# Note: Rollback is complex with copy-and-swap approach since the old document is already deleted
|
||||
# In case of failure, manual intervention may be required
|
||||
# Continue with other documents instead of failing the entire migration
|
||||
continue
|
||||
|
||||
logger.info(f"Migration complete. Updated {updated_count} Google Drive documents")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# this is a one way migration, so no downgrade.
|
||||
# It wouldn't make sense to store the extra query parameters
|
||||
# and duplicate documents to allow a reversal.
|
||||
pass
|
||||
@@ -1,45 +0,0 @@
|
||||
"""Add foreign key to user__external_user_group_id
|
||||
|
||||
Revision ID: 238b84885828
|
||||
Revises: a7688ab35c45
|
||||
Create Date: 2025-05-19 17:15:33.424584
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "238b84885828"
|
||||
down_revision = "a7688ab35c45"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# First, clean up any entries that don't have a valid cc_pair_id
|
||||
op.execute(
|
||||
"""
|
||||
DELETE FROM user__external_user_group_id
|
||||
WHERE cc_pair_id NOT IN (SELECT id FROM connector_credential_pair)
|
||||
"""
|
||||
)
|
||||
|
||||
# Add foreign key constraint with cascade delete
|
||||
op.create_foreign_key(
|
||||
"fk_user__external_user_group_id_cc_pair_id",
|
||||
"user__external_user_group_id",
|
||||
"connector_credential_pair",
|
||||
["cc_pair_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop the foreign key constraint
|
||||
op.drop_constraint(
|
||||
"fk_user__external_user_group_id_cc_pair_id",
|
||||
"user__external_user_group_id",
|
||||
type_="foreignkey",
|
||||
)
|
||||
@@ -144,34 +144,27 @@ def upgrade() -> None:
|
||||
|
||||
def downgrade() -> None:
|
||||
op.execute("TRUNCATE TABLE index_attempt")
|
||||
conn = op.get_bind()
|
||||
inspector = sa.inspect(conn)
|
||||
existing_columns = {col["name"] for col in inspector.get_columns("index_attempt")}
|
||||
|
||||
if "input_type" not in existing_columns:
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column("input_type", sa.VARCHAR(), autoincrement=False, nullable=False),
|
||||
)
|
||||
|
||||
if "source" not in existing_columns:
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column("source", sa.VARCHAR(), autoincrement=False, nullable=False),
|
||||
)
|
||||
|
||||
if "connector_specific_config" not in existing_columns:
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column(
|
||||
"connector_specific_config",
|
||||
postgresql.JSONB(astext_type=sa.Text()),
|
||||
autoincrement=False,
|
||||
nullable=False,
|
||||
),
|
||||
)
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column("input_type", sa.VARCHAR(), autoincrement=False, nullable=False),
|
||||
)
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column("source", sa.VARCHAR(), autoincrement=False, nullable=False),
|
||||
)
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column(
|
||||
"connector_specific_config",
|
||||
postgresql.JSONB(astext_type=sa.Text()),
|
||||
autoincrement=False,
|
||||
nullable=False,
|
||||
),
|
||||
)
|
||||
|
||||
# Check if the constraint exists before dropping
|
||||
conn = op.get_bind()
|
||||
inspector = sa.inspect(conn)
|
||||
constraints = inspector.get_foreign_keys("index_attempt")
|
||||
|
||||
if any(
|
||||
@@ -190,12 +183,8 @@ def downgrade() -> None:
|
||||
"fk_index_attempt_connector_id", "index_attempt", type_="foreignkey"
|
||||
)
|
||||
|
||||
if "credential_id" in existing_columns:
|
||||
op.drop_column("index_attempt", "credential_id")
|
||||
|
||||
if "connector_id" in existing_columns:
|
||||
op.drop_column("index_attempt", "connector_id")
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS connector_credential_pair CASCADE")
|
||||
op.execute("DROP TABLE IF EXISTS credential CASCADE")
|
||||
op.execute("DROP TABLE IF EXISTS connector CASCADE")
|
||||
op.drop_column("index_attempt", "credential_id")
|
||||
op.drop_column("index_attempt", "connector_id")
|
||||
op.drop_table("connector_credential_pair")
|
||||
op.drop_table("credential")
|
||||
op.drop_table("connector")
|
||||
|
||||
@@ -1,115 +0,0 @@
|
||||
"""add_indexing_coordination
|
||||
|
||||
Revision ID: 2f95e36923e6
|
||||
Revises: 0816326d83aa
|
||||
Create Date: 2025-07-10 16:17:57.762182
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "2f95e36923e6"
|
||||
down_revision = "0816326d83aa"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Add database-based coordination fields (replacing Redis fencing)
|
||||
op.add_column(
|
||||
"index_attempt", sa.Column("celery_task_id", sa.String(), nullable=True)
|
||||
)
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column(
|
||||
"cancellation_requested",
|
||||
sa.Boolean(),
|
||||
nullable=False,
|
||||
server_default="false",
|
||||
),
|
||||
)
|
||||
|
||||
# Add batch coordination fields (replacing FileStore state)
|
||||
op.add_column(
|
||||
"index_attempt", sa.Column("total_batches", sa.Integer(), nullable=True)
|
||||
)
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column(
|
||||
"completed_batches", sa.Integer(), nullable=False, server_default="0"
|
||||
),
|
||||
)
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column(
|
||||
"total_failures_batch_level",
|
||||
sa.Integer(),
|
||||
nullable=False,
|
||||
server_default="0",
|
||||
),
|
||||
)
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column("total_chunks", sa.Integer(), nullable=False, server_default="0"),
|
||||
)
|
||||
|
||||
# Progress tracking for stall detection
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column("last_progress_time", sa.DateTime(timezone=True), nullable=True),
|
||||
)
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column(
|
||||
"last_batches_completed_count",
|
||||
sa.Integer(),
|
||||
nullable=False,
|
||||
server_default="0",
|
||||
),
|
||||
)
|
||||
|
||||
# Heartbeat tracking for worker liveness detection
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column(
|
||||
"heartbeat_counter", sa.Integer(), nullable=False, server_default="0"
|
||||
),
|
||||
)
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column(
|
||||
"last_heartbeat_value", sa.Integer(), nullable=False, server_default="0"
|
||||
),
|
||||
)
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column("last_heartbeat_time", sa.DateTime(timezone=True), nullable=True),
|
||||
)
|
||||
|
||||
# Add index for coordination queries
|
||||
op.create_index(
|
||||
"ix_index_attempt_active_coordination",
|
||||
"index_attempt",
|
||||
["connector_credential_pair_id", "search_settings_id", "status"],
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Remove the new index
|
||||
op.drop_index("ix_index_attempt_active_coordination", table_name="index_attempt")
|
||||
|
||||
# Remove the new columns
|
||||
op.drop_column("index_attempt", "last_batches_completed_count")
|
||||
op.drop_column("index_attempt", "last_progress_time")
|
||||
op.drop_column("index_attempt", "last_heartbeat_time")
|
||||
op.drop_column("index_attempt", "last_heartbeat_value")
|
||||
op.drop_column("index_attempt", "heartbeat_counter")
|
||||
op.drop_column("index_attempt", "total_chunks")
|
||||
op.drop_column("index_attempt", "total_failures_batch_level")
|
||||
op.drop_column("index_attempt", "completed_batches")
|
||||
op.drop_column("index_attempt", "total_batches")
|
||||
op.drop_column("index_attempt", "cancellation_requested")
|
||||
op.drop_column("index_attempt", "celery_task_id")
|
||||
@@ -1,136 +0,0 @@
|
||||
"""update_kg_trigger_functions
|
||||
|
||||
Revision ID: 36e9220ab794
|
||||
Revises: c9e2cd766c29
|
||||
Create Date: 2025-06-22 17:33:25.833733
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import text
|
||||
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "36e9220ab794"
|
||||
down_revision = "c9e2cd766c29"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def _get_tenant_contextvar(session: Session) -> str:
|
||||
"""Get the current schema for the migration"""
|
||||
current_tenant = session.execute(text("SELECT current_schema()")).scalar()
|
||||
if isinstance(current_tenant, str):
|
||||
return current_tenant
|
||||
else:
|
||||
raise ValueError("Current tenant is not a string")
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
|
||||
bind = op.get_bind()
|
||||
session = Session(bind=bind)
|
||||
|
||||
# Create kg_entity trigger to update kg_entity.name and its trigrams
|
||||
tenant_id = _get_tenant_contextvar(session)
|
||||
alphanum_pattern = r"[^a-z0-9]+"
|
||||
truncate_length = 1000
|
||||
function = "update_kg_entity_name"
|
||||
op.execute(
|
||||
text(
|
||||
f"""
|
||||
CREATE OR REPLACE FUNCTION "{tenant_id}".{function}()
|
||||
RETURNS TRIGGER AS $$
|
||||
DECLARE
|
||||
name text;
|
||||
cleaned_name text;
|
||||
BEGIN
|
||||
-- Set name to semantic_id if document_id is not NULL
|
||||
IF NEW.document_id IS NOT NULL THEN
|
||||
SELECT lower(semantic_id) INTO name
|
||||
FROM "{tenant_id}".document
|
||||
WHERE id = NEW.document_id;
|
||||
ELSE
|
||||
name = lower(NEW.name);
|
||||
END IF;
|
||||
|
||||
-- Clean name and truncate if too long
|
||||
cleaned_name = regexp_replace(
|
||||
name,
|
||||
'{alphanum_pattern}', '', 'g'
|
||||
);
|
||||
IF length(cleaned_name) > {truncate_length} THEN
|
||||
cleaned_name = left(cleaned_name, {truncate_length});
|
||||
END IF;
|
||||
|
||||
-- Set name and name trigrams
|
||||
NEW.name = name;
|
||||
NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name);
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
"""
|
||||
)
|
||||
)
|
||||
trigger = f"{function}_trigger"
|
||||
op.execute(f'DROP TRIGGER IF EXISTS {trigger} ON "{tenant_id}".kg_entity')
|
||||
op.execute(
|
||||
f"""
|
||||
CREATE TRIGGER {trigger}
|
||||
BEFORE INSERT OR UPDATE OF name
|
||||
ON "{tenant_id}".kg_entity
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION "{tenant_id}".{function}();
|
||||
"""
|
||||
)
|
||||
|
||||
# Create kg_entity trigger to update kg_entity.name and its trigrams
|
||||
function = "update_kg_entity_name_from_doc"
|
||||
op.execute(
|
||||
text(
|
||||
f"""
|
||||
CREATE OR REPLACE FUNCTION "{tenant_id}".{function}()
|
||||
RETURNS TRIGGER AS $$
|
||||
DECLARE
|
||||
doc_name text;
|
||||
cleaned_name text;
|
||||
BEGIN
|
||||
doc_name = lower(NEW.semantic_id);
|
||||
|
||||
-- Clean name and truncate if too long
|
||||
cleaned_name = regexp_replace(
|
||||
doc_name,
|
||||
'{alphanum_pattern}', '', 'g'
|
||||
);
|
||||
IF length(cleaned_name) > {truncate_length} THEN
|
||||
cleaned_name = left(cleaned_name, {truncate_length});
|
||||
END IF;
|
||||
|
||||
-- Set name and name trigrams for all entities referencing this document
|
||||
UPDATE "{tenant_id}".kg_entity
|
||||
SET
|
||||
name = doc_name,
|
||||
name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name)
|
||||
WHERE document_id = NEW.id;
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
"""
|
||||
)
|
||||
)
|
||||
trigger = f"{function}_trigger"
|
||||
op.execute(f'DROP TRIGGER IF EXISTS {trigger} ON "{tenant_id}".document')
|
||||
op.execute(
|
||||
f"""
|
||||
CREATE TRIGGER {trigger}
|
||||
AFTER UPDATE OF semantic_id
|
||||
ON "{tenant_id}".document
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION "{tenant_id}".{function}();
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
pass
|
||||
@@ -21,14 +21,22 @@ depends_on = None
|
||||
# an outage by creating an index without using CONCURRENTLY. This migration:
|
||||
#
|
||||
# 1. Creates more efficient full-text search capabilities using tsvector columns and GIN indexes
|
||||
# 2. Adds indexes to both chat_message and chat_session tables for comprehensive search
|
||||
# 3. Note: CONCURRENTLY was removed due to operational issues
|
||||
# 2. Uses CONCURRENTLY for all index creation to prevent table locking
|
||||
# 3. Explicitly manages transactions with COMMIT statements to allow CONCURRENTLY to work
|
||||
# (see: https://www.postgresql.org/docs/9.4/sql-createindex.html#SQL-CREATEINDEX-CONCURRENTLY)
|
||||
# (see: https://github.com/sqlalchemy/alembic/issues/277)
|
||||
# 4. Adds indexes to both chat_message and chat_session tables for comprehensive search
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# First, drop any existing indexes to avoid conflicts
|
||||
op.execute("DROP INDEX IF EXISTS idx_chat_message_tsv;")
|
||||
op.execute("DROP INDEX IF EXISTS idx_chat_session_desc_tsv;")
|
||||
op.execute("COMMIT")
|
||||
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_message_tsv;")
|
||||
|
||||
op.execute("COMMIT")
|
||||
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_session_desc_tsv;")
|
||||
|
||||
op.execute("COMMIT")
|
||||
op.execute("DROP INDEX IF EXISTS idx_chat_message_message_lower;")
|
||||
|
||||
# Drop existing columns if they exist
|
||||
@@ -44,9 +52,12 @@ def upgrade() -> None:
|
||||
"""
|
||||
)
|
||||
|
||||
# Commit the current transaction before creating concurrent indexes
|
||||
op.execute("COMMIT")
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS idx_chat_message_tsv
|
||||
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_message_tsv
|
||||
ON chat_message
|
||||
USING GIN (message_tsv)
|
||||
"""
|
||||
@@ -61,9 +72,12 @@ def upgrade() -> None:
|
||||
"""
|
||||
)
|
||||
|
||||
# Commit again before creating the second concurrent index
|
||||
op.execute("COMMIT")
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
CREATE INDEX IF NOT EXISTS idx_chat_session_desc_tsv
|
||||
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_session_desc_tsv
|
||||
ON chat_session
|
||||
USING GIN (description_tsv)
|
||||
"""
|
||||
@@ -71,9 +85,12 @@ def upgrade() -> None:
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop the indexes first
|
||||
op.execute("DROP INDEX IF EXISTS idx_chat_message_tsv;")
|
||||
op.execute("DROP INDEX IF EXISTS idx_chat_session_desc_tsv;")
|
||||
# Drop the indexes first (use CONCURRENTLY for dropping too)
|
||||
op.execute("COMMIT")
|
||||
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_message_tsv;")
|
||||
|
||||
op.execute("COMMIT")
|
||||
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_session_desc_tsv;")
|
||||
|
||||
# Then drop the columns
|
||||
op.execute("ALTER TABLE chat_message DROP COLUMN IF EXISTS message_tsv;")
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
"""add_doc_metadata_field_in_document_model
|
||||
|
||||
Revision ID: 3fc5d75723b3
|
||||
Revises: 2f95e36923e6
|
||||
Create Date: 2025-07-28 18:45:37.985406
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "3fc5d75723b3"
|
||||
down_revision = "2f95e36923e6"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"document",
|
||||
sa.Column(
|
||||
"doc_metadata", postgresql.JSONB(astext_type=sa.Text()), nullable=True
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("document", "doc_metadata")
|
||||
@@ -1,150 +0,0 @@
|
||||
"""Fix invalid model-configurations state
|
||||
|
||||
Revision ID: 47a07e1a38f1
|
||||
Revises: 7a70b7664e37
|
||||
Create Date: 2025-04-23 15:39:43.159504
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
from onyx.llm.llm_provider_options import (
|
||||
fetch_model_names_for_provider_as_set,
|
||||
fetch_visible_model_names_for_provider_as_set,
|
||||
)
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "47a07e1a38f1"
|
||||
down_revision = "7a70b7664e37"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
class _SimpleModelConfiguration(BaseModel):
|
||||
# Configure model to read from attributes
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
id: int
|
||||
llm_provider_id: int
|
||||
name: str
|
||||
is_visible: bool
|
||||
max_input_tokens: int | None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
llm_provider_table = sa.sql.table(
|
||||
"llm_provider",
|
||||
sa.column("id", sa.Integer),
|
||||
sa.column("provider", sa.String),
|
||||
sa.column("model_names", postgresql.ARRAY(sa.String)),
|
||||
sa.column("display_model_names", postgresql.ARRAY(sa.String)),
|
||||
sa.column("default_model_name", sa.String),
|
||||
sa.column("fast_default_model_name", sa.String),
|
||||
)
|
||||
model_configuration_table = sa.sql.table(
|
||||
"model_configuration",
|
||||
sa.column("id", sa.Integer),
|
||||
sa.column("llm_provider_id", sa.Integer),
|
||||
sa.column("name", sa.String),
|
||||
sa.column("is_visible", sa.Boolean),
|
||||
sa.column("max_input_tokens", sa.Integer),
|
||||
)
|
||||
|
||||
connection = op.get_bind()
|
||||
|
||||
llm_providers = connection.execute(
|
||||
sa.select(
|
||||
llm_provider_table.c.id,
|
||||
llm_provider_table.c.provider,
|
||||
)
|
||||
).fetchall()
|
||||
|
||||
for llm_provider in llm_providers:
|
||||
llm_provider_id, provider_name = llm_provider
|
||||
|
||||
default_models = fetch_model_names_for_provider_as_set(provider_name)
|
||||
display_models = fetch_visible_model_names_for_provider_as_set(
|
||||
provider_name=provider_name
|
||||
)
|
||||
|
||||
# if `fetch_model_names_for_provider_as_set` returns `None`, then
|
||||
# that means that `provider_name` is not a well-known llm provider.
|
||||
if not default_models:
|
||||
continue
|
||||
|
||||
if not display_models:
|
||||
raise RuntimeError(
|
||||
"If `default_models` is non-None, `display_models` must be non-None too."
|
||||
)
|
||||
|
||||
model_configurations = [
|
||||
_SimpleModelConfiguration.model_validate(model_configuration)
|
||||
for model_configuration in connection.execute(
|
||||
sa.select(
|
||||
model_configuration_table.c.id,
|
||||
model_configuration_table.c.llm_provider_id,
|
||||
model_configuration_table.c.name,
|
||||
model_configuration_table.c.is_visible,
|
||||
model_configuration_table.c.max_input_tokens,
|
||||
).where(model_configuration_table.c.llm_provider_id == llm_provider_id)
|
||||
).fetchall()
|
||||
]
|
||||
|
||||
if model_configurations:
|
||||
at_least_one_is_visible = any(
|
||||
[
|
||||
model_configuration.is_visible
|
||||
for model_configuration in model_configurations
|
||||
]
|
||||
)
|
||||
|
||||
# If there is at least one model which is public, this is a valid state.
|
||||
# Therefore, don't touch it and move on to the next one.
|
||||
if at_least_one_is_visible:
|
||||
continue
|
||||
|
||||
existing_visible_model_names: set[str] = set(
|
||||
[
|
||||
model_configuration.name
|
||||
for model_configuration in model_configurations
|
||||
if model_configuration.is_visible
|
||||
]
|
||||
)
|
||||
|
||||
difference = display_models.difference(existing_visible_model_names)
|
||||
|
||||
for model_name in difference:
|
||||
if not model_name:
|
||||
continue
|
||||
|
||||
insert_statement = postgresql.insert(model_configuration_table).values(
|
||||
llm_provider_id=llm_provider_id,
|
||||
name=model_name,
|
||||
is_visible=True,
|
||||
max_input_tokens=None,
|
||||
)
|
||||
|
||||
connection.execute(
|
||||
insert_statement.on_conflict_do_update(
|
||||
index_elements=["llm_provider_id", "name"],
|
||||
set_={"is_visible": insert_statement.excluded.is_visible},
|
||||
)
|
||||
)
|
||||
else:
|
||||
for model_name in default_models:
|
||||
connection.execute(
|
||||
model_configuration_table.insert().values(
|
||||
llm_provider_id=llm_provider_id,
|
||||
name=model_name,
|
||||
is_visible=model_name in display_models,
|
||||
max_input_tokens=None,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
pass
|
||||
@@ -1,691 +0,0 @@
|
||||
"""create knowledge graph tables
|
||||
|
||||
Revision ID: 495cb26ce93e
|
||||
Revises: ca04500b9ee8
|
||||
Create Date: 2025-03-19 08:51:14.341989
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
from sqlalchemy import text
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from onyx.configs.app_configs import DB_READONLY_USER
|
||||
from onyx.configs.app_configs import DB_READONLY_PASSWORD
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "495cb26ce93e"
|
||||
down_revision = "ca04500b9ee8"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
|
||||
# Create a new permission-less user to be later used for knowledge graph queries.
|
||||
# The user will later get temporary read privileges for a specific view that will be
|
||||
# ad hoc generated specific to a knowledge graph query.
|
||||
#
|
||||
# Note: in order for the migration to run, the DB_READONLY_USER and DB_READONLY_PASSWORD
|
||||
# environment variables MUST be set. Otherwise, an exception will be raised.
|
||||
|
||||
if not MULTI_TENANT:
|
||||
|
||||
# Enable pg_trgm extension if not already enabled
|
||||
op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")
|
||||
|
||||
# Create read-only db user here only in single tenant mode. For multi-tenant mode,
|
||||
# the user is created in the alembic_tenants migration.
|
||||
if not (DB_READONLY_USER and DB_READONLY_PASSWORD):
|
||||
raise Exception("DB_READONLY_USER or DB_READONLY_PASSWORD is not set")
|
||||
|
||||
op.execute(
|
||||
text(
|
||||
f"""
|
||||
DO $$
|
||||
BEGIN
|
||||
-- Check if the read-only user already exists
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
|
||||
-- Create the read-only user with the specified password
|
||||
EXECUTE format('CREATE USER %I WITH PASSWORD %L', '{DB_READONLY_USER}', '{DB_READONLY_PASSWORD}');
|
||||
-- First revoke all privileges to ensure a clean slate
|
||||
EXECUTE format('REVOKE ALL ON DATABASE %I FROM %I', current_database(), '{DB_READONLY_USER}');
|
||||
-- Grant only the CONNECT privilege to allow the user to connect to the database
|
||||
-- but not perform any operations without additional specific grants
|
||||
EXECUTE format('GRANT CONNECT ON DATABASE %I TO %I', current_database(), '{DB_READONLY_USER}');
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
# Grant usage on current schema to readonly user
|
||||
op.execute(
|
||||
text(
|
||||
f"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
|
||||
EXECUTE format('GRANT USAGE ON SCHEMA %I TO %I', current_schema(), '{DB_READONLY_USER}');
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_config CASCADE")
|
||||
op.create_table(
|
||||
"kg_config",
|
||||
sa.Column("id", sa.Integer(), primary_key=True, nullable=False, index=True),
|
||||
sa.Column("kg_variable_name", sa.String(), nullable=False, index=True),
|
||||
sa.Column("kg_variable_values", postgresql.ARRAY(sa.String()), nullable=False),
|
||||
sa.UniqueConstraint("kg_variable_name", name="uq_kg_config_variable_name"),
|
||||
)
|
||||
|
||||
# Insert initial data into kg_config table
|
||||
op.bulk_insert(
|
||||
sa.table(
|
||||
"kg_config",
|
||||
sa.column("kg_variable_name", sa.String),
|
||||
sa.column("kg_variable_values", postgresql.ARRAY(sa.String)),
|
||||
),
|
||||
[
|
||||
{"kg_variable_name": "KG_EXPOSED", "kg_variable_values": ["false"]},
|
||||
{"kg_variable_name": "KG_ENABLED", "kg_variable_values": ["false"]},
|
||||
{"kg_variable_name": "KG_VENDOR", "kg_variable_values": []},
|
||||
{"kg_variable_name": "KG_VENDOR_DOMAINS", "kg_variable_values": []},
|
||||
{"kg_variable_name": "KG_IGNORE_EMAIL_DOMAINS", "kg_variable_values": []},
|
||||
{
|
||||
"kg_variable_name": "KG_EXTRACTION_IN_PROGRESS",
|
||||
"kg_variable_values": ["false"],
|
||||
},
|
||||
{
|
||||
"kg_variable_name": "KG_CLUSTERING_IN_PROGRESS",
|
||||
"kg_variable_values": ["false"],
|
||||
},
|
||||
{
|
||||
"kg_variable_name": "KG_COVERAGE_START",
|
||||
"kg_variable_values": [
|
||||
(datetime.now() - timedelta(days=90)).strftime("%Y-%m-%d")
|
||||
],
|
||||
},
|
||||
{"kg_variable_name": "KG_MAX_COVERAGE_DAYS", "kg_variable_values": ["90"]},
|
||||
{
|
||||
"kg_variable_name": "KG_MAX_PARENT_RECURSION_DEPTH",
|
||||
"kg_variable_values": ["2"],
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_entity_type CASCADE")
|
||||
op.create_table(
|
||||
"kg_entity_type",
|
||||
sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
|
||||
sa.Column("description", sa.String(), nullable=True),
|
||||
sa.Column("grounding", sa.String(), nullable=False),
|
||||
sa.Column(
|
||||
"attributes",
|
||||
postgresql.JSONB,
|
||||
nullable=False,
|
||||
server_default="{}",
|
||||
),
|
||||
sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
|
||||
sa.Column("active", sa.Boolean(), nullable=False, default=False),
|
||||
sa.Column("deep_extraction", sa.Boolean(), nullable=False, default=False),
|
||||
sa.Column(
|
||||
"time_updated",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
onupdate=sa.text("now()"),
|
||||
),
|
||||
sa.Column(
|
||||
"time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
|
||||
),
|
||||
sa.Column("grounded_source_name", sa.String(), nullable=True),
|
||||
sa.Column("entity_values", postgresql.ARRAY(sa.String()), nullable=True),
|
||||
sa.Column(
|
||||
"clustering",
|
||||
postgresql.JSONB,
|
||||
nullable=False,
|
||||
server_default="{}",
|
||||
),
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_relationship_type CASCADE")
|
||||
# Create KGRelationshipType table
|
||||
op.create_table(
|
||||
"kg_relationship_type",
|
||||
sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
|
||||
sa.Column("name", sa.String(), nullable=False, index=True),
|
||||
sa.Column(
|
||||
"source_entity_type_id_name", sa.String(), nullable=False, index=True
|
||||
),
|
||||
sa.Column(
|
||||
"target_entity_type_id_name", sa.String(), nullable=False, index=True
|
||||
),
|
||||
sa.Column("definition", sa.Boolean(), nullable=False, default=False),
|
||||
sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
|
||||
sa.Column("type", sa.String(), nullable=False, index=True),
|
||||
sa.Column("active", sa.Boolean(), nullable=False, default=True),
|
||||
sa.Column(
|
||||
"time_updated",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
onupdate=sa.text("now()"),
|
||||
),
|
||||
sa.Column(
|
||||
"time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
|
||||
),
|
||||
sa.Column(
|
||||
"clustering",
|
||||
postgresql.JSONB,
|
||||
nullable=False,
|
||||
server_default="{}",
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["source_entity_type_id_name"], ["kg_entity_type.id_name"]
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["target_entity_type_id_name"], ["kg_entity_type.id_name"]
|
||||
),
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_relationship_type_extraction_staging CASCADE")
|
||||
# Create KGRelationshipTypeExtractionStaging table
|
||||
op.create_table(
|
||||
"kg_relationship_type_extraction_staging",
|
||||
sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
|
||||
sa.Column("name", sa.String(), nullable=False, index=True),
|
||||
sa.Column(
|
||||
"source_entity_type_id_name", sa.String(), nullable=False, index=True
|
||||
),
|
||||
sa.Column(
|
||||
"target_entity_type_id_name", sa.String(), nullable=False, index=True
|
||||
),
|
||||
sa.Column("definition", sa.Boolean(), nullable=False, default=False),
|
||||
sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
|
||||
sa.Column("type", sa.String(), nullable=False, index=True),
|
||||
sa.Column("active", sa.Boolean(), nullable=False, default=True),
|
||||
sa.Column(
|
||||
"time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
|
||||
),
|
||||
sa.Column(
|
||||
"clustering",
|
||||
postgresql.JSONB,
|
||||
nullable=False,
|
||||
server_default="{}",
|
||||
),
|
||||
sa.Column("transferred", sa.Boolean(), nullable=False, server_default="false"),
|
||||
sa.ForeignKeyConstraint(
|
||||
["source_entity_type_id_name"], ["kg_entity_type.id_name"]
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["target_entity_type_id_name"], ["kg_entity_type.id_name"]
|
||||
),
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_entity CASCADE")
|
||||
|
||||
# Create KGEntity table
|
||||
op.create_table(
|
||||
"kg_entity",
|
||||
sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
|
||||
sa.Column("name", sa.String(), nullable=False, index=True),
|
||||
sa.Column("entity_class", sa.String(), nullable=True, index=True),
|
||||
sa.Column("entity_subtype", sa.String(), nullable=True, index=True),
|
||||
sa.Column("entity_key", sa.String(), nullable=True, index=True),
|
||||
sa.Column("name_trigrams", postgresql.ARRAY(sa.String(3)), nullable=True),
|
||||
sa.Column("document_id", sa.String(), nullable=True, index=True),
|
||||
sa.Column(
|
||||
"alternative_names",
|
||||
postgresql.ARRAY(sa.String()),
|
||||
nullable=False,
|
||||
server_default="{}",
|
||||
),
|
||||
sa.Column("entity_type_id_name", sa.String(), nullable=False, index=True),
|
||||
sa.Column("description", sa.String(), nullable=True),
|
||||
sa.Column(
|
||||
"keywords",
|
||||
postgresql.ARRAY(sa.String()),
|
||||
nullable=False,
|
||||
server_default="{}",
|
||||
),
|
||||
sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
|
||||
sa.Column(
|
||||
"acl", postgresql.ARRAY(sa.String()), nullable=False, server_default="{}"
|
||||
),
|
||||
sa.Column("boosts", postgresql.JSONB, nullable=False, server_default="{}"),
|
||||
sa.Column("attributes", postgresql.JSONB, nullable=False, server_default="{}"),
|
||||
sa.Column("event_time", sa.DateTime(timezone=True), nullable=True),
|
||||
sa.Column(
|
||||
"time_updated",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
onupdate=sa.text("now()"),
|
||||
),
|
||||
sa.Column(
|
||||
"time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
|
||||
),
|
||||
sa.ForeignKeyConstraint(["entity_type_id_name"], ["kg_entity_type.id_name"]),
|
||||
sa.ForeignKeyConstraint(["document_id"], ["document.id"]),
|
||||
sa.UniqueConstraint(
|
||||
"name",
|
||||
"entity_type_id_name",
|
||||
"document_id",
|
||||
name="uq_kg_entity_name_type_doc",
|
||||
),
|
||||
)
|
||||
op.create_index("ix_entity_type_acl", "kg_entity", ["entity_type_id_name", "acl"])
|
||||
op.create_index(
|
||||
"ix_entity_name_search", "kg_entity", ["name", "entity_type_id_name"]
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_entity_extraction_staging CASCADE")
|
||||
# Create KGEntityExtractionStaging table
|
||||
op.create_table(
|
||||
"kg_entity_extraction_staging",
|
||||
sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
|
||||
sa.Column("name", sa.String(), nullable=False, index=True),
|
||||
sa.Column("document_id", sa.String(), nullable=True, index=True),
|
||||
sa.Column(
|
||||
"alternative_names",
|
||||
postgresql.ARRAY(sa.String()),
|
||||
nullable=False,
|
||||
server_default="{}",
|
||||
),
|
||||
sa.Column("entity_type_id_name", sa.String(), nullable=False, index=True),
|
||||
sa.Column("description", sa.String(), nullable=True),
|
||||
sa.Column(
|
||||
"keywords",
|
||||
postgresql.ARRAY(sa.String()),
|
||||
nullable=False,
|
||||
server_default="{}",
|
||||
),
|
||||
sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
|
||||
sa.Column(
|
||||
"acl", postgresql.ARRAY(sa.String()), nullable=False, server_default="{}"
|
||||
),
|
||||
sa.Column("boosts", postgresql.JSONB, nullable=False, server_default="{}"),
|
||||
sa.Column("attributes", postgresql.JSONB, nullable=False, server_default="{}"),
|
||||
sa.Column("transferred_id_name", sa.String(), nullable=True, default=None),
|
||||
sa.Column("entity_class", sa.String(), nullable=True, index=True),
|
||||
sa.Column("entity_key", sa.String(), nullable=True, index=True),
|
||||
sa.Column("entity_subtype", sa.String(), nullable=True, index=True),
|
||||
sa.Column("parent_key", sa.String(), nullable=True, index=True),
|
||||
sa.Column("event_time", sa.DateTime(timezone=True), nullable=True),
|
||||
sa.Column(
|
||||
"time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
|
||||
),
|
||||
sa.ForeignKeyConstraint(["entity_type_id_name"], ["kg_entity_type.id_name"]),
|
||||
sa.ForeignKeyConstraint(["document_id"], ["document.id"]),
|
||||
)
|
||||
op.create_index(
|
||||
"ix_entity_extraction_staging_acl",
|
||||
"kg_entity_extraction_staging",
|
||||
["entity_type_id_name", "acl"],
|
||||
)
|
||||
op.create_index(
|
||||
"ix_entity_extraction_staging_name_search",
|
||||
"kg_entity_extraction_staging",
|
||||
["name", "entity_type_id_name"],
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_relationship CASCADE")
|
||||
# Create KGRelationship table
|
||||
op.create_table(
|
||||
"kg_relationship",
|
||||
sa.Column("id_name", sa.String(), nullable=False, index=True),
|
||||
sa.Column("source_node", sa.String(), nullable=False, index=True),
|
||||
sa.Column("target_node", sa.String(), nullable=False, index=True),
|
||||
sa.Column("source_node_type", sa.String(), nullable=False, index=True),
|
||||
sa.Column("target_node_type", sa.String(), nullable=False, index=True),
|
||||
sa.Column("source_document", sa.String(), nullable=True, index=True),
|
||||
sa.Column("type", sa.String(), nullable=False, index=True),
|
||||
sa.Column("relationship_type_id_name", sa.String(), nullable=False, index=True),
|
||||
sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
|
||||
sa.Column(
|
||||
"time_updated",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
onupdate=sa.text("now()"),
|
||||
),
|
||||
sa.Column(
|
||||
"time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
|
||||
),
|
||||
sa.ForeignKeyConstraint(["source_node"], ["kg_entity.id_name"]),
|
||||
sa.ForeignKeyConstraint(["target_node"], ["kg_entity.id_name"]),
|
||||
sa.ForeignKeyConstraint(["source_node_type"], ["kg_entity_type.id_name"]),
|
||||
sa.ForeignKeyConstraint(["target_node_type"], ["kg_entity_type.id_name"]),
|
||||
sa.ForeignKeyConstraint(["source_document"], ["document.id"]),
|
||||
sa.ForeignKeyConstraint(
|
||||
["relationship_type_id_name"], ["kg_relationship_type.id_name"]
|
||||
),
|
||||
sa.UniqueConstraint(
|
||||
"source_node",
|
||||
"target_node",
|
||||
"type",
|
||||
name="uq_kg_relationship_source_target_type",
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id_name", "source_document"),
|
||||
)
|
||||
op.create_index(
|
||||
"ix_kg_relationship_nodes", "kg_relationship", ["source_node", "target_node"]
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_relationship_extraction_staging CASCADE")
|
||||
# Create KGRelationshipExtractionStaging table
|
||||
op.create_table(
|
||||
"kg_relationship_extraction_staging",
|
||||
sa.Column("id_name", sa.String(), nullable=False, index=True),
|
||||
sa.Column("source_node", sa.String(), nullable=False, index=True),
|
||||
sa.Column("target_node", sa.String(), nullable=False, index=True),
|
||||
sa.Column("source_node_type", sa.String(), nullable=False, index=True),
|
||||
sa.Column("target_node_type", sa.String(), nullable=False, index=True),
|
||||
sa.Column("source_document", sa.String(), nullable=True, index=True),
|
||||
sa.Column("type", sa.String(), nullable=False, index=True),
|
||||
sa.Column("relationship_type_id_name", sa.String(), nullable=False, index=True),
|
||||
sa.Column("occurrences", sa.Integer(), server_default="1", nullable=False),
|
||||
sa.Column("transferred", sa.Boolean(), nullable=False, server_default="false"),
|
||||
sa.Column(
|
||||
"time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["source_node"], ["kg_entity_extraction_staging.id_name"]
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["target_node"], ["kg_entity_extraction_staging.id_name"]
|
||||
),
|
||||
sa.ForeignKeyConstraint(["source_node_type"], ["kg_entity_type.id_name"]),
|
||||
sa.ForeignKeyConstraint(["target_node_type"], ["kg_entity_type.id_name"]),
|
||||
sa.ForeignKeyConstraint(["source_document"], ["document.id"]),
|
||||
sa.ForeignKeyConstraint(
|
||||
["relationship_type_id_name"],
|
||||
["kg_relationship_type_extraction_staging.id_name"],
|
||||
),
|
||||
sa.UniqueConstraint(
|
||||
"source_node",
|
||||
"target_node",
|
||||
"type",
|
||||
name="uq_kg_relationship_extraction_staging_source_target_type",
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id_name", "source_document"),
|
||||
)
|
||||
op.create_index(
|
||||
"ix_kg_relationship_extraction_staging_nodes",
|
||||
"kg_relationship_extraction_staging",
|
||||
["source_node", "target_node"],
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_term CASCADE")
|
||||
# Create KGTerm table
|
||||
op.create_table(
|
||||
"kg_term",
|
||||
sa.Column("id_term", sa.String(), primary_key=True, nullable=False, index=True),
|
||||
sa.Column(
|
||||
"entity_types",
|
||||
postgresql.ARRAY(sa.String()),
|
||||
nullable=False,
|
||||
server_default="{}",
|
||||
),
|
||||
sa.Column(
|
||||
"time_updated",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
onupdate=sa.text("now()"),
|
||||
),
|
||||
sa.Column(
|
||||
"time_created", sa.DateTime(timezone=True), server_default=sa.text("now()")
|
||||
),
|
||||
)
|
||||
op.create_index("ix_search_term_entities", "kg_term", ["entity_types"])
|
||||
op.create_index("ix_search_term_term", "kg_term", ["id_term"])
|
||||
|
||||
op.add_column(
|
||||
"document",
|
||||
sa.Column("kg_stage", sa.String(), nullable=True, index=True),
|
||||
)
|
||||
op.add_column(
|
||||
"document",
|
||||
sa.Column("kg_processing_time", sa.DateTime(timezone=True), nullable=True),
|
||||
)
|
||||
op.add_column(
|
||||
"connector",
|
||||
sa.Column(
|
||||
"kg_processing_enabled",
|
||||
sa.Boolean(),
|
||||
nullable=True,
|
||||
server_default="false",
|
||||
),
|
||||
)
|
||||
|
||||
op.add_column(
|
||||
"connector",
|
||||
sa.Column(
|
||||
"kg_coverage_days",
|
||||
sa.Integer(),
|
||||
nullable=True,
|
||||
server_default=None,
|
||||
),
|
||||
)
|
||||
|
||||
# Create GIN index for clustering and normalization
|
||||
op.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_kg_entity_clustering_trigrams "
|
||||
f"ON kg_entity USING GIN (name {POSTGRES_DEFAULT_SCHEMA}.gin_trgm_ops)"
|
||||
)
|
||||
op.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_kg_entity_normalization_trigrams "
|
||||
"ON kg_entity USING GIN (name_trigrams)"
|
||||
)
|
||||
|
||||
# Create kg_entity trigger to update kg_entity.name and its trigrams
|
||||
alphanum_pattern = r"[^a-z0-9]+"
|
||||
truncate_length = 1000
|
||||
function = "update_kg_entity_name"
|
||||
op.execute(
|
||||
text(
|
||||
f"""
|
||||
CREATE OR REPLACE FUNCTION {function}()
|
||||
RETURNS TRIGGER AS $$
|
||||
DECLARE
|
||||
name text;
|
||||
cleaned_name text;
|
||||
BEGIN
|
||||
-- Set name to semantic_id if document_id is not NULL
|
||||
IF NEW.document_id IS NOT NULL THEN
|
||||
SELECT lower(semantic_id) INTO name
|
||||
FROM document
|
||||
WHERE id = NEW.document_id;
|
||||
ELSE
|
||||
name = lower(NEW.name);
|
||||
END IF;
|
||||
|
||||
-- Clean name and truncate if too long
|
||||
cleaned_name = regexp_replace(
|
||||
name,
|
||||
'{alphanum_pattern}', '', 'g'
|
||||
);
|
||||
IF length(cleaned_name) > {truncate_length} THEN
|
||||
cleaned_name = left(cleaned_name, {truncate_length});
|
||||
END IF;
|
||||
|
||||
-- Set name and name trigrams
|
||||
NEW.name = name;
|
||||
NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name);
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
"""
|
||||
)
|
||||
)
|
||||
trigger = f"{function}_trigger"
|
||||
op.execute(f"DROP TRIGGER IF EXISTS {trigger} ON kg_entity")
|
||||
op.execute(
|
||||
f"""
|
||||
CREATE TRIGGER {trigger}
|
||||
BEFORE INSERT OR UPDATE OF name
|
||||
ON kg_entity
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION {function}();
|
||||
"""
|
||||
)
|
||||
|
||||
# Create kg_entity trigger to update kg_entity.name and its trigrams
|
||||
function = "update_kg_entity_name_from_doc"
|
||||
op.execute(
|
||||
text(
|
||||
f"""
|
||||
CREATE OR REPLACE FUNCTION {function}()
|
||||
RETURNS TRIGGER AS $$
|
||||
DECLARE
|
||||
doc_name text;
|
||||
cleaned_name text;
|
||||
BEGIN
|
||||
doc_name = lower(NEW.semantic_id);
|
||||
|
||||
-- Clean name and truncate if too long
|
||||
cleaned_name = regexp_replace(
|
||||
doc_name,
|
||||
'{alphanum_pattern}', '', 'g'
|
||||
);
|
||||
IF length(cleaned_name) > {truncate_length} THEN
|
||||
cleaned_name = left(cleaned_name, {truncate_length});
|
||||
END IF;
|
||||
|
||||
-- Set name and name trigrams for all entities referencing this document
|
||||
UPDATE kg_entity
|
||||
SET
|
||||
name = doc_name,
|
||||
name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name)
|
||||
WHERE document_id = NEW.id;
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
"""
|
||||
)
|
||||
)
|
||||
trigger = f"{function}_trigger"
|
||||
op.execute(f"DROP TRIGGER IF EXISTS {trigger} ON document")
|
||||
op.execute(
|
||||
f"""
|
||||
CREATE TRIGGER {trigger}
|
||||
AFTER UPDATE OF semantic_id
|
||||
ON document
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION {function}();
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
|
||||
# Drop all views that start with 'kg_'
|
||||
op.execute(
|
||||
"""
|
||||
DO $$
|
||||
DECLARE
|
||||
view_name text;
|
||||
BEGIN
|
||||
FOR view_name IN
|
||||
SELECT c.relname
|
||||
FROM pg_catalog.pg_class c
|
||||
JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
|
||||
WHERE c.relkind = 'v'
|
||||
AND n.nspname = current_schema()
|
||||
AND c.relname LIKE 'kg_relationships_with_access%'
|
||||
LOOP
|
||||
EXECUTE 'DROP VIEW IF EXISTS ' || quote_ident(view_name);
|
||||
END LOOP;
|
||||
END $$;
|
||||
"""
|
||||
)
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
DO $$
|
||||
DECLARE
|
||||
view_name text;
|
||||
BEGIN
|
||||
FOR view_name IN
|
||||
SELECT c.relname
|
||||
FROM pg_catalog.pg_class c
|
||||
JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace
|
||||
WHERE c.relkind = 'v'
|
||||
AND n.nspname = current_schema()
|
||||
AND c.relname LIKE 'allowed_docs%'
|
||||
LOOP
|
||||
EXECUTE 'DROP VIEW IF EXISTS ' || quote_ident(view_name);
|
||||
END LOOP;
|
||||
END $$;
|
||||
"""
|
||||
)
|
||||
|
||||
for table, function in (
|
||||
("kg_entity", "update_kg_entity_name"),
|
||||
("document", "update_kg_entity_name_from_doc"),
|
||||
):
|
||||
op.execute(f"DROP TRIGGER IF EXISTS {function}_trigger ON {table}")
|
||||
op.execute(f"DROP FUNCTION IF EXISTS {function}()")
|
||||
|
||||
# Drop index
|
||||
op.execute("DROP INDEX IF EXISTS idx_kg_entity_clustering_trigrams")
|
||||
op.execute("DROP INDEX IF EXISTS idx_kg_entity_normalization_trigrams")
|
||||
|
||||
# Drop tables in reverse order of creation to handle dependencies
|
||||
op.drop_table("kg_term")
|
||||
op.drop_table("kg_relationship")
|
||||
op.drop_table("kg_entity")
|
||||
op.drop_table("kg_relationship_type")
|
||||
op.drop_table("kg_relationship_extraction_staging")
|
||||
op.drop_table("kg_relationship_type_extraction_staging")
|
||||
op.drop_table("kg_entity_extraction_staging")
|
||||
op.drop_table("kg_entity_type")
|
||||
op.drop_column("connector", "kg_processing_enabled")
|
||||
op.drop_column("connector", "kg_coverage_days")
|
||||
op.drop_column("document", "kg_stage")
|
||||
op.drop_column("document", "kg_processing_time")
|
||||
op.drop_table("kg_config")
|
||||
|
||||
# Revoke usage on current schema for the readonly user
|
||||
op.execute(
|
||||
text(
|
||||
f"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
|
||||
EXECUTE format('REVOKE ALL ON SCHEMA %I FROM %I', current_schema(), '{DB_READONLY_USER}');
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
if not MULTI_TENANT:
|
||||
# Drop read-only db user here only in single tenant mode. For multi-tenant mode,
|
||||
# the user is dropped in the alembic_tenants migration.
|
||||
|
||||
op.execute(
|
||||
text(
|
||||
f"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
|
||||
-- First revoke all privileges from the database
|
||||
EXECUTE format('REVOKE ALL ON DATABASE %I FROM %I', current_database(), '{DB_READONLY_USER}');
|
||||
-- Then drop the user
|
||||
EXECUTE format('DROP USER %I', '{DB_READONLY_USER}');
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
"""
|
||||
)
|
||||
)
|
||||
op.execute(text("DROP EXTENSION IF EXISTS pg_trgm"))
|
||||
@@ -1,380 +0,0 @@
|
||||
"""merge_default_assistants_into_unified
|
||||
|
||||
Revision ID: 505c488f6662
|
||||
Revises: d09fc20a3c66
|
||||
Create Date: 2025-09-09 19:00:56.816626
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
from typing import NamedTuple
|
||||
from uuid import UUID
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "505c488f6662"
|
||||
down_revision = "d09fc20a3c66"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
# Constants for the unified assistant
|
||||
UNIFIED_ASSISTANT_NAME = "Assistant"
|
||||
UNIFIED_ASSISTANT_DESCRIPTION = (
|
||||
"Your AI assistant with search, web browsing, and image generation capabilities."
|
||||
)
|
||||
UNIFIED_ASSISTANT_NUM_CHUNKS = 25
|
||||
UNIFIED_ASSISTANT_DISPLAY_PRIORITY = 0
|
||||
UNIFIED_ASSISTANT_LLM_FILTER_EXTRACTION = True
|
||||
UNIFIED_ASSISTANT_LLM_RELEVANCE_FILTER = False
|
||||
UNIFIED_ASSISTANT_RECENCY_BIAS = "AUTO" # NOTE: needs to be capitalized
|
||||
UNIFIED_ASSISTANT_CHUNKS_ABOVE = 0
|
||||
UNIFIED_ASSISTANT_CHUNKS_BELOW = 0
|
||||
UNIFIED_ASSISTANT_DATETIME_AWARE = True
|
||||
|
||||
# NOTE: tool specific prompts are handled on the fly and automatically injected
|
||||
# into the prompt before passing to the LLM.
|
||||
DEFAULT_SYSTEM_PROMPT = """
|
||||
You are a highly capable, thoughtful, and precise assistant. Your goal is to deeply understand the \
|
||||
user's intent, ask clarifying questions when needed, think step-by-step through complex problems, \
|
||||
provide clear and accurate answers, and proactively anticipate helpful follow-up information. Always \
|
||||
prioritize being truthful, nuanced, insightful, and efficient.
|
||||
The current date is [[CURRENT_DATETIME]]
|
||||
|
||||
You use different text styles, bolding, emojis (sparingly), block quotes, and other formatting to make \
|
||||
your responses more readable and engaging.
|
||||
You use proper Markdown and LaTeX to format your responses for math, scientific, and chemical formulas, \
|
||||
symbols, etc.: '$$\\n[expression]\\n$$' for standalone cases and '\\( [expression] \\)' when inline.
|
||||
For code you prefer to use Markdown and specify the language.
|
||||
You can use Markdown horizontal rules (---) to separate sections of your responses.
|
||||
You can use Markdown tables to format your responses for data, lists, and other structured information.
|
||||
""".strip()
|
||||
|
||||
|
||||
INSERT_DICT: dict[str, Any] = {
|
||||
"name": UNIFIED_ASSISTANT_NAME,
|
||||
"description": UNIFIED_ASSISTANT_DESCRIPTION,
|
||||
"system_prompt": DEFAULT_SYSTEM_PROMPT,
|
||||
"num_chunks": UNIFIED_ASSISTANT_NUM_CHUNKS,
|
||||
"display_priority": UNIFIED_ASSISTANT_DISPLAY_PRIORITY,
|
||||
"llm_filter_extraction": UNIFIED_ASSISTANT_LLM_FILTER_EXTRACTION,
|
||||
"llm_relevance_filter": UNIFIED_ASSISTANT_LLM_RELEVANCE_FILTER,
|
||||
"recency_bias": UNIFIED_ASSISTANT_RECENCY_BIAS,
|
||||
"chunks_above": UNIFIED_ASSISTANT_CHUNKS_ABOVE,
|
||||
"chunks_below": UNIFIED_ASSISTANT_CHUNKS_BELOW,
|
||||
"datetime_aware": UNIFIED_ASSISTANT_DATETIME_AWARE,
|
||||
}
|
||||
|
||||
GENERAL_ASSISTANT_ID = -1
|
||||
ART_ASSISTANT_ID = -3
|
||||
|
||||
|
||||
class UserRow(NamedTuple):
|
||||
"""Typed representation of user row from database query."""
|
||||
|
||||
id: UUID
|
||||
chosen_assistants: list[int] | None
|
||||
visible_assistants: list[int] | None
|
||||
hidden_assistants: list[int] | None
|
||||
pinned_assistants: list[int] | None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
conn = op.get_bind()
|
||||
|
||||
# Start transaction
|
||||
conn.execute(sa.text("BEGIN"))
|
||||
|
||||
try:
|
||||
# Step 1: Create or update the unified assistant (ID 0)
|
||||
search_assistant = conn.execute(
|
||||
sa.text("SELECT * FROM persona WHERE id = 0")
|
||||
).fetchone()
|
||||
|
||||
if search_assistant:
|
||||
# Update existing Search assistant to be the unified assistant
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE persona
|
||||
SET name = :name,
|
||||
description = :description,
|
||||
system_prompt = :system_prompt,
|
||||
num_chunks = :num_chunks,
|
||||
is_default_persona = true,
|
||||
is_visible = true,
|
||||
deleted = false,
|
||||
display_priority = :display_priority,
|
||||
llm_filter_extraction = :llm_filter_extraction,
|
||||
llm_relevance_filter = :llm_relevance_filter,
|
||||
recency_bias = :recency_bias,
|
||||
chunks_above = :chunks_above,
|
||||
chunks_below = :chunks_below,
|
||||
datetime_aware = :datetime_aware,
|
||||
starter_messages = null
|
||||
WHERE id = 0
|
||||
"""
|
||||
),
|
||||
INSERT_DICT,
|
||||
)
|
||||
else:
|
||||
# Create new unified assistant with ID 0
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
INSERT INTO persona (
|
||||
id, name, description, system_prompt, num_chunks,
|
||||
is_default_persona, is_visible, deleted, display_priority,
|
||||
llm_filter_extraction, llm_relevance_filter, recency_bias,
|
||||
chunks_above, chunks_below, datetime_aware, starter_messages,
|
||||
builtin_persona
|
||||
) VALUES (
|
||||
0, :name, :description, :system_prompt, :num_chunks,
|
||||
true, true, false, :display_priority, :llm_filter_extraction,
|
||||
:llm_relevance_filter, :recency_bias, :chunks_above, :chunks_below,
|
||||
:datetime_aware, null, true
|
||||
)
|
||||
"""
|
||||
),
|
||||
INSERT_DICT,
|
||||
)
|
||||
|
||||
# Step 2: Mark ALL builtin assistants as deleted (except the unified assistant ID 0)
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE persona
|
||||
SET deleted = true, is_visible = false, is_default_persona = false
|
||||
WHERE builtin_persona = true AND id != 0
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
# Step 3: Add all built-in tools to the unified assistant
|
||||
# First, get the tool IDs for SearchTool, ImageGenerationTool, and WebSearchTool
|
||||
search_tool = conn.execute(
|
||||
sa.text("SELECT id FROM tool WHERE in_code_tool_id = 'SearchTool'")
|
||||
).fetchone()
|
||||
|
||||
if not search_tool:
|
||||
raise ValueError(
|
||||
"SearchTool not found in database. Ensure tools migration has run first."
|
||||
)
|
||||
|
||||
image_gen_tool = conn.execute(
|
||||
sa.text("SELECT id FROM tool WHERE in_code_tool_id = 'ImageGenerationTool'")
|
||||
).fetchone()
|
||||
|
||||
if not image_gen_tool:
|
||||
raise ValueError(
|
||||
"ImageGenerationTool not found in database. Ensure tools migration has run first."
|
||||
)
|
||||
|
||||
# WebSearchTool is optional - may not be configured
|
||||
web_search_tool = conn.execute(
|
||||
sa.text("SELECT id FROM tool WHERE in_code_tool_id = 'WebSearchTool'")
|
||||
).fetchone()
|
||||
|
||||
# Clear existing tool associations for persona 0
|
||||
conn.execute(sa.text("DELETE FROM persona__tool WHERE persona_id = 0"))
|
||||
|
||||
# Add tools to the unified assistant
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
INSERT INTO persona__tool (persona_id, tool_id)
|
||||
VALUES (0, :tool_id)
|
||||
ON CONFLICT DO NOTHING
|
||||
"""
|
||||
),
|
||||
{"tool_id": search_tool[0]},
|
||||
)
|
||||
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
INSERT INTO persona__tool (persona_id, tool_id)
|
||||
VALUES (0, :tool_id)
|
||||
ON CONFLICT DO NOTHING
|
||||
"""
|
||||
),
|
||||
{"tool_id": image_gen_tool[0]},
|
||||
)
|
||||
|
||||
if web_search_tool:
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
INSERT INTO persona__tool (persona_id, tool_id)
|
||||
VALUES (0, :tool_id)
|
||||
ON CONFLICT DO NOTHING
|
||||
"""
|
||||
),
|
||||
{"tool_id": web_search_tool[0]},
|
||||
)
|
||||
|
||||
# Step 4: Migrate existing chat sessions from all builtin assistants to unified assistant
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE chat_session
|
||||
SET persona_id = 0
|
||||
WHERE persona_id IN (
|
||||
SELECT id FROM persona WHERE builtin_persona = true AND id != 0
|
||||
)
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
# Step 5: Migrate user preferences - remove references to all builtin assistants
|
||||
# First, get all builtin assistant IDs (except 0)
|
||||
builtin_assistants_result = conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
SELECT id FROM persona
|
||||
WHERE builtin_persona = true AND id != 0
|
||||
"""
|
||||
)
|
||||
).fetchall()
|
||||
builtin_assistant_ids = [row[0] for row in builtin_assistants_result]
|
||||
|
||||
# Get all users with preferences
|
||||
users_result = conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
SELECT id, chosen_assistants, visible_assistants,
|
||||
hidden_assistants, pinned_assistants
|
||||
FROM "user"
|
||||
"""
|
||||
)
|
||||
).fetchall()
|
||||
|
||||
for user_row in users_result:
|
||||
user = UserRow(*user_row)
|
||||
user_id: UUID = user.id
|
||||
updates: dict[str, Any] = {}
|
||||
|
||||
# Remove all builtin assistants from chosen_assistants
|
||||
if user.chosen_assistants:
|
||||
new_chosen: list[int] = [
|
||||
assistant_id
|
||||
for assistant_id in user.chosen_assistants
|
||||
if assistant_id not in builtin_assistant_ids
|
||||
]
|
||||
if new_chosen != user.chosen_assistants:
|
||||
updates["chosen_assistants"] = json.dumps(new_chosen)
|
||||
|
||||
# Remove all builtin assistants from visible_assistants
|
||||
if user.visible_assistants:
|
||||
new_visible: list[int] = [
|
||||
assistant_id
|
||||
for assistant_id in user.visible_assistants
|
||||
if assistant_id not in builtin_assistant_ids
|
||||
]
|
||||
if new_visible != user.visible_assistants:
|
||||
updates["visible_assistants"] = json.dumps(new_visible)
|
||||
|
||||
# Add all builtin assistants to hidden_assistants
|
||||
if user.hidden_assistants:
|
||||
new_hidden: list[int] = list(user.hidden_assistants)
|
||||
for old_id in builtin_assistant_ids:
|
||||
if old_id not in new_hidden:
|
||||
new_hidden.append(old_id)
|
||||
if new_hidden != user.hidden_assistants:
|
||||
updates["hidden_assistants"] = json.dumps(new_hidden)
|
||||
else:
|
||||
updates["hidden_assistants"] = json.dumps(builtin_assistant_ids)
|
||||
|
||||
# Remove all builtin assistants from pinned_assistants
|
||||
if user.pinned_assistants:
|
||||
new_pinned: list[int] = [
|
||||
assistant_id
|
||||
for assistant_id in user.pinned_assistants
|
||||
if assistant_id not in builtin_assistant_ids
|
||||
]
|
||||
if new_pinned != user.pinned_assistants:
|
||||
updates["pinned_assistants"] = json.dumps(new_pinned)
|
||||
|
||||
# Apply updates if any
|
||||
if updates:
|
||||
set_clause = ", ".join([f"{k} = :{k}" for k in updates.keys()])
|
||||
updates["user_id"] = str(user_id) # Convert UUID to string for SQL
|
||||
conn.execute(
|
||||
sa.text(f'UPDATE "user" SET {set_clause} WHERE id = :user_id'),
|
||||
updates,
|
||||
)
|
||||
|
||||
# Commit transaction
|
||||
conn.execute(sa.text("COMMIT"))
|
||||
|
||||
except Exception as e:
|
||||
# Rollback on error
|
||||
conn.execute(sa.text("ROLLBACK"))
|
||||
raise e
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
conn = op.get_bind()
|
||||
|
||||
# Start transaction
|
||||
conn.execute(sa.text("BEGIN"))
|
||||
|
||||
try:
|
||||
# Only restore General (ID -1) and Art (ID -3) assistants
|
||||
# Step 1: Keep Search assistant (ID 0) as default but restore original state
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE persona
|
||||
SET is_default_persona = true,
|
||||
is_visible = true,
|
||||
deleted = false
|
||||
WHERE id = 0
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
# Step 2: Restore General assistant (ID -1)
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE persona
|
||||
SET deleted = false,
|
||||
is_visible = true,
|
||||
is_default_persona = true
|
||||
WHERE id = :general_assistant_id
|
||||
"""
|
||||
),
|
||||
{"general_assistant_id": GENERAL_ASSISTANT_ID},
|
||||
)
|
||||
|
||||
# Step 3: Restore Art assistant (ID -3)
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE persona
|
||||
SET deleted = false,
|
||||
is_visible = true,
|
||||
is_default_persona = true
|
||||
WHERE id = :art_assistant_id
|
||||
"""
|
||||
),
|
||||
{"art_assistant_id": ART_ASSISTANT_ID},
|
||||
)
|
||||
|
||||
# Note: We don't restore the original tool associations, names, or descriptions
|
||||
# as those would require more complex logic to determine original state.
|
||||
# We also cannot restore original chat session persona_ids as we don't
|
||||
# have the original mappings.
|
||||
# Other builtin assistants remain deleted as per the requirement.
|
||||
|
||||
# Commit transaction
|
||||
conn.execute(sa.text("COMMIT"))
|
||||
|
||||
except Exception as e:
|
||||
# Rollback on error
|
||||
conn.execute(sa.text("ROLLBACK"))
|
||||
raise e
|
||||
@@ -1,90 +0,0 @@
|
||||
"""add stale column to external user group tables
|
||||
|
||||
Revision ID: 58c50ef19f08
|
||||
Revises: 7b9b952abdf6
|
||||
Create Date: 2025-06-25 14:08:14.162380
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "58c50ef19f08"
|
||||
down_revision = "7b9b952abdf6"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Add the stale column with default value False to user__external_user_group_id
|
||||
op.add_column(
|
||||
"user__external_user_group_id",
|
||||
sa.Column("stale", sa.Boolean(), nullable=False, server_default="false"),
|
||||
)
|
||||
|
||||
# Create index for efficient querying of stale rows by cc_pair_id
|
||||
op.create_index(
|
||||
"ix_user__external_user_group_id_cc_pair_id_stale",
|
||||
"user__external_user_group_id",
|
||||
["cc_pair_id", "stale"],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
# Create index for efficient querying of all stale rows
|
||||
op.create_index(
|
||||
"ix_user__external_user_group_id_stale",
|
||||
"user__external_user_group_id",
|
||||
["stale"],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
# Add the stale column with default value False to public_external_user_group
|
||||
op.add_column(
|
||||
"public_external_user_group",
|
||||
sa.Column("stale", sa.Boolean(), nullable=False, server_default="false"),
|
||||
)
|
||||
|
||||
# Create index for efficient querying of stale rows by cc_pair_id
|
||||
op.create_index(
|
||||
"ix_public_external_user_group_cc_pair_id_stale",
|
||||
"public_external_user_group",
|
||||
["cc_pair_id", "stale"],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
# Create index for efficient querying of all stale rows
|
||||
op.create_index(
|
||||
"ix_public_external_user_group_stale",
|
||||
"public_external_user_group",
|
||||
["stale"],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop the indices for public_external_user_group first
|
||||
op.drop_index(
|
||||
"ix_public_external_user_group_stale", table_name="public_external_user_group"
|
||||
)
|
||||
op.drop_index(
|
||||
"ix_public_external_user_group_cc_pair_id_stale",
|
||||
table_name="public_external_user_group",
|
||||
)
|
||||
|
||||
# Drop the stale column from public_external_user_group
|
||||
op.drop_column("public_external_user_group", "stale")
|
||||
|
||||
# Drop the indices for user__external_user_group_id
|
||||
op.drop_index(
|
||||
"ix_user__external_user_group_id_stale",
|
||||
table_name="user__external_user_group_id",
|
||||
)
|
||||
op.drop_index(
|
||||
"ix_user__external_user_group_id_cc_pair_id_stale",
|
||||
table_name="user__external_user_group_id",
|
||||
)
|
||||
|
||||
# Drop the stale column from user__external_user_group_id
|
||||
op.drop_column("user__external_user_group_id", "stale")
|
||||
@@ -1,115 +0,0 @@
|
||||
"""add research agent database tables and chat message research fields
|
||||
|
||||
Revision ID: 5ae8240accb3
|
||||
Revises: b558f51620b4
|
||||
Create Date: 2025-08-06 14:29:24.691388
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "5ae8240accb3"
|
||||
down_revision = "b558f51620b4"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Add research_type and research_plan columns to chat_message table
|
||||
op.add_column(
|
||||
"chat_message",
|
||||
sa.Column("research_type", sa.String(), nullable=True),
|
||||
)
|
||||
op.add_column(
|
||||
"chat_message",
|
||||
sa.Column("research_plan", postgresql.JSONB(), nullable=True),
|
||||
)
|
||||
|
||||
# Create research_agent_iteration table
|
||||
op.create_table(
|
||||
"research_agent_iteration",
|
||||
sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
|
||||
sa.Column(
|
||||
"primary_question_id",
|
||||
sa.Integer(),
|
||||
sa.ForeignKey("chat_message.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("iteration_nr", sa.Integer(), nullable=False),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.func.now(),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("purpose", sa.String(), nullable=True),
|
||||
sa.Column("reasoning", sa.String(), nullable=True),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
sa.UniqueConstraint(
|
||||
"primary_question_id",
|
||||
"iteration_nr",
|
||||
name="_research_agent_iteration_unique_constraint",
|
||||
),
|
||||
)
|
||||
|
||||
# Create research_agent_iteration_sub_step table
|
||||
op.create_table(
|
||||
"research_agent_iteration_sub_step",
|
||||
sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
|
||||
sa.Column(
|
||||
"primary_question_id",
|
||||
sa.Integer(),
|
||||
sa.ForeignKey("chat_message.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"parent_question_id",
|
||||
sa.Integer(),
|
||||
sa.ForeignKey("research_agent_iteration_sub_step.id", ondelete="CASCADE"),
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column("iteration_nr", sa.Integer(), nullable=False),
|
||||
sa.Column("iteration_sub_step_nr", sa.Integer(), nullable=False),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.func.now(),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("sub_step_instructions", sa.String(), nullable=True),
|
||||
sa.Column(
|
||||
"sub_step_tool_id",
|
||||
sa.Integer(),
|
||||
sa.ForeignKey("tool.id"),
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column("reasoning", sa.String(), nullable=True),
|
||||
sa.Column("sub_answer", sa.String(), nullable=True),
|
||||
sa.Column("cited_doc_results", postgresql.JSONB(), nullable=True),
|
||||
sa.Column("claims", postgresql.JSONB(), nullable=True),
|
||||
sa.Column("generated_images", postgresql.JSONB(), nullable=True),
|
||||
sa.Column("additional_data", postgresql.JSONB(), nullable=True),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
sa.ForeignKeyConstraint(
|
||||
["primary_question_id", "iteration_nr"],
|
||||
[
|
||||
"research_agent_iteration.primary_question_id",
|
||||
"research_agent_iteration.iteration_nr",
|
||||
],
|
||||
ondelete="CASCADE",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop tables in reverse order
|
||||
op.drop_table("research_agent_iteration_sub_step")
|
||||
op.drop_table("research_agent_iteration")
|
||||
|
||||
# Remove columns from chat_message table
|
||||
op.drop_column("chat_message", "research_plan")
|
||||
op.drop_column("chat_message", "research_type")
|
||||
@@ -1,24 +0,0 @@
|
||||
"""Add content type to UserFile
|
||||
|
||||
Revision ID: 5c448911b12f
|
||||
Revises: 47a07e1a38f1
|
||||
Create Date: 2025-04-25 16:59:48.182672
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "5c448911b12f"
|
||||
down_revision = "47a07e1a38f1"
|
||||
branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column("user_file", sa.Column("content_type", sa.String(), nullable=True))
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("user_file", "content_type")
|
||||
@@ -1,132 +0,0 @@
|
||||
"""add file names to file connector config
|
||||
|
||||
Revision ID: 62c3a055a141
|
||||
Revises: 3fc5d75723b3
|
||||
Create Date: 2025-07-30 17:01:24.417551
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
import json
|
||||
import os
|
||||
import logging
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "62c3a055a141"
|
||||
down_revision = "3fc5d75723b3"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
SKIP_FILE_NAME_MIGRATION = (
|
||||
os.environ.get("SKIP_FILE_NAME_MIGRATION", "true").lower() == "true"
|
||||
)
|
||||
|
||||
logger = logging.getLogger("alembic.runtime.migration")
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
if SKIP_FILE_NAME_MIGRATION:
|
||||
logger.info(
|
||||
"Skipping file name migration. Hint: set SKIP_FILE_NAME_MIGRATION=false to run this migration"
|
||||
)
|
||||
return
|
||||
logger.info("Running file name migration")
|
||||
# Get connection
|
||||
conn = op.get_bind()
|
||||
|
||||
# Get all FILE connectors with their configs
|
||||
file_connectors = conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
SELECT id, connector_specific_config
|
||||
FROM connector
|
||||
WHERE source = 'FILE'
|
||||
"""
|
||||
)
|
||||
).fetchall()
|
||||
|
||||
for connector_id, config in file_connectors:
|
||||
# Parse config if it's a string
|
||||
if isinstance(config, str):
|
||||
config = json.loads(config)
|
||||
|
||||
# Get file_locations list
|
||||
file_locations = config.get("file_locations", [])
|
||||
|
||||
# Get display names for each file_id
|
||||
file_names = []
|
||||
for file_id in file_locations:
|
||||
result = conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
SELECT display_name
|
||||
FROM file_record
|
||||
WHERE file_id = :file_id
|
||||
"""
|
||||
),
|
||||
{"file_id": file_id},
|
||||
).fetchone()
|
||||
|
||||
if result:
|
||||
file_names.append(result[0])
|
||||
else:
|
||||
file_names.append(file_id) # Should not happen
|
||||
|
||||
# Add file_names to config
|
||||
new_config = dict(config)
|
||||
new_config["file_names"] = file_names
|
||||
|
||||
# Update the connector
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE connector
|
||||
SET connector_specific_config = :new_config
|
||||
WHERE id = :connector_id
|
||||
"""
|
||||
),
|
||||
{"connector_id": connector_id, "new_config": json.dumps(new_config)},
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Get connection
|
||||
conn = op.get_bind()
|
||||
|
||||
# Remove file_names from all FILE connectors
|
||||
file_connectors = conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
SELECT id, connector_specific_config
|
||||
FROM connector
|
||||
WHERE source = 'FILE'
|
||||
"""
|
||||
)
|
||||
).fetchall()
|
||||
|
||||
for connector_id, config in file_connectors:
|
||||
# Parse config if it's a string
|
||||
if isinstance(config, str):
|
||||
config = json.loads(config)
|
||||
|
||||
# Remove file_names if it exists
|
||||
if "file_names" in config:
|
||||
new_config = dict(config)
|
||||
del new_config["file_names"]
|
||||
|
||||
# Update the connector
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE connector
|
||||
SET connector_specific_config = :new_config
|
||||
WHERE id = :connector_id
|
||||
"""
|
||||
),
|
||||
{
|
||||
"connector_id": connector_id,
|
||||
"new_config": json.dumps(new_config),
|
||||
},
|
||||
)
|
||||
@@ -1,41 +0,0 @@
|
||||
"""remove kg subtype from db
|
||||
|
||||
Revision ID: 65bc6e0f8500
|
||||
Revises: cec7ec36c505
|
||||
Create Date: 2025-06-13 10:04:27.705976
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "65bc6e0f8500"
|
||||
down_revision = "cec7ec36c505"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.drop_column("kg_entity", "entity_class")
|
||||
op.drop_column("kg_entity", "entity_subtype")
|
||||
op.drop_column("kg_entity_extraction_staging", "entity_class")
|
||||
op.drop_column("kg_entity_extraction_staging", "entity_subtype")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.add_column(
|
||||
"kg_entity_extraction_staging",
|
||||
sa.Column("entity_subtype", sa.String(), nullable=True, index=True),
|
||||
)
|
||||
op.add_column(
|
||||
"kg_entity_extraction_staging",
|
||||
sa.Column("entity_class", sa.String(), nullable=True, index=True),
|
||||
)
|
||||
op.add_column(
|
||||
"kg_entity", sa.Column("entity_subtype", sa.String(), nullable=True, index=True)
|
||||
)
|
||||
op.add_column(
|
||||
"kg_entity", sa.Column("entity_class", sa.String(), nullable=True, index=True)
|
||||
)
|
||||
@@ -6,6 +6,12 @@ Create Date: 2025-04-01 07:26:10.539362
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy import inspect
|
||||
import datetime
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "6a804aeb4830"
|
||||
down_revision = "8e1ac4f39a9f"
|
||||
@@ -13,10 +19,99 @@ branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
# Leaving this around only because some people might be on this migration
|
||||
# originally was a duplicate of the user files migration
|
||||
def upgrade() -> None:
|
||||
pass
|
||||
# Check if user_file table already exists
|
||||
conn = op.get_bind()
|
||||
inspector = inspect(conn)
|
||||
|
||||
if not inspector.has_table("user_file"):
|
||||
# Create user_folder table without parent_id
|
||||
op.create_table(
|
||||
"user_folder",
|
||||
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
|
||||
sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
|
||||
sa.Column("name", sa.String(length=255), nullable=True),
|
||||
sa.Column("description", sa.String(length=255), nullable=True),
|
||||
sa.Column("display_priority", sa.Integer(), nullable=True, default=0),
|
||||
sa.Column(
|
||||
"created_at", sa.DateTime(timezone=True), server_default=sa.func.now()
|
||||
),
|
||||
)
|
||||
|
||||
# Create user_file table with folder_id instead of parent_folder_id
|
||||
op.create_table(
|
||||
"user_file",
|
||||
sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True),
|
||||
sa.Column("user_id", sa.UUID(), sa.ForeignKey("user.id"), nullable=True),
|
||||
sa.Column(
|
||||
"folder_id",
|
||||
sa.Integer(),
|
||||
sa.ForeignKey("user_folder.id"),
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column("link_url", sa.String(), nullable=True),
|
||||
sa.Column("token_count", sa.Integer(), nullable=True),
|
||||
sa.Column("file_type", sa.String(), nullable=True),
|
||||
sa.Column("file_id", sa.String(length=255), nullable=False),
|
||||
sa.Column("document_id", sa.String(length=255), nullable=False),
|
||||
sa.Column("name", sa.String(length=255), nullable=False),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(),
|
||||
default=datetime.datetime.utcnow,
|
||||
),
|
||||
sa.Column(
|
||||
"cc_pair_id",
|
||||
sa.Integer(),
|
||||
sa.ForeignKey("connector_credential_pair.id"),
|
||||
nullable=True,
|
||||
unique=True,
|
||||
),
|
||||
)
|
||||
|
||||
# Create persona__user_file table
|
||||
op.create_table(
|
||||
"persona__user_file",
|
||||
sa.Column(
|
||||
"persona_id",
|
||||
sa.Integer(),
|
||||
sa.ForeignKey("persona.id"),
|
||||
primary_key=True,
|
||||
),
|
||||
sa.Column(
|
||||
"user_file_id",
|
||||
sa.Integer(),
|
||||
sa.ForeignKey("user_file.id"),
|
||||
primary_key=True,
|
||||
),
|
||||
)
|
||||
|
||||
# Create persona__user_folder table
|
||||
op.create_table(
|
||||
"persona__user_folder",
|
||||
sa.Column(
|
||||
"persona_id",
|
||||
sa.Integer(),
|
||||
sa.ForeignKey("persona.id"),
|
||||
primary_key=True,
|
||||
),
|
||||
sa.Column(
|
||||
"user_folder_id",
|
||||
sa.Integer(),
|
||||
sa.ForeignKey("user_folder.id"),
|
||||
primary_key=True,
|
||||
),
|
||||
)
|
||||
|
||||
op.add_column(
|
||||
"connector_credential_pair",
|
||||
sa.Column("is_user_file", sa.Boolean(), nullable=True, default=False),
|
||||
)
|
||||
|
||||
# Update existing records to have is_user_file=False instead of NULL
|
||||
op.execute(
|
||||
"UPDATE connector_credential_pair SET is_user_file = FALSE WHERE is_user_file IS NULL"
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
|
||||
@@ -6,8 +6,11 @@ Create Date: 2024-04-15 01:36:02.952809
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
from typing import cast
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from onyx.key_value_store.factory import get_kv_store
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "703313b75876"
|
||||
@@ -51,10 +54,27 @@ def upgrade() -> None:
|
||||
sa.PrimaryKeyConstraint("rate_limit_id", "user_group_id"),
|
||||
)
|
||||
|
||||
# NOTE: rate limit settings used to be stored in the "token_budget_settings" key in the
|
||||
# KeyValueStore. This will now be lost. The KV store works differently than it used to
|
||||
# so the migration is fairly complicated and likely not worth it to support (pretty much
|
||||
# nobody will have it set)
|
||||
try:
|
||||
settings_json = cast(str, get_kv_store().load("token_budget_settings"))
|
||||
settings = json.loads(settings_json)
|
||||
|
||||
is_enabled = settings.get("enable_token_budget", False)
|
||||
token_budget = settings.get("token_budget", -1)
|
||||
period_hours = settings.get("period_hours", -1)
|
||||
|
||||
if is_enabled and token_budget > 0 and period_hours > 0:
|
||||
op.execute(
|
||||
f"INSERT INTO token_rate_limit \
|
||||
(enabled, token_budget, period_hours, scope) VALUES \
|
||||
({is_enabled}, {token_budget}, {period_hours}, 'GLOBAL')"
|
||||
)
|
||||
|
||||
# Delete the dynamic config
|
||||
get_kv_store().delete("token_budget_settings")
|
||||
|
||||
except Exception:
|
||||
# Ignore if the dynamic config is not found
|
||||
pass
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
|
||||
@@ -1,237 +0,0 @@
|
||||
"""Add model-configuration table
|
||||
|
||||
Revision ID: 7a70b7664e37
|
||||
Revises: d961aca62eb3
|
||||
Create Date: 2025-04-10 15:00:35.984669
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
from onyx.llm.llm_provider_options import (
|
||||
fetch_model_names_for_provider_as_set,
|
||||
fetch_visible_model_names_for_provider_as_set,
|
||||
)
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "7a70b7664e37"
|
||||
down_revision = "d961aca62eb3"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def _resolve(
|
||||
provider_name: str,
|
||||
model_names: list[str] | None,
|
||||
display_model_names: list[str] | None,
|
||||
default_model_name: str,
|
||||
fast_default_model_name: str | None,
|
||||
) -> set[tuple[str, bool]]:
|
||||
models = set(model_names) if model_names else None
|
||||
display_models = set(display_model_names) if display_model_names else None
|
||||
|
||||
# If both are defined, we need to make sure that `model_names` is a superset of `display_model_names`.
|
||||
if models and display_models:
|
||||
models = display_models.union(models)
|
||||
|
||||
# If only `model_names` is defined, then:
|
||||
# - If default-model-names are available for the `provider_name`, then set `display_model_names` to it
|
||||
# and set `model_names` to the union of those default-model-names with itself.
|
||||
# - If no default-model-names are available, then set `display_models` to `models`.
|
||||
#
|
||||
# This preserves the invariant that `display_models` is a subset of `models`.
|
||||
elif models and not display_models:
|
||||
visible_default_models = fetch_visible_model_names_for_provider_as_set(
|
||||
provider_name=provider_name
|
||||
)
|
||||
if visible_default_models:
|
||||
display_models = set(visible_default_models)
|
||||
models = display_models.union(models)
|
||||
else:
|
||||
display_models = set(models)
|
||||
|
||||
# If only the `display_model_names` are defined, then set `models` to the union of `display_model_names`
|
||||
# and the default-model-names for that provider.
|
||||
#
|
||||
# This will also preserve the invariant that `display_models` is a subset of `models`.
|
||||
elif not models and display_models:
|
||||
default_models = fetch_model_names_for_provider_as_set(
|
||||
provider_name=provider_name
|
||||
)
|
||||
if default_models:
|
||||
models = display_models.union(default_models)
|
||||
else:
|
||||
models = set(display_models)
|
||||
|
||||
# If neither are defined, then set `models` and `display_models` to the default-model-names for the given provider.
|
||||
#
|
||||
# This will also preserve the invariant that `display_models` is a subset of `models`.
|
||||
else:
|
||||
default_models = fetch_model_names_for_provider_as_set(
|
||||
provider_name=provider_name
|
||||
)
|
||||
visible_default_models = fetch_visible_model_names_for_provider_as_set(
|
||||
provider_name=provider_name
|
||||
)
|
||||
|
||||
if default_models:
|
||||
if not visible_default_models:
|
||||
raise RuntimeError
|
||||
raise RuntimeError(
|
||||
"If `default_models` is non-None, `visible_default_models` must be non-None too."
|
||||
)
|
||||
models = default_models
|
||||
display_models = visible_default_models
|
||||
|
||||
# This is not a well-known llm-provider; we can't provide any model suggestions.
|
||||
# Therefore, we set to the empty set and continue
|
||||
else:
|
||||
models = set()
|
||||
display_models = set()
|
||||
|
||||
# It is possible that `default_model_name` is not in `models` and is not in `display_models`.
|
||||
# It is also possible that `fast_default_model_name` is not in `models` and is not in `display_models`.
|
||||
models.add(default_model_name)
|
||||
if fast_default_model_name:
|
||||
models.add(fast_default_model_name)
|
||||
display_models.add(default_model_name)
|
||||
if fast_default_model_name:
|
||||
display_models.add(fast_default_model_name)
|
||||
|
||||
return set([(model, model in display_models) for model in models])
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"model_configuration",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column("llm_provider_id", sa.Integer(), nullable=False),
|
||||
sa.Column("name", sa.String(), nullable=False),
|
||||
sa.Column("is_visible", sa.Boolean(), nullable=False),
|
||||
sa.Column("max_input_tokens", sa.Integer(), nullable=True),
|
||||
sa.ForeignKeyConstraint(
|
||||
["llm_provider_id"], ["llm_provider.id"], ondelete="CASCADE"
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
sa.UniqueConstraint("llm_provider_id", "name"),
|
||||
)
|
||||
|
||||
# Create temporary sqlalchemy references to tables for data migration
|
||||
llm_provider_table = sa.sql.table(
|
||||
"llm_provider",
|
||||
sa.column("id", sa.Integer),
|
||||
sa.column("provider", sa.Integer),
|
||||
sa.column("model_names", postgresql.ARRAY(sa.String)),
|
||||
sa.column("display_model_names", postgresql.ARRAY(sa.String)),
|
||||
sa.column("default_model_name", sa.String),
|
||||
sa.column("fast_default_model_name", sa.String),
|
||||
)
|
||||
model_configuration_table = sa.sql.table(
|
||||
"model_configuration",
|
||||
sa.column("id", sa.Integer),
|
||||
sa.column("llm_provider_id", sa.Integer),
|
||||
sa.column("name", sa.String),
|
||||
sa.column("is_visible", sa.Boolean),
|
||||
sa.column("max_input_tokens", sa.Integer),
|
||||
)
|
||||
connection = op.get_bind()
|
||||
llm_providers = connection.execute(
|
||||
sa.select(
|
||||
llm_provider_table.c.id,
|
||||
llm_provider_table.c.provider,
|
||||
llm_provider_table.c.model_names,
|
||||
llm_provider_table.c.display_model_names,
|
||||
llm_provider_table.c.default_model_name,
|
||||
llm_provider_table.c.fast_default_model_name,
|
||||
)
|
||||
).fetchall()
|
||||
|
||||
for llm_provider in llm_providers:
|
||||
provider_id = llm_provider[0]
|
||||
provider_name = llm_provider[1]
|
||||
model_names = llm_provider[2]
|
||||
display_model_names = llm_provider[3]
|
||||
default_model_name = llm_provider[4]
|
||||
fast_default_model_name = llm_provider[5]
|
||||
|
||||
model_configurations = _resolve(
|
||||
provider_name=provider_name,
|
||||
model_names=model_names,
|
||||
display_model_names=display_model_names,
|
||||
default_model_name=default_model_name,
|
||||
fast_default_model_name=fast_default_model_name,
|
||||
)
|
||||
|
||||
for model_name, is_visible in model_configurations:
|
||||
connection.execute(
|
||||
model_configuration_table.insert().values(
|
||||
llm_provider_id=provider_id,
|
||||
name=model_name,
|
||||
is_visible=is_visible,
|
||||
max_input_tokens=None,
|
||||
)
|
||||
)
|
||||
|
||||
op.drop_column("llm_provider", "model_names")
|
||||
op.drop_column("llm_provider", "display_model_names")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
llm_provider = sa.table(
|
||||
"llm_provider",
|
||||
sa.column("id", sa.Integer),
|
||||
sa.column("model_names", postgresql.ARRAY(sa.String)),
|
||||
sa.column("display_model_names", postgresql.ARRAY(sa.String)),
|
||||
)
|
||||
|
||||
model_configuration = sa.table(
|
||||
"model_configuration",
|
||||
sa.column("id", sa.Integer),
|
||||
sa.column("llm_provider_id", sa.Integer),
|
||||
sa.column("name", sa.String),
|
||||
sa.column("is_visible", sa.Boolean),
|
||||
sa.column("max_input_tokens", sa.Integer),
|
||||
)
|
||||
op.add_column(
|
||||
"llm_provider",
|
||||
sa.Column(
|
||||
"model_names",
|
||||
postgresql.ARRAY(sa.VARCHAR()),
|
||||
autoincrement=False,
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
op.add_column(
|
||||
"llm_provider",
|
||||
sa.Column(
|
||||
"display_model_names",
|
||||
postgresql.ARRAY(sa.VARCHAR()),
|
||||
autoincrement=False,
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
|
||||
connection = op.get_bind()
|
||||
provider_ids = connection.execute(sa.select(llm_provider.c.id)).fetchall()
|
||||
|
||||
for (provider_id,) in provider_ids:
|
||||
# Get all models for this provider
|
||||
models = connection.execute(
|
||||
sa.select(
|
||||
model_configuration.c.name, model_configuration.c.is_visible
|
||||
).where(model_configuration.c.llm_provider_id == provider_id)
|
||||
).fetchall()
|
||||
|
||||
all_models = [model[0] for model in models]
|
||||
visible_models = [model[0] for model in models if model[1]]
|
||||
|
||||
# Update provider with arrays
|
||||
op.execute(
|
||||
llm_provider.update()
|
||||
.where(llm_provider.c.id == provider_id)
|
||||
.values(model_names=all_models, display_model_names=visible_models)
|
||||
)
|
||||
|
||||
op.drop_table("model_configuration")
|
||||
@@ -1,318 +0,0 @@
|
||||
"""update-entities
|
||||
|
||||
Revision ID: 7b9b952abdf6
|
||||
Revises: 36e9220ab794
|
||||
Create Date: 2025-06-23 20:24:08.139201
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "7b9b952abdf6"
|
||||
down_revision = "36e9220ab794"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
conn = op.get_bind()
|
||||
|
||||
# new entity type metadata_attribute_conversion
|
||||
new_entity_type_conversion = {
|
||||
"LINEAR": {
|
||||
"team": {"name": "team", "keep": True, "implication_property": None},
|
||||
"state": {"name": "state", "keep": True, "implication_property": None},
|
||||
"priority": {
|
||||
"name": "priority",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"estimate": {
|
||||
"name": "estimate",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"started_at": {
|
||||
"name": "started_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"completed_at": {
|
||||
"name": "completed_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"due_date": {
|
||||
"name": "due_date",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"creator": {
|
||||
"name": "creator",
|
||||
"keep": False,
|
||||
"implication_property": {
|
||||
"implied_entity_type": "from_email",
|
||||
"implied_relationship_name": "is_creator_of",
|
||||
},
|
||||
},
|
||||
"assignee": {
|
||||
"name": "assignee",
|
||||
"keep": False,
|
||||
"implication_property": {
|
||||
"implied_entity_type": "from_email",
|
||||
"implied_relationship_name": "is_assignee_of",
|
||||
},
|
||||
},
|
||||
},
|
||||
"JIRA": {
|
||||
"issuetype": {
|
||||
"name": "subtype",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"status": {"name": "status", "keep": True, "implication_property": None},
|
||||
"priority": {
|
||||
"name": "priority",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"project_name": {
|
||||
"name": "project",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"created": {
|
||||
"name": "created_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"updated": {
|
||||
"name": "updated_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"resolution_date": {
|
||||
"name": "completed_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"duedate": {"name": "due_date", "keep": True, "implication_property": None},
|
||||
"reporter_email": {
|
||||
"name": "creator",
|
||||
"keep": False,
|
||||
"implication_property": {
|
||||
"implied_entity_type": "from_email",
|
||||
"implied_relationship_name": "is_creator_of",
|
||||
},
|
||||
},
|
||||
"assignee_email": {
|
||||
"name": "assignee",
|
||||
"keep": False,
|
||||
"implication_property": {
|
||||
"implied_entity_type": "from_email",
|
||||
"implied_relationship_name": "is_assignee_of",
|
||||
},
|
||||
},
|
||||
"key": {"name": "key", "keep": True, "implication_property": None},
|
||||
"parent": {"name": "parent", "keep": True, "implication_property": None},
|
||||
},
|
||||
"GITHUB_PR": {
|
||||
"repo": {"name": "repository", "keep": True, "implication_property": None},
|
||||
"state": {"name": "state", "keep": True, "implication_property": None},
|
||||
"num_commits": {
|
||||
"name": "num_commits",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"num_files_changed": {
|
||||
"name": "num_files_changed",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"labels": {"name": "labels", "keep": True, "implication_property": None},
|
||||
"merged": {"name": "merged", "keep": True, "implication_property": None},
|
||||
"merged_at": {
|
||||
"name": "merged_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"closed_at": {
|
||||
"name": "closed_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"updated_at": {
|
||||
"name": "updated_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"user": {
|
||||
"name": "creator",
|
||||
"keep": False,
|
||||
"implication_property": {
|
||||
"implied_entity_type": "from_email",
|
||||
"implied_relationship_name": "is_creator_of",
|
||||
},
|
||||
},
|
||||
"assignees": {
|
||||
"name": "assignees",
|
||||
"keep": False,
|
||||
"implication_property": {
|
||||
"implied_entity_type": "from_email",
|
||||
"implied_relationship_name": "is_assignee_of",
|
||||
},
|
||||
},
|
||||
},
|
||||
"GITHUB_ISSUE": {
|
||||
"repo": {"name": "repository", "keep": True, "implication_property": None},
|
||||
"state": {"name": "state", "keep": True, "implication_property": None},
|
||||
"labels": {"name": "labels", "keep": True, "implication_property": None},
|
||||
"closed_at": {
|
||||
"name": "closed_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"created_at": {
|
||||
"name": "created_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"updated_at": {
|
||||
"name": "updated_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"user": {
|
||||
"name": "creator",
|
||||
"keep": False,
|
||||
"implication_property": {
|
||||
"implied_entity_type": "from_email",
|
||||
"implied_relationship_name": "is_creator_of",
|
||||
},
|
||||
},
|
||||
"assignees": {
|
||||
"name": "assignees",
|
||||
"keep": False,
|
||||
"implication_property": {
|
||||
"implied_entity_type": "from_email",
|
||||
"implied_relationship_name": "is_assignee_of",
|
||||
},
|
||||
},
|
||||
},
|
||||
"FIREFLIES": {},
|
||||
"ACCOUNT": {},
|
||||
"OPPORTUNITY": {
|
||||
"name": {"name": "name", "keep": True, "implication_property": None},
|
||||
"stage_name": {"name": "stage", "keep": True, "implication_property": None},
|
||||
"type": {"name": "type", "keep": True, "implication_property": None},
|
||||
"amount": {"name": "amount", "keep": True, "implication_property": None},
|
||||
"fiscal_year": {
|
||||
"name": "fiscal_year",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"fiscal_quarter": {
|
||||
"name": "fiscal_quarter",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"is_closed": {
|
||||
"name": "is_closed",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"close_date": {
|
||||
"name": "close_date",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"probability": {
|
||||
"name": "close_probability",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"created_date": {
|
||||
"name": "created_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"last_modified_date": {
|
||||
"name": "updated_at",
|
||||
"keep": True,
|
||||
"implication_property": None,
|
||||
},
|
||||
"account": {
|
||||
"name": "account",
|
||||
"keep": False,
|
||||
"implication_property": {
|
||||
"implied_entity_type": "ACCOUNT",
|
||||
"implied_relationship_name": "is_account_of",
|
||||
},
|
||||
},
|
||||
},
|
||||
"VENDOR": {},
|
||||
"EMPLOYEE": {},
|
||||
}
|
||||
|
||||
current_entity_types = conn.execute(
|
||||
sa.text("SELECT id_name, attributes from kg_entity_type")
|
||||
).all()
|
||||
for entity_type, attributes in current_entity_types:
|
||||
# delete removed entity types
|
||||
if entity_type not in new_entity_type_conversion:
|
||||
op.execute(
|
||||
sa.text(f"DELETE FROM kg_entity_type WHERE id_name = '{entity_type}'")
|
||||
)
|
||||
continue
|
||||
|
||||
# update entity type attributes
|
||||
if "metadata_attributes" in attributes:
|
||||
del attributes["metadata_attributes"]
|
||||
attributes["metadata_attribute_conversion"] = new_entity_type_conversion[
|
||||
entity_type
|
||||
]
|
||||
attributes_str = json.dumps(attributes).replace("'", "''")
|
||||
op.execute(
|
||||
sa.text(
|
||||
f"UPDATE kg_entity_type SET attributes = '{attributes_str}'"
|
||||
f"WHERE id_name = '{entity_type}'"
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
conn = op.get_bind()
|
||||
|
||||
current_entity_types = conn.execute(
|
||||
sa.text("SELECT id_name, attributes from kg_entity_type")
|
||||
).all()
|
||||
for entity_type, attributes in current_entity_types:
|
||||
conversion = {}
|
||||
if "metadata_attribute_conversion" in attributes:
|
||||
conversion = attributes.pop("metadata_attribute_conversion")
|
||||
attributes["metadata_attributes"] = {
|
||||
attr: prop["name"] for attr, prop in conversion.items() if prop["keep"]
|
||||
}
|
||||
|
||||
attributes_str = json.dumps(attributes).replace("'", "''")
|
||||
op.execute(
|
||||
sa.text(
|
||||
f"UPDATE kg_entity_type SET attributes = '{attributes_str}'"
|
||||
f"WHERE id_name = '{entity_type}'"
|
||||
),
|
||||
)
|
||||
@@ -1,249 +0,0 @@
|
||||
"""add_mcp_server_and_connection_config_models
|
||||
|
||||
Revision ID: 7ed603b64d5a
|
||||
Revises: b329d00a9ea6
|
||||
Create Date: 2025-07-28 17:35:59.900680
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
from onyx.db.enums import MCPAuthenticationType
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "7ed603b64d5a"
|
||||
down_revision = "b329d00a9ea6"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
"""Create tables and columns for MCP Server support"""
|
||||
|
||||
# 1. MCP Server main table (no FK constraints yet to avoid circular refs)
|
||||
op.create_table(
|
||||
"mcp_server",
|
||||
sa.Column("id", sa.Integer(), primary_key=True),
|
||||
sa.Column("owner", sa.String(), nullable=False),
|
||||
sa.Column("name", sa.String(), nullable=False),
|
||||
sa.Column("description", sa.String(), nullable=True),
|
||||
sa.Column("server_url", sa.String(), nullable=False),
|
||||
sa.Column(
|
||||
"auth_type",
|
||||
sa.Enum(
|
||||
MCPAuthenticationType,
|
||||
name="mcp_authentication_type",
|
||||
native_enum=False,
|
||||
),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("admin_connection_config_id", sa.Integer(), nullable=True),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"), # type: ignore
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"updated_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"), # type: ignore
|
||||
nullable=False,
|
||||
),
|
||||
)
|
||||
|
||||
# 2. MCP Connection Config table (can reference mcp_server now that it exists)
|
||||
op.create_table(
|
||||
"mcp_connection_config",
|
||||
sa.Column("id", sa.Integer(), primary_key=True),
|
||||
sa.Column("mcp_server_id", sa.Integer(), nullable=True),
|
||||
sa.Column("user_email", sa.String(), nullable=False, default=""),
|
||||
sa.Column("config", sa.LargeBinary(), nullable=False),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"), # type: ignore
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"updated_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"), # type: ignore
|
||||
nullable=False,
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["mcp_server_id"], ["mcp_server.id"], ondelete="CASCADE"
|
||||
),
|
||||
)
|
||||
|
||||
# Helpful indexes
|
||||
op.create_index(
|
||||
"ix_mcp_connection_config_server_user",
|
||||
"mcp_connection_config",
|
||||
["mcp_server_id", "user_email"],
|
||||
)
|
||||
op.create_index(
|
||||
"ix_mcp_connection_config_user_email",
|
||||
"mcp_connection_config",
|
||||
["user_email"],
|
||||
)
|
||||
|
||||
# 3. Add the back-references from mcp_server to connection configs
|
||||
op.create_foreign_key(
|
||||
"mcp_server_admin_config_fk",
|
||||
"mcp_server",
|
||||
"mcp_connection_config",
|
||||
["admin_connection_config_id"],
|
||||
["id"],
|
||||
ondelete="SET NULL",
|
||||
)
|
||||
|
||||
# 4. Association / access-control tables
|
||||
op.create_table(
|
||||
"mcp_server__user",
|
||||
sa.Column("mcp_server_id", sa.Integer(), primary_key=True),
|
||||
sa.Column("user_id", sa.UUID(), primary_key=True),
|
||||
sa.ForeignKeyConstraint(
|
||||
["mcp_server_id"], ["mcp_server.id"], ondelete="CASCADE"
|
||||
),
|
||||
sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
|
||||
)
|
||||
|
||||
op.create_table(
|
||||
"mcp_server__user_group",
|
||||
sa.Column("mcp_server_id", sa.Integer(), primary_key=True),
|
||||
sa.Column("user_group_id", sa.Integer(), primary_key=True),
|
||||
sa.ForeignKeyConstraint(
|
||||
["mcp_server_id"], ["mcp_server.id"], ondelete="CASCADE"
|
||||
),
|
||||
sa.ForeignKeyConstraint(["user_group_id"], ["user_group.id"]),
|
||||
)
|
||||
|
||||
# 5. Update existing `tool` table – allow tools to belong to an MCP server
|
||||
op.add_column(
|
||||
"tool",
|
||||
sa.Column("mcp_server_id", sa.Integer(), nullable=True),
|
||||
)
|
||||
# Add column for MCP tool input schema
|
||||
op.add_column(
|
||||
"tool",
|
||||
sa.Column("mcp_input_schema", postgresql.JSONB(), nullable=True),
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"tool_mcp_server_fk",
|
||||
"tool",
|
||||
"mcp_server",
|
||||
["mcp_server_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
# 6. Update persona__tool foreign keys to cascade delete
|
||||
# This ensures that when a tool is deleted (including via MCP server deletion),
|
||||
# the corresponding persona__tool rows are also deleted
|
||||
op.drop_constraint(
|
||||
"persona__tool_tool_id_fkey", "persona__tool", type_="foreignkey"
|
||||
)
|
||||
op.drop_constraint(
|
||||
"persona__tool_persona_id_fkey", "persona__tool", type_="foreignkey"
|
||||
)
|
||||
|
||||
op.create_foreign_key(
|
||||
"persona__tool_persona_id_fkey",
|
||||
"persona__tool",
|
||||
"persona",
|
||||
["persona_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"persona__tool_tool_id_fkey",
|
||||
"persona__tool",
|
||||
"tool",
|
||||
["tool_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
# 7. Update research_agent_iteration_sub_step foreign key to SET NULL on delete
|
||||
# This ensures that when a tool is deleted, the sub_step_tool_id is set to NULL
|
||||
# instead of causing a foreign key constraint violation
|
||||
op.drop_constraint(
|
||||
"research_agent_iteration_sub_step_sub_step_tool_id_fkey",
|
||||
"research_agent_iteration_sub_step",
|
||||
type_="foreignkey",
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"research_agent_iteration_sub_step_sub_step_tool_id_fkey",
|
||||
"research_agent_iteration_sub_step",
|
||||
"tool",
|
||||
["sub_step_tool_id"],
|
||||
["id"],
|
||||
ondelete="SET NULL",
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
"""Drop all MCP-related tables / columns"""
|
||||
|
||||
# # # 1. Drop FK & columns from tool
|
||||
# op.drop_constraint("tool_mcp_server_fk", "tool", type_="foreignkey")
|
||||
op.execute("DELETE FROM tool WHERE mcp_server_id IS NOT NULL")
|
||||
|
||||
op.drop_constraint(
|
||||
"research_agent_iteration_sub_step_sub_step_tool_id_fkey",
|
||||
"research_agent_iteration_sub_step",
|
||||
type_="foreignkey",
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"research_agent_iteration_sub_step_sub_step_tool_id_fkey",
|
||||
"research_agent_iteration_sub_step",
|
||||
"tool",
|
||||
["sub_step_tool_id"],
|
||||
["id"],
|
||||
)
|
||||
|
||||
# Restore original persona__tool foreign keys (without CASCADE)
|
||||
op.drop_constraint(
|
||||
"persona__tool_persona_id_fkey", "persona__tool", type_="foreignkey"
|
||||
)
|
||||
op.drop_constraint(
|
||||
"persona__tool_tool_id_fkey", "persona__tool", type_="foreignkey"
|
||||
)
|
||||
|
||||
op.create_foreign_key(
|
||||
"persona__tool_persona_id_fkey",
|
||||
"persona__tool",
|
||||
"persona",
|
||||
["persona_id"],
|
||||
["id"],
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"persona__tool_tool_id_fkey",
|
||||
"persona__tool",
|
||||
"tool",
|
||||
["tool_id"],
|
||||
["id"],
|
||||
)
|
||||
op.drop_column("tool", "mcp_input_schema")
|
||||
op.drop_column("tool", "mcp_server_id")
|
||||
|
||||
# 2. Drop association tables
|
||||
op.drop_table("mcp_server__user_group")
|
||||
op.drop_table("mcp_server__user")
|
||||
|
||||
# 3. Drop FK from mcp_server to connection configs
|
||||
op.drop_constraint("mcp_server_admin_config_fk", "mcp_server", type_="foreignkey")
|
||||
|
||||
# 4. Drop connection config indexes & table
|
||||
op.drop_index(
|
||||
"ix_mcp_connection_config_user_email", table_name="mcp_connection_config"
|
||||
)
|
||||
op.drop_index(
|
||||
"ix_mcp_connection_config_server_user", table_name="mcp_connection_config"
|
||||
)
|
||||
op.drop_table("mcp_connection_config")
|
||||
|
||||
# 5. Finally drop mcp_server table
|
||||
op.drop_table("mcp_server")
|
||||
@@ -1,38 +0,0 @@
|
||||
"""drop include citations
|
||||
|
||||
Revision ID: 8818cf73fa1a
|
||||
Revises: 7ed603b64d5a
|
||||
Create Date: 2025-09-02 19:43:50.060680
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "8818cf73fa1a"
|
||||
down_revision = "7ed603b64d5a"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.drop_column("prompt", "include_citations")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.add_column(
|
||||
"prompt",
|
||||
sa.Column(
|
||||
"include_citations",
|
||||
sa.BOOLEAN(),
|
||||
autoincrement=False,
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
# Set include_citations based on prompt name: FALSE for ImageGeneration, TRUE for others
|
||||
op.execute(
|
||||
sa.text(
|
||||
"UPDATE prompt SET include_citations = CASE WHEN name = 'ImageGeneration' THEN FALSE ELSE TRUE END"
|
||||
)
|
||||
)
|
||||
@@ -1,341 +0,0 @@
|
||||
"""tag-fix
|
||||
|
||||
Revision ID: 90e3b9af7da4
|
||||
Revises: 62c3a055a141
|
||||
Create Date: 2025-08-01 20:58:14.607624
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
|
||||
from typing import cast
|
||||
from typing import Generator
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
|
||||
from onyx.db.search_settings import SearchSettings
|
||||
from onyx.configs.app_configs import AUTH_TYPE
|
||||
from onyx.configs.constants import AuthType
|
||||
from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
|
||||
|
||||
logger = logging.getLogger("alembic.runtime.migration")
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "90e3b9af7da4"
|
||||
down_revision = "62c3a055a141"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
SKIP_TAG_FIX = os.environ.get("SKIP_TAG_FIX", "true").lower() == "true"
|
||||
|
||||
# override for cloud
|
||||
if AUTH_TYPE == AuthType.CLOUD:
|
||||
SKIP_TAG_FIX = True
|
||||
|
||||
|
||||
def set_is_list_for_known_tags() -> None:
|
||||
"""
|
||||
Sets is_list to true for all tags that are known to be lists.
|
||||
"""
|
||||
LIST_METADATA: list[tuple[str, str]] = [
|
||||
("CLICKUP", "tags"),
|
||||
("CONFLUENCE", "labels"),
|
||||
("DISCOURSE", "tags"),
|
||||
("FRESHDESK", "emails"),
|
||||
("GITHUB", "assignees"),
|
||||
("GITHUB", "labels"),
|
||||
("GURU", "tags"),
|
||||
("GURU", "folders"),
|
||||
("HUBSPOT", "associated_contact_ids"),
|
||||
("HUBSPOT", "associated_company_ids"),
|
||||
("HUBSPOT", "associated_deal_ids"),
|
||||
("HUBSPOT", "associated_ticket_ids"),
|
||||
("JIRA", "labels"),
|
||||
("MEDIAWIKI", "categories"),
|
||||
("ZENDESK", "labels"),
|
||||
("ZENDESK", "content_tags"),
|
||||
]
|
||||
|
||||
bind = op.get_bind()
|
||||
for source, key in LIST_METADATA:
|
||||
bind.execute(
|
||||
sa.text(
|
||||
f"""
|
||||
UPDATE tag
|
||||
SET is_list = true
|
||||
WHERE tag_key = '{key}'
|
||||
AND source = '{source}'
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def set_is_list_for_list_tags() -> None:
|
||||
"""
|
||||
Sets is_list to true for all tags which have multiple values for a given
|
||||
document, key, and source triplet. This only works if we remove old tags
|
||||
from the database.
|
||||
"""
|
||||
bind = op.get_bind()
|
||||
bind.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE tag
|
||||
SET is_list = true
|
||||
FROM (
|
||||
SELECT DISTINCT tag.tag_key, tag.source
|
||||
FROM tag
|
||||
JOIN document__tag ON tag.id = document__tag.tag_id
|
||||
GROUP BY tag.tag_key, tag.source, document__tag.document_id
|
||||
HAVING count(*) > 1
|
||||
) AS list_tags
|
||||
WHERE tag.tag_key = list_tags.tag_key
|
||||
AND tag.source = list_tags.source
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def log_list_tags() -> None:
|
||||
bind = op.get_bind()
|
||||
result = bind.execute(
|
||||
sa.text(
|
||||
"""
|
||||
SELECT DISTINCT source, tag_key
|
||||
FROM tag
|
||||
WHERE is_list
|
||||
ORDER BY source, tag_key
|
||||
"""
|
||||
)
|
||||
).fetchall()
|
||||
logger.info(
|
||||
"List tags:\n" + "\n".join(f" {source}: {key}" for source, key in result)
|
||||
)
|
||||
|
||||
|
||||
def remove_old_tags() -> None:
|
||||
"""
|
||||
Removes old tags from the database.
|
||||
Previously, there was a bug where if a document got indexed with a tag and then
|
||||
the document got reindexed, the old tag would not be removed.
|
||||
This function removes those old tags by comparing it against the tags in vespa.
|
||||
"""
|
||||
current_search_settings, future_search_settings = active_search_settings()
|
||||
document_index = get_default_document_index(
|
||||
current_search_settings, future_search_settings
|
||||
)
|
||||
|
||||
# Get the index name
|
||||
if hasattr(document_index, "index_name"):
|
||||
index_name = document_index.index_name
|
||||
else:
|
||||
# Default index name if we can't get it from the document_index
|
||||
index_name = "danswer_index"
|
||||
|
||||
for batch in _get_batch_documents_with_multiple_tags():
|
||||
n_deleted = 0
|
||||
|
||||
for document_id in batch:
|
||||
true_metadata = _get_vespa_metadata(document_id, index_name)
|
||||
tags = _get_document_tags(document_id)
|
||||
|
||||
# identify document__tags to delete
|
||||
to_delete: list[str] = []
|
||||
for tag_id, tag_key, tag_value in tags:
|
||||
true_val = true_metadata.get(tag_key, "")
|
||||
if (isinstance(true_val, list) and tag_value not in true_val) or (
|
||||
isinstance(true_val, str) and tag_value != true_val
|
||||
):
|
||||
to_delete.append(str(tag_id))
|
||||
|
||||
if not to_delete:
|
||||
continue
|
||||
|
||||
# delete old document__tags
|
||||
bind = op.get_bind()
|
||||
result = bind.execute(
|
||||
sa.text(
|
||||
f"""
|
||||
DELETE FROM document__tag
|
||||
WHERE document_id = '{document_id}'
|
||||
AND tag_id IN ({','.join(to_delete)})
|
||||
"""
|
||||
)
|
||||
)
|
||||
n_deleted += result.rowcount
|
||||
logger.info(f"Processed {len(batch)} documents and deleted {n_deleted} tags")
|
||||
|
||||
|
||||
def active_search_settings() -> tuple[SearchSettings, SearchSettings | None]:
|
||||
result = op.get_bind().execute(
|
||||
sa.text(
|
||||
"""
|
||||
SELECT * FROM search_settings WHERE status = 'PRESENT' ORDER BY id DESC LIMIT 1
|
||||
"""
|
||||
)
|
||||
)
|
||||
search_settings_fetch = result.fetchall()
|
||||
search_settings = (
|
||||
SearchSettings(**search_settings_fetch[0]._asdict())
|
||||
if search_settings_fetch
|
||||
else None
|
||||
)
|
||||
|
||||
result2 = op.get_bind().execute(
|
||||
sa.text(
|
||||
"""
|
||||
SELECT * FROM search_settings WHERE status = 'FUTURE' ORDER BY id DESC LIMIT 1
|
||||
"""
|
||||
)
|
||||
)
|
||||
search_settings_future_fetch = result2.fetchall()
|
||||
search_settings_future = (
|
||||
SearchSettings(**search_settings_future_fetch[0]._asdict())
|
||||
if search_settings_future_fetch
|
||||
else None
|
||||
)
|
||||
|
||||
if not isinstance(search_settings, SearchSettings):
|
||||
raise RuntimeError(
|
||||
"current search settings is of type " + str(type(search_settings))
|
||||
)
|
||||
if (
|
||||
not isinstance(search_settings_future, SearchSettings)
|
||||
and search_settings_future is not None
|
||||
):
|
||||
raise RuntimeError(
|
||||
"future search settings is of type " + str(type(search_settings_future))
|
||||
)
|
||||
|
||||
return search_settings, search_settings_future
|
||||
|
||||
|
||||
def _get_batch_documents_with_multiple_tags(
|
||||
batch_size: int = 128,
|
||||
) -> Generator[list[str], None, None]:
|
||||
"""
|
||||
Returns a list of document ids which contain a one to many tag.
|
||||
The document may either contain a list metadata value, or may contain leftover
|
||||
old tags from reindexing.
|
||||
"""
|
||||
offset_clause = ""
|
||||
bind = op.get_bind()
|
||||
|
||||
while True:
|
||||
batch = bind.execute(
|
||||
sa.text(
|
||||
f"""
|
||||
SELECT DISTINCT document__tag.document_id
|
||||
FROM tag
|
||||
JOIN document__tag ON tag.id = document__tag.tag_id
|
||||
GROUP BY tag.tag_key, tag.source, document__tag.document_id
|
||||
HAVING count(*) > 1 {offset_clause}
|
||||
ORDER BY document__tag.document_id
|
||||
LIMIT {batch_size}
|
||||
"""
|
||||
)
|
||||
).fetchall()
|
||||
if not batch:
|
||||
break
|
||||
doc_ids = [document_id for document_id, in batch]
|
||||
yield doc_ids
|
||||
offset_clause = f"AND document__tag.document_id > '{doc_ids[-1]}'"
|
||||
|
||||
|
||||
def _get_vespa_metadata(
|
||||
document_id: str, index_name: str
|
||||
) -> dict[str, str | list[str]]:
|
||||
url = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)
|
||||
|
||||
# Document-Selector language
|
||||
selection = (
|
||||
f"{index_name}.document_id=='{document_id}' and {index_name}.chunk_id==0"
|
||||
)
|
||||
|
||||
params: dict[str, str | int] = {
|
||||
"selection": selection,
|
||||
"wantedDocumentCount": 1,
|
||||
"fieldSet": f"{index_name}:metadata",
|
||||
}
|
||||
|
||||
with get_vespa_http_client() as client:
|
||||
resp = client.get(url, params=params)
|
||||
resp.raise_for_status()
|
||||
|
||||
docs = resp.json().get("documents", [])
|
||||
if not docs:
|
||||
raise RuntimeError(f"No chunk-0 found for document {document_id}")
|
||||
|
||||
# for some reason, metadata is a string
|
||||
metadata = docs[0]["fields"]["metadata"]
|
||||
return json.loads(metadata)
|
||||
|
||||
|
||||
def _get_document_tags(document_id: str) -> list[tuple[int, str, str]]:
|
||||
bind = op.get_bind()
|
||||
result = bind.execute(
|
||||
sa.text(
|
||||
f"""
|
||||
SELECT tag.id, tag.tag_key, tag.tag_value
|
||||
FROM tag
|
||||
JOIN document__tag ON tag.id = document__tag.tag_id
|
||||
WHERE document__tag.document_id = '{document_id}'
|
||||
"""
|
||||
)
|
||||
).fetchall()
|
||||
return cast(list[tuple[int, str, str]], result)
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"tag",
|
||||
sa.Column("is_list", sa.Boolean(), nullable=False, server_default="false"),
|
||||
)
|
||||
op.drop_constraint(
|
||||
constraint_name="_tag_key_value_source_uc",
|
||||
table_name="tag",
|
||||
type_="unique",
|
||||
)
|
||||
op.create_unique_constraint(
|
||||
constraint_name="_tag_key_value_source_list_uc",
|
||||
table_name="tag",
|
||||
columns=["tag_key", "tag_value", "source", "is_list"],
|
||||
)
|
||||
set_is_list_for_known_tags()
|
||||
|
||||
if SKIP_TAG_FIX:
|
||||
logger.warning(
|
||||
"Skipping removal of old tags. "
|
||||
"This can cause issues when using the knowledge graph, or "
|
||||
"when filtering for documents by tags."
|
||||
)
|
||||
log_list_tags()
|
||||
return
|
||||
|
||||
remove_old_tags()
|
||||
set_is_list_for_list_tags()
|
||||
|
||||
# debug
|
||||
log_list_tags()
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# the migration adds and populates the is_list column, and removes old bugged tags
|
||||
# there isn't a point in adding back the bugged tags, so we just drop the column
|
||||
op.drop_constraint(
|
||||
constraint_name="_tag_key_value_source_list_uc",
|
||||
table_name="tag",
|
||||
type_="unique",
|
||||
)
|
||||
op.create_unique_constraint(
|
||||
constraint_name="_tag_key_value_source_uc",
|
||||
table_name="tag",
|
||||
columns=["tag_key", "tag_value", "source"],
|
||||
)
|
||||
op.drop_column("tag", "is_list")
|
||||
@@ -103,7 +103,6 @@ def upgrade() -> None:
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("connector_credential_pair", "is_user_file")
|
||||
# Drop the persona__user_folder table
|
||||
op.drop_table("persona__user_folder")
|
||||
# Drop the persona__user_file table
|
||||
@@ -112,3 +111,4 @@ def downgrade() -> None:
|
||||
op.drop_table("user_file")
|
||||
# Drop the user_folder table
|
||||
op.drop_table("user_folder")
|
||||
op.drop_column("connector_credential_pair", "is_user_file")
|
||||
|
||||
@@ -1,32 +0,0 @@
|
||||
"""Add public_external_user_group table
|
||||
|
||||
Revision ID: a7688ab35c45
|
||||
Revises: 5c448911b12f
|
||||
Create Date: 2025-05-06 20:55:12.747875
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "a7688ab35c45"
|
||||
down_revision = "5c448911b12f"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"public_external_user_group",
|
||||
sa.Column("external_user_group_id", sa.String(), nullable=False),
|
||||
sa.Column("cc_pair_id", sa.Integer(), nullable=False),
|
||||
sa.PrimaryKeyConstraint("external_user_group_id", "cc_pair_id"),
|
||||
sa.ForeignKeyConstraint(
|
||||
["cc_pair_id"], ["connector_credential_pair.id"], ondelete="CASCADE"
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_table("public_external_user_group")
|
||||
@@ -1,225 +0,0 @@
|
||||
"""merge prompt into persona
|
||||
|
||||
Revision ID: abbfec3a5ac5
|
||||
Revises: 8818cf73fa1a
|
||||
Create Date: 2024-12-19 12:00:00.000000
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "abbfec3a5ac5"
|
||||
down_revision = "8818cf73fa1a"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
MAX_PROMPT_LENGTH = 5_000_000
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
"""NOTE: Prompts without any Personas will just be lost."""
|
||||
# Step 1: Add new columns to persona table (only if they don't exist)
|
||||
|
||||
# Check if columns exist before adding them
|
||||
connection = op.get_bind()
|
||||
inspector = sa.inspect(connection)
|
||||
existing_columns = [col["name"] for col in inspector.get_columns("persona")]
|
||||
|
||||
if "system_prompt" not in existing_columns:
|
||||
op.add_column(
|
||||
"persona",
|
||||
sa.Column(
|
||||
"system_prompt", sa.String(length=MAX_PROMPT_LENGTH), nullable=True
|
||||
),
|
||||
)
|
||||
|
||||
if "task_prompt" not in existing_columns:
|
||||
op.add_column(
|
||||
"persona",
|
||||
sa.Column(
|
||||
"task_prompt", sa.String(length=MAX_PROMPT_LENGTH), nullable=True
|
||||
),
|
||||
)
|
||||
|
||||
if "datetime_aware" not in existing_columns:
|
||||
op.add_column(
|
||||
"persona",
|
||||
sa.Column(
|
||||
"datetime_aware", sa.Boolean(), nullable=False, server_default="true"
|
||||
),
|
||||
)
|
||||
|
||||
# Step 2: Migrate data from prompt table to persona table (only if tables exist)
|
||||
existing_tables = inspector.get_table_names()
|
||||
|
||||
if "prompt" in existing_tables and "persona__prompt" in existing_tables:
|
||||
# For personas that have associated prompts, copy the prompt data
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE persona
|
||||
SET
|
||||
system_prompt = p.system_prompt,
|
||||
task_prompt = p.task_prompt,
|
||||
datetime_aware = p.datetime_aware
|
||||
FROM (
|
||||
-- Get the first prompt for each persona (in case there are multiple)
|
||||
SELECT DISTINCT ON (pp.persona_id)
|
||||
pp.persona_id,
|
||||
pr.system_prompt,
|
||||
pr.task_prompt,
|
||||
pr.datetime_aware
|
||||
FROM persona__prompt pp
|
||||
JOIN prompt pr ON pp.prompt_id = pr.id
|
||||
) p
|
||||
WHERE persona.id = p.persona_id
|
||||
"""
|
||||
)
|
||||
|
||||
# Step 3: Update chat_message references
|
||||
# Since chat messages referenced prompt_id, we need to update them to use persona_id
|
||||
# This is complex as we need to map from prompt_id to persona_id
|
||||
|
||||
# Check if chat_message has prompt_id column
|
||||
chat_message_columns = [
|
||||
col["name"] for col in inspector.get_columns("chat_message")
|
||||
]
|
||||
if "prompt_id" in chat_message_columns:
|
||||
op.execute(
|
||||
"""
|
||||
ALTER TABLE chat_message
|
||||
DROP CONSTRAINT IF EXISTS chat_message__prompt_fk
|
||||
"""
|
||||
)
|
||||
op.drop_column("chat_message", "prompt_id")
|
||||
|
||||
# Step 4: Handle personas without prompts - set default values if needed (always run this)
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE persona
|
||||
SET
|
||||
system_prompt = COALESCE(system_prompt, ''),
|
||||
task_prompt = COALESCE(task_prompt, '')
|
||||
WHERE system_prompt IS NULL OR task_prompt IS NULL
|
||||
"""
|
||||
)
|
||||
|
||||
# Step 5: Drop the persona__prompt association table (if it exists)
|
||||
if "persona__prompt" in existing_tables:
|
||||
op.drop_table("persona__prompt")
|
||||
|
||||
# Step 6: Drop the prompt table (if it exists)
|
||||
if "prompt" in existing_tables:
|
||||
op.drop_table("prompt")
|
||||
|
||||
# Step 7: Make system_prompt and task_prompt non-nullable after migration (only if they exist)
|
||||
op.alter_column(
|
||||
"persona",
|
||||
"system_prompt",
|
||||
existing_type=sa.String(length=MAX_PROMPT_LENGTH),
|
||||
nullable=False,
|
||||
server_default=None,
|
||||
)
|
||||
|
||||
op.alter_column(
|
||||
"persona",
|
||||
"task_prompt",
|
||||
existing_type=sa.String(length=MAX_PROMPT_LENGTH),
|
||||
nullable=False,
|
||||
server_default=None,
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Step 1: Recreate the prompt table
|
||||
op.create_table(
|
||||
"prompt",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=True),
|
||||
sa.Column("name", sa.String(), nullable=False),
|
||||
sa.Column("description", sa.String(), nullable=False),
|
||||
sa.Column("system_prompt", sa.String(length=MAX_PROMPT_LENGTH), nullable=False),
|
||||
sa.Column("task_prompt", sa.String(length=MAX_PROMPT_LENGTH), nullable=False),
|
||||
sa.Column(
|
||||
"datetime_aware", sa.Boolean(), nullable=False, server_default="true"
|
||||
),
|
||||
sa.Column(
|
||||
"default_prompt", sa.Boolean(), nullable=False, server_default="false"
|
||||
),
|
||||
sa.Column("deleted", sa.Boolean(), nullable=False, server_default="false"),
|
||||
sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
|
||||
# Step 2: Recreate the persona__prompt association table
|
||||
op.create_table(
|
||||
"persona__prompt",
|
||||
sa.Column("persona_id", sa.Integer(), nullable=False),
|
||||
sa.Column("prompt_id", sa.Integer(), nullable=False),
|
||||
sa.ForeignKeyConstraint(
|
||||
["persona_id"],
|
||||
["persona.id"],
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["prompt_id"],
|
||||
["prompt.id"],
|
||||
),
|
||||
sa.PrimaryKeyConstraint("persona_id", "prompt_id"),
|
||||
)
|
||||
|
||||
# Step 3: Migrate data back from persona to prompt table
|
||||
op.execute(
|
||||
"""
|
||||
INSERT INTO prompt (
|
||||
name,
|
||||
description,
|
||||
system_prompt,
|
||||
task_prompt,
|
||||
datetime_aware,
|
||||
default_prompt,
|
||||
deleted,
|
||||
user_id
|
||||
)
|
||||
SELECT
|
||||
CONCAT('Prompt for ', name),
|
||||
description,
|
||||
system_prompt,
|
||||
task_prompt,
|
||||
datetime_aware,
|
||||
is_default_persona,
|
||||
deleted,
|
||||
user_id
|
||||
FROM persona
|
||||
WHERE system_prompt IS NOT NULL AND system_prompt != ''
|
||||
RETURNING id, name
|
||||
"""
|
||||
)
|
||||
|
||||
# Step 4: Re-establish persona__prompt relationships
|
||||
op.execute(
|
||||
"""
|
||||
INSERT INTO persona__prompt (persona_id, prompt_id)
|
||||
SELECT
|
||||
p.id as persona_id,
|
||||
pr.id as prompt_id
|
||||
FROM persona p
|
||||
JOIN prompt pr ON pr.name = CONCAT('Prompt for ', p.name)
|
||||
WHERE p.system_prompt IS NOT NULL AND p.system_prompt != ''
|
||||
"""
|
||||
)
|
||||
|
||||
# Step 5: Add prompt_id column back to chat_message
|
||||
op.add_column("chat_message", sa.Column("prompt_id", sa.Integer(), nullable=True))
|
||||
|
||||
# Step 6: Re-establish foreign key constraint
|
||||
op.create_foreign_key(
|
||||
"chat_message__prompt_fk", "chat_message", "prompt", ["prompt_id"], ["id"]
|
||||
)
|
||||
|
||||
# Step 7: Remove columns from persona table
|
||||
op.drop_column("persona", "datetime_aware")
|
||||
op.drop_column("persona", "task_prompt")
|
||||
op.drop_column("persona", "system_prompt")
|
||||
@@ -1,38 +0,0 @@
|
||||
"""Adding assistant-specific user preferences
|
||||
|
||||
Revision ID: b329d00a9ea6
|
||||
Revises: f9b8c7d6e5a4
|
||||
Create Date: 2025-08-26 23:14:44.592985
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import fastapi_users_db_sqlalchemy
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "b329d00a9ea6"
|
||||
down_revision = "f9b8c7d6e5a4"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"assistant__user_specific_config",
|
||||
sa.Column("assistant_id", sa.Integer(), nullable=False),
|
||||
sa.Column(
|
||||
"user_id",
|
||||
fastapi_users_db_sqlalchemy.generics.GUID(),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("disabled_tool_ids", postgresql.ARRAY(sa.Integer()), nullable=False),
|
||||
sa.ForeignKeyConstraint(["assistant_id"], ["persona.id"], ondelete="CASCADE"),
|
||||
sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
|
||||
sa.PrimaryKeyConstraint("assistant_id", "user_id"),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_table("assistant__user_specific_config")
|
||||
@@ -1,27 +0,0 @@
|
||||
"""add_user_oauth_token_to_slack_bot
|
||||
|
||||
Revision ID: b4ef3ae0bf6e
|
||||
Revises: 505c488f6662
|
||||
Create Date: 2025-08-26 17:47:41.788462
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "b4ef3ae0bf6e"
|
||||
down_revision = "505c488f6662"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Add user_token column to slack_bot table
|
||||
op.add_column("slack_bot", sa.Column("user_token", sa.LargeBinary(), nullable=True))
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Remove user_token column from slack_bot table
|
||||
op.drop_column("slack_bot", "user_token")
|
||||
@@ -1,33 +0,0 @@
|
||||
"""Pause finished user file connectors
|
||||
|
||||
Revision ID: b558f51620b4
|
||||
Revises: 90e3b9af7da4
|
||||
Create Date: 2025-08-15 17:17:02.456704
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "b558f51620b4"
|
||||
down_revision = "90e3b9af7da4"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Set all user file connector credential pairs with ACTIVE status to PAUSED
|
||||
# This ensures user files don't continue to run indexing tasks after processing
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE connector_credential_pair
|
||||
SET status = 'PAUSED'
|
||||
WHERE is_user_file = true
|
||||
AND status = 'ACTIVE'
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
pass
|
||||
@@ -1,43 +0,0 @@
|
||||
"""adjust prompt length
|
||||
|
||||
Revision ID: b7ec9b5b505f
|
||||
Revises: abbfec3a5ac5
|
||||
Create Date: 2025-09-10 18:51:15.629197
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "b7ec9b5b505f"
|
||||
down_revision = "abbfec3a5ac5"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
MAX_PROMPT_LENGTH = 5_000_000
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# NOTE: need to run this since the previous migration PREVIOUSLY set the length to 8000
|
||||
op.alter_column(
|
||||
"persona",
|
||||
"system_prompt",
|
||||
existing_type=sa.String(length=8000),
|
||||
type_=sa.String(length=MAX_PROMPT_LENGTH),
|
||||
existing_nullable=False,
|
||||
)
|
||||
op.alter_column(
|
||||
"persona",
|
||||
"task_prompt",
|
||||
existing_type=sa.String(length=8000),
|
||||
type_=sa.String(length=MAX_PROMPT_LENGTH),
|
||||
existing_nullable=False,
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Downgrade not necessary
|
||||
pass
|
||||
@@ -1,147 +0,0 @@
|
||||
"""migrate_agent_sub_questions_to_research_iterations
|
||||
|
||||
Revision ID: bd7c3bf8beba
|
||||
Revises: f8a9b2c3d4e5
|
||||
Create Date: 2025-08-18 11:33:27.098287
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "bd7c3bf8beba"
|
||||
down_revision = "f8a9b2c3d4e5"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Get connection to execute raw SQL
|
||||
connection = op.get_bind()
|
||||
|
||||
# First, insert data into research_agent_iteration table
|
||||
# This creates one iteration record per primary_question_id using the earliest time_created
|
||||
connection.execute(
|
||||
sa.text(
|
||||
"""
|
||||
INSERT INTO research_agent_iteration (primary_question_id, created_at, iteration_nr, purpose, reasoning)
|
||||
SELECT
|
||||
primary_question_id,
|
||||
MIN(time_created) as created_at,
|
||||
1 as iteration_nr,
|
||||
'Generating and researching subquestions' as purpose,
|
||||
'(No previous reasoning)' as reasoning
|
||||
FROM agent__sub_question
|
||||
JOIN chat_message on agent__sub_question.primary_question_id = chat_message.id
|
||||
WHERE primary_question_id IS NOT NULL
|
||||
AND chat_message.is_agentic = true
|
||||
GROUP BY primary_question_id
|
||||
ON CONFLICT DO NOTHING;
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
# Then, insert data into research_agent_iteration_sub_step table
|
||||
# This migrates each sub-question as a sub-step
|
||||
connection.execute(
|
||||
sa.text(
|
||||
"""
|
||||
INSERT INTO research_agent_iteration_sub_step (
|
||||
primary_question_id,
|
||||
iteration_nr,
|
||||
iteration_sub_step_nr,
|
||||
created_at,
|
||||
sub_step_instructions,
|
||||
sub_step_tool_id,
|
||||
sub_answer,
|
||||
cited_doc_results
|
||||
)
|
||||
SELECT
|
||||
primary_question_id,
|
||||
1 as iteration_nr,
|
||||
level_question_num as iteration_sub_step_nr,
|
||||
time_created as created_at,
|
||||
sub_question as sub_step_instructions,
|
||||
1 as sub_step_tool_id,
|
||||
sub_answer,
|
||||
sub_question_doc_results as cited_doc_results
|
||||
FROM agent__sub_question
|
||||
JOIN chat_message on agent__sub_question.primary_question_id = chat_message.id
|
||||
WHERE chat_message.is_agentic = true
|
||||
AND primary_question_id IS NOT NULL
|
||||
ON CONFLICT DO NOTHING;
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
# Update chat_message records: set legacy agentic type and answer purpose for existing agentic messages
|
||||
connection.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE chat_message
|
||||
SET research_answer_purpose = 'ANSWER'
|
||||
WHERE is_agentic = true
|
||||
AND research_type IS NULL and
|
||||
message_type = 'ASSISTANT';
|
||||
"""
|
||||
)
|
||||
)
|
||||
connection.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE chat_message
|
||||
SET research_type = 'LEGACY_AGENTIC'
|
||||
WHERE is_agentic = true
|
||||
AND research_type IS NULL;
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Get connection to execute raw SQL
|
||||
connection = op.get_bind()
|
||||
|
||||
# Note: This downgrade removes all research agent iteration data
|
||||
# There's no way to perfectly restore the original agent__sub_question data
|
||||
# if it was deleted after this migration
|
||||
|
||||
# Delete all research_agent_iteration_sub_step records that were migrated
|
||||
connection.execute(
|
||||
sa.text(
|
||||
"""
|
||||
DELETE FROM research_agent_iteration_sub_step
|
||||
USING chat_message
|
||||
WHERE research_agent_iteration_sub_step.primary_question_id = chat_message.id
|
||||
AND chat_message.research_type = 'LEGACY_AGENTIC';
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
# Delete all research_agent_iteration records that were migrated
|
||||
connection.execute(
|
||||
sa.text(
|
||||
"""
|
||||
DELETE FROM research_agent_iteration
|
||||
USING chat_message
|
||||
WHERE research_agent_iteration.primary_question_id = chat_message.id
|
||||
AND chat_message.research_type = 'LEGACY_AGENTIC';
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
# Revert chat_message updates: clear research fields for legacy agentic messages
|
||||
connection.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE chat_message
|
||||
SET research_type = NULL,
|
||||
research_answer_purpose = NULL
|
||||
WHERE is_agentic = true
|
||||
AND research_type = 'LEGACY_AGENTIC'
|
||||
AND message_type = 'ASSISTANT';
|
||||
"""
|
||||
)
|
||||
)
|
||||
@@ -1,315 +0,0 @@
|
||||
"""modify_file_store_for_external_storage
|
||||
|
||||
Revision ID: c9e2cd766c29
|
||||
Revises: 03bf8be6b53a
|
||||
Create Date: 2025-06-13 14:02:09.867679
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import text
|
||||
from typing import cast, Any
|
||||
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
from onyx.db._deprecated.pg_file_store import delete_lobj_by_id, read_lobj
|
||||
from onyx.file_store.file_store import get_s3_file_store
|
||||
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "c9e2cd766c29"
|
||||
down_revision = "03bf8be6b53a"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
try:
|
||||
# Modify existing file_store table to support external storage
|
||||
op.rename_table("file_store", "file_record")
|
||||
|
||||
# Make lobj_oid nullable (for external storage files)
|
||||
op.alter_column("file_record", "lobj_oid", nullable=True)
|
||||
|
||||
# Add external storage columns with generic names
|
||||
op.add_column(
|
||||
"file_record", sa.Column("bucket_name", sa.String(), nullable=True)
|
||||
)
|
||||
op.add_column(
|
||||
"file_record", sa.Column("object_key", sa.String(), nullable=True)
|
||||
)
|
||||
|
||||
# Add timestamps for tracking
|
||||
op.add_column(
|
||||
"file_record",
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.func.now(),
|
||||
nullable=False,
|
||||
),
|
||||
)
|
||||
op.add_column(
|
||||
"file_record",
|
||||
sa.Column(
|
||||
"updated_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.func.now(),
|
||||
nullable=False,
|
||||
),
|
||||
)
|
||||
|
||||
op.alter_column("file_record", "file_name", new_column_name="file_id")
|
||||
except Exception as e:
|
||||
if "does not exist" in str(e) or 'relation "file_store" does not exist' in str(
|
||||
e
|
||||
):
|
||||
print(
|
||||
f"Ran into error - {e}. Likely means we had a partial success in the past, continuing..."
|
||||
)
|
||||
else:
|
||||
raise
|
||||
|
||||
print(
|
||||
"External storage configured - migrating files from PostgreSQL to external storage..."
|
||||
)
|
||||
# if we fail midway through this, we'll have a partial success. Running the migration
|
||||
# again should allow us to continue.
|
||||
_migrate_files_to_external_storage()
|
||||
print("File migration completed successfully!")
|
||||
|
||||
# Remove lobj_oid column
|
||||
op.drop_column("file_record", "lobj_oid")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
"""Revert schema changes and migrate files from external storage back to PostgreSQL large objects."""
|
||||
|
||||
print(
|
||||
"Reverting to PostgreSQL-backed file store – migrating files from external storage …"
|
||||
)
|
||||
|
||||
# 1. Ensure `lobj_oid` exists on the current `file_record` table (nullable for now).
|
||||
op.add_column("file_record", sa.Column("lobj_oid", sa.Integer(), nullable=True))
|
||||
|
||||
# 2. Move content from external storage back into PostgreSQL large objects (table is still
|
||||
# called `file_record` so application code continues to work during the copy).
|
||||
try:
|
||||
_migrate_files_to_postgres()
|
||||
except Exception:
|
||||
print("Error during downgrade migration, rolling back …")
|
||||
op.drop_column("file_record", "lobj_oid")
|
||||
raise
|
||||
|
||||
# 3. After migration every row should now have `lobj_oid` populated – mark NOT NULL.
|
||||
op.alter_column("file_record", "lobj_oid", nullable=False)
|
||||
|
||||
# 4. Remove columns that are only relevant to external storage.
|
||||
op.drop_column("file_record", "updated_at")
|
||||
op.drop_column("file_record", "created_at")
|
||||
op.drop_column("file_record", "object_key")
|
||||
op.drop_column("file_record", "bucket_name")
|
||||
|
||||
# 5. Rename `file_id` back to `file_name` (still on `file_record`).
|
||||
op.alter_column("file_record", "file_id", new_column_name="file_name")
|
||||
|
||||
# 6. Finally, rename the table back to its original name expected by the legacy codebase.
|
||||
op.rename_table("file_record", "file_store")
|
||||
|
||||
print(
|
||||
"Downgrade migration completed – files are now stored inside PostgreSQL again."
|
||||
)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Helper: migrate from external storage (S3/MinIO) back into PostgreSQL large objects
|
||||
|
||||
|
||||
def _migrate_files_to_postgres() -> None:
|
||||
"""Move any files whose content lives in external S3-compatible storage back into PostgreSQL.
|
||||
|
||||
The logic mirrors *inverse* of `_migrate_files_to_external_storage` used on upgrade.
|
||||
"""
|
||||
|
||||
# Obtain DB session from Alembic context
|
||||
bind = op.get_bind()
|
||||
session = Session(bind=bind)
|
||||
|
||||
# Fetch rows that have external storage pointers (bucket/object_key not NULL)
|
||||
result = session.execute(
|
||||
text(
|
||||
"SELECT file_id, bucket_name, object_key FROM file_record "
|
||||
"WHERE bucket_name IS NOT NULL AND object_key IS NOT NULL"
|
||||
)
|
||||
)
|
||||
|
||||
files_to_migrate = [row[0] for row in result.fetchall()]
|
||||
total_files = len(files_to_migrate)
|
||||
|
||||
if total_files == 0:
|
||||
print("No files found in external storage to migrate back to PostgreSQL.")
|
||||
return
|
||||
|
||||
print(f"Found {total_files} files to migrate back to PostgreSQL large objects.")
|
||||
|
||||
_set_tenant_contextvar(session)
|
||||
migrated_count = 0
|
||||
|
||||
# only create external store if we have files to migrate. This line
|
||||
# makes it so we need to have S3/MinIO configured to run this migration.
|
||||
external_store = get_s3_file_store()
|
||||
|
||||
for i, file_id in enumerate(files_to_migrate, 1):
|
||||
print(f"Migrating file {i}/{total_files}: {file_id}")
|
||||
|
||||
# Read file content from external storage (always binary)
|
||||
try:
|
||||
file_io = external_store.read_file(
|
||||
file_id=file_id, mode="b", use_tempfile=True
|
||||
)
|
||||
file_io.seek(0)
|
||||
|
||||
# Import lazily to avoid circular deps at Alembic runtime
|
||||
from onyx.db._deprecated.pg_file_store import (
|
||||
create_populate_lobj,
|
||||
) # noqa: E402
|
||||
|
||||
# Create new Postgres large object and populate it
|
||||
lobj_oid = create_populate_lobj(content=file_io, db_session=session)
|
||||
|
||||
# Update DB row: set lobj_oid, clear bucket/object_key
|
||||
session.execute(
|
||||
text(
|
||||
"UPDATE file_record SET lobj_oid = :lobj_oid, bucket_name = NULL, "
|
||||
"object_key = NULL WHERE file_id = :file_id"
|
||||
),
|
||||
{"lobj_oid": lobj_oid, "file_id": file_id},
|
||||
)
|
||||
except ClientError as e:
|
||||
if "NoSuchKey" in str(e):
|
||||
print(
|
||||
f"File {file_id} not found in external storage. Deleting from database."
|
||||
)
|
||||
session.execute(
|
||||
text("DELETE FROM file_record WHERE file_id = :file_id"),
|
||||
{"file_id": file_id},
|
||||
)
|
||||
else:
|
||||
raise
|
||||
|
||||
migrated_count += 1
|
||||
print(f"✓ Successfully migrated file {i}/{total_files}: {file_id}")
|
||||
|
||||
# Flush the SQLAlchemy session so statements are sent to the DB, but **do not**
|
||||
# commit the transaction. The surrounding Alembic migration will commit once
|
||||
# the *entire* downgrade succeeds. This keeps the whole downgrade atomic and
|
||||
# avoids leaving the database in a partially-migrated state if a later schema
|
||||
# operation fails.
|
||||
session.flush()
|
||||
|
||||
print(
|
||||
f"Migration back to PostgreSQL completed: {migrated_count} files staged for commit."
|
||||
)
|
||||
|
||||
|
||||
def _migrate_files_to_external_storage() -> None:
|
||||
"""Migrate files from PostgreSQL large objects to external storage"""
|
||||
# Get database session
|
||||
bind = op.get_bind()
|
||||
session = Session(bind=bind)
|
||||
external_store = get_s3_file_store()
|
||||
|
||||
# Find all files currently stored in PostgreSQL (lobj_oid is not null)
|
||||
result = session.execute(
|
||||
text(
|
||||
"SELECT file_id FROM file_record WHERE lobj_oid IS NOT NULL "
|
||||
"AND bucket_name IS NULL AND object_key IS NULL"
|
||||
)
|
||||
)
|
||||
|
||||
files_to_migrate = [row[0] for row in result.fetchall()]
|
||||
total_files = len(files_to_migrate)
|
||||
|
||||
if total_files == 0:
|
||||
print("No files found in PostgreSQL storage to migrate.")
|
||||
return
|
||||
|
||||
# might need to move this above the if statement when creating a new multi-tenant
|
||||
# system. VERY extreme edge case.
|
||||
external_store.initialize()
|
||||
print(f"Found {total_files} files to migrate from PostgreSQL to external storage.")
|
||||
|
||||
_set_tenant_contextvar(session)
|
||||
migrated_count = 0
|
||||
|
||||
for i, file_id in enumerate(files_to_migrate, 1):
|
||||
print(f"Migrating file {i}/{total_files}: {file_id}")
|
||||
|
||||
# Read file record to get metadata
|
||||
file_record = session.execute(
|
||||
text("SELECT * FROM file_record WHERE file_id = :file_id"),
|
||||
{"file_id": file_id},
|
||||
).fetchone()
|
||||
|
||||
if file_record is None:
|
||||
print(f"File {file_id} not found in PostgreSQL storage.")
|
||||
continue
|
||||
|
||||
lobj_id = cast(int, file_record.lobj_oid) # type: ignore
|
||||
file_metadata = cast(Any, file_record.file_metadata) # type: ignore
|
||||
|
||||
# Read file content from PostgreSQL
|
||||
try:
|
||||
file_content = read_lobj(
|
||||
lobj_id, db_session=session, mode="b", use_tempfile=True
|
||||
)
|
||||
except Exception as e:
|
||||
if "large object" in str(e) and "does not exist" in str(e):
|
||||
print(f"File {file_id} not found in PostgreSQL storage.")
|
||||
continue
|
||||
else:
|
||||
raise
|
||||
|
||||
# Handle file_metadata type conversion
|
||||
file_metadata = None
|
||||
if file_metadata is not None:
|
||||
if isinstance(file_metadata, dict):
|
||||
file_metadata = file_metadata
|
||||
else:
|
||||
# Convert other types to dict if possible, otherwise None
|
||||
try:
|
||||
file_metadata = dict(file_record.file_metadata) # type: ignore
|
||||
except (TypeError, ValueError):
|
||||
file_metadata = None
|
||||
|
||||
# Save to external storage (this will handle the database record update and cleanup)
|
||||
# NOTE: this WILL .commit() the transaction.
|
||||
external_store.save_file(
|
||||
file_id=file_id,
|
||||
content=file_content,
|
||||
display_name=file_record.display_name,
|
||||
file_origin=file_record.file_origin,
|
||||
file_type=file_record.file_type,
|
||||
file_metadata=file_metadata,
|
||||
)
|
||||
delete_lobj_by_id(lobj_id, db_session=session)
|
||||
|
||||
migrated_count += 1
|
||||
print(f"✓ Successfully migrated file {i}/{total_files}: {file_id}")
|
||||
|
||||
# See note above – flush but do **not** commit so the outer Alembic transaction
|
||||
# controls atomicity.
|
||||
session.flush()
|
||||
|
||||
print(
|
||||
f"Migration completed: {migrated_count} files staged for commit to external storage."
|
||||
)
|
||||
|
||||
|
||||
def _set_tenant_contextvar(session: Session) -> None:
|
||||
"""Set the tenant contextvar to the default schema"""
|
||||
current_tenant = session.execute(text("SELECT current_schema()")).scalar()
|
||||
print(f"Migrating files for tenant: {current_tenant}")
|
||||
CURRENT_TENANT_ID_CONTEXTVAR.set(current_tenant)
|
||||
@@ -1,128 +0,0 @@
|
||||
"""add_cascade_deletes_to_agent_tables
|
||||
|
||||
Revision ID: ca04500b9ee8
|
||||
Revises: 238b84885828
|
||||
Create Date: 2025-05-30 16:03:51.112263
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "ca04500b9ee8"
|
||||
down_revision = "238b84885828"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Drop existing foreign key constraints
|
||||
op.drop_constraint(
|
||||
"agent__sub_question_primary_question_id_fkey",
|
||||
"agent__sub_question",
|
||||
type_="foreignkey",
|
||||
)
|
||||
op.drop_constraint(
|
||||
"agent__sub_query_parent_question_id_fkey",
|
||||
"agent__sub_query",
|
||||
type_="foreignkey",
|
||||
)
|
||||
op.drop_constraint(
|
||||
"chat_message__standard_answer_chat_message_id_fkey",
|
||||
"chat_message__standard_answer",
|
||||
type_="foreignkey",
|
||||
)
|
||||
op.drop_constraint(
|
||||
"agent__sub_query__search_doc_sub_query_id_fkey",
|
||||
"agent__sub_query__search_doc",
|
||||
type_="foreignkey",
|
||||
)
|
||||
|
||||
# Recreate foreign key constraints with CASCADE delete
|
||||
op.create_foreign_key(
|
||||
"agent__sub_question_primary_question_id_fkey",
|
||||
"agent__sub_question",
|
||||
"chat_message",
|
||||
["primary_question_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"agent__sub_query_parent_question_id_fkey",
|
||||
"agent__sub_query",
|
||||
"agent__sub_question",
|
||||
["parent_question_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"chat_message__standard_answer_chat_message_id_fkey",
|
||||
"chat_message__standard_answer",
|
||||
"chat_message",
|
||||
["chat_message_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"agent__sub_query__search_doc_sub_query_id_fkey",
|
||||
"agent__sub_query__search_doc",
|
||||
"agent__sub_query",
|
||||
["sub_query_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop CASCADE foreign key constraints
|
||||
op.drop_constraint(
|
||||
"agent__sub_question_primary_question_id_fkey",
|
||||
"agent__sub_question",
|
||||
type_="foreignkey",
|
||||
)
|
||||
op.drop_constraint(
|
||||
"agent__sub_query_parent_question_id_fkey",
|
||||
"agent__sub_query",
|
||||
type_="foreignkey",
|
||||
)
|
||||
op.drop_constraint(
|
||||
"chat_message__standard_answer_chat_message_id_fkey",
|
||||
"chat_message__standard_answer",
|
||||
type_="foreignkey",
|
||||
)
|
||||
op.drop_constraint(
|
||||
"agent__sub_query__search_doc_sub_query_id_fkey",
|
||||
"agent__sub_query__search_doc",
|
||||
type_="foreignkey",
|
||||
)
|
||||
|
||||
# Recreate foreign key constraints without CASCADE delete
|
||||
op.create_foreign_key(
|
||||
"agent__sub_question_primary_question_id_fkey",
|
||||
"agent__sub_question",
|
||||
"chat_message",
|
||||
["primary_question_id"],
|
||||
["id"],
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"agent__sub_query_parent_question_id_fkey",
|
||||
"agent__sub_query",
|
||||
"agent__sub_question",
|
||||
["parent_question_id"],
|
||||
["id"],
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"chat_message__standard_answer_chat_message_id_fkey",
|
||||
"chat_message__standard_answer",
|
||||
"chat_message",
|
||||
["chat_message_id"],
|
||||
["id"],
|
||||
)
|
||||
op.create_foreign_key(
|
||||
"agent__sub_query__search_doc_sub_query_id_fkey",
|
||||
"agent__sub_query__search_doc",
|
||||
"agent__sub_query",
|
||||
["sub_query_id"],
|
||||
["id"],
|
||||
)
|
||||
@@ -1,29 +0,0 @@
|
||||
"""kgentity_parent
|
||||
|
||||
Revision ID: cec7ec36c505
|
||||
Revises: 495cb26ce93e
|
||||
Create Date: 2025-06-07 20:07:46.400770
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "cec7ec36c505"
|
||||
down_revision = "495cb26ce93e"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"kg_entity",
|
||||
sa.Column("parent_key", sa.String(), nullable=True, index=True),
|
||||
)
|
||||
# NOTE: you will have to reindex the KG after this migration as the parent_key will be null
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("kg_entity", "parent_key")
|
||||
@@ -1,152 +0,0 @@
|
||||
"""seed_builtin_tools
|
||||
|
||||
Revision ID: d09fc20a3c66
|
||||
Revises: b7ec9b5b505f
|
||||
Create Date: 2025-09-09 19:32:16.824373
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "d09fc20a3c66"
|
||||
down_revision = "b7ec9b5b505f"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
# Tool definitions - core tools that should always be seeded
|
||||
# Names/in_code_tool_id are the same as the class names in the tool_implementations package
|
||||
BUILT_IN_TOOLS = [
|
||||
{
|
||||
"name": "SearchTool",
|
||||
"display_name": "Internal Search",
|
||||
"description": "The Search Action allows the Assistant to search through connected knowledge to help build an answer.",
|
||||
"in_code_tool_id": "SearchTool",
|
||||
},
|
||||
{
|
||||
"name": "ImageGenerationTool",
|
||||
"display_name": "Image Generation",
|
||||
"description": (
|
||||
"The Image Generation Action allows the assistant to use DALL-E 3 or GPT-IMAGE-1 to generate images. "
|
||||
"The action will be used when the user asks the assistant to generate an image."
|
||||
),
|
||||
"in_code_tool_id": "ImageGenerationTool",
|
||||
},
|
||||
{
|
||||
"name": "WebSearchTool",
|
||||
"display_name": "Web Search",
|
||||
"description": (
|
||||
"The Web Search Action allows the assistant "
|
||||
"to perform internet searches for up-to-date information."
|
||||
),
|
||||
"in_code_tool_id": "WebSearchTool",
|
||||
},
|
||||
{
|
||||
"name": "KnowledgeGraphTool",
|
||||
"display_name": "Knowledge Graph Search",
|
||||
"description": (
|
||||
"The Knowledge Graph Search Action allows the assistant to search the "
|
||||
"Knowledge Graph for information. This tool can (for now) only be active in the KG Beta Assistant, "
|
||||
"and it requires the Knowledge Graph to be enabled."
|
||||
),
|
||||
"in_code_tool_id": "KnowledgeGraphTool",
|
||||
},
|
||||
{
|
||||
"name": "OktaProfileTool",
|
||||
"display_name": "Okta Profile",
|
||||
"description": (
|
||||
"The Okta Profile Action allows the assistant to fetch the current user's information from Okta. "
|
||||
"This may include the user's name, email, phone number, address, and other details such as their "
|
||||
"manager and direct reports."
|
||||
),
|
||||
"in_code_tool_id": "OktaProfileTool",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
conn = op.get_bind()
|
||||
|
||||
# Start transaction
|
||||
conn.execute(sa.text("BEGIN"))
|
||||
|
||||
try:
|
||||
# Get existing tools to check what already exists
|
||||
existing_tools = conn.execute(
|
||||
sa.text(
|
||||
"SELECT in_code_tool_id FROM tool WHERE in_code_tool_id IS NOT NULL"
|
||||
)
|
||||
).fetchall()
|
||||
existing_tool_ids = {row[0] for row in existing_tools}
|
||||
|
||||
# Insert or update built-in tools
|
||||
for tool in BUILT_IN_TOOLS:
|
||||
in_code_id = tool["in_code_tool_id"]
|
||||
|
||||
# Handle historical rename: InternetSearchTool -> WebSearchTool
|
||||
if (
|
||||
in_code_id == "WebSearchTool"
|
||||
and "WebSearchTool" not in existing_tool_ids
|
||||
and "InternetSearchTool" in existing_tool_ids
|
||||
):
|
||||
# Rename the existing InternetSearchTool row in place and update fields
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE tool
|
||||
SET name = :name,
|
||||
display_name = :display_name,
|
||||
description = :description,
|
||||
in_code_tool_id = :in_code_tool_id
|
||||
WHERE in_code_tool_id = 'InternetSearchTool'
|
||||
"""
|
||||
),
|
||||
tool,
|
||||
)
|
||||
# Keep the local view of existing ids in sync to avoid duplicate insert
|
||||
existing_tool_ids.discard("InternetSearchTool")
|
||||
existing_tool_ids.add("WebSearchTool")
|
||||
continue
|
||||
|
||||
if in_code_id in existing_tool_ids:
|
||||
# Update existing tool
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
UPDATE tool
|
||||
SET name = :name,
|
||||
display_name = :display_name,
|
||||
description = :description
|
||||
WHERE in_code_tool_id = :in_code_tool_id
|
||||
"""
|
||||
),
|
||||
tool,
|
||||
)
|
||||
else:
|
||||
# Insert new tool
|
||||
conn.execute(
|
||||
sa.text(
|
||||
"""
|
||||
INSERT INTO tool (name, display_name, description, in_code_tool_id)
|
||||
VALUES (:name, :display_name, :description, :in_code_tool_id)
|
||||
"""
|
||||
),
|
||||
tool,
|
||||
)
|
||||
|
||||
# Commit transaction
|
||||
conn.execute(sa.text("COMMIT"))
|
||||
|
||||
except Exception as e:
|
||||
# Rollback on error
|
||||
conn.execute(sa.text("ROLLBACK"))
|
||||
raise e
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# We don't remove the tools on downgrade since it's totally fine to just
|
||||
# have them around. If we upgrade again, it will be a no-op.
|
||||
pass
|
||||
@@ -1,57 +0,0 @@
|
||||
"""Update status length
|
||||
|
||||
Revision ID: d961aca62eb3
|
||||
Revises: cf90764725d8
|
||||
Create Date: 2025-03-23 16:10:05.683965
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "d961aca62eb3"
|
||||
down_revision = "cf90764725d8"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Drop the existing enum type constraint
|
||||
op.execute("ALTER TABLE connector_credential_pair ALTER COLUMN status TYPE varchar")
|
||||
|
||||
# Create new enum type with all values
|
||||
op.execute(
|
||||
"ALTER TABLE connector_credential_pair ALTER COLUMN status TYPE VARCHAR(20) USING status::varchar(20)"
|
||||
)
|
||||
|
||||
# Update the enum type to include all possible values
|
||||
op.alter_column(
|
||||
"connector_credential_pair",
|
||||
"status",
|
||||
type_=sa.Enum(
|
||||
"SCHEDULED",
|
||||
"INITIAL_INDEXING",
|
||||
"ACTIVE",
|
||||
"PAUSED",
|
||||
"DELETING",
|
||||
"INVALID",
|
||||
name="connectorcredentialpairstatus",
|
||||
native_enum=False,
|
||||
),
|
||||
existing_type=sa.String(20),
|
||||
nullable=False,
|
||||
)
|
||||
|
||||
op.add_column(
|
||||
"connector_credential_pair",
|
||||
sa.Column(
|
||||
"in_repeated_error_state", sa.Boolean, default=False, server_default="false"
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# no need to convert back to the old enum type, since we're not using it anymore
|
||||
op.drop_column("connector_credential_pair", "in_repeated_error_state")
|
||||
@@ -11,7 +11,7 @@ import sqlalchemy as sa
|
||||
import json
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.jira.utils import extract_jira_project
|
||||
from onyx.connectors.onyx_jira.utils import extract_jira_project
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
@@ -21,9 +21,6 @@ branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
PRESERVED_CONFIG_KEYS = ["comment_email_blacklist", "batch_size", "labels_to_skip"]
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Get all Jira connectors
|
||||
conn = op.get_bind()
|
||||
@@ -65,9 +62,6 @@ def upgrade() -> None:
|
||||
f"WARNING: Jira connector {connector_id} has no project URL configured"
|
||||
)
|
||||
continue
|
||||
for old_key in PRESERVED_CONFIG_KEYS:
|
||||
if old_key in old_config:
|
||||
new_config[old_key] = old_config[old_key]
|
||||
|
||||
# Update the connector config
|
||||
conn.execute(
|
||||
@@ -114,10 +108,6 @@ def downgrade() -> None:
|
||||
else:
|
||||
continue
|
||||
|
||||
for old_key in PRESERVED_CONFIG_KEYS:
|
||||
if old_key in new_config:
|
||||
old_config[old_key] = new_config[old_key]
|
||||
|
||||
# Update the connector config
|
||||
conn.execute(
|
||||
sa.text(
|
||||
@@ -127,5 +117,5 @@ def downgrade() -> None:
|
||||
WHERE id = :id
|
||||
"""
|
||||
),
|
||||
{"id": connector_id, "old_config": json.dumps(old_config)},
|
||||
{"id": connector_id, "old_config": old_config},
|
||||
)
|
||||
|
||||
@@ -10,19 +10,12 @@ from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy import table, column, String, Integer, Boolean
|
||||
|
||||
from onyx.configs.model_configs import ASYM_PASSAGE_PREFIX
|
||||
from onyx.configs.model_configs import ASYM_QUERY_PREFIX
|
||||
from onyx.configs.model_configs import DOC_EMBEDDING_DIM
|
||||
from onyx.configs.model_configs import DOCUMENT_ENCODER_MODEL
|
||||
from onyx.configs.model_configs import NORMALIZE_EMBEDDINGS
|
||||
from onyx.configs.model_configs import OLD_DEFAULT_DOCUMENT_ENCODER_MODEL
|
||||
from onyx.configs.model_configs import OLD_DEFAULT_MODEL_DOC_EMBEDDING_DIM
|
||||
from onyx.configs.model_configs import OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS
|
||||
from onyx.db.enums import EmbeddingPrecision
|
||||
from onyx.db.search_settings import (
|
||||
get_new_default_embedding_model,
|
||||
get_old_default_embedding_model,
|
||||
user_has_overridden_embedding_model,
|
||||
)
|
||||
from onyx.db.models import IndexModelStatus
|
||||
from onyx.db.search_settings import user_has_overridden_embedding_model
|
||||
from onyx.indexing.models import IndexingSetting
|
||||
from onyx.natural_language_processing.search_nlp_models import clean_model_name
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "dbaa756c2ccf"
|
||||
@@ -31,47 +24,6 @@ branch_labels: None = None
|
||||
depends_on: None = None
|
||||
|
||||
|
||||
def _get_old_default_embedding_model() -> IndexingSetting:
|
||||
is_overridden = user_has_overridden_embedding_model()
|
||||
return IndexingSetting(
|
||||
model_name=(
|
||||
DOCUMENT_ENCODER_MODEL
|
||||
if is_overridden
|
||||
else OLD_DEFAULT_DOCUMENT_ENCODER_MODEL
|
||||
),
|
||||
model_dim=(
|
||||
DOC_EMBEDDING_DIM if is_overridden else OLD_DEFAULT_MODEL_DOC_EMBEDDING_DIM
|
||||
),
|
||||
embedding_precision=(EmbeddingPrecision.FLOAT),
|
||||
normalize=(
|
||||
NORMALIZE_EMBEDDINGS
|
||||
if is_overridden
|
||||
else OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS
|
||||
),
|
||||
query_prefix=(ASYM_QUERY_PREFIX if is_overridden else ""),
|
||||
passage_prefix=(ASYM_PASSAGE_PREFIX if is_overridden else ""),
|
||||
index_name="danswer_chunk",
|
||||
multipass_indexing=False,
|
||||
enable_contextual_rag=False,
|
||||
api_url=None,
|
||||
)
|
||||
|
||||
|
||||
def _get_new_default_embedding_model() -> IndexingSetting:
|
||||
return IndexingSetting(
|
||||
model_name=DOCUMENT_ENCODER_MODEL,
|
||||
model_dim=DOC_EMBEDDING_DIM,
|
||||
embedding_precision=(EmbeddingPrecision.BFLOAT16),
|
||||
normalize=NORMALIZE_EMBEDDINGS,
|
||||
query_prefix=ASYM_QUERY_PREFIX,
|
||||
passage_prefix=ASYM_PASSAGE_PREFIX,
|
||||
index_name=f"danswer_chunk_{clean_model_name(DOCUMENT_ENCODER_MODEL)}",
|
||||
multipass_indexing=False,
|
||||
enable_contextual_rag=False,
|
||||
api_url=None,
|
||||
)
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"embedding_model",
|
||||
@@ -109,7 +61,7 @@ def upgrade() -> None:
|
||||
# the user selected via env variables before this change. This is needed since
|
||||
# all index_attempts must be associated with an embedding model, so without this
|
||||
# we will run into violations of non-null contraints
|
||||
old_embedding_model = _get_old_default_embedding_model()
|
||||
old_embedding_model = get_old_default_embedding_model()
|
||||
op.bulk_insert(
|
||||
EmbeddingModel,
|
||||
[
|
||||
@@ -127,7 +79,7 @@ def upgrade() -> None:
|
||||
# if the user has not overridden the default embedding model via env variables,
|
||||
# insert the new default model into the database to auto-upgrade them
|
||||
if not user_has_overridden_embedding_model():
|
||||
new_embedding_model = _get_new_default_embedding_model()
|
||||
new_embedding_model = get_new_default_embedding_model()
|
||||
op.bulk_insert(
|
||||
EmbeddingModel,
|
||||
[
|
||||
|
||||
@@ -18,13 +18,11 @@ depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.execute("DROP TABLE IF EXISTS document CASCADE")
|
||||
op.create_table(
|
||||
"document",
|
||||
sa.Column("id", sa.String(), nullable=False),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
op.execute("DROP TABLE IF EXISTS chunk CASCADE")
|
||||
op.create_table(
|
||||
"chunk",
|
||||
sa.Column("id", sa.String(), nullable=False),
|
||||
@@ -45,7 +43,6 @@ def upgrade() -> None:
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id", "document_store_type"),
|
||||
)
|
||||
op.execute("DROP TABLE IF EXISTS deletion_attempt CASCADE")
|
||||
op.create_table(
|
||||
"deletion_attempt",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
@@ -87,7 +84,6 @@ def upgrade() -> None:
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
op.execute("DROP TABLE IF EXISTS document_by_connector_credential_pair CASCADE")
|
||||
op.create_table(
|
||||
"document_by_connector_credential_pair",
|
||||
sa.Column("id", sa.String(), nullable=False),
|
||||
@@ -110,10 +106,7 @@ def upgrade() -> None:
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# upstream tables first
|
||||
op.drop_table("document_by_connector_credential_pair")
|
||||
op.drop_table("deletion_attempt")
|
||||
op.drop_table("chunk")
|
||||
|
||||
# Alembic op.drop_table() has no "cascade" flag – issue raw SQL
|
||||
op.execute("DROP TABLE IF EXISTS document CASCADE")
|
||||
op.drop_table("document")
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
"""add research_answer_purpose to chat_message
|
||||
|
||||
Revision ID: f8a9b2c3d4e5
|
||||
Revises: 5ae8240accb3
|
||||
Create Date: 2025-01-27 12:00:00.000000
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "f8a9b2c3d4e5"
|
||||
down_revision = "5ae8240accb3"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Add research_answer_purpose column to chat_message table
|
||||
op.add_column(
|
||||
"chat_message",
|
||||
sa.Column("research_answer_purpose", sa.String(), nullable=True),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Remove research_answer_purpose column from chat_message table
|
||||
op.drop_column("chat_message", "research_answer_purpose")
|
||||
@@ -1,69 +0,0 @@
|
||||
"""remove foreign key constraints from research_agent_iteration_sub_step
|
||||
|
||||
Revision ID: f9b8c7d6e5a4
|
||||
Revises: bd7c3bf8beba
|
||||
Create Date: 2025-01-27 12:00:00.000000
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "f9b8c7d6e5a4"
|
||||
down_revision = "bd7c3bf8beba"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Drop the existing foreign key constraint for parent_question_id
|
||||
op.drop_constraint(
|
||||
"research_agent_iteration_sub_step_parent_question_id_fkey",
|
||||
"research_agent_iteration_sub_step",
|
||||
type_="foreignkey",
|
||||
)
|
||||
|
||||
# Drop the parent_question_id column entirely
|
||||
op.drop_column("research_agent_iteration_sub_step", "parent_question_id")
|
||||
|
||||
# Drop the foreign key constraint for primary_question_id to chat_message.id
|
||||
# (keep the column as it's needed for the composite foreign key)
|
||||
op.drop_constraint(
|
||||
"research_agent_iteration_sub_step_primary_question_id_fkey",
|
||||
"research_agent_iteration_sub_step",
|
||||
type_="foreignkey",
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Restore the foreign key constraint for primary_question_id to chat_message.id
|
||||
op.create_foreign_key(
|
||||
"research_agent_iteration_sub_step_primary_question_id_fkey",
|
||||
"research_agent_iteration_sub_step",
|
||||
"chat_message",
|
||||
["primary_question_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
# Add back the parent_question_id column
|
||||
op.add_column(
|
||||
"research_agent_iteration_sub_step",
|
||||
sa.Column(
|
||||
"parent_question_id",
|
||||
sa.Integer(),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
|
||||
# Restore the foreign key constraint pointing to research_agent_iteration_sub_step.id
|
||||
op.create_foreign_key(
|
||||
"research_agent_iteration_sub_step_parent_question_id_fkey",
|
||||
"research_agent_iteration_sub_step",
|
||||
"research_agent_iteration_sub_step",
|
||||
["parent_question_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
@@ -8,7 +8,7 @@ from sqlalchemy.ext.asyncio import create_async_engine
|
||||
from sqlalchemy.schema import SchemaItem
|
||||
|
||||
from alembic import context
|
||||
from onyx.db.engine.sql_engine import build_connection_string
|
||||
from onyx.db.engine import build_connection_string
|
||||
from onyx.db.models import PublicBase
|
||||
|
||||
# this is the Alembic Config object, which provides
|
||||
|
||||
@@ -1,80 +0,0 @@
|
||||
"""add_db_readonly_user
|
||||
|
||||
Revision ID: 3b9f09038764
|
||||
Revises: 3b45e0018bf1
|
||||
Create Date: 2025-05-11 11:05:11.436977
|
||||
|
||||
"""
|
||||
|
||||
from sqlalchemy import text
|
||||
|
||||
from alembic import op
|
||||
from onyx.configs.app_configs import DB_READONLY_PASSWORD
|
||||
from onyx.configs.app_configs import DB_READONLY_USER
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "3b9f09038764"
|
||||
down_revision = "3b45e0018bf1"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
if MULTI_TENANT:
|
||||
|
||||
# Enable pg_trgm extension if not already enabled
|
||||
op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")
|
||||
|
||||
# Create read-only db user here only in multi-tenant mode. For single-tenant mode,
|
||||
# the user is created in the standard migration.
|
||||
if not (DB_READONLY_USER and DB_READONLY_PASSWORD):
|
||||
raise Exception("DB_READONLY_USER or DB_READONLY_PASSWORD is not set")
|
||||
|
||||
op.execute(
|
||||
text(
|
||||
f"""
|
||||
DO $$
|
||||
BEGIN
|
||||
-- Check if the read-only user already exists
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
|
||||
-- Create the read-only user with the specified password
|
||||
EXECUTE format('CREATE USER %I WITH PASSWORD %L', '{DB_READONLY_USER}', '{DB_READONLY_PASSWORD}');
|
||||
-- First revoke all privileges to ensure a clean slate
|
||||
EXECUTE format('REVOKE ALL ON DATABASE %I FROM %I', current_database(), '{DB_READONLY_USER}');
|
||||
-- Grant only the CONNECT privilege to allow the user to connect to the database
|
||||
-- but not perform any operations without additional specific grants
|
||||
EXECUTE format('GRANT CONNECT ON DATABASE %I TO %I', current_database(), '{DB_READONLY_USER}');
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
"""
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
if MULTI_TENANT:
|
||||
# Drop read-only db user here only in single tenant mode. For multi-tenant mode,
|
||||
# the user is dropped in the alembic_tenants migration.
|
||||
|
||||
op.execute(
|
||||
text(
|
||||
f"""
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
|
||||
-- First revoke all privileges from the database
|
||||
EXECUTE format('REVOKE ALL ON DATABASE %I FROM %I', current_database(), '{DB_READONLY_USER}');
|
||||
-- Then revoke all privileges from the public schema
|
||||
EXECUTE format('REVOKE ALL ON SCHEMA public FROM %I', '{DB_READONLY_USER}');
|
||||
-- Then drop the user
|
||||
EXECUTE format('DROP USER %I', '{DB_READONLY_USER}');
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
"""
|
||||
)
|
||||
)
|
||||
op.execute(text("DROP EXTENSION IF EXISTS pg_trgm"))
|
||||
@@ -1,10 +1,12 @@
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ee.onyx.db.external_perm import fetch_external_groups_for_user
|
||||
from ee.onyx.db.external_perm import fetch_public_external_group_ids
|
||||
from ee.onyx.db.user_group import fetch_user_groups_for_documents
|
||||
from ee.onyx.db.user_group import fetch_user_groups_for_user
|
||||
from ee.onyx.external_permissions.sync_params import get_source_perm_sync_config
|
||||
from ee.onyx.external_permissions.post_query_censoring import (
|
||||
DOC_SOURCE_TO_CHUNK_CENSORING_FUNCTION,
|
||||
)
|
||||
from ee.onyx.external_permissions.sync_params import DOC_PERMISSIONS_FUNC_MAP
|
||||
from onyx.access.access import (
|
||||
_get_access_for_documents as get_access_for_documents_without_groups,
|
||||
)
|
||||
@@ -15,10 +17,6 @@ from onyx.access.utils import prefix_user_group
|
||||
from onyx.db.document import get_document_sources
|
||||
from onyx.db.document import get_documents_by_ids
|
||||
from onyx.db.models import User
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def _get_access_for_document(
|
||||
@@ -65,21 +63,13 @@ def _get_access_for_documents(
|
||||
document_ids=document_ids,
|
||||
)
|
||||
|
||||
all_public_ext_u_group_ids = set(fetch_public_external_group_ids(db_session))
|
||||
|
||||
access_map = {}
|
||||
for document_id, non_ee_access in non_ee_access_dict.items():
|
||||
document = doc_id_map[document_id]
|
||||
source = doc_id_to_source_map.get(document_id)
|
||||
if source is None:
|
||||
logger.error(f"Document {document_id} has no source")
|
||||
continue
|
||||
|
||||
perm_sync_config = get_source_perm_sync_config(source)
|
||||
is_only_censored = (
|
||||
perm_sync_config
|
||||
and perm_sync_config.censoring_config is not None
|
||||
and perm_sync_config.doc_sync_config is None
|
||||
source in DOC_SOURCE_TO_CHUNK_CENSORING_FUNCTION
|
||||
and source not in DOC_PERMISSIONS_FUNC_MAP
|
||||
)
|
||||
|
||||
ext_u_emails = (
|
||||
@@ -99,10 +89,7 @@ def _get_access_for_documents(
|
||||
# If its censored, then it's public anywhere during the search and then permissions are
|
||||
# applied after the search
|
||||
is_public_anywhere = (
|
||||
document.is_public
|
||||
or non_ee_access.is_public
|
||||
or is_only_censored
|
||||
or any(u_group in all_public_ext_u_group_ids for u_group in ext_u_groups)
|
||||
document.is_public or non_ee_access.is_public or is_only_censored
|
||||
)
|
||||
|
||||
# To avoid collisions of group namings between connectors, they need to be prefixed
|
||||
|
||||
@@ -1,129 +0,0 @@
|
||||
import csv
|
||||
import io
|
||||
from datetime import datetime
|
||||
|
||||
from celery import shared_task
|
||||
from celery import Task
|
||||
|
||||
from ee.onyx.server.query_history.api import fetch_and_process_chat_session_history
|
||||
from ee.onyx.server.query_history.api import ONYX_ANONYMIZED_EMAIL
|
||||
from ee.onyx.server.query_history.models import QuestionAnswerPairSnapshot
|
||||
from onyx.background.celery.apps.heavy import celery_app
|
||||
from onyx.background.task_utils import construct_query_history_report_name
|
||||
from onyx.configs.app_configs import JOB_TIMEOUT
|
||||
from onyx.configs.app_configs import ONYX_QUERY_HISTORY_TYPE
|
||||
from onyx.configs.constants import FileOrigin
|
||||
from onyx.configs.constants import FileType
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.configs.constants import QueryHistoryType
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.db.tasks import delete_task_with_id
|
||||
from onyx.db.tasks import mark_task_as_finished_with_id
|
||||
from onyx.db.tasks import mark_task_as_started_with_id
|
||||
from onyx.file_store.file_store import get_default_file_store
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@shared_task(
|
||||
name=OnyxCeleryTask.EXPORT_QUERY_HISTORY_TASK,
|
||||
ignore_result=True,
|
||||
soft_time_limit=JOB_TIMEOUT,
|
||||
bind=True,
|
||||
trail=False,
|
||||
)
|
||||
def export_query_history_task(
|
||||
self: Task,
|
||||
*,
|
||||
start: datetime,
|
||||
end: datetime,
|
||||
start_time: datetime,
|
||||
# Need to include the tenant_id since the TenantAwareTask needs this
|
||||
tenant_id: str,
|
||||
) -> None:
|
||||
if not self.request.id:
|
||||
raise RuntimeError("No task id defined for this task; cannot identify it")
|
||||
|
||||
task_id = self.request.id
|
||||
stream = io.StringIO()
|
||||
writer = csv.DictWriter(
|
||||
stream,
|
||||
fieldnames=list(QuestionAnswerPairSnapshot.model_fields.keys()),
|
||||
)
|
||||
writer.writeheader()
|
||||
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
try:
|
||||
mark_task_as_started_with_id(
|
||||
db_session=db_session,
|
||||
task_id=task_id,
|
||||
)
|
||||
|
||||
snapshot_generator = fetch_and_process_chat_session_history(
|
||||
db_session=db_session,
|
||||
start=start,
|
||||
end=end,
|
||||
)
|
||||
|
||||
for snapshot in snapshot_generator:
|
||||
if ONYX_QUERY_HISTORY_TYPE == QueryHistoryType.ANONYMIZED:
|
||||
snapshot.user_email = ONYX_ANONYMIZED_EMAIL
|
||||
|
||||
writer.writerows(
|
||||
qa_pair.to_json()
|
||||
for qa_pair in QuestionAnswerPairSnapshot.from_chat_session_snapshot(
|
||||
snapshot
|
||||
)
|
||||
)
|
||||
|
||||
except Exception:
|
||||
logger.exception(f"Failed to export query history with {task_id=}")
|
||||
mark_task_as_finished_with_id(
|
||||
db_session=db_session,
|
||||
task_id=task_id,
|
||||
success=False,
|
||||
)
|
||||
raise
|
||||
|
||||
report_name = construct_query_history_report_name(task_id)
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
try:
|
||||
stream.seek(0)
|
||||
get_default_file_store().save_file(
|
||||
content=stream,
|
||||
display_name=report_name,
|
||||
file_origin=FileOrigin.QUERY_HISTORY_CSV,
|
||||
file_type=FileType.CSV,
|
||||
file_metadata={
|
||||
"start": start.isoformat(),
|
||||
"end": end.isoformat(),
|
||||
"start_time": start_time.isoformat(),
|
||||
},
|
||||
file_id=report_name,
|
||||
)
|
||||
|
||||
delete_task_with_id(
|
||||
db_session=db_session,
|
||||
task_id=task_id,
|
||||
)
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Failed to save query history export file; {report_name=}"
|
||||
)
|
||||
mark_task_as_finished_with_id(
|
||||
db_session=db_session,
|
||||
task_id=task_id,
|
||||
success=False,
|
||||
)
|
||||
raise
|
||||
|
||||
|
||||
celery_app.autodiscover_tasks(
|
||||
[
|
||||
"ee.onyx.background.celery.tasks.doc_permission_syncing",
|
||||
"ee.onyx.background.celery.tasks.external_group_syncing",
|
||||
"ee.onyx.background.celery.tasks.cleanup",
|
||||
]
|
||||
)
|
||||
@@ -1,8 +0,0 @@
|
||||
from onyx.background.celery.apps.light import celery_app
|
||||
|
||||
celery_app.autodiscover_tasks(
|
||||
[
|
||||
"ee.onyx.background.celery.tasks.doc_permission_syncing",
|
||||
"ee.onyx.background.celery.tasks.external_group_syncing",
|
||||
]
|
||||
)
|
||||
@@ -1,7 +0,0 @@
|
||||
from onyx.background.celery.apps.monitoring import celery_app
|
||||
|
||||
celery_app.autodiscover_tasks(
|
||||
[
|
||||
"ee.onyx.background.celery.tasks.tenant_provisioning",
|
||||
]
|
||||
)
|
||||
@@ -1,12 +1,81 @@
|
||||
from ee.onyx.background.celery_utils import should_perform_chat_ttl_check
|
||||
from ee.onyx.background.task_name_builders import name_chat_ttl_task
|
||||
from ee.onyx.server.reporting.usage_export_generation import create_new_usage_report
|
||||
from onyx.background.celery.apps.primary import celery_app
|
||||
from onyx.background.task_utils import build_celery_task_wrapper
|
||||
from onyx.configs.app_configs import JOB_TIMEOUT
|
||||
from onyx.db.chat import delete_chat_session
|
||||
from onyx.db.chat import get_chat_sessions_older_than
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.server.settings.store import load_settings
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
# mark as EE for all tasks in this file
|
||||
|
||||
|
||||
celery_app.autodiscover_tasks(
|
||||
[
|
||||
"ee.onyx.background.celery.tasks.doc_permission_syncing",
|
||||
"ee.onyx.background.celery.tasks.external_group_syncing",
|
||||
"ee.onyx.background.celery.tasks.cloud",
|
||||
"ee.onyx.background.celery.tasks.ttl_management",
|
||||
"ee.onyx.background.celery.tasks.usage_reporting",
|
||||
]
|
||||
@build_celery_task_wrapper(name_chat_ttl_task)
|
||||
@celery_app.task(soft_time_limit=JOB_TIMEOUT)
|
||||
def perform_ttl_management_task(retention_limit_days: int, *, tenant_id: str) -> None:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
old_chat_sessions = get_chat_sessions_older_than(
|
||||
retention_limit_days, db_session
|
||||
)
|
||||
|
||||
for user_id, session_id in old_chat_sessions:
|
||||
# one session per delete so that we don't blow up if a deletion fails.
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
try:
|
||||
delete_chat_session(
|
||||
user_id,
|
||||
session_id,
|
||||
db_session,
|
||||
include_deleted=True,
|
||||
hard_delete=True,
|
||||
)
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"delete_chat_session exceptioned. "
|
||||
f"user_id={user_id} session_id={session_id}"
|
||||
)
|
||||
|
||||
|
||||
#####
|
||||
# Periodic Tasks
|
||||
#####
|
||||
|
||||
|
||||
@celery_app.task(
|
||||
name="check_ttl_management_task",
|
||||
ignore_result=True,
|
||||
soft_time_limit=JOB_TIMEOUT,
|
||||
)
|
||||
def check_ttl_management_task(*, tenant_id: str) -> None:
|
||||
"""Runs periodically to check if any ttl tasks should be run and adds them
|
||||
to the queue"""
|
||||
|
||||
settings = load_settings()
|
||||
retention_limit_days = settings.maximum_chat_retention_days
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
if should_perform_chat_ttl_check(retention_limit_days, db_session):
|
||||
perform_ttl_management_task.apply_async(
|
||||
kwargs=dict(
|
||||
retention_limit_days=retention_limit_days, tenant_id=tenant_id
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@celery_app.task(
|
||||
name="autogenerate_usage_report_task",
|
||||
ignore_result=True,
|
||||
soft_time_limit=JOB_TIMEOUT,
|
||||
)
|
||||
def autogenerate_usage_report_task(*, tenant_id: str) -> None:
|
||||
"""This generates usage report under the /admin/generate-usage/report endpoint"""
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
create_new_usage_report(
|
||||
db_session=db_session,
|
||||
user_id=None,
|
||||
period=None,
|
||||
)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from datetime import timedelta
|
||||
from typing import Any
|
||||
|
||||
from ee.onyx.configs.app_configs import CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS
|
||||
from onyx.background.celery.tasks.beat_schedule import (
|
||||
beat_cloud_tasks as base_beat_system_tasks,
|
||||
)
|
||||
@@ -14,42 +13,34 @@ from onyx.background.celery.tasks.beat_schedule import (
|
||||
get_tasks_to_schedule as base_get_tasks_to_schedule,
|
||||
)
|
||||
from onyx.configs.constants import OnyxCeleryPriority
|
||||
from onyx.configs.constants import OnyxCeleryQueues
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
|
||||
ee_beat_system_tasks: list[dict] = []
|
||||
|
||||
ee_beat_task_templates: list[dict] = [
|
||||
{
|
||||
"name": "autogenerate-usage-report",
|
||||
"task": OnyxCeleryTask.GENERATE_USAGE_REPORT_TASK,
|
||||
"schedule": timedelta(days=30),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
ee_beat_task_templates: list[dict] = []
|
||||
ee_beat_task_templates.extend(
|
||||
[
|
||||
{
|
||||
"name": "autogenerate-usage-report",
|
||||
"task": OnyxCeleryTask.AUTOGENERATE_USAGE_REPORT_TASK,
|
||||
"schedule": timedelta(days=30),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "check-ttl-management",
|
||||
"task": OnyxCeleryTask.CHECK_TTL_MANAGEMENT_TASK,
|
||||
"schedule": timedelta(hours=CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
{
|
||||
"name": "check-ttl-management",
|
||||
"task": OnyxCeleryTask.CHECK_TTL_MANAGEMENT_TASK,
|
||||
"schedule": timedelta(hours=1),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "export-query-history-cleanup-task",
|
||||
"task": OnyxCeleryTask.EXPORT_QUERY_HISTORY_CLEANUP_TASK,
|
||||
"schedule": timedelta(hours=1),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
"queue": OnyxCeleryQueues.CSV_GENERATION,
|
||||
},
|
||||
},
|
||||
]
|
||||
]
|
||||
)
|
||||
|
||||
ee_tasks_to_schedule: list[dict] = []
|
||||
|
||||
@@ -57,7 +48,7 @@ if not MULTI_TENANT:
|
||||
ee_tasks_to_schedule = [
|
||||
{
|
||||
"name": "autogenerate-usage-report",
|
||||
"task": OnyxCeleryTask.GENERATE_USAGE_REPORT_TASK,
|
||||
"task": OnyxCeleryTask.AUTOGENERATE_USAGE_REPORT_TASK,
|
||||
"schedule": timedelta(days=30), # TODO: change this to config flag
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
@@ -67,20 +58,10 @@ if not MULTI_TENANT:
|
||||
{
|
||||
"name": "check-ttl-management",
|
||||
"task": OnyxCeleryTask.CHECK_TTL_MANAGEMENT_TASK,
|
||||
"schedule": timedelta(hours=CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "export-query-history-cleanup-task",
|
||||
"task": OnyxCeleryTask.EXPORT_QUERY_HISTORY_CLEANUP_TASK,
|
||||
"schedule": timedelta(hours=1),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
"queue": OnyxCeleryQueues.CSV_GENERATION,
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
|
||||
from celery import shared_task
|
||||
|
||||
from ee.onyx.db.query_history import get_all_query_history_export_tasks
|
||||
from onyx.configs.app_configs import JOB_TIMEOUT
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.db.engine.sql_engine import get_session_with_tenant
|
||||
from onyx.db.enums import TaskStatus
|
||||
from onyx.db.tasks import delete_task_with_id
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@shared_task(
|
||||
name=OnyxCeleryTask.EXPORT_QUERY_HISTORY_CLEANUP_TASK,
|
||||
ignore_result=True,
|
||||
soft_time_limit=JOB_TIMEOUT,
|
||||
)
|
||||
def export_query_history_cleanup_task(*, tenant_id: str) -> None:
|
||||
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
|
||||
tasks = get_all_query_history_export_tasks(db_session=db_session)
|
||||
|
||||
for task in tasks:
|
||||
if task.status == TaskStatus.SUCCESS:
|
||||
delete_task_with_id(db_session=db_session, task_id=task.task_id)
|
||||
elif task.status == TaskStatus.FAILURE:
|
||||
if task.start_time:
|
||||
deadline = task.start_time + timedelta(hours=24)
|
||||
now = datetime.now()
|
||||
if now < deadline:
|
||||
continue
|
||||
|
||||
logger.error(
|
||||
f"Task with {task.task_id=} failed; it is being deleted now"
|
||||
)
|
||||
delete_task_with_id(db_session=db_session, task_id=task.task_id)
|
||||
@@ -1,104 +0,0 @@
|
||||
import time
|
||||
|
||||
from celery import shared_task
|
||||
from celery import Task
|
||||
from celery.exceptions import SoftTimeLimitExceeded
|
||||
from redis.lock import Lock as RedisLock
|
||||
|
||||
from ee.onyx.server.tenants.product_gating import get_gated_tenants
|
||||
from onyx.background.celery.apps.app_base import task_logger
|
||||
from onyx.background.celery.tasks.beat_schedule import BEAT_EXPIRES_DEFAULT
|
||||
from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
|
||||
from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
|
||||
from onyx.configs.constants import OnyxCeleryPriority
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.db.engine.tenant_utils import get_all_tenant_ids
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.redis.redis_pool import redis_lock_dump
|
||||
from shared_configs.configs import IGNORED_SYNCING_TENANT_LIST
|
||||
|
||||
|
||||
@shared_task(
|
||||
name=OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
|
||||
ignore_result=True,
|
||||
trail=False,
|
||||
bind=True,
|
||||
)
|
||||
def cloud_beat_task_generator(
|
||||
self: Task,
|
||||
task_name: str,
|
||||
queue: str = OnyxCeleryTask.DEFAULT,
|
||||
priority: int = OnyxCeleryPriority.MEDIUM,
|
||||
expires: int = BEAT_EXPIRES_DEFAULT,
|
||||
) -> bool | None:
|
||||
"""a lightweight task used to kick off individual beat tasks per tenant."""
|
||||
time_start = time.monotonic()
|
||||
|
||||
redis_client = get_redis_client(tenant_id=ONYX_CLOUD_TENANT_ID)
|
||||
|
||||
lock_beat: RedisLock = redis_client.lock(
|
||||
f"{OnyxRedisLocks.CLOUD_BEAT_TASK_GENERATOR_LOCK}:{task_name}",
|
||||
timeout=CELERY_GENERIC_BEAT_LOCK_TIMEOUT,
|
||||
)
|
||||
|
||||
# these tasks should never overlap
|
||||
if not lock_beat.acquire(blocking=False):
|
||||
return None
|
||||
|
||||
last_lock_time = time.monotonic()
|
||||
tenant_ids: list[str] = []
|
||||
num_processed_tenants = 0
|
||||
|
||||
try:
|
||||
tenant_ids = get_all_tenant_ids()
|
||||
gated_tenants = get_gated_tenants()
|
||||
for tenant_id in tenant_ids:
|
||||
if tenant_id in gated_tenants:
|
||||
continue
|
||||
|
||||
current_time = time.monotonic()
|
||||
if current_time - last_lock_time >= (CELERY_GENERIC_BEAT_LOCK_TIMEOUT / 4):
|
||||
lock_beat.reacquire()
|
||||
last_lock_time = current_time
|
||||
|
||||
# needed in the cloud
|
||||
if IGNORED_SYNCING_TENANT_LIST and tenant_id in IGNORED_SYNCING_TENANT_LIST:
|
||||
continue
|
||||
|
||||
self.app.send_task(
|
||||
task_name,
|
||||
kwargs=dict(
|
||||
tenant_id=tenant_id,
|
||||
),
|
||||
queue=queue,
|
||||
priority=priority,
|
||||
expires=expires,
|
||||
ignore_result=True,
|
||||
)
|
||||
|
||||
num_processed_tenants += 1
|
||||
except SoftTimeLimitExceeded:
|
||||
task_logger.info(
|
||||
"Soft time limit exceeded, task is being terminated gracefully."
|
||||
)
|
||||
except Exception:
|
||||
task_logger.exception("Unexpected exception during cloud_beat_task_generator")
|
||||
finally:
|
||||
if not lock_beat.owned():
|
||||
task_logger.error(
|
||||
"cloud_beat_task_generator - Lock not owned on completion"
|
||||
)
|
||||
redis_lock_dump(lock_beat, redis_client)
|
||||
else:
|
||||
lock_beat.release()
|
||||
|
||||
time_elapsed = time.monotonic() - time_start
|
||||
task_logger.info(
|
||||
f"cloud_beat_task_generator finished: "
|
||||
f"task={task_name} "
|
||||
f"num_processed_tenants={num_processed_tenants} "
|
||||
f"num_tenants={len(tenant_ids)} "
|
||||
f"elapsed={time_elapsed:.2f}"
|
||||
)
|
||||
return True
|
||||
@@ -1,30 +0,0 @@
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ee.onyx.external_permissions.sync_params import (
|
||||
source_group_sync_is_cc_pair_agnostic,
|
||||
)
|
||||
from onyx.db.connector import mark_cc_pair_as_external_group_synced
|
||||
from onyx.db.connector_credential_pair import get_connector_credential_pairs_for_source
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
|
||||
|
||||
def _get_all_cc_pair_ids_to_mark_as_group_synced(
|
||||
db_session: Session, cc_pair: ConnectorCredentialPair
|
||||
) -> list[int]:
|
||||
if not source_group_sync_is_cc_pair_agnostic(cc_pair.connector.source):
|
||||
return [cc_pair.id]
|
||||
|
||||
cc_pairs = get_connector_credential_pairs_for_source(
|
||||
db_session, cc_pair.connector.source
|
||||
)
|
||||
return [cc_pair.id for cc_pair in cc_pairs]
|
||||
|
||||
|
||||
def mark_all_relevant_cc_pairs_as_external_group_synced(
|
||||
db_session: Session, cc_pair: ConnectorCredentialPair
|
||||
) -> None:
|
||||
"""For some source types, one successful group sync run should count for all
|
||||
cc pairs of that type. This function handles that case."""
|
||||
cc_pair_ids = _get_all_cc_pair_ids_to_mark_as_group_synced(db_session, cc_pair)
|
||||
for cc_pair_id in cc_pair_ids:
|
||||
mark_cc_pair_as_external_group_synced(db_session, cc_pair_id)
|
||||
@@ -1,106 +0,0 @@
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from uuid import UUID
|
||||
|
||||
from celery import shared_task
|
||||
from celery import Task
|
||||
|
||||
from ee.onyx.background.celery_utils import should_perform_chat_ttl_check
|
||||
from ee.onyx.background.task_name_builders import name_chat_ttl_task
|
||||
from onyx.configs.app_configs import JOB_TIMEOUT
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.db.chat import delete_chat_session
|
||||
from onyx.db.chat import get_chat_sessions_older_than
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.db.enums import TaskStatus
|
||||
from onyx.db.tasks import mark_task_as_finished_with_id
|
||||
from onyx.db.tasks import register_task
|
||||
from onyx.server.settings.store import load_settings
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@shared_task(
|
||||
name=OnyxCeleryTask.PERFORM_TTL_MANAGEMENT_TASK,
|
||||
ignore_result=True,
|
||||
soft_time_limit=JOB_TIMEOUT,
|
||||
bind=True,
|
||||
trail=False,
|
||||
)
|
||||
def perform_ttl_management_task(
|
||||
self: Task, retention_limit_days: int, *, tenant_id: str
|
||||
) -> None:
|
||||
task_id = self.request.id
|
||||
if not task_id:
|
||||
raise RuntimeError("No task id defined for this task; cannot identify it")
|
||||
|
||||
start_time = datetime.now(tz=timezone.utc)
|
||||
|
||||
user_id: UUID | None = None
|
||||
session_id: UUID | None = None
|
||||
try:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
# we generally want to move off this, but keeping for now
|
||||
register_task(
|
||||
db_session=db_session,
|
||||
task_name=name_chat_ttl_task(retention_limit_days, tenant_id),
|
||||
task_id=task_id,
|
||||
status=TaskStatus.STARTED,
|
||||
start_time=start_time,
|
||||
)
|
||||
|
||||
old_chat_sessions = get_chat_sessions_older_than(
|
||||
retention_limit_days, db_session
|
||||
)
|
||||
|
||||
for user_id, session_id in old_chat_sessions:
|
||||
# one session per delete so that we don't blow up if a deletion fails.
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
delete_chat_session(
|
||||
user_id,
|
||||
session_id,
|
||||
db_session,
|
||||
include_deleted=True,
|
||||
hard_delete=True,
|
||||
)
|
||||
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
mark_task_as_finished_with_id(
|
||||
db_session=db_session,
|
||||
task_id=task_id,
|
||||
success=True,
|
||||
)
|
||||
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"delete_chat_session exceptioned. "
|
||||
f"user_id={user_id} session_id={session_id}"
|
||||
)
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
mark_task_as_finished_with_id(
|
||||
db_session=db_session,
|
||||
task_id=task_id,
|
||||
success=False,
|
||||
)
|
||||
raise
|
||||
|
||||
|
||||
@shared_task(
|
||||
name=OnyxCeleryTask.CHECK_TTL_MANAGEMENT_TASK,
|
||||
ignore_result=True,
|
||||
soft_time_limit=JOB_TIMEOUT,
|
||||
)
|
||||
def check_ttl_management_task(*, tenant_id: str) -> None:
|
||||
"""Runs periodically to check if any ttl tasks should be run and adds them
|
||||
to the queue"""
|
||||
|
||||
settings = load_settings()
|
||||
retention_limit_days = settings.maximum_chat_retention_days
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
if should_perform_chat_ttl_check(retention_limit_days, db_session):
|
||||
perform_ttl_management_task.apply_async(
|
||||
kwargs=dict(
|
||||
retention_limit_days=retention_limit_days, tenant_id=tenant_id
|
||||
),
|
||||
)
|
||||
@@ -1,46 +0,0 @@
|
||||
from datetime import datetime
|
||||
from uuid import UUID
|
||||
|
||||
from celery import shared_task
|
||||
from celery import Task
|
||||
|
||||
from ee.onyx.server.reporting.usage_export_generation import create_new_usage_report
|
||||
from onyx.configs.app_configs import JOB_TIMEOUT
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@shared_task(
|
||||
name=OnyxCeleryTask.GENERATE_USAGE_REPORT_TASK,
|
||||
ignore_result=True,
|
||||
soft_time_limit=JOB_TIMEOUT,
|
||||
bind=True,
|
||||
trail=False,
|
||||
)
|
||||
def generate_usage_report_task(
|
||||
self: Task,
|
||||
*,
|
||||
tenant_id: str,
|
||||
user_id: str | None = None,
|
||||
period_from: str | None = None,
|
||||
period_to: str | None = None,
|
||||
) -> None:
|
||||
"""User-initiated usage report generation task"""
|
||||
# Parse period if provided
|
||||
period = None
|
||||
if period_from and period_to:
|
||||
period = (
|
||||
datetime.fromisoformat(period_from),
|
||||
datetime.fromisoformat(period_to),
|
||||
)
|
||||
|
||||
# Generate the report
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
create_new_usage_report(
|
||||
db_session=db_session,
|
||||
user_id=UUID(user_id) if user_id else None,
|
||||
period=period,
|
||||
)
|
||||
@@ -9,7 +9,7 @@ logger = setup_logger()
|
||||
|
||||
|
||||
def should_perform_chat_ttl_check(
|
||||
retention_limit_days: float | None, db_session: Session
|
||||
retention_limit_days: int | None, db_session: Session
|
||||
) -> bool:
|
||||
# TODO: make this a check for None and add behavior for 0 day TTL
|
||||
if not retention_limit_days:
|
||||
|
||||
@@ -1,16 +1,2 @@
|
||||
from datetime import datetime
|
||||
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
|
||||
|
||||
QUERY_HISTORY_TASK_NAME_PREFIX = OnyxCeleryTask.EXPORT_QUERY_HISTORY_TASK
|
||||
|
||||
|
||||
def name_chat_ttl_task(
|
||||
retention_limit_days: float, tenant_id: str | None = None
|
||||
) -> str:
|
||||
def name_chat_ttl_task(retention_limit_days: int, tenant_id: str | None = None) -> str:
|
||||
return f"chat_ttl_{retention_limit_days}_days"
|
||||
|
||||
|
||||
def query_history_task_name(start: datetime, end: datetime) -> str:
|
||||
return f"{QUERY_HISTORY_TASK_NAME_PREFIX}_{start}_{end}"
|
||||
|
||||
38
backend/ee/onyx/chat/process_message.py
Normal file
38
backend/ee/onyx/chat/process_message.py
Normal file
@@ -0,0 +1,38 @@
|
||||
from ee.onyx.server.query_and_chat.models import OneShotQAResponse
|
||||
from onyx.chat.models import AllCitations
|
||||
from onyx.chat.models import LLMRelevanceFilterResponse
|
||||
from onyx.chat.models import OnyxAnswerPiece
|
||||
from onyx.chat.models import QADocsResponse
|
||||
from onyx.chat.models import StreamingError
|
||||
from onyx.chat.process_message import ChatPacketStream
|
||||
from onyx.server.query_and_chat.models import ChatMessageDetail
|
||||
from onyx.utils.timing import log_function_time
|
||||
|
||||
|
||||
@log_function_time()
|
||||
def gather_stream_for_answer_api(
|
||||
packets: ChatPacketStream,
|
||||
) -> OneShotQAResponse:
|
||||
response = OneShotQAResponse()
|
||||
|
||||
answer = ""
|
||||
for packet in packets:
|
||||
if isinstance(packet, OnyxAnswerPiece) and packet.answer_piece:
|
||||
answer += packet.answer_piece
|
||||
elif isinstance(packet, QADocsResponse):
|
||||
response.docs = packet
|
||||
# Extraneous, provided for backwards compatibility
|
||||
response.rephrase = packet.rephrased_query
|
||||
elif isinstance(packet, StreamingError):
|
||||
response.error_msg = packet.error
|
||||
elif isinstance(packet, ChatMessageDetail):
|
||||
response.chat_message_id = packet.message_id
|
||||
elif isinstance(packet, LLMRelevanceFilterResponse):
|
||||
response.llm_selected_doc_indices = packet.llm_selected_doc_indices
|
||||
elif isinstance(packet, AllCitations):
|
||||
response.citations = packet.citations
|
||||
|
||||
if answer:
|
||||
response.answer = answer
|
||||
|
||||
return response
|
||||
@@ -25,25 +25,13 @@ SAML_CONF_DIR = os.environ.get("SAML_CONF_DIR") or "/app/ee/onyx/configs/saml_co
|
||||
#####
|
||||
# Auto Permission Sync
|
||||
#####
|
||||
# should generally only be used for sources that support polling of permissions
|
||||
# e.g. can pull in only permission changes rather than having to go through all
|
||||
# documents every time
|
||||
DEFAULT_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
os.environ.get("DEFAULT_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
|
||||
)
|
||||
|
||||
|
||||
#####
|
||||
# Confluence
|
||||
#####
|
||||
|
||||
# In seconds, default is 30 minutes
|
||||
# In seconds, default is 5 minutes
|
||||
CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY = int(
|
||||
os.environ.get("CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY") or 30 * 60
|
||||
)
|
||||
# In seconds, default is 30 minutes
|
||||
CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
os.environ.get("CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY") or 30 * 60
|
||||
os.environ.get("CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
|
||||
)
|
||||
# This is a boolean that determines if anonymous access is public
|
||||
# Default behavior is to not make the page public and instead add a group
|
||||
@@ -51,79 +39,14 @@ CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC = (
|
||||
os.environ.get("CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC", "").lower() == "true"
|
||||
)
|
||||
|
||||
|
||||
#####
|
||||
# JIRA
|
||||
#####
|
||||
|
||||
# In seconds, default is 30 minutes
|
||||
JIRA_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
os.environ.get("JIRA_PERMISSION_DOC_SYNC_FREQUENCY") or 30 * 60
|
||||
)
|
||||
|
||||
|
||||
#####
|
||||
# Google Drive
|
||||
#####
|
||||
GOOGLE_DRIVE_PERMISSION_GROUP_SYNC_FREQUENCY = int(
|
||||
os.environ.get("GOOGLE_DRIVE_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
|
||||
)
|
||||
|
||||
|
||||
#####
|
||||
# GitHub
|
||||
#####
|
||||
# In seconds, default is 5 minutes
|
||||
GITHUB_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
os.environ.get("GITHUB_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
|
||||
)
|
||||
# In seconds, default is 5 minutes
|
||||
GITHUB_PERMISSION_GROUP_SYNC_FREQUENCY = int(
|
||||
os.environ.get("GITHUB_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
|
||||
)
|
||||
|
||||
|
||||
#####
|
||||
# Slack
|
||||
#####
|
||||
SLACK_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
os.environ.get("SLACK_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
|
||||
CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
os.environ.get("CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
|
||||
)
|
||||
|
||||
NUM_PERMISSION_WORKERS = int(os.environ.get("NUM_PERMISSION_WORKERS") or 2)
|
||||
|
||||
|
||||
#####
|
||||
# Teams
|
||||
#####
|
||||
# In seconds, default is 5 minutes
|
||||
TEAMS_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
os.environ.get("TEAMS_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
|
||||
)
|
||||
|
||||
#####
|
||||
# SharePoint
|
||||
#####
|
||||
# In seconds, default is 30 minutes
|
||||
SHAREPOINT_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
os.environ.get("SHAREPOINT_PERMISSION_DOC_SYNC_FREQUENCY") or 30 * 60
|
||||
)
|
||||
|
||||
# In seconds, default is 5 minutes
|
||||
SHAREPOINT_PERMISSION_GROUP_SYNC_FREQUENCY = int(
|
||||
os.environ.get("SHAREPOINT_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
|
||||
)
|
||||
|
||||
|
||||
####
|
||||
# Celery Job Frequency
|
||||
####
|
||||
CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS = float(
|
||||
os.environ.get("CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS") or 1
|
||||
) # float for easier testing
|
||||
|
||||
|
||||
STRIPE_SECRET_KEY = os.environ.get("STRIPE_SECRET_KEY")
|
||||
STRIPE_PRICE_ID = os.environ.get("STRIPE_PRICE")
|
||||
|
||||
@@ -139,6 +62,29 @@ JWT_PUBLIC_KEY_URL: str | None = os.getenv("JWT_PUBLIC_KEY_URL", None)
|
||||
SUPER_USERS = json.loads(os.environ.get("SUPER_USERS", "[]"))
|
||||
SUPER_CLOUD_API_KEY = os.environ.get("SUPER_CLOUD_API_KEY", "api_key")
|
||||
|
||||
OAUTH_SLACK_CLIENT_ID = os.environ.get("OAUTH_SLACK_CLIENT_ID", "")
|
||||
OAUTH_SLACK_CLIENT_SECRET = os.environ.get("OAUTH_SLACK_CLIENT_SECRET", "")
|
||||
OAUTH_CONFLUENCE_CLOUD_CLIENT_ID = os.environ.get(
|
||||
"OAUTH_CONFLUENCE_CLOUD_CLIENT_ID", ""
|
||||
)
|
||||
OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET = os.environ.get(
|
||||
"OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET", ""
|
||||
)
|
||||
OAUTH_JIRA_CLOUD_CLIENT_ID = os.environ.get("OAUTH_JIRA_CLOUD_CLIENT_ID", "")
|
||||
OAUTH_JIRA_CLOUD_CLIENT_SECRET = os.environ.get("OAUTH_JIRA_CLOUD_CLIENT_SECRET", "")
|
||||
OAUTH_GOOGLE_DRIVE_CLIENT_ID = os.environ.get("OAUTH_GOOGLE_DRIVE_CLIENT_ID", "")
|
||||
OAUTH_GOOGLE_DRIVE_CLIENT_SECRET = os.environ.get(
|
||||
"OAUTH_GOOGLE_DRIVE_CLIENT_SECRET", ""
|
||||
)
|
||||
|
||||
GOOGLE_DRIVE_PERMISSION_GROUP_SYNC_FREQUENCY = int(
|
||||
os.environ.get("GOOGLE_DRIVE_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
|
||||
)
|
||||
|
||||
SLACK_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
os.environ.get("SLACK_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
|
||||
)
|
||||
|
||||
# The posthog client does not accept empty API keys or hosts however it fails silently
|
||||
# when the capture is called. These defaults prevent Posthog issues from breaking the Onyx app
|
||||
POSTHOG_API_KEY = os.environ.get("POSTHOG_API_KEY") or "FooBar"
|
||||
@@ -146,4 +92,6 @@ POSTHOG_HOST = os.environ.get("POSTHOG_HOST") or "https://us.i.posthog.com"
|
||||
|
||||
HUBSPOT_TRACKING_URL = os.environ.get("HUBSPOT_TRACKING_URL")
|
||||
|
||||
ANONYMOUS_USER_COOKIE_NAME = "onyx_anonymous_user"
|
||||
|
||||
GATED_TENANTS_KEY = "gated_tenants"
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
from onyx.connectors.confluence.connector import ConfluenceConnector
|
||||
from onyx.connectors.google_drive.connector import GoogleDriveConnector
|
||||
from onyx.connectors.interfaces import BaseConnector
|
||||
|
||||
|
||||
def validate_confluence_perm_sync(connector: ConfluenceConnector) -> None:
|
||||
"""
|
||||
Validate that the connector is configured correctly for permissions syncing.
|
||||
"""
|
||||
|
||||
|
||||
def validate_drive_perm_sync(connector: GoogleDriveConnector) -> None:
|
||||
"""
|
||||
Validate that the connector is configured correctly for permissions syncing.
|
||||
"""
|
||||
|
||||
|
||||
def validate_perm_sync(connector: BaseConnector) -> None:
|
||||
"""
|
||||
Override this if your connector needs to validate permissions syncing.
|
||||
Raise an exception if invalid, otherwise do nothing.
|
||||
|
||||
Default is a no-op (always successful).
|
||||
"""
|
||||
if isinstance(connector, ConfluenceConnector):
|
||||
validate_confluence_perm_sync(connector)
|
||||
elif isinstance(connector, GoogleDriveConnector):
|
||||
validate_drive_perm_sync(connector)
|
||||
@@ -140,7 +140,7 @@ def fetch_onyxbot_analytics(
|
||||
(
|
||||
or_(
|
||||
ChatMessageFeedback.is_positive.is_(False),
|
||||
ChatMessageFeedback.required_followup.is_(True),
|
||||
ChatMessageFeedback.required_followup,
|
||||
),
|
||||
1,
|
||||
),
|
||||
@@ -173,7 +173,7 @@ def fetch_onyxbot_analytics(
|
||||
.all()
|
||||
)
|
||||
|
||||
return [tuple(row) for row in results]
|
||||
return results
|
||||
|
||||
|
||||
def fetch_persona_message_analytics(
|
||||
|
||||
@@ -4,12 +4,10 @@ from uuid import UUID
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import delete
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import update
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.access.utils import build_ext_group_name_for_onyx
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.db.models import PublicExternalUserGroup
|
||||
from onyx.db.models import User
|
||||
from onyx.db.models import User__ExternalUserGroupId
|
||||
from onyx.db.users import batch_add_ext_perm_user_if_not_exists
|
||||
@@ -22,12 +20,6 @@ logger = setup_logger()
|
||||
class ExternalUserGroup(BaseModel):
|
||||
id: str
|
||||
user_emails: list[str]
|
||||
# `True` for cases like a Folder in Google Drive that give domain-wide
|
||||
# or "Anyone with link" access to all files in the folder.
|
||||
# if this is set, `user_emails` don't really matter.
|
||||
# When this is `True`, this `ExternalUserGroup` object doesn't really represent
|
||||
# an actual "group" in the source.
|
||||
gives_anyone_access: bool = False
|
||||
|
||||
|
||||
def delete_user__ext_group_for_user__no_commit(
|
||||
@@ -52,52 +44,20 @@ def delete_user__ext_group_for_cc_pair__no_commit(
|
||||
)
|
||||
|
||||
|
||||
def delete_public_external_group_for_cc_pair__no_commit(
|
||||
def replace_user__ext_group_for_cc_pair(
|
||||
db_session: Session,
|
||||
cc_pair_id: int,
|
||||
) -> None:
|
||||
db_session.execute(
|
||||
delete(PublicExternalUserGroup).where(
|
||||
PublicExternalUserGroup.cc_pair_id == cc_pair_id
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def mark_old_external_groups_as_stale(
|
||||
db_session: Session,
|
||||
cc_pair_id: int,
|
||||
) -> None:
|
||||
db_session.execute(
|
||||
update(User__ExternalUserGroupId)
|
||||
.where(User__ExternalUserGroupId.cc_pair_id == cc_pair_id)
|
||||
.values(stale=True)
|
||||
)
|
||||
db_session.execute(
|
||||
update(PublicExternalUserGroup)
|
||||
.where(PublicExternalUserGroup.cc_pair_id == cc_pair_id)
|
||||
.values(stale=True)
|
||||
)
|
||||
|
||||
|
||||
def upsert_external_groups(
|
||||
db_session: Session,
|
||||
cc_pair_id: int,
|
||||
external_groups: list[ExternalUserGroup],
|
||||
group_defs: list[ExternalUserGroup],
|
||||
source: DocumentSource,
|
||||
) -> None:
|
||||
"""
|
||||
Performs a true upsert operation for external user groups:
|
||||
- For existing groups (same user_id, external_user_group_id, cc_pair_id), updates the stale flag to False
|
||||
- For new groups, inserts them with stale=False
|
||||
- For public groups, uses upsert logic as well
|
||||
This function clears all existing external user group relations for a given cc_pair_id
|
||||
and replaces them with the new group definitions and commits the changes.
|
||||
"""
|
||||
# If there are no groups to add, return early
|
||||
if not external_groups:
|
||||
return
|
||||
|
||||
# collect all emails from all groups to batch add all users at once for efficiency
|
||||
all_group_member_emails = set()
|
||||
for external_group in external_groups:
|
||||
for external_group in group_defs:
|
||||
for user_email in external_group.user_emails:
|
||||
all_group_member_emails.add(user_email)
|
||||
|
||||
@@ -108,17 +68,17 @@ def upsert_external_groups(
|
||||
emails=list(all_group_member_emails),
|
||||
)
|
||||
|
||||
delete_user__ext_group_for_cc_pair__no_commit(
|
||||
db_session=db_session,
|
||||
cc_pair_id=cc_pair_id,
|
||||
)
|
||||
|
||||
# map emails to ids
|
||||
email_id_map = {user.email.lower(): user.id for user in all_group_members}
|
||||
email_id_map = {user.email: user.id for user in all_group_members}
|
||||
|
||||
# Process each external group
|
||||
for external_group in external_groups:
|
||||
external_group_id = build_ext_group_name_for_onyx(
|
||||
ext_group_name=external_group.id,
|
||||
source=source,
|
||||
)
|
||||
|
||||
# Handle user-group mappings
|
||||
# use these ids to create new external user group relations relating group_id to user_ids
|
||||
new_external_permissions = []
|
||||
for external_group in group_defs:
|
||||
for user_email in external_group.user_emails:
|
||||
user_id = email_id_map.get(user_email.lower())
|
||||
if user_id is None:
|
||||
@@ -127,71 +87,19 @@ def upsert_external_groups(
|
||||
f" with email {user_email} not found"
|
||||
)
|
||||
continue
|
||||
|
||||
# Check if the user-group mapping already exists
|
||||
existing_user_group = db_session.scalar(
|
||||
select(User__ExternalUserGroupId).where(
|
||||
User__ExternalUserGroupId.user_id == user_id,
|
||||
User__ExternalUserGroupId.external_user_group_id
|
||||
== external_group_id,
|
||||
User__ExternalUserGroupId.cc_pair_id == cc_pair_id,
|
||||
)
|
||||
external_group_id = build_ext_group_name_for_onyx(
|
||||
ext_group_name=external_group.id,
|
||||
source=source,
|
||||
)
|
||||
|
||||
if existing_user_group:
|
||||
# Update existing record
|
||||
existing_user_group.stale = False
|
||||
else:
|
||||
# Insert new record
|
||||
new_user_group = User__ExternalUserGroupId(
|
||||
new_external_permissions.append(
|
||||
User__ExternalUserGroupId(
|
||||
user_id=user_id,
|
||||
external_user_group_id=external_group_id,
|
||||
cc_pair_id=cc_pair_id,
|
||||
stale=False,
|
||||
)
|
||||
db_session.add(new_user_group)
|
||||
|
||||
# Handle public group if needed
|
||||
if external_group.gives_anyone_access:
|
||||
# Check if the public group already exists
|
||||
existing_public_group = db_session.scalar(
|
||||
select(PublicExternalUserGroup).where(
|
||||
PublicExternalUserGroup.external_user_group_id == external_group_id,
|
||||
PublicExternalUserGroup.cc_pair_id == cc_pair_id,
|
||||
)
|
||||
)
|
||||
|
||||
if existing_public_group:
|
||||
# Update existing record
|
||||
existing_public_group.stale = False
|
||||
else:
|
||||
# Insert new record
|
||||
new_public_group = PublicExternalUserGroup(
|
||||
external_user_group_id=external_group_id,
|
||||
cc_pair_id=cc_pair_id,
|
||||
stale=False,
|
||||
)
|
||||
db_session.add(new_public_group)
|
||||
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def remove_stale_external_groups(
|
||||
db_session: Session,
|
||||
cc_pair_id: int,
|
||||
) -> None:
|
||||
db_session.execute(
|
||||
delete(User__ExternalUserGroupId).where(
|
||||
User__ExternalUserGroupId.cc_pair_id == cc_pair_id,
|
||||
User__ExternalUserGroupId.stale.is_(True),
|
||||
)
|
||||
)
|
||||
db_session.execute(
|
||||
delete(PublicExternalUserGroup).where(
|
||||
PublicExternalUserGroup.cc_pair_id == cc_pair_id,
|
||||
PublicExternalUserGroup.stale.is_(True),
|
||||
)
|
||||
)
|
||||
db_session.add_all(new_external_permissions)
|
||||
db_session.commit()
|
||||
|
||||
|
||||
@@ -222,11 +130,3 @@ def fetch_external_groups_for_user_email_and_group_ids(
|
||||
)
|
||||
).all()
|
||||
return list(user_ext_groups)
|
||||
|
||||
|
||||
def fetch_public_external_group_ids(
|
||||
db_session: Session,
|
||||
) -> list[str]:
|
||||
return list(
|
||||
db_session.scalars(select(PublicExternalUserGroup.external_user_group_id)).all()
|
||||
)
|
||||
|
||||
@@ -11,7 +11,6 @@ from onyx.server.features.persona.models import PersonaSharedNotificationData
|
||||
|
||||
def make_persona_private(
|
||||
persona_id: int,
|
||||
creator_user_id: UUID | None,
|
||||
user_ids: list[UUID] | None,
|
||||
group_ids: list[int] | None,
|
||||
db_session: Session,
|
||||
@@ -30,15 +29,15 @@ def make_persona_private(
|
||||
user_ids_set = set(user_ids)
|
||||
for user_id in user_ids_set:
|
||||
db_session.add(Persona__User(persona_id=persona_id, user_id=user_id))
|
||||
if user_id != creator_user_id:
|
||||
create_notification(
|
||||
user_id=user_id,
|
||||
notif_type=NotificationType.PERSONA_SHARED,
|
||||
db_session=db_session,
|
||||
additional_data=PersonaSharedNotificationData(
|
||||
persona_id=persona_id,
|
||||
).model_dump(),
|
||||
)
|
||||
|
||||
create_notification(
|
||||
user_id=user_id,
|
||||
notif_type=NotificationType.PERSONA_SHARED,
|
||||
db_session=db_session,
|
||||
additional_data=PersonaSharedNotificationData(
|
||||
persona_id=persona_id,
|
||||
).model_dump(),
|
||||
)
|
||||
|
||||
if group_ids:
|
||||
group_ids_set = set(group_ids)
|
||||
|
||||
@@ -15,13 +15,10 @@ from sqlalchemy.sql import select
|
||||
from sqlalchemy.sql.expression import literal
|
||||
from sqlalchemy.sql.expression import UnaryExpression
|
||||
|
||||
from ee.onyx.background.task_name_builders import QUERY_HISTORY_TASK_NAME_PREFIX
|
||||
from onyx.configs.constants import QAFeedbackType
|
||||
from onyx.db.models import ChatMessage
|
||||
from onyx.db.models import ChatMessageFeedback
|
||||
from onyx.db.models import ChatSession
|
||||
from onyx.db.models import TaskQueueState
|
||||
from onyx.db.tasks import get_all_tasks_with_prefix
|
||||
|
||||
|
||||
def _build_filter_conditions(
|
||||
@@ -174,9 +171,3 @@ def fetch_chat_sessions_eagerly_by_time(
|
||||
chat_sessions = query.all()
|
||||
|
||||
return chat_sessions
|
||||
|
||||
|
||||
def get_all_query_history_export_tasks(
|
||||
db_session: Session,
|
||||
) -> list[TaskQueueState]:
|
||||
return get_all_tasks_with_prefix(db_session, QUERY_HISTORY_TASK_NAME_PREFIX)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user