mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-02-28 21:25:44 +00:00
Compare commits
307 Commits
helm-reado
...
feat/proje
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2db102a745 | ||
|
|
56e4174a10 | ||
|
|
d7dd3a23a8 | ||
|
|
8446c6956e | ||
|
|
420002309e | ||
|
|
e40300cef4 | ||
|
|
712a86ae82 | ||
|
|
bb34971149 | ||
|
|
8b0fd5eb6a | ||
|
|
b692c97812 | ||
|
|
8dc0c1d25d | ||
|
|
f4754c865f | ||
|
|
bf66840e85 | ||
|
|
d0a338a761 | ||
|
|
1cb18b4c2a | ||
|
|
0ad817f339 | ||
|
|
9aa01daf05 | ||
|
|
5333a135b7 | ||
|
|
b86137131d | ||
|
|
68f9149570 | ||
|
|
fa7fdb5034 | ||
|
|
15d90bcc7a | ||
|
|
e0794d9aa1 | ||
|
|
d669e20d07 | ||
|
|
3de811a61f | ||
|
|
f619579bc9 | ||
|
|
7e6e3b4c1d | ||
|
|
de51aabb8e | ||
|
|
90357a2705 | ||
|
|
6369c2f3d8 | ||
|
|
138c3db5ac | ||
|
|
16c2ef2852 | ||
|
|
224a70eea9 | ||
|
|
c457982120 | ||
|
|
0649748da2 | ||
|
|
ddceddaa28 | ||
|
|
c6733a5026 | ||
|
|
7db744a5de | ||
|
|
cd2a8b0def | ||
|
|
f15bc26cd6 | ||
|
|
65f35f0293 | ||
|
|
4e3e608249 | ||
|
|
719a092a12 | ||
|
|
6a8fde7eb1 | ||
|
|
4fdd0812a0 | ||
|
|
4913dc1e85 | ||
|
|
4a43a9642e | ||
|
|
cc48a0c38e | ||
|
|
01ccfd2df7 | ||
|
|
36d75786ee | ||
|
|
f9bc38ba65 | ||
|
|
3da283221d | ||
|
|
90568d3bbb | ||
|
|
7955ca938c | ||
|
|
f5d357eb28 | ||
|
|
d83f616214 | ||
|
|
275c1bec3d | ||
|
|
7d1ef912e8 | ||
|
|
2fe1d4c373 | ||
|
|
2396ad309e | ||
|
|
0b13ef963a | ||
|
|
83073f3ded | ||
|
|
439a27a775 | ||
|
|
91773a4789 | ||
|
|
185beca648 | ||
|
|
2dc564c8df | ||
|
|
b259f53972 | ||
|
|
f8beb08e2f | ||
|
|
83c88c7cf6 | ||
|
|
2372dd40e0 | ||
|
|
5cb6bafe81 | ||
|
|
a0309b31c7 | ||
|
|
0fd268dba7 | ||
|
|
f345da7487 | ||
|
|
f2dacf03f1 | ||
|
|
e0fef50cf0 | ||
|
|
6ba3eeefa5 | ||
|
|
aa158abaa9 | ||
|
|
255c2af1d6 | ||
|
|
9ece3b0310 | ||
|
|
9e3aca03a7 | ||
|
|
dbd5d4d8f1 | ||
|
|
cdb97c3ce4 | ||
|
|
f30ced31a9 | ||
|
|
6cc6c43234 | ||
|
|
224d934cf4 | ||
|
|
8ecdc61ad3 | ||
|
|
08161db7ea | ||
|
|
b139764631 | ||
|
|
2b23dbde8d | ||
|
|
2dec009d63 | ||
|
|
91eadae353 | ||
|
|
8bff616e27 | ||
|
|
2c049e170f | ||
|
|
23e6d7ef3c | ||
|
|
ed81e75edd | ||
|
|
de22fc3a58 | ||
|
|
009b7f60f1 | ||
|
|
9d997e20df | ||
|
|
e6423c4541 | ||
|
|
cb969ad06a | ||
|
|
c4076d16b6 | ||
|
|
04a607a718 | ||
|
|
c1e1aa9dfd | ||
|
|
1ed7abae6e | ||
|
|
cf4855822b | ||
|
|
e242b1319c | ||
|
|
eba4b6620e | ||
|
|
3534515e11 | ||
|
|
5602ff8666 | ||
|
|
2fc70781b4 | ||
|
|
f76b4dec4c | ||
|
|
a5a516fa8a | ||
|
|
811a198134 | ||
|
|
5867ab1d7d | ||
|
|
dd6653eb1f | ||
|
|
db457ef432 | ||
|
|
de7fe939b2 | ||
|
|
38114d9542 | ||
|
|
32f20f2e2e | ||
|
|
3dd27099f7 | ||
|
|
91c4d43a80 | ||
|
|
a63ba1bb03 | ||
|
|
7b6189e74c | ||
|
|
ba423e5773 | ||
|
|
fe029eccae | ||
|
|
ea72af7698 | ||
|
|
17abf85533 | ||
|
|
3bd162acb9 | ||
|
|
664ce441eb | ||
|
|
6863fbee54 | ||
|
|
bb98088b80 | ||
|
|
ce8cb1112a | ||
|
|
a605bd4ca4 | ||
|
|
0e8b5af619 | ||
|
|
46f3af4f68 | ||
|
|
2af64ebf4c | ||
|
|
0eb1824158 | ||
|
|
e0a9a6fb66 | ||
|
|
fe194076c2 | ||
|
|
55dc24fd27 | ||
|
|
da02962a67 | ||
|
|
9bc62cc803 | ||
|
|
bf6705a9a5 | ||
|
|
df2fef3383 | ||
|
|
8cec3448d7 | ||
|
|
b81687995e | ||
|
|
87c2253451 | ||
|
|
297c2957b4 | ||
|
|
bacee0d09d | ||
|
|
297720c132 | ||
|
|
bd4bd00cef | ||
|
|
07c482f727 | ||
|
|
cf193dee29 | ||
|
|
1b47fa2700 | ||
|
|
e1a305d18a | ||
|
|
e2233d22c9 | ||
|
|
20d1175312 | ||
|
|
7117774287 | ||
|
|
77f2660bb2 | ||
|
|
1b2f4f3b87 | ||
|
|
d85b55a9d2 | ||
|
|
e2bae5a2d9 | ||
|
|
cc9c76c4fb | ||
|
|
258e08abcd | ||
|
|
67047e42a7 | ||
|
|
146628e734 | ||
|
|
c1d4b08132 | ||
|
|
f3f47d0709 | ||
|
|
fe26a1bfcc | ||
|
|
554cd0f891 | ||
|
|
f87d3e9849 | ||
|
|
72cdada893 | ||
|
|
c442ebaff6 | ||
|
|
56f16d107e | ||
|
|
0157ae099a | ||
|
|
565fb42457 | ||
|
|
a50a8b4a12 | ||
|
|
4baf4e7d96 | ||
|
|
8b7ab2eb66 | ||
|
|
1f75f3633e | ||
|
|
650884d76a | ||
|
|
8722bdb414 | ||
|
|
71037678c3 | ||
|
|
68de1015e1 | ||
|
|
e2b3a6e144 | ||
|
|
4f04b09efa | ||
|
|
5c4f44d258 | ||
|
|
19652ad60e | ||
|
|
70c96b6ab3 | ||
|
|
65076b916f | ||
|
|
06bc0e51db | ||
|
|
508b456b40 | ||
|
|
bf1e2a2661 | ||
|
|
991d5e4203 | ||
|
|
d21f012b04 | ||
|
|
86b7beab01 | ||
|
|
b4eaa81d8b | ||
|
|
ff2a4c8723 | ||
|
|
51027fd259 | ||
|
|
7e3fd2b12a | ||
|
|
d2fef6f0b7 | ||
|
|
bd06147d26 | ||
|
|
1f3cc9ed6e | ||
|
|
6086d9e51a | ||
|
|
e0de24f64e | ||
|
|
08b6b1f8b3 | ||
|
|
afed1a4b37 | ||
|
|
bca18cacdf | ||
|
|
335db91803 | ||
|
|
67c488ff1f | ||
|
|
deb7f13962 | ||
|
|
e2d3d65c60 | ||
|
|
b78a6834f5 | ||
|
|
4abe90aa2c | ||
|
|
de9568844b | ||
|
|
34268f9806 | ||
|
|
ed75678837 | ||
|
|
3bb58a3dd3 | ||
|
|
4b02feef31 | ||
|
|
6a4d49f02e | ||
|
|
d1736187d3 | ||
|
|
0e79b96091 | ||
|
|
ae302d473d | ||
|
|
feca4fda78 | ||
|
|
f7ed7cd3cd | ||
|
|
8377ab3ef2 | ||
|
|
95c23bf870 | ||
|
|
e49fb8f56d | ||
|
|
adf48de652 | ||
|
|
bca2500438 | ||
|
|
89f925662f | ||
|
|
b64c6d5d40 | ||
|
|
36c63950a6 | ||
|
|
3f31340e6f | ||
|
|
6ac2258c2e | ||
|
|
b4d3b43e8a | ||
|
|
ca281b71e3 | ||
|
|
9bd5a1de7a | ||
|
|
d3c5a4fba0 | ||
|
|
f50006ee63 | ||
|
|
e0092024af | ||
|
|
675ef524b0 | ||
|
|
240367c775 | ||
|
|
f0ed063860 | ||
|
|
bcf0ef0c87 | ||
|
|
0c7a245a46 | ||
|
|
583d82433a | ||
|
|
391e710b6e | ||
|
|
004e56a91b | ||
|
|
103300798f | ||
|
|
8349d6f0ea | ||
|
|
cd63bf6da9 | ||
|
|
5f03e85195 | ||
|
|
cbdbfcab5e | ||
|
|
6918611287 | ||
|
|
b0639add8f | ||
|
|
7af10308d7 | ||
|
|
5e14f23507 | ||
|
|
0bf3a5c609 | ||
|
|
82724826ce | ||
|
|
f9e061926a | ||
|
|
8afd07ff7a | ||
|
|
6523a38255 | ||
|
|
264878a1c9 | ||
|
|
e480946f8a | ||
|
|
be25b1efbd | ||
|
|
204493439b | ||
|
|
106c685afb | ||
|
|
809122fec3 | ||
|
|
c8741d8e9c | ||
|
|
885f01e6a7 | ||
|
|
3180a13cf1 | ||
|
|
630ac31355 | ||
|
|
80de62f47d | ||
|
|
c75d42aa99 | ||
|
|
e1766bca55 | ||
|
|
211102f5f0 | ||
|
|
c46cc4666f | ||
|
|
0b2536b82b | ||
|
|
600a86f11d | ||
|
|
4d97a03935 | ||
|
|
5d7169f244 | ||
|
|
df9329009c | ||
|
|
e74a0398dc | ||
|
|
94c5822cb7 | ||
|
|
dedac55098 | ||
|
|
2bbab5cefe | ||
|
|
4bef718fad | ||
|
|
e7376e9dc2 | ||
|
|
8d5136fe8b | ||
|
|
3272050975 | ||
|
|
1960714042 | ||
|
|
5bddb2632e | ||
|
|
5cd055dab8 | ||
|
|
fa32b7f21e | ||
|
|
37f7227000 | ||
|
|
c1f9a9d122 | ||
|
|
045b7cc7e2 | ||
|
|
970e07a93b | ||
|
|
d463a3f213 | ||
|
|
4ba44c5e48 | ||
|
|
6f8176092e | ||
|
|
198ec417ba | ||
|
|
fbdf7798cf | ||
|
|
7bd9c856aa | ||
|
|
948c719d73 |
2
.github/CODEOWNERS
vendored
2
.github/CODEOWNERS
vendored
@@ -1 +1,3 @@
|
||||
* @onyx-dot-app/onyx-core-team
|
||||
# Helm charts Owners
|
||||
/helm/ @justin-tahara
|
||||
|
||||
49
.github/workflows/helm-chart-releases.yml
vendored
Normal file
49
.github/workflows/helm-chart-releases.yml
vendored
Normal file
@@ -0,0 +1,49 @@
|
||||
name: Release Onyx Helm Charts
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
permissions: write-all
|
||||
|
||||
jobs:
|
||||
release:
|
||||
permissions:
|
||||
contents: write
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Install Helm CLI
|
||||
uses: azure/setup-helm@v4
|
||||
with:
|
||||
version: v3.12.1
|
||||
|
||||
- name: Add required Helm repositories
|
||||
run: |
|
||||
helm repo add bitnami https://charts.bitnami.com/bitnami
|
||||
helm repo add onyx-vespa https://onyx-dot-app.github.io/vespa-helm-charts
|
||||
helm repo update
|
||||
|
||||
- name: Build chart dependencies
|
||||
run: |
|
||||
set -euo pipefail
|
||||
for chart_dir in deployment/helm/charts/*; do
|
||||
if [ -f "$chart_dir/Chart.yaml" ]; then
|
||||
echo "Building dependencies for $chart_dir"
|
||||
helm dependency build "$chart_dir"
|
||||
fi
|
||||
done
|
||||
|
||||
- name: Publish Helm charts to gh-pages
|
||||
uses: stefanprodan/helm-gh-pages@v1.7.0
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
charts_dir: deployment/helm/charts
|
||||
branch: gh-pages
|
||||
commit_username: ${{ github.actor }}
|
||||
commit_email: ${{ github.actor }}@users.noreply.github.com
|
||||
94
.github/workflows/pr-external-dependency-unit-tests.yml
vendored
Normal file
94
.github/workflows/pr-external-dependency-unit-tests.yml
vendored
Normal file
@@ -0,0 +1,94 @@
|
||||
name: External Dependency Unit Tests
|
||||
|
||||
on:
|
||||
merge_group:
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
env:
|
||||
# AWS
|
||||
S3_AWS_ACCESS_KEY_ID: ${{ secrets.S3_AWS_ACCESS_KEY_ID }}
|
||||
S3_AWS_SECRET_ACCESS_KEY: ${{ secrets.S3_AWS_SECRET_ACCESS_KEY }}
|
||||
|
||||
# MinIO
|
||||
S3_ENDPOINT_URL: "http://localhost:9004"
|
||||
|
||||
# Confluence
|
||||
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
|
||||
CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
|
||||
CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
|
||||
CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
|
||||
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
|
||||
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
|
||||
|
||||
jobs:
|
||||
discover-test-dirs:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
test-dirs: ${{ steps.set-matrix.outputs.test-dirs }}
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Discover test directories
|
||||
id: set-matrix
|
||||
run: |
|
||||
# Find all subdirectories in backend/tests/external_dependency_unit
|
||||
dirs=$(find backend/tests/external_dependency_unit -mindepth 1 -maxdepth 1 -type d -exec basename {} \; | sort | jq -R -s -c 'split("\n")[:-1]')
|
||||
echo "test-dirs=$dirs" >> $GITHUB_OUTPUT
|
||||
|
||||
external-dependency-unit-tests:
|
||||
needs: discover-test-dirs
|
||||
# See https://runs-on.com/runners/linux/
|
||||
runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
test-dir: ${{ fromJson(needs.discover-test-dirs.outputs.test-dirs) }}
|
||||
|
||||
env:
|
||||
PYTHONPATH: ./backend
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.11"
|
||||
cache: "pip"
|
||||
cache-dependency-path: |
|
||||
backend/requirements/default.txt
|
||||
backend/requirements/dev.txt
|
||||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
|
||||
pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
|
||||
playwright install chromium
|
||||
playwright install-deps chromium
|
||||
|
||||
- name: Set up Standard Dependencies
|
||||
run: |
|
||||
cd deployment/docker_compose
|
||||
docker compose -f docker-compose.dev.yml -p onyx-stack up -d minio relational_db cache index
|
||||
|
||||
- name: Run migrations
|
||||
run: |
|
||||
cd backend
|
||||
alembic upgrade head
|
||||
|
||||
- name: Run Tests for ${{ matrix.test-dir }}
|
||||
shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
|
||||
run: |
|
||||
py.test \
|
||||
-n 8 \
|
||||
--dist loadfile \
|
||||
--durations=8 \
|
||||
-o junit_family=xunit2 \
|
||||
-xv \
|
||||
--ff \
|
||||
backend/tests/external_dependency_unit/${{ matrix.test-dir }}
|
||||
20
.github/workflows/pr-helm-chart-testing.yml
vendored
20
.github/workflows/pr-helm-chart-testing.yml
vendored
@@ -55,7 +55,25 @@ jobs:
|
||||
|
||||
- name: Run chart-testing (install)
|
||||
if: steps.list-changed.outputs.changed == 'true'
|
||||
run: ct install --all --helm-extra-set-args="--set=nginx.enabled=false" --debug --config ct.yaml
|
||||
run: ct install --all \
|
||||
--helm-extra-set-args="\
|
||||
--set=nginx.enabled=false \
|
||||
--set=postgresql.enabled=false \
|
||||
--set=redis.enabled=false \
|
||||
--set=minio.enabled=false \
|
||||
--set=vespa.enabled=false \
|
||||
--set=slackbot.enabled=false \
|
||||
--set=api.replicaCount=0 \
|
||||
--set=inferenceCapability.replicaCount=0 \
|
||||
--set=indexCapability.replicaCount=0 \
|
||||
--set=celery_beat.replicaCount=0 \
|
||||
--set=celery_worker_heavy.replicaCount=0 \
|
||||
--set=celery_worker_docprocessing.replicaCount=0 \
|
||||
--set=celery_worker_light.replicaCount=0 \
|
||||
--set=celery_worker_monitoring.replicaCount=0 \
|
||||
--set=celery_worker_primary.replicaCount=0 \
|
||||
--set=celery_worker_user_files_indexing.replicaCount=0" \
|
||||
--debug --config ct.yaml
|
||||
# the following would install only changed charts, but we only have one chart so
|
||||
# don't worry about that for now
|
||||
# run: ct install --target-branch ${{ github.event.repository.default_branch }}
|
||||
|
||||
20
.github/workflows/pr-integration-tests.yml
vendored
20
.github/workflows/pr-integration-tests.yml
vendored
@@ -16,6 +16,13 @@ env:
|
||||
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
|
||||
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
|
||||
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
|
||||
JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
|
||||
JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
|
||||
JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
|
||||
PERM_SYNC_SHAREPOINT_CLIENT_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_CLIENT_ID }}
|
||||
PERM_SYNC_SHAREPOINT_PRIVATE_KEY: ${{ secrets.PERM_SYNC_SHAREPOINT_PRIVATE_KEY }}
|
||||
PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD: ${{ secrets.PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD }}
|
||||
PERM_SYNC_SHAREPOINT_DIRECTORY_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_DIRECTORY_ID }}
|
||||
PLATFORM_PAIR: linux-amd64
|
||||
|
||||
jobs:
|
||||
@@ -63,7 +70,9 @@ jobs:
|
||||
-i /local/openapi.json \
|
||||
-g python \
|
||||
-o /local/onyx_openapi_client \
|
||||
--package-name onyx_openapi_client
|
||||
--package-name onyx_openapi_client \
|
||||
--skip-validate-spec \
|
||||
--openapi-normalizer "SIMPLIFY_ONEOF_ANYOF=true,SET_OAS3_NULLABLE=true"
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
@@ -245,6 +254,8 @@ jobs:
|
||||
-p mock-it-services-stack up -d
|
||||
|
||||
# NOTE: Use pre-ping/null to reduce flakiness due to dropped connections
|
||||
# NOTE: `-e ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true` should be added once
|
||||
# enterprise tests are fixed
|
||||
- name: Run Standard Integration Tests
|
||||
run: |
|
||||
echo "Running integration tests..."
|
||||
@@ -266,6 +277,13 @@ jobs:
|
||||
-e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
|
||||
-e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
|
||||
-e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
|
||||
-e JIRA_BASE_URL=${JIRA_BASE_URL} \
|
||||
-e JIRA_USER_EMAIL=${JIRA_USER_EMAIL} \
|
||||
-e JIRA_API_TOKEN=${JIRA_API_TOKEN} \
|
||||
-e PERM_SYNC_SHAREPOINT_CLIENT_ID=${PERM_SYNC_SHAREPOINT_CLIENT_ID} \
|
||||
-e PERM_SYNC_SHAREPOINT_PRIVATE_KEY="${PERM_SYNC_SHAREPOINT_PRIVATE_KEY}" \
|
||||
-e PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD=${PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD} \
|
||||
-e PERM_SYNC_SHAREPOINT_DIRECTORY_ID=${PERM_SYNC_SHAREPOINT_DIRECTORY_ID} \
|
||||
-e TEST_WEB_HOSTNAME=test-runner \
|
||||
-e MOCK_CONNECTOR_SERVER_HOST=mock_connector_server \
|
||||
-e MOCK_CONNECTOR_SERVER_PORT=8001 \
|
||||
|
||||
38
.github/workflows/pr-labeler.yml
vendored
Normal file
38
.github/workflows/pr-labeler.yml
vendored
Normal file
@@ -0,0 +1,38 @@
|
||||
name: PR Labeler
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
branches:
|
||||
- main
|
||||
types:
|
||||
- opened
|
||||
- reopened
|
||||
- synchronize
|
||||
- edited
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
|
||||
jobs:
|
||||
validate_pr_title:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check PR title for Conventional Commits
|
||||
env:
|
||||
PR_TITLE: ${{ github.event.pull_request.title }}
|
||||
run: |
|
||||
echo "PR Title: $PR_TITLE"
|
||||
if [[ ! "$PR_TITLE" =~ ^(feat|fix|docs|test|ci|refactor|perf|chore|revert|build)(\(.+\))?:\ .+ ]]; then
|
||||
echo "::error::❌ Your PR title does not follow the Conventional Commits format.
|
||||
This check ensures that all pull requests use clear, consistent titles that help automate changelogs and improve project history.
|
||||
|
||||
Please update your PR title to follow the Conventional Commits style.
|
||||
Here is a link to a blog explaining the reason why we've included the Conventional Commits style into our PR titles: https://xfuture-blog.com/working-with-conventional-commits
|
||||
|
||||
**Here are some examples of valid PR titles:**
|
||||
- feat: add user authentication
|
||||
- fix(login): handle null password error
|
||||
- docs(readme): update installation instructions"
|
||||
exit 1
|
||||
fi
|
||||
18
.github/workflows/pr-mit-integration-tests.yml
vendored
18
.github/workflows/pr-mit-integration-tests.yml
vendored
@@ -16,6 +16,13 @@ env:
|
||||
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
|
||||
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
|
||||
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
|
||||
JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
|
||||
JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
|
||||
JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
|
||||
PERM_SYNC_SHAREPOINT_CLIENT_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_CLIENT_ID }}
|
||||
PERM_SYNC_SHAREPOINT_PRIVATE_KEY: ${{ secrets.PERM_SYNC_SHAREPOINT_PRIVATE_KEY }}
|
||||
PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD: ${{ secrets.PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD }}
|
||||
PERM_SYNC_SHAREPOINT_DIRECTORY_ID: ${{ secrets.PERM_SYNC_SHAREPOINT_DIRECTORY_ID }}
|
||||
PLATFORM_PAIR: linux-amd64
|
||||
jobs:
|
||||
integration-tests-mit:
|
||||
@@ -60,7 +67,9 @@ jobs:
|
||||
-i /local/openapi.json \
|
||||
-g python \
|
||||
-o /local/onyx_openapi_client \
|
||||
--package-name onyx_openapi_client
|
||||
--package-name onyx_openapi_client \
|
||||
--skip-validate-spec \
|
||||
--openapi-normalizer "SIMPLIFY_ONEOF_ANYOF=true,SET_OAS3_NULLABLE=true"
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
@@ -201,6 +210,13 @@ jobs:
|
||||
-e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
|
||||
-e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
|
||||
-e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
|
||||
-e JIRA_BASE_URL=${JIRA_BASE_URL} \
|
||||
-e JIRA_USER_EMAIL=${JIRA_USER_EMAIL} \
|
||||
-e JIRA_API_TOKEN=${JIRA_API_TOKEN} \
|
||||
-e PERM_SYNC_SHAREPOINT_CLIENT_ID=${PERM_SYNC_SHAREPOINT_CLIENT_ID} \
|
||||
-e PERM_SYNC_SHAREPOINT_PRIVATE_KEY="${PERM_SYNC_SHAREPOINT_PRIVATE_KEY}" \
|
||||
-e PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD=${PERM_SYNC_SHAREPOINT_CERTIFICATE_PASSWORD} \
|
||||
-e PERM_SYNC_SHAREPOINT_DIRECTORY_ID=${PERM_SYNC_SHAREPOINT_DIRECTORY_ID} \
|
||||
-e TEST_WEB_HOSTNAME=test-runner \
|
||||
-e MOCK_CONNECTOR_SERVER_HOST=mock_connector_server \
|
||||
-e MOCK_CONNECTOR_SERVER_PORT=8001 \
|
||||
|
||||
7
.github/workflows/pr-playwright-tests.yml
vendored
7
.github/workflows/pr-playwright-tests.yml
vendored
@@ -10,7 +10,6 @@ env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
GEN_AI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
MOCK_LLM_RESPONSE: true
|
||||
PYTEST_PLAYWRIGHT_SKIP_INITIAL_RESET: true
|
||||
|
||||
jobs:
|
||||
playwright-tests:
|
||||
@@ -160,12 +159,6 @@ jobs:
|
||||
done
|
||||
echo "Finished waiting for service."
|
||||
|
||||
- name: Run pytest playwright test init
|
||||
working-directory: ./backend
|
||||
env:
|
||||
PYTEST_IGNORE_SKIP: true
|
||||
run: pytest -s tests/integration/tests/playwright/test_playwright.py
|
||||
|
||||
- name: Run Playwright tests
|
||||
working-directory: ./web
|
||||
run: npx playwright test
|
||||
|
||||
4
.github/workflows/pr-python-checks.yml
vendored
4
.github/workflows/pr-python-checks.yml
vendored
@@ -47,7 +47,9 @@ jobs:
|
||||
-i /local/openapi.json \
|
||||
-g python \
|
||||
-o /local/onyx_openapi_client \
|
||||
--package-name onyx_openapi_client
|
||||
--package-name onyx_openapi_client \
|
||||
--skip-validate-spec \
|
||||
--openapi-normalizer "SIMPLIFY_ONEOF_ANYOF=true,SET_OAS3_NULLABLE=true"
|
||||
|
||||
- name: Run MyPy
|
||||
run: |
|
||||
|
||||
12
.github/workflows/pr-python-connector-tests.yml
vendored
12
.github/workflows/pr-python-connector-tests.yml
vendored
@@ -16,12 +16,13 @@ env:
|
||||
# Confluence
|
||||
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
|
||||
CONFLUENCE_TEST_SPACE: ${{ secrets.CONFLUENCE_TEST_SPACE }}
|
||||
CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
|
||||
CONFLUENCE_TEST_PAGE_ID: ${{ secrets.CONFLUENCE_TEST_PAGE_ID }}
|
||||
CONFLUENCE_IS_CLOUD: ${{ secrets.CONFLUENCE_IS_CLOUD }}
|
||||
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
|
||||
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
|
||||
|
||||
# Jira
|
||||
JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }}
|
||||
JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
|
||||
JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
|
||||
|
||||
@@ -49,6 +50,15 @@ env:
|
||||
SF_PASSWORD: ${{ secrets.SF_PASSWORD }}
|
||||
SF_SECURITY_TOKEN: ${{ secrets.SF_SECURITY_TOKEN }}
|
||||
|
||||
# Hubspot
|
||||
HUBSPOT_ACCESS_TOKEN: ${{ secrets.HUBSPOT_ACCESS_TOKEN }}
|
||||
|
||||
# IMAP
|
||||
IMAP_HOST: ${{ secrets.IMAP_HOST }}
|
||||
IMAP_USERNAME: ${{ secrets.IMAP_USERNAME }}
|
||||
IMAP_PASSWORD: ${{ secrets.IMAP_PASSWORD }}
|
||||
IMAP_MAILBOXES: ${{ secrets.IMAP_MAILBOXES }}
|
||||
|
||||
# Airtable
|
||||
AIRTABLE_TEST_BASE_ID: ${{ secrets.AIRTABLE_TEST_BASE_ID }}
|
||||
AIRTABLE_TEST_TABLE_ID: ${{ secrets.AIRTABLE_TEST_TABLE_ID }}
|
||||
|
||||
12
.gitignore
vendored
12
.gitignore
vendored
@@ -17,12 +17,24 @@ backend/tests/regression/answer_quality/test_data.json
|
||||
backend/tests/regression/search_quality/eval-*
|
||||
backend/tests/regression/search_quality/search_eval_config.yaml
|
||||
backend/tests/regression/search_quality/*.json
|
||||
*.log
|
||||
|
||||
# secret files
|
||||
.env
|
||||
jira_test_env
|
||||
settings.json
|
||||
|
||||
# others
|
||||
/deployment/data/nginx/app.conf
|
||||
*.sw?
|
||||
/backend/tests/regression/answer_quality/search_test_config.yaml
|
||||
|
||||
# Local .terraform directories
|
||||
**/.terraform/*
|
||||
|
||||
# Local .tfstate files
|
||||
*.tfstate
|
||||
*.tfstate.*
|
||||
|
||||
# Local .terraform.lock.hcl file
|
||||
.terraform.lock.hcl
|
||||
|
||||
22
.vscode/env_template.txt
vendored
22
.vscode/env_template.txt
vendored
@@ -23,6 +23,9 @@ DISABLE_LLM_DOC_RELEVANCE=False
|
||||
# Useful if you want to toggle auth on/off (google_oauth/OIDC specifically)
|
||||
OAUTH_CLIENT_ID=<REPLACE THIS>
|
||||
OAUTH_CLIENT_SECRET=<REPLACE THIS>
|
||||
OPENID_CONFIG_URL=<REPLACE THIS>
|
||||
SAML_CONF_DIR=/<ABSOLUTE PATH TO ONYX>/onyx/backend/ee/onyx/configs/saml_config
|
||||
|
||||
# Generally not useful for dev, we don't generally want to set up an SMTP server for dev
|
||||
REQUIRE_EMAIL_VERIFICATION=False
|
||||
|
||||
@@ -45,8 +48,8 @@ PYTHONPATH=../backend
|
||||
PYTHONUNBUFFERED=1
|
||||
|
||||
|
||||
# Internet Search
|
||||
BING_API_KEY=<REPLACE THIS>
|
||||
# Internet Search
|
||||
EXA_API_KEY=<REPLACE THIS>
|
||||
|
||||
|
||||
# Enable the full set of Danswer Enterprise Edition features
|
||||
@@ -58,3 +61,18 @@ AGENT_RETRIEVAL_STATS=False # Note: This setting will incur substantial re-ran
|
||||
AGENT_RERANKING_STATS=True
|
||||
AGENT_MAX_QUERY_RETRIEVAL_RESULTS=20
|
||||
AGENT_RERANKING_MAX_QUERY_RETRIEVAL_RESULTS=20
|
||||
|
||||
# S3 File Store Configuration (MinIO for local development)
|
||||
S3_ENDPOINT_URL=http://localhost:9004
|
||||
S3_FILE_STORE_BUCKET_NAME=onyx-file-store-bucket
|
||||
S3_AWS_ACCESS_KEY_ID=minioadmin
|
||||
S3_AWS_SECRET_ACCESS_KEY=minioadmin
|
||||
|
||||
# Show extra/uncommon connectors
|
||||
SHOW_EXTRA_CONNECTORS=True
|
||||
|
||||
# Local langsmith tracing
|
||||
LANGSMITH_TRACING="true"
|
||||
LANGSMITH_ENDPOINT="https://api.smith.langchain.com"
|
||||
LANGSMITH_API_KEY=<REPLACE_THIS>
|
||||
LANGSMITH_PROJECT=<REPLACE_THIS>
|
||||
852
.vscode/launch.template.jsonc
vendored
852
.vscode/launch.template.jsonc
vendored
@@ -1,464 +1,496 @@
|
||||
/* Copy this file into '.vscode/launch.json' or merge its contents into your existing configurations. */
|
||||
|
||||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"compounds": [
|
||||
{
|
||||
// Dummy entry used to label the group
|
||||
"name": "--- Compound ---",
|
||||
"configurations": ["--- Individual ---"],
|
||||
"presentation": {
|
||||
"group": "1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Run All Onyx Services",
|
||||
"configurations": [
|
||||
"Web Server",
|
||||
"Model Server",
|
||||
"API Server",
|
||||
"Slack Bot",
|
||||
"Celery primary",
|
||||
"Celery light",
|
||||
"Celery heavy",
|
||||
"Celery indexing",
|
||||
"Celery user files indexing",
|
||||
"Celery beat",
|
||||
"Celery monitoring"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Web / Model / API",
|
||||
"configurations": ["Web Server", "Model Server", "API Server"],
|
||||
"presentation": {
|
||||
"group": "1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Celery (all)",
|
||||
"configurations": [
|
||||
"Celery primary",
|
||||
"Celery light",
|
||||
"Celery heavy",
|
||||
"Celery indexing",
|
||||
"Celery user files indexing",
|
||||
"Celery beat",
|
||||
"Celery monitoring"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "1"
|
||||
}
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"compounds": [
|
||||
{
|
||||
// Dummy entry used to label the group
|
||||
"name": "--- Compound ---",
|
||||
"configurations": ["--- Individual ---"],
|
||||
"presentation": {
|
||||
"group": "1"
|
||||
}
|
||||
],
|
||||
"configurations": [
|
||||
{
|
||||
// Dummy entry used to label the group
|
||||
"name": "--- Individual ---",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"presentation": {
|
||||
"group": "2",
|
||||
"order": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Run All Onyx Services",
|
||||
"configurations": [
|
||||
"Web Server",
|
||||
"Model Server",
|
||||
"API Server",
|
||||
"Slack Bot",
|
||||
"Celery primary",
|
||||
"Celery light",
|
||||
"Celery heavy",
|
||||
"Celery docfetching",
|
||||
"Celery docprocessing",
|
||||
"Celery beat",
|
||||
"Celery monitoring",
|
||||
"Celery user file processing"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Web / Model / API",
|
||||
"configurations": ["Web Server", "Model Server", "API Server"],
|
||||
"presentation": {
|
||||
"group": "1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Celery (all)",
|
||||
"configurations": [
|
||||
"Celery primary",
|
||||
"Celery light",
|
||||
"Celery heavy",
|
||||
"Celery docfetching",
|
||||
"Celery docprocessing",
|
||||
"Celery beat",
|
||||
"Celery monitoring",
|
||||
"Celery user file processing"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "1"
|
||||
},
|
||||
"stopAll": true
|
||||
}
|
||||
],
|
||||
"configurations": [
|
||||
|
||||
{
|
||||
// Dummy entry used to label the group
|
||||
"name": "--- Individual ---",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"presentation": {
|
||||
"group": "2",
|
||||
"order": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Web Server",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"cwd": "${workspaceRoot}/web",
|
||||
"runtimeExecutable": "npm",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"runtimeArgs": ["run", "dev"],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
{
|
||||
"name": "Web Server",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"cwd": "${workspaceRoot}/web",
|
||||
"runtimeExecutable": "npm",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"runtimeArgs": ["run", "dev"],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"console": "integratedTerminal",
|
||||
"consoleTitle": "Web Server Console"
|
||||
"console": "integratedTerminal",
|
||||
"consoleTitle": "Web Server Console"
|
||||
},
|
||||
{
|
||||
"name": "Model Server",
|
||||
"consoleName": "Model Server",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "uvicorn",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1"
|
||||
},
|
||||
{
|
||||
"name": "Model Server",
|
||||
"consoleName": "Model Server",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "uvicorn",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1"
|
||||
},
|
||||
"args": ["model_server.main:app", "--reload", "--port", "9000"],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Model Server Console"
|
||||
"args": ["model_server.main:app", "--reload", "--port", "9000"],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
{
|
||||
"name": "API Server",
|
||||
"consoleName": "API Server",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "uvicorn",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_DANSWER_MODEL_INTERACTIONS": "True",
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1"
|
||||
},
|
||||
"args": ["onyx.main:app", "--reload", "--port", "8080"],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "API Server Console"
|
||||
"consoleTitle": "Model Server Console"
|
||||
},
|
||||
{
|
||||
"name": "API Server",
|
||||
"consoleName": "API Server",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "uvicorn",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_DANSWER_MODEL_INTERACTIONS": "True",
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1"
|
||||
},
|
||||
// For the listener to access the Slack API,
|
||||
// DANSWER_BOT_SLACK_APP_TOKEN & DANSWER_BOT_SLACK_BOT_TOKEN need to be set in .env file located in the root of the project
|
||||
{
|
||||
"name": "Slack Bot",
|
||||
"consoleName": "Slack Bot",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "onyx/onyxbot/slack/listener.py",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"args": ["onyx.main:app", "--reload", "--port", "8080"],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "API Server Console"
|
||||
},
|
||||
// For the listener to access the Slack API,
|
||||
// DANSWER_BOT_SLACK_APP_TOKEN & DANSWER_BOT_SLACK_BOT_TOKEN need to be set in .env file located in the root of the project
|
||||
{
|
||||
"name": "Slack Bot",
|
||||
"consoleName": "Slack Bot",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "onyx/onyxbot/slack/listener.py",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Slack Bot Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery primary",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "INFO",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.primary",
|
||||
"worker",
|
||||
"--pool=threads",
|
||||
"--concurrency=4",
|
||||
"--prefetch-multiplier=1",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=primary@%n",
|
||||
"-Q",
|
||||
"celery"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery primary Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery light",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "INFO",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.light",
|
||||
"worker",
|
||||
"--pool=threads",
|
||||
"--concurrency=64",
|
||||
"--prefetch-multiplier=8",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=light@%n",
|
||||
"-Q",
|
||||
"vespa_metadata_sync,connector_deletion,doc_permissions_upsert,index_attempt_cleanup"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery light Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery heavy",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "INFO",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.heavy",
|
||||
"worker",
|
||||
"--pool=threads",
|
||||
"--concurrency=4",
|
||||
"--prefetch-multiplier=1",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=heavy@%n",
|
||||
"-Q",
|
||||
"connector_pruning,connector_doc_permissions_sync,connector_external_group_sync"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery heavy Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery docfetching",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Slack Bot Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery primary",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "INFO",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.primary",
|
||||
"onyx.background.celery.versioned_apps.docfetching",
|
||||
"worker",
|
||||
"--pool=threads",
|
||||
"--concurrency=4",
|
||||
"--concurrency=1",
|
||||
"--prefetch-multiplier=1",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=primary@%n",
|
||||
"--hostname=docfetching@%n",
|
||||
"-Q",
|
||||
"celery"
|
||||
],
|
||||
"presentation": {
|
||||
"connector_doc_fetching,user_files_indexing"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery primary Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery light",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "INFO",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.light",
|
||||
"worker",
|
||||
"--pool=threads",
|
||||
"--concurrency=64",
|
||||
"--prefetch-multiplier=8",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=light@%n",
|
||||
"-Q",
|
||||
"vespa_metadata_sync,connector_deletion,doc_permissions_upsert"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery light Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery heavy",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "INFO",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.heavy",
|
||||
"worker",
|
||||
"--pool=threads",
|
||||
"--concurrency=4",
|
||||
"--prefetch-multiplier=1",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=heavy@%n",
|
||||
"-Q",
|
||||
"connector_pruning,connector_doc_permissions_sync,connector_external_group_sync"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery heavy Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery indexing",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"consoleTitle": "Celery docfetching Console",
|
||||
"justMyCode": false
|
||||
},
|
||||
{
|
||||
"name": "Celery docprocessing",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"ENABLE_MULTIPASS_INDEXING": "false",
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
},
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.indexing",
|
||||
"onyx.background.celery.versioned_apps.docprocessing",
|
||||
"worker",
|
||||
"--pool=threads",
|
||||
"--concurrency=1",
|
||||
"--concurrency=6",
|
||||
"--prefetch-multiplier=1",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=indexing@%n",
|
||||
"--hostname=docprocessing@%n",
|
||||
"-Q",
|
||||
"connector_indexing"
|
||||
],
|
||||
"presentation": {
|
||||
"docprocessing"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery indexing Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery monitoring",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {},
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.monitoring",
|
||||
"worker",
|
||||
"--pool=solo",
|
||||
"--concurrency=1",
|
||||
"--prefetch-multiplier=1",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=monitoring@%n",
|
||||
"-Q",
|
||||
"monitoring"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery monitoring Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery beat",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.beat",
|
||||
"beat",
|
||||
"--loglevel=INFO"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery beat Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery user files indexing",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.indexing",
|
||||
"worker",
|
||||
"--pool=threads",
|
||||
"--concurrency=1",
|
||||
"--prefetch-multiplier=1",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=user_files_indexing@%n",
|
||||
"-Q",
|
||||
"user_files_indexing"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery user files indexing Console"
|
||||
},
|
||||
{
|
||||
"name": "Pytest",
|
||||
"consoleName": "Pytest",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "pytest",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"-v"
|
||||
// Specify a specific module/test to run or provide nothing to run all tests
|
||||
//"tests/unit/onyx/llm/answering/test_prune_and_merge.py"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Pytest Console"
|
||||
},
|
||||
{
|
||||
// Dummy entry used to label the group
|
||||
"name": "--- Tasks ---",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"presentation": {
|
||||
"group": "3",
|
||||
"order": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Clear and Restart External Volumes and Containers",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"runtimeExecutable": "bash",
|
||||
"runtimeArgs": [
|
||||
"${workspaceFolder}/backend/scripts/restart_containers.sh"
|
||||
],
|
||||
"cwd": "${workspaceFolder}",
|
||||
"console": "integratedTerminal",
|
||||
"stopOnEntry": true,
|
||||
"presentation": {
|
||||
"group": "3"
|
||||
}
|
||||
},
|
||||
{
|
||||
// Celery jobs launched through a single background script (legacy)
|
||||
// Recommend using the "Celery (all)" compound launch instead.
|
||||
"name": "Background Jobs",
|
||||
"consoleName": "Background Jobs",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "scripts/dev_run_background_jobs.py",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_DANSWER_MODEL_INTERACTIONS": "True",
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Install Python Requirements",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"runtimeExecutable": "bash",
|
||||
"runtimeArgs": [
|
||||
"-c",
|
||||
"pip install -r backend/requirements/default.txt && pip install -r backend/requirements/dev.txt && pip install -r backend/requirements/ee.txt && pip install -r backend/requirements/model_server.txt"
|
||||
],
|
||||
"cwd": "${workspaceFolder}",
|
||||
"console": "integratedTerminal",
|
||||
"presentation": {
|
||||
"group": "3"
|
||||
}
|
||||
},
|
||||
"consoleTitle": "Celery docprocessing Console",
|
||||
"justMyCode": false
|
||||
},
|
||||
{
|
||||
// script to generate the openapi schema
|
||||
"name": "Onyx OpenAPI Schema Generator",
|
||||
"name": "Celery monitoring",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "scripts/onyx_openapi_schema.py",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.env",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {},
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.monitoring",
|
||||
"worker",
|
||||
"--pool=solo",
|
||||
"--concurrency=1",
|
||||
"--prefetch-multiplier=1",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=monitoring@%n",
|
||||
"-Q",
|
||||
"monitoring"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery monitoring Console"
|
||||
},
|
||||
{
|
||||
"name": "Celery beat",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "celery",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"--filename",
|
||||
"generated/openapi.json",
|
||||
]
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.beat",
|
||||
"beat",
|
||||
"--loglevel=INFO"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery beat Console"
|
||||
},
|
||||
{
|
||||
// script to debug multi tenant db issues
|
||||
"name": "Onyx DB Manager (Top Chunks)",
|
||||
"name": "Celery user file processing",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "scripts/debugging/onyx_db.py",
|
||||
"module": "celery",
|
||||
"args": [
|
||||
"-A",
|
||||
"onyx.background.celery.versioned_apps.user_file_processing",
|
||||
"worker",
|
||||
"--loglevel=INFO",
|
||||
"--hostname=user_file_processing@%n",
|
||||
"--pool=threads",
|
||||
"-Q",
|
||||
"user_file_processing"
|
||||
],
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.env",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Celery user file processing Console"
|
||||
},
|
||||
{
|
||||
"name": "Pytest",
|
||||
"consoleName": "Pytest",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"module": "pytest",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"--password",
|
||||
"your_password_here",
|
||||
"--port",
|
||||
"5433",
|
||||
"--report",
|
||||
"top-chunks",
|
||||
"--filename",
|
||||
"generated/tenants_by_num_docs.csv"
|
||||
]
|
||||
"-v"
|
||||
// Specify a specific module/test to run or provide nothing to run all tests
|
||||
//"tests/unit/onyx/llm/answering/test_prune_and_merge.py"
|
||||
],
|
||||
"presentation": {
|
||||
"group": "2"
|
||||
},
|
||||
"consoleTitle": "Pytest Console"
|
||||
},
|
||||
{
|
||||
"name": "Debug React Web App in Chrome",
|
||||
"type": "chrome",
|
||||
"request": "launch",
|
||||
"url": "http://localhost:3000",
|
||||
"webRoot": "${workspaceFolder}/web"
|
||||
{
|
||||
// Dummy entry used to label the group
|
||||
"name": "--- Tasks ---",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"presentation": {
|
||||
"group": "3",
|
||||
"order": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Clear and Restart External Volumes and Containers",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"runtimeExecutable": "bash",
|
||||
"runtimeArgs": [
|
||||
"${workspaceFolder}/backend/scripts/restart_containers.sh"
|
||||
],
|
||||
"cwd": "${workspaceFolder}",
|
||||
"console": "integratedTerminal",
|
||||
"stopOnEntry": true,
|
||||
"presentation": {
|
||||
"group": "3"
|
||||
}
|
||||
},
|
||||
{
|
||||
// Celery jobs launched through a single background script (legacy)
|
||||
// Recommend using the "Celery (all)" compound launch instead.
|
||||
"name": "Background Jobs",
|
||||
"consoleName": "Background Jobs",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "scripts/dev_run_background_jobs.py",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.vscode/.env",
|
||||
"env": {
|
||||
"LOG_DANSWER_MODEL_INTERACTIONS": "True",
|
||||
"LOG_LEVEL": "DEBUG",
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Install Python Requirements",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"runtimeExecutable": "bash",
|
||||
"runtimeArgs": [
|
||||
"-c",
|
||||
"pip install -r backend/requirements/default.txt && pip install -r backend/requirements/dev.txt && pip install -r backend/requirements/ee.txt && pip install -r backend/requirements/model_server.txt"
|
||||
],
|
||||
"cwd": "${workspaceFolder}",
|
||||
"console": "integratedTerminal",
|
||||
"presentation": {
|
||||
"group": "3"
|
||||
}
|
||||
},
|
||||
{
|
||||
// script to generate the openapi schema
|
||||
"name": "Onyx OpenAPI Schema Generator",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "scripts/onyx_openapi_schema.py",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.env",
|
||||
"env": {
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"--filename",
|
||||
"generated/openapi.json"
|
||||
]
|
||||
}
|
||||
|
||||
},
|
||||
{
|
||||
// script to debug multi tenant db issues
|
||||
"name": "Onyx DB Manager (Top Chunks)",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "scripts/debugging/onyx_db.py",
|
||||
"cwd": "${workspaceFolder}/backend",
|
||||
"envFile": "${workspaceFolder}/.env",
|
||||
"env": {
|
||||
"PYTHONUNBUFFERED": "1",
|
||||
"PYTHONPATH": "."
|
||||
},
|
||||
"args": [
|
||||
"--password",
|
||||
"your_password_here",
|
||||
"--port",
|
||||
"5433",
|
||||
"--report",
|
||||
"top-chunks",
|
||||
"--filename",
|
||||
"generated/tenants_by_num_docs.csv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Debug React Web App in Chrome",
|
||||
"type": "chrome",
|
||||
"request": "launch",
|
||||
"url": "http://localhost:3000",
|
||||
"webRoot": "${workspaceFolder}/web"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@@ -59,6 +59,7 @@ Onyx being a fully functional app, relies on some external software, specificall
|
||||
- [Postgres](https://www.postgresql.org/) (Relational DB)
|
||||
- [Vespa](https://vespa.ai/) (Vector DB/Search Engine)
|
||||
- [Redis](https://redis.io/) (Cache)
|
||||
- [MinIO](https://min.io/) (File Store)
|
||||
- [Nginx](https://nginx.org/) (Not needed for development flows generally)
|
||||
|
||||
> **Note:**
|
||||
@@ -102,10 +103,10 @@ If using PowerShell, the command slightly differs:
|
||||
Install the required python dependencies:
|
||||
|
||||
```bash
|
||||
pip install -r onyx/backend/requirements/default.txt
|
||||
pip install -r onyx/backend/requirements/dev.txt
|
||||
pip install -r onyx/backend/requirements/ee.txt
|
||||
pip install -r onyx/backend/requirements/model_server.txt
|
||||
pip install -r backend/requirements/default.txt
|
||||
pip install -r backend/requirements/dev.txt
|
||||
pip install -r backend/requirements/ee.txt
|
||||
pip install -r backend/requirements/model_server.txt
|
||||
```
|
||||
|
||||
Install Playwright for Python (headless browser required by the Web Connector)
|
||||
@@ -171,10 +172,10 @@ Otherwise, you can follow the instructions below to run the application for deve
|
||||
|
||||
You will need Docker installed to run these containers.
|
||||
|
||||
First navigate to `onyx/deployment/docker_compose`, then start up Postgres/Vespa/Redis with:
|
||||
First navigate to `onyx/deployment/docker_compose`, then start up Postgres/Vespa/Redis/MinIO with:
|
||||
|
||||
```bash
|
||||
docker compose -f docker-compose.dev.yml -p onyx-stack up -d index relational_db cache
|
||||
docker compose -f docker-compose.dev.yml -p onyx-stack up -d index relational_db cache minio
|
||||
```
|
||||
|
||||
(index refers to Vespa, relational_db refers to Postgres, and cache refers to Redis)
|
||||
|
||||
@@ -5,7 +5,7 @@ This guide explains how to set up and use VSCode's debugging capabilities with t
|
||||
## Initial Setup
|
||||
|
||||
1. **Environment Setup**:
|
||||
- Copy `.vscode/.env.template` to `.vscode/.env`
|
||||
- Copy `.vscode/env_template.txt` to `.vscode/.env`
|
||||
- Fill in the necessary environment variables in `.vscode/.env`
|
||||
2. **launch.json**:
|
||||
- Copy `.vscode/launch.template.jsonc` to `.vscode/launch.json`
|
||||
@@ -17,10 +17,9 @@ Before starting, make sure the Docker Daemon is running.
|
||||
1. Open the Debug view in VSCode (Cmd+Shift+D on macOS)
|
||||
2. From the dropdown at the top, select "Clear and Restart External Volumes and Containers" and press the green play button
|
||||
3. From the dropdown at the top, select "Run All Onyx Services" and press the green play button
|
||||
4. CD into web, run "npm i" followed by npm run dev.
|
||||
5. Now, you can navigate to onyx in your browser (default is http://localhost:3000) and start using the app
|
||||
6. You can set breakpoints by clicking to the left of line numbers to help debug while the app is running
|
||||
7. Use the debug toolbar to step through code, inspect variables, etc.
|
||||
4. Now, you can navigate to onyx in your browser (default is http://localhost:3000) and start using the app
|
||||
5. You can set breakpoints by clicking to the left of line numbers to help debug while the app is running
|
||||
6. Use the debug toolbar to step through code, inspect variables, etc.
|
||||
|
||||
## Features
|
||||
|
||||
|
||||
@@ -12,7 +12,8 @@ ARG ONYX_VERSION=0.0.0-dev
|
||||
# DO_NOT_TRACK is used to disable telemetry for Unstructured
|
||||
ENV ONYX_VERSION=${ONYX_VERSION} \
|
||||
DANSWER_RUNNING_IN_DOCKER="true" \
|
||||
DO_NOT_TRACK="true"
|
||||
DO_NOT_TRACK="true" \
|
||||
PLAYWRIGHT_BROWSERS_PATH="/app/.cache/ms-playwright"
|
||||
|
||||
|
||||
RUN echo "ONYX_VERSION: ${ONYX_VERSION}"
|
||||
@@ -37,8 +38,7 @@ RUN apt-get update && \
|
||||
pkg-config \
|
||||
gcc \
|
||||
nano \
|
||||
vim \
|
||||
postgresql-client && \
|
||||
vim && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
apt-get clean
|
||||
|
||||
@@ -78,6 +78,9 @@ RUN apt-get update && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
rm -f /usr/local/lib/python3.11/site-packages/tornado/test/test.key
|
||||
|
||||
# Install postgresql-client for easy manual tests
|
||||
# Install it here to avoid it being cleaned up above
|
||||
RUN apt-get update && apt-get install -y postgresql-client
|
||||
|
||||
# Pre-downloading models for setups with limited egress
|
||||
RUN python -c "from tokenizers import Tokenizer; \
|
||||
@@ -114,6 +117,14 @@ COPY ./assets /app/assets
|
||||
|
||||
ENV PYTHONPATH=/app
|
||||
|
||||
# Create non-root user for security best practices
|
||||
RUN groupadd -g 1001 onyx && \
|
||||
useradd -u 1001 -g onyx -m -s /bin/bash onyx && \
|
||||
chown -R onyx:onyx /app && \
|
||||
mkdir -p /var/log/onyx && \
|
||||
chmod 755 /var/log/onyx && \
|
||||
chown onyx:onyx /var/log/onyx
|
||||
|
||||
# Default command which does nothing
|
||||
# This container is used by api server and background which specify their own CMD
|
||||
CMD ["tail", "-f", "/dev/null"]
|
||||
|
||||
@@ -9,17 +9,41 @@ visit https://github.com/onyx-dot-app/onyx."
|
||||
# Default ONYX_VERSION, typically overridden during builds by GitHub Actions.
|
||||
ARG ONYX_VERSION=0.0.0-dev
|
||||
ENV ONYX_VERSION=${ONYX_VERSION} \
|
||||
DANSWER_RUNNING_IN_DOCKER="true"
|
||||
|
||||
DANSWER_RUNNING_IN_DOCKER="true" \
|
||||
HF_HOME=/app/.cache/huggingface
|
||||
|
||||
RUN echo "ONYX_VERSION: ${ONYX_VERSION}"
|
||||
|
||||
# Create non-root user for security best practices
|
||||
RUN mkdir -p /app && \
|
||||
groupadd -g 1001 onyx && \
|
||||
useradd -u 1001 -g onyx -m -s /bin/bash onyx && \
|
||||
chown -R onyx:onyx /app && \
|
||||
mkdir -p /var/log/onyx && \
|
||||
chmod 755 /var/log/onyx && \
|
||||
chown onyx:onyx /var/log/onyx
|
||||
|
||||
# Install build tools needed for compiling Rust packages like fastuuid
|
||||
RUN apt-get update && apt-get install -y \
|
||||
build-essential \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Rust (needed for fastuuid compilation)
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
COPY ./requirements/model_server.txt /tmp/requirements.txt
|
||||
RUN pip install --no-cache-dir --upgrade \
|
||||
--retries 5 \
|
||||
--timeout 30 \
|
||||
-r /tmp/requirements.txt
|
||||
|
||||
# Clean up build tools to reduce image size
|
||||
RUN apt-get remove -y build-essential curl && \
|
||||
apt-get autoremove -y && \
|
||||
rm -rf /root/.cargo /root/.rustup
|
||||
|
||||
RUN apt-get remove -y --allow-remove-essential perl-base && \
|
||||
apt-get autoremove -y
|
||||
|
||||
@@ -38,9 +62,11 @@ snapshot_download('mixedbread-ai/mxbai-rerank-xsmall-v1'); \
|
||||
from sentence_transformers import SentenceTransformer; \
|
||||
SentenceTransformer(model_name_or_path='nomic-ai/nomic-embed-text-v1', trust_remote_code=True);"
|
||||
|
||||
# In case the user has volumes mounted to /root/.cache/huggingface that they've downloaded while
|
||||
# running Onyx, don't overwrite it with the built in cache folder
|
||||
RUN mv /root/.cache/huggingface /root/.cache/temp_huggingface
|
||||
# In case the user has volumes mounted to /app/.cache/huggingface that they've downloaded while
|
||||
# running Onyx, move the current contents of the cache folder to a temporary location to ensure
|
||||
# it's preserved in order to combine with the user's cache contents
|
||||
RUN mv /app/.cache/huggingface /app/.cache/temp_huggingface && \
|
||||
chown -R onyx:onyx /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
@@ -20,3 +20,44 @@ To run all un-applied migrations:
|
||||
To undo migrations:
|
||||
`alembic downgrade -X`
|
||||
where X is the number of migrations you want to undo from the current state
|
||||
|
||||
### Multi-tenant migrations
|
||||
|
||||
For multi-tenant deployments, you can use additional options:
|
||||
|
||||
**Upgrade all tenants:**
|
||||
```bash
|
||||
alembic -x upgrade_all_tenants=true upgrade head
|
||||
```
|
||||
|
||||
**Upgrade specific schemas:**
|
||||
```bash
|
||||
# Single schema
|
||||
alembic -x schemas=tenant_12345678-1234-1234-1234-123456789012 upgrade head
|
||||
|
||||
# Multiple schemas (comma-separated)
|
||||
alembic -x schemas=tenant_12345678-1234-1234-1234-123456789012,public,another_tenant upgrade head
|
||||
```
|
||||
|
||||
**Upgrade tenants within an alphabetical range:**
|
||||
```bash
|
||||
# Upgrade tenants 100-200 when sorted alphabetically (positions 100 to 200)
|
||||
alembic -x upgrade_all_tenants=true -x tenant_range_start=100 -x tenant_range_end=200 upgrade head
|
||||
|
||||
# Upgrade tenants starting from position 1000 alphabetically
|
||||
alembic -x upgrade_all_tenants=true -x tenant_range_start=1000 upgrade head
|
||||
|
||||
# Upgrade first 500 tenants alphabetically
|
||||
alembic -x upgrade_all_tenants=true -x tenant_range_end=500 upgrade head
|
||||
```
|
||||
|
||||
**Continue on error (for batch operations):**
|
||||
```bash
|
||||
alembic -x upgrade_all_tenants=true -x continue=true upgrade head
|
||||
```
|
||||
|
||||
The tenant range filtering works by:
|
||||
1. Sorting tenant IDs alphabetically
|
||||
2. Using 0-based position numbers, where the start is inclusive and the end is exclusive (e.g. start=0, end=500 selects the first 500 tenants)
|
||||
3. Filtering to the specified range of positions
|
||||
4. Non-tenant schemas (like 'public') are always included
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
from typing import Any, Literal
|
||||
from onyx.db.engine import get_iam_auth_token
|
||||
from onyx.db.engine.iam_auth import get_iam_auth_token
|
||||
from onyx.configs.app_configs import USE_IAM_AUTH
|
||||
from onyx.configs.app_configs import POSTGRES_HOST
|
||||
from onyx.configs.app_configs import POSTGRES_PORT
|
||||
from onyx.configs.app_configs import POSTGRES_USER
|
||||
from onyx.configs.app_configs import AWS_REGION_NAME
|
||||
from onyx.db.engine import build_connection_string
|
||||
from onyx.db.engine import get_all_tenant_ids
|
||||
from onyx.db.engine.sql_engine import build_connection_string
|
||||
from onyx.db.engine.tenant_utils import get_all_tenant_ids
|
||||
from sqlalchemy import event
|
||||
from sqlalchemy import pool
|
||||
from sqlalchemy import text
|
||||
@@ -21,10 +21,14 @@ from alembic import context
|
||||
from sqlalchemy.ext.asyncio import create_async_engine
|
||||
from sqlalchemy.sql.schema import SchemaItem
|
||||
from onyx.configs.constants import SSL_CERT_FILE
|
||||
from shared_configs.configs import MULTI_TENANT, POSTGRES_DEFAULT_SCHEMA
|
||||
from shared_configs.configs import (
|
||||
MULTI_TENANT,
|
||||
POSTGRES_DEFAULT_SCHEMA,
|
||||
TENANT_ID_PREFIX,
|
||||
)
|
||||
from onyx.db.models import Base
|
||||
from celery.backends.database.session import ResultModelBase # type: ignore
|
||||
from onyx.db.engine import SqlEngine
|
||||
from onyx.db.engine.sql_engine import SqlEngine
|
||||
|
||||
# Make sure in alembic.ini [logger_root] level=INFO is set or most logging will be
|
||||
# hidden! (defaults to level=WARN)
|
||||
@@ -69,15 +73,67 @@ def include_object(
|
||||
return True
|
||||
|
||||
|
||||
def get_schema_options() -> tuple[str, bool, bool, bool]:
|
||||
def filter_tenants_by_range(
|
||||
tenant_ids: list[str], start_range: int | None = None, end_range: int | None = None
|
||||
) -> list[str]:
|
||||
"""
|
||||
Filter tenant IDs by alphabetical position range.
|
||||
|
||||
Args:
|
||||
tenant_ids: List of tenant IDs to filter
|
||||
start_range: Starting position in alphabetically sorted list (1-based, inclusive)
|
||||
end_range: Ending position in alphabetically sorted list (1-based, inclusive)
|
||||
|
||||
Returns:
|
||||
Filtered list of tenant IDs in their original order
|
||||
"""
|
||||
if start_range is None and end_range is None:
|
||||
return tenant_ids
|
||||
|
||||
# Separate tenant IDs from non-tenant schemas
|
||||
tenant_schemas = [tid for tid in tenant_ids if tid.startswith(TENANT_ID_PREFIX)]
|
||||
non_tenant_schemas = [
|
||||
tid for tid in tenant_ids if not tid.startswith(TENANT_ID_PREFIX)
|
||||
]
|
||||
|
||||
# Sort tenant schemas alphabetically.
|
||||
# NOTE: can cause missed schemas if a schema is created in between workers
|
||||
# fetching of all tenant IDs. We accept this risk for now. Just re-running
|
||||
# the migration will fix the issue.
|
||||
sorted_tenant_schemas = sorted(tenant_schemas)
|
||||
|
||||
# Apply range filtering (0-based indexing)
|
||||
start_idx = start_range if start_range is not None else 0
|
||||
end_idx = end_range if end_range is not None else len(sorted_tenant_schemas)
|
||||
|
||||
# Ensure indices are within bounds
|
||||
start_idx = max(0, start_idx)
|
||||
end_idx = min(len(sorted_tenant_schemas), end_idx)
|
||||
|
||||
# Get the filtered tenant schemas
|
||||
filtered_tenant_schemas = sorted_tenant_schemas[start_idx:end_idx]
|
||||
|
||||
# Combine with non-tenant schemas and preserve original order
|
||||
filtered_tenants = []
|
||||
for tenant_id in tenant_ids:
|
||||
if tenant_id in filtered_tenant_schemas or tenant_id in non_tenant_schemas:
|
||||
filtered_tenants.append(tenant_id)
|
||||
|
||||
return filtered_tenants
|
||||
|
||||
|
||||
def get_schema_options() -> (
|
||||
tuple[bool, bool, bool, int | None, int | None, list[str] | None]
|
||||
):
|
||||
x_args_raw = context.get_x_argument()
|
||||
x_args = {}
|
||||
for arg in x_args_raw:
|
||||
for pair in arg.split(","):
|
||||
if "=" in pair:
|
||||
key, value = pair.split("=", 1)
|
||||
x_args[key.strip()] = value.strip()
|
||||
schema_name = x_args.get("schema", POSTGRES_DEFAULT_SCHEMA)
|
||||
if "=" in arg:
|
||||
key, value = arg.split("=", 1)
|
||||
x_args[key.strip()] = value.strip()
|
||||
else:
|
||||
raise ValueError(f"Invalid argument: {arg}")
|
||||
|
||||
create_schema = x_args.get("create_schema", "true").lower() == "true"
|
||||
upgrade_all_tenants = x_args.get("upgrade_all_tenants", "false").lower() == "true"
|
||||
|
||||
@@ -85,17 +141,81 @@ def get_schema_options() -> tuple[str, bool, bool, bool]:
|
||||
# only applies to online migrations
|
||||
continue_on_error = x_args.get("continue", "false").lower() == "true"
|
||||
|
||||
if (
|
||||
MULTI_TENANT
|
||||
and schema_name == POSTGRES_DEFAULT_SCHEMA
|
||||
and not upgrade_all_tenants
|
||||
):
|
||||
# Tenant range filtering
|
||||
tenant_range_start = None
|
||||
tenant_range_end = None
|
||||
|
||||
if "tenant_range_start" in x_args:
|
||||
try:
|
||||
tenant_range_start = int(x_args["tenant_range_start"])
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Invalid tenant_range_start value: {x_args['tenant_range_start']}. Must be an integer."
|
||||
)
|
||||
|
||||
if "tenant_range_end" in x_args:
|
||||
try:
|
||||
tenant_range_end = int(x_args["tenant_range_end"])
|
||||
except ValueError:
|
||||
raise ValueError(
|
||||
f"Invalid tenant_range_end value: {x_args['tenant_range_end']}. Must be an integer."
|
||||
)
|
||||
|
||||
# Validate range
|
||||
if tenant_range_start is not None and tenant_range_end is not None:
|
||||
if tenant_range_start > tenant_range_end:
|
||||
raise ValueError(
|
||||
f"tenant_range_start ({tenant_range_start}) cannot be greater than tenant_range_end ({tenant_range_end})"
|
||||
)
|
||||
|
||||
# Specific schema names filtering (replaces both schema_name and the old tenant_ids approach)
|
||||
schemas = None
|
||||
if "schemas" in x_args:
|
||||
schema_names_str = x_args["schemas"].strip()
|
||||
if schema_names_str:
|
||||
# Split by comma and strip whitespace
|
||||
schemas = [
|
||||
name.strip() for name in schema_names_str.split(",") if name.strip()
|
||||
]
|
||||
if schemas:
|
||||
logger.info(f"Specific schema names specified: {schemas}")
|
||||
|
||||
# Validate that only one method is used at a time
|
||||
range_filtering = tenant_range_start is not None or tenant_range_end is not None
|
||||
specific_filtering = schemas is not None and len(schemas) > 0
|
||||
|
||||
if range_filtering and specific_filtering:
|
||||
raise ValueError(
|
||||
"Cannot run default migrations in public schema when multi-tenancy is enabled. "
|
||||
"Please specify a tenant-specific schema."
|
||||
"Cannot use both tenant range filtering (tenant_range_start/tenant_range_end) "
|
||||
"and specific schema filtering (schemas) at the same time. "
|
||||
"Please use only one filtering method."
|
||||
)
|
||||
|
||||
return schema_name, create_schema, upgrade_all_tenants, continue_on_error
|
||||
if upgrade_all_tenants and specific_filtering:
|
||||
raise ValueError(
|
||||
"Cannot use both upgrade_all_tenants=true and schemas at the same time. "
|
||||
"Use either upgrade_all_tenants=true for all tenants, or schemas for specific schemas."
|
||||
)
|
||||
|
||||
# If any filtering parameters are specified, we're not doing the default single schema migration
|
||||
if range_filtering:
|
||||
upgrade_all_tenants = True
|
||||
|
||||
# Validate multi-tenant requirements
|
||||
if MULTI_TENANT and not upgrade_all_tenants and not specific_filtering:
|
||||
raise ValueError(
|
||||
"In multi-tenant mode, you must specify either upgrade_all_tenants=true "
|
||||
"or provide schemas. Cannot run default migration."
|
||||
)
|
||||
|
||||
return (
|
||||
create_schema,
|
||||
upgrade_all_tenants,
|
||||
continue_on_error,
|
||||
tenant_range_start,
|
||||
tenant_range_end,
|
||||
schemas,
|
||||
)
|
||||
|
||||
|
||||
def do_run_migrations(
|
||||
@@ -142,12 +262,17 @@ def provide_iam_token_for_alembic(
|
||||
|
||||
async def run_async_migrations() -> None:
|
||||
(
|
||||
schema_name,
|
||||
create_schema,
|
||||
upgrade_all_tenants,
|
||||
continue_on_error,
|
||||
tenant_range_start,
|
||||
tenant_range_end,
|
||||
schemas,
|
||||
) = get_schema_options()
|
||||
|
||||
if not schemas and not MULTI_TENANT:
|
||||
schemas = [POSTGRES_DEFAULT_SCHEMA]
|
||||
|
||||
# without init_engine, subsequent engine calls fail hard intentionally
|
||||
SqlEngine.init_engine(pool_size=20, max_overflow=5)
|
||||
|
||||
@@ -164,12 +289,50 @@ async def run_async_migrations() -> None:
|
||||
) -> None:
|
||||
provide_iam_token_for_alembic(dialect, conn_rec, cargs, cparams)
|
||||
|
||||
if upgrade_all_tenants:
|
||||
if schemas:
|
||||
# Use specific schema names directly without fetching all tenants
|
||||
logger.info(f"Migrating specific schema names: {schemas}")
|
||||
|
||||
i_schema = 0
|
||||
num_schemas = len(schemas)
|
||||
for schema in schemas:
|
||||
i_schema += 1
|
||||
logger.info(
|
||||
f"Migrating schema: index={i_schema} num_schemas={num_schemas} schema={schema}"
|
||||
)
|
||||
try:
|
||||
async with engine.connect() as connection:
|
||||
await connection.run_sync(
|
||||
do_run_migrations,
|
||||
schema_name=schema,
|
||||
create_schema=create_schema,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error migrating schema {schema}: {e}")
|
||||
if not continue_on_error:
|
||||
logger.error("--continue=true is not set, raising exception!")
|
||||
raise
|
||||
|
||||
logger.warning("--continue=true is set, continuing to next schema.")
|
||||
|
||||
elif upgrade_all_tenants:
|
||||
tenant_schemas = get_all_tenant_ids()
|
||||
|
||||
filtered_tenant_schemas = filter_tenants_by_range(
|
||||
tenant_schemas, tenant_range_start, tenant_range_end
|
||||
)
|
||||
|
||||
if tenant_range_start is not None or tenant_range_end is not None:
|
||||
logger.info(
|
||||
f"Filtering tenants by range: start={tenant_range_start}, end={tenant_range_end}"
|
||||
)
|
||||
logger.info(
|
||||
f"Total tenants: {len(tenant_schemas)}, Filtered tenants: {len(filtered_tenant_schemas)}"
|
||||
)
|
||||
|
||||
i_tenant = 0
|
||||
num_tenants = len(tenant_schemas)
|
||||
for schema in tenant_schemas:
|
||||
num_tenants = len(filtered_tenant_schemas)
|
||||
for schema in filtered_tenant_schemas:
|
||||
i_tenant += 1
|
||||
logger.info(
|
||||
f"Migrating schema: index={i_tenant} num_tenants={num_tenants} schema={schema}"
|
||||
@@ -190,17 +353,13 @@ async def run_async_migrations() -> None:
|
||||
logger.warning("--continue=true is set, continuing to next schema.")
|
||||
|
||||
else:
|
||||
try:
|
||||
logger.info(f"Migrating schema: {schema_name}")
|
||||
async with engine.connect() as connection:
|
||||
await connection.run_sync(
|
||||
do_run_migrations,
|
||||
schema_name=schema_name,
|
||||
create_schema=create_schema,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error migrating schema {schema_name}: {e}")
|
||||
raise
|
||||
# This should not happen in the new design since we require either
|
||||
# upgrade_all_tenants=true or schemas in multi-tenant mode
|
||||
# and for non-multi-tenant mode, we should use schemas with the default schema
|
||||
raise ValueError(
|
||||
"No migration target specified. Use either upgrade_all_tenants=true for all tenants "
|
||||
"or schemas for specific schemas."
|
||||
)
|
||||
|
||||
await engine.dispose()
|
||||
|
||||
@@ -221,10 +380,37 @@ def run_migrations_offline() -> None:
|
||||
# without init_engine, subsequent engine calls fail hard intentionally
|
||||
SqlEngine.init_engine(pool_size=20, max_overflow=5)
|
||||
|
||||
schema_name, _, upgrade_all_tenants, continue_on_error = get_schema_options()
|
||||
(
|
||||
create_schema,
|
||||
upgrade_all_tenants,
|
||||
continue_on_error,
|
||||
tenant_range_start,
|
||||
tenant_range_end,
|
||||
schemas,
|
||||
) = get_schema_options()
|
||||
url = build_connection_string()
|
||||
|
||||
if upgrade_all_tenants:
|
||||
if schemas:
|
||||
# Use specific schema names directly without fetching all tenants
|
||||
logger.info(f"Migrating specific schema names: {schemas}")
|
||||
|
||||
for schema in schemas:
|
||||
logger.info(f"Migrating schema: {schema}")
|
||||
context.configure(
|
||||
url=url,
|
||||
target_metadata=target_metadata, # type: ignore
|
||||
literal_binds=True,
|
||||
include_object=include_object,
|
||||
version_table_schema=schema,
|
||||
include_schemas=True,
|
||||
script_location=config.get_main_option("script_location"),
|
||||
dialect_opts={"paramstyle": "named"},
|
||||
)
|
||||
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
|
||||
elif upgrade_all_tenants:
|
||||
engine = create_async_engine(url)
|
||||
|
||||
if USE_IAM_AUTH:
|
||||
@@ -238,7 +424,19 @@ def run_migrations_offline() -> None:
|
||||
tenant_schemas = get_all_tenant_ids()
|
||||
engine.sync_engine.dispose()
|
||||
|
||||
for schema in tenant_schemas:
|
||||
filtered_tenant_schemas = filter_tenants_by_range(
|
||||
tenant_schemas, tenant_range_start, tenant_range_end
|
||||
)
|
||||
|
||||
if tenant_range_start is not None or tenant_range_end is not None:
|
||||
logger.info(
|
||||
f"Filtering tenants by range: start={tenant_range_start}, end={tenant_range_end}"
|
||||
)
|
||||
logger.info(
|
||||
f"Total tenants: {len(tenant_schemas)}, Filtered tenants: {len(filtered_tenant_schemas)}"
|
||||
)
|
||||
|
||||
for schema in filtered_tenant_schemas:
|
||||
logger.info(f"Migrating schema: {schema}")
|
||||
context.configure(
|
||||
url=url,
|
||||
@@ -254,21 +452,12 @@ def run_migrations_offline() -> None:
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
else:
|
||||
logger.info(f"Migrating schema: {schema_name}")
|
||||
context.configure(
|
||||
url=url,
|
||||
target_metadata=target_metadata, # type: ignore
|
||||
literal_binds=True,
|
||||
include_object=include_object,
|
||||
version_table_schema=schema_name,
|
||||
include_schemas=True,
|
||||
script_location=config.get_main_option("script_location"),
|
||||
dialect_opts={"paramstyle": "named"},
|
||||
# This should not happen in the new design
|
||||
raise ValueError(
|
||||
"No migration target specified. Use either upgrade_all_tenants=true for all tenants "
|
||||
"or schemas for specific schemas."
|
||||
)
|
||||
|
||||
with context.begin_transaction():
|
||||
context.run_migrations()
|
||||
|
||||
|
||||
def run_migrations_online() -> None:
|
||||
logger.info("run_migrations_online starting.")
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
"""add federated connector tables
|
||||
|
||||
Revision ID: 0816326d83aa
|
||||
Revises: 12635f6655b7
|
||||
Create Date: 2025-06-29 14:09:45.109518
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "0816326d83aa"
|
||||
down_revision = "12635f6655b7"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the federated connector tables.

    Three tables are created, parent first so the children's foreign keys
    resolve: ``federated_connector``, ``federated_connector_oauth_token`` and
    ``federated_connector__document_set``.
    """

    def connector_fk() -> sa.ForeignKeyConstraint:
        # A constraint object is bound to a single table, so each child table
        # needs its own freshly built instance.
        return sa.ForeignKeyConstraint(
            ["federated_connector_id"], ["federated_connector.id"], ondelete="CASCADE"
        )

    # Parent table: one row per configured federated connector.
    connector_elements = (
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("source", sa.String(), nullable=False),
        sa.Column("credentials", sa.LargeBinary(), nullable=False),
        sa.PrimaryKeyConstraint("id"),
    )
    op.create_table("federated_connector", *connector_elements)

    # Per-user OAuth tokens; rows are removed with their connector or user.
    oauth_token_elements = (
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("federated_connector_id", sa.Integer(), nullable=False),
        sa.Column("user_id", postgresql.UUID(as_uuid=True), nullable=False),
        sa.Column("token", sa.LargeBinary(), nullable=False),
        sa.Column("expires_at", sa.DateTime(), nullable=True),
        connector_fk(),
        sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
        sa.PrimaryKeyConstraint("id"),
    )
    op.create_table("federated_connector_oauth_token", *oauth_token_elements)

    # Association between connectors and document sets; each pair is unique.
    document_set_link_elements = (
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("federated_connector_id", sa.Integer(), nullable=False),
        sa.Column("document_set_id", sa.Integer(), nullable=False),
        sa.Column("entities", postgresql.JSONB(), nullable=False),
        connector_fk(),
        sa.ForeignKeyConstraint(
            ["document_set_id"], ["document_set.id"], ondelete="CASCADE"
        ),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint(
            "federated_connector_id",
            "document_set_id",
            name="uq_federated_connector_document_set",
        ),
    )
    op.create_table("federated_connector__document_set", *document_set_link_elements)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the federated connector tables created by ``upgrade``."""
    # Children go first: both reference federated_connector via foreign key.
    tables_in_drop_order = (
        "federated_connector__document_set",
        "federated_connector_oauth_token",
        "federated_connector",
    )
    for table_name in tables_in_drop_order:
        op.drop_table(table_name)
|
||||
@@ -0,0 +1,380 @@
|
||||
"""add project__userfile table and userfile column changes
|
||||
|
||||
Revision ID: 085d844e3953
|
||||
Revises: 8818cf73fa1a
|
||||
Create Date: 2025-09-05 14:24:50.026940
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql as psql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "085d844e3953"
|
||||
down_revision = "8818cf73fa1a"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Switch ``user_file`` to a UUID primary key and replace folders with projects.

    Steps, in order:
      0. make sure ``gen_random_uuid()`` is available and remove the legacy
         chat/persona folder objects;
      1-4. add a transitional UUID column to ``user_file``, repoint
         ``persona__user_file`` at it, then promote it to the primary key;
      5. rename ``user_folder`` to ``user_project``;
      6. create the ``project__user_file`` association table;
      7. remove the ``user_file`` -> connector_credential_pair link;
      8. add the new status / bookkeeping columns and project references.

    The optional clean-up statements use PostgreSQL ``IF EXISTS`` DDL instead of
    ``try/except``. On PostgreSQL a failed statement aborts the enclosing
    transaction, so swallowing the Python exception does not let the remaining
    statements run; ``IF EXISTS`` makes the statement a genuine no-op instead.
    """
    # 0) Ensure UUID generator exists
    op.execute("CREATE EXTENSION IF NOT EXISTS pgcrypto")

    # Drop persona__user_folder table (may already be gone)
    op.execute("DROP TABLE IF EXISTS persona__user_folder")

    # Drop folder related tables and columns.
    # First the foreign key, then the column that carried it, then the table.
    # TODO(subash): do proper deletion on constraints
    op.execute(
        "ALTER TABLE chat_session "
        "DROP CONSTRAINT IF EXISTS chat_session_folder_id_fkey"
    )
    op.execute("ALTER TABLE chat_session DROP COLUMN IF EXISTS folder_id")
    op.execute("DROP TABLE IF EXISTS chat_folder")

    # 1) Add transitional UUID column on user_file + UNIQUE so FKs can reference it
    op.add_column(
        "user_file",
        sa.Column(
            "new_id",
            psql.UUID(as_uuid=True),
            nullable=False,
            server_default=sa.text("gen_random_uuid()"),
        ),
    )
    op.create_unique_constraint("uq_user_file_new_id", "user_file", ["new_id"])

    # 2) Move FK users to the transitional UUID
    # ---- persona__user_file.user_file_id (INT) -> UUID ----
    op.add_column(
        "persona__user_file",
        sa.Column("user_file_id_uuid", psql.UUID(as_uuid=True), nullable=True),
    )
    # Backfill the UUID reference from the integer one while both still exist.
    op.execute(
        """
        UPDATE persona__user_file p
        SET user_file_id_uuid = uf.new_id
        FROM user_file uf
        WHERE p.user_file_id = uf.id
        """
    )
    # swap FK to reference user_file.new_id (the transitional UNIQUE)
    op.drop_constraint(
        "persona__user_file_user_file_id_fkey",
        "persona__user_file",
        type_="foreignkey",
    )
    op.alter_column("persona__user_file", "user_file_id_uuid", nullable=False)
    op.create_foreign_key(
        "persona__user_file_user_file_id_fkey",
        "persona__user_file",
        "user_file",
        local_cols=["user_file_id_uuid"],
        remote_cols=["new_id"],
    )
    op.drop_column("persona__user_file", "user_file_id")
    op.alter_column(
        "persona__user_file",
        "user_file_id_uuid",
        new_column_name="user_file_id",
        existing_type=psql.UUID(as_uuid=True),
        nullable=False,
    )
    # ---- end persona__user_file ----

    # (Repeat 2) for any other FK tables that point to user_file.id)

    # 3) Swap PK on user_file from int -> uuid
    op.drop_constraint("user_file_pkey", "user_file", type_="primary")
    op.drop_column("user_file", "id")
    op.alter_column(
        "user_file",
        "new_id",
        new_column_name="id",
        existing_type=psql.UUID(as_uuid=True),
        nullable=False,
    )
    op.create_primary_key("user_file_pkey", "user_file", ["id"])

    # 4) Now **force** FKs to bind to the PK:
    # (a) drop FK(s)
    op.drop_constraint(
        "persona__user_file_user_file_id_fkey",
        "persona__user_file",
        type_="foreignkey",
    )
    # (b) drop the transitional UNIQUE so it cannot be chosen
    op.drop_constraint("uq_user_file_new_id", "user_file", type_="unique")
    # (c) recreate FK(s) to user_file(id) — only PK remains, so it will bind there
    op.create_foreign_key(
        "persona__user_file_user_file_id_fkey",
        "persona__user_file",
        "user_file",
        local_cols=["user_file_id"],
        remote_cols=["id"],
    )

    # 5) Rename user_folder -> user_project and update dependent FKs/columns.
    # IF EXISTS keeps this a no-op when the table has already been renamed.
    op.execute("ALTER TABLE IF EXISTS user_folder RENAME TO user_project")

    # Drop user_file.folder_id if it exists (we don't keep one-to-many link)
    op.execute("ALTER TABLE user_file DROP COLUMN IF EXISTS folder_id")

    # 6) Safe to create new tables referencing the UUID PK
    op.create_table(
        "project__user_file",
        sa.Column("project_id", sa.Integer(), nullable=False),
        sa.Column("user_file_id", psql.UUID(as_uuid=True), nullable=False),
        sa.ForeignKeyConstraint(["project_id"], ["user_project.id"]),
        sa.ForeignKeyConstraint(["user_file_id"], ["user_file.id"]),
        sa.PrimaryKeyConstraint("project_id", "user_file_id"),
    )

    # 7) Remove CCPair relationship
    # Drop the foreign key constraint first
    op.drop_constraint(
        "user_file_cc_pair_id_fkey",
        "user_file",
        type_="foreignkey",
    )
    # Drop the unique constraint
    op.drop_constraint(
        "user_file_cc_pair_id_key",
        "user_file",
        type_="unique",
    )
    # Drop the column
    op.drop_column("user_file", "cc_pair_id")

    # 8) Add extra columns
    op.add_column(
        "user_file",
        sa.Column(
            "status",
            sa.Enum(
                "processing",
                "completed",
                "failed",
                "canceled",
                name="userfilestatus",
                native_enum=False,
            ),
            nullable=False,
            server_default="processing",
        ),
    )
    op.add_column("user_file", sa.Column("chunk_count", sa.Integer(), nullable=True))
    # Drop deprecated document_id column if present
    op.execute("ALTER TABLE user_file DROP COLUMN IF EXISTS document_id")
    op.add_column(
        "user_file",
        sa.Column("last_accessed_at", sa.DateTime(timezone=True), nullable=True),
    )
    op.add_column(
        "user_project",
        sa.Column("prompt_id", sa.Integer(), nullable=True),
    )
    op.create_foreign_key(
        "user_project_prompt_id_fkey",
        "user_project",
        "prompt",
        ["prompt_id"],
        ["id"],
    )
    op.add_column(
        "chat_session",
        sa.Column("project_id", sa.Integer(), nullable=True),
    )
    op.create_foreign_key(
        "chat_session_project_id_fkey",
        "chat_session",
        "user_project",
        ["project_id"],
        ["id"],
    )
    # Add index on project_id for better query performance
    op.create_index(
        "ix_chat_session_project_id",
        "chat_session",
        ["project_id"],
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Best-effort reversal of ``upgrade``.

    Restores the chat folder objects, removes the project-related columns and
    tables, converts ``user_file.id`` back to an integer key and renames
    ``user_project`` back to ``user_folder``.

    Data caveats:
      * the original integer ``user_file`` ids are not recoverable; rows
        receive new values from ``user_file_id_seq``;
      * ``persona__user_file`` links are carried over to those new ids;
      * ``user_file.document_id`` is recreated NOT NULL without a default, so
        the statement fails if ``user_file`` still contains rows.

    Optional steps use ``IF EXISTS`` / ``IF NOT EXISTS`` DDL instead of
    ``try/except``: on PostgreSQL a failed statement aborts the enclosing
    transaction, so swallowing the Python exception would not let the
    remaining statements run.
    """
    # Recreate folder related tables and columns
    # First create the chat_folder table
    op.create_table(
        "chat_folder",
        sa.Column("id", sa.Integer(), primary_key=True),
        sa.Column("user_id", psql.UUID(as_uuid=True), nullable=True),
        sa.Column("name", sa.String(), nullable=True),
        sa.Column("display_priority", sa.Integer(), nullable=True, default=0),
    )
    # Add foreign key for user_id after table creation
    op.create_foreign_key(
        "chat_folder_user_id_fkey",
        "chat_folder",
        "user",
        ["user_id"],
        ["id"],
    )

    # Add folder_id column to chat_session
    op.add_column(
        "chat_session",
        sa.Column("folder_id", sa.Integer(), nullable=True),
    )
    # Create foreign key constraint after both tables exist
    op.create_foreign_key(
        "chat_session_folder_id_fkey",
        "chat_session",
        "chat_folder",
        ["folder_id"],
        ["id"],
    )

    # Drop extra columns
    op.drop_column("user_file", "last_accessed_at")
    # Recreate document_id on downgrade.
    # NOTE(review): NOT NULL without a default cannot be applied to a populated
    # table; this raises in that case rather than failing later with an
    # aborted-transaction error.
    op.execute(
        "ALTER TABLE user_file "
        "ADD COLUMN IF NOT EXISTS document_id VARCHAR NOT NULL"
    )
    op.drop_column("user_file", "chunk_count")
    op.drop_column("user_file", "status")
    op.execute("DROP TYPE IF EXISTS userfilestatus")

    # Drop association table
    op.drop_table("project__user_file")
    # Drop index before dropping the column
    op.drop_index("ix_chat_session_project_id", table_name="chat_session")
    op.drop_column("chat_session", "project_id")

    # Recreate an integer PK (best-effort; original values aren't retained)
    op.drop_constraint(
        "persona__user_file_user_file_id_fkey", "persona__user_file", type_="foreignkey"
    )
    op.drop_constraint("user_file_pkey", "user_file", type_="primary")

    # The sequence is created first and used as the column default so that
    # existing rows are populated when the NOT NULL column is added.
    op.execute("CREATE SEQUENCE IF NOT EXISTS user_file_id_seq")
    op.add_column(
        "user_file",
        sa.Column(
            "id_int_tmp",
            sa.Integer(),
            nullable=False,
            server_default=sa.text("nextval('user_file_id_seq')"),
        ),
    )
    op.execute("ALTER SEQUENCE user_file_id_seq OWNED BY user_file.id_int_tmp")
    op.create_primary_key("user_file_pkey", "user_file", ["id_int_tmp"])

    op.add_column(
        "persona__user_file",
        sa.Column("user_file_id_int_tmp", sa.Integer(), nullable=True),
    )
    # Carry the persona <-> file links over to the new integer ids while the
    # UUID columns are still present on both tables.
    op.execute(
        """
        UPDATE persona__user_file p
        SET user_file_id_int_tmp = uf.id_int_tmp
        FROM user_file uf
        WHERE p.user_file_id = uf.id
        """
    )
    op.create_foreign_key(
        "persona__user_file_user_file_id_fkey",
        "persona__user_file",
        "user_file",
        ["user_file_id_int_tmp"],
        ["id_int_tmp"],
    )

    # Remove UUID id and rename int back to id
    op.drop_column("user_file", "id")
    op.alter_column(
        "user_file",
        "id_int_tmp",
        new_column_name="id",
        existing_type=sa.Integer(),
        nullable=False,
    )

    op.drop_column("persona__user_file", "user_file_id")
    op.alter_column(
        "persona__user_file",
        "user_file_id_int_tmp",
        new_column_name="user_file_id",
        existing_type=sa.Integer(),
    )

    # Restore CCPair relationship
    op.add_column(
        "user_file",
        sa.Column("cc_pair_id", sa.Integer(), nullable=True),
    )
    op.create_unique_constraint(
        "user_file_cc_pair_id_key",
        "user_file",
        ["cc_pair_id"],
    )
    op.create_foreign_key(
        "user_file_cc_pair_id_fkey",
        "user_file",
        "connector_credential_pair",
        ["cc_pair_id"],
        ["id"],
    )

    # Rename user_project back to user_folder and revert related changes
    op.execute(
        "ALTER TABLE IF EXISTS user_project "
        "DROP CONSTRAINT IF EXISTS user_project_prompt_id_fkey"
    )
    op.execute("ALTER TABLE IF EXISTS user_project DROP COLUMN IF EXISTS prompt_id")
    # Recreate user_file.folder_id (nullable) since we dropped it on upgrade
    op.execute("ALTER TABLE user_file ADD COLUMN IF NOT EXISTS folder_id INTEGER")
    op.execute("ALTER TABLE IF EXISTS user_project RENAME TO user_folder")

    # Recreate persona__user_folder table. This has to come after the rename
    # above because its foreign key targets user_folder.
    op.create_table(
        "persona__user_folder",
        sa.Column("persona_id", sa.Integer(), nullable=False),
        sa.Column("user_folder_id", sa.Integer(), nullable=False),
        sa.ForeignKeyConstraint(["persona_id"], ["persona.id"]),
        sa.ForeignKeyConstraint(["user_folder_id"], ["user_folder.id"]),
        sa.PrimaryKeyConstraint("persona_id", "user_folder_id"),
    )
|
||||
596
backend/alembic/versions/12635f6655b7_drive_canonical_ids.py
Normal file
596
backend/alembic/versions/12635f6655b7_drive_canonical_ids.py
Normal file
@@ -0,0 +1,596 @@
|
||||
"""drive-canonical-ids
|
||||
|
||||
Revision ID: 12635f6655b7
|
||||
Revises: 58c50ef19f08
|
||||
Create Date: 2025-06-20 14:44:54.241159
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from httpx import HTTPStatusError
|
||||
import httpx
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.db.search_settings import SearchSettings
|
||||
from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
|
||||
from onyx.document_index.vespa.shared_utils.utils import (
|
||||
replace_invalid_doc_id_characters,
|
||||
)
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
|
||||
from onyx.utils.logger import setup_logger
|
||||
import os
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "12635f6655b7"
|
||||
down_revision = "58c50ef19f08"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
SKIP_CANON_DRIVE_IDS = os.environ.get("SKIP_CANON_DRIVE_IDS", "true").lower() == "true"
|
||||
|
||||
|
||||
def active_search_settings() -> tuple[SearchSettings, SearchSettings | None]:
    """Load the current and the upcoming search settings.

    Two queries run against the migration connection: the highest-id row with
    status ``PRESENT`` and the highest-id row with status ``FUTURE``.

    Returns:
        ``(present, future)`` where ``future`` is ``None`` when no FUTURE row
        exists.

    Raises:
        RuntimeError: if no PRESENT row was found, or if the FUTURE value is
            neither ``None`` nor a ``SearchSettings`` instance.
    """

    def first_row_as_settings(rows):
        # Build the model from the first row's column mapping, if there is one.
        if not rows:
            return None
        return SearchSettings(**rows[0]._asdict())

    present_rows = (
        op.get_bind()
        .execute(
            sa.text(
                """
                SELECT * FROM search_settings WHERE status = 'PRESENT' ORDER BY id DESC LIMIT 1
                """
            )
        )
        .fetchall()
    )
    present_settings = first_row_as_settings(present_rows)

    future_rows = (
        op.get_bind()
        .execute(
            sa.text(
                """
                SELECT * FROM search_settings WHERE status = 'FUTURE' ORDER BY id DESC LIMIT 1
                """
            )
        )
        .fetchall()
    )
    future_settings = first_row_as_settings(future_rows)

    # A missing PRESENT row surfaces here, since None is not a SearchSettings.
    if not isinstance(present_settings, SearchSettings):
        raise RuntimeError(
            "current search settings is of type " + str(type(present_settings))
        )

    # FUTURE is optional: only a non-None value of the wrong type is an error.
    if future_settings is not None and not isinstance(
        future_settings, SearchSettings
    ):
        raise RuntimeError(
            "future search settings is of type " + str(type(future_settings))
        )

    return present_settings, future_settings
|
||||
|
||||
|
||||
def normalize_google_drive_url(url: str) -> str:
    """Return the canonical form of a Google Drive URL for use as a document ID.

    The query string is removed, and a final path segment of ``edit``,
    ``view`` or ``preview`` is dropped. Everything else (scheme, host,
    remaining path, fragment) is kept as parsed.

    NOTE: copied from drive doc_conversion.py
    """
    trailing_actions = ("edit", "view", "preview")

    # Strip the query string first.
    parts = urlparse(url)._replace(query="")

    # Drop a trailing action segment such as ".../edit".
    segments = parts.path.split("/")
    if segments and segments[-1] in trailing_actions:
        del segments[-1]
        parts = parts._replace(path="/".join(segments))

    return urlunparse(parts)
|
||||
|
||||
|
||||
def get_google_drive_documents_from_database() -> list[dict]:
    """Return the IDs of documents attached to Google Drive connectors.

    Documents are joined through ``document_by_connector_credential_pair`` and
    ``connector_credential_pair`` to ``connector`` and filtered on
    ``source = 'GOOGLE_DRIVE'``.

    Returns:
        One ``{"document_id": <id>}`` dict per row returned by the query.
    """
    drive_documents_query = sa.text(
        """
        SELECT d.id
        FROM document d
        JOIN document_by_connector_credential_pair dcc ON d.id = dcc.id
        JOIN connector_credential_pair cc ON dcc.connector_id = cc.connector_id
        AND dcc.credential_id = cc.credential_id
        JOIN connector c ON cc.connector_id = c.id
        WHERE c.source = 'GOOGLE_DRIVE'
        """
    )
    rows = op.get_bind().execute(drive_documents_query)
    return [{"document_id": row.id} for row in rows]
|
||||
|
||||
|
||||
def update_document_id_in_database(
    old_doc_id: str, new_doc_id: str, index_name: str
) -> None:
    """Update document IDs in all relevant database tables using copy-and-swap approach.

    If a document with ``new_doc_id`` already exists, the old document is
    deleted instead (via ``delete_document_from_db``). Otherwise the document
    row is copied under the new ID, every referencing table is repointed to
    it, and the old row is removed.

    Args:
        old_doc_id: Current document ID.
        new_doc_id: ID the document should have afterwards.
        index_name: Vespa index name, only forwarded when the old document
            has to be deleted.
    """
    bind = op.get_bind()
    id_params = {"new_id": new_doc_id, "old_id": old_doc_id}

    # Check if new document ID already exists
    result = bind.execute(
        sa.text("SELECT COUNT(*) FROM document WHERE id = :new_id"),
        {"new_id": new_doc_id},
    )
    row = result.fetchone()
    if row and row[0] > 0:
        delete_document_from_db(old_doc_id, index_name)
        return

    # Step 1: Create a new document row with the new ID (copy all fields from old row)
    # Use a conservative approach to handle columns that might not exist in all installations.
    # The attempt runs inside a SAVEPOINT: on PostgreSQL a failed statement
    # aborts the surrounding transaction, so without it the fallback INSERT
    # (and everything after it) would fail as well.
    try:
        with bind.begin_nested():
            bind.execute(
                sa.text(
                    """
                    INSERT INTO document (id, from_ingestion_api, boost, hidden, semantic_id,
                    link, doc_updated_at, primary_owners, secondary_owners,
                    external_user_emails, external_user_group_ids, is_public,
                    chunk_count, last_modified, last_synced, kg_stage, kg_processing_time)
                    SELECT :new_id, from_ingestion_api, boost, hidden, semantic_id,
                    link, doc_updated_at, primary_owners, secondary_owners,
                    external_user_emails, external_user_group_ids, is_public,
                    chunk_count, last_modified, last_synced, kg_stage, kg_processing_time
                    FROM document
                    WHERE id = :old_id
                    """
                ),
                id_params,
            )
    except Exception as e:
        # If the full INSERT fails, try a more basic version with only core columns
        logger.warning(f"Full INSERT failed, trying basic version: {e}")
        bind.execute(
            sa.text(
                """
                INSERT INTO document (id, from_ingestion_api, boost, hidden, semantic_id,
                link, doc_updated_at, primary_owners, secondary_owners)
                SELECT :new_id, from_ingestion_api, boost, hidden, semantic_id,
                link, doc_updated_at, primary_owners, secondary_owners
                FROM document
                WHERE id = :old_id
                """
            ),
            id_params,
        )

    # Step 2: Update all foreign key references to point to the new ID.
    # (table, column) pairs; names are hard-coded constants, never user input.
    required_references = (
        ("document_by_connector_credential_pair", "id"),
        # search_doc stores search results for chat replay; critical for agent functionality
        ("search_doc", "document_id"),
        ("document_retrieval_feedback", "document_id"),
        ("document__tag", "document_id"),
        ("user_file", "document_id"),
    )
    for table, column in required_references:
        bind.execute(
            sa.text(
                f"UPDATE {table} SET {column} = :new_id WHERE {column} = :old_id"
            ),
            id_params,
        )

    # KG and chunk_stats tables may not exist in all installations, so these
    # updates are best-effort. They share one SAVEPOINT so that a failure is
    # rolled back cleanly and the transaction stays usable for Step 3.
    optional_references = (
        ("kg_entity", "document_id"),
        ("kg_entity_extraction_staging", "document_id"),
        ("kg_relationship", "source_document"),
        ("kg_relationship_extraction_staging", "source_document"),
        ("chunk_stats", "document_id"),
    )
    try:
        with bind.begin_nested():
            for table, column in optional_references:
                bind.execute(
                    sa.text(
                        f"UPDATE {table} SET {column} = :new_id WHERE {column} = :old_id"
                    ),
                    id_params,
                )

            # Update chunk_stats ID field which includes document_id
            bind.execute(
                sa.text(
                    """
                    UPDATE chunk_stats
                    SET id = REPLACE(id, :old_id, :new_id)
                    WHERE id LIKE :old_id_pattern
                    """
                ),
                {
                    "new_id": new_doc_id,
                    "old_id": old_doc_id,
                    "old_id_pattern": f"{old_doc_id}__%",
                },
            )
    except Exception as e:
        logger.warning(f"Some KG/chunk tables may not exist or failed to update: {e}")

    # Step 3: Delete the old document row (this should now be safe since all FKs point to new row)
    bind.execute(
        sa.text("DELETE FROM document WHERE id = :old_id"), {"old_id": old_doc_id}
    )
|
||||
|
||||
|
||||
def _visit_chunks(
    *,
    http_client: httpx.Client,
    index_name: str,
    selection: str,
    continuation: str | None = None,
) -> tuple[list[dict], str | None]:
    """Fetch a single page from the /document/v1 visit API.

    Returns a ``(docs, next_token)`` pair: the ``documents`` list of the
    response (empty when absent) and its ``continuation`` token (``None``
    when absent). Raises if the response status indicates an error.
    """
    query: dict[str, str] = {
        "selection": selection,
        "wantedDocumentCount": "1000",
    }
    # Only send a continuation token when resuming a previous visit.
    if continuation:
        query["continuation"] = continuation

    # Same URL as the document API, but with visit-specific params.
    response = http_client.get(
        DOCUMENT_ID_ENDPOINT.format(index_name=index_name),
        params=query,
        timeout=None,
    )
    response.raise_for_status()

    body = response.json()
    return body.get("documents", []), body.get("continuation")
|
||||
|
||||
|
||||
def delete_document_chunks_from_vespa(index_name: str, doc_id: str) -> None:
    """Delete all chunks for *doc_id* from Vespa using continuation-token paging (no offset).

    Deletion is best-effort per chunk: a chunk that cannot be deleted is
    reported with ``print`` and skipped. Errors raised while visiting
    (listing) chunks are not caught here.

    Args:
        index_name: Vespa index the chunks live in.
        doc_id: Value of the chunks' ``document_id`` field.
    """
    # Use exact match instead of contains - Document Selector Language doesn't support contains
    selection = f'{index_name}.document_id=="{doc_id}"'
    # Loop-invariant base URL for per-chunk deletes.
    endpoint = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)

    with get_vespa_http_client() as http_client:
        continuation: str | None = None
        while True:
            docs, continuation = _visit_chunks(
                http_client=http_client,
                index_name=index_name,
                selection=selection,
                continuation=continuation,
            )

            if not docs:
                break

            for doc in docs:
                vespa_full_id = doc.get("id")
                if not vespa_full_id:
                    continue

                # The chunk's own ID is the last "::"-separated part of the full Vespa ID.
                vespa_doc_uuid = vespa_full_id.split("::")[-1]

                try:
                    resp = http_client.delete(f"{endpoint}/{vespa_doc_uuid}")
                    resp.raise_for_status()
                except Exception as e:
                    print(f"Failed to delete chunk {vespa_doc_uuid}: {e}")

            if not continuation:
                break
|
||||
|
||||
|
||||
def update_document_id_in_vespa(
    index_name: str, old_doc_id: str, new_doc_id: str
) -> None:
    """Rewrite the document_id field of every chunk of *old_doc_id* to *new_doc_id*.

    Chunks are found with continuation-token paging. The new ID is passed
    through ``replace_invalid_doc_id_characters`` before being assigned. A
    failed chunk update is printed and then re-raised, aborting the update.
    """
    sanitized_new_id = replace_invalid_doc_id_characters(new_doc_id)

    # Use exact match instead of contains - Document Selector Language doesn't support contains
    selection = f'{index_name}.document_id=="{old_doc_id}"'

    # Both of these are identical for every chunk, so build them once.
    endpoint = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)
    update_request = {"fields": {"document_id": {"assign": sanitized_new_id}}}

    with get_vespa_http_client() as http_client:
        next_token: str | None = None
        while True:
            page, next_token = _visit_chunks(
                http_client=http_client,
                index_name=index_name,
                selection=selection,
                continuation=next_token,
            )
            if not page:
                break

            for chunk in page:
                full_id = chunk.get("id")
                if not full_id:
                    continue

                # The chunk's own ID is the last "::"-separated part of the full Vespa ID.
                vespa_doc_uuid = full_id.split("::")[-1]
                try:
                    response = http_client.put(
                        f"{endpoint}/{vespa_doc_uuid}", json=update_request
                    )
                    response.raise_for_status()
                except Exception as e:
                    print(f"Failed to update chunk {vespa_doc_uuid}: {e}")
                    raise

            if not next_token:
                break
|
||||
|
||||
|
||||
def delete_document_from_db(current_doc_id: str, index_name: str) -> None:
    """Delete a document, every row referencing it, and its chunks in Vespa.

    Foreign key references are deleted first, then the document row, then
    the Vespa chunks. Any failure is printed and swallowed so that the
    migration can continue with other documents.

    Args:
        current_doc_id: ID of the document to delete.
        index_name: Vespa index the document's chunks live in.
    """
    doc_params = {"doc_id": current_doc_id}
    try:
        bind = op.get_bind()

        # All database work happens inside a SAVEPOINT. Errors are swallowed
        # below, and on PostgreSQL a failed statement aborts the surrounding
        # transaction; rolling back to the savepoint keeps the transaction
        # usable for the documents processed afterwards.
        with bind.begin_nested():
            # Delete from agent-related tables first (order matters due to foreign keys):
            # both link tables reference search_doc, so they go before it.
            for link_table in (
                "agent__sub_query__search_doc",
                "chat_message__search_doc",
            ):
                bind.execute(
                    sa.text(
                        f"""
                        DELETE FROM {link_table}
                        WHERE search_doc_id IN (
                        SELECT id FROM search_doc WHERE document_id = :doc_id
                        )
                        """
                    ),
                    doc_params,
                )

            # Now search_doc and the other tables that reference this document.
            # (table, column) pairs; names are hard-coded constants.
            for table, column in (
                ("search_doc", "document_id"),
                ("document_by_connector_credential_pair", "id"),
                ("document_retrieval_feedback", "document_id"),
                ("document__tag", "document_id"),
                ("user_file", "document_id"),
            ):
                bind.execute(
                    sa.text(f"DELETE FROM {table} WHERE {column} = :doc_id"),
                    doc_params,
                )

            # Delete from KG tables if they exist. Best-effort, in its own
            # SAVEPOINT so a missing table does not poison the statements
            # that follow.
            try:
                with bind.begin_nested():
                    for table, column in (
                        ("kg_entity", "document_id"),
                        ("kg_entity_extraction_staging", "document_id"),
                        ("kg_relationship", "source_document"),
                        ("kg_relationship_extraction_staging", "source_document"),
                        ("chunk_stats", "document_id"),
                    ):
                        bind.execute(
                            sa.text(f"DELETE FROM {table} WHERE {column} = :doc_id"),
                            doc_params,
                        )

                    bind.execute(
                        sa.text("DELETE FROM chunk_stats WHERE id LIKE :doc_id_pattern"),
                        {"doc_id_pattern": f"{current_doc_id}__%"},
                    )
            except Exception as e:
                logger.warning(
                    f"Some KG/chunk tables may not exist or failed to delete from: {e}"
                )

            # Finally delete the document itself
            bind.execute(
                sa.text("DELETE FROM document WHERE id = :doc_id"),
                doc_params,
            )

        # Delete chunks from vespa
        delete_document_chunks_from_vespa(index_name, current_doc_id)

    except Exception as e:
        # Continue with other documents instead of failing the entire migration
        print(f"Failed to delete duplicate document {current_doc_id}: {e}")
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Canonicalize Google Drive document IDs in the database and in Vespa.

    Does nothing when ``SKIP_CANON_DRIVE_IDS`` is set or when there are no
    Google Drive documents. For each document the ID is normalized; a
    document whose normalized ID was already seen is deleted as a duplicate,
    a document whose ID is already normalized is left alone, and every other
    document is renamed in the database and then in Vespa. A failure for one
    document is printed and the migration moves on to the next.
    """
    if SKIP_CANON_DRIVE_IDS:
        return

    current_search_settings, future_search_settings = active_search_settings()
    document_index = get_default_document_index(
        current_search_settings,
        future_search_settings,
    )

    # Use the index's own name, or the default one if it does not expose any
    index_name = getattr(document_index, "index_name", "danswer_index")

    # Get all Google Drive documents from the database (this is faster and more reliable)
    gdrive_documents = get_google_drive_documents_from_database()
    if not gdrive_documents:
        return

    # Normalized IDs encountered so far, used to detect duplicates
    seen_normalized_ids = set()
    updated_count = 0

    for doc_info in gdrive_documents:
        current_doc_id = doc_info["document_id"]
        normalized_doc_id = normalize_google_drive_url(current_doc_id)

        print(f"Processing document {current_doc_id} -> {normalized_doc_id}")

        # A second document mapping to the same normalized ID is a duplicate
        if normalized_doc_id in seen_normalized_ids:
            delete_document_from_db(current_doc_id, index_name)
            continue
        seen_normalized_ids.add(normalized_doc_id)

        # Nothing to do when the ID is already in canonical form
        if current_doc_id == normalized_doc_id:
            continue

        try:
            # Update both database and Vespa in order.
            # Database first to ensure consistency.
            update_document_id_in_database(
                current_doc_id, normalized_doc_id, index_name
            )
            # Vespa chunks are selected by an exact match on the original document ID
            update_document_id_in_vespa(index_name, current_doc_id, normalized_doc_id)
        except Exception as e:
            print(f"Failed to update document {current_doc_id}: {e}")

            if isinstance(e, HTTPStatusError):
                print(f"HTTPStatusError: {e}")
                print(f"Response: {e.response.text}")
                print(f"Status: {e.response.status_code}")
                print(f"Headers: {e.response.headers}")
                print(f"Request: {e.request.url}")
                print(f"Request headers: {e.request.headers}")
            # Note: Rollback is complex with copy-and-swap approach since the old document
            # is already deleted. In case of failure, manual intervention may be required.
            # Continue with other documents instead of failing the entire migration.
            continue
        else:
            updated_count += 1

    logger.info(f"Migration complete. Updated {updated_count} Google Drive documents")
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Do nothing: this is a one way migration, so there is no downgrade.

    It wouldn't make sense to store the extra query parameters and
    duplicate documents to allow a reversal.
    """
    return None
|
||||
@@ -144,27 +144,34 @@ def upgrade() -> None:
|
||||
|
||||
def downgrade() -> None:
|
||||
op.execute("TRUNCATE TABLE index_attempt")
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column("input_type", sa.VARCHAR(), autoincrement=False, nullable=False),
|
||||
)
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column("source", sa.VARCHAR(), autoincrement=False, nullable=False),
|
||||
)
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column(
|
||||
"connector_specific_config",
|
||||
postgresql.JSONB(astext_type=sa.Text()),
|
||||
autoincrement=False,
|
||||
nullable=False,
|
||||
),
|
||||
)
|
||||
|
||||
# Check if the constraint exists before dropping
|
||||
conn = op.get_bind()
|
||||
inspector = sa.inspect(conn)
|
||||
existing_columns = {col["name"] for col in inspector.get_columns("index_attempt")}
|
||||
|
||||
if "input_type" not in existing_columns:
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column("input_type", sa.VARCHAR(), autoincrement=False, nullable=False),
|
||||
)
|
||||
|
||||
if "source" not in existing_columns:
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column("source", sa.VARCHAR(), autoincrement=False, nullable=False),
|
||||
)
|
||||
|
||||
if "connector_specific_config" not in existing_columns:
|
||||
op.add_column(
|
||||
"index_attempt",
|
||||
sa.Column(
|
||||
"connector_specific_config",
|
||||
postgresql.JSONB(astext_type=sa.Text()),
|
||||
autoincrement=False,
|
||||
nullable=False,
|
||||
),
|
||||
)
|
||||
|
||||
# Check if the constraint exists before dropping
|
||||
constraints = inspector.get_foreign_keys("index_attempt")
|
||||
|
||||
if any(
|
||||
@@ -183,8 +190,12 @@ def downgrade() -> None:
|
||||
"fk_index_attempt_connector_id", "index_attempt", type_="foreignkey"
|
||||
)
|
||||
|
||||
op.drop_column("index_attempt", "credential_id")
|
||||
op.drop_column("index_attempt", "connector_id")
|
||||
op.drop_table("connector_credential_pair")
|
||||
op.drop_table("credential")
|
||||
op.drop_table("connector")
|
||||
if "credential_id" in existing_columns:
|
||||
op.drop_column("index_attempt", "credential_id")
|
||||
|
||||
if "connector_id" in existing_columns:
|
||||
op.drop_column("index_attempt", "connector_id")
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS connector_credential_pair CASCADE")
|
||||
op.execute("DROP TABLE IF EXISTS credential CASCADE")
|
||||
op.execute("DROP TABLE IF EXISTS connector CASCADE")
|
||||
|
||||
@@ -0,0 +1,115 @@
|
||||
"""add_indexing_coordination
|
||||
|
||||
Revision ID: 2f95e36923e6
|
||||
Revises: 0816326d83aa
|
||||
Create Date: 2025-07-10 16:17:57.762182
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "2f95e36923e6"
|
||||
down_revision = "0816326d83aa"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add coordination, batch, progress and heartbeat columns to index_attempt.

    Columns are added in the order listed, followed by an index over
    (connector_credential_pair_id, search_settings_id, status).
    """
    new_columns = [
        # Database-based coordination fields (replacing Redis fencing)
        sa.Column("celery_task_id", sa.String(), nullable=True),
        sa.Column(
            "cancellation_requested",
            sa.Boolean(),
            nullable=False,
            server_default="false",
        ),
        # Batch coordination fields (replacing FileStore state)
        sa.Column("total_batches", sa.Integer(), nullable=True),
        sa.Column(
            "completed_batches", sa.Integer(), nullable=False, server_default="0"
        ),
        sa.Column(
            "total_failures_batch_level",
            sa.Integer(),
            nullable=False,
            server_default="0",
        ),
        sa.Column("total_chunks", sa.Integer(), nullable=False, server_default="0"),
        # Progress tracking for stall detection
        sa.Column("last_progress_time", sa.DateTime(timezone=True), nullable=True),
        sa.Column(
            "last_batches_completed_count",
            sa.Integer(),
            nullable=False,
            server_default="0",
        ),
        # Heartbeat tracking for worker liveness detection
        sa.Column(
            "heartbeat_counter", sa.Integer(), nullable=False, server_default="0"
        ),
        sa.Column(
            "last_heartbeat_value", sa.Integer(), nullable=False, server_default="0"
        ),
        sa.Column("last_heartbeat_time", sa.DateTime(timezone=True), nullable=True),
    ]
    for column in new_columns:
        op.add_column("index_attempt", column)

    # Add index for coordination queries
    op.create_index(
        "ix_index_attempt_active_coordination",
        "index_attempt",
        ["connector_credential_pair_id", "search_settings_id", "status"],
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the coordination index and the columns added by upgrade()."""
    # The index goes first, then the columns in the order listed.
    op.drop_index("ix_index_attempt_active_coordination", table_name="index_attempt")

    columns_to_drop = (
        "last_batches_completed_count",
        "last_progress_time",
        "last_heartbeat_time",
        "last_heartbeat_value",
        "heartbeat_counter",
        "total_chunks",
        "total_failures_batch_level",
        "completed_batches",
        "total_batches",
        "cancellation_requested",
        "celery_task_id",
    )
    for column_name in columns_to_drop:
        op.drop_column("index_attempt", column_name)
|
||||
@@ -0,0 +1,136 @@
|
||||
"""update_kg_trigger_functions
|
||||
|
||||
Revision ID: 36e9220ab794
|
||||
Revises: c9e2cd766c29
|
||||
Create Date: 2025-06-22 17:33:25.833733
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import text
|
||||
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "36e9220ab794"
|
||||
down_revision = "c9e2cd766c29"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def _get_tenant_contextvar(session: Session) -> str:
    """Return the schema the migration is currently running against.

    The value comes from ``SELECT current_schema()``.

    Raises:
        ValueError: If the query does not return a string.
    """
    schema_name = session.execute(text("SELECT current_schema()")).scalar()
    if not isinstance(schema_name, str):
        raise ValueError("Current tenant is not a string")
    return schema_name
|
||||
|
||||
|
||||
def upgrade() -> None:
    """(Re)create the two trigger functions and triggers that maintain kg_entity names.

    Both functions are created in the schema returned by
    ``SELECT current_schema()`` and call ``show_trgm`` from
    ``POSTGRES_DEFAULT_SCHEMA``. Existing triggers of the same name are
    dropped before being created again.
    """

    bind = op.get_bind()
    # The session is only used to look up the current schema.
    session = Session(bind=bind)

    # Create kg_entity trigger to update kg_entity.name and its trigrams
    tenant_id = _get_tenant_contextvar(session)
    # Characters matching this pattern are stripped before trigrams are computed.
    alphanum_pattern = r"[^a-z0-9]+"
    # Cleaned names longer than this are cut to this length.
    truncate_length = 1000
    function = "update_kg_entity_name"
    op.execute(
        text(
            f"""
            CREATE OR REPLACE FUNCTION "{tenant_id}".{function}()
            RETURNS TRIGGER AS $$
            DECLARE
                name text;
                cleaned_name text;
            BEGIN
                -- Set name to semantic_id if document_id is not NULL
                IF NEW.document_id IS NOT NULL THEN
                    SELECT lower(semantic_id) INTO name
                    FROM "{tenant_id}".document
                    WHERE id = NEW.document_id;
                ELSE
                    name = lower(NEW.name);
                END IF;

                -- Clean name and truncate if too long
                cleaned_name = regexp_replace(
                    name,
                    '{alphanum_pattern}', '', 'g'
                );
                IF length(cleaned_name) > {truncate_length} THEN
                    cleaned_name = left(cleaned_name, {truncate_length});
                END IF;

                -- Set name and name trigrams
                NEW.name = name;
                NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name);
                RETURN NEW;
            END;
            $$ LANGUAGE plpgsql;
            """
        )
    )
    trigger = f"{function}_trigger"
    # Drop first so the CREATE TRIGGER below does not fail on an existing trigger.
    op.execute(f'DROP TRIGGER IF EXISTS {trigger} ON "{tenant_id}".kg_entity')
    op.execute(
        f"""
        CREATE TRIGGER {trigger}
            BEFORE INSERT OR UPDATE OF name
            ON "{tenant_id}".kg_entity
            FOR EACH ROW
            EXECUTE FUNCTION "{tenant_id}".{function}();
        """
    )

    # Create document trigger that pushes a changed semantic_id to the name
    # and name trigrams of every kg_entity row referencing that document
    function = "update_kg_entity_name_from_doc"
    op.execute(
        text(
            f"""
            CREATE OR REPLACE FUNCTION "{tenant_id}".{function}()
            RETURNS TRIGGER AS $$
            DECLARE
                doc_name text;
                cleaned_name text;
            BEGIN
                doc_name = lower(NEW.semantic_id);

                -- Clean name and truncate if too long
                cleaned_name = regexp_replace(
                    doc_name,
                    '{alphanum_pattern}', '', 'g'
                );
                IF length(cleaned_name) > {truncate_length} THEN
                    cleaned_name = left(cleaned_name, {truncate_length});
                END IF;

                -- Set name and name trigrams for all entities referencing this document
                UPDATE "{tenant_id}".kg_entity
                SET
                    name = doc_name,
                    name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name)
                WHERE document_id = NEW.id;
                RETURN NEW;
            END;
            $$ LANGUAGE plpgsql;
            """
        )
    )
    trigger = f"{function}_trigger"
    # Drop first so the CREATE TRIGGER below does not fail on an existing trigger.
    op.execute(f'DROP TRIGGER IF EXISTS {trigger} ON "{tenant_id}".document')
    op.execute(
        f"""
        CREATE TRIGGER {trigger}
            AFTER UPDATE OF semantic_id
            ON "{tenant_id}".document
            FOR EACH ROW
            EXECUTE FUNCTION "{tenant_id}".{function}();
        """
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Do nothing; the trigger functions and triggers created by upgrade() are left in place."""
    pass
|
||||
@@ -21,22 +21,14 @@ depends_on = None
|
||||
# an outage by creating an index without using CONCURRENTLY. This migration:
|
||||
#
|
||||
# 1. Creates more efficient full-text search capabilities using tsvector columns and GIN indexes
|
||||
# 2. Uses CONCURRENTLY for all index creation to prevent table locking
|
||||
# 3. Explicitly manages transactions with COMMIT statements to allow CONCURRENTLY to work
|
||||
# (see: https://www.postgresql.org/docs/9.4/sql-createindex.html#SQL-CREATEINDEX-CONCURRENTLY)
|
||||
# (see: https://github.com/sqlalchemy/alembic/issues/277)
|
||||
# 4. Adds indexes to both chat_message and chat_session tables for comprehensive search
|
||||
# 2. Adds indexes to both chat_message and chat_session tables for comprehensive search
|
||||
# 3. Note: CONCURRENTLY was removed due to operational issues
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# First, drop any existing indexes to avoid conflicts
|
||||
op.execute("COMMIT")
|
||||
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_message_tsv;")
|
||||
|
||||
op.execute("COMMIT")
|
||||
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_session_desc_tsv;")
|
||||
|
||||
op.execute("COMMIT")
|
||||
op.execute("DROP INDEX IF EXISTS idx_chat_message_tsv;")
|
||||
op.execute("DROP INDEX IF EXISTS idx_chat_session_desc_tsv;")
|
||||
op.execute("DROP INDEX IF EXISTS idx_chat_message_message_lower;")
|
||||
|
||||
# Drop existing columns if they exist
|
||||
@@ -52,12 +44,9 @@ def upgrade() -> None:
|
||||
"""
|
||||
)
|
||||
|
||||
# Commit the current transaction before creating concurrent indexes
|
||||
op.execute("COMMIT")
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_message_tsv
|
||||
CREATE INDEX IF NOT EXISTS idx_chat_message_tsv
|
||||
ON chat_message
|
||||
USING GIN (message_tsv)
|
||||
"""
|
||||
@@ -72,12 +61,9 @@ def upgrade() -> None:
|
||||
"""
|
||||
)
|
||||
|
||||
# Commit again before creating the second concurrent index
|
||||
op.execute("COMMIT")
|
||||
|
||||
op.execute(
|
||||
"""
|
||||
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_session_desc_tsv
|
||||
CREATE INDEX IF NOT EXISTS idx_chat_session_desc_tsv
|
||||
ON chat_session
|
||||
USING GIN (description_tsv)
|
||||
"""
|
||||
@@ -85,12 +71,9 @@ def upgrade() -> None:
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop the indexes first (use CONCURRENTLY for dropping too)
|
||||
op.execute("COMMIT")
|
||||
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_message_tsv;")
|
||||
|
||||
op.execute("COMMIT")
|
||||
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_chat_session_desc_tsv;")
|
||||
# Drop the indexes first
|
||||
op.execute("DROP INDEX IF EXISTS idx_chat_message_tsv;")
|
||||
op.execute("DROP INDEX IF EXISTS idx_chat_session_desc_tsv;")
|
||||
|
||||
# Then drop the columns
|
||||
op.execute("ALTER TABLE chat_message DROP COLUMN IF EXISTS message_tsv;")
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
"""add_doc_metadata_field_in_document_model
|
||||
|
||||
Revision ID: 3fc5d75723b3
|
||||
Revises: 2f95e36923e6
|
||||
Create Date: 2025-07-28 18:45:37.985406
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "3fc5d75723b3"
|
||||
down_revision = "2f95e36923e6"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add the nullable JSONB column ``doc_metadata`` to the ``document`` table."""
    doc_metadata_column = sa.Column(
        "doc_metadata",
        postgresql.JSONB(astext_type=sa.Text()),
        nullable=True,
    )
    op.add_column("document", doc_metadata_column)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Remove the ``doc_metadata`` column from the ``document`` table."""
    table_name, column_name = "document", "doc_metadata"
    op.drop_column(table_name, column_name)
|
||||
@@ -15,7 +15,7 @@ from datetime import datetime, timedelta
|
||||
from onyx.configs.app_configs import DB_READONLY_USER
|
||||
from onyx.configs.app_configs import DB_READONLY_PASSWORD
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE
|
||||
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
@@ -80,6 +80,7 @@ def upgrade() -> None:
|
||||
)
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_config CASCADE")
|
||||
op.create_table(
|
||||
"kg_config",
|
||||
sa.Column("id", sa.Integer(), primary_key=True, nullable=False, index=True),
|
||||
@@ -123,6 +124,7 @@ def upgrade() -> None:
|
||||
],
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_entity_type CASCADE")
|
||||
op.create_table(
|
||||
"kg_entity_type",
|
||||
sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
|
||||
@@ -156,6 +158,7 @@ def upgrade() -> None:
|
||||
),
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_relationship_type CASCADE")
|
||||
# Create KGRelationshipType table
|
||||
op.create_table(
|
||||
"kg_relationship_type",
|
||||
@@ -194,6 +197,7 @@ def upgrade() -> None:
|
||||
),
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_relationship_type_extraction_staging CASCADE")
|
||||
# Create KGRelationshipTypeExtractionStaging table
|
||||
op.create_table(
|
||||
"kg_relationship_type_extraction_staging",
|
||||
@@ -227,6 +231,8 @@ def upgrade() -> None:
|
||||
),
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_entity CASCADE")
|
||||
|
||||
# Create KGEntity table
|
||||
op.create_table(
|
||||
"kg_entity",
|
||||
@@ -281,6 +287,7 @@ def upgrade() -> None:
|
||||
"ix_entity_name_search", "kg_entity", ["name", "entity_type_id_name"]
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_entity_extraction_staging CASCADE")
|
||||
# Create KGEntityExtractionStaging table
|
||||
op.create_table(
|
||||
"kg_entity_extraction_staging",
|
||||
@@ -330,6 +337,7 @@ def upgrade() -> None:
|
||||
["name", "entity_type_id_name"],
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_relationship CASCADE")
|
||||
# Create KGRelationship table
|
||||
op.create_table(
|
||||
"kg_relationship",
|
||||
@@ -371,6 +379,7 @@ def upgrade() -> None:
|
||||
"ix_kg_relationship_nodes", "kg_relationship", ["source_node", "target_node"]
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_relationship_extraction_staging CASCADE")
|
||||
# Create KGRelationshipExtractionStaging table
|
||||
op.create_table(
|
||||
"kg_relationship_extraction_staging",
|
||||
@@ -414,6 +423,7 @@ def upgrade() -> None:
|
||||
["source_node", "target_node"],
|
||||
)
|
||||
|
||||
op.execute("DROP TABLE IF EXISTS kg_term CASCADE")
|
||||
# Create KGTerm table
|
||||
op.create_table(
|
||||
"kg_term",
|
||||
@@ -467,11 +477,11 @@ def upgrade() -> None:
|
||||
|
||||
# Create GIN index for clustering and normalization
|
||||
op.execute(
|
||||
"CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_kg_entity_clustering_trigrams "
|
||||
f"ON kg_entity USING GIN (name {POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE}.gin_trgm_ops)"
|
||||
"CREATE INDEX IF NOT EXISTS idx_kg_entity_clustering_trigrams "
|
||||
f"ON kg_entity USING GIN (name {POSTGRES_DEFAULT_SCHEMA}.gin_trgm_ops)"
|
||||
)
|
||||
op.execute(
|
||||
"CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_kg_entity_normalization_trigrams "
|
||||
"CREATE INDEX IF NOT EXISTS idx_kg_entity_normalization_trigrams "
|
||||
"ON kg_entity USING GIN (name_trigrams)"
|
||||
)
|
||||
|
||||
@@ -508,7 +518,7 @@ def upgrade() -> None:
|
||||
|
||||
-- Set name and name trigrams
|
||||
NEW.name = name;
|
||||
NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE}.show_trgm(cleaned_name);
|
||||
NEW.name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name);
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
@@ -553,7 +563,7 @@ def upgrade() -> None:
|
||||
UPDATE kg_entity
|
||||
SET
|
||||
name = doc_name,
|
||||
name_trigrams = {POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE}.show_trgm(cleaned_name)
|
||||
name_trigrams = {POSTGRES_DEFAULT_SCHEMA}.show_trgm(cleaned_name)
|
||||
WHERE document_id = NEW.id;
|
||||
RETURN NEW;
|
||||
END;
|
||||
@@ -625,9 +635,8 @@ def downgrade() -> None:
|
||||
op.execute(f"DROP FUNCTION IF EXISTS {function}()")
|
||||
|
||||
# Drop index
|
||||
op.execute("COMMIT") # Commit to allow CONCURRENTLY
|
||||
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_kg_entity_clustering_trigrams")
|
||||
op.execute("DROP INDEX CONCURRENTLY IF EXISTS idx_kg_entity_normalization_trigrams")
|
||||
op.execute("DROP INDEX IF EXISTS idx_kg_entity_clustering_trigrams")
|
||||
op.execute("DROP INDEX IF EXISTS idx_kg_entity_normalization_trigrams")
|
||||
|
||||
# Drop tables in reverse order of creation to handle dependencies
|
||||
op.drop_table("kg_term")
|
||||
|
||||
@@ -0,0 +1,90 @@
|
||||
"""add stale column to external user group tables
|
||||
|
||||
Revision ID: 58c50ef19f08
|
||||
Revises: 7b9b952abdf6
|
||||
Create Date: 2025-06-25 14:08:14.162380
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "58c50ef19f08"
|
||||
down_revision = "7b9b952abdf6"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add a ``stale`` flag plus two supporting indexes to each external user group table.

    Both ``user__external_user_group_id`` and ``public_external_user_group``
    receive the same treatment, in that order:
      * a NOT NULL boolean ``stale`` column with server default ``false``
      * a composite index on ``(cc_pair_id, stale)``
      * a single-column index on ``stale``
    """
    # (table, composite index name, stale-only index name)
    targets = (
        (
            "user__external_user_group_id",
            "ix_user__external_user_group_id_cc_pair_id_stale",
            "ix_user__external_user_group_id_stale",
        ),
        (
            "public_external_user_group",
            "ix_public_external_user_group_cc_pair_id_stale",
            "ix_public_external_user_group_stale",
        ),
    )

    for table_name, cc_pair_stale_index, stale_index in targets:
        # The server default lets the NOT NULL column be added to populated tables
        stale_column = sa.Column(
            "stale", sa.Boolean(), nullable=False, server_default="false"
        )
        op.add_column(table_name, stale_column)

        # Efficient lookup of stale rows for one cc_pair_id
        op.create_index(
            cc_pair_stale_index, table_name, ["cc_pair_id", "stale"], unique=False
        )

        # Efficient lookup of all stale rows
        op.create_index(stale_index, table_name, ["stale"], unique=False)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the ``stale`` indexes and column from both external user group tables.

    ``public_external_user_group`` is handled first, then
    ``user__external_user_group_id``. For each table the stale-only index is
    dropped, then the composite index, then the column itself.
    """
    # (table, stale-only index name, composite index name)
    targets = (
        (
            "public_external_user_group",
            "ix_public_external_user_group_stale",
            "ix_public_external_user_group_cc_pair_id_stale",
        ),
        (
            "user__external_user_group_id",
            "ix_user__external_user_group_id_stale",
            "ix_user__external_user_group_id_cc_pair_id_stale",
        ),
    )

    for table_name, stale_index, cc_pair_stale_index in targets:
        # Indexes go first, then the column they cover
        for index_name in (stale_index, cc_pair_stale_index):
            op.drop_index(index_name, table_name=table_name)
        op.drop_column(table_name, "stale")
|
||||
@@ -0,0 +1,115 @@
|
||||
"""add research agent database tables and chat message research fields
|
||||
|
||||
Revision ID: 5ae8240accb3
|
||||
Revises: b558f51620b4
|
||||
Create Date: 2025-08-06 14:29:24.691388
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "5ae8240accb3"
|
||||
down_revision = "b558f51620b4"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add research fields to ``chat_message`` and create the research agent tables.

    * ``chat_message`` gains nullable ``research_type`` (string) and
      ``research_plan`` (JSONB) columns.
    * ``research_agent_iteration`` holds one row per
      ``(primary_question_id, iteration_nr)``, enforced by a unique constraint.
    * ``research_agent_iteration_sub_step`` references that pair through a
      composite foreign key and may point at a parent sub step and a tool.
    """

    def primary_question_column():
        # Builds a new Column on every call so each table gets its own instance
        return sa.Column(
            "primary_question_id",
            sa.Integer(),
            sa.ForeignKey("chat_message.id", ondelete="CASCADE"),
            nullable=False,
        )

    def created_at_column():
        # Timezone-aware timestamp populated by the database on insert
        return sa.Column(
            "created_at",
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        )

    # New research columns on chat_message
    research_type = sa.Column("research_type", sa.String(), nullable=True)
    research_plan = sa.Column("research_plan", postgresql.JSONB(), nullable=True)
    for new_column in (research_type, research_plan):
        op.add_column("chat_message", new_column)

    # research_agent_iteration
    op.create_table(
        "research_agent_iteration",
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        primary_question_column(),
        sa.Column("iteration_nr", sa.Integer(), nullable=False),
        created_at_column(),
        sa.Column("purpose", sa.String(), nullable=True),
        sa.Column("reasoning", sa.String(), nullable=True),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint(
            "primary_question_id",
            "iteration_nr",
            name="_research_agent_iteration_unique_constraint",
        ),
    )

    # research_agent_iteration_sub_step
    parent_sub_step_fk = sa.ForeignKey(
        "research_agent_iteration_sub_step.id", ondelete="CASCADE"
    )
    # Composite FK onto the pair made unique by the constraint above
    iteration_fk = sa.ForeignKeyConstraint(
        ["primary_question_id", "iteration_nr"],
        [
            "research_agent_iteration.primary_question_id",
            "research_agent_iteration.iteration_nr",
        ],
        ondelete="CASCADE",
    )
    op.create_table(
        "research_agent_iteration_sub_step",
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        primary_question_column(),
        sa.Column(
            "parent_question_id", sa.Integer(), parent_sub_step_fk, nullable=True
        ),
        sa.Column("iteration_nr", sa.Integer(), nullable=False),
        sa.Column("iteration_sub_step_nr", sa.Integer(), nullable=False),
        created_at_column(),
        sa.Column("sub_step_instructions", sa.String(), nullable=True),
        sa.Column(
            "sub_step_tool_id", sa.Integer(), sa.ForeignKey("tool.id"), nullable=True
        ),
        sa.Column("reasoning", sa.String(), nullable=True),
        sa.Column("sub_answer", sa.String(), nullable=True),
        sa.Column("cited_doc_results", postgresql.JSONB(), nullable=True),
        sa.Column("claims", postgresql.JSONB(), nullable=True),
        sa.Column("generated_images", postgresql.JSONB(), nullable=True),
        sa.Column("additional_data", postgresql.JSONB(), nullable=True),
        sa.PrimaryKeyConstraint("id"),
        iteration_fk,
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the research agent tables and the research columns on ``chat_message``."""
    # Sub steps reference iterations, so the dependent table goes first
    for table_name in (
        "research_agent_iteration_sub_step",
        "research_agent_iteration",
    ):
        op.drop_table(table_name)

    # Reverse order of how the columns were added
    for column_name in ("research_plan", "research_type"):
        op.drop_column("chat_message", column_name)
|
||||
@@ -0,0 +1,132 @@
|
||||
"""add file names to file connector config
|
||||
|
||||
Revision ID: 62c3a055a141
|
||||
Revises: 3fc5d75723b3
|
||||
Create Date: 2025-07-30 17:01:24.417551
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
import json
|
||||
import os
|
||||
import logging
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "62c3a055a141"
|
||||
down_revision = "3fc5d75723b3"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
SKIP_FILE_NAME_MIGRATION = (
|
||||
os.environ.get("SKIP_FILE_NAME_MIGRATION", "true").lower() == "true"
|
||||
)
|
||||
|
||||
logger = logging.getLogger("alembic.runtime.migration")
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Backfill a ``file_names`` list into the config of every FILE connector.

    The migration is a no-op when ``SKIP_FILE_NAME_MIGRATION`` is truthy.
    Otherwise, for each connector whose ``source`` is ``'FILE'``, every id in
    the config's ``file_locations`` is resolved to ``file_record.display_name``
    and the resulting list is stored under ``file_names`` in the connector's
    ``connector_specific_config``. An id with no ``file_record`` row is kept
    as-is in the list.
    """
    if SKIP_FILE_NAME_MIGRATION:
        logger.info(
            "Skipping file name migration. Hint: set SKIP_FILE_NAME_MIGRATION=false to run this migration"
        )
        return

    logger.info("Running file name migration")
    bind = op.get_bind()

    select_file_connectors = sa.text(
        """
        SELECT id, connector_specific_config
        FROM connector
        WHERE source = 'FILE'
        """
    )
    select_display_name = sa.text(
        """
        SELECT display_name
        FROM file_record
        WHERE file_id = :file_id
        """
    )
    update_connector_config = sa.text(
        """
        UPDATE connector
        SET connector_specific_config = :new_config
        WHERE id = :connector_id
        """
    )

    connector_rows = bind.execute(select_file_connectors).fetchall()
    for connector_id, raw_config in connector_rows:
        # The driver may hand back the JSON column as text or as a parsed object
        config = json.loads(raw_config) if isinstance(raw_config, str) else raw_config

        file_names = []
        for file_id in config.get("file_locations", []):
            record = bind.execute(
                select_display_name, {"file_id": file_id}
            ).fetchone()
            # Falling back to the raw id should not happen in practice
            file_names.append(record[0] if record else file_id)

        # Copy so the fetched config object itself is left untouched
        updated_config = {**config, "file_names": file_names}
        bind.execute(
            update_connector_config,
            {"connector_id": connector_id, "new_config": json.dumps(updated_config)},
        )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Strip the ``file_names`` key from the config of every FILE connector.

    Connectors whose config has no ``file_names`` key are left untouched.
    """
    bind = op.get_bind()

    select_file_connectors = sa.text(
        """
        SELECT id, connector_specific_config
        FROM connector
        WHERE source = 'FILE'
        """
    )
    update_connector_config = sa.text(
        """
        UPDATE connector
        SET connector_specific_config = :new_config
        WHERE id = :connector_id
        """
    )

    for connector_id, raw_config in bind.execute(select_file_connectors).fetchall():
        # The driver may hand back the JSON column as text or as a parsed object
        config = json.loads(raw_config) if isinstance(raw_config, str) else raw_config

        if "file_names" not in config:
            continue

        # Rebuild without the key rather than mutating the fetched object
        stripped_config = {
            key: value for key, value in config.items() if key != "file_names"
        }
        bind.execute(
            update_connector_config,
            {
                "connector_id": connector_id,
                "new_config": json.dumps(stripped_config),
            },
        )
|
||||
318
backend/alembic/versions/7b9b952abdf6_update_entities.py
Normal file
318
backend/alembic/versions/7b9b952abdf6_update_entities.py
Normal file
@@ -0,0 +1,318 @@
|
||||
"""update-entities
|
||||
|
||||
Revision ID: 7b9b952abdf6
|
||||
Revises: 36e9220ab794
|
||||
Create Date: 2025-06-23 20:24:08.139201
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "7b9b952abdf6"
|
||||
down_revision = "36e9220ab794"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Replace ``metadata_attributes`` with ``metadata_attribute_conversion`` on KG entity types.

    For every row in ``kg_entity_type``:
      * if its ``id_name`` is not a key of the conversion table below, the row
        is deleted;
      * otherwise any ``metadata_attributes`` key is removed from its
        ``attributes`` and ``metadata_attribute_conversion`` is set to the
        table entry for that entity type.

    Each conversion entry maps a source metadata key to
    ``{"name", "keep", "implication_property"}``.

    All statements use bound parameters. The previous implementation spliced
    the JSON and the entity type name into the SQL text, which relied on
    manual quote doubling and let ``sa.text()`` misread ``:word`` sequences
    inside the JSON as bind parameters.
    """

    def kept(name):
        # Attribute stored on the entity under ``name``; implies no relationship
        return {"name": name, "keep": True, "implication_property": None}

    def implied(name, implied_entity_type, implied_relationship_name):
        # Attribute not kept on the entity; it implies a relationship instead
        return {
            "name": name,
            "keep": False,
            "implication_property": {
                "implied_entity_type": implied_entity_type,
                "implied_relationship_name": implied_relationship_name,
            },
        }

    conn = op.get_bind()

    # new entity type metadata_attribute_conversion
    new_entity_type_conversion = {
        "LINEAR": {
            "team": kept("team"),
            "state": kept("state"),
            "priority": kept("priority"),
            "estimate": kept("estimate"),
            "created_at": kept("created_at"),
            "started_at": kept("started_at"),
            "completed_at": kept("completed_at"),
            "due_date": kept("due_date"),
            "creator": implied("creator", "from_email", "is_creator_of"),
            "assignee": implied("assignee", "from_email", "is_assignee_of"),
        },
        "JIRA": {
            "issuetype": kept("subtype"),
            "status": kept("status"),
            "priority": kept("priority"),
            "project_name": kept("project"),
            "created": kept("created_at"),
            "updated": kept("updated_at"),
            "resolution_date": kept("completed_at"),
            "duedate": kept("due_date"),
            "reporter_email": implied("creator", "from_email", "is_creator_of"),
            "assignee_email": implied("assignee", "from_email", "is_assignee_of"),
            "key": kept("key"),
            "parent": kept("parent"),
        },
        "GITHUB_PR": {
            "repo": kept("repository"),
            "state": kept("state"),
            "num_commits": kept("num_commits"),
            "num_files_changed": kept("num_files_changed"),
            "labels": kept("labels"),
            "merged": kept("merged"),
            "merged_at": kept("merged_at"),
            "closed_at": kept("closed_at"),
            "created_at": kept("created_at"),
            "updated_at": kept("updated_at"),
            "user": implied("creator", "from_email", "is_creator_of"),
            "assignees": implied("assignees", "from_email", "is_assignee_of"),
        },
        "GITHUB_ISSUE": {
            "repo": kept("repository"),
            "state": kept("state"),
            "labels": kept("labels"),
            "closed_at": kept("closed_at"),
            "created_at": kept("created_at"),
            "updated_at": kept("updated_at"),
            "user": implied("creator", "from_email", "is_creator_of"),
            "assignees": implied("assignees", "from_email", "is_assignee_of"),
        },
        "FIREFLIES": {},
        "ACCOUNT": {},
        "OPPORTUNITY": {
            "name": kept("name"),
            "stage_name": kept("stage"),
            "type": kept("type"),
            "amount": kept("amount"),
            "fiscal_year": kept("fiscal_year"),
            "fiscal_quarter": kept("fiscal_quarter"),
            "is_closed": kept("is_closed"),
            "close_date": kept("close_date"),
            "probability": kept("close_probability"),
            "created_date": kept("created_at"),
            "last_modified_date": kept("updated_at"),
            "account": implied("account", "ACCOUNT", "is_account_of"),
        },
        "VENDOR": {},
        "EMPLOYEE": {},
    }

    delete_entity_type = sa.text("DELETE FROM kg_entity_type WHERE id_name = :id_name")
    update_attributes = sa.text(
        "UPDATE kg_entity_type SET attributes = :attributes WHERE id_name = :id_name"
    )

    current_entity_types = conn.execute(
        sa.text("SELECT id_name, attributes from kg_entity_type")
    ).all()
    for entity_type, attributes in current_entity_types:
        # delete removed entity types
        if entity_type not in new_entity_type_conversion:
            conn.execute(delete_entity_type, {"id_name": entity_type})
            continue

        # update entity type attributes
        attributes.pop("metadata_attributes", None)
        attributes["metadata_attribute_conversion"] = new_entity_type_conversion[
            entity_type
        ]
        conn.execute(
            update_attributes,
            {"attributes": json.dumps(attributes), "id_name": entity_type},
        )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Convert ``metadata_attribute_conversion`` back into ``metadata_attributes``.

    For every row in ``kg_entity_type`` the ``metadata_attribute_conversion``
    key (if present) is removed from ``attributes`` and ``metadata_attributes``
    is set to a ``{source_key: name}`` mapping built from the conversion
    entries whose ``keep`` flag is truthy. Rows without a conversion end up
    with an empty ``metadata_attributes`` mapping. Entity types deleted by the
    upgrade are not restored.

    The UPDATE uses bound parameters instead of splicing the JSON and the
    entity type name into the SQL text.
    """
    conn = op.get_bind()

    update_attributes = sa.text(
        "UPDATE kg_entity_type SET attributes = :attributes WHERE id_name = :id_name"
    )

    current_entity_types = conn.execute(
        sa.text("SELECT id_name, attributes from kg_entity_type")
    ).all()
    for entity_type, attributes in current_entity_types:
        conversion = attributes.pop("metadata_attribute_conversion", {})
        # Only attributes flagged as kept existed in the old flat format
        attributes["metadata_attributes"] = {
            attr: prop["name"] for attr, prop in conversion.items() if prop["keep"]
        }
        conn.execute(
            update_attributes,
            {"attributes": json.dumps(attributes), "id_name": entity_type},
        )
|
||||
@@ -0,0 +1,249 @@
|
||||
"""add_mcp_server_and_connection_config_models
|
||||
|
||||
Revision ID: 7ed603b64d5a
|
||||
Revises: b329d00a9ea6
|
||||
Create Date: 2025-07-28 17:35:59.900680
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
from onyx.db.enums import MCPAuthenticationType
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "7ed603b64d5a"
|
||||
down_revision = "b329d00a9ea6"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create the MCP server schema and adjust related foreign keys.

    In order:
      1. ``mcp_server`` table, created without the FK to connection configs.
      2. ``mcp_connection_config`` table with a cascading FK to ``mcp_server``,
         plus two indexes on it.
      3. FK from ``mcp_server.admin_connection_config_id`` back to
         ``mcp_connection_config`` (SET NULL on delete).
      4. Association tables ``mcp_server__user`` and ``mcp_server__user_group``.
      5. ``tool.mcp_server_id`` (cascading FK) and ``tool.mcp_input_schema``.
      6. ``persona__tool`` foreign keys recreated with ON DELETE CASCADE.
      7. ``research_agent_iteration_sub_step.sub_step_tool_id`` FK recreated
         with ON DELETE SET NULL.
    """

    def timestamp_column(column_name):
        # Builds a new timezone-aware, database-defaulted Column on every call
        return sa.Column(
            column_name,
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),  # type: ignore
            nullable=False,
        )

    def mcp_server_cascade_fk():
        # Rows referencing an MCP server are removed together with the server
        return sa.ForeignKeyConstraint(
            ["mcp_server_id"], ["mcp_server.id"], ondelete="CASCADE"
        )

    # 1. MCP Server main table (no FK constraints yet to avoid circular refs)
    auth_type = sa.Enum(
        MCPAuthenticationType,
        name="mcp_authentication_type",
        native_enum=False,
    )
    op.create_table(
        "mcp_server",
        sa.Column("id", sa.Integer(), primary_key=True),
        sa.Column("owner", sa.String(), nullable=False),
        sa.Column("name", sa.String(), nullable=False),
        sa.Column("description", sa.String(), nullable=True),
        sa.Column("server_url", sa.String(), nullable=False),
        sa.Column("auth_type", auth_type, nullable=False),
        sa.Column("admin_connection_config_id", sa.Integer(), nullable=True),
        timestamp_column("created_at"),
        timestamp_column("updated_at"),
    )

    # 2. MCP Connection Config table (can reference mcp_server now that it exists)
    op.create_table(
        "mcp_connection_config",
        sa.Column("id", sa.Integer(), primary_key=True),
        sa.Column("mcp_server_id", sa.Integer(), nullable=True),
        sa.Column("user_email", sa.String(), nullable=False, default=""),
        sa.Column("config", sa.LargeBinary(), nullable=False),
        timestamp_column("created_at"),
        timestamp_column("updated_at"),
        mcp_server_cascade_fk(),
    )

    # Helpful indexes
    connection_config_indexes = (
        ("ix_mcp_connection_config_server_user", ["mcp_server_id", "user_email"]),
        ("ix_mcp_connection_config_user_email", ["user_email"]),
    )
    for index_name, indexed_columns in connection_config_indexes:
        op.create_index(index_name, "mcp_connection_config", indexed_columns)

    # 3. Add the back-references from mcp_server to connection configs
    op.create_foreign_key(
        "mcp_server_admin_config_fk",
        "mcp_server",
        "mcp_connection_config",
        ["admin_connection_config_id"],
        ["id"],
        ondelete="SET NULL",
    )

    # 4. Association / access-control tables
    op.create_table(
        "mcp_server__user",
        sa.Column("mcp_server_id", sa.Integer(), primary_key=True),
        sa.Column("user_id", sa.UUID(), primary_key=True),
        mcp_server_cascade_fk(),
        sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
    )
    op.create_table(
        "mcp_server__user_group",
        sa.Column("mcp_server_id", sa.Integer(), primary_key=True),
        sa.Column("user_group_id", sa.Integer(), primary_key=True),
        mcp_server_cascade_fk(),
        sa.ForeignKeyConstraint(["user_group_id"], ["user_group.id"]),
    )

    # 5. Update existing `tool` table – allow tools to belong to an MCP server,
    #    and store the MCP tool input schema
    tool_columns = (
        sa.Column("mcp_server_id", sa.Integer(), nullable=True),
        sa.Column("mcp_input_schema", postgresql.JSONB(), nullable=True),
    )
    for tool_column in tool_columns:
        op.add_column("tool", tool_column)
    op.create_foreign_key(
        "tool_mcp_server_fk",
        "tool",
        "mcp_server",
        ["mcp_server_id"],
        ["id"],
        ondelete="CASCADE",
    )

    # 6. Update persona__tool foreign keys to cascade delete, so that deleting a
    #    tool (including via MCP server deletion) also removes its persona__tool rows
    persona_tool_fks = (
        ("persona__tool_persona_id_fkey", "persona", "persona_id"),
        ("persona__tool_tool_id_fkey", "tool", "tool_id"),
    )
    # Dropped tool-first, recreated persona-first
    for fk_name, _, _ in reversed(persona_tool_fks):
        op.drop_constraint(fk_name, "persona__tool", type_="foreignkey")
    for fk_name, referent_table, local_column in persona_tool_fks:
        op.create_foreign_key(
            fk_name,
            "persona__tool",
            referent_table,
            [local_column],
            ["id"],
            ondelete="CASCADE",
        )

    # 7. Update research_agent_iteration_sub_step foreign key to SET NULL on delete,
    #    so deleting a tool nulls sub_step_tool_id instead of violating the constraint
    sub_step_tool_fk = "research_agent_iteration_sub_step_sub_step_tool_id_fkey"
    op.drop_constraint(
        sub_step_tool_fk, "research_agent_iteration_sub_step", type_="foreignkey"
    )
    op.create_foreign_key(
        sub_step_tool_fk,
        "research_agent_iteration_sub_step",
        "tool",
        ["sub_step_tool_id"],
        ["id"],
        ondelete="SET NULL",
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Remove all MCP-related rows, constraints, columns and tables.

    Tools that belong to an MCP server are deleted, the foreign keys changed by
    the upgrade are recreated without an ON DELETE action, and the MCP columns
    and tables are dropped in dependency order.
    """
    # Delete MCP-owned tools while the cascading FKs from the upgrade still exist
    op.execute("DELETE FROM tool WHERE mcp_server_id IS NOT NULL")

    # Recreate the sub step -> tool FK without ON DELETE SET NULL
    sub_step_tool_fk = "research_agent_iteration_sub_step_sub_step_tool_id_fkey"
    op.drop_constraint(
        sub_step_tool_fk, "research_agent_iteration_sub_step", type_="foreignkey"
    )
    op.create_foreign_key(
        sub_step_tool_fk,
        "research_agent_iteration_sub_step",
        "tool",
        ["sub_step_tool_id"],
        ["id"],
    )

    # Restore original persona__tool foreign keys (without CASCADE)
    persona_tool_fks = (
        ("persona__tool_persona_id_fkey", "persona", "persona_id"),
        ("persona__tool_tool_id_fkey", "tool", "tool_id"),
    )
    for fk_name, _, _ in persona_tool_fks:
        op.drop_constraint(fk_name, "persona__tool", type_="foreignkey")
    for fk_name, referent_table, local_column in persona_tool_fks:
        op.create_foreign_key(
            fk_name, "persona__tool", referent_table, [local_column], ["id"]
        )

    # Drop the MCP columns from tool; tool_mcp_server_fk is not dropped explicitly
    # NOTE(review): presumably relies on DROP COLUMN removing that FK - confirm
    for column_name in ("mcp_input_schema", "mcp_server_id"):
        op.drop_column("tool", column_name)

    # Drop association tables
    for table_name in ("mcp_server__user_group", "mcp_server__user"):
        op.drop_table(table_name)

    # Drop FK from mcp_server to connection configs
    op.drop_constraint("mcp_server_admin_config_fk", "mcp_server", type_="foreignkey")

    # Drop connection config indexes & table
    for index_name in (
        "ix_mcp_connection_config_user_email",
        "ix_mcp_connection_config_server_user",
    ):
        op.drop_index(index_name, table_name="mcp_connection_config")
    op.drop_table("mcp_connection_config")

    # Finally drop mcp_server table
    op.drop_table("mcp_server")
|
||||
@@ -0,0 +1,38 @@
|
||||
"""drop include citations
|
||||
|
||||
Revision ID: 8818cf73fa1a
|
||||
Revises: 7ed603b64d5a
|
||||
Create Date: 2025-09-02 19:43:50.060680
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "8818cf73fa1a"
|
||||
down_revision = "7ed603b64d5a"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Drop the ``include_citations`` column from the ``prompt`` table."""
    op.drop_column(table_name="prompt", column_name="include_citations")
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Re-create ``prompt.include_citations`` and backfill it from the prompt name.

    The column comes back as a nullable boolean. Every row is then populated:
    FALSE for the prompt named ``ImageGeneration`` and TRUE for all others.
    """
    restored_column = sa.Column(
        "include_citations",
        sa.BOOLEAN(),
        autoincrement=False,
        nullable=True,
    )
    op.add_column("prompt", restored_column)

    # Set include_citations based on prompt name: FALSE for ImageGeneration, TRUE for others
    backfill = sa.text(
        "UPDATE prompt SET include_citations = CASE WHEN name = 'ImageGeneration' THEN FALSE ELSE TRUE END"
    )
    op.execute(backfill)
|
||||
341
backend/alembic/versions/90e3b9af7da4_tag_fix.py
Normal file
341
backend/alembic/versions/90e3b9af7da4_tag_fix.py
Normal file
@@ -0,0 +1,341 @@
|
||||
"""tag-fix
|
||||
|
||||
Revision ID: 90e3b9af7da4
|
||||
Revises: 62c3a055a141
|
||||
Create Date: 2025-08-01 20:58:14.607624
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
|
||||
from typing import cast
|
||||
from typing import Generator
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
|
||||
from onyx.db.search_settings import SearchSettings
|
||||
from onyx.configs.app_configs import AUTH_TYPE
|
||||
from onyx.configs.constants import AuthType
|
||||
from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
|
||||
|
||||
logger = logging.getLogger("alembic.runtime.migration")
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "90e3b9af7da4"
|
||||
down_revision = "62c3a055a141"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
# The stale-tag cleanup is skipped when the deployment is a cloud one (override
# for cloud) or when the SKIP_TAG_FIX environment variable, which defaults to
# "true", equals "true" case-insensitively.
SKIP_TAG_FIX = (
    AUTH_TYPE == AuthType.CLOUD
    or os.environ.get("SKIP_TAG_FIX", "true").lower() == "true"
)
|
||||
|
||||
|
||||
def set_is_list_for_known_tags() -> None:
    """
    Mark as list-valued every tag whose (source, key) pair is known to hold lists.

    One UPDATE is issued per pair against the ``tag`` table.
    """
    known_list_fields: tuple[tuple[str, str], ...] = (
        ("CLICKUP", "tags"),
        ("CONFLUENCE", "labels"),
        ("DISCOURSE", "tags"),
        ("FRESHDESK", "emails"),
        ("GITHUB", "assignees"),
        ("GITHUB", "labels"),
        ("GURU", "tags"),
        ("GURU", "folders"),
        ("HUBSPOT", "associated_contact_ids"),
        ("HUBSPOT", "associated_company_ids"),
        ("HUBSPOT", "associated_deal_ids"),
        ("HUBSPOT", "associated_ticket_ids"),
        ("JIRA", "labels"),
        ("MEDIAWIKI", "categories"),
        ("ZENDESK", "labels"),
        ("ZENDESK", "content_tags"),
    )

    connection = op.get_bind()
    for source, key in known_list_fields:
        # The interpolated values are the literals above, never external input.
        update_stmt = sa.text(
            f"""
            UPDATE tag
            SET is_list = true
            WHERE tag_key = '{key}'
            AND source = '{source}'
            """
        )
        connection.execute(update_stmt)
|
||||
|
||||
|
||||
def set_is_list_for_list_tags() -> None:
    """
    Mark as list-valued every (tag_key, source) pair that has more than one tag
    attached to a single document.

    The result is only meaningful once stale tags have been removed from the
    database; otherwise leftovers look like extra list values.
    """
    flag_multi_valued = sa.text(
        """
        UPDATE tag
        SET is_list = true
        FROM (
            SELECT DISTINCT tag.tag_key, tag.source
            FROM tag
            JOIN document__tag ON tag.id = document__tag.tag_id
            GROUP BY tag.tag_key, tag.source, document__tag.document_id
            HAVING count(*) > 1
        ) AS list_tags
        WHERE tag.tag_key = list_tags.tag_key
        AND tag.source = list_tags.source
        """
    )
    op.get_bind().execute(flag_multi_valued)
|
||||
|
||||
|
||||
def log_list_tags() -> None:
    """Log every distinct (source, tag_key) pair currently flagged ``is_list``."""
    list_tag_query = sa.text(
        """
        SELECT DISTINCT source, tag_key
        FROM tag
        WHERE is_list
        ORDER BY source, tag_key
        """
    )
    rows = op.get_bind().execute(list_tag_query).fetchall()

    report_lines = [f" {source}: {key}" for source, key in rows]
    logger.info("List tags:\n" + "\n".join(report_lines))
|
||||
|
||||
|
||||
def remove_old_tags() -> None:
    """
    Removes old tags from the database.

    Previously, there was a bug where if a document got indexed with a tag and then
    the document got reindexed, the old tag would not be removed.
    This function removes those old tags by comparing it against the tags in vespa.

    For every document that has several tags under one key, the metadata stored
    in Vespa is treated as the source of truth. A ``document__tag`` link is
    deleted when its value is missing from the Vespa list value, or differs
    from the Vespa string value. A key absent from the Vespa metadata is
    compared against the empty string.
    """
    current_search_settings, future_search_settings = active_search_settings()
    document_index = get_default_document_index(
        current_search_settings, future_search_settings
    )

    # Fall back to the default index name if the document index exposes none.
    index_name = getattr(document_index, "index_name", "danswer_index")

    bind = op.get_bind()
    # Bound parameters keep document ids containing quotes from breaking the
    # statement; the expanding parameter renders one placeholder per tag id.
    delete_stmt = sa.text(
        """
        DELETE FROM document__tag
        WHERE document_id = :document_id
        AND tag_id IN :tag_ids
        """
    ).bindparams(sa.bindparam("tag_ids", expanding=True))

    for batch in _get_batch_documents_with_multiple_tags():
        n_deleted = 0

        for document_id in batch:
            true_metadata = _get_vespa_metadata(document_id, index_name)
            tags = _get_document_tags(document_id)

            # identify document__tags to delete
            stale_tag_ids: list[int] = []
            for tag_id, tag_key, tag_value in tags:
                true_val = true_metadata.get(tag_key, "")
                if (isinstance(true_val, list) and tag_value not in true_val) or (
                    isinstance(true_val, str) and tag_value != true_val
                ):
                    stale_tag_ids.append(tag_id)

            if not stale_tag_ids:
                continue

            # delete old document__tags
            result = bind.execute(
                delete_stmt,
                {"document_id": document_id, "tag_ids": stale_tag_ids},
            )
            n_deleted += result.rowcount
        logger.info(f"Processed {len(batch)} documents and deleted {n_deleted} tags")
|
||||
|
||||
|
||||
def active_search_settings() -> tuple[SearchSettings, SearchSettings | None]:
    """Load the newest PRESENT and newest FUTURE rows of ``search_settings``.

    Returns:
        ``(current, future)`` where ``future`` is ``None`` when no FUTURE row
        exists.

    Raises:
        RuntimeError: if no PRESENT row exists, or if either value ends up
            with an unexpected type.
    """
    connection = op.get_bind()

    present_rows = connection.execute(
        sa.text(
            """
            SELECT * FROM search_settings WHERE status = 'PRESENT' ORDER BY id DESC LIMIT 1
            """
        )
    ).fetchall()
    current = None
    if present_rows:
        current = SearchSettings(**present_rows[0]._asdict())

    future_rows = connection.execute(
        sa.text(
            """
            SELECT * FROM search_settings WHERE status = 'FUTURE' ORDER BY id DESC LIMIT 1
            """
        )
    ).fetchall()
    upcoming = None
    if future_rows:
        upcoming = SearchSettings(**future_rows[0]._asdict())

    # Current settings are mandatory; future settings are optional.
    if not isinstance(current, SearchSettings):
        raise RuntimeError("current search settings is of type " + str(type(current)))
    if upcoming is not None and not isinstance(upcoming, SearchSettings):
        raise RuntimeError("future search settings is of type " + str(type(upcoming)))

    return current, upcoming
|
||||
|
||||
|
||||
def _get_batch_documents_with_multiple_tags(
    batch_size: int = 128,
) -> Generator[list[str], None, None]:
    """
    Yields batches of document ids which contain a one to many tag.

    The document may either contain a list metadata value, or may contain leftover
    old tags from reindexing.

    Batches are produced by keyset pagination over ``document_id``: each query
    resumes strictly after the last id of the previous batch, so rows deleted
    by the consumer between batches do not shift the window.

    Args:
        batch_size: maximum number of document ids per yielded batch.
    """
    bind = op.get_bind()

    query_template = """
        SELECT DISTINCT document__tag.document_id
        FROM tag
        JOIN document__tag ON tag.id = document__tag.tag_id
        GROUP BY tag.tag_key, tag.source, document__tag.document_id
        HAVING count(*) > 1 {cursor_clause}
        ORDER BY document__tag.document_id
        LIMIT :batch_size
        """
    # Only static SQL is formatted into the template; the cursor value and the
    # batch size are bound parameters so ids containing quotes are handled.
    first_page_stmt = sa.text(query_template.format(cursor_clause=""))
    next_page_stmt = sa.text(
        query_template.format(
            cursor_clause="AND document__tag.document_id > :last_id"
        )
    )

    stmt = first_page_stmt
    params: dict[str, str | int] = {"batch_size": batch_size}

    while True:
        batch = bind.execute(stmt, params).fetchall()
        if not batch:
            break
        doc_ids = [document_id for document_id, in batch]
        yield doc_ids
        stmt = next_page_stmt
        params = {"batch_size": batch_size, "last_id": doc_ids[-1]}
|
||||
|
||||
|
||||
def _get_vespa_metadata(
    document_id: str, index_name: str
) -> dict[str, str | list[str]]:
    """Fetch the metadata stored in Vespa for chunk 0 of ``document_id``.

    Args:
        document_id: id of the document to look up.
        index_name: Vespa index used to build the endpoint URL, the selection
            expression and the field set.

    Returns:
        The chunk's ``metadata`` field decoded from its JSON string form.

    Raises:
        RuntimeError: if Vespa returns no document for the selection.
    """
    endpoint = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)

    query_params: dict[str, str | int] = {
        # Document-Selector language
        "selection": (
            f"{index_name}.document_id=='{document_id}' and {index_name}.chunk_id==0"
        ),
        "wantedDocumentCount": 1,
        "fieldSet": f"{index_name}:metadata",
    }

    with get_vespa_http_client() as http_client:
        response = http_client.get(endpoint, params=query_params)
        response.raise_for_status()

    matches = response.json().get("documents", [])
    if matches:
        # for some reason, metadata is a string
        return json.loads(matches[0]["fields"]["metadata"])

    raise RuntimeError(f"No chunk-0 found for document {document_id}")
|
||||
|
||||
|
||||
def _get_document_tags(document_id: str) -> list[tuple[int, str, str]]:
    """Return ``(tag id, tag_key, tag_value)`` for every tag linked to a document.

    Args:
        document_id: id of the document whose ``document__tag`` links are read.
    """
    bind = op.get_bind()
    # document_id is bound rather than interpolated so ids containing quotes
    # cannot break or alter the statement.
    result = bind.execute(
        sa.text(
            """
            SELECT tag.id, tag.tag_key, tag.tag_value
            FROM tag
            JOIN document__tag ON tag.id = document__tag.tag_id
            WHERE document__tag.document_id = :document_id
            """
        ),
        {"document_id": document_id},
    ).fetchall()
    return cast(list[tuple[int, str, str]], result)
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add ``tag.is_list``, widen the tag unique constraint, and flag list tags.

    Known list-valued (source, key) pairs are always flagged. Unless
    ``SKIP_TAG_FIX`` is set, stale tags are then removed and any remaining
    multi-valued keys are flagged as lists too.
    """
    is_list_column = sa.Column(
        "is_list", sa.Boolean(), nullable=False, server_default="false"
    )
    op.add_column("tag", is_list_column)

    # Replace the uniqueness rule with one that also includes is_list.
    op.drop_constraint(
        constraint_name="_tag_key_value_source_uc",
        table_name="tag",
        type_="unique",
    )
    op.create_unique_constraint(
        constraint_name="_tag_key_value_source_list_uc",
        table_name="tag",
        columns=["tag_key", "tag_value", "source", "is_list"],
    )
    set_is_list_for_known_tags()

    if not SKIP_TAG_FIX:
        remove_old_tags()
        set_is_list_for_list_tags()

        # debug
        log_list_tags()
        return

    logger.warning(
        "Skipping removal of old tags. "
        "This can cause issues when using the knowledge graph, or "
        "when filtering for documents by tags."
    )
    log_list_tags()
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Restore the three-column tag unique constraint and drop ``tag.is_list``.

    Stale tags removed by the upgrade are not restored.
    """
    # the migration adds and populates the is_list column, and removes old bugged tags
    # there isn't a point in adding back the bugged tags, so we just drop the column
    tag_table = "tag"

    op.drop_constraint(
        constraint_name="_tag_key_value_source_list_uc",
        table_name=tag_table,
        type_="unique",
    )
    op.create_unique_constraint(
        constraint_name="_tag_key_value_source_uc",
        table_name=tag_table,
        columns=["tag_key", "tag_value", "source"],
    )
    op.drop_column(tag_table, "is_list")
|
||||
@@ -0,0 +1,38 @@
|
||||
"""Adding assistant-specific user preferences
|
||||
|
||||
Revision ID: b329d00a9ea6
|
||||
Revises: f9b8c7d6e5a4
|
||||
Create Date: 2025-08-26 23:14:44.592985
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import fastapi_users_db_sqlalchemy
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "b329d00a9ea6"
|
||||
down_revision = "f9b8c7d6e5a4"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Create ``assistant__user_specific_config``.

    The table is keyed by ``(assistant_id, user_id)`` and stores a non-null
    integer array ``disabled_tool_ids``. Both key columns cascade on delete of
    the referenced ``persona`` / ``user`` row.
    """
    columns = (
        sa.Column("assistant_id", sa.Integer(), nullable=False),
        sa.Column(
            "user_id",
            fastapi_users_db_sqlalchemy.generics.GUID(),
            nullable=False,
        ),
        sa.Column("disabled_tool_ids", postgresql.ARRAY(sa.Integer()), nullable=False),
    )
    constraints = (
        sa.ForeignKeyConstraint(["assistant_id"], ["persona.id"], ondelete="CASCADE"),
        sa.ForeignKeyConstraint(["user_id"], ["user.id"], ondelete="CASCADE"),
        sa.PrimaryKeyConstraint("assistant_id", "user_id"),
    )
    op.create_table("assistant__user_specific_config", *columns, *constraints)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the ``assistant__user_specific_config`` table."""
    op.drop_table(table_name="assistant__user_specific_config")
|
||||
@@ -0,0 +1,33 @@
|
||||
"""Pause finished user file connectors
|
||||
|
||||
Revision ID: b558f51620b4
|
||||
Revises: 90e3b9af7da4
|
||||
Create Date: 2025-08-15 17:17:02.456704
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "b558f51620b4"
|
||||
down_revision = "90e3b9af7da4"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Pause every user-file connector credential pair that is still ACTIVE."""
    # Set all user file connector credential pairs with ACTIVE status to PAUSED
    # This ensures user files don't continue to run indexing tasks after processing
    pause_user_file_pairs = """
        UPDATE connector_credential_pair
        SET status = 'PAUSED'
        WHERE is_user_file = true
        AND status = 'ACTIVE'
        """
    op.execute(pause_user_file_pairs)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Do nothing: the status change made by the upgrade is not reverted."""
    return None
|
||||
@@ -0,0 +1,147 @@
|
||||
"""migrate_agent_sub_questions_to_research_iterations
|
||||
|
||||
Revision ID: bd7c3bf8beba
|
||||
Revises: f8a9b2c3d4e5
|
||||
Create Date: 2025-08-18 11:33:27.098287
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "bd7c3bf8beba"
|
||||
down_revision = "f8a9b2c3d4e5"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Copy legacy agentic sub-questions into the research-iteration tables.

    Four statements run in order on the migration connection:

    1. one ``research_agent_iteration`` row per agentic primary question,
    2. one ``research_agent_iteration_sub_step`` row per sub-question,
    3. ``research_answer_purpose = 'ANSWER'`` on agentic assistant messages
       that have no ``research_type`` yet,
    4. ``research_type = 'LEGACY_AGENTIC'`` on all agentic messages that have
       no ``research_type`` yet.

    Step 3 must precede step 4 because it filters on ``research_type IS NULL``.
    """
    # First, insert data into research_agent_iteration table
    # This creates one iteration record per primary_question_id using the earliest time_created
    insert_iterations = sa.text(
        """
        INSERT INTO research_agent_iteration (primary_question_id, created_at, iteration_nr, purpose, reasoning)
        SELECT
            primary_question_id,
            MIN(time_created) as created_at,
            1 as iteration_nr,
            'Generating and researching subquestions' as purpose,
            '(No previous reasoning)' as reasoning
        FROM agent__sub_question
        JOIN chat_message on agent__sub_question.primary_question_id = chat_message.id
        WHERE primary_question_id IS NOT NULL
            AND chat_message.is_agentic = true
        GROUP BY primary_question_id
        ON CONFLICT DO NOTHING;
        """
    )

    # Then, insert data into research_agent_iteration_sub_step table
    # This migrates each sub-question as a sub-step
    insert_sub_steps = sa.text(
        """
        INSERT INTO research_agent_iteration_sub_step (
            primary_question_id,
            iteration_nr,
            iteration_sub_step_nr,
            created_at,
            sub_step_instructions,
            sub_step_tool_id,
            sub_answer,
            cited_doc_results
        )
        SELECT
            primary_question_id,
            1 as iteration_nr,
            level_question_num as iteration_sub_step_nr,
            time_created as created_at,
            sub_question as sub_step_instructions,
            1 as sub_step_tool_id,
            sub_answer,
            sub_question_doc_results as cited_doc_results
        FROM agent__sub_question
        JOIN chat_message on agent__sub_question.primary_question_id = chat_message.id
        WHERE chat_message.is_agentic = true
        AND primary_question_id IS NOT NULL
        ON CONFLICT DO NOTHING;
        """
    )

    # Update chat_message records: set legacy agentic type and answer purpose for existing agentic messages
    mark_answer_purpose = sa.text(
        """
        UPDATE chat_message
        SET research_answer_purpose = 'ANSWER'
        WHERE is_agentic = true
        AND research_type IS NULL and
            message_type = 'ASSISTANT';
        """
    )
    mark_legacy_type = sa.text(
        """
        UPDATE chat_message
        SET research_type = 'LEGACY_AGENTIC'
        WHERE is_agentic = true
        AND research_type IS NULL;
        """
    )

    # Get connection to execute raw SQL
    connection = op.get_bind()
    for statement in (
        insert_iterations,
        insert_sub_steps,
        mark_answer_purpose,
        mark_legacy_type,
    ):
        connection.execute(statement)
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Remove migrated research-iteration rows and clear the legacy markers.

    Sub-steps and iterations belonging to ``LEGACY_AGENTIC`` chat messages are
    deleted first, while the marker still identifies them; the marker columns
    on ``chat_message`` are cleared last.
    """
    # Note: This downgrade removes all research agent iteration data
    # There's no way to perfectly restore the original agent__sub_question data
    # if it was deleted after this migration

    # Delete all research_agent_iteration_sub_step records that were migrated
    delete_sub_steps = sa.text(
        """
        DELETE FROM research_agent_iteration_sub_step
        USING chat_message
        WHERE research_agent_iteration_sub_step.primary_question_id = chat_message.id
        AND chat_message.research_type = 'LEGACY_AGENTIC';
        """
    )

    # Delete all research_agent_iteration records that were migrated
    delete_iterations = sa.text(
        """
        DELETE FROM research_agent_iteration
        USING chat_message
        WHERE research_agent_iteration.primary_question_id = chat_message.id
        AND chat_message.research_type = 'LEGACY_AGENTIC';
        """
    )

    # Revert chat_message updates: clear research fields for legacy agentic messages
    # NOTE(review): this only touches ASSISTANT messages, whereas the upgrade
    # sets research_type on every agentic message - confirm this is intended.
    clear_markers = sa.text(
        """
        UPDATE chat_message
        SET research_type = NULL,
            research_answer_purpose = NULL
        WHERE is_agentic = true
        AND research_type = 'LEGACY_AGENTIC'
        AND message_type = 'ASSISTANT';
        """
    )

    # Get connection to execute raw SQL
    connection = op.get_bind()
    for statement in (delete_sub_steps, delete_iterations, clear_markers):
        connection.execute(statement)
|
||||
315
backend/alembic/versions/c9e2cd766c29_add_s3_file_store_table.py
Normal file
315
backend/alembic/versions/c9e2cd766c29_add_s3_file_store_table.py
Normal file
@@ -0,0 +1,315 @@
|
||||
"""modify_file_store_for_external_storage
|
||||
|
||||
Revision ID: c9e2cd766c29
|
||||
Revises: 03bf8be6b53a
|
||||
Create Date: 2025-06-13 14:02:09.867679
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import text
|
||||
from typing import cast, Any
|
||||
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
from onyx.db._deprecated.pg_file_store import delete_lobj_by_id, read_lobj
|
||||
from onyx.file_store.file_store import get_s3_file_store
|
||||
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "c9e2cd766c29"
|
||||
down_revision = "03bf8be6b53a"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Turn ``file_store`` into ``file_record`` and move file content to external storage.

    The schema changes are attempted first. An error whose message contains
    "does not exist" is printed and ignored, on the assumption that an earlier
    run already applied them; any other error propagates. File contents are
    then migrated and ``lobj_oid`` is dropped.
    """
    try:
        # Modify existing file_store table to support external storage
        op.rename_table("file_store", "file_record")

        # Make lobj_oid nullable (for external storage files)
        op.alter_column("file_record", "lobj_oid", nullable=True)

        # Add external storage columns with generic names
        for pointer_column in ("bucket_name", "object_key"):
            op.add_column(
                "file_record", sa.Column(pointer_column, sa.String(), nullable=True)
            )

        # Add timestamps for tracking
        for timestamp_column in ("created_at", "updated_at"):
            op.add_column(
                "file_record",
                sa.Column(
                    timestamp_column,
                    sa.DateTime(timezone=True),
                    server_default=sa.func.now(),
                    nullable=False,
                ),
            )

        op.alter_column("file_record", "file_name", new_column_name="file_id")
    except Exception as e:
        # A missing relation or column is taken to mean a previous partial run.
        if "does not exist" not in str(e):
            raise
        print(
            f"Ran into error - {e}. Likely means we had a partial success in the past, continuing..."
        )

    print(
        "External storage configured - migrating files from PostgreSQL to external storage..."
    )
    # if we fail midway through this, we'll have a partial success. Running the migration
    # again should allow us to continue.
    _migrate_files_to_external_storage()
    print("File migration completed successfully!")

    # Remove lobj_oid column
    op.drop_column("file_record", "lobj_oid")
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Revert schema changes and migrate files from external storage back to PostgreSQL large objects."""
    record_table = "file_record"

    print(
        "Reverting to PostgreSQL-backed file store – migrating files from external storage …"
    )

    # 1. Ensure `lobj_oid` exists on the current `file_record` table (nullable for now).
    op.add_column(record_table, sa.Column("lobj_oid", sa.Integer(), nullable=True))

    # 2. Move content from external storage back into PostgreSQL large objects (table is still
    # called `file_record` so application code continues to work during the copy).
    try:
        _migrate_files_to_postgres()
    except Exception:
        print("Error during downgrade migration, rolling back …")
        op.drop_column(record_table, "lobj_oid")
        raise

    # 3. After migration every row should now have `lobj_oid` populated – mark NOT NULL.
    op.alter_column(record_table, "lobj_oid", nullable=False)

    # 4. Remove columns that are only relevant to external storage.
    for external_only_column in (
        "updated_at",
        "created_at",
        "object_key",
        "bucket_name",
    ):
        op.drop_column(record_table, external_only_column)

    # 5. Rename `file_id` back to `file_name` (still on `file_record`).
    op.alter_column(record_table, "file_id", new_column_name="file_name")

    # 6. Finally, rename the table back to its original name expected by the legacy codebase.
    op.rename_table(record_table, "file_store")

    print(
        "Downgrade migration completed – files are now stored inside PostgreSQL again."
    )
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Helper: migrate from external storage (S3/MinIO) back into PostgreSQL large objects
|
||||
|
||||
|
||||
def _migrate_files_to_postgres() -> None:
    """Move any files whose content lives in external S3-compatible storage back into PostgreSQL.

    The logic mirrors *inverse* of `_migrate_files_to_external_storage` used on upgrade.

    For each ``file_record`` row with both ``bucket_name`` and ``object_key``
    set, the content is read from the external store, written into a new
    large object, and the row is updated to point at it. A row whose object is
    missing from external storage (``NoSuchKey``) is deleted and is not
    counted as migrated.

    Raises:
        ClientError: for any external-storage error other than ``NoSuchKey``.
    """

    # Obtain DB session from Alembic context
    bind = op.get_bind()
    session = Session(bind=bind)

    # Fetch rows that have external storage pointers (bucket/object_key not NULL)
    result = session.execute(
        text(
            "SELECT file_id, bucket_name, object_key FROM file_record "
            "WHERE bucket_name IS NOT NULL AND object_key IS NOT NULL"
        )
    )

    files_to_migrate = [row[0] for row in result.fetchall()]
    total_files = len(files_to_migrate)

    if total_files == 0:
        print("No files found in external storage to migrate back to PostgreSQL.")
        return

    print(f"Found {total_files} files to migrate back to PostgreSQL large objects.")

    _set_tenant_contextvar(session)
    migrated_count = 0

    # only create external store if we have files to migrate. This line
    # makes it so we need to have S3/MinIO configured to run this migration.
    external_store = get_s3_file_store()

    # Import lazily to avoid circular deps at Alembic runtime. Done once here
    # rather than on every loop iteration.
    from onyx.db._deprecated.pg_file_store import (  # noqa: E402
        create_populate_lobj,
    )

    for i, file_id in enumerate(files_to_migrate, 1):
        print(f"Migrating file {i}/{total_files}: {file_id}")

        # Read file content from external storage (always binary)
        try:
            file_io = external_store.read_file(
                file_id=file_id, mode="b", use_tempfile=True
            )
            file_io.seek(0)

            # Create new Postgres large object and populate it
            lobj_oid = create_populate_lobj(content=file_io, db_session=session)

            # Update DB row: set lobj_oid, clear bucket/object_key
            session.execute(
                text(
                    "UPDATE file_record SET lobj_oid = :lobj_oid, bucket_name = NULL, "
                    "object_key = NULL WHERE file_id = :file_id"
                ),
                {"lobj_oid": lobj_oid, "file_id": file_id},
            )
        except ClientError as e:
            if "NoSuchKey" not in str(e):
                raise
            print(
                f"File {file_id} not found in external storage. Deleting from database."
            )
            session.execute(
                text("DELETE FROM file_record WHERE file_id = :file_id"),
                {"file_id": file_id},
            )
            # The record was dropped, not migrated: skip the success accounting.
            continue

        migrated_count += 1
        print(f"✓ Successfully migrated file {i}/{total_files}: {file_id}")

    # Flush the SQLAlchemy session so statements are sent to the DB, but **do not**
    # commit the transaction. The surrounding Alembic migration will commit once
    # the *entire* downgrade succeeds. This keeps the whole downgrade atomic and
    # avoids leaving the database in a partially-migrated state if a later schema
    # operation fails.
    session.flush()

    print(
        f"Migration back to PostgreSQL completed: {migrated_count} files staged for commit."
    )
|
||||
|
||||
|
||||
def _migrate_files_to_external_storage() -> None:
    """Migrate files from PostgreSQL large objects to external storage.

    Every ``file_record`` row that has a ``lobj_oid`` and no external pointer
    is read from its large object, saved through the external file store with
    its display name, origin, type and metadata, and its large object is then
    deleted. Rows whose large object no longer exists are skipped.
    """
    # Get database session
    bind = op.get_bind()
    session = Session(bind=bind)
    external_store = get_s3_file_store()

    # Find all files currently stored in PostgreSQL (lobj_oid is not null)
    result = session.execute(
        text(
            "SELECT file_id FROM file_record WHERE lobj_oid IS NOT NULL "
            "AND bucket_name IS NULL AND object_key IS NULL"
        )
    )

    files_to_migrate = [row[0] for row in result.fetchall()]
    total_files = len(files_to_migrate)

    if total_files == 0:
        print("No files found in PostgreSQL storage to migrate.")
        return

    # might need to move this above the if statement when creating a new multi-tenant
    # system. VERY extreme edge case.
    external_store.initialize()
    print(f"Found {total_files} files to migrate from PostgreSQL to external storage.")

    _set_tenant_contextvar(session)
    migrated_count = 0

    for i, file_id in enumerate(files_to_migrate, 1):
        print(f"Migrating file {i}/{total_files}: {file_id}")

        # Read file record to get metadata
        file_record = session.execute(
            text("SELECT * FROM file_record WHERE file_id = :file_id"),
            {"file_id": file_id},
        ).fetchone()

        if file_record is None:
            print(f"File {file_id} not found in PostgreSQL storage.")
            continue

        lobj_id = cast(int, file_record.lobj_oid)  # type: ignore
        raw_metadata = cast(Any, file_record.file_metadata)  # type: ignore

        # Read file content from PostgreSQL
        try:
            file_content = read_lobj(
                lobj_id, db_session=session, mode="b", use_tempfile=True
            )
        except Exception as e:
            if "large object" in str(e) and "does not exist" in str(e):
                print(f"File {file_id} not found in PostgreSQL storage.")
                continue
            else:
                raise

        # Handle file_metadata type conversion. The stored value is kept when
        # it is None or already a dict; it was previously reset to None before
        # this check, which discarded every file's metadata.
        if raw_metadata is None or isinstance(raw_metadata, dict):
            file_metadata = raw_metadata
        else:
            # Convert other types to dict if possible, otherwise None
            try:
                file_metadata = dict(raw_metadata)
            except (TypeError, ValueError):
                file_metadata = None

        # Save to external storage (this will handle the database record update and cleanup)
        # NOTE: this WILL .commit() the transaction.
        external_store.save_file(
            file_id=file_id,
            content=file_content,
            display_name=file_record.display_name,
            file_origin=file_record.file_origin,
            file_type=file_record.file_type,
            file_metadata=file_metadata,
        )
        delete_lobj_by_id(lobj_id, db_session=session)

        migrated_count += 1
        print(f"✓ Successfully migrated file {i}/{total_files}: {file_id}")

    # Flush the session so pending statements are sent to the DB without an
    # explicit commit here.
    session.flush()

    print(
        f"Migration completed: {migrated_count} files staged for commit to external storage."
    )
|
||||
|
||||
|
||||
def _set_tenant_contextvar(session: Session) -> None:
    """Set the tenant contextvar to the session's current schema.

    Args:
        session: session used to run ``SELECT current_schema()``.
    """
    schema_query = text("SELECT current_schema()")
    active_schema = session.execute(schema_query).scalar()

    print(f"Migrating files for tenant: {active_schema}")
    CURRENT_TENANT_ID_CONTEXTVAR.set(active_schema)
|
||||
@@ -11,7 +11,7 @@ import sqlalchemy as sa
|
||||
import json
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.onyx_jira.utils import extract_jira_project
|
||||
from onyx.connectors.jira.utils import extract_jira_project
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
|
||||
@@ -18,11 +18,13 @@ depends_on: None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.execute("DROP TABLE IF EXISTS document CASCADE")
|
||||
op.create_table(
|
||||
"document",
|
||||
sa.Column("id", sa.String(), nullable=False),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
op.execute("DROP TABLE IF EXISTS chunk CASCADE")
|
||||
op.create_table(
|
||||
"chunk",
|
||||
sa.Column("id", sa.String(), nullable=False),
|
||||
@@ -43,6 +45,7 @@ def upgrade() -> None:
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id", "document_store_type"),
|
||||
)
|
||||
op.execute("DROP TABLE IF EXISTS deletion_attempt CASCADE")
|
||||
op.create_table(
|
||||
"deletion_attempt",
|
||||
sa.Column("id", sa.Integer(), nullable=False),
|
||||
@@ -84,6 +87,7 @@ def upgrade() -> None:
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
op.execute("DROP TABLE IF EXISTS document_by_connector_credential_pair CASCADE")
|
||||
op.create_table(
|
||||
"document_by_connector_credential_pair",
|
||||
sa.Column("id", sa.String(), nullable=False),
|
||||
@@ -106,7 +110,10 @@ def upgrade() -> None:
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# upstream tables first
|
||||
op.drop_table("document_by_connector_credential_pair")
|
||||
op.drop_table("deletion_attempt")
|
||||
op.drop_table("chunk")
|
||||
op.drop_table("document")
|
||||
|
||||
# Alembic op.drop_table() has no "cascade" flag – issue raw SQL
|
||||
op.execute("DROP TABLE IF EXISTS document CASCADE")
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
"""add research_answer_purpose to chat_message
|
||||
|
||||
Revision ID: f8a9b2c3d4e5
|
||||
Revises: 5ae8240accb3
|
||||
Create Date: 2025-01-27 12:00:00.000000
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "f8a9b2c3d4e5"
|
||||
down_revision = "5ae8240accb3"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Add research_answer_purpose column to chat_message table
|
||||
op.add_column(
|
||||
"chat_message",
|
||||
sa.Column("research_answer_purpose", sa.String(), nullable=True),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Remove research_answer_purpose column from chat_message table
|
||||
op.drop_column("chat_message", "research_answer_purpose")
|
||||
@@ -0,0 +1,69 @@
|
||||
"""remove foreign key constraints from research_agent_iteration_sub_step
|
||||
|
||||
Revision ID: f9b8c7d6e5a4
|
||||
Revises: bd7c3bf8beba
|
||||
Create Date: 2025-01-27 12:00:00.000000
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "f9b8c7d6e5a4"
|
||||
down_revision = "bd7c3bf8beba"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Drop the existing foreign key constraint for parent_question_id
|
||||
op.drop_constraint(
|
||||
"research_agent_iteration_sub_step_parent_question_id_fkey",
|
||||
"research_agent_iteration_sub_step",
|
||||
type_="foreignkey",
|
||||
)
|
||||
|
||||
# Drop the parent_question_id column entirely
|
||||
op.drop_column("research_agent_iteration_sub_step", "parent_question_id")
|
||||
|
||||
# Drop the foreign key constraint for primary_question_id to chat_message.id
|
||||
# (keep the column as it's needed for the composite foreign key)
|
||||
op.drop_constraint(
|
||||
"research_agent_iteration_sub_step_primary_question_id_fkey",
|
||||
"research_agent_iteration_sub_step",
|
||||
type_="foreignkey",
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Restore the foreign key constraint for primary_question_id to chat_message.id
|
||||
op.create_foreign_key(
|
||||
"research_agent_iteration_sub_step_primary_question_id_fkey",
|
||||
"research_agent_iteration_sub_step",
|
||||
"chat_message",
|
||||
["primary_question_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
# Add back the parent_question_id column
|
||||
op.add_column(
|
||||
"research_agent_iteration_sub_step",
|
||||
sa.Column(
|
||||
"parent_question_id",
|
||||
sa.Integer(),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
|
||||
# Restore the foreign key constraint pointing to research_agent_iteration_sub_step.id
|
||||
op.create_foreign_key(
|
||||
"research_agent_iteration_sub_step_parent_question_id_fkey",
|
||||
"research_agent_iteration_sub_step",
|
||||
"research_agent_iteration_sub_step",
|
||||
["parent_question_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
@@ -8,7 +8,7 @@ from sqlalchemy.ext.asyncio import create_async_engine
|
||||
from sqlalchemy.schema import SchemaItem
|
||||
|
||||
from alembic import context
|
||||
from onyx.db.engine import build_connection_string
|
||||
from onyx.db.engine.sql_engine import build_connection_string
|
||||
from onyx.db.models import PublicBase
|
||||
|
||||
# this is the Alembic Config object, which provides
|
||||
|
||||
@@ -16,7 +16,7 @@ from onyx.configs.constants import FileOrigin
|
||||
from onyx.configs.constants import FileType
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.configs.constants import QueryHistoryType
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.db.tasks import delete_task_with_id
|
||||
from onyx.db.tasks import mark_task_as_finished_with_id
|
||||
from onyx.db.tasks import mark_task_as_started_with_id
|
||||
@@ -35,7 +35,13 @@ logger = setup_logger()
|
||||
trail=False,
|
||||
)
|
||||
def export_query_history_task(
|
||||
self: Task, *, start: datetime, end: datetime, start_time: datetime
|
||||
self: Task,
|
||||
*,
|
||||
start: datetime,
|
||||
end: datetime,
|
||||
start_time: datetime,
|
||||
# Need to include the tenant_id since the TenantAwareTask needs this
|
||||
tenant_id: str,
|
||||
) -> None:
|
||||
if not self.request.id:
|
||||
raise RuntimeError("No task id defined for this task; cannot identify it")
|
||||
@@ -85,8 +91,7 @@ def export_query_history_task(
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
try:
|
||||
stream.seek(0)
|
||||
get_default_file_store(db_session).save_file(
|
||||
file_name=report_name,
|
||||
get_default_file_store().save_file(
|
||||
content=stream,
|
||||
display_name=report_name,
|
||||
file_origin=FileOrigin.QUERY_HISTORY_CSV,
|
||||
@@ -96,6 +101,7 @@ def export_query_history_task(
|
||||
"end": end.isoformat(),
|
||||
"start_time": start_time.isoformat(),
|
||||
},
|
||||
file_id=report_name,
|
||||
)
|
||||
|
||||
delete_task_with_id(
|
||||
|
||||
@@ -1,133 +1,4 @@
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from uuid import UUID
|
||||
|
||||
from celery import shared_task
|
||||
from celery import Task
|
||||
|
||||
from ee.onyx.background.celery_utils import should_perform_chat_ttl_check
|
||||
from ee.onyx.background.task_name_builders import name_chat_ttl_task
|
||||
from ee.onyx.server.reporting.usage_export_generation import create_new_usage_report
|
||||
from onyx.background.celery.apps.primary import celery_app
|
||||
from onyx.configs.app_configs import JOB_TIMEOUT
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.db.chat import delete_chat_session
|
||||
from onyx.db.chat import get_chat_sessions_older_than
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.db.enums import TaskStatus
|
||||
from onyx.db.tasks import mark_task_as_finished_with_id
|
||||
from onyx.db.tasks import register_task
|
||||
from onyx.server.settings.store import load_settings
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
# mark as EE for all tasks in this file
|
||||
|
||||
|
||||
@shared_task(
|
||||
name=OnyxCeleryTask.PERFORM_TTL_MANAGEMENT_TASK,
|
||||
ignore_result=True,
|
||||
soft_time_limit=JOB_TIMEOUT,
|
||||
bind=True,
|
||||
trail=False,
|
||||
)
|
||||
def perform_ttl_management_task(
|
||||
self: Task, retention_limit_days: int, *, tenant_id: str
|
||||
) -> None:
|
||||
task_id = self.request.id
|
||||
if not task_id:
|
||||
raise RuntimeError("No task id defined for this task; cannot identify it")
|
||||
|
||||
start_time = datetime.now(tz=timezone.utc)
|
||||
|
||||
user_id: UUID | None = None
|
||||
session_id: UUID | None = None
|
||||
try:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
# we generally want to move off this, but keeping for now
|
||||
register_task(
|
||||
db_session=db_session,
|
||||
task_name=name_chat_ttl_task(retention_limit_days, tenant_id),
|
||||
task_id=task_id,
|
||||
status=TaskStatus.STARTED,
|
||||
start_time=start_time,
|
||||
)
|
||||
|
||||
old_chat_sessions = get_chat_sessions_older_than(
|
||||
retention_limit_days, db_session
|
||||
)
|
||||
|
||||
for user_id, session_id in old_chat_sessions:
|
||||
# one session per delete so that we don't blow up if a deletion fails.
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
delete_chat_session(
|
||||
user_id,
|
||||
session_id,
|
||||
db_session,
|
||||
include_deleted=True,
|
||||
hard_delete=True,
|
||||
)
|
||||
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
mark_task_as_finished_with_id(
|
||||
db_session=db_session,
|
||||
task_id=task_id,
|
||||
success=True,
|
||||
)
|
||||
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"delete_chat_session exceptioned. "
|
||||
f"user_id={user_id} session_id={session_id}"
|
||||
)
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
mark_task_as_finished_with_id(
|
||||
db_session=db_session,
|
||||
task_id=task_id,
|
||||
success=False,
|
||||
)
|
||||
raise
|
||||
|
||||
|
||||
#####
|
||||
# Periodic Tasks
|
||||
#####
|
||||
|
||||
|
||||
@celery_app.task(
|
||||
name=OnyxCeleryTask.CHECK_TTL_MANAGEMENT_TASK,
|
||||
ignore_result=True,
|
||||
soft_time_limit=JOB_TIMEOUT,
|
||||
)
|
||||
def check_ttl_management_task(*, tenant_id: str) -> None:
|
||||
"""Runs periodically to check if any ttl tasks should be run and adds them
|
||||
to the queue"""
|
||||
|
||||
settings = load_settings()
|
||||
retention_limit_days = settings.maximum_chat_retention_days
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
if should_perform_chat_ttl_check(retention_limit_days, db_session):
|
||||
perform_ttl_management_task.apply_async(
|
||||
kwargs=dict(
|
||||
retention_limit_days=retention_limit_days, tenant_id=tenant_id
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@celery_app.task(
|
||||
name=OnyxCeleryTask.AUTOGENERATE_USAGE_REPORT_TASK,
|
||||
ignore_result=True,
|
||||
soft_time_limit=JOB_TIMEOUT,
|
||||
)
|
||||
def autogenerate_usage_report_task(*, tenant_id: str) -> None:
|
||||
"""This generates usage report under the /admin/generate-usage/report endpoint"""
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
create_new_usage_report(
|
||||
db_session=db_session,
|
||||
user_id=None,
|
||||
period=None,
|
||||
)
|
||||
|
||||
|
||||
celery_app.autodiscover_tasks(
|
||||
@@ -135,5 +6,7 @@ celery_app.autodiscover_tasks(
|
||||
"ee.onyx.background.celery.tasks.doc_permission_syncing",
|
||||
"ee.onyx.background.celery.tasks.external_group_syncing",
|
||||
"ee.onyx.background.celery.tasks.cloud",
|
||||
"ee.onyx.background.celery.tasks.ttl_management",
|
||||
"ee.onyx.background.celery.tasks.usage_reporting",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -20,39 +20,36 @@ from shared_configs.configs import MULTI_TENANT
|
||||
|
||||
ee_beat_system_tasks: list[dict] = []
|
||||
|
||||
ee_beat_task_templates: list[dict] = []
|
||||
ee_beat_task_templates.extend(
|
||||
[
|
||||
{
|
||||
"name": "autogenerate-usage-report",
|
||||
"task": OnyxCeleryTask.AUTOGENERATE_USAGE_REPORT_TASK,
|
||||
"schedule": timedelta(days=30),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
},
|
||||
ee_beat_task_templates: list[dict] = [
|
||||
{
|
||||
"name": "autogenerate-usage-report",
|
||||
"task": OnyxCeleryTask.GENERATE_USAGE_REPORT_TASK,
|
||||
"schedule": timedelta(days=30),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
},
|
||||
{
|
||||
"name": "check-ttl-management",
|
||||
"task": OnyxCeleryTask.CHECK_TTL_MANAGEMENT_TASK,
|
||||
"schedule": timedelta(hours=CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "check-ttl-management",
|
||||
"task": OnyxCeleryTask.CHECK_TTL_MANAGEMENT_TASK,
|
||||
"schedule": timedelta(hours=CHECK_TTL_MANAGEMENT_TASK_FREQUENCY_IN_HOURS),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
},
|
||||
{
|
||||
"name": "export-query-history-cleanup-task",
|
||||
"task": OnyxCeleryTask.EXPORT_QUERY_HISTORY_CLEANUP_TASK,
|
||||
"schedule": timedelta(hours=1),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
"queue": OnyxCeleryQueues.CSV_GENERATION,
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "export-query-history-cleanup-task",
|
||||
"task": OnyxCeleryTask.EXPORT_QUERY_HISTORY_CLEANUP_TASK,
|
||||
"schedule": timedelta(hours=1),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
"queue": OnyxCeleryQueues.CSV_GENERATION,
|
||||
},
|
||||
]
|
||||
)
|
||||
},
|
||||
]
|
||||
|
||||
ee_tasks_to_schedule: list[dict] = []
|
||||
|
||||
@@ -60,7 +57,7 @@ if not MULTI_TENANT:
|
||||
ee_tasks_to_schedule = [
|
||||
{
|
||||
"name": "autogenerate-usage-report",
|
||||
"task": OnyxCeleryTask.AUTOGENERATE_USAGE_REPORT_TASK,
|
||||
"task": OnyxCeleryTask.GENERATE_USAGE_REPORT_TASK,
|
||||
"schedule": timedelta(days=30), # TODO: change this to config flag
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
|
||||
@@ -6,7 +6,7 @@ from celery import shared_task
|
||||
from ee.onyx.db.query_history import get_all_query_history_export_tasks
|
||||
from onyx.configs.app_configs import JOB_TIMEOUT
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.db.engine import get_session_with_tenant
|
||||
from onyx.db.engine.sql_engine import get_session_with_tenant
|
||||
from onyx.db.enums import TaskStatus
|
||||
from onyx.db.tasks import delete_task_with_id
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
@@ -13,7 +13,7 @@ from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
|
||||
from onyx.configs.constants import OnyxCeleryPriority
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.db.engine import get_all_tenant_ids
|
||||
from onyx.db.engine.tenant_utils import get_all_tenant_ids
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.redis.redis_pool import redis_lock_dump
|
||||
from shared_configs.configs import IGNORED_SYNCING_TENANT_LIST
|
||||
|
||||
@@ -30,6 +30,7 @@ from onyx.background.celery.celery_redis import celery_find_task
|
||||
from onyx.background.celery.celery_redis import celery_get_queue_length
|
||||
from onyx.background.celery.celery_redis import celery_get_queued_task_ids
|
||||
from onyx.background.celery.celery_redis import celery_get_unacked_task_ids
|
||||
from onyx.background.celery.tasks.beat_schedule import CLOUD_BEAT_MULTIPLIER_DEFAULT
|
||||
from onyx.configs.app_configs import JOB_TIMEOUT
|
||||
from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
|
||||
from onyx.configs.constants import CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT
|
||||
@@ -46,9 +47,10 @@ from onyx.connectors.factory import validate_ccpair_for_user
|
||||
from onyx.db.connector import mark_cc_pair_as_permissions_synced
|
||||
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
|
||||
from onyx.db.document import get_document_ids_for_connector_credential_pair
|
||||
from onyx.db.document import get_documents_for_connector_credential_pair_limited_columns
|
||||
from onyx.db.document import upsert_document_by_connector_credential_pair
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.db.engine import get_session_with_tenant
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.db.engine.sql_engine import get_session_with_tenant
|
||||
from onyx.db.enums import AccessType
|
||||
from onyx.db.enums import ConnectorCredentialPairStatus
|
||||
from onyx.db.enums import SyncStatus
|
||||
@@ -57,7 +59,9 @@ from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.db.sync_record import insert_sync_record
|
||||
from onyx.db.sync_record import update_sync_record_status
|
||||
from onyx.db.users import batch_add_ext_perm_user_if_not_exists
|
||||
from onyx.db.utils import DocumentRow
|
||||
from onyx.db.utils import is_retryable_sqlalchemy_error
|
||||
from onyx.db.utils import SortOrder
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.redis.redis_connector import RedisConnector
|
||||
from onyx.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync
|
||||
@@ -73,6 +77,7 @@ from onyx.utils.logger import LoggerContextVars
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.telemetry import optional_telemetry
|
||||
from onyx.utils.telemetry import RecordType
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
@@ -87,6 +92,24 @@ LIGHT_SOFT_TIME_LIMIT = 105
|
||||
LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15
|
||||
|
||||
|
||||
def _get_fence_validation_block_expiration() -> int:
|
||||
"""
|
||||
Compute the expiration time for the fence validation block signal.
|
||||
Base expiration is 300 seconds, multiplied by the beat multiplier only in MULTI_TENANT mode.
|
||||
"""
|
||||
base_expiration = 300 # seconds
|
||||
|
||||
if not MULTI_TENANT:
|
||||
return base_expiration
|
||||
|
||||
try:
|
||||
beat_multiplier = OnyxRuntime.get_beat_multiplier()
|
||||
except Exception:
|
||||
beat_multiplier = CLOUD_BEAT_MULTIPLIER_DEFAULT
|
||||
|
||||
return int(base_expiration * beat_multiplier)
|
||||
|
||||
|
||||
"""Jobs / utils for kicking off doc permissions sync tasks."""
|
||||
|
||||
|
||||
@@ -194,7 +217,11 @@ def check_for_doc_permissions_sync(self: Task, *, tenant_id: str) -> bool | None
|
||||
"Exception while validating permission sync fences"
|
||||
)
|
||||
|
||||
r.set(OnyxRedisSignals.BLOCK_VALIDATE_PERMISSION_SYNC_FENCES, 1, ex=300)
|
||||
r.set(
|
||||
OnyxRedisSignals.BLOCK_VALIDATE_PERMISSION_SYNC_FENCES,
|
||||
1,
|
||||
ex=_get_fence_validation_block_expiration(),
|
||||
)
|
||||
|
||||
# use a lookup table to find active fences. We still have to verify the fence
|
||||
# exists since it is an optimization and not the source of truth.
|
||||
@@ -398,7 +425,7 @@ def connector_permission_sync_generator_task(
|
||||
|
||||
lock: RedisLock = r.lock(
|
||||
OnyxRedisLocks.CONNECTOR_DOC_PERMISSIONS_SYNC_LOCK_PREFIX
|
||||
+ f"_{redis_connector.id}",
|
||||
+ f"_{redis_connector.cc_pair_id}",
|
||||
timeout=CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT,
|
||||
thread_local=False,
|
||||
)
|
||||
@@ -425,6 +452,7 @@ def connector_permission_sync_generator_task(
|
||||
created = validate_ccpair_for_user(
|
||||
cc_pair.connector.id,
|
||||
cc_pair.credential.id,
|
||||
cc_pair.access_type,
|
||||
db_session,
|
||||
enforce_creation=False,
|
||||
)
|
||||
@@ -473,16 +501,31 @@ def connector_permission_sync_generator_task(
|
||||
# this is can be used to determine documents that are "missing" and thus
|
||||
# should no longer be accessible. The decision as to whether we should find
|
||||
# every document during the doc sync process is connector-specific.
|
||||
def fetch_all_existing_docs_fn() -> list[str]:
|
||||
return get_document_ids_for_connector_credential_pair(
|
||||
def fetch_all_existing_docs_fn(
|
||||
sort_order: SortOrder | None = None,
|
||||
) -> list[DocumentRow]:
|
||||
result = get_documents_for_connector_credential_pair_limited_columns(
|
||||
db_session=db_session,
|
||||
connector_id=cc_pair.connector.id,
|
||||
credential_id=cc_pair.credential.id,
|
||||
sort_order=sort_order,
|
||||
)
|
||||
return list(result)
|
||||
|
||||
def fetch_all_existing_docs_ids_fn() -> list[str]:
|
||||
result = get_document_ids_for_connector_credential_pair(
|
||||
db_session=db_session,
|
||||
connector_id=cc_pair.connector.id,
|
||||
credential_id=cc_pair.credential.id,
|
||||
)
|
||||
return result
|
||||
|
||||
doc_sync_func = sync_config.doc_sync_config.doc_sync_func
|
||||
document_external_accesses = doc_sync_func(
|
||||
cc_pair, fetch_all_existing_docs_fn, callback
|
||||
cc_pair,
|
||||
fetch_all_existing_docs_fn,
|
||||
fetch_all_existing_docs_ids_fn,
|
||||
callback,
|
||||
)
|
||||
|
||||
task_logger.info(
|
||||
@@ -597,91 +640,6 @@ def document_update_permissions(
|
||||
return True
|
||||
|
||||
|
||||
# NOTE(rkuo): Deprecating this due to degenerate behavior in Redis from sending
|
||||
# large permissions through celery (over 1MB in size)
|
||||
# @shared_task(
|
||||
# name=OnyxCeleryTask.UPDATE_EXTERNAL_DOCUMENT_PERMISSIONS_TASK,
|
||||
# soft_time_limit=LIGHT_SOFT_TIME_LIMIT,
|
||||
# time_limit=LIGHT_TIME_LIMIT,
|
||||
# max_retries=DOCUMENT_PERMISSIONS_UPDATE_MAX_RETRIES,
|
||||
# bind=True,
|
||||
# )
|
||||
# def update_external_document_permissions_task(
|
||||
# self: Task,
|
||||
# tenant_id: str,
|
||||
# serialized_doc_external_access: dict,
|
||||
# source_string: str,
|
||||
# connector_id: int,
|
||||
# credential_id: int,
|
||||
# ) -> bool:
|
||||
# start = time.monotonic()
|
||||
|
||||
# completion_status = OnyxCeleryTaskCompletionStatus.UNDEFINED
|
||||
|
||||
# document_external_access = DocExternalAccess.from_dict(
|
||||
# serialized_doc_external_access
|
||||
# )
|
||||
# doc_id = document_external_access.doc_id
|
||||
# external_access = document_external_access.external_access
|
||||
|
||||
# try:
|
||||
# with get_session_with_current_tenant() as db_session:
|
||||
# # Add the users to the DB if they don't exist
|
||||
# batch_add_ext_perm_user_if_not_exists(
|
||||
# db_session=db_session,
|
||||
# emails=list(external_access.external_user_emails),
|
||||
# continue_on_error=True,
|
||||
# )
|
||||
# # Then upsert the document's external permissions
|
||||
# created_new_doc = upsert_document_external_perms(
|
||||
# db_session=db_session,
|
||||
# doc_id=doc_id,
|
||||
# external_access=external_access,
|
||||
# source_type=DocumentSource(source_string),
|
||||
# )
|
||||
|
||||
# if created_new_doc:
|
||||
# # If a new document was created, we associate it with the cc_pair
|
||||
# upsert_document_by_connector_credential_pair(
|
||||
# db_session=db_session,
|
||||
# connector_id=connector_id,
|
||||
# credential_id=credential_id,
|
||||
# document_ids=[doc_id],
|
||||
# )
|
||||
|
||||
# elapsed = time.monotonic() - start
|
||||
# task_logger.info(
|
||||
# f"connector_id={connector_id} "
|
||||
# f"doc={doc_id} "
|
||||
# f"action=update_permissions "
|
||||
# f"elapsed={elapsed:.2f}"
|
||||
# )
|
||||
|
||||
# completion_status = OnyxCeleryTaskCompletionStatus.SUCCEEDED
|
||||
# except Exception as e:
|
||||
# error_msg = format_error_for_logging(e)
|
||||
# task_logger.warning(
|
||||
# f"Exception in update_external_document_permissions_task: connector_id={connector_id} doc_id={doc_id} {error_msg}"
|
||||
# )
|
||||
# task_logger.exception(
|
||||
# f"update_external_document_permissions_task exceptioned: "
|
||||
# f"connector_id={connector_id} doc_id={doc_id}"
|
||||
# )
|
||||
# completion_status = OnyxCeleryTaskCompletionStatus.NON_RETRYABLE_EXCEPTION
|
||||
# finally:
|
||||
# task_logger.info(
|
||||
# f"update_external_document_permissions_task completed: status={completion_status.value} doc={doc_id}"
|
||||
# )
|
||||
|
||||
# if completion_status != OnyxCeleryTaskCompletionStatus.SUCCEEDED:
|
||||
# return False
|
||||
|
||||
# task_logger.info(
|
||||
# f"update_external_document_permissions_task finished: connector_id={connector_id} doc_id={doc_id}"
|
||||
# )
|
||||
# return True
|
||||
|
||||
|
||||
def validate_permission_sync_fences(
|
||||
tenant_id: str,
|
||||
r: Redis,
|
||||
|
||||
@@ -20,7 +20,9 @@ from ee.onyx.background.celery.tasks.external_group_syncing.group_sync_utils imp
|
||||
from ee.onyx.db.connector_credential_pair import get_all_auto_sync_cc_pairs
|
||||
from ee.onyx.db.connector_credential_pair import get_cc_pairs_by_source
|
||||
from ee.onyx.db.external_perm import ExternalUserGroup
|
||||
from ee.onyx.db.external_perm import replace_user__ext_group_for_cc_pair
|
||||
from ee.onyx.db.external_perm import mark_old_external_groups_as_stale
|
||||
from ee.onyx.db.external_perm import remove_stale_external_groups
|
||||
from ee.onyx.db.external_perm import upsert_external_groups
|
||||
from ee.onyx.external_permissions.sync_params import (
|
||||
get_all_cc_pair_agnostic_group_sync_sources,
|
||||
)
|
||||
@@ -28,6 +30,7 @@ from ee.onyx.external_permissions.sync_params import get_source_perm_sync_config
|
||||
from onyx.background.celery.apps.app_base import task_logger
|
||||
from onyx.background.celery.celery_redis import celery_find_task
|
||||
from onyx.background.celery.celery_redis import celery_get_unacked_task_ids
|
||||
from onyx.background.celery.tasks.beat_schedule import CLOUD_BEAT_MULTIPLIER_DEFAULT
|
||||
from onyx.background.error_logging import emit_background_error
|
||||
from onyx.configs.app_configs import JOB_TIMEOUT
|
||||
from onyx.configs.constants import CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT
|
||||
@@ -39,9 +42,8 @@ from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.configs.constants import OnyxRedisConstants
|
||||
from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.configs.constants import OnyxRedisSignals
|
||||
from onyx.connectors.exceptions import ConnectorValidationError
|
||||
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.db.enums import AccessType
|
||||
from onyx.db.enums import ConnectorCredentialPairStatus
|
||||
from onyx.db.enums import SyncStatus
|
||||
@@ -56,19 +58,34 @@ from onyx.redis.redis_connector_ext_group_sync import (
|
||||
)
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.redis.redis_pool import get_redis_replica_client
|
||||
from onyx.server.runtime.onyx_runtime import OnyxRuntime
|
||||
from onyx.server.utils import make_short_id
|
||||
from onyx.utils.logger import format_error_for_logging
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
EXTERNAL_GROUPS_UPDATE_MAX_RETRIES = 3
|
||||
_EXTERNAL_GROUP_BATCH_SIZE = 100
|
||||
|
||||
|
||||
# 5 seconds more than RetryDocumentIndex STOP_AFTER+MAX_WAIT
|
||||
LIGHT_SOFT_TIME_LIMIT = 105
|
||||
LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15
|
||||
def _get_fence_validation_block_expiration() -> int:
|
||||
"""
|
||||
Compute the expiration time for the fence validation block signal.
|
||||
Base expiration is 300 seconds, multiplied by the beat multiplier only in MULTI_TENANT mode.
|
||||
"""
|
||||
base_expiration = 300 # seconds
|
||||
|
||||
if not MULTI_TENANT:
|
||||
return base_expiration
|
||||
|
||||
try:
|
||||
beat_multiplier = OnyxRuntime.get_beat_multiplier()
|
||||
except Exception:
|
||||
beat_multiplier = CLOUD_BEAT_MULTIPLIER_DEFAULT
|
||||
|
||||
return int(base_expiration * beat_multiplier)
|
||||
|
||||
|
||||
def _is_external_group_sync_due(cc_pair: ConnectorCredentialPair) -> bool:
|
||||
@@ -198,7 +215,11 @@ def check_for_external_group_sync(self: Task, *, tenant_id: str) -> bool | None:
|
||||
"Exception while validating external group sync fences"
|
||||
)
|
||||
|
||||
r.set(OnyxRedisSignals.BLOCK_VALIDATE_EXTERNAL_GROUP_SYNC_FENCES, 1, ex=300)
|
||||
r.set(
|
||||
OnyxRedisSignals.BLOCK_VALIDATE_EXTERNAL_GROUP_SYNC_FENCES,
|
||||
1,
|
||||
ex=_get_fence_validation_block_expiration(),
|
||||
)
|
||||
except SoftTimeLimitExceeded:
|
||||
task_logger.info(
|
||||
"Soft time limit exceeded, task is being terminated gracefully."
|
||||
@@ -362,7 +383,7 @@ def connector_external_group_sync_generator_task(
|
||||
|
||||
lock: RedisLock = r.lock(
|
||||
OnyxRedisLocks.CONNECTOR_EXTERNAL_GROUP_SYNC_LOCK_PREFIX
|
||||
+ f"_{redis_connector.id}",
|
||||
+ f"_{redis_connector.cc_pair_id}",
|
||||
timeout=CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT,
|
||||
)
|
||||
|
||||
@@ -377,63 +398,12 @@ def connector_external_group_sync_generator_task(
|
||||
payload.started = datetime.now(timezone.utc)
|
||||
redis_connector.external_group_sync.set_fence(payload)
|
||||
|
||||
_perform_external_group_sync(
|
||||
cc_pair_id=cc_pair_id,
|
||||
tenant_id=tenant_id,
|
||||
)
|
||||
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
cc_pair = get_connector_credential_pair_from_id(
|
||||
db_session=db_session,
|
||||
cc_pair_id=cc_pair_id,
|
||||
eager_load_credential=True,
|
||||
)
|
||||
if cc_pair is None:
|
||||
raise ValueError(
|
||||
f"No connector credential pair found for id: {cc_pair_id}"
|
||||
)
|
||||
|
||||
source_type = cc_pair.connector.source
|
||||
sync_config = get_source_perm_sync_config(source_type)
|
||||
if sync_config is None:
|
||||
msg = (
|
||||
f"No sync config found for {source_type} for cc_pair: {cc_pair_id}"
|
||||
)
|
||||
emit_background_error(msg, cc_pair_id=cc_pair_id)
|
||||
raise ValueError(msg)
|
||||
|
||||
if sync_config.group_sync_config is None:
|
||||
msg = f"No group sync config found for {source_type} for cc_pair: {cc_pair_id}"
|
||||
emit_background_error(msg, cc_pair_id=cc_pair_id)
|
||||
raise ValueError(msg)
|
||||
|
||||
ext_group_sync_func = sync_config.group_sync_config.group_sync_func
|
||||
|
||||
logger.info(
|
||||
f"Syncing external groups for {source_type} for cc_pair: {cc_pair_id}"
|
||||
)
|
||||
external_user_groups: list[ExternalUserGroup] = []
|
||||
try:
|
||||
external_user_groups = ext_group_sync_func(tenant_id, cc_pair)
|
||||
except ConnectorValidationError as e:
|
||||
# TODO: add some notification to the admins here
|
||||
logger.exception(
|
||||
f"Error syncing external groups for {source_type} for cc_pair: {cc_pair_id} {e}"
|
||||
)
|
||||
raise e
|
||||
|
||||
logger.info(
|
||||
f"Syncing {len(external_user_groups)} external user groups for {source_type}"
|
||||
)
|
||||
logger.debug(f"New external user groups: {external_user_groups}")
|
||||
|
||||
replace_user__ext_group_for_cc_pair(
|
||||
db_session=db_session,
|
||||
cc_pair_id=cc_pair.id,
|
||||
group_defs=external_user_groups,
|
||||
source=cc_pair.connector.source,
|
||||
)
|
||||
logger.info(
|
||||
f"Synced {len(external_user_groups)} external user groups for {source_type}"
|
||||
)
|
||||
|
||||
mark_all_relevant_cc_pairs_as_external_group_synced(db_session, cc_pair)
|
||||
|
||||
update_sync_record_status(
|
||||
db_session=db_session,
|
||||
entity_id=cc_pair_id,
|
||||
@@ -475,6 +445,81 @@ def connector_external_group_sync_generator_task(
|
||||
)
|
||||
|
||||
|
||||
def _perform_external_group_sync(
|
||||
cc_pair_id: int,
|
||||
tenant_id: str,
|
||||
) -> None:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
cc_pair = get_connector_credential_pair_from_id(
|
||||
db_session=db_session,
|
||||
cc_pair_id=cc_pair_id,
|
||||
eager_load_credential=True,
|
||||
)
|
||||
if cc_pair is None:
|
||||
raise ValueError(f"No connector credential pair found for id: {cc_pair_id}")
|
||||
|
||||
source_type = cc_pair.connector.source
|
||||
sync_config = get_source_perm_sync_config(source_type)
|
||||
if sync_config is None:
|
||||
msg = f"No sync config found for {source_type} for cc_pair: {cc_pair_id}"
|
||||
emit_background_error(msg, cc_pair_id=cc_pair_id)
|
||||
raise ValueError(msg)
|
||||
|
||||
if sync_config.group_sync_config is None:
|
||||
msg = f"No group sync config found for {source_type} for cc_pair: {cc_pair_id}"
|
||||
emit_background_error(msg, cc_pair_id=cc_pair_id)
|
||||
raise ValueError(msg)
|
||||
|
||||
ext_group_sync_func = sync_config.group_sync_config.group_sync_func
|
||||
|
||||
logger.info(
|
||||
f"Marking old external groups as stale for {source_type} for cc_pair: {cc_pair_id}"
|
||||
)
|
||||
mark_old_external_groups_as_stale(db_session, cc_pair_id)
|
||||
|
||||
logger.info(
|
||||
f"Syncing external groups for {source_type} for cc_pair: {cc_pair_id}"
|
||||
)
|
||||
external_user_group_batch: list[ExternalUserGroup] = []
|
||||
try:
|
||||
external_user_group_generator = ext_group_sync_func(tenant_id, cc_pair)
|
||||
for external_user_group in external_user_group_generator:
|
||||
external_user_group_batch.append(external_user_group)
|
||||
if len(external_user_group_batch) >= _EXTERNAL_GROUP_BATCH_SIZE:
|
||||
logger.debug(
|
||||
f"New external user groups: {external_user_group_batch}"
|
||||
)
|
||||
upsert_external_groups(
|
||||
db_session=db_session,
|
||||
cc_pair_id=cc_pair_id,
|
||||
external_groups=external_user_group_batch,
|
||||
source=cc_pair.connector.source,
|
||||
)
|
||||
external_user_group_batch = []
|
||||
|
||||
if external_user_group_batch:
|
||||
logger.debug(f"New external user groups: {external_user_group_batch}")
|
||||
upsert_external_groups(
|
||||
db_session=db_session,
|
||||
cc_pair_id=cc_pair_id,
|
||||
external_groups=external_user_group_batch,
|
||||
source=cc_pair.connector.source,
|
||||
)
|
||||
except Exception as e:
|
||||
# TODO: add some notification to the admins here
|
||||
logger.exception(
|
||||
f"Error syncing external groups for {source_type} for cc_pair: {cc_pair_id} {e}"
|
||||
)
|
||||
raise e
|
||||
|
||||
logger.info(
|
||||
f"Removing stale external groups for {source_type} for cc_pair: {cc_pair_id}"
|
||||
)
|
||||
remove_stale_external_groups(db_session, cc_pair_id)
|
||||
|
||||
mark_all_relevant_cc_pairs_as_external_group_synced(db_session, cc_pair)
|
||||
|
||||
|
||||
def validate_external_group_sync_fences(
|
||||
tenant_id: str,
|
||||
celery_app: Celery,
|
||||
|
||||
@@ -19,7 +19,7 @@ from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
|
||||
from onyx.configs.constants import OnyxCeleryQueues
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.db.engine import get_session_with_shared_schema
|
||||
from onyx.db.engine.sql_engine import get_session_with_shared_schema
|
||||
from onyx.db.models import AvailableTenant
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
|
||||
106
backend/ee/onyx/background/celery/tasks/ttl_management/tasks.py
Normal file
106
backend/ee/onyx/background/celery/tasks/ttl_management/tasks.py
Normal file
@@ -0,0 +1,106 @@
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from uuid import UUID
|
||||
|
||||
from celery import shared_task
|
||||
from celery import Task
|
||||
|
||||
from ee.onyx.background.celery_utils import should_perform_chat_ttl_check
|
||||
from ee.onyx.background.task_name_builders import name_chat_ttl_task
|
||||
from onyx.configs.app_configs import JOB_TIMEOUT
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.db.chat import delete_chat_session
|
||||
from onyx.db.chat import get_chat_sessions_older_than
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.db.enums import TaskStatus
|
||||
from onyx.db.tasks import mark_task_as_finished_with_id
|
||||
from onyx.db.tasks import register_task
|
||||
from onyx.server.settings.store import load_settings
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@shared_task(
    name=OnyxCeleryTask.PERFORM_TTL_MANAGEMENT_TASK,
    ignore_result=True,
    soft_time_limit=JOB_TIMEOUT,
    bind=True,
    trail=False,
)
def perform_ttl_management_task(
    self: Task, retention_limit_days: int, *, tenant_id: str
) -> None:
    """Hard-delete chat sessions that are older than the retention limit.

    The run is registered with status STARTED, every session returned by
    `get_chat_sessions_older_than` is hard-deleted, and the run is then
    marked as finished successfully. If anything raises along the way the
    error is logged, the run is marked as failed and the exception is
    re-raised.

    Args:
        retention_limit_days: Retention limit passed to the session lookup.
        tenant_id: Tenant the task runs for; used to build the task name.

    Raises:
        RuntimeError: If the celery request carries no task id.
    """
    task_id = self.request.id
    if not task_id:
        raise RuntimeError("No task id defined for this task; cannot identify it")

    started_at = datetime.now(tz=timezone.utc)

    # Declared before the try block so that the failure log below can report
    # which session (if any) was being processed when the error occurred.
    user_id: UUID | None = None
    session_id: UUID | None = None
    try:
        with get_session_with_current_tenant() as setup_session:
            # we generally want to move off this, but keeping for now
            register_task(
                db_session=setup_session,
                task_name=name_chat_ttl_task(retention_limit_days, tenant_id),
                task_id=task_id,
                status=TaskStatus.STARTED,
                start_time=started_at,
            )

            expired_sessions = get_chat_sessions_older_than(
                retention_limit_days, setup_session
            )

        for user_id, session_id in expired_sessions:
            # Every deletion runs in its own, freshly opened DB session.
            with get_session_with_current_tenant() as delete_session:
                delete_chat_session(
                    user_id,
                    session_id,
                    delete_session,
                    include_deleted=True,
                    hard_delete=True,
                )

        with get_session_with_current_tenant() as finish_session:
            mark_task_as_finished_with_id(
                db_session=finish_session,
                task_id=task_id,
                success=True,
            )

    except Exception:
        logger.exception(
            "delete_chat_session exceptioned. "
            f"user_id={user_id} session_id={session_id}"
        )
        # Record the failure before propagating the original exception.
        with get_session_with_current_tenant() as failure_session:
            mark_task_as_finished_with_id(
                db_session=failure_session,
                task_id=task_id,
                success=False,
            )
        raise
|
||||
|
||||
|
||||
@shared_task(
    name=OnyxCeleryTask.CHECK_TTL_MANAGEMENT_TASK,
    ignore_result=True,
    soft_time_limit=JOB_TIMEOUT,
)
def check_ttl_management_task(*, tenant_id: str) -> None:
    """Periodic check that enqueues a chat TTL cleanup run when one is due.

    Reads the maximum chat retention from the loaded settings and, if
    `should_perform_chat_ttl_check` agrees, queues
    `perform_ttl_management_task` for the given tenant.
    """
    retention_limit_days = load_settings().maximum_chat_retention_days

    with get_session_with_current_tenant() as db_session:
        if not should_perform_chat_ttl_check(retention_limit_days, db_session):
            return

        perform_ttl_management_task.apply_async(
            kwargs={
                "retention_limit_days": retention_limit_days,
                "tenant_id": tenant_id,
            },
        )
|
||||
@@ -0,0 +1,46 @@
|
||||
from datetime import datetime
|
||||
from uuid import UUID
|
||||
|
||||
from celery import shared_task
|
||||
from celery import Task
|
||||
|
||||
from ee.onyx.server.reporting.usage_export_generation import create_new_usage_report
|
||||
from onyx.configs.app_configs import JOB_TIMEOUT
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@shared_task(
    name=OnyxCeleryTask.GENERATE_USAGE_REPORT_TASK,
    ignore_result=True,
    soft_time_limit=JOB_TIMEOUT,
    bind=True,
    trail=False,
)
def generate_usage_report_task(
    self: Task,
    *,
    tenant_id: str,
    user_id: str | None = None,
    period_from: str | None = None,
    period_to: str | None = None,
) -> None:
    """User-initiated usage report generation task.

    Args:
        tenant_id: Tenant the task is dispatched for.
        user_id: Optional user id as a string; converted to a UUID when set.
        period_from: Optional ISO-format start of the reporting period.
        period_to: Optional ISO-format end of the reporting period.

    A period is only passed on when both bounds are provided; otherwise the
    report is created with `period=None`.
    """
    period = (
        (datetime.fromisoformat(period_from), datetime.fromisoformat(period_to))
        if period_from and period_to
        else None
    )

    with get_session_with_current_tenant() as db_session:
        user_uuid = UUID(user_id) if user_id else None
        create_new_usage_report(
            db_session=db_session,
            user_id=user_uuid,
            period=period,
        )
|
||||
@@ -1,38 +0,0 @@
|
||||
from ee.onyx.server.query_and_chat.models import OneShotQAResponse
|
||||
from onyx.chat.models import AllCitations
|
||||
from onyx.chat.models import LLMRelevanceFilterResponse
|
||||
from onyx.chat.models import OnyxAnswerPiece
|
||||
from onyx.chat.models import QADocsResponse
|
||||
from onyx.chat.models import StreamingError
|
||||
from onyx.chat.process_message import ChatPacketStream
|
||||
from onyx.server.query_and_chat.models import ChatMessageDetail
|
||||
from onyx.utils.timing import log_function_time
|
||||
|
||||
|
||||
@log_function_time()
def gather_stream_for_answer_api(
    packets: ChatPacketStream,
) -> OneShotQAResponse:
    """Collapse a stream of chat packets into a single `OneShotQAResponse`.

    Each packet type fills one part of the response:
    answer pieces are concatenated into `answer`, a `QADocsResponse` sets
    `docs` (and `rephrase`), a `StreamingError` sets `error_msg`, a
    `ChatMessageDetail` sets `chat_message_id`, an
    `LLMRelevanceFilterResponse` sets `llm_selected_doc_indices` and
    `AllCitations` sets `citations`. Packets of any other type are ignored.

    Args:
        packets: Iterable of packets produced by the chat pipeline.

    Returns:
        The populated response; `answer` is left untouched when no non-empty
        answer piece was seen.
    """
    response = OneShotQAResponse()

    # Collect the pieces and join once at the end instead of growing a string
    # with `+=` on every packet.
    answer_pieces: list[str] = []
    for packet in packets:
        if isinstance(packet, OnyxAnswerPiece) and packet.answer_piece:
            answer_pieces.append(packet.answer_piece)
        elif isinstance(packet, QADocsResponse):
            response.docs = packet
            # Extraneous, provided for backwards compatibility
            response.rephrase = packet.rephrased_query
        elif isinstance(packet, StreamingError):
            response.error_msg = packet.error
        elif isinstance(packet, ChatMessageDetail):
            response.chat_message_id = packet.message_id
        elif isinstance(packet, LLMRelevanceFilterResponse):
            response.llm_selected_doc_indices = packet.llm_selected_doc_indices
        elif isinstance(packet, AllCitations):
            response.citations = packet.citations

    # Only non-empty pieces are collected, so a non-empty list means a
    # non-empty answer.
    if answer_pieces:
        response.answer = "".join(answer_pieces)

    return response
|
||||
@@ -53,6 +53,16 @@ CONFLUENCE_ANONYMOUS_ACCESS_IS_PUBLIC = (
|
||||
)
|
||||
|
||||
|
||||
#####
|
||||
# JIRA
|
||||
#####
|
||||
|
||||
# In seconds, default is 30 minutes
|
||||
JIRA_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
os.environ.get("JIRA_PERMISSION_DOC_SYNC_FREQUENCY") or 30 * 60
|
||||
)
|
||||
|
||||
|
||||
#####
|
||||
# Google Drive
|
||||
#####
|
||||
@@ -61,6 +71,19 @@ GOOGLE_DRIVE_PERMISSION_GROUP_SYNC_FREQUENCY = int(
|
||||
)
|
||||
|
||||
|
||||
#####
|
||||
# GitHub
|
||||
#####
|
||||
# In seconds, default is 5 minutes
|
||||
GITHUB_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
os.environ.get("GITHUB_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
|
||||
)
|
||||
# In seconds, default is 5 minutes
|
||||
GITHUB_PERMISSION_GROUP_SYNC_FREQUENCY = int(
|
||||
os.environ.get("GITHUB_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
|
||||
)
|
||||
|
||||
|
||||
#####
|
||||
# Slack
|
||||
#####
|
||||
@@ -71,6 +94,28 @@ SLACK_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
NUM_PERMISSION_WORKERS = int(os.environ.get("NUM_PERMISSION_WORKERS") or 2)
|
||||
|
||||
|
||||
#####
|
||||
# Teams
|
||||
#####
|
||||
# In seconds, default is 5 minutes
|
||||
TEAMS_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
os.environ.get("TEAMS_PERMISSION_DOC_SYNC_FREQUENCY") or 5 * 60
|
||||
)
|
||||
|
||||
#####
|
||||
# SharePoint
|
||||
#####
|
||||
# In seconds, default is 30 minutes
|
||||
SHAREPOINT_PERMISSION_DOC_SYNC_FREQUENCY = int(
|
||||
os.environ.get("SHAREPOINT_PERMISSION_DOC_SYNC_FREQUENCY") or 30 * 60
|
||||
)
|
||||
|
||||
# In seconds, default is 5 minutes
|
||||
SHAREPOINT_PERMISSION_GROUP_SYNC_FREQUENCY = int(
|
||||
os.environ.get("SHAREPOINT_PERMISSION_GROUP_SYNC_FREQUENCY") or 5 * 60
|
||||
)
|
||||
|
||||
|
||||
####
|
||||
# Celery Job Frequency
|
||||
####
|
||||
|
||||
28
backend/ee/onyx/connectors/perm_sync_valid.py
Normal file
28
backend/ee/onyx/connectors/perm_sync_valid.py
Normal file
@@ -0,0 +1,28 @@
|
||||
from onyx.connectors.confluence.connector import ConfluenceConnector
|
||||
from onyx.connectors.google_drive.connector import GoogleDriveConnector
|
||||
from onyx.connectors.interfaces import BaseConnector
|
||||
|
||||
|
||||
def validate_confluence_perm_sync(connector: ConfluenceConnector) -> None:
    """Check that a Confluence connector is set up for permission syncing.

    Currently a placeholder: no checks are performed and nothing is raised.
    """
|
||||
|
||||
|
||||
def validate_drive_perm_sync(connector: GoogleDriveConnector) -> None:
    """Check that a Google Drive connector is set up for permission syncing.

    Currently a placeholder: no checks are performed and nothing is raised.
    """
|
||||
|
||||
|
||||
def validate_perm_sync(connector: BaseConnector) -> None:
    """Run the permission-sync validation that matches the connector type.

    Confluence and Google Drive connectors are forwarded to their dedicated
    validators, which are expected to raise if the configuration is invalid.
    Every other connector type is accepted without any check (no-op).
    """
    if isinstance(connector, ConfluenceConnector):
        validate_confluence_perm_sync(connector)
        return

    if isinstance(connector, GoogleDriveConnector):
        validate_drive_perm_sync(connector)
|
||||
@@ -4,6 +4,7 @@ from uuid import UUID
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import delete
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import update
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.access.utils import build_ext_group_name_for_onyx
|
||||
@@ -62,20 +63,41 @@ def delete_public_external_group_for_cc_pair__no_commit(
|
||||
)
|
||||
|
||||
|
||||
def replace_user__ext_group_for_cc_pair(
|
||||
def mark_old_external_groups_as_stale(
|
||||
db_session: Session,
|
||||
cc_pair_id: int,
|
||||
group_defs: list[ExternalUserGroup],
|
||||
) -> None:
|
||||
db_session.execute(
|
||||
update(User__ExternalUserGroupId)
|
||||
.where(User__ExternalUserGroupId.cc_pair_id == cc_pair_id)
|
||||
.values(stale=True)
|
||||
)
|
||||
db_session.execute(
|
||||
update(PublicExternalUserGroup)
|
||||
.where(PublicExternalUserGroup.cc_pair_id == cc_pair_id)
|
||||
.values(stale=True)
|
||||
)
|
||||
|
||||
|
||||
def upsert_external_groups(
|
||||
db_session: Session,
|
||||
cc_pair_id: int,
|
||||
external_groups: list[ExternalUserGroup],
|
||||
source: DocumentSource,
|
||||
) -> None:
|
||||
"""
|
||||
This function clears all existing external user group relations for a given cc_pair_id
|
||||
and replaces them with the new group definitions and commits the changes.
|
||||
Performs a true upsert operation for external user groups:
|
||||
- For existing groups (same user_id, external_user_group_id, cc_pair_id), updates the stale flag to False
|
||||
- For new groups, inserts them with stale=False
|
||||
- For public groups, uses upsert logic as well
|
||||
"""
|
||||
# If there are no groups to add, return early
|
||||
if not external_groups:
|
||||
return
|
||||
|
||||
# collect all emails from all groups to batch add all users at once for efficiency
|
||||
all_group_member_emails = set()
|
||||
for external_group in group_defs:
|
||||
for external_group in external_groups:
|
||||
for user_email in external_group.user_emails:
|
||||
all_group_member_emails.add(user_email)
|
||||
|
||||
@@ -86,26 +108,17 @@ def replace_user__ext_group_for_cc_pair(
|
||||
emails=list(all_group_member_emails),
|
||||
)
|
||||
|
||||
delete_user__ext_group_for_cc_pair__no_commit(
|
||||
db_session=db_session,
|
||||
cc_pair_id=cc_pair_id,
|
||||
)
|
||||
delete_public_external_group_for_cc_pair__no_commit(
|
||||
db_session=db_session,
|
||||
cc_pair_id=cc_pair_id,
|
||||
)
|
||||
|
||||
# map emails to ids
|
||||
email_id_map = {user.email: user.id for user in all_group_members}
|
||||
email_id_map = {user.email.lower(): user.id for user in all_group_members}
|
||||
|
||||
# use these ids to create new external user group relations relating group_id to user_ids
|
||||
new_external_permissions: list[User__ExternalUserGroupId] = []
|
||||
new_public_external_groups: list[PublicExternalUserGroup] = []
|
||||
for external_group in group_defs:
|
||||
# Process each external group
|
||||
for external_group in external_groups:
|
||||
external_group_id = build_ext_group_name_for_onyx(
|
||||
ext_group_name=external_group.id,
|
||||
source=source,
|
||||
)
|
||||
|
||||
# Handle user-group mappings
|
||||
for user_email in external_group.user_emails:
|
||||
user_id = email_id_map.get(user_email.lower())
|
||||
if user_id is None:
|
||||
@@ -114,24 +127,71 @@ def replace_user__ext_group_for_cc_pair(
|
||||
f" with email {user_email} not found"
|
||||
)
|
||||
continue
|
||||
new_external_permissions.append(
|
||||
User__ExternalUserGroupId(
|
||||
|
||||
# Check if the user-group mapping already exists
|
||||
existing_user_group = db_session.scalar(
|
||||
select(User__ExternalUserGroupId).where(
|
||||
User__ExternalUserGroupId.user_id == user_id,
|
||||
User__ExternalUserGroupId.external_user_group_id
|
||||
== external_group_id,
|
||||
User__ExternalUserGroupId.cc_pair_id == cc_pair_id,
|
||||
)
|
||||
)
|
||||
|
||||
if existing_user_group:
|
||||
# Update existing record
|
||||
existing_user_group.stale = False
|
||||
else:
|
||||
# Insert new record
|
||||
new_user_group = User__ExternalUserGroupId(
|
||||
user_id=user_id,
|
||||
external_user_group_id=external_group_id,
|
||||
cc_pair_id=cc_pair_id,
|
||||
stale=False,
|
||||
)
|
||||
db_session.add(new_user_group)
|
||||
|
||||
# Handle public group if needed
|
||||
if external_group.gives_anyone_access:
|
||||
# Check if the public group already exists
|
||||
existing_public_group = db_session.scalar(
|
||||
select(PublicExternalUserGroup).where(
|
||||
PublicExternalUserGroup.external_user_group_id == external_group_id,
|
||||
PublicExternalUserGroup.cc_pair_id == cc_pair_id,
|
||||
)
|
||||
)
|
||||
|
||||
if external_group.gives_anyone_access:
|
||||
new_public_external_groups.append(
|
||||
PublicExternalUserGroup(
|
||||
if existing_public_group:
|
||||
# Update existing record
|
||||
existing_public_group.stale = False
|
||||
else:
|
||||
# Insert new record
|
||||
new_public_group = PublicExternalUserGroup(
|
||||
external_user_group_id=external_group_id,
|
||||
cc_pair_id=cc_pair_id,
|
||||
stale=False,
|
||||
)
|
||||
)
|
||||
db_session.add(new_public_group)
|
||||
|
||||
db_session.add_all(new_external_permissions)
|
||||
db_session.add_all(new_public_external_groups)
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def remove_stale_external_groups(
    db_session: Session,
    cc_pair_id: int,
) -> None:
    """Delete the cc_pair's external group rows still flagged stale and commit.

    Both the user-to-group mappings and the public external groups that
    belong to `cc_pair_id` and have `stale` set to True are removed.
    """
    delete_stale_user_groups = delete(User__ExternalUserGroupId).where(
        User__ExternalUserGroupId.cc_pair_id == cc_pair_id,
        User__ExternalUserGroupId.stale.is_(True),
    )
    delete_stale_public_groups = delete(PublicExternalUserGroup).where(
        PublicExternalUserGroup.cc_pair_id == cc_pair_id,
        PublicExternalUserGroup.stale.is_(True),
    )

    # User mappings first, then public groups; a single commit covers both.
    for statement in (delete_stale_user_groups, delete_stale_public_groups):
        db_session.execute(statement)
    db_session.commit()
|
||||
|
||||
|
||||
|
||||
@@ -114,12 +114,24 @@ def get_all_usage_reports(db_session: Session) -> list[UsageReportMetadata]:
|
||||
|
||||
|
||||
def get_usage_report_data(
|
||||
db_session: Session,
|
||||
report_name: str,
|
||||
report_display_name: str,
|
||||
) -> IO:
|
||||
file_store = get_default_file_store(db_session)
|
||||
"""
|
||||
Get the usage report data from the file store.
|
||||
|
||||
Args:
|
||||
db_session: The database session.
|
||||
report_display_name: The display name of the usage report. Also assumes
|
||||
that the file is stored with this as the ID in the file store.
|
||||
|
||||
Returns:
|
||||
The usage report data.
|
||||
"""
|
||||
file_store = get_default_file_store()
|
||||
# usage report may be very large, so don't load it all into memory
|
||||
return file_store.read_file(file_name=report_name, mode="b", use_tempfile=True)
|
||||
return file_store.read_file(
|
||||
file_id=report_display_name, mode="b", use_tempfile=True
|
||||
)
|
||||
|
||||
|
||||
def write_usage_report(
|
||||
|
||||
@@ -128,11 +128,14 @@ def validate_object_creation_for_user(
|
||||
target_group_ids: list[int] | None = None,
|
||||
object_is_public: bool | None = None,
|
||||
object_is_perm_sync: bool | None = None,
|
||||
object_is_owned_by_user: bool = False,
|
||||
object_is_new: bool = False,
|
||||
) -> None:
|
||||
"""
|
||||
All users can create/edit permission synced objects if they don't specify a group
|
||||
All admin actions are allowed.
|
||||
Prevents non-admins from creating/editing:
|
||||
Curators and global curators can create public objects.
|
||||
Prevents other non-admins from creating/editing:
|
||||
- public objects
|
||||
- objects with no groups
|
||||
- objects that belong to a group they don't curate
|
||||
@@ -143,13 +146,23 @@ def validate_object_creation_for_user(
|
||||
if not user or user.role == UserRole.ADMIN:
|
||||
return
|
||||
|
||||
if object_is_public:
|
||||
detail = "User does not have permission to create public credentials"
|
||||
# Allow curators and global curators to create public objects
|
||||
# w/o associated groups IF the object is new/owned by them
|
||||
if (
|
||||
object_is_public
|
||||
and user.role in [UserRole.CURATOR, UserRole.GLOBAL_CURATOR]
|
||||
and (object_is_new or object_is_owned_by_user)
|
||||
):
|
||||
return
|
||||
|
||||
if object_is_public and user.role == UserRole.BASIC:
|
||||
detail = "User does not have permission to create public objects"
|
||||
logger.error(detail)
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=detail,
|
||||
)
|
||||
|
||||
if not target_group_ids:
|
||||
detail = "Curators must specify 1+ groups"
|
||||
logger.error(detail)
|
||||
|
||||
@@ -18,9 +18,9 @@
|
||||
<!-- <document type="danswer_chunk" mode="index" /> -->
|
||||
{{ document_elements }}
|
||||
</documents>
|
||||
<nodes count="75">
|
||||
<resources vcpu="8.0" memory="64.0Gb" architecture="arm64" storage-type="local"
|
||||
disk="474.0Gb" />
|
||||
<nodes count="60">
|
||||
<resources vcpu="8.0" memory="128.0Gb" architecture="arm64" storage-type="local"
|
||||
disk="475.0Gb" />
|
||||
</nodes>
|
||||
<engine>
|
||||
<proton>
|
||||
|
||||
@@ -6,11 +6,12 @@ https://confluence.atlassian.com/conf85/check-who-can-view-a-page-1283360557.htm
|
||||
from collections.abc import Generator
|
||||
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
|
||||
from ee.onyx.external_permissions.utils import generic_doc_sync
|
||||
from onyx.access.models import DocExternalAccess
|
||||
from onyx.access.models import ExternalAccess
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.confluence.connector import ConfluenceConnector
|
||||
from onyx.connectors.credentials_provider import OnyxDBCredentialsProvider
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
@@ -19,9 +20,13 @@ from shared_configs.contextvars import get_current_tenant_id
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
CONFLUENCE_DOC_SYNC_LABEL = "confluence_doc_sync"
|
||||
|
||||
|
||||
def confluence_doc_sync(
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
|
||||
callback: IndexingHeartbeatInterface | None,
|
||||
) -> Generator[DocExternalAccess, None, None]:
|
||||
"""
|
||||
@@ -29,7 +34,6 @@ def confluence_doc_sync(
|
||||
Compares fetched documents against existing documents in the DB for the connector.
|
||||
If a document exists in the DB but not in the Confluence fetch, it's marked as restricted.
|
||||
"""
|
||||
logger.info(f"Starting confluence doc sync for CC Pair ID: {cc_pair.id}")
|
||||
confluence_connector = ConfluenceConnector(
|
||||
**cc_pair.connector.connector_specific_config
|
||||
)
|
||||
@@ -39,52 +43,11 @@ def confluence_doc_sync(
|
||||
)
|
||||
confluence_connector.set_credentials_provider(provider)
|
||||
|
||||
slim_docs: list[SlimDocument] = []
|
||||
logger.info("Fetching all slim documents from confluence")
|
||||
for doc_batch in confluence_connector.retrieve_all_slim_documents(
|
||||
callback=callback
|
||||
):
|
||||
logger.info(f"Got {len(doc_batch)} slim documents from confluence")
|
||||
if callback:
|
||||
if callback.should_stop():
|
||||
raise RuntimeError("confluence_doc_sync: Stop signal detected")
|
||||
|
||||
callback.progress("confluence_doc_sync", 1)
|
||||
|
||||
slim_docs.extend(doc_batch)
|
||||
|
||||
# Find documents that are no longer accessible in Confluence
|
||||
logger.info(f"Querying existing document IDs for CC Pair ID: {cc_pair.id}")
|
||||
existing_doc_ids = fetch_all_existing_docs_fn()
|
||||
|
||||
# Find missing doc IDs
|
||||
fetched_doc_ids = {doc.id for doc in slim_docs}
|
||||
missing_doc_ids = set(existing_doc_ids) - fetched_doc_ids
|
||||
|
||||
# Yield access removal for missing docs. Better to be safe.
|
||||
if missing_doc_ids:
|
||||
logger.warning(
|
||||
f"Found {len(missing_doc_ids)} documents that are in the DB but "
|
||||
"not present in Confluence fetch. Making them inaccessible."
|
||||
)
|
||||
for missing_id in missing_doc_ids:
|
||||
logger.warning(f"Removing access for document ID: {missing_id}")
|
||||
yield DocExternalAccess(
|
||||
doc_id=missing_id,
|
||||
external_access=ExternalAccess(
|
||||
external_user_emails=set(),
|
||||
external_user_group_ids=set(),
|
||||
is_public=False,
|
||||
),
|
||||
)
|
||||
|
||||
for doc in slim_docs:
|
||||
if not doc.external_access:
|
||||
raise RuntimeError(f"No external access found for document ID: {doc.id}")
|
||||
|
||||
yield DocExternalAccess(
|
||||
doc_id=doc.id,
|
||||
external_access=doc.external_access,
|
||||
)
|
||||
|
||||
logger.info("Finished confluence doc sync")
|
||||
yield from generic_doc_sync(
|
||||
cc_pair=cc_pair,
|
||||
fetch_all_existing_docs_ids_fn=fetch_all_existing_docs_ids_fn,
|
||||
callback=callback,
|
||||
doc_source=DocumentSource.CONFLUENCE,
|
||||
slim_connector=confluence_connector,
|
||||
label=CONFLUENCE_DOC_SYNC_LABEL,
|
||||
)
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
from collections.abc import Generator
|
||||
|
||||
from ee.onyx.db.external_perm import ExternalUserGroup
|
||||
from ee.onyx.external_permissions.confluence.constants import ALL_CONF_EMAILS_GROUP_NAME
|
||||
from onyx.background.error_logging import emit_background_error
|
||||
@@ -65,7 +67,7 @@ def _build_group_member_email_map(
|
||||
def confluence_group_sync(
|
||||
tenant_id: str,
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
) -> list[ExternalUserGroup]:
|
||||
) -> Generator[ExternalUserGroup, None, None]:
|
||||
provider = OnyxDBCredentialsProvider(tenant_id, "confluence", cc_pair.credential_id)
|
||||
is_cloud = cc_pair.connector.connector_specific_config.get("is_cloud", False)
|
||||
wiki_base: str = cc_pair.connector.connector_specific_config["wiki_base"]
|
||||
@@ -89,10 +91,10 @@ def confluence_group_sync(
|
||||
confluence_client=confluence_client,
|
||||
cc_pair_id=cc_pair.id,
|
||||
)
|
||||
onyx_groups: list[ExternalUserGroup] = []
|
||||
|
||||
all_found_emails = set()
|
||||
for group_id, group_member_emails in group_member_email_map.items():
|
||||
onyx_groups.append(
|
||||
yield (
|
||||
ExternalUserGroup(
|
||||
id=group_id,
|
||||
user_emails=list(group_member_emails),
|
||||
@@ -107,6 +109,4 @@ def confluence_group_sync(
|
||||
id=ALL_CONF_EMAILS_GROUP_NAME,
|
||||
user_emails=list(all_found_emails),
|
||||
)
|
||||
onyx_groups.append(all_found_group)
|
||||
|
||||
return onyx_groups
|
||||
yield all_found_group
|
||||
|
||||
294
backend/ee/onyx/external_permissions/github/doc_sync.py
Normal file
294
backend/ee/onyx/external_permissions/github/doc_sync.py
Normal file
@@ -0,0 +1,294 @@
|
||||
import json
|
||||
from collections.abc import Generator
|
||||
|
||||
from github import Github
|
||||
from github.Repository import Repository
|
||||
|
||||
from ee.onyx.external_permissions.github.utils import fetch_repository_team_slugs
|
||||
from ee.onyx.external_permissions.github.utils import form_collaborators_group_id
|
||||
from ee.onyx.external_permissions.github.utils import form_organization_group_id
|
||||
from ee.onyx.external_permissions.github.utils import (
|
||||
form_outside_collaborators_group_id,
|
||||
)
|
||||
from ee.onyx.external_permissions.github.utils import get_external_access_permission
|
||||
from ee.onyx.external_permissions.github.utils import get_repository_visibility
|
||||
from ee.onyx.external_permissions.github.utils import GitHubVisibility
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
|
||||
from onyx.access.models import DocExternalAccess
|
||||
from onyx.access.utils import build_ext_group_name_for_onyx
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.github.connector import DocMetadata
|
||||
from onyx.connectors.github.connector import GithubConnector
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.db.utils import DocumentRow
|
||||
from onyx.db.utils import SortOrder
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
GITHUB_DOC_SYNC_LABEL = "github_doc_sync"
|
||||
|
||||
|
||||
def github_doc_sync(
    cc_pair: ConnectorCredentialPair,
    fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
    fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
    callback: IndexingHeartbeatInterface | None = None,
) -> Generator[DocExternalAccess, None, None]:
    """
    Sync GitHub documents with external access permissions.

    This function checks each repository for visibility/team changes and updates
    document permissions accordingly without using checkpoints.

    Args:
        cc_pair: Connector/credential pair whose GitHub documents are synced.
        fetch_all_existing_docs_fn: Returns the already indexed documents;
            called with ascending sort order.
        fetch_all_existing_docs_ids_fn: Not used by this implementation; kept
            for the shared doc-sync signature.
        callback: Optional heartbeat; progress is reported once per yielded
            document.

    Yields:
        A `DocExternalAccess` for every document of a repository whose
        permissions were detected as changed.

    Raises:
        ValueError: If the connector has no GitHub client after loading
            credentials.
        Exception: Any error raised while listing repositories is re-raised.
            Errors while processing a single repository are logged and that
            repository is skipped.
    """
    logger.info(f"Starting GitHub document sync for CC pair ID: {cc_pair.id}")

    # Initialize GitHub connector with credentials
    github_connector: GithubConnector = GithubConnector(
        **cc_pair.connector.connector_specific_config
    )

    github_connector.load_credentials(cc_pair.credential.credential_json)
    logger.info("GitHub connector credentials loaded successfully")

    if not github_connector.github_client:
        logger.error("GitHub client initialization failed")
        raise ValueError("github_client is required")

    # Get all repositories from GitHub API
    logger.info("Fetching all repositories from GitHub API")
    try:
        if github_connector.repositories:
            if "," in github_connector.repositories:
                # Multiple repositories specified
                repos = github_connector.get_github_repos(
                    github_connector.github_client
                )
            else:
                # Single repository
                repos = [
                    github_connector.get_github_repo(github_connector.github_client)
                ]
        else:
            # All repositories
            repos = github_connector.get_all_repos(github_connector.github_client)

        logger.info(f"Found {len(repos)} repositories to check")
    except Exception as e:
        # logger.exception keeps the traceback alongside the message.
        logger.exception(f"Failed to fetch repositories: {e}")
        raise

    repo_to_doc_list_map: dict[str, list[DocumentRow]] = {}
    # sort order is ascending because we want to get the oldest documents first
    existing_docs: list[DocumentRow] = fetch_all_existing_docs_fn(
        sort_order=SortOrder.ASC
    )
    logger.info(f"Found {len(existing_docs)} documents to check")
    for doc in existing_docs:
        try:
            doc_metadata = DocMetadata.model_validate_json(json.dumps(doc.doc_metadata))
            repo_to_doc_list_map.setdefault(doc_metadata.repo, []).append(doc)
        except Exception as e:
            # A document with unparsable metadata is skipped, not fatal.
            logger.error(f"Failed to parse doc metadata: {e} for doc {doc.id}")
            continue
    # The map is keyed by repository, so this is a repository count.
    logger.info(
        f"Found documents for {len(repo_to_doc_list_map)} repositories to check"
    )

    # Process each repository individually
    for repo in repos:
        try:
            logger.info(f"Processing repository: {repo.id} (name: {repo.name})")
            repo_doc_list: list[DocumentRow] = repo_to_doc_list_map.get(
                repo.full_name, []
            )
            if not repo_doc_list:
                logger.warning(
                    f"No documents found for repository {repo.id} ({repo.name})"
                )
                continue

            # The first (oldest) document's group IDs are used as the sample
            # of the permissions currently stored for the whole repository.
            current_external_group_ids = repo_doc_list[0].external_user_group_ids or []
            # Check if repository has any permission changes
            has_changes = _check_repository_for_changes(
                repo=repo,
                github_client=github_connector.github_client,
                current_external_group_ids=current_external_group_ids,
            )

            if has_changes:
                logger.info(
                    f"Repository {repo.id} ({repo.name}) has changes, updating documents"
                )

                # Get new external access permissions for this repository
                new_external_access = get_external_access_permission(
                    repo, github_connector.github_client
                )

                logger.info(
                    f"Found {len(repo_doc_list)} documents for repository {repo.full_name}"
                )

                # Yield updated external access for each document
                for doc in repo_doc_list:
                    if callback:
                        callback.progress(GITHUB_DOC_SYNC_LABEL, 1)

                    yield DocExternalAccess(
                        doc_id=doc.id,
                        external_access=new_external_access,
                    )
            else:
                logger.info(
                    f"Repository {repo.id} ({repo.name}) has no changes, skipping"
                )
        except Exception as e:
            # Best effort: one failing repository must not abort the others,
            # but keep the traceback so the failure can be diagnosed.
            logger.exception(
                f"Error processing repository {repo.id} ({repo.name}): {e}"
            )

    logger.info(f"GitHub document sync completed for CC pair ID: {cc_pair.id}")
|
||||
|
||||
|
||||
def _check_repository_for_changes(
    repo: Repository,
    github_client: Github,
    current_external_group_ids: list[str],
) -> bool:
    """
    Report whether the repository's permissions differ from what is stored.

    Two checks run in order: first a visibility comparison, then (only when
    the repository is currently private) a team comparison.

    Args:
        repo: GitHub repository to inspect.
        github_client: Client forwarded to the team comparison.
        current_external_group_ids: Group IDs taken from an existing document
            of this repository.

    Returns:
        True as soon as either check reports a change, otherwise False.
    """
    logger.info(f"Checking repository {repo.id} ({repo.name}) for changes")

    # Visibility is checked first; it only needs the stored group IDs.
    visibility_changed = _is_repo_visibility_changed_from_groups(
        repo=repo,
        current_external_group_ids=current_external_group_ids,
    )
    if visibility_changed:
        logger.info(f"Repository {repo.id} ({repo.name}) has visibility changes")
        return True

    # Teams are only compared for private repositories.
    is_private = get_repository_visibility(repo) == GitHubVisibility.PRIVATE
    if is_private:
        teams_changed = _teams_updated_from_groups(
            repo=repo,
            github_client=github_client,
            current_external_group_ids=current_external_group_ids,
        )
        if teams_changed:
            logger.info(f"Repository {repo.id} ({repo.name}) has team changes")
            return True

    logger.info(f"Repository {repo.id} ({repo.name}) has no changes")
    return False
|
||||
|
||||
|
||||
def _is_repo_visibility_changed_from_groups(
    repo: Repository,
    current_external_group_ids: list[str],
) -> bool:
    """
    Compare the repository's live visibility with the one implied by stored groups.

    The stored visibility is inferred as follows: the collaborators group being
    present means PRIVATE, otherwise the organization group being present means
    INTERNAL, otherwise PUBLIC.

    Args:
        repo: GitHub repository object
        current_external_group_ids: List of external group IDs from existing document

    Returns:
        True if the inferred visibility differs from the live visibility
    """
    live_visibility = get_repository_visibility(repo)
    logger.info(f"Current repository visibility: {live_visibility.value}")

    # Prefixed group IDs that a stored document would carry.
    collaborators_gid = build_ext_group_name_for_onyx(
        source=DocumentSource.GITHUB,
        ext_group_name=form_collaborators_group_id(repo.id),
    )
    org_gid = (
        build_ext_group_name_for_onyx(
            source=DocumentSource.GITHUB,
            ext_group_name=form_organization_group_id(repo.organization.id),
        )
        if repo.organization
        else None
    )

    # Infer the previously stored visibility from which marker group is present.
    if collaborators_gid in current_external_group_ids:
        stored_visibility = GitHubVisibility.PRIVATE
    elif org_gid and org_gid in current_external_group_ids:
        stored_visibility = GitHubVisibility.INTERNAL
    else:
        stored_visibility = GitHubVisibility.PUBLIC

    logger.info(f"Inferred existing visibility: {stored_visibility.value}")

    if stored_visibility == live_visibility:
        return False

    logger.info(
        f"Visibility changed for repo {repo.id} ({repo.name}): "
        f"{stored_visibility.value} -> {live_visibility.value}"
    )
    return True
|
||||
|
||||
|
||||
def _teams_updated_from_groups(
    repo: Repository,
    github_client: Github,
    current_external_group_ids: list[str],
) -> bool:
    """
    Report whether the set of teams on the repository differs from the stored one.

    Args:
        repo: GitHub repository to inspect.
        github_client: Client used to fetch the repository's team slugs.
        current_external_group_ids: Prefixed group IDs from an existing document.

    Returns:
        True if the prefixed live team IDs are not equal to the stored team IDs.
    """
    live_team_slugs = fetch_repository_team_slugs(
        repo=repo, github_client=github_client
    )
    logger.info(
        f"Current teams for repository {repo.id} (name: {repo.name}): {live_team_slugs}"
    )

    # The two collaborator groups are not teams and must not take part in the diff.
    non_team_group_ids = {
        build_ext_group_name_for_onyx(
            source=DocumentSource.GITHUB,
            ext_group_name=form_collaborators_group_id(repo.id),
        ),
        build_ext_group_name_for_onyx(
            source=DocumentSource.GITHUB,
            ext_group_name=form_outside_collaborators_group_id(repo.id),
        ),
    }

    # Everything stored that is not a collaborator group is treated as a team.
    stored_team_ids = {
        group_id
        for group_id in current_external_group_ids
        if group_id not in non_team_group_ids
    }

    # Stored IDs are already prefixed; live slugs are raw, so prefix them
    # before comparing.
    live_team_ids = {
        build_ext_group_name_for_onyx(
            source=DocumentSource.GITHUB,
            ext_group_name=slug,
        )
        for slug in live_team_slugs
    }

    logger.info(
        f"Existing team IDs: {stored_team_ids}, Current team IDs: {live_team_ids}"
    )

    if live_team_ids == stored_team_ids:
        return False

    logger.info(
        f"Team changes detected for repo {repo.id} (name: {repo.name}): "
        f"existing={stored_team_ids}, current={live_team_ids}"
    )
    return True
|
||||
46
backend/ee/onyx/external_permissions/github/group_sync.py
Normal file
46
backend/ee/onyx/external_permissions/github/group_sync.py
Normal file
@@ -0,0 +1,46 @@
|
||||
from collections.abc import Generator
|
||||
|
||||
from github import Repository
|
||||
|
||||
from ee.onyx.db.external_perm import ExternalUserGroup
|
||||
from ee.onyx.external_permissions.github.utils import get_external_user_group
|
||||
from onyx.connectors.github.connector import GithubConnector
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def github_group_sync(
    tenant_id: str,
    cc_pair: ConnectorCredentialPair,
) -> Generator[ExternalUserGroup, None, None]:
    """
    Yield the external user groups of every repository the connector covers.

    Args:
        tenant_id: Tenant identifier; not read inside this function.
        cc_pair: Connector/credential pair supplying the connector config and
            the GitHub credentials.

    Yields:
        Each group returned by get_external_user_group, repository by repository.

    Raises:
        ValueError: If no GitHub client is available after loading credentials.
    """
    github_connector: GithubConnector = GithubConnector(
        **cc_pair.connector.connector_specific_config
    )
    github_connector.load_credentials(cc_pair.credential.credential_json)
    client = github_connector.github_client
    if not client:
        raise ValueError("github_client is required")

    logger.info("Starting GitHub group sync...")

    configured = github_connector.repositories
    repos: list[Repository.Repository]
    if not configured:
        # No explicit selection: all repositories
        repos = github_connector.get_all_repos(client)
    elif "," in configured:
        # Comma-separated list of repositories
        repos = github_connector.get_github_repos(client)
    else:
        # Single repository (backward compatibility)
        repos = [github_connector.get_github_repo(client)]

    for repo in repos:
        # A failure in one repository is logged and does not stop the others.
        try:
            for group in get_external_user_group(repo, client):
                logger.info(f"External group: {group}")
                yield group
        except Exception as e:
            logger.error(f"Error processing repository {repo.id} ({repo.name}): {e}")
|
||||
488
backend/ee/onyx/external_permissions/github/utils.py
Normal file
488
backend/ee/onyx/external_permissions/github/utils.py
Normal file
@@ -0,0 +1,488 @@
|
||||
from collections.abc import Callable
|
||||
from enum import Enum
|
||||
from typing import List
|
||||
from typing import Optional
|
||||
from typing import Tuple
|
||||
from typing import TypeVar
|
||||
|
||||
from github import Github
|
||||
from github import RateLimitExceededException
|
||||
from github.GithubException import GithubException
|
||||
from github.NamedUser import NamedUser
|
||||
from github.Organization import Organization
|
||||
from github.PaginatedList import PaginatedList
|
||||
from github.Repository import Repository
|
||||
from github.Team import Team
|
||||
from pydantic import BaseModel
|
||||
|
||||
from ee.onyx.db.external_perm import ExternalUserGroup
|
||||
from onyx.access.models import ExternalAccess
|
||||
from onyx.access.utils import build_ext_group_name_for_onyx
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.github.rate_limit_utils import sleep_after_rate_limit_exception
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class GitHubVisibility(Enum):
    """Visibility levels of a GitHub repository, keyed by their string value."""

    PUBLIC = "public"
    PRIVATE = "private"
    INTERNAL = "internal"
|
||||
|
||||
|
||||
MAX_RETRY_COUNT = 3
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
# Higher-order function to wrap GitHub operations with retry and exception handling
|
||||
|
||||
|
||||
def _run_with_retry(
    operation: Callable[[], T],
    description: str,
    github_client: Github,
    retry_count: int = 0,
) -> Optional[T]:
    """
    Run a GitHub operation, retrying when the rate limit is exceeded.

    Args:
        operation: Zero-argument callable performing the GitHub call.
        description: Human-readable label used in log and error messages.
        github_client: Client passed to the rate-limit sleep helper.
        retry_count: Number of retries already consumed before this call.

    Returns:
        The operation's result, or None when the operation raised a
        GithubException (other than a rate-limit error) or any other exception.

    Raises:
        RuntimeError: If the rate limit is hit again once the retry counter
            has reached MAX_RETRY_COUNT.
    """
    attempt = retry_count
    while True:
        logger.debug(f"Starting operation '{description}', attempt {attempt + 1}")
        try:
            result = operation()
            logger.debug(f"Operation '{description}' completed successfully")
            return result
        except RateLimitExceededException:
            if attempt >= MAX_RETRY_COUNT:
                error_msg = f"Max retries exceeded for {description}"
                logger.exception(error_msg)
                raise RuntimeError(error_msg)
            sleep_after_rate_limit_exception(github_client)
            logger.warning(
                f"Rate limit exceeded while {description}. Retrying... "
                f"(attempt {attempt + 1}/{MAX_RETRY_COUNT})"
            )
            attempt += 1
        except GithubException as e:
            # Non-rate-limit API errors are logged and reported as "no result".
            logger.warning(f"GitHub API error during {description}: {e}")
            return None
        except Exception as e:
            logger.exception(f"Unexpected error during {description}: {e}")
            return None
|
||||
|
||||
|
||||
class UserInfo(BaseModel):
    """Basic identity fields of a GitHub user."""

    # Required GitHub login
    login: str
    # Optional display name; defaults to None
    name: Optional[str] = None
    # Optional email address; defaults to None
    email: Optional[str] = None
|
||||
|
||||
|
||||
class TeamInfo(BaseModel):
    """A GitHub team together with the users that belong to it."""

    # Team display name
    name: str
    # Team slug
    slug: str
    # Members of the team
    members: List[UserInfo]
|
||||
|
||||
|
||||
def _fetch_organization_members(
    github_client: Github, org_name: str, retry_count: int = 0
) -> List[UserInfo]:
    """
    Return every member of the organization as UserInfo records.

    Args:
        github_client: Client used for the organization lookup.
        org_name: Name passed to ``get_organization``.
        retry_count: Accepted for signature compatibility; not read here.

    Returns:
        One UserInfo per member returned by ``get_members(filter_="all")``;
        empty when that call yields nothing.

    Raises:
        RuntimeError: If the organization itself could not be fetched.
    """
    logger.info(f"Fetching organization members for {org_name}")

    org = _run_with_retry(
        lambda: github_client.get_organization(org_name),
        f"get organization {org_name}",
        github_client,
    )
    if not org:
        logger.error(f"Failed to fetch organization {org_name}")
        raise RuntimeError(f"Failed to fetch organization {org_name}")

    fetched_members = _run_with_retry(
        lambda: org.get_members(filter_="all"),
        f"get members for organization {org_name}",
        github_client,
    )
    # A failed member listing is treated the same as an empty one.
    member_objs: PaginatedList[NamedUser] | list[NamedUser] = fetched_members or []

    org_members: List[UserInfo] = [
        UserInfo(login=member.login, name=member.name, email=member.email)
        for member in member_objs
    ]

    logger.info(f"Fetched {len(org_members)} members for organization {org_name}")
    return org_members
|
||||
|
||||
|
||||
def _fetch_repository_teams_detailed(
    repo: Repository, github_client: Github, retry_count: int = 0
) -> List[TeamInfo]:
    """
    Return the teams attached to the repository, each with its member list.

    Args:
        repo: Repository whose teams are listed.
        github_client: Client forwarded to the retry wrapper.
        retry_count: Accepted for signature compatibility; not read here.

    Returns:
        One TeamInfo per team; a team whose member listing yields nothing
        gets an empty ``members`` list.
    """
    logger.info(f"Fetching teams for repository {repo.full_name}")

    fetched_teams = _run_with_retry(
        lambda: repo.get_teams(),
        f"get teams for repository {repo.full_name}",
        github_client,
    )
    team_objs: PaginatedList[Team] | list[Team] = fetched_teams or []

    teams_data: List[TeamInfo] = []
    for team in team_objs:
        logger.info(
            f"Processing team {team.name} (slug: {team.slug}) for repository {repo.full_name}"
        )

        fetched_members = _run_with_retry(
            lambda: team.get_members(),
            f"get members for team {team.name}",
            github_client,
        )
        members: PaginatedList[NamedUser] | list[NamedUser] = fetched_members or []

        team_members = [
            UserInfo(login=m.login, name=m.name, email=m.email) for m in members
        ]
        teams_data.append(
            TeamInfo(name=team.name, slug=team.slug, members=team_members)
        )
        logger.info(f"Team {team.name} has {len(team_members)} members")

    logger.info(f"Fetched {len(teams_data)} teams for repository {repo.full_name}")
    return teams_data
|
||||
|
||||
|
||||
def fetch_repository_team_slugs(
    repo: Repository, github_client: Github, retry_count: int = 0
) -> List[str]:
    """
    Return the slug of every team attached to the repository.

    Args:
        repo: Repository whose teams are listed.
        github_client: Client forwarded to the retry wrapper.
        retry_count: Accepted for signature compatibility; not read here.

    Returns:
        Team slugs in listing order; empty when the listing yields nothing.
    """
    logger.info(f"Fetching team slugs for repository {repo.full_name}")

    fetched_teams = _run_with_retry(
        lambda: repo.get_teams(),
        f"get teams for repository {repo.full_name}",
        github_client,
    )
    team_objs: PaginatedList[Team] | list[Team] = fetched_teams or []

    slugs: List[str] = [team.slug for team in team_objs]

    logger.info(f"Fetched {len(slugs)} team slugs for repository {repo.full_name}")
    return slugs
|
||||
|
||||
|
||||
def _get_collaborators_and_outside_collaborators(
    github_client: Github,
    repo: Repository,
) -> Tuple[List[UserInfo], List[UserInfo]]:
    """
    Fetch the repository's collaborators and split them into two lists.

    A collaborator is classified as "outside" only when the repository has an
    organization, the organization could be fetched, and the membership check
    returned False. If the membership check fails (returns None) the
    collaborator stays in the regular list.

    Args:
        github_client: Client used for the organization lookup and retries.
        repo: Repository whose collaborators are listed.

    Returns:
        ``(collaborators, outside_collaborators)`` as lists of UserInfo.
    """
    collaborators: List[UserInfo] = []
    outside_collaborators: List[UserInfo] = []
    logger.info(f"Fetching collaborators for repository {repo.full_name}")

    repo_collaborators: PaginatedList[NamedUser] | list[NamedUser] = (
        _run_with_retry(
            lambda: repo.get_collaborators(),
            f"get collaborators for repository {repo.full_name}",
            github_client,
        )
        or []
    )

    # The organization does not change between collaborators, so it is fetched
    # at most once per successful lookup instead of once per collaborator.
    # A failed lookup (None) is not cached and is attempted again for the
    # next collaborator.
    org: Organization | None = None

    for collaborator in repo_collaborators:
        is_outside = False

        if repo.organization:
            if org is None:
                org = _run_with_retry(
                    lambda: github_client.get_organization(repo.organization.login),
                    f"get organization {repo.organization.login}",
                    github_client,
                )

            if org is not None:
                org_obj = org
                membership = _run_with_retry(
                    lambda: org_obj.has_in_members(collaborator),
                    f"check membership for {collaborator.login} in org {org_obj.login}",
                    github_client,
                )
                # None means the check itself failed; only an explicit False
                # marks the collaborator as outside.
                is_outside = membership is not None and not membership

        info = UserInfo(
            login=collaborator.login, name=collaborator.name, email=collaborator.email
        )
        # is_outside can only be True when the repository has an organization.
        if is_outside:
            outside_collaborators.append(info)
        else:
            collaborators.append(info)

    logger.info(
        f"Categorized {len(collaborators)} regular and {len(outside_collaborators)} outside collaborators for {repo.full_name}"
    )
    return collaborators, outside_collaborators
|
||||
|
||||
|
||||
def form_collaborators_group_id(repository_id: int) -> str:
    """
    Generate the group ID for a repository's collaborators.

    Args:
        repository_id: Repository ID; must be truthy.

    Returns:
        ``"<repository_id>_collaborators"``.

    Raises:
        ValueError: If ``repository_id`` is falsy.
    """
    if not repository_id:
        # logger.error rather than logger.exception: no exception is being
        # handled here, so there is no traceback to attach.
        logger.error("Repository ID is required to generate collaborators group ID")
        raise ValueError("Repository ID must be set to generate group ID.")
    return f"{repository_id}_collaborators"
|
||||
|
||||
|
||||
def form_organization_group_id(organization_id: int) -> str:
    """
    Generate the group ID for an organization from its organization ID.

    Args:
        organization_id: Organization ID; must be truthy.

    Returns:
        ``"<organization_id>_organization"``.

    Raises:
        ValueError: If ``organization_id`` is falsy.
    """
    if not organization_id:
        # logger.error rather than logger.exception: no exception is being
        # handled here, so there is no traceback to attach.
        logger.error("Organization ID is required to generate organization group ID")
        raise ValueError("Organization ID must be set to generate group ID.")
    return f"{organization_id}_organization"
|
||||
|
||||
|
||||
def form_outside_collaborators_group_id(repository_id: int) -> str:
    """
    Generate the group ID for a repository's outside collaborators.

    Args:
        repository_id: Repository ID; must be truthy.

    Returns:
        ``"<repository_id>_outside_collaborators"``.

    Raises:
        ValueError: If ``repository_id`` is falsy.
    """
    if not repository_id:
        # logger.error rather than logger.exception: no exception is being
        # handled here, so there is no traceback to attach.
        logger.error(
            "Repository ID is required to generate outside collaborators group ID"
        )
        raise ValueError("Repository ID must be set to generate group ID.")
    return f"{repository_id}_outside_collaborators"
|
||||
|
||||
|
||||
def get_repository_visibility(repo: Repository) -> GitHubVisibility:
    """
    Resolve the repository's visibility to a GitHubVisibility member.

    Falls back to PRIVATE when the repository object has no ``visibility``
    attribute or when its value is not a known GitHubVisibility value.
    """
    if not hasattr(repo, "visibility"):
        logger.info(f"Repository {repo.full_name} is private")
        return GitHubVisibility.PRIVATE

    raw_visibility = repo.visibility
    logger.info(
        f"Repository {repo.full_name} visibility from attribute: {raw_visibility}"
    )
    try:
        return GitHubVisibility(raw_visibility)
    except ValueError:
        # Unknown values are treated as the most restrictive option.
        logger.warning(
            f"Unknown visibility '{raw_visibility}' for repo {repo.full_name}, defaulting to private"
        )
        return GitHubVisibility.PRIVATE
|
||||
|
||||
|
||||
def get_external_access_permission(
    repo: Repository, github_client: Github, add_prefix: bool = False
) -> ExternalAccess:
    """
    Build the group-based ExternalAccess for a repository.

    - Public repository: ``is_public=True`` with no groups.
    - Private repository: the collaborators group, the outside-collaborators
      group and one group per team slug.
    - Otherwise (internal): the organization group only.

    add_prefix: When True, every group ID is passed through
        build_ext_group_name_for_onyx with the GitHub source. Per the original
        notes, this is needed during the initial permission sync via the
        connector, where the document record is inserted without the source
        prefix; when called from doc_sync the prefix is added later while the
        ExternalAccess object is processed, so it must stay False there.
    """
    # Collaborators and outside collaborators are kept as two groups rather
    # than as individual emails in ExternalAccess.external_user_emails because:
    # 1. Adding/removing a collaborator would otherwise require updating all documents.
    # 2. Repo permissions can change without touching the repo's updated_at
    #    timestamp, which would force a full permission sync every time.

    def _group_name(raw_name: str) -> str:
        """Return raw_name, source-prefixed when add_prefix is set."""
        if not add_prefix:
            return raw_name
        return build_ext_group_name_for_onyx(
            source=DocumentSource.GITHUB,
            ext_group_name=raw_name,
        )

    repo_visibility = get_repository_visibility(repo)
    logger.info(
        f"Generating ExternalAccess for {repo.full_name}: visibility={repo_visibility.value}"
    )

    if repo_visibility == GitHubVisibility.PUBLIC:
        logger.info(
            f"Repository {repo.full_name} is public - allowing access to all users"
        )
        return ExternalAccess(
            external_user_emails=set(),
            external_user_group_ids=set(),
            is_public=True,
        )

    if repo_visibility == GitHubVisibility.PRIVATE:
        logger.info(
            f"Repository {repo.full_name} is private - setting up restricted access"
        )

        group_ids = {
            _group_name(form_collaborators_group_id(repo.id)),
            _group_name(form_outside_collaborators_group_id(repo.id)),
        }
        team_slugs = fetch_repository_team_slugs(repo, github_client)
        group_ids.update(_group_name(slug) for slug in team_slugs)

        logger.info(f"ExternalAccess groups for {repo.full_name}: {group_ids}")
        return ExternalAccess(
            external_user_emails=set(),
            external_user_group_ids=group_ids,
            is_public=False,
        )

    # Internal repositories - accessible to organization members
    logger.info(
        f"Repository {repo.full_name} is internal - accessible to org members"
    )
    group_ids = {_group_name(form_organization_group_id(repo.organization.id))}
    logger.info(f"ExternalAccess groups for {repo.full_name}: {group_ids}")
    return ExternalAccess(
        external_user_emails=set(),
        external_user_group_ids=group_ids,
        is_public=False,
    )
|
||||
|
||||
|
||||
def get_external_user_group(
    repo: Repository, github_client: Github
) -> list[ExternalUserGroup]:
    """
    Build the ExternalUserGroup objects (with member emails) for a repository.

    - Private repository: a collaborators group, an outside-collaborators
      group and one group per team; a group is only emitted when at least one
      member has an email.
    - Internal repository: a single organization group, emitted even when no
      member has an email.
    - Otherwise (public): no groups.

    Users without an email are logged and left out.
    """
    repo_visibility = get_repository_visibility(repo)
    logger.info(
        f"Generating ExternalUserGroups for {repo.full_name}: visibility={repo_visibility.value}"
    )

    if repo_visibility == GitHubVisibility.PRIVATE:
        logger.info(f"Processing private repository {repo.full_name}")

        collaborators, outside_collaborators = (
            _get_collaborators_and_outside_collaborators(github_client, repo)
        )
        teams = _fetch_repository_teams_detailed(repo, github_client)
        groups = []

        # Group for regular collaborators
        collaborator_emails = set()
        for person in collaborators:
            if not person.email:
                logger.error(f"Collaborator {person.login} has no email")
                continue
            collaborator_emails.add(person.email)

        if collaborator_emails:
            groups.append(
                ExternalUserGroup(
                    id=form_collaborators_group_id(repo.id),
                    user_emails=list(collaborator_emails),
                )
            )
            logger.info(
                f"Created collaborators group with {len(collaborator_emails)} emails"
            )

        # Group for outside collaborators
        outside_emails = set()
        for person in outside_collaborators:
            if not person.email:
                logger.error(f"Outside collaborator {person.login} has no email")
                continue
            outside_emails.add(person.email)

        if outside_emails:
            groups.append(
                ExternalUserGroup(
                    id=form_outside_collaborators_group_id(repo.id),
                    user_emails=list(outside_emails),
                )
            )
            logger.info(
                f"Created outside collaborators group with {len(outside_emails)} emails"
            )

        # One group per team, identified by the team slug
        for team in teams:
            team_emails = set()
            for person in team.members:
                if not person.email:
                    logger.error(f"Team member {person.login} has no email")
                    continue
                team_emails.add(person.email)

            if team_emails:
                groups.append(
                    ExternalUserGroup(
                        id=team.slug,
                        user_emails=list(team_emails),
                    )
                )
                logger.info(
                    f"Created team group {team.name} with {len(team_emails)} emails"
                )

        logger.info(
            f"Created {len(groups)} ExternalUserGroups for private repository {repo.full_name}"
        )
        return groups

    if repo_visibility == GitHubVisibility.INTERNAL:
        logger.info(f"Processing internal repository {repo.full_name}")

        org_group_id = form_organization_group_id(repo.organization.id)
        org_members = _fetch_organization_members(
            github_client, repo.organization.login
        )

        org_emails = set()
        for person in org_members:
            if not person.email:
                logger.error(f"Org member {person.login} has no email")
                continue
            org_emails.add(person.email)

        org_group = ExternalUserGroup(
            id=org_group_id,
            user_emails=list(org_emails),
        )
        logger.info(
            f"Created organization group with {len(org_emails)} emails for internal repository {repo.full_name}"
        )
        return [org_group]

    logger.info(f"Repository {repo.full_name} is public - no user groups needed")
    return []
|
||||
@@ -3,6 +3,7 @@ from datetime import datetime
|
||||
from datetime import timezone
|
||||
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
|
||||
from onyx.access.models import DocExternalAccess
|
||||
from onyx.connectors.gmail.connector import GmailConnector
|
||||
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
||||
@@ -35,6 +36,7 @@ def _get_slim_doc_generator(
|
||||
def gmail_doc_sync(
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
|
||||
callback: IndexingHeartbeatInterface | None,
|
||||
) -> Generator[DocExternalAccess, None, None]:
|
||||
"""
|
||||
|
||||
@@ -8,6 +8,7 @@ from ee.onyx.external_permissions.google_drive.permission_retrieval import (
|
||||
get_permissions_by_ids,
|
||||
)
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
|
||||
from onyx.access.models import DocExternalAccess
|
||||
from onyx.access.models import ExternalAccess
|
||||
from onyx.connectors.google_drive.connector import GoogleDriveConnector
|
||||
@@ -40,8 +41,28 @@ def _get_slim_doc_generator(
|
||||
)
|
||||
|
||||
|
||||
def _merge_permissions_lists(
|
||||
permission_lists: list[list[GoogleDrivePermission]],
|
||||
) -> list[GoogleDrivePermission]:
|
||||
"""
|
||||
Merge a list of permission lists into a single list of permissions.
|
||||
"""
|
||||
seen_permission_ids: set[str] = set()
|
||||
merged_permissions: list[GoogleDrivePermission] = []
|
||||
for permission_list in permission_lists:
|
||||
for permission in permission_list:
|
||||
if permission.id not in seen_permission_ids:
|
||||
merged_permissions.append(permission)
|
||||
seen_permission_ids.add(permission.id)
|
||||
|
||||
return merged_permissions
|
||||
|
||||
|
||||
def get_external_access_for_raw_gdrive_file(
|
||||
file: GoogleDriveFileType, company_domain: str, drive_service: GoogleDriveService
|
||||
file: GoogleDriveFileType,
|
||||
company_domain: str,
|
||||
retriever_drive_service: GoogleDriveService | None,
|
||||
admin_drive_service: GoogleDriveService,
|
||||
) -> ExternalAccess:
|
||||
"""
|
||||
Get the external access for a raw Google Drive file.
|
||||
@@ -62,11 +83,28 @@ def get_external_access_for_raw_gdrive_file(
|
||||
GoogleDrivePermission.from_drive_permission(p) for p in permissions
|
||||
]
|
||||
elif permission_ids:
|
||||
permissions_list = get_permissions_by_ids(
|
||||
drive_service=drive_service,
|
||||
doc_id=doc_id,
|
||||
permission_ids=permission_ids,
|
||||
|
||||
def _get_permissions(
|
||||
drive_service: GoogleDriveService,
|
||||
) -> list[GoogleDrivePermission]:
|
||||
return get_permissions_by_ids(
|
||||
drive_service=drive_service,
|
||||
doc_id=doc_id,
|
||||
permission_ids=permission_ids,
|
||||
)
|
||||
|
||||
permissions_list = _get_permissions(
|
||||
retriever_drive_service or admin_drive_service
|
||||
)
|
||||
if len(permissions_list) != len(permission_ids) and retriever_drive_service:
|
||||
logger.warning(
|
||||
f"Failed to get all permissions for file {doc_id} with retriever service, "
|
||||
"trying admin service"
|
||||
)
|
||||
backup_permissions_list = _get_permissions(admin_drive_service)
|
||||
permissions_list = _merge_permissions_lists(
|
||||
[permissions_list, backup_permissions_list]
|
||||
)
|
||||
|
||||
folder_ids_to_inherit_permissions_from: set[str] = set()
|
||||
user_emails: set[str] = set()
|
||||
@@ -132,6 +170,7 @@ def get_external_access_for_raw_gdrive_file(
|
||||
def gdrive_doc_sync(
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
|
||||
callback: IndexingHeartbeatInterface | None,
|
||||
) -> Generator[DocExternalAccess, None, None]:
|
||||
"""
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
from collections.abc import Generator
|
||||
|
||||
from googleapiclient.errors import HttpError # type: ignore
|
||||
from pydantic import BaseModel
|
||||
|
||||
@@ -42,11 +44,17 @@ def _get_all_folders(
|
||||
|
||||
TODO: tweak things so we can fetch deltas.
|
||||
"""
|
||||
MAX_FAILED_PERCENTAGE = 0.5
|
||||
|
||||
all_folders: list[FolderInfo] = []
|
||||
seen_folder_ids: set[str] = set()
|
||||
|
||||
user_emails = google_drive_connector._get_all_user_emails()
|
||||
for user_email in user_emails:
|
||||
def _get_all_folders_for_user(
|
||||
google_drive_connector: GoogleDriveConnector,
|
||||
skip_folders_without_permissions: bool,
|
||||
user_email: str,
|
||||
) -> None:
|
||||
"""Helper to get folders for a specific user + update shared seen_folder_ids"""
|
||||
drive_service = get_drive_service(
|
||||
google_drive_connector.creds,
|
||||
user_email,
|
||||
@@ -96,9 +104,61 @@ def _get_all_folders(
|
||||
)
|
||||
)
|
||||
|
||||
failed_count = 0
|
||||
user_emails = google_drive_connector._get_all_user_emails()
|
||||
for user_email in user_emails:
|
||||
try:
|
||||
_get_all_folders_for_user(
|
||||
google_drive_connector, skip_folders_without_permissions, user_email
|
||||
)
|
||||
except Exception:
|
||||
logger.exception(f"Error getting folders for user {user_email}")
|
||||
failed_count += 1
|
||||
|
||||
if failed_count > MAX_FAILED_PERCENTAGE * len(user_emails):
|
||||
raise RuntimeError("Too many failed folder fetches during group sync")
|
||||
|
||||
return all_folders
|
||||
|
||||
|
||||
def _drive_folder_to_onyx_group(
    folder: FolderInfo,
    group_email_to_member_emails_map: dict[str, list[str]],
) -> ExternalUserGroup:
    """
    Build an Onyx group from a folder's permissions.

    USER permissions contribute their email, GROUP permissions contribute the
    member emails looked up in ``group_email_to_member_emails_map``, and an
    ANYONE permission sets ``gives_anyone_access``. Permissions with a missing
    user email or an unknown group email are logged and skipped.
    """
    member_emails: set[str] = set()
    open_to_anyone = False

    for permission in folder.permissions:
        kind = permission.type
        email = permission.email_address

        if kind == PermissionType.USER:
            if email is not None:
                member_emails.add(email)
            else:
                logger.warning(
                    f"User email is None for folder {folder.id} permission {permission}"
                )
        elif kind == PermissionType.GROUP:
            if email in group_email_to_member_emails_map:
                member_emails.update(group_email_to_member_emails_map[email])
            else:
                logger.warning(
                    f"Group email {permission.email_address} for folder {folder.id} "
                    "not found in group_email_to_member_emails_map"
                )
        elif kind == PermissionType.ANYONE:
            open_to_anyone = True

    return ExternalUserGroup(
        id=folder.id,
        user_emails=list(member_emails),
        gives_anyone_access=open_to_anyone,
    )
|
||||
|
||||
|
||||
"""Individual Shared Drive / My Drive Permission Sync"""
|
||||
|
||||
|
||||
@@ -167,7 +227,29 @@ def _get_drive_members(
|
||||
return drive_id_to_members_map
|
||||
|
||||
|
||||
def _get_all_groups(
|
||||
def _drive_member_map_to_onyx_groups(
    drive_id_to_members_map: dict[str, tuple[set[str], set[str]]],
    group_email_to_member_emails_map: dict[str, list[str]],
) -> Generator[ExternalUserGroup, None, None]:
    """Yield one Onyx group per drive in `drive_id_to_members_map`.

    The `user_emails` for the Shared Drive should be all individuals in the
    Shared Drive + the union of all flattened group emails.

    Args:
        drive_id_to_members_map: drive id -> (group emails, user emails).
        group_email_to_member_emails_map: group email -> member emails of
            that group. Group emails missing from this map are logged and
            skipped.

    Yields:
        An `ExternalUserGroup` whose id is the drive id.
    """
    for drive_id, (group_emails, user_emails) in drive_id_to_members_map.items():
        # Copy the set: updating `user_emails` in place would silently mutate
        # the caller's `drive_id_to_members_map`.
        drive_member_emails: set[str] = set(user_emails)
        for group_email in group_emails:
            if group_email not in group_email_to_member_emails_map:
                logger.warning(
                    f"Group email {group_email} for drive {drive_id} not found in "
                    "group_email_to_member_emails_map"
                )
                continue
            drive_member_emails.update(group_email_to_member_emails_map[group_email])
        yield ExternalUserGroup(
            id=drive_id,
            user_emails=list(drive_member_emails),
        )
|
||||
|
||||
|
||||
def _get_all_google_groups(
|
||||
admin_service: AdminService,
|
||||
google_domain: str,
|
||||
) -> set[str]:
|
||||
@@ -185,6 +267,28 @@ def _get_all_groups(
|
||||
return group_emails
|
||||
|
||||
|
||||
def _google_group_to_onyx_group(
    admin_service: AdminService,
    group_email: str,
) -> ExternalUserGroup:
    """
    Build the Onyx group for one Google group.

    Pages through the group's member list via the admin service and collects
    the `email` of every member. The resulting group uses the Google group
    email as its id.
    """
    members = execute_paginated_retrieval(
        admin_service.members().list,
        list_key="members",
        groupKey=group_email,
        fields="members(email),nextPageToken",
    )
    # A set removes duplicate emails before the list is handed to the group
    unique_member_emails: set[str] = {member["email"] for member in members}

    return ExternalUserGroup(
        id=group_email,
        user_emails=list(unique_member_emails),
    )
|
||||
|
||||
|
||||
def _map_group_email_to_member_emails(
|
||||
admin_service: AdminService,
|
||||
group_emails: set[str],
|
||||
@@ -282,7 +386,7 @@ def _build_onyx_groups(
|
||||
def gdrive_group_sync(
|
||||
tenant_id: str,
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
) -> list[ExternalUserGroup]:
|
||||
) -> Generator[ExternalUserGroup, None, None]:
|
||||
# Initialize connector and build credential/service objects
|
||||
google_drive_connector = GoogleDriveConnector(
|
||||
**cc_pair.connector.connector_specific_config
|
||||
@@ -296,26 +400,27 @@ def gdrive_group_sync(
|
||||
drive_id_to_members_map = _get_drive_members(google_drive_connector, admin_service)
|
||||
|
||||
# Get all group emails
|
||||
all_group_emails = _get_all_groups(
|
||||
all_group_emails = _get_all_google_groups(
|
||||
admin_service, google_drive_connector.google_domain
|
||||
)
|
||||
|
||||
# Each google group is an Onyx group, yield those
|
||||
group_email_to_member_emails_map: dict[str, list[str]] = {}
|
||||
for group_email in all_group_emails:
|
||||
onyx_group = _google_group_to_onyx_group(admin_service, group_email)
|
||||
group_email_to_member_emails_map[group_email] = onyx_group.user_emails
|
||||
yield onyx_group
|
||||
|
||||
# Each drive is a group, yield those
|
||||
for onyx_group in _drive_member_map_to_onyx_groups(
|
||||
drive_id_to_members_map, group_email_to_member_emails_map
|
||||
):
|
||||
yield onyx_group
|
||||
|
||||
# Get all folder permissions
|
||||
folder_info = _get_all_folders(
|
||||
google_drive_connector=google_drive_connector,
|
||||
skip_folders_without_permissions=True,
|
||||
)
|
||||
|
||||
# Map group emails to their members
|
||||
group_email_to_member_emails_map = _map_group_email_to_member_emails(
|
||||
admin_service, all_group_emails
|
||||
)
|
||||
|
||||
# Convert the maps to onyx groups
|
||||
onyx_groups = _build_onyx_groups(
|
||||
drive_id_to_members_map=drive_id_to_members_map,
|
||||
group_email_to_member_emails_map=group_email_to_member_emails_map,
|
||||
folder_info=folder_info,
|
||||
)
|
||||
|
||||
return onyx_groups
|
||||
for folder in folder_info:
|
||||
yield _drive_folder_to_onyx_group(folder, group_email_to_member_emails_map)
|
||||
|
||||
36
backend/ee/onyx/external_permissions/jira/doc_sync.py
Normal file
36
backend/ee/onyx/external_permissions/jira/doc_sync.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from collections.abc import Generator
|
||||
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
|
||||
from ee.onyx.external_permissions.utils import generic_doc_sync
|
||||
from onyx.access.models import DocExternalAccess
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.jira.connector import JiraConnector
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
JIRA_DOC_SYNC_TAG = "jira_doc_sync"
|
||||
|
||||
|
||||
def jira_doc_sync(
    cc_pair: ConnectorCredentialPair,
    fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
    fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
    callback: IndexingHeartbeatInterface | None = None,
) -> Generator[DocExternalAccess, None, None]:
    """Yield document-level external access for a Jira connector/credential pair.

    A `JiraConnector` is built from the pair's connector config, loaded with
    the pair's credentials, and handed to `generic_doc_sync` as the slim
    connector. `fetch_all_existing_docs_fn` is accepted but not used here.
    """
    connector_kwargs = cc_pair.connector.connector_specific_config
    credentials = cc_pair.credential.credential_json

    connector = JiraConnector(**connector_kwargs)
    connector.load_credentials(credentials)

    doc_access_stream = generic_doc_sync(
        cc_pair=cc_pair,
        fetch_all_existing_docs_ids_fn=fetch_all_existing_docs_ids_fn,
        callback=callback,
        doc_source=DocumentSource.JIRA,
        slim_connector=connector,
        label=JIRA_DOC_SYNC_TAG,
    )
    yield from doc_access_stream
|
||||
25
backend/ee/onyx/external_permissions/jira/models.py
Normal file
25
backend/ee/onyx/external_permissions/jira/models.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import ConfigDict
|
||||
from pydantic.alias_generators import to_camel
|
||||
|
||||
|
||||
Holder = dict[str, Any]
|
||||
|
||||
|
||||
class Permission(BaseModel):
    """A single permission entry; `holder` may be None when no holder is attached."""

    id: int
    # Permission key; callers in this package compare it to "BROWSE_PROJECTS"
    permission: str
    holder: Holder | None
|
||||
|
||||
|
||||
class User(BaseModel):
    """A user record validated from a raw user dict with camelCase keys."""

    # Snake_case fields are given camelCase aliases (e.g. `account_id` -> `accountId`)
    model_config = ConfigDict(alias_generator=to_camel)

    account_id: str
    email_address: str
    display_name: str
    active: bool
|
||||
209
backend/ee/onyx/external_permissions/jira/page_access.py
Normal file
209
backend/ee/onyx/external_permissions/jira/page_access.py
Normal file
@@ -0,0 +1,209 @@
|
||||
from collections import defaultdict
|
||||
|
||||
from jira import JIRA
|
||||
from jira.resources import PermissionScheme
|
||||
from pydantic import ValidationError
|
||||
|
||||
from ee.onyx.external_permissions.jira.models import Holder
|
||||
from ee.onyx.external_permissions.jira.models import Permission
|
||||
from ee.onyx.external_permissions.jira.models import User
|
||||
from onyx.access.models import ExternalAccess
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
HolderMap = dict[str, list[Holder]]
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def _build_holder_map(permissions: list[dict]) -> dict[str, list[Holder]]:
    """
    Group the "Holder"s of all `BROWSE_PROJECTS` permissions by holder type.

    A "Holder" in JIRA is a person / entity who "holds" the corresponding permission.
    It can have different types. They can be one of (but not limited to):
    - user (an explicitly whitelisted user)
    - projectRole (for project level "roles")
    - reporter (the reporter of an issue)

    A "Holder" usually has the following structure:
    - `{ "type": "user", "value": "$USER_ID", "user": { .. }, .. }`
    - `{ "type": "projectRole", "value": "$PROJECT_ID", .. }`

    When we fetch the PermissionScheme from JIRA, we retrieve a list of "Holder"s.
    The list can contain multiple "Holder"s of the same type (e.g., two
    `"type": "user"` entries, each corresponding to a different user).
    This function constructs a map of "Holder" types to the list of "Holder"s of that type.

    Args:
        permissions: Permission entries; each is expected to expose its raw
            payload through a `raw` attribute. Entries without one are
            logged and skipped.

    Returns:
        A dict from the "Holder" type to the list of "Holder"s of that type.
        Only permissions whose key is `BROWSE_PROJECTS` contribute.

    Example:
    ```
    {
        "user": [
            { "type": "user", "value": "10000", "user": { .. }, .. },
            { "type": "user", "value": "10001", "user": { .. }, .. },
        ],
        "projectRole": [
            { "type": "projectRole", "value": "10010", .. },
            { "type": "projectRole", "value": "10011", .. },
        ],
        "applicationRole": [
            { "type": "applicationRole" },
        ],
        ..
    }
    ```
    """

    holder_map: defaultdict[str, list[Holder]] = defaultdict(list)

    for raw_perm in permissions:
        if not hasattr(raw_perm, "raw"):
            logger.warning(f"Expected a 'raw' field, but none was found: {raw_perm=}")
            continue

        permission = Permission(**raw_perm.raw)

        # We only care about ability to browse through projects + issues (not other permissions such as read/write).
        if permission.permission != "BROWSE_PROJECTS":
            continue

        # In order to associate this permission to some Atlassian entity, we need the "Holder".
        # If this doesn't exist, then we cannot associate this permission to anyone; just skip.
        if not permission.holder:
            logger.warning(
                f"Expected to find a permission holder, but none was found: {permission=}"
            )
            continue

        # Named `holder_type` rather than `type` to avoid shadowing the builtin.
        holder_type = permission.holder.get("type")
        if not holder_type:
            logger.warning(
                f"Expected to find the type of permission holder, but none was found: {permission=}"
            )
            continue

        holder_map[holder_type].append(permission.holder)

    return holder_map
|
||||
|
||||
|
||||
def _get_user_emails(user_holders: list[Holder]) -> list[str]:
    """Collect the email address of every holder that carries a valid `user` payload.

    Holders without a `user` key are ignored. Holders whose `user` payload
    fails `User` validation are logged as errors and skipped.
    """
    collected: list[str] = []

    for holder in user_holders:
        try:
            raw_user_dict = holder["user"]
        except KeyError:
            # No embedded user payload on this holder
            continue

        try:
            validated = User.model_validate(raw_user_dict)
        except ValidationError:
            logger.error(
                "Expected to be able to serialize the raw-user-dict into an instance of `User`, but validation failed;"
                f"{raw_user_dict=}"
            )
        else:
            collected.append(validated.email_address)

    return collected
|
||||
|
||||
|
||||
def _get_user_emails_from_project_roles(
    jira_client: JIRA,
    jira_project: str,
    project_role_holders: list[Holder],
) -> list[str]:
    """Resolve the emails of all users acting in the given project roles.

    Args:
        jira_client: Client used to look up project roles and users.
        jira_project: Project whose roles are looked up.
        project_role_holders: "projectRole" holders; the role id is read from
            each holder's `value` key, and holders without one are ignored.

    Returns:
        Emails of role actors whose user record has `accountType == "atlassian"`
        and exposes an `emailAddress`. Users without a retrievable email are
        logged and skipped.
    """
    # NOTE (@raunakab) a `parallel_yield` may be helpful here...?
    roles = [
        jira_client.project_role(project=jira_project, id=project_role_holder["value"])
        for project_role_holder in project_role_holders
        if "value" in project_role_holder
    ]

    emails = []

    for role in roles:
        if not hasattr(role, "actors"):
            continue

        for actor in role.actors:
            if not hasattr(actor, "actorUser") or not hasattr(
                actor.actorUser, "accountId"
            ):
                continue

            user = jira_client.user(id=actor.actorUser.accountId)
            if not hasattr(user, "accountType") or user.accountType != "atlassian":
                continue

            if not hasattr(user, "emailAddress"):
                msg = f"User's email address was not able to be retrieved; {actor.actorUser.accountId=}"
                # Read the display name from the object that was actually
                # checked (`user`); the guard does not cover `actor`.
                if hasattr(user, "displayName"):
                    msg += f" {user.displayName=}"
                logger.warning(msg)
                continue

            emails.append(user.emailAddress)

    return emails
|
||||
|
||||
|
||||
def _build_external_access_from_holder_map(
    jira_client: JIRA, jira_project: str, holder_map: HolderMap
) -> ExternalAccess:
    """
    Turn a holder map into an `ExternalAccess` for the project.

    An "anyone" entry in `holder_map` means the JIRA project is public, and no
    emails are collected. Otherwise the access is non-public and consists of
    the emails of the explicit "user" holders plus the emails resolved from
    the "projectRole" holders (i.e., the user-groups in JIRA speak).
    """
    if "anyone" in holder_map:
        return ExternalAccess(
            external_user_emails=set(), external_user_group_ids=set(), is_public=True
        )

    allowed_emails: set[str] = set()

    if "user" in holder_map:
        allowed_emails.update(_get_user_emails(user_holders=holder_map["user"]))

    if "projectRole" in holder_map:
        allowed_emails.update(
            _get_user_emails_from_project_roles(
                jira_client=jira_client,
                jira_project=jira_project,
                project_role_holders=holder_map["projectRole"],
            )
        )

    return ExternalAccess(
        external_user_emails=allowed_emails,
        external_user_group_ids=set(),
        is_public=False,
    )
|
||||
|
||||
|
||||
def get_project_permissions(
    jira_client: JIRA,
    jira_project: str,
) -> ExternalAccess | None:
    """Fetch the project's permission scheme and convert it to an `ExternalAccess`.

    Returns None when the scheme has no `permissions` attribute or when that
    attribute is not a list.
    """
    scheme: PermissionScheme = jira_client.project_permissionscheme(
        project=jira_project
    )

    # A missing attribute and a non-list value are both treated as "no data"
    raw_permissions = getattr(scheme, "permissions", None)
    if not isinstance(raw_permissions, list):
        return None

    return _build_external_access_from_holder_map(
        jira_client=jira_client,
        jira_project=jira_project,
        holder_map=_build_holder_map(permissions=raw_permissions),
    )
|
||||
@@ -2,27 +2,45 @@ from collections.abc import Callable
|
||||
from collections.abc import Generator
|
||||
from typing import Optional
|
||||
from typing import Protocol
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from ee.onyx.db.external_perm import ExternalUserGroup # noqa
|
||||
from onyx.access.models import DocExternalAccess # noqa
|
||||
from onyx.context.search.models import InferenceChunk
|
||||
|
||||
# Avoid circular imports
|
||||
if TYPE_CHECKING:
|
||||
from ee.onyx.db.external_perm import ExternalUserGroup # noqa
|
||||
from onyx.access.models import DocExternalAccess # noqa
|
||||
from onyx.db.models import ConnectorCredentialPair # noqa
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface # noqa
|
||||
from onyx.db.models import ConnectorCredentialPair # noqa
|
||||
from onyx.db.utils import DocumentRow
|
||||
from onyx.db.utils import SortOrder
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface # noqa
|
||||
|
||||
|
||||
class FetchAllDocumentsFunction(Protocol):
|
||||
"""Protocol for a function that fetches all document IDs for a connector credential pair."""
|
||||
"""Protocol for a function that fetches documents for a connector credential pair.
|
||||
|
||||
def __call__(self) -> list[str]:
|
||||
This protocol defines the interface for functions that retrieve documents
|
||||
from the database, typically used in permission synchronization workflows.
|
||||
"""
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
sort_order: SortOrder | None,
|
||||
) -> list[DocumentRow]:
|
||||
"""
|
||||
Returns a list of document IDs for a connector credential pair.
|
||||
Fetches documents for a connector credential pair.
|
||||
"""
|
||||
...
|
||||
|
||||
This is typically used to determine which documents should no longer be
|
||||
accessible during the document sync process.
|
||||
|
||||
class FetchAllDocumentsIdsFunction(Protocol):
|
||||
"""Protocol for a function that fetches document IDs for a connector credential pair.
|
||||
|
||||
This protocol defines the interface for functions that retrieve document IDs
|
||||
from the database, typically used in permission synchronization workflows.
|
||||
"""
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
) -> list[str]:
|
||||
"""
|
||||
Fetches document IDs for a connector credential pair.
|
||||
"""
|
||||
...
|
||||
|
||||
@@ -30,19 +48,20 @@ class FetchAllDocumentsFunction(Protocol):
|
||||
# Defining the input/output types for the sync functions
|
||||
DocSyncFuncType = Callable[
|
||||
[
|
||||
"ConnectorCredentialPair",
|
||||
ConnectorCredentialPair,
|
||||
FetchAllDocumentsFunction,
|
||||
Optional["IndexingHeartbeatInterface"],
|
||||
FetchAllDocumentsIdsFunction,
|
||||
Optional[IndexingHeartbeatInterface],
|
||||
],
|
||||
Generator["DocExternalAccess", None, None],
|
||||
Generator[DocExternalAccess, None, None],
|
||||
]
|
||||
|
||||
GroupSyncFuncType = Callable[
|
||||
[
|
||||
str,
|
||||
"ConnectorCredentialPair",
|
||||
str, # tenant_id
|
||||
ConnectorCredentialPair, # cc_pair
|
||||
],
|
||||
list["ExternalUserGroup"],
|
||||
Generator[ExternalUserGroup, None, None],
|
||||
]
|
||||
|
||||
# list of chunks to be censored and the user email. returns censored chunks
|
||||
|
||||
@@ -3,7 +3,7 @@ from ee.onyx.external_permissions.sync_params import get_all_censoring_enabled_s
|
||||
from ee.onyx.external_permissions.sync_params import get_source_perm_sync_config
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.context.search.pipeline import InferenceChunk
|
||||
from onyx.db.engine import get_session_context_manager
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.db.models import User
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
@@ -22,7 +22,7 @@ def _get_all_censoring_enabled_sources() -> set[DocumentSource]:
|
||||
for every single chunk.
|
||||
"""
|
||||
all_censoring_enabled_sources = get_all_censoring_enabled_sources()
|
||||
with get_session_context_manager() as db_session:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
enabled_sync_connectors = get_all_auto_sync_cc_pairs(db_session)
|
||||
return {
|
||||
cc_pair.connector.source
|
||||
|
||||
@@ -10,7 +10,7 @@ from ee.onyx.external_permissions.salesforce.utils import (
|
||||
)
|
||||
from onyx.configs.app_configs import BLURB_SIZE
|
||||
from onyx.context.search.models import InferenceChunk
|
||||
from onyx.db.engine import get_session_context_manager
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
@@ -44,7 +44,7 @@ def _get_objects_access_for_user_email_from_salesforce(
|
||||
# This is cached in the function so the first query takes an extra 0.1-0.3 seconds
|
||||
# but subsequent queries for this source are essentially instant
|
||||
first_doc_id = chunks[0].document_id
|
||||
with get_session_context_manager() as db_session:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
salesforce_client = get_any_salesforce_client_for_doc_id(
|
||||
db_session, first_doc_id
|
||||
)
|
||||
@@ -217,7 +217,7 @@ def censor_salesforce_chunks(
|
||||
def _get_objects_access_for_user_email(
|
||||
object_ids: set[str], user_email: str
|
||||
) -> dict[str, bool]:
|
||||
with get_session_context_manager() as db_session:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
external_groups = fetch_external_groups_for_user_email_and_group_ids(
|
||||
db_session=db_session,
|
||||
user_email=user_email,
|
||||
|
||||
36
backend/ee/onyx/external_permissions/sharepoint/doc_sync.py
Normal file
36
backend/ee/onyx/external_permissions/sharepoint/doc_sync.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from collections.abc import Generator
|
||||
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
|
||||
from ee.onyx.external_permissions.utils import generic_doc_sync
|
||||
from onyx.access.models import DocExternalAccess
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.sharepoint.connector import SharepointConnector
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
SHAREPOINT_DOC_SYNC_TAG = "sharepoint_doc_sync"
|
||||
|
||||
|
||||
def sharepoint_doc_sync(
    cc_pair: ConnectorCredentialPair,
    fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
    fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
    callback: IndexingHeartbeatInterface | None = None,
) -> Generator[DocExternalAccess, None, None]:
    """Yield document-level external access for a SharePoint connector/credential pair.

    A `SharepointConnector` is built from the pair's connector config, loaded
    with the pair's credentials, and handed to `generic_doc_sync` as the slim
    connector. `fetch_all_existing_docs_fn` is accepted but not used here.
    """
    connector_kwargs = cc_pair.connector.connector_specific_config
    credentials = cc_pair.credential.credential_json

    connector = SharepointConnector(**connector_kwargs)
    connector.load_credentials(credentials)

    doc_access_stream = generic_doc_sync(
        cc_pair=cc_pair,
        fetch_all_existing_docs_ids_fn=fetch_all_existing_docs_ids_fn,
        callback=callback,
        doc_source=DocumentSource.SHAREPOINT,
        slim_connector=connector,
        label=SHAREPOINT_DOC_SYNC_TAG,
    )
    yield from doc_access_stream
|
||||
@@ -0,0 +1,63 @@
|
||||
from collections.abc import Generator
|
||||
|
||||
from office365.sharepoint.client_context import ClientContext # type: ignore[import-untyped]
|
||||
|
||||
from ee.onyx.db.external_perm import ExternalUserGroup
|
||||
from ee.onyx.external_permissions.sharepoint.permission_utils import (
|
||||
get_sharepoint_external_groups,
|
||||
)
|
||||
from onyx.connectors.sharepoint.connector import acquire_token_for_rest
|
||||
from onyx.connectors.sharepoint.connector import SharepointConnector
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def sharepoint_group_sync(
    tenant_id: str,
    cc_pair: ConnectorCredentialPair,
) -> Generator[ExternalUserGroup, None, None]:
    """Yield the external groups of every SharePoint site covered by the connector.

    Sites come from the connector's configured `site_descriptors`, falling
    back to `fetch_sites()` when none are configured. `tenant_id` is accepted
    but not used here.

    Raises:
        RuntimeError: if the connector has no MSAL app or tenant domain after
            loading credentials, or if no sites are found.
    """
    # Build the connector from its stored config and load its credentials
    connector = SharepointConnector(**cc_pair.connector.connector_specific_config)
    connector.load_credentials(cc_pair.credential.credential_json)

    msal_app = connector.msal_app
    if not msal_app:
        raise RuntimeError("MSAL app not initialized in connector")

    sp_tenant_domain = connector.sp_tenant_domain
    if not sp_tenant_domain:
        raise RuntimeError("Tenant domain not initialized in connector")

    # Either the configured sites or all sites
    site_descriptors = connector.site_descriptors or connector.fetch_sites()
    if not site_descriptors:
        raise RuntimeError("No SharePoint sites found for group sync")

    logger.info(f"Processing {len(site_descriptors)} sites for group sync")

    def _token_provider():  # type: ignore[no-untyped-def]
        # Invoked by the client context whenever it needs an access token
        return acquire_token_for_rest(msal_app, sp_tenant_domain)

    for site in site_descriptors:
        logger.debug(f"Processing site: {site.url}")

        site_ctx = ClientContext(site.url).with_access_token(_token_provider)

        for group in get_sharepoint_external_groups(site_ctx, connector.graph_client):
            logger.debug(
                f"Found group: {group.id} with {len(group.user_emails)} members"
            )
            yield group
|
||||
@@ -0,0 +1,684 @@
|
||||
import re
|
||||
from collections import deque
|
||||
from typing import Any
|
||||
from urllib.parse import unquote
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from office365.graph_client import GraphClient # type: ignore[import-untyped]
|
||||
from office365.onedrive.driveitems.driveItem import DriveItem # type: ignore[import-untyped]
|
||||
from office365.runtime.client_request import ClientRequestException # type: ignore
|
||||
from office365.sharepoint.client_context import ClientContext # type: ignore[import-untyped]
|
||||
from office365.sharepoint.permissions.securable_object import RoleAssignmentCollection # type: ignore[import-untyped]
|
||||
from pydantic import BaseModel
|
||||
|
||||
from ee.onyx.db.external_perm import ExternalUserGroup
|
||||
from onyx.access.models import ExternalAccess
|
||||
from onyx.access.utils import build_ext_group_name_for_onyx
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.sharepoint.connector import sleep_and_retry
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
# These values represent different types of SharePoint principals used in permission assignments
|
||||
USER_PRINCIPAL_TYPE = 1 # Individual user accounts
|
||||
ANONYMOUS_USER_PRINCIPAL_TYPE = 3 # Anonymous/unauthenticated users (public access)
|
||||
AZURE_AD_GROUP_PRINCIPAL_TYPE = 4 # Azure Active Directory security groups
|
||||
SHAREPOINT_GROUP_PRINCIPAL_TYPE = 8 # SharePoint site groups (local to the site)
|
||||
MICROSOFT_DOMAIN = ".onmicrosoft"
|
||||
# Limited Access role types: Limited Access is a pass-through permission, not an actual permission
|
||||
LIMITED_ACCESS_ROLE_TYPES = [1, 9]
|
||||
LIMITED_ACCESS_ROLE_NAMES = ["Limited Access", "Web-Only Limited Access"]
|
||||
|
||||
|
||||
class SharepointGroup(BaseModel):
    """Immutable description of a group principal found in SharePoint."""

    # Frozen so instances are hashable and can be stored in sets
    model_config = {"frozen": True}

    name: str
    login_name: str
    # One of the *_PRINCIPAL_TYPE constants defined in this module
    principal_type: int
|
||||
|
||||
|
||||
class GroupsResult(BaseModel):
    """Group lookup result: a group-to-emails map plus a public-group flag."""

    groups_to_emails: dict[str, set[str]]
    found_public_group: bool
|
||||
|
||||
|
||||
def _get_azuread_group_guid_by_name(
    graph_client: GraphClient, group_name: str
) -> str | None:
    """Look up an Azure AD group by display name and return its id.

    Args:
        graph_client: Graph client used to query groups.
        group_name: Display name to match exactly.

    Returns:
        The id of the first matching group, or None when nothing matches or
        the lookup fails (failures are logged, not raised).
    """
    try:
        # Single quotes inside an OData string literal must be doubled;
        # otherwise a name such as "Bob's Team" yields a malformed filter.
        escaped_name = group_name.replace("'", "''")

        # Search for groups by display name
        groups = sleep_and_retry(
            graph_client.groups.filter(f"displayName eq '{escaped_name}'").get(),
            "get_azuread_group_guid_by_name",
        )

        if groups and len(groups) > 0:
            return groups[0].id

        return None

    except Exception as e:
        logger.error(f"Failed to get Azure AD group GUID for name {group_name}: {e}")
        return None
|
||||
|
||||
|
||||
def _extract_guid_from_claims_token(claims_token: str) -> str | None:
    """Return the first GUID found in `claims_token`, or None.

    Claims tokens often have the format `c:0o.c|provider|GUID_suffix`.
    Matching is case-insensitive. Any failure is logged and None is returned.
    """
    try:
        found = re.search(
            r"([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})",
            claims_token,
            re.IGNORECASE,
        )
        return found.group(1) if found else None
    except Exception as e:
        logger.error(f"Failed to extract GUID from claims token {claims_token}: {e}")
        return None
|
||||
|
||||
|
||||
def _get_group_guid_from_identifier(
    graph_client: GraphClient, identifier: str
) -> str | None:
    """Resolve a group identifier to a GUID.

    Resolution order:
    1. The identifier already is a GUID -> returned unchanged.
    2. The identifier looks like a SharePoint claims token (starts with
       `c:0` and contains `|`) -> the GUID embedded in it, if any.
    3. Otherwise (or if step 2 finds nothing) -> lookup by display name.

    Any failure is logged and None is returned.
    """
    try:
        already_guid = re.match(
            r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",
            identifier,
            re.IGNORECASE,
        )
        if already_guid:
            return identifier

        looks_like_claims_token = identifier.startswith("c:0") and "|" in identifier
        if looks_like_claims_token:
            guid = _extract_guid_from_claims_token(identifier)
            if guid:
                logger.info(f"Extracted GUID {guid} from claims token {identifier}")
                return guid

        # Last resort: treat the identifier as a display name
        return _get_azuread_group_guid_by_name(graph_client, identifier)

    except Exception as e:
        logger.error(f"Failed to get group GUID from identifier {identifier}: {e}")
        return None
|
||||
|
||||
|
||||
def _get_security_group_owners(graph_client: GraphClient, group_id: str) -> list[str]:
    """Return the emails of the owners of the group with id `group_id`.

    For each owner, `mail` is preferred and `userPrincipalName` is the
    fallback; owners with neither are ignored. The `.onmicrosoft` marker is
    stripped from every address. On any failure the error is logged and an
    empty list is returned.
    """
    try:
        owners = sleep_and_retry(
            graph_client.groups[group_id].owners.get_all(page_loaded=lambda _: None),
            "get_security_group_owners",
        )

        collected: list[str] = []
        logger.info(f"Owners: {owners}")

        for owner in owners:
            owner_json = owner.to_json()

            # Prefer the mail attribute, fall back to the principal name
            address: str | None = owner_json.get("mail") or owner_json.get(
                "userPrincipalName"
            )
            if address:
                collected.append(address.replace(MICROSOFT_DOMAIN, ""))

        logger.info(
            f"Retrieved {len(collected)} owners from security group {group_id}"
        )
        return collected

    except Exception as e:
        logger.error(f"Failed to get security group owners for group {group_id}: {e}")
        return []
|
||||
|
||||
|
||||
def _get_sharepoint_list_item_id(drive_item: DriveItem) -> str | None:
    """Find the SharePoint list item id that belongs to `drive_item`.

    Sources are tried in order: the drive item's `listItem` (loaded first),
    its `sharepoint_ids.listItemId`, and finally any entry in `properties`
    whose name contains "listitemid" (case-insensitive). Returns None when
    none of them yields an id. Errors are logged and re-raised.
    """
    try:
        # 1) The list item attached directly to the drive item
        list_item = getattr(drive_item, "listItem", None)
        if list_item:
            # Load the list item properties so its ID is populated
            sleep_and_retry(list_item.get(), "get_sharepoint_list_item_id")
            loaded_id = getattr(list_item, "id", None)
            if loaded_id:
                return str(loaded_id)

        # 2) The sharepointIds property
        sharepoint_ids = getattr(drive_item, "sharepoint_ids", None)
        if sharepoint_ids and hasattr(sharepoint_ids, "listItemId"):
            return sharepoint_ids.listItemId

        # 3) A matching key in the generic properties bag
        properties = getattr(drive_item, "properties", None)
        if not properties:
            return None

        return next(
            (
                str(prop_value)
                for prop_name, prop_value in properties.items()
                if "listitemid" in prop_name.lower()
            ),
            None,
        )
    except Exception as e:
        logger.error(
            f"Error getting SharePoint list item ID for item {drive_item.id}: {e}"
        )
        raise e
|
||||
|
||||
|
||||
def _is_public_item(drive_item: DriveItem) -> bool:
    """Report whether any permission on `drive_item` is a sharing link scoped
    to "anonymous" or "organization".

    If the permissions cannot be retrieved the error is logged and the item
    is reported as not public.
    """
    public_scopes = ("anonymous", "organization")
    try:
        item_permissions = sleep_and_retry(
            drive_item.permissions.get_all(page_loaded=lambda _: None), "is_public_item"
        )
        for perm in item_permissions:
            if perm.link and perm.link.scope in public_scopes:
                return True
        return False
    except Exception as e:
        logger.error(f"Failed to check if item {drive_item.id} is public: {e}")
        return False
|
||||
|
||||
|
||||
def _is_public_login_name(login_name: str) -> bool:
    """Report whether `login_name` contains one of the known public-access markers.

    The markers are derived from:
    https://learn.microsoft.com/en-us/answers/questions/2085339/guid-in-the-loginname-of-site-user-everyone-except
    """
    public_markers: tuple[str, ...] = (
        "c:0-.f|rolemanager|spo-grid-all-users/",
        "c:0(.s|true",
    )
    is_public = any(marker in login_name for marker in public_markers)
    if is_public:
        logger.info(f"Login name {login_name} is public")
    return is_public
|
||||
|
||||
|
||||
# Azure AD allows multiple groups to share the same display name, so we append the group's GUID to the name
|
||||
def _get_group_name_with_suffix(
    login_name: str, group_name: str, graph_client: GraphClient
) -> str:
    """Build ``"<group_name>_<guid>"`` for a group.

    The suffix is whatever ``_get_group_guid_from_identifier`` resolves for
    ``login_name``; appending it keeps groups that share a display name apart.
    """
    return f"{group_name}_{_get_group_guid_from_identifier(graph_client, login_name)}"
|
||||
|
||||
|
||||
def _get_sharepoint_groups(
    client_context: ClientContext, group_name: str, graph_client: GraphClient
) -> tuple[set[SharepointGroup], set[str]]:
    """Collect the direct members of the SharePoint site group ``group_name``.

    Returns:
        A ``(groups, user_emails)`` tuple: the nested Azure AD / SharePoint
        groups found among the members, and the e-mail addresses of the user
        members (with ``MICROSOFT_DOMAIN`` stripped out).
    """
    groups: set[SharepointGroup] = set()
    user_emails: set[str] = set()
    group_principal_types = (
        AZURE_AD_GROUP_PRINCIPAL_TYPE,
        SHAREPOINT_GROUP_PRINCIPAL_TYPE,
    )

    def process_users(users: list[Any]) -> None:
        # Invoked once per loaded page of group members.
        for principal in users:
            logger.debug(f"User: {principal.to_json()}")

            if principal.principal_type == USER_PRINCIPAL_TYPE and hasattr(
                principal, "user_principal_name"
            ):
                upn = principal.user_principal_name
                if not upn:
                    logger.warning(
                        f"User don't have a user principal name: {principal.login_name}"
                    )
                    continue
                user_emails.add(
                    upn.replace(MICROSOFT_DOMAIN, "")
                    if MICROSOFT_DOMAIN in upn
                    else upn
                )
                continue

            if principal.principal_type not in group_principal_types:
                continue

            resolved_name = principal.title
            if principal.principal_type == AZURE_AD_GROUP_PRINCIPAL_TYPE:
                # AD display names are not unique, so the GUID is appended.
                resolved_name = _get_group_name_with_suffix(
                    principal.login_name, resolved_name, graph_client
                )
            groups.add(
                SharepointGroup(
                    login_name=principal.login_name,
                    principal_type=principal.principal_type,
                    name=resolved_name,
                )
            )

    site_group = client_context.web.site_groups.get_by_name(group_name)
    members_query = site_group.users.get_all(page_loaded=process_users)
    sleep_and_retry(members_query, "get_sharepoint_groups")

    return groups, user_emails
|
||||
|
||||
|
||||
def _get_azuread_groups(
    graph_client: GraphClient, group_name: str
) -> tuple[set[SharepointGroup], set[str]]:
    """Collect the direct members and owners of an Azure AD group.

    ``group_name`` is resolved to a group GUID first; when that fails an error
    is logged and two empty sets are returned.

    Returns:
        A ``(groups, user_emails)`` tuple: nested groups found among the
        members, and the e-mail addresses of user members plus the addresses
        returned by ``_get_security_group_owners``.
    """
    group_id = _get_group_guid_from_identifier(graph_client, group_name)
    if not group_id:
        logger.error(f"Failed to get Azure AD group GUID for name {group_name}")
        return set(), set()

    groups: set[SharepointGroup] = set()
    user_emails: set[str] = set()
    ad_group = graph_client.groups[group_id]

    def process_members(members: list[Any]) -> None:
        # Invoked once per loaded page of group members.
        for entry in members:
            payload = entry.to_json()
            logger.debug(f"Member: {payload}")

            upn = payload.get("userPrincipalName")
            mail = payload.get("mail")
            display_name = payload.get("displayName") or payload.get("display_name")

            # Users typically have userPrincipalName or a mail address.
            looks_like_user = bool(upn or (mail and "@" in str(mail)))

            # Groups typically have a displayName but no userPrincipalName;
            # confirm through group-specific properties or the presence of an id.
            looks_like_group = False
            if not looks_like_user and display_name:
                looks_like_group = bool(
                    hasattr(entry, "groupTypes")
                    or payload.get("groupTypes") is not None
                    or payload.get("id")
                )

            # Fallback: infer the kind from the object's class name.
            if not (looks_like_user or looks_like_group):
                type_name = type(entry).__name__.lower()
                if "user" in type_name:
                    looks_like_user = True
                elif "group" in type_name:
                    looks_like_group = True

            if looks_like_user:
                address = upn or mail
                if address:
                    user_emails.add(
                        address.replace(MICROSOFT_DOMAIN, "")
                        if MICROSOFT_DOMAIN in address
                        else address
                    )
                logger.info(f"Added user: {address}")
                continue

            if not looks_like_group:
                # Log unidentified members for debugging
                logger.warning(f"Could not identify member type for: {payload}")
                continue

            if not display_name:
                logger.error(f"No display name for group: {payload.get('id')}")
                continue

            member_id = payload.get("id", "")
            suffixed_name = _get_group_name_with_suffix(
                member_id, display_name, graph_client
            )
            groups.add(
                SharepointGroup(
                    login_name=member_id,  # Use ID for groups
                    principal_type=AZURE_AD_GROUP_PRINCIPAL_TYPE,
                    name=suffixed_name,
                )
            )
            logger.info(f"Added group: {suffixed_name}")

    members_query = ad_group.members.get_all(page_loaded=process_members)
    sleep_and_retry(members_query, "get_azuread_groups")

    user_emails.update(_get_security_group_owners(graph_client, group_id))

    return groups, user_emails
|
||||
|
||||
|
||||
def _get_groups_and_members_recursively(
    client_context: ClientContext,
    graph_client: GraphClient,
    groups: set[SharepointGroup],
    is_group_sync: bool = False,
) -> GroupsResult:
    """
    Expand ``groups`` breadth-first into a mapping of group name -> member e-mails.

    Nested groups discovered while expanding a group are queued and expanded in
    turn; each group is processed at most once (keyed on its login name).

    When an Azure AD group with a public login name is encountered:
    - outside of group sync, an empty result flagged as public is returned
      immediately;
    - during group sync, the group is not expanded and processing continues.

    Azure AD groups that answer with HTTP 404 are skipped with a warning; any
    other ``ClientRequestException`` propagates.
    """
    pending: deque[SharepointGroup] = deque(groups)
    seen_login_names: set[str] = set()
    emails_by_group_name: dict[str, set[str]] = {}
    saw_public_group = False

    while pending:
        current = pending.popleft()
        if current.login_name in seen_login_names:
            continue

        seen_login_names.add(current.login_name)
        emails_by_group_name[current.name] = set()
        logger.info(
            f"Processing group: {current.name} principal type: {current.principal_type}"
        )

        if current.principal_type == SHAREPOINT_GROUP_PRINCIPAL_TYPE:
            nested_groups, member_emails = _get_sharepoint_groups(
                client_context, current.login_name, graph_client
            )
            emails_by_group_name[current.name].update(member_emails)
            pending.extend(nested_groups)
        elif current.principal_type == AZURE_AD_GROUP_PRINCIPAL_TYPE:
            try:
                # if the site is public, we have default groups assigned to it, so we return early
                if _is_public_login_name(current.login_name):
                    saw_public_group = True
                    if is_group_sync:
                        # we don't want to sync public groups, so we skip them
                        continue
                    return GroupsResult(groups_to_emails={}, found_public_group=True)

                nested_groups, member_emails = _get_azuread_groups(
                    graph_client, current.login_name
                )
                emails_by_group_name[current.name].update(member_emails)
                pending.extend(nested_groups)
            except ClientRequestException as e:
                # If the group is not found, we skip it. A group can still be referenced
                # in SharePoint after it was removed from Azure AD. This is not officially
                # documented, but it has been observed in testing.
                group_is_gone = (
                    e.response is not None and e.response.status_code == 404
                )
                if not group_is_gone:
                    raise
                logger.warning(f"Group {current.login_name} not found")

    return GroupsResult(
        groups_to_emails=emails_by_group_name,
        found_public_group=saw_public_group,
    )
|
||||
|
||||
|
||||
def get_external_access_from_sharepoint(
    client_context: ClientContext,
    graph_client: GraphClient,
    drive_name: str | None,
    drive_item: DriveItem | None,
    site_page: dict[str, Any] | None,
    add_prefix: bool = False,
) -> ExternalAccess:
    """
    Get external access information from SharePoint.

    Exactly one kind of target is inspected: a drive item (``drive_item`` and
    ``drive_name`` both given) or, failing that, a site page.

    Args:
        client_context: SharePoint client used to read role assignments.
        graph_client: Graph client used to resolve and expand Azure AD groups.
        drive_name: Title of the list that holds ``drive_item``. The name
            "Shared Documents" is mapped to "Documents".
        drive_item: Drive item whose permissions are read.
        site_page: Site page payload; its "webUrl" locates the page file.
        add_prefix: When True, every group name is passed through
            ``build_ext_group_name_for_onyx`` before being lower-cased.

    Returns:
        A public ``ExternalAccess`` (empty users and groups) when the item has
        a public sharing link or a public group is found. Otherwise the
        directly assigned user e-mails and the lower-cased names of every
        group reached while expanding the assigned groups.

    Raises:
        RuntimeError: if the list item ID of ``drive_item`` cannot be
            determined, or if neither a drive item nor a site page is given.
    """
    groups: set[SharepointGroup] = set()
    user_emails: set[str] = set()
    group_ids: set[str] = set()
    group_principal_types = (
        AZURE_AD_GROUP_PRINCIPAL_TYPE,
        SHAREPOINT_GROUP_PRINCIPAL_TYPE,
    )

    # Add all members to a processing set first
    def add_user_and_group_to_sets(
        role_assignments: RoleAssignmentCollection,
    ) -> None:
        for assignment in role_assignments:
            logger.debug(f"Assignment: {assignment.to_json()}")

            # Skip assignments whose roles are all Limited Access: that is not
            # an actual permission, only a traverse-through permission.
            bindings = assignment.role_definition_bindings
            if bindings and all(
                binding.role_type_kind in LIMITED_ACCESS_ROLE_TYPES
                and binding.name in LIMITED_ACCESS_ROLE_NAMES
                for binding in bindings
            ):
                logger.info(
                    "Skipping assignment because it has only Limited Access role"
                )
                continue

            member = assignment.member
            if not member:
                continue

            if member.principal_type == USER_PRINCIPAL_TYPE and hasattr(
                member, "user_principal_name"
            ):
                email = member.user_principal_name
                if not email:
                    # A user principal may carry no UPN; without this guard the
                    # substring handling below would raise a TypeError.
                    logger.warning(
                        f"User does not have a user principal name: {member.login_name}"
                    )
                    continue
                user_emails.add(email.replace(MICROSOFT_DOMAIN, ""))
            elif member.principal_type in group_principal_types:
                name = member.title
                if member.principal_type == AZURE_AD_GROUP_PRINCIPAL_TYPE:
                    # AD display names are not unique, so the GUID is appended.
                    name = _get_group_name_with_suffix(
                        member.login_name, name, graph_client
                    )
                groups.add(
                    SharepointGroup(
                        login_name=member.login_name,
                        principal_type=member.principal_type,
                        name=name,
                    )
                )

    if drive_item and drive_name:
        # If the item has any public link there is nothing else to resolve.
        if _is_public_item(drive_item):
            logger.info(f"Item {drive_item.id} is public")
            return ExternalAccess(
                external_user_emails=set(),
                external_user_group_ids=set(),
                is_public=True,
            )

        item_id = _get_sharepoint_list_item_id(drive_item)
        if not item_id:
            raise RuntimeError(
                f"Failed to get SharePoint list item ID for item {drive_item.id}"
            )

        if drive_name == "Shared Documents":
            drive_name = "Documents"

        item = client_context.web.lists.get_by_title(drive_name).items.get_by_id(
            item_id
        )
    elif site_page:
        site_url = site_page.get("webUrl")
        # Prefer server-relative URL to avoid OData filters that break on apostrophes
        server_relative_url = unquote(urlparse(site_url).path)
        file_obj = client_context.web.get_file_by_server_relative_url(
            server_relative_url
        )
        item = file_obj.listItemAllFields
    else:
        raise RuntimeError("No drive item or site page provided")

    # Both kinds of target expose their permissions the same way.
    sleep_and_retry(
        item.role_assignments.expand(["Member", "RoleDefinitionBindings"]).get_all(
            page_loaded=add_user_and_group_to_sets,
        ),
        "get_external_access_from_sharepoint",
    )

    groups_and_members: GroupsResult = _get_groups_and_members_recursively(
        client_context, graph_client, groups
    )

    # If the site is public, we have default groups assigned to it, so we return early
    if groups_and_members.found_public_group:
        return ExternalAccess(
            external_user_emails=set(),
            external_user_group_ids=set(),
            is_public=True,
        )

    for group_name in groups_and_members.groups_to_emails:
        if add_prefix:
            group_name = build_ext_group_name_for_onyx(
                group_name, DocumentSource.SHAREPOINT
            )
        group_ids.add(group_name.lower())

    logger.info(f"User emails: {len(user_emails)}")
    logger.info(f"Group IDs: {len(group_ids)}")

    return ExternalAccess(
        external_user_emails=user_emails,
        external_user_group_ids=group_ids,
        is_public=False,
    )
|
||||
|
||||
|
||||
def get_sharepoint_external_groups(
    client_context: ClientContext, graph_client: GraphClient
) -> list[ExternalUserGroup]:
    """Build the external user groups for a SharePoint site.

    Two sources are combined:
    1. Groups assigned to the site through role assignments (Limited-Access-only
       assignments are ignored), expanded recursively into member e-mails.
    2. Every Azure AD group returned by the Graph client that was not already
       covered by step 1, with the e-mails of its direct members.

    Returns:
        One ``ExternalUserGroup`` per group name, carrying its member e-mails.
        Azure AD groups from step 2 that have no members yield no entry.
    """
    groups: set[SharepointGroup] = set()
    group_principal_types = (
        AZURE_AD_GROUP_PRINCIPAL_TYPE,
        SHAREPOINT_GROUP_PRINCIPAL_TYPE,
    )

    def add_group_to_sets(role_assignments: RoleAssignmentCollection) -> None:
        for assignment in role_assignments:
            # Skip assignments whose roles are all Limited Access: that is not
            # an actual permission, only a traverse-through permission.
            bindings = assignment.role_definition_bindings
            if bindings and all(
                binding.role_type_kind in LIMITED_ACCESS_ROLE_TYPES
                and binding.name in LIMITED_ACCESS_ROLE_NAMES
                for binding in bindings
            ):
                logger.info(
                    "Skipping assignment because it has only Limited Access role"
                )
                continue

            member = assignment.member
            if not member or member.principal_type not in group_principal_types:
                continue

            name = member.title
            if member.principal_type == AZURE_AD_GROUP_PRINCIPAL_TYPE:
                name = _get_group_name_with_suffix(
                    member.login_name, name, graph_client
                )
            groups.add(
                SharepointGroup(
                    login_name=member.login_name,
                    principal_type=member.principal_type,
                    name=name,
                )
            )

    sleep_and_retry(
        client_context.web.role_assignments.expand(
            ["Member", "RoleDefinitionBindings"]
        ).get_all(page_loaded=add_group_to_sets),
        "get_sharepoint_external_groups",
    )
    groups_and_members: GroupsResult = _get_groups_and_members_recursively(
        client_context, graph_client, groups, is_group_sync=True
    )

    # get all Azure AD groups because if any group is assigned to the drive item, we don't want to miss them
    # We can't assign sharepoint groups to drive items or drives, so we don't need to get all sharepoint groups
    azure_ad_groups = sleep_and_retry(
        graph_client.groups.get_all(page_loaded=lambda _: None),
        "get_sharepoint_external_groups:get_azure_ad_groups",
    )
    logger.info(f"Azure AD Groups: {len(azure_ad_groups)}")

    identified_groups: set[str] = set(groups_and_members.groups_to_emails)
    ad_groups_to_emails: dict[str, set[str]] = {}
    for group in azure_ad_groups:
        # AD groups allow the same display name for multiple groups, so the GUID
        # is appended to the name.
        name = _get_group_name_with_suffix(group.id, group.display_name, graph_client)

        # Already-expanded AD groups are keyed by their suffixed name, so the
        # suffixed name (not the bare display name) is what must be compared.
        if name in identified_groups:
            continue

        members = sleep_and_retry(
            group.members.get_all(page_loaded=lambda _: None),
            "get_sharepoint_external_groups:get_azure_ad_groups:get_members",
        )
        for member in members:
            member_data = member.to_json()
            emails = ad_groups_to_emails.setdefault(name, set())
            address = member_data.get("userPrincipalName") or member_data.get("mail")
            if address:
                emails.add(address.replace(MICROSOFT_DOMAIN, ""))

    external_user_groups: list[ExternalUserGroup] = [
        ExternalUserGroup(id=group_name, user_emails=list(emails))
        for group_name, emails in groups_and_members.groups_to_emails.items()
    ]
    external_user_groups.extend(
        ExternalUserGroup(id=group_name, user_emails=list(emails))
        for group_name, emails in ad_groups_to_emails.items()
    )

    return external_user_groups
|
||||
@@ -3,6 +3,7 @@ from collections.abc import Generator
|
||||
from slack_sdk import WebClient
|
||||
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
|
||||
from ee.onyx.external_permissions.slack.utils import fetch_user_id_to_email_map
|
||||
from onyx.access.models import DocExternalAccess
|
||||
from onyx.access.models import ExternalAccess
|
||||
@@ -130,6 +131,7 @@ def _get_slack_document_access(
|
||||
def slack_doc_sync(
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
|
||||
callback: IndexingHeartbeatInterface | None,
|
||||
) -> Generator[DocExternalAccess, None, None]:
|
||||
"""
|
||||
|
||||
@@ -7,21 +7,34 @@ from pydantic import BaseModel
|
||||
from ee.onyx.configs.app_configs import CONFLUENCE_PERMISSION_DOC_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import CONFLUENCE_PERMISSION_GROUP_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import DEFAULT_PERMISSION_DOC_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import GITHUB_PERMISSION_DOC_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import GITHUB_PERMISSION_GROUP_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import GOOGLE_DRIVE_PERMISSION_GROUP_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import JIRA_PERMISSION_DOC_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import SHAREPOINT_PERMISSION_DOC_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import SHAREPOINT_PERMISSION_GROUP_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import SLACK_PERMISSION_DOC_SYNC_FREQUENCY
|
||||
from ee.onyx.configs.app_configs import TEAMS_PERMISSION_DOC_SYNC_FREQUENCY
|
||||
from ee.onyx.external_permissions.confluence.doc_sync import confluence_doc_sync
|
||||
from ee.onyx.external_permissions.confluence.group_sync import confluence_group_sync
|
||||
from ee.onyx.external_permissions.github.doc_sync import github_doc_sync
|
||||
from ee.onyx.external_permissions.github.group_sync import github_group_sync
|
||||
from ee.onyx.external_permissions.gmail.doc_sync import gmail_doc_sync
|
||||
from ee.onyx.external_permissions.google_drive.doc_sync import gdrive_doc_sync
|
||||
from ee.onyx.external_permissions.google_drive.group_sync import gdrive_group_sync
|
||||
from ee.onyx.external_permissions.jira.doc_sync import jira_doc_sync
|
||||
from ee.onyx.external_permissions.perm_sync_types import CensoringFuncType
|
||||
from ee.onyx.external_permissions.perm_sync_types import DocSyncFuncType
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
|
||||
from ee.onyx.external_permissions.perm_sync_types import GroupSyncFuncType
|
||||
from ee.onyx.external_permissions.salesforce.postprocessing import (
|
||||
censor_salesforce_chunks,
|
||||
)
|
||||
from ee.onyx.external_permissions.sharepoint.doc_sync import sharepoint_doc_sync
|
||||
from ee.onyx.external_permissions.sharepoint.group_sync import sharepoint_group_sync
|
||||
from ee.onyx.external_permissions.slack.doc_sync import slack_doc_sync
|
||||
from ee.onyx.external_permissions.teams.doc_sync import teams_doc_sync
|
||||
from onyx.configs.constants import DocumentSource
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -59,6 +72,7 @@ class SyncConfig(BaseModel):
|
||||
def mock_doc_sync(
|
||||
cc_pair: "ConnectorCredentialPair",
|
||||
fetch_all_docs_fn: FetchAllDocumentsFunction,
|
||||
fetch_all_docs_ids_fn: FetchAllDocumentsIdsFunction,
|
||||
callback: Optional["IndexingHeartbeatInterface"],
|
||||
) -> Generator["DocExternalAccess", None, None]:
|
||||
"""Mock doc sync function for testing - returns empty list since permissions are fetched during indexing"""
|
||||
@@ -90,15 +104,21 @@ _SOURCE_TO_SYNC_CONFIG: dict[DocumentSource, SyncConfig] = {
|
||||
group_sync_is_cc_pair_agnostic=True,
|
||||
),
|
||||
),
|
||||
DocumentSource.JIRA: SyncConfig(
|
||||
doc_sync_config=DocSyncConfig(
|
||||
doc_sync_frequency=JIRA_PERMISSION_DOC_SYNC_FREQUENCY,
|
||||
doc_sync_func=jira_doc_sync,
|
||||
initial_index_should_sync=True,
|
||||
),
|
||||
),
|
||||
# Groups are not needed for Slack.
|
||||
# All channel access is done at the individual user level.
|
||||
DocumentSource.SLACK: SyncConfig(
|
||||
doc_sync_config=DocSyncConfig(
|
||||
doc_sync_frequency=SLACK_PERMISSION_DOC_SYNC_FREQUENCY,
|
||||
doc_sync_func=slack_doc_sync,
|
||||
initial_index_should_sync=True,
|
||||
),
|
||||
# groups are not needed for Slack. All channel access is done at the
|
||||
# individual user level
|
||||
group_sync_config=None,
|
||||
),
|
||||
DocumentSource.GMAIL: SyncConfig(
|
||||
doc_sync_config=DocSyncConfig(
|
||||
@@ -107,6 +127,18 @@ _SOURCE_TO_SYNC_CONFIG: dict[DocumentSource, SyncConfig] = {
|
||||
initial_index_should_sync=False,
|
||||
),
|
||||
),
|
||||
DocumentSource.GITHUB: SyncConfig(
|
||||
doc_sync_config=DocSyncConfig(
|
||||
doc_sync_frequency=GITHUB_PERMISSION_DOC_SYNC_FREQUENCY,
|
||||
doc_sync_func=github_doc_sync,
|
||||
initial_index_should_sync=True,
|
||||
),
|
||||
group_sync_config=GroupSyncConfig(
|
||||
group_sync_frequency=GITHUB_PERMISSION_GROUP_SYNC_FREQUENCY,
|
||||
group_sync_func=github_group_sync,
|
||||
group_sync_is_cc_pair_agnostic=False,
|
||||
),
|
||||
),
|
||||
DocumentSource.SALESFORCE: SyncConfig(
|
||||
censoring_config=CensoringConfig(
|
||||
chunk_censoring_func=censor_salesforce_chunks,
|
||||
@@ -119,6 +151,27 @@ _SOURCE_TO_SYNC_CONFIG: dict[DocumentSource, SyncConfig] = {
|
||||
initial_index_should_sync=True,
|
||||
),
|
||||
),
|
||||
# Groups are not needed for Teams.
|
||||
# All channel access is done at the individual user level.
|
||||
DocumentSource.TEAMS: SyncConfig(
|
||||
doc_sync_config=DocSyncConfig(
|
||||
doc_sync_frequency=TEAMS_PERMISSION_DOC_SYNC_FREQUENCY,
|
||||
doc_sync_func=teams_doc_sync,
|
||||
initial_index_should_sync=True,
|
||||
),
|
||||
),
|
||||
DocumentSource.SHAREPOINT: SyncConfig(
|
||||
doc_sync_config=DocSyncConfig(
|
||||
doc_sync_frequency=SHAREPOINT_PERMISSION_DOC_SYNC_FREQUENCY,
|
||||
doc_sync_func=sharepoint_doc_sync,
|
||||
initial_index_should_sync=True,
|
||||
),
|
||||
group_sync_config=GroupSyncConfig(
|
||||
group_sync_frequency=SHAREPOINT_PERMISSION_GROUP_SYNC_FREQUENCY,
|
||||
group_sync_func=sharepoint_group_sync,
|
||||
group_sync_is_cc_pair_agnostic=False,
|
||||
),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
|
||||
37
backend/ee/onyx/external_permissions/teams/doc_sync.py
Normal file
37
backend/ee/onyx/external_permissions/teams/doc_sync.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from collections.abc import Generator
|
||||
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsFunction
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
|
||||
from ee.onyx.external_permissions.utils import generic_doc_sync
|
||||
from onyx.access.models import DocExternalAccess
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.teams.connector import TeamsConnector
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
TEAMS_DOC_SYNC_LABEL = "teams_doc_sync"
|
||||
|
||||
|
||||
def teams_doc_sync(
    cc_pair: ConnectorCredentialPair,
    fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
    fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
    callback: IndexingHeartbeatInterface | None,
) -> Generator[DocExternalAccess, None, None]:
    """Yield external-access permissions for the Teams documents of ``cc_pair``.

    A ``TeamsConnector`` is built from the connector's specific config, given
    the pair's credentials, and handed to ``generic_doc_sync`` as the slim
    connector. ``fetch_all_existing_docs_fn`` is accepted but not used here.
    """
    connector_config = cc_pair.connector.connector_specific_config
    slim_connector = TeamsConnector(**connector_config)
    slim_connector.load_credentials(cc_pair.credential.credential_json)

    yield from generic_doc_sync(
        cc_pair=cc_pair,
        fetch_all_existing_docs_ids_fn=fetch_all_existing_docs_ids_fn,
        callback=callback,
        doc_source=DocumentSource.TEAMS,
        slim_connector=slim_connector,
        label=TEAMS_DOC_SYNC_LABEL,
    )
|
||||
83
backend/ee/onyx/external_permissions/utils.py
Normal file
83
backend/ee/onyx/external_permissions/utils.py
Normal file
@@ -0,0 +1,83 @@
|
||||
from collections.abc import Generator
|
||||
|
||||
from ee.onyx.external_permissions.perm_sync_types import FetchAllDocumentsIdsFunction
|
||||
from onyx.access.models import DocExternalAccess
|
||||
from onyx.access.models import ExternalAccess
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.interfaces import SlimConnector
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def generic_doc_sync(
    cc_pair: ConnectorCredentialPair,
    fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
    callback: IndexingHeartbeatInterface | None,
    doc_source: DocumentSource,
    slim_connector: SlimConnector,
    label: str,
) -> Generator[DocExternalAccess, None, None]:
    """
    A convenience function for performing a generic document synchronization.

    Notes:
        A generic doc sync includes, in this order:
            - fetching *all* (slim) docs from the source and yielding the
              external-access permissions of each one
            - fetching the IDs of the docs that already exist
            - yielding empty external-access permissions
              (``ExternalAccess.empty()``) for every existing doc that was not
              present in the freshly fetched slim-docs set

    Args:
        cc_pair: The connector/credential pair being synced (used for logging).
        fetch_all_existing_docs_ids_fn: Returns the IDs of the existing docs.
        callback: Optional heartbeat; checked for a stop signal and notified of
            progress once per fetched batch.
        doc_source: The source being synced (used for logging).
        slim_connector: Connector that provides the slim documents.
        label: Tag used for progress reporting and in the stop-signal error.

    Returns:
        A `Generator` which yields newly fetched and then removed-document
        external-access permissions.

    Raises:
        RuntimeError: if the callback signals a stop, or if a fetched document
            carries no external access.
    """
    logger.info(f"Starting {doc_source} doc sync for CC Pair ID: {cc_pair.id}")

    newly_fetched_doc_ids: set[str] = set()

    logger.info(f"Fetching all slim documents from {doc_source}")
    for doc_batch in slim_connector.retrieve_all_slim_documents(callback=callback):
        logger.info(f"Got {len(doc_batch)} slim documents from {doc_source}")

        if callback:
            if callback.should_stop():
                raise RuntimeError(f"{label}: Stop signal detected")
            callback.progress(label, 1)

        for doc in doc_batch:
            if not doc.external_access:
                raise RuntimeError(
                    f"No external access found for document ID; {cc_pair.id=} {doc_source=} {doc.id=}"
                )

            newly_fetched_doc_ids.add(doc.id)

            yield DocExternalAccess(
                doc_id=doc.id,
                external_access=doc.external_access,
            )

    logger.info(f"Querying existing document IDs for CC Pair ID: {cc_pair.id=}")
    existing_doc_ids: list[str] = fetch_all_existing_docs_ids_fn()

    missing_doc_ids = set(existing_doc_ids) - newly_fetched_doc_ids

    if missing_doc_ids:
        logger.warning(
            f"Found {len(missing_doc_ids)=} documents that are in the DB but not present in fetch. Making them inaccessible."
        )

        for missing_id in missing_doc_ids:
            logger.warning(f"Removing access for {missing_id=}")
            yield DocExternalAccess(
                doc_id=missing_id,
                external_access=ExternalAccess.empty(),
            )

    # Logged on every successful run, including when no document is missing.
    logger.info(f"Finished {doc_source} doc sync")
|
||||
@@ -206,7 +206,7 @@ def _handle_standard_answers(
|
||||
|
||||
restate_question_blocks = get_restate_blocks(
|
||||
msg=query_msg.message,
|
||||
is_bot_msg=message_info.is_bot_msg,
|
||||
is_slash_command=message_info.is_slash_command,
|
||||
)
|
||||
|
||||
answer_blocks = build_standard_answer_blocks(
|
||||
|
||||
@@ -19,7 +19,7 @@ from ee.onyx.db.analytics import fetch_query_analytics
|
||||
from ee.onyx.db.analytics import user_can_view_assistant_stats
|
||||
from onyx.auth.users import current_admin_user
|
||||
from onyx.auth.users import current_user
|
||||
from onyx.db.engine import get_session
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.models import User
|
||||
|
||||
router = APIRouter(prefix="/analytics")
|
||||
|
||||
@@ -17,7 +17,7 @@ from onyx.background.celery.versioned_apps.client import app as client_app
|
||||
from onyx.db.connector_credential_pair import (
|
||||
get_connector_credential_pair_from_id_for_user,
|
||||
)
|
||||
from onyx.db.engine import get_session
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.models import User
|
||||
from onyx.redis.redis_connector import RedisConnector
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
|
||||
@@ -26,9 +26,9 @@ from onyx.auth.users import current_admin_user
|
||||
from onyx.auth.users import current_user_with_expired_token
|
||||
from onyx.auth.users import get_user_manager
|
||||
from onyx.auth.users import UserManager
|
||||
from onyx.db.engine import get_session
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.models import User
|
||||
from onyx.file_store.file_store import PostgresBackedFileStore
|
||||
from onyx.file_store.file_store import get_default_file_store
|
||||
from onyx.server.utils import BasicAuthenticationError
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
@@ -134,19 +134,19 @@ def ee_fetch_settings() -> EnterpriseSettings:
|
||||
def put_logo(
|
||||
file: UploadFile,
|
||||
is_logotype: bool = False,
|
||||
db_session: Session = Depends(get_session),
|
||||
_: User | None = Depends(current_admin_user),
|
||||
) -> None:
|
||||
upload_logo(file=file, db_session=db_session, is_logotype=is_logotype)
|
||||
upload_logo(file=file, is_logotype=is_logotype)
|
||||
|
||||
|
||||
def fetch_logo_helper(db_session: Session) -> Response:
|
||||
try:
|
||||
file_store = PostgresBackedFileStore(db_session)
|
||||
file_store = get_default_file_store()
|
||||
onyx_file = file_store.get_file_with_mime_type(get_logo_filename())
|
||||
if not onyx_file:
|
||||
raise ValueError("get_onyx_file returned None!")
|
||||
except Exception:
|
||||
logger.exception("Faield to fetch logo file")
|
||||
raise HTTPException(
|
||||
status_code=404,
|
||||
detail="No logo file found",
|
||||
@@ -157,7 +157,7 @@ def fetch_logo_helper(db_session: Session) -> Response:
|
||||
|
||||
def fetch_logotype_helper(db_session: Session) -> Response:
|
||||
try:
|
||||
file_store = PostgresBackedFileStore(db_session)
|
||||
file_store = get_default_file_store()
|
||||
onyx_file = file_store.get_file_with_mime_type(get_logotype_filename())
|
||||
if not onyx_file:
|
||||
raise ValueError("get_onyx_file returned None!")
|
||||
|
||||
@@ -6,7 +6,6 @@ from typing import IO
|
||||
|
||||
from fastapi import HTTPException
|
||||
from fastapi import UploadFile
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ee.onyx.server.enterprise_settings.models import AnalyticsScriptUpload
|
||||
from ee.onyx.server.enterprise_settings.models import EnterpriseSettings
|
||||
@@ -99,9 +98,7 @@ def guess_file_type(filename: str) -> str:
|
||||
return "application/octet-stream"
|
||||
|
||||
|
||||
def upload_logo(
|
||||
db_session: Session, file: UploadFile | str, is_logotype: bool = False
|
||||
) -> bool:
|
||||
def upload_logo(file: UploadFile | str, is_logotype: bool = False) -> bool:
|
||||
content: IO[Any]
|
||||
|
||||
if isinstance(file, str):
|
||||
@@ -129,13 +126,13 @@ def upload_logo(
|
||||
display_name = file.filename
|
||||
file_type = file.content_type or "image/jpeg"
|
||||
|
||||
file_store = get_default_file_store(db_session)
|
||||
file_store = get_default_file_store()
|
||||
file_store.save_file(
|
||||
file_name=_LOGOTYPE_FILENAME if is_logotype else _LOGO_FILENAME,
|
||||
content=content,
|
||||
display_name=display_name,
|
||||
file_origin=FileOrigin.OTHER,
|
||||
file_type=file_type,
|
||||
file_id=_LOGOTYPE_FILENAME if is_logotype else _LOGO_FILENAME,
|
||||
)
|
||||
return True
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ from ee.onyx.db.standard_answer import remove_standard_answer
|
||||
from ee.onyx.db.standard_answer import update_standard_answer
|
||||
from ee.onyx.db.standard_answer import update_standard_answer_category
|
||||
from onyx.auth.users import current_admin_user
|
||||
from onyx.db.engine import get_session
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.models import User
|
||||
from onyx.server.manage.models import StandardAnswer
|
||||
from onyx.server.manage.models import StandardAnswerCategory
|
||||
|
||||
@@ -11,7 +11,7 @@ from ee.onyx.auth.users import decode_anonymous_user_jwt_token
|
||||
from onyx.auth.api_key import extract_tenant_from_api_key_header
|
||||
from onyx.configs.constants import ANONYMOUS_USER_COOKIE_NAME
|
||||
from onyx.configs.constants import TENANT_ID_COOKIE_NAME
|
||||
from onyx.db.engine import is_valid_schema_name
|
||||
from onyx.db.engine.sql_engine import is_valid_schema_name
|
||||
from onyx.redis.redis_pool import retrieve_auth_token_data_from_redis
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
|
||||
|
||||
@@ -12,10 +12,10 @@ from ee.onyx.server.oauth.slack import SlackOAuth
|
||||
from onyx.auth.users import current_admin_user
|
||||
from onyx.configs.app_configs import DEV_MODE
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.db.engine import get_current_tenant_id
|
||||
from onyx.db.models import User
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@@ -25,12 +25,12 @@ from onyx.connectors.confluence.utils import CONFLUENCE_OAUTH_TOKEN_URL
|
||||
from onyx.db.credentials import create_credential
|
||||
from onyx.db.credentials import fetch_credential_by_id_for_user
|
||||
from onyx.db.credentials import update_credential_json
|
||||
from onyx.db.engine import get_current_tenant_id
|
||||
from onyx.db.engine import get_session
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.models import User
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.server.documents.models import CredentialBase
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@@ -33,11 +33,11 @@ from onyx.connectors.google_utils.shared_constants import (
|
||||
GoogleOAuthAuthenticationMethod,
|
||||
)
|
||||
from onyx.db.credentials import create_credential
|
||||
from onyx.db.engine import get_current_tenant_id
|
||||
from onyx.db.engine import get_session
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.models import User
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.server.documents.models import CredentialBase
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
|
||||
class GoogleDriveOAuth:
|
||||
|
||||
@@ -17,11 +17,11 @@ from onyx.configs.app_configs import OAUTH_SLACK_CLIENT_SECRET
|
||||
from onyx.configs.app_configs import WEB_DOMAIN
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.db.credentials import create_credential
|
||||
from onyx.db.engine import get_current_tenant_id
|
||||
from onyx.db.engine import get_session
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.models import User
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.server.documents.models import CredentialBase
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
|
||||
class SlackOAuth:
|
||||
|
||||
@@ -1,51 +1,30 @@
|
||||
import re
|
||||
from typing import cast
|
||||
|
||||
from fastapi import APIRouter
|
||||
from fastapi import Depends
|
||||
from fastapi import HTTPException
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ee.onyx.server.query_and_chat.models import AgentAnswer
|
||||
from ee.onyx.server.query_and_chat.models import AgentSubQuery
|
||||
from ee.onyx.server.query_and_chat.models import AgentSubQuestion
|
||||
from ee.onyx.server.query_and_chat.models import BasicCreateChatMessageRequest
|
||||
from ee.onyx.server.query_and_chat.models import (
|
||||
BasicCreateChatMessageWithHistoryRequest,
|
||||
)
|
||||
from ee.onyx.server.query_and_chat.models import ChatBasicResponse
|
||||
from onyx.auth.users import current_user
|
||||
from onyx.chat.chat_utils import combine_message_thread
|
||||
from onyx.chat.chat_utils import create_chat_chain
|
||||
from onyx.chat.models import AgentAnswerPiece
|
||||
from onyx.chat.models import AllCitations
|
||||
from onyx.chat.models import ExtendedToolResponse
|
||||
from onyx.chat.models import FinalUsedContextDocsResponse
|
||||
from onyx.chat.models import LlmDoc
|
||||
from onyx.chat.models import LLMRelevanceFilterResponse
|
||||
from onyx.chat.models import OnyxAnswerPiece
|
||||
from onyx.chat.models import QADocsResponse
|
||||
from onyx.chat.models import RefinedAnswerImprovement
|
||||
from onyx.chat.models import StreamingError
|
||||
from onyx.chat.models import SubQueryPiece
|
||||
from onyx.chat.models import SubQuestionIdentifier
|
||||
from onyx.chat.models import SubQuestionPiece
|
||||
from onyx.chat.process_message import ChatPacketStream
|
||||
from onyx.chat.models import ChatBasicResponse
|
||||
from onyx.chat.process_message import gather_stream
|
||||
from onyx.chat.process_message import stream_chat_message_objects
|
||||
from onyx.configs.chat_configs import CHAT_TARGET_CHUNK_PERCENTAGE
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.context.search.models import OptionalSearchSetting
|
||||
from onyx.context.search.models import RetrievalDetails
|
||||
from onyx.context.search.models import SavedSearchDoc
|
||||
from onyx.db.chat import create_chat_session
|
||||
from onyx.db.chat import create_new_chat_message
|
||||
from onyx.db.chat import get_or_create_root_message
|
||||
from onyx.db.engine import get_session
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.models import User
|
||||
from onyx.llm.factory import get_llms_for_persona
|
||||
from onyx.natural_language_processing.utils import get_tokenizer
|
||||
from onyx.secondary_llm_flows.query_expansion import thread_based_query_rephrase
|
||||
from onyx.server.query_and_chat.models import ChatMessageDetail
|
||||
from onyx.server.query_and_chat.models import CreateChatMessageRequest
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
@@ -54,177 +33,6 @@ logger = setup_logger()
|
||||
router = APIRouter(prefix="/chat")
|
||||
|
||||
|
||||
def _get_final_context_doc_indices(
|
||||
final_context_docs: list[LlmDoc] | None,
|
||||
top_docs: list[SavedSearchDoc] | None,
|
||||
) -> list[int] | None:
|
||||
"""
|
||||
this function returns a list of indices of the simple search docs
|
||||
that were actually fed to the LLM.
|
||||
"""
|
||||
if final_context_docs is None or top_docs is None:
|
||||
return None
|
||||
|
||||
final_context_doc_ids = {doc.document_id for doc in final_context_docs}
|
||||
return [
|
||||
i for i, doc in enumerate(top_docs) if doc.document_id in final_context_doc_ids
|
||||
]
|
||||
|
||||
|
||||
def _convert_packet_stream_to_response(
|
||||
packets: ChatPacketStream,
|
||||
) -> ChatBasicResponse:
|
||||
response = ChatBasicResponse()
|
||||
final_context_docs: list[LlmDoc] = []
|
||||
|
||||
answer = ""
|
||||
|
||||
# accumulate stream data with these dicts
|
||||
agent_sub_questions: dict[tuple[int, int], AgentSubQuestion] = {}
|
||||
agent_answers: dict[tuple[int, int], AgentAnswer] = {}
|
||||
agent_sub_queries: dict[tuple[int, int, int], AgentSubQuery] = {}
|
||||
|
||||
for packet in packets:
|
||||
if isinstance(packet, OnyxAnswerPiece) and packet.answer_piece:
|
||||
answer += packet.answer_piece
|
||||
elif isinstance(packet, QADocsResponse):
|
||||
response.top_documents = packet.top_documents
|
||||
|
||||
# This is a no-op if agent_sub_questions hasn't already been filled
|
||||
if packet.level is not None and packet.level_question_num is not None:
|
||||
id = (packet.level, packet.level_question_num)
|
||||
if id in agent_sub_questions:
|
||||
agent_sub_questions[id].document_ids = [
|
||||
saved_search_doc.document_id
|
||||
for saved_search_doc in packet.top_documents
|
||||
]
|
||||
elif isinstance(packet, StreamingError):
|
||||
response.error_msg = packet.error
|
||||
elif isinstance(packet, ChatMessageDetail):
|
||||
response.message_id = packet.message_id
|
||||
elif isinstance(packet, LLMRelevanceFilterResponse):
|
||||
response.llm_selected_doc_indices = packet.llm_selected_doc_indices
|
||||
|
||||
# TODO: deprecate `llm_chunks_indices`
|
||||
response.llm_chunks_indices = packet.llm_selected_doc_indices
|
||||
elif isinstance(packet, FinalUsedContextDocsResponse):
|
||||
final_context_docs = packet.final_context_docs
|
||||
elif isinstance(packet, AllCitations):
|
||||
response.cited_documents = {
|
||||
citation.citation_num: citation.document_id
|
||||
for citation in packet.citations
|
||||
}
|
||||
# agentic packets
|
||||
elif isinstance(packet, SubQuestionPiece):
|
||||
if packet.level is not None and packet.level_question_num is not None:
|
||||
id = (packet.level, packet.level_question_num)
|
||||
if agent_sub_questions.get(id) is None:
|
||||
agent_sub_questions[id] = AgentSubQuestion(
|
||||
level=packet.level,
|
||||
level_question_num=packet.level_question_num,
|
||||
sub_question=packet.sub_question,
|
||||
document_ids=[],
|
||||
)
|
||||
else:
|
||||
agent_sub_questions[id].sub_question += packet.sub_question
|
||||
|
||||
elif isinstance(packet, AgentAnswerPiece):
|
||||
if packet.level is not None and packet.level_question_num is not None:
|
||||
id = (packet.level, packet.level_question_num)
|
||||
if agent_answers.get(id) is None:
|
||||
agent_answers[id] = AgentAnswer(
|
||||
level=packet.level,
|
||||
level_question_num=packet.level_question_num,
|
||||
answer=packet.answer_piece,
|
||||
answer_type=packet.answer_type,
|
||||
)
|
||||
else:
|
||||
agent_answers[id].answer += packet.answer_piece
|
||||
elif isinstance(packet, SubQueryPiece):
|
||||
if packet.level is not None and packet.level_question_num is not None:
|
||||
sub_query_id = (
|
||||
packet.level,
|
||||
packet.level_question_num,
|
||||
packet.query_id,
|
||||
)
|
||||
if agent_sub_queries.get(sub_query_id) is None:
|
||||
agent_sub_queries[sub_query_id] = AgentSubQuery(
|
||||
level=packet.level,
|
||||
level_question_num=packet.level_question_num,
|
||||
sub_query=packet.sub_query,
|
||||
query_id=packet.query_id,
|
||||
)
|
||||
else:
|
||||
agent_sub_queries[sub_query_id].sub_query += packet.sub_query
|
||||
elif isinstance(packet, ExtendedToolResponse):
|
||||
# we shouldn't get this ... it gets intercepted and translated to QADocsResponse
|
||||
logger.warning(
|
||||
"_convert_packet_stream_to_response: Unexpected chat packet type ExtendedToolResponse!"
|
||||
)
|
||||
elif isinstance(packet, RefinedAnswerImprovement):
|
||||
response.agent_refined_answer_improvement = (
|
||||
packet.refined_answer_improvement
|
||||
)
|
||||
else:
|
||||
logger.warning(
|
||||
f"_convert_packet_stream_to_response - Unrecognized chat packet: type={type(packet)}"
|
||||
)
|
||||
|
||||
response.final_context_doc_indices = _get_final_context_doc_indices(
|
||||
final_context_docs, response.top_documents
|
||||
)
|
||||
|
||||
# organize / sort agent metadata for output
|
||||
if len(agent_sub_questions) > 0:
|
||||
response.agent_sub_questions = cast(
|
||||
dict[int, list[AgentSubQuestion]],
|
||||
SubQuestionIdentifier.make_dict_by_level(agent_sub_questions),
|
||||
)
|
||||
|
||||
if len(agent_answers) > 0:
|
||||
# return the agent_level_answer from the first level or the last one depending
|
||||
# on agent_refined_answer_improvement
|
||||
response.agent_answers = cast(
|
||||
dict[int, list[AgentAnswer]],
|
||||
SubQuestionIdentifier.make_dict_by_level(agent_answers),
|
||||
)
|
||||
if response.agent_answers:
|
||||
selected_answer_level = (
|
||||
0
|
||||
if not response.agent_refined_answer_improvement
|
||||
else len(response.agent_answers) - 1
|
||||
)
|
||||
level_answers = response.agent_answers[selected_answer_level]
|
||||
for level_answer in level_answers:
|
||||
if level_answer.answer_type != "agent_level_answer":
|
||||
continue
|
||||
|
||||
answer = level_answer.answer
|
||||
break
|
||||
|
||||
if len(agent_sub_queries) > 0:
|
||||
# subqueries are often emitted with trailing whitespace ... clean it up here
|
||||
# perhaps fix at the source?
|
||||
for v in agent_sub_queries.values():
|
||||
v.sub_query = v.sub_query.strip()
|
||||
|
||||
response.agent_sub_queries = (
|
||||
AgentSubQuery.make_dict_by_level_and_question_index(agent_sub_queries)
|
||||
)
|
||||
|
||||
response.answer = answer
|
||||
if answer:
|
||||
response.answer_citationless = remove_answer_citations(answer)
|
||||
|
||||
return response
|
||||
|
||||
|
||||
def remove_answer_citations(answer: str) -> str:
|
||||
pattern = r"\s*\[\[\d+\]\]\(http[s]?://[^\s]+\)"
|
||||
|
||||
return re.sub(pattern, "", answer)
|
||||
|
||||
|
||||
@router.post("/send-message-simple-api")
|
||||
def handle_simplified_chat_message(
|
||||
chat_message_req: BasicCreateChatMessageRequest,
|
||||
@@ -237,13 +45,36 @@ def handle_simplified_chat_message(
|
||||
if not chat_message_req.message:
|
||||
raise HTTPException(status_code=400, detail="Empty chat message is invalid")
|
||||
|
||||
# Handle chat session creation if chat_session_id is not provided
|
||||
if chat_message_req.chat_session_id is None:
|
||||
if chat_message_req.persona_id is None:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="Either chat_session_id or persona_id must be provided",
|
||||
)
|
||||
|
||||
# Create a new chat session with the provided persona_id
|
||||
try:
|
||||
new_chat_session = create_chat_session(
|
||||
db_session=db_session,
|
||||
description="", # Leave empty for simple API
|
||||
user_id=user.id if user else None,
|
||||
persona_id=chat_message_req.persona_id,
|
||||
)
|
||||
chat_session_id = new_chat_session.id
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
raise HTTPException(status_code=400, detail="Invalid Persona provided.")
|
||||
else:
|
||||
chat_session_id = chat_message_req.chat_session_id
|
||||
|
||||
try:
|
||||
parent_message, _ = create_chat_chain(
|
||||
chat_session_id=chat_message_req.chat_session_id, db_session=db_session
|
||||
chat_session_id=chat_session_id, db_session=db_session
|
||||
)
|
||||
except Exception:
|
||||
parent_message = get_or_create_root_message(
|
||||
chat_session_id=chat_message_req.chat_session_id, db_session=db_session
|
||||
chat_session_id=chat_session_id, db_session=db_session
|
||||
)
|
||||
|
||||
if (
|
||||
@@ -258,7 +89,7 @@ def handle_simplified_chat_message(
|
||||
retrieval_options = chat_message_req.retrieval_options
|
||||
|
||||
full_chat_msg_info = CreateChatMessageRequest(
|
||||
chat_session_id=chat_message_req.chat_session_id,
|
||||
chat_session_id=chat_session_id,
|
||||
parent_message_id=parent_message.id,
|
||||
message=chat_message_req.message,
|
||||
file_descriptors=[],
|
||||
@@ -283,7 +114,7 @@ def handle_simplified_chat_message(
|
||||
enforce_chat_session_id_for_search_docs=False,
|
||||
)
|
||||
|
||||
return _convert_packet_stream_to_response(packets)
|
||||
return gather_stream(packets)
|
||||
|
||||
|
||||
@router.post("/send-message-simple-with-history")
|
||||
@@ -403,4 +234,4 @@ def handle_send_message_simple_with_history(
|
||||
enforce_chat_session_id_for_search_docs=False,
|
||||
)
|
||||
|
||||
return _convert_packet_stream_to_response(packets)
|
||||
return gather_stream(packets)
|
||||
|
||||
@@ -6,10 +6,8 @@ from pydantic import BaseModel
|
||||
from pydantic import Field
|
||||
from pydantic import model_validator
|
||||
|
||||
from onyx.chat.models import CitationInfo
|
||||
from onyx.chat.models import PersonaOverrideConfig
|
||||
from onyx.chat.models import QADocsResponse
|
||||
from onyx.chat.models import SubQuestionIdentifier
|
||||
from onyx.chat.models import ThreadMessage
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.context.search.enums import LLMEvaluationType
|
||||
@@ -17,8 +15,9 @@ from onyx.context.search.enums import SearchType
|
||||
from onyx.context.search.models import ChunkContext
|
||||
from onyx.context.search.models import RerankingDetails
|
||||
from onyx.context.search.models import RetrievalDetails
|
||||
from onyx.context.search.models import SavedSearchDoc
|
||||
from onyx.server.manage.models import StandardAnswer
|
||||
from onyx.server.query_and_chat.streaming_models import CitationInfo
|
||||
from onyx.server.query_and_chat.streaming_models import SubQuestionIdentifier
|
||||
|
||||
|
||||
class StandardAnswerRequest(BaseModel):
|
||||
@@ -41,11 +40,13 @@ class DocumentSearchRequest(ChunkContext):
|
||||
|
||||
|
||||
class BasicCreateChatMessageRequest(ChunkContext):
|
||||
"""Before creating messages, be sure to create a chat_session and get an id
|
||||
"""If a chat_session_id is not provided, a persona_id must be provided to automatically create a new chat session
|
||||
Note, for simplicity this option only allows for a single linear chain of messages
|
||||
"""
|
||||
|
||||
chat_session_id: UUID
|
||||
chat_session_id: UUID | None = None
|
||||
# Optional persona_id to create a new chat session if chat_session_id is not provided
|
||||
persona_id: int | None = None
|
||||
# New message contents
|
||||
message: str
|
||||
# Defaults to using retrieval with no additional filters
|
||||
@@ -62,6 +63,12 @@ class BasicCreateChatMessageRequest(ChunkContext):
|
||||
# If True, uses agentic search instead of basic search
|
||||
use_agentic_search: bool = False
|
||||
|
||||
@model_validator(mode="after")
|
||||
def validate_chat_session_or_persona(self) -> "BasicCreateChatMessageRequest":
|
||||
if self.chat_session_id is None and self.persona_id is None:
|
||||
raise ValueError("Either chat_session_id or persona_id must be provided")
|
||||
return self
|
||||
|
||||
|
||||
class BasicCreateChatMessageWithHistoryRequest(ChunkContext):
|
||||
# Last element is the new query. All previous elements are historical context
|
||||
@@ -148,30 +155,6 @@ class AgentSubQuery(SubQuestionIdentifier):
|
||||
return sorted_dict
|
||||
|
||||
|
||||
class ChatBasicResponse(BaseModel):
|
||||
# This is built piece by piece, any of these can be None as the flow could break
|
||||
answer: str | None = None
|
||||
answer_citationless: str | None = None
|
||||
|
||||
top_documents: list[SavedSearchDoc] | None = None
|
||||
|
||||
error_msg: str | None = None
|
||||
message_id: int | None = None
|
||||
llm_selected_doc_indices: list[int] | None = None
|
||||
final_context_doc_indices: list[int] | None = None
|
||||
# this is a map of the citation number to the document id
|
||||
cited_documents: dict[int, str] | None = None
|
||||
|
||||
# FOR BACKWARDS COMPATIBILITY
|
||||
llm_chunks_indices: list[int] | None = None
|
||||
|
||||
# agentic fields
|
||||
agent_sub_questions: dict[int, list[AgentSubQuestion]] | None = None
|
||||
agent_answers: dict[int, list[AgentAnswer]] | None = None
|
||||
agent_sub_queries: dict[int, dict[int, list[AgentSubQuery]]] | None = None
|
||||
agent_refined_answer_improvement: bool | None = None
|
||||
|
||||
|
||||
class OneShotQARequest(ChunkContext):
|
||||
# Supports simplier APIs that don't deal with chat histories or message edits
|
||||
# Easier APIs to work with for developers
|
||||
@@ -182,7 +165,6 @@ class OneShotQARequest(ChunkContext):
|
||||
prompt_id: int | None = None
|
||||
retrieval_options: RetrievalDetails = Field(default_factory=RetrievalDetails)
|
||||
rerank_settings: RerankingDetails | None = None
|
||||
return_contexts: bool = False
|
||||
|
||||
# allows the caller to specify the exact search query they want to use
|
||||
# can be used if the message sent to the LLM / query should not be the same
|
||||
@@ -214,6 +196,5 @@ class OneShotQAResponse(BaseModel):
|
||||
rephrase: str | None = None
|
||||
citations: list[CitationInfo] | None = None
|
||||
docs: QADocsResponse | None = None
|
||||
llm_selected_doc_indices: list[int] | None = None
|
||||
error_msg: str | None = None
|
||||
chat_message_id: int | None = None
|
||||
|
||||
@@ -8,7 +8,6 @@ from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from ee.onyx.chat.process_message import gather_stream_for_answer_api
|
||||
from ee.onyx.onyxbot.slack.handlers.handle_standard_answers import (
|
||||
oneoff_standard_answers,
|
||||
)
|
||||
@@ -20,8 +19,10 @@ from ee.onyx.server.query_and_chat.models import StandardAnswerResponse
|
||||
from onyx.auth.users import current_user
|
||||
from onyx.chat.chat_utils import combine_message_thread
|
||||
from onyx.chat.chat_utils import prepare_chat_message_request
|
||||
from onyx.chat.models import AnswerStream
|
||||
from onyx.chat.models import PersonaOverrideConfig
|
||||
from onyx.chat.process_message import ChatPacketStream
|
||||
from onyx.chat.models import QADocsResponse
|
||||
from onyx.chat.process_message import gather_stream
|
||||
from onyx.chat.process_message import stream_chat_message_objects
|
||||
from onyx.configs.onyxbot_configs import MAX_THREAD_CONTEXT_PERCENTAGE
|
||||
from onyx.context.search.models import SavedSearchDocWithContent
|
||||
@@ -31,7 +32,7 @@ from onyx.context.search.utils import dedupe_documents
|
||||
from onyx.context.search.utils import drop_llm_indices
|
||||
from onyx.context.search.utils import relevant_sections_to_indices
|
||||
from onyx.db.chat import get_prompt_by_id
|
||||
from onyx.db.engine import get_session
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.models import Persona
|
||||
from onyx.db.models import User
|
||||
from onyx.db.persona import get_persona_by_id
|
||||
@@ -39,6 +40,7 @@ from onyx.llm.factory import get_default_llms
|
||||
from onyx.llm.factory import get_llms_for_persona
|
||||
from onyx.llm.factory import get_main_llm_from_tuple
|
||||
from onyx.natural_language_processing.utils import get_tokenizer
|
||||
from onyx.server.query_and_chat.streaming_models import CitationInfo
|
||||
from onyx.server.utils import get_json_line
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
@@ -140,7 +142,7 @@ def get_answer_stream(
|
||||
query_request: OneShotQARequest,
|
||||
user: User | None = Depends(current_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> ChatPacketStream:
|
||||
) -> AnswerStream:
|
||||
query = query_request.messages[0].message
|
||||
logger.notice(f"Received query for Answer API: {query}")
|
||||
|
||||
@@ -205,7 +207,6 @@ def get_answer_stream(
|
||||
new_msg_req=request,
|
||||
user=user,
|
||||
db_session=db_session,
|
||||
include_contexts=query_request.return_contexts,
|
||||
)
|
||||
|
||||
return packets
|
||||
@@ -219,12 +220,28 @@ def get_answer_with_citation(
|
||||
) -> OneShotQAResponse:
|
||||
try:
|
||||
packets = get_answer_stream(request, user, db_session)
|
||||
answer = gather_stream_for_answer_api(packets)
|
||||
answer = gather_stream(packets)
|
||||
|
||||
if answer.error_msg:
|
||||
raise RuntimeError(answer.error_msg)
|
||||
|
||||
return answer
|
||||
return OneShotQAResponse(
|
||||
answer=answer.answer,
|
||||
chat_message_id=answer.message_id,
|
||||
error_msg=answer.error_msg,
|
||||
citations=[
|
||||
CitationInfo(citation_num=i, document_id=doc_id)
|
||||
for i, doc_id in answer.cited_documents.items()
|
||||
],
|
||||
docs=QADocsResponse(
|
||||
top_documents=answer.top_documents,
|
||||
predicted_flow=None,
|
||||
predicted_search=None,
|
||||
applied_source_filters=None,
|
||||
applied_time_cutoff=None,
|
||||
recency_bias_multiplier=0.0,
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in get_answer_with_citation: {str(e)}", exc_info=True)
|
||||
raise HTTPException(status_code=500, detail="An internal server error occurred")
|
||||
|
||||
@@ -13,7 +13,7 @@ from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.db.api_key import is_api_key_email_address
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.db.models import ChatMessage
|
||||
from onyx.db.models import ChatSession
|
||||
from onyx.db.models import TokenRateLimit
|
||||
|
||||
@@ -37,11 +37,11 @@ from onyx.configs.constants import QueryHistoryType
|
||||
from onyx.configs.constants import SessionType
|
||||
from onyx.db.chat import get_chat_session_by_id
|
||||
from onyx.db.chat import get_chat_sessions_by_user
|
||||
from onyx.db.engine import get_session
|
||||
from onyx.db.engine.sql_engine import get_session
|
||||
from onyx.db.enums import TaskStatus
|
||||
from onyx.db.file_record import get_query_history_export_files
|
||||
from onyx.db.models import ChatSession
|
||||
from onyx.db.models import User
|
||||
from onyx.db.pg_file_store import get_query_history_export_files
|
||||
from onyx.db.tasks import get_task_with_id
|
||||
from onyx.db.tasks import register_task
|
||||
from onyx.file_store.file_store import get_default_file_store
|
||||
@@ -49,6 +49,7 @@ from onyx.server.documents.models import PaginatedReturn
|
||||
from onyx.server.query_and_chat.models import ChatSessionDetails
|
||||
from onyx.server.query_and_chat.models import ChatSessionsResponse
|
||||
from onyx.utils.threadpool_concurrency import parallel_yield
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@@ -181,7 +182,6 @@ def admin_get_chat_sessions(
|
||||
time_created=chat.time_created.isoformat(),
|
||||
time_updated=chat.time_updated.isoformat(),
|
||||
shared_status=chat.shared_status,
|
||||
folder_id=chat.folder_id,
|
||||
current_alternate_model=chat.current_alternate_model,
|
||||
)
|
||||
for chat in chat_sessions
|
||||
@@ -334,6 +334,7 @@ def start_query_history_export(
|
||||
"start": start,
|
||||
"end": end,
|
||||
"start_time": start_time,
|
||||
"tenant_id": get_current_tenant_id(),
|
||||
},
|
||||
)
|
||||
|
||||
@@ -356,11 +357,11 @@ def get_query_history_export_status(
|
||||
# If task is None, then it's possible that the task has already finished processing.
|
||||
# Therefore, we should then check if the export file has already been stored inside of the file-store.
|
||||
# If that *also* doesn't exist, then we can return a 404.
|
||||
file_store = get_default_file_store(db_session)
|
||||
file_store = get_default_file_store()
|
||||
|
||||
report_name = construct_query_history_report_name(request_id)
|
||||
has_file = file_store.has_file(
|
||||
file_name=report_name,
|
||||
file_id=report_name,
|
||||
file_origin=FileOrigin.QUERY_HISTORY_CSV,
|
||||
file_type=FileType.CSV,
|
||||
)
|
||||
@@ -383,9 +384,9 @@ def download_query_history_csv(
|
||||
ensure_query_history_is_enabled(disallowed=[QueryHistoryType.DISABLED])
|
||||
|
||||
report_name = construct_query_history_report_name(request_id)
|
||||
file_store = get_default_file_store(db_session)
|
||||
file_store = get_default_file_store()
|
||||
has_file = file_store.has_file(
|
||||
file_name=report_name,
|
||||
file_id=report_name,
|
||||
file_origin=FileOrigin.QUERY_HISTORY_CSV,
|
||||
file_type=FileType.CSV,
|
||||
)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user