Mirror of https://github.com/onyx-dot-app/onyx.git — synced 2026-02-20 17:25:44 +00:00

Compare commits: craft_chan ... litellm_co (147 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 5115b621c8 |  |
|  | a924b49405 |  |
|  | 2d2d998811 |  |
|  | 0925b5fbd4 |  |
|  | a02d8414ee |  |
|  | c8abc4a115 |  |
|  | cec37bff6a |  |
|  | 06d5d3971b |  |
|  | ed287a2fc0 |  |
|  | 60857d1e73 |  |
|  | bb5c22104e |  |
|  | 03d919c918 |  |
|  | 71d2ae563a |  |
|  | 19f9c7357c |  |
|  | f8fa5b243c |  |
|  | 5f845c208f |  |
|  | d8595f8de0 |  |
|  | 5b00d1ef9c |  |
|  | 41b6ed92a9 |  |
|  | 07f35336ad |  |
|  | 4728bb87c7 |  |
|  | adfa2f30af |  |
|  | 9dac4165fb |  |
|  | 7d2ede5efc |  |
|  | 4592f6885f |  |
|  | 9dc14fad79 |  |
|  | ff6e471cfb |  |
|  | 09b9443405 |  |
|  | 14cd6d08e8 |  |
|  | 5ee16697ce |  |
|  | b794f7e10d |  |
|  | bb3275bb75 |  |
|  | 7644e225a5 |  |
|  | 811600b84a |  |
|  | 40ce8615ff |  |
|  | 0cee3f6960 |  |
|  | 8883e5608f |  |
|  | 7c2f3ded44 |  |
|  | aa094ce1f0 |  |
|  | 4b0c800db7 |  |
|  | 8386742c10 |  |
|  | f2e5e4f040 |  |
|  | c0498cf2fc |  |
|  | 954ee1706a |  |
|  | 0745765a56 |  |
|  | 10feb6ae77 |  |
|  | f5b170af1e |  |
|  | 2d2f252e95 |  |
|  | a05f304960 |  |
|  | 7ce5120302 |  |
|  | 2d8f864251 |  |
|  | 3b48c2104b |  |
|  | a9ec6a2434 |  |
|  | e85575c6cc |  |
|  | c966c81e8a |  |
|  | a0d6ebe66d |  |
|  | d75b501a1f |  |
|  | 89dd44bee8 |  |
|  | c5451ffe53 |  |
|  | 85da1d85ce |  |
|  | 00d90c5e27 |  |
|  | ea7654e4b8 |  |
|  | eb90775e42 |  |
|  | 75865fcdfd |  |
|  | d50dc8fa68 |  |
|  | 39b96973ec |  |
|  | a342c4d848 |  |
|  | 7c084a35b6 |  |
|  | 946eba5ba5 |  |
|  | ec4f85f4a4 |  |
|  | d8fd6d398e |  |
|  | ef85a14b6e |  |
|  | 97b44b530e |  |
|  | e05a34cad3 |  |
|  | d80a4270cb |  |
|  | a26b4ff888 |  |
|  | 185d2bb813 |  |
|  | d5b64e8472 |  |
|  | 378a216af3 |  |
|  | 2c002c48f7 |  |
|  | 9c20549e58 |  |
|  | ffd30ae72a |  |
|  | e18496dfa7 |  |
|  | 560a78a5d0 |  |
|  | 10bc398746 |  |
|  | 9356f79461 |  |
|  | e246b53108 |  |
|  | 26533d58e2 |  |
|  | a32f27f4c8 |  |
|  | 413a96f138 |  |
|  | 73a6721886 |  |
|  | 01872a7196 |  |
|  | 0ba1f715f2 |  |
|  | 94d0dc0ffe |  |
|  | 039daa0027 |  |
|  | 62b1c55494 |  |
|  | 1800d4b9d7 |  |
|  | 5ed2d78471 |  |
|  | ff28dc9c72 |  |
|  | e88a7ac868 |  |
|  | 79c1bbe666 |  |
|  | b1168d4526 |  |
|  | 21751b2cf2 |  |
|  | cb33263ef0 |  |
|  | 9f9a68f2eb |  |
|  | 9c09c07980 |  |
|  | 9aaac7f1ad |  |
|  | 8b2071a3ae |  |
|  | 733d55c948 |  |
|  | 1498238c43 |  |
|  | f0657dc1a3 |  |
|  | 96e71c496b |  |
|  | db4e1dc1a3 |  |
|  | bce5f0889f |  |
|  | fa2f4e781a |  |
|  | abdb683584 |  |
|  | b7b4737b05 |  |
|  | 3f9b143429 |  |
|  | dbf08a3483 |  |
|  | 43e2e7c69c |  |
|  | 1da20bc240 |  |
|  | 58b376d7b7 |  |
|  | 23e47a48e1 |  |
|  | cda5b00174 |  |
|  | 6f4ababb11 |  |
|  | e90656efbe |  |
|  | b3803808e0 |  |
|  | f5415bace6 |  |
|  | b255297365 |  |
|  | 5463d6aadc |  |
|  | b547d487c1 |  |
|  | 18821b612b |  |
|  | 2368cef307 |  |
|  | 668cc71be4 |  |
|  | 09f3ad8985 |  |
|  | 38e88c7b5c |  |
|  | cc7bfdbcde |  |
|  | 0e3c511974 |  |
|  | 9606461ba0 |  |
|  | d01fcbbf7a |  |
|  | 325a38e502 |  |
|  | 3916556397 |  |
|  | a7edcd6880 |  |
|  | f18f0ffd96 |  |
|  | 06c060bb1f |  |
|  | 94ebe9e221 |  |
|  | 99c9c378cd |  |
.github/workflows/CODEOWNERS — see below; .github/CODEOWNERS (vendored) — 4 changed lines

@@ -6,5 +6,5 @@
  /web/STANDARDS.md @raunakab @Weves

  # Agent context files
- /CLAUDE.md.template @Weves
- /AGENTS.md.template @Weves
+ /CLAUDE.md @Weves
+ /AGENTS.md @Weves
.github/workflows/deployment.yml (vendored) — 64 changed lines

@@ -82,7 +82,7 @@ jobs:
  if [[ "$TAG" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
    IS_STABLE=true
  fi
- if [[ "$TAG" =~ ^v[0-9]+\.[0-9]+\.[0-9]+-beta\.[0-9]+$ ]]; then
+ if [[ "$TAG" =~ ^v[0-9]+\.[0-9]+\.[0-9]+-beta(\.[0-9]+)?$ ]]; then
    IS_BETA=true
  fi
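The hunk above loosens the beta-tag check so that a bare `-beta` suffix (with no trailing `.N` counter) is now also treated as a beta release. As an illustration only (not part of the workflow; the sample tags are made up), the old and new patterns can be compared directly:

```python
import re

# Old and new patterns from the workflow hunk above (anchored, as in bash's =~ with ^...$).
OLD = re.compile(r"^v[0-9]+\.[0-9]+\.[0-9]+-beta\.[0-9]+$")
NEW = re.compile(r"^v[0-9]+\.[0-9]+\.[0-9]+-beta(\.[0-9]+)?$")

for tag in ["v1.2.3-beta.4", "v1.2.3-beta", "v1.2.3"]:  # hypothetical tags
    print(tag, bool(OLD.match(tag)), bool(NEW.match(tag)))
# v1.2.3-beta.4 matches both; v1.2.3-beta matches only the new pattern; v1.2.3 matches neither.
```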
@@ -174,23 +174,10 @@ jobs:
  with:
    persist-credentials: false

- - name: Configure AWS credentials
-   uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
-   with:
-     role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
-     aws-region: us-east-2
-
- - name: Get AWS Secrets
-   uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
-   with:
-     secret-ids: |
-       MONITOR_DEPLOYMENTS_WEBHOOK, deploy/monitor-deployments-webhook
-     parse-json-secrets: true
-
  - name: Send Slack notification
    uses: ./.github/actions/slack-notify
    with:
-     webhook-url: ${{ env.MONITOR_DEPLOYMENTS_WEBHOOK }}
+     webhook-url: ${{ secrets.MONITOR_DEPLOYMENTS_WEBHOOK }}
      failed-jobs: "• check-version-tag"
      title: "🚨 Version Tag Check Failed"
      ref-name: ${{ github.ref_name }}

@@ -262,7 +249,7 @@ jobs:
      xdg-utils

  - name: setup node
-   uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # ratchet:actions/setup-node@v6.1.0
+   uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238 # ratchet:actions/setup-node@v6.2.0
    with:
      node-version: 24
      package-manager-cache: false

@@ -422,7 +409,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -495,7 +482,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -555,7 +542,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -633,7 +620,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -714,7 +701,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -782,7 +769,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -857,7 +844,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -929,7 +916,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -988,7 +975,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -1066,7 +1053,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -1139,7 +1126,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -1200,7 +1187,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -1280,7 +1267,7 @@ jobs:
      buildkitd-flags: ${{ vars.DOCKER_DEBUG == 'true' && '--debug' || '' }}

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -1359,7 +1346,7 @@ jobs:
      buildkitd-flags: ${{ vars.DOCKER_DEBUG == 'true' && '--debug' || '' }}

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -1422,7 +1409,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ env.DOCKER_USERNAME }}
      password: ${{ env.DOCKER_TOKEN }}

@@ -1709,19 +1696,6 @@ jobs:
  with:
    persist-credentials: false

- - name: Configure AWS credentials
-   uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
-   with:
-     role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
-     aws-region: us-east-2
-
- - name: Get AWS Secrets
-   uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
-   with:
-     secret-ids: |
-       MONITOR_DEPLOYMENTS_WEBHOOK, deploy/monitor-deployments-webhook
-     parse-json-secrets: true
-
  - name: Determine failed jobs
    id: failed-jobs
    shell: bash

@@ -1787,7 +1761,7 @@ jobs:
  - name: Send Slack notification
    uses: ./.github/actions/slack-notify
    with:
-     webhook-url: ${{ env.MONITOR_DEPLOYMENTS_WEBHOOK }}
+     webhook-url: ${{ secrets.MONITOR_DEPLOYMENTS_WEBHOOK }}
      failed-jobs: ${{ steps.failed-jobs.outputs.jobs }}
      title: "🚨 Deployment Workflow Failed"
      ref-name: ${{ github.ref_name }}
.github/workflows/docker-tag-beta.yml (vendored) — 2 changed lines

@@ -24,7 +24,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}
.github/workflows/docker-tag-latest.yml (vendored) — 2 changed lines

@@ -24,7 +24,7 @@ jobs:
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}
.github/workflows/nightly-scan-licenses.yml (vendored) — 4 changed lines

@@ -33,7 +33,7 @@ jobs:
    persist-credentials: false

  - name: Set up Python
-   uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # ratchet:actions/setup-python@v6
+   uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # ratchet:actions/setup-python@v6
    with:
      python-version: '3.11'
      cache: 'pip'

@@ -97,7 +97,7 @@
  uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}
.github/workflows/pr-database-tests.yml (vendored) — 5 changed lines

@@ -40,13 +40,16 @@ jobs:

  - name: Generate OpenAPI schema and Python client
    shell: bash
+   # TODO(Nik): https://linear.app/onyx-app/issue/ENG-1/update-test-infra-to-use-test-license
+   env:
+     LICENSE_ENFORCEMENT_ENABLED: "false"
    run: |
      ods openapi all

  # needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
  # https://docs.docker.com/docker-hub/usage/
  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}
.github/workflows/pr-desktop-build.yml (vendored) — 8 changed lines

@@ -45,12 +45,12 @@ jobs:

  steps:
  - name: Checkout code
-   uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8
+   uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
    with:
      persist-credentials: false

  - name: Setup node
-   uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020
+   uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238
    with:
      node-version: 24
      cache: "npm" # zizmor: ignore[cache-poisoning]

@@ -63,7 +63,7 @@ jobs:
      targets: ${{ matrix.target }}

  - name: Cache Cargo registry and build
-   uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # zizmor: ignore[cache-poisoning]
+   uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # zizmor: ignore[cache-poisoning]
    with:
      path: |
        ~/.cargo/bin/

@@ -105,7 +105,7 @@ jobs:

  - name: Upload build artifacts
    if: always()
-   uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02
+   uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
    with:
      name: desktop-build-${{ matrix.platform }}-${{ github.run_id }}
      path: |

@@ -110,7 +110,7 @@
  # otherwise, we hit the "Unauthenticated users" limit
  # https://docs.docker.com/docker-hub/usage/
  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}

@@ -118,6 +118,7 @@
  - name: Create .env file for Docker Compose
    run: |
      cat <<EOF > deployment/docker_compose/.env
      COMPOSE_PROFILES=s3-filestore
+     CODE_INTERPRETER_BETA_ENABLED=true
      DISABLE_TELEMETRY=true
      EOF
.github/workflows/pr-integration-tests.yml (vendored) — 14 changed lines

@@ -109,7 +109,7 @@
  # otherwise, we hit the "Unauthenticated users" limit
  # https://docs.docker.com/docker-hub/usage/
  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}

@@ -169,7 +169,7 @@
  # otherwise, we hit the "Unauthenticated users" limit
  # https://docs.docker.com/docker-hub/usage/
  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}

@@ -214,7 +214,7 @@
  # otherwise, we hit the "Unauthenticated users" limit
  # https://docs.docker.com/docker-hub/usage/
  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}

@@ -287,7 +287,7 @@
  # otherwise, we hit the "Unauthenticated users" limit
  # https://docs.docker.com/docker-hub/usage/
  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}

@@ -300,7 +300,10 @@
    RUN_ID: ${{ github.run_id }}
  run: |
    cat <<EOF > deployment/docker_compose/.env
    COMPOSE_PROFILES=s3-filestore
+   ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true
+   # TODO(Nik): https://linear.app/onyx-app/issue/ENG-1/update-test-infra-to-use-test-license
+   LICENSE_ENFORCEMENT_ENABLED=false
    AUTH_TYPE=basic
    POSTGRES_POOL_PRE_PING=true
    POSTGRES_USE_NULL_POOL=true

@@ -465,7 +468,7 @@
    persist-credentials: false

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}

@@ -477,6 +480,7 @@
  run: |
    cd deployment/docker_compose
    ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \
+   LICENSE_ENFORCEMENT_ENABLED=false \
    MULTI_TENANT=true \
    AUTH_TYPE=cloud \
    REQUIRE_EMAIL_VERIFICATION=false \
.github/workflows/pr-jest-tests.yml (vendored) — 2 changed lines

@@ -28,7 +28,7 @@
    persist-credentials: false

  - name: Setup node
-   uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # ratchet:actions/setup-node@v4
+   uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238 # ratchet:actions/setup-node@v4
    with:
      node-version: 22
      cache: "npm"

@@ -101,7 +101,7 @@
  # otherwise, we hit the "Unauthenticated users" limit
  # https://docs.docker.com/docker-hub/usage/
  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}

@@ -161,7 +161,7 @@
  # otherwise, we hit the "Unauthenticated users" limit
  # https://docs.docker.com/docker-hub/usage/
  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}

@@ -220,7 +220,7 @@
  # otherwise, we hit the "Unauthenticated users" limit
  # https://docs.docker.com/docker-hub/usage/
  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}

@@ -279,7 +279,7 @@
  # otherwise, we hit the "Unauthenticated users" limit
  # https://docs.docker.com/docker-hub/usage/
  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}

@@ -292,6 +292,7 @@
    RUN_ID: ${{ github.run_id }}
  run: |
    cat <<EOF > deployment/docker_compose/.env
+   COMPOSE_PROFILES=s3-filestore
    AUTH_TYPE=basic
    POSTGRES_POOL_PRE_PING=true
    POSTGRES_USE_NULL_POOL=true
.github/workflows/pr-playwright-tests.yml (vendored) — 23 changed lines

@@ -90,7 +90,7 @@
  # needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
  # https://docs.docker.com/docker-hub/usage/
  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}

@@ -151,7 +151,7 @@
  # needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
  # https://docs.docker.com/docker-hub/usage/
  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}

@@ -212,7 +212,7 @@
  # needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
  # https://docs.docker.com/docker-hub/usage/
  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}

@@ -249,7 +249,7 @@
  strategy:
    fail-fast: false
    matrix:
-     project: [admin, no-auth, exclusive]
+     project: [admin, exclusive]
  steps:
  - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2

@@ -259,7 +259,7 @@
    persist-credentials: false

  - name: Setup node
-   uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # ratchet:actions/setup-node@v4
+   uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238 # ratchet:actions/setup-node@v4
    with:
      node-version: 22
      cache: "npm"

@@ -289,7 +289,10 @@
    RUN_ID: ${{ github.run_id }}
  run: |
    cat <<EOF > deployment/docker_compose/.env
    COMPOSE_PROFILES=s3-filestore
+   ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true
+   # TODO(Nik): https://linear.app/onyx-app/issue/ENG-1/update-test-infra-to-use-test-license
+   LICENSE_ENFORCEMENT_ENABLED=false
    AUTH_TYPE=basic
    GEN_AI_API_KEY=${OPENAI_API_KEY_VALUE}
    EXA_API_KEY=${EXA_API_KEY_VALUE}

@@ -299,15 +302,12 @@
    ONYX_MODEL_SERVER_IMAGE=${ECR_CACHE}:playwright-test-model-server-${RUN_ID}
    ONYX_WEB_SERVER_IMAGE=${ECR_CACHE}:playwright-test-web-${RUN_ID}
    EOF
-   if [ "${{ matrix.project }}" = "no-auth" ]; then
-     echo "PLAYWRIGHT_FORCE_EMPTY_LLM_PROVIDERS=true" >> deployment/docker_compose/.env
-   fi

  # needed for pulling Vespa, Redis, Postgres, and Minio images
  # otherwise, we hit the "Unauthenticated users" limit
  # https://docs.docker.com/docker-hub/usage/
  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}

@@ -430,9 +430,6 @@
  run: |
    # Create test-results directory to ensure it exists for artifact upload
    mkdir -p test-results
-   if [ "${PROJECT}" = "no-auth" ]; then
-     export PLAYWRIGHT_FORCE_EMPTY_LLM_PROVIDERS=true
-   fi
    npx playwright test --project ${PROJECT}

  - uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f

@@ -493,7 +490,7 @@
  #     fetch-depth: 0

  # - name: Setup node
- #   uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # ratchet:actions/setup-node@v4
+ #   uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238 # ratchet:actions/setup-node@v4
  #   with:
  #     node-version: 22
.github/workflows/pr-python-checks.yml (vendored) — 3 changed lines

@@ -42,6 +42,9 @@

  - name: Generate OpenAPI schema and Python client
    shell: bash
+   # TODO(Nik): https://linear.app/onyx-app/issue/ENG-1/update-test-infra-to-use-test-license
+   env:
+     LICENSE_ENFORCEMENT_ENABLED: "false"
    run: |
      ods openapi all
.github/workflows/pr-python-model-tests.yml (vendored) — 2 changed lines

@@ -64,7 +64,7 @@
    echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT

  - name: Login to Docker Hub
-   uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef
+   uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9
    with:
      username: ${{ secrets.DOCKER_USERNAME }}
      password: ${{ secrets.DOCKER_TOKEN }}
.github/workflows/pr-python-tests.yml (vendored) — 2 changed lines

@@ -27,6 +27,8 @@
    PYTHONPATH: ./backend
    REDIS_CLOUD_PYTEST_PASSWORD: ${{ secrets.REDIS_CLOUD_PYTEST_PASSWORD }}
    DISABLE_TELEMETRY: "true"
+   # TODO(Nik): https://linear.app/onyx-app/issue/ENG-1/update-test-infra-to-use-test-license
+   LICENSE_ENFORCEMENT_ENABLED: "false"

  steps:
  - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
.github/workflows/pr-quality-checks.yml (vendored) — 4 changed lines

@@ -24,13 +24,13 @@ jobs:
  with:
    fetch-depth: 0
    persist-credentials: false
- - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # ratchet:actions/setup-python@v6
+ - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # ratchet:actions/setup-python@v6
    with:
      python-version: "3.11"
  - name: Setup Terraform
    uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # ratchet:hashicorp/setup-terraform@v3
  - name: Setup node
-   uses: actions/setup-node@395ad3262231945c25e8478fd5baf05154b1d79f # ratchet:actions/setup-node@v6
+   uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238 # ratchet:actions/setup-node@v6
    with: # zizmor: ignore[cache-poisoning]
      node-version: 22
      cache: "npm"
.gitignore (vendored) — 4 changed lines

@@ -40,10 +40,6 @@ settings.json
  /backend/tests/regression/answer_quality/search_test_config.yaml
  *.egg-info

- # Claude
- AGENTS.md
- CLAUDE.md
-
  # Local .terraform directories
  **/.terraform/*
CLAUDE.md (several hunks below are whitespace-only changes, so some removed/added line pairs read identically)

@@ -1,26 +1,25 @@
- # CLAUDE.md
+ # PROJECT KNOWLEDGE BASE

- This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+ This file provides guidance to AI agents when working with code in this repository.

  ## KEY NOTES

  - If you run into any missing python dependency errors, try running your command with `source .venv/bin/activate` \
-   to assume the python venv.
+   to assume the python venv.
  - To make tests work, check the `.env` file at the root of the project to find an OpenAI key.
  - If using `playwright` to explore the frontend, you can usually log in with username `a@example.com` and password
-   `a`. The app can be accessed at `http://localhost:3000`.
+   `a`. The app can be accessed at `http://localhost:3000`.
  - You should assume that all Onyx services are running. To verify, you can check the `backend/log` directory to
-   make sure we see logs coming out from the relevant service.
+   make sure we see logs coming out from the relevant service.
  - To connect to the Postgres database, use: `docker exec -it onyx-relational_db-1 psql -U postgres -c "<SQL>"`
  - When making calls to the backend, always go through the frontend. E.g. make a call to `http://localhost:3000/api/persona` not `http://localhost:8080/api/persona`
  - Put ALL db operations under the `backend/onyx/db` / `backend/ee/onyx/db` directories. Don't run queries
-   outside of those directories.
+   outside of those directories.

  ## Project Overview

  **Onyx** (formerly Danswer) is an open-source Gen-AI and Enterprise Search platform that connects to company documents, apps, and people. It features a modular architecture with both Community Edition (MIT licensed) and Enterprise Edition offerings.

@@ -92,6 +91,7 @@ Onyx uses Celery for asynchronous task processing with multiple specialized work
  Onyx supports two deployment modes for background workers, controlled by the `USE_LIGHTWEIGHT_BACKGROUND_WORKER` environment variable:

  **Lightweight Mode** (default, `USE_LIGHTWEIGHT_BACKGROUND_WORKER=true`):
+
  - Runs a single consolidated `background` worker that handles all background tasks:
    - Light worker tasks (Vespa operations, permissions sync, deletion)
    - Document processing (indexing pipeline)

@@ -105,12 +105,14 @@ Onyx supports two deployment modes for background workers, controlled by the `US
  - Default concurrency: 20 threads (increased to handle combined workload)

  **Standard Mode** (`USE_LIGHTWEIGHT_BACKGROUND_WORKER=false`):
+
  - Runs separate specialized workers as documented above (light, docprocessing, docfetching, heavy, kg_processing, monitoring, user_file_processing)
  - Better isolation and scalability
  - Can scale individual workers independently based on workload
  - Suitable for production deployments with higher load

  The deployment mode affects:
+
  - **Backend**: Worker processes spawned by supervisord or dev scripts
  - **Helm**: Which Kubernetes deployments are created
  - **Dev Environment**: Which workers `dev_run_background_jobs.py` spawns
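For orientation, a minimal sketch of how a launcher could branch on the `USE_LIGHTWEIGHT_BACKGROUND_WORKER` variable described above. The worker names and the `true` default come from this document; the helper itself is hypothetical and is not Onyx's actual `dev_run_background_jobs.py`:

```python
import os

# Hypothetical helper mirroring the two documented modes; not the real launcher.
def workers_to_spawn() -> list[str]:
    lightweight = os.environ.get("USE_LIGHTWEIGHT_BACKGROUND_WORKER", "true").lower() == "true"
    if lightweight:
        return ["background"]  # single consolidated worker
    return [
        "light", "docprocessing", "docfetching", "heavy",
        "kg_processing", "monitoring", "user_file_processing",
    ]
```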
@@ -119,18 +121,18 @@ The deployment mode affects:
  - **Thread-based Workers**: All workers use thread pools (not processes) for stability
  - **Tenant Awareness**: Multi-tenant support with per-tenant task isolation. There is a
-   middleware layer that automatically finds the appropriate tenant ID when sending tasks
-   via Celery Beat.
+   middleware layer that automatically finds the appropriate tenant ID when sending tasks
+   via Celery Beat.
  - **Task Prioritization**: High, Medium, Low priority queues
  - **Monitoring**: Built-in heartbeat and liveness checking
  - **Failure Handling**: Automatic retry and failure recovery mechanisms
  - **Redis Coordination**: Inter-process communication via Redis
  - **PostgreSQL State**: Task state and metadata stored in PostgreSQL


  #### Important Notes

- **Defining Tasks**:
+ **Defining Tasks**:

  - Always use `@shared_task` rather than `@celery_app`
  - Put tasks under `background/celery/tasks/` or `ee/background/celery/tasks`

@@ -143,6 +145,7 @@ If you make any updates to a celery worker and you want to test these changes, y
  to ask me to restart the celery worker. There is no auto-restart on code-change mechanism.

  ### Code Quality
+
  ```bash
  # Install and run pre-commit hooks
  pre-commit install

@@ -154,6 +157,7 @@ NOTE: Always make sure everything is strictly typed (both in Python and Typescri
  ## Architecture Overview

  ### Technology Stack
+
  - **Backend**: Python 3.11, FastAPI, SQLAlchemy, Alembic, Celery
  - **Frontend**: Next.js 15+, React 18, TypeScript, Tailwind CSS
  - **Database**: PostgreSQL with Redis caching

@@ -435,6 +439,7 @@ function ContactForm() {
  **Reason:** Our custom color system uses CSS variables that automatically handle dark mode and maintain design consistency across the app. Standard Tailwind colors bypass this system.

  **Available color categories:**
+
  - **Text:** `text-01` through `text-05`, `text-inverted-XX`
  - **Backgrounds:** `background-neutral-XX`, `background-tint-XX` (and inverted variants)
  - **Borders:** `border-01` through `border-05`, `border-inverted-XX`

@@ -467,6 +472,7 @@ function ContactForm() {
  ## Database & Migrations

  ### Running Migrations
+
  ```bash
  # Standard migrations
  alembic upgrade head

@@ -476,6 +482,7 @@ alembic -n schema_private upgrade head
  ```

  ### Creating Migrations
+
  ```bash
  # Create migration
  alembic revision -m "description"

@@ -488,13 +495,14 @@ Write the migration manually and place it in the file that alembic creates when
  ## Testing Strategy

- First, you must activate the virtual environment with `source .venv/bin/activate`.
+ First, you must activate the virtual environment with `source .venv/bin/activate`.

  There are 4 main types of tests within Onyx:

  ### Unit Tests
+
  These should not assume any Onyx/external services are available to be called.
- Interactions with the outside world should be mocked using `unittest.mock`. Generally, only
+ Interactions with the outside world should be mocked using `unittest.mock`. Generally, only
  write these for complex, isolated modules e.g. `citation_processing.py`.

  To run them:

@@ -504,13 +512,14 @@ pytest -xv backend/tests/unit
  ```

  ### External Dependency Unit Tests
- These tests assume that all external dependencies of Onyx are available and callable (e.g. Postgres, Redis,
+
+ These tests assume that all external dependencies of Onyx are available and callable (e.g. Postgres, Redis,
  MinIO/S3, Vespa are running + OpenAI can be called + any request to the internet is fine + etc.).

  However, the actual Onyx containers are not running and with these tests we call the function to test directly.
- We can also mock components/calls at will.
+ We can also mock components/calls at will.

- The goal with these tests are to minimize mocking while giving some flexibility to mock things that are flakey,
+ The goal with these tests are to minimize mocking while giving some flexibility to mock things that are flakey,
  need strictly controlled behavior, or need to have their internal behavior validated (e.g. verify a function is called
  with certain args, something that would be impossible with proper integration tests).
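Since the test guidance above leans on `unittest.mock`, here is a generic sketch of the pattern. The module and function names are hypothetical and are not taken from the Onyx codebase:

```python
from unittest import mock

import my_module  # hypothetical module under test; Onyx's real modules (e.g. citation_processing.py) differ


def test_fetch_is_called_with_expected_args() -> None:
    # Patch the outside-world call and verify both the return value and the call arguments.
    with mock.patch.object(my_module, "fetch_remote", return_value={"ok": True}) as fake:
        result = my_module.process("doc-1")
        fake.assert_called_once_with("doc-1")
        assert result["ok"] is True
```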
@@ -523,15 +532,16 @@ python -m dotenv -f .vscode/.env run -- pytest backend/tests/external_dependency
  ```

  ### Integration Tests
- Standard integration tests. Every test in `backend/tests/integration` runs against a real Onyx deployment. We cannot
- mock anything in these tests. Prefer writing integration tests (or External Dependency Unit Tests if mocking/internal
+
+ Standard integration tests. Every test in `backend/tests/integration` runs against a real Onyx deployment. We cannot
+ mock anything in these tests. Prefer writing integration tests (or External Dependency Unit Tests if mocking/internal
  verification is necessary) over any other type of test.

  Tests are parallelized at a directory level.

- When writing integration tests, make sure to check the root `conftest.py` for useful fixtures + the `backend/tests/integration/common_utils` directory for utilities. Prefer (if one exists), calling the appropriate Manager
+ When writing integration tests, make sure to check the root `conftest.py` for useful fixtures + the `backend/tests/integration/common_utils` directory for utilities. Prefer (if one exists), calling the appropriate Manager
  class in the utils over directly calling the APIs with a library like `requests`. Prefer using fixtures rather than
- calling the utilities directly (e.g. do NOT create admin users with
+ calling the utilities directly (e.g. do NOT create admin users with
  `admin_user = UserManager.create(name="admin_user")`, instead use the `admin_user` fixture).

  A great example of this type of test is `backend/tests/integration/dev_apis/test_simple_chat_api.py`.

@@ -543,8 +553,9 @@ python -m dotenv -f .vscode/.env run -- pytest backend/tests/integration
  ```

  ### Playwright (E2E) Tests
- These tests are an even more complete version of the Integration Tests mentioned above. Has all services of Onyx
- running, *including* the Web Server.
+
+ These tests are an even more complete version of the Integration Tests mentioned above. Has all services of Onyx
+ running, _including_ the Web Server.

  Use these tests for anything that requires significant frontend <-> backend coordination.

@@ -556,13 +567,11 @@ npx playwright test <TEST_NAME>
  npx playwright test <TEST_NAME>
  ```
-
-
  ## Logs

  When (1) writing integration tests or (2) doing live tests (e.g. curl / playwright) you can get access
  to logs via the `backend/log/<service_name>_debug.log` file. All Onyx services (api_server, web_server, celery_X)
- will be tailing their logs to this file.
+ will be tailing their logs to this file.

  ## Security Considerations

@@ -581,6 +590,7 @@ will be tailing their logs to this file.
  - Custom prompts and agent actions

  ## Creating a Plan
+
  When creating a plan in the `plans` directory, make sure to include at least these elements:

  **Issues to Address**

@@ -593,10 +603,10 @@ Things you come across in your research that are important to the implementation
  How you are going to make the changes happen. High level approach.

  **Tests**
- What unit (use rarely), external dependency unit, integration, and playwright tests you plan to write to
+ What unit (use rarely), external dependency unit, integration, and playwright tests you plan to write to
  verify the correct behavior. Don't overtest. Usually, a given change only needs one type of test.

- Do NOT include these: *Timeline*, *Rollback plan*
+ Do NOT include these: _Timeline_, _Rollback plan_

  This is a minimal list - feel free to include more. Do NOT write code as part of your plan.
  Keep it high level. You can reference certain files or functions though.
AGENTS.md — deleted (599 lines removed). The removed file content follows.

@@ -1,599 +0,0 @@
# AGENTS.md

This file provides guidance to AI agents when working with code in this repository.

## KEY NOTES

- If you run into any missing python dependency errors, try running your command with `source .venv/bin/activate` \
  to assume the python venv.
- To make tests work, check the `.env` file at the root of the project to find an OpenAI key.
- If using `playwright` to explore the frontend, you can usually log in with username `a@example.com` and password
  `a`. The app can be accessed at `http://localhost:3000`.
- You should assume that all Onyx services are running. To verify, you can check the `backend/log` directory to
  make sure we see logs coming out from the relevant service.
- To connect to the Postgres database, use: `docker exec -it onyx-relational_db-1 psql -U postgres -c "<SQL>"`
- When making calls to the backend, always go through the frontend. E.g. make a call to `http://localhost:3000/api/persona` not `http://localhost:8080/api/persona`
- Put ALL db operations under the `backend/onyx/db` / `backend/ee/onyx/db` directories. Don't run queries
  outside of those directories.
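To illustrate the note above about always going through the frontend, a minimal sketch of calling the API via the web server's proxy rather than the backend port. The endpoint path and ports are the ones quoted in this file; authentication is omitted, so without a session cookie an error status is expected:

```python
import requests

# Per the note above: go through the frontend proxy on port 3000, not the backend on 8080.
resp = requests.get("http://localhost:3000/api/persona")
print(resp.status_code)  # likely 401/403 without a logged-in session cookie
```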
## Project Overview

**Onyx** (formerly Danswer) is an open-source Gen-AI and Enterprise Search platform that connects to company documents, apps, and people. It features a modular architecture with both Community Edition (MIT licensed) and Enterprise Edition offerings.

### Background Workers (Celery)

Onyx uses Celery for asynchronous task processing with multiple specialized workers:

#### Worker Types

1. **Primary Worker** (`celery_app.py`)
   - Coordinates core background tasks and system-wide operations
   - Handles connector management, document sync, pruning, and periodic checks
   - Runs with 4 threads concurrency
   - Tasks: connector deletion, vespa sync, pruning, LLM model updates, user file sync

2. **Docfetching Worker** (`docfetching`)
   - Fetches documents from external data sources (connectors)
   - Spawns docprocessing tasks for each document batch
   - Implements watchdog monitoring for stuck connectors
   - Configurable concurrency (default from env)

3. **Docprocessing Worker** (`docprocessing`)
   - Processes fetched documents through the indexing pipeline:
     - Upserts documents to PostgreSQL
     - Chunks documents and adds contextual information
     - Embeds chunks via model server
     - Writes chunks to Vespa vector database
     - Updates document metadata
   - Configurable concurrency (default from env)

4. **Light Worker** (`light`)
   - Handles lightweight, fast operations
   - Tasks: vespa operations, document permissions sync, external group sync
   - Higher concurrency for quick tasks

5. **Heavy Worker** (`heavy`)
   - Handles resource-intensive operations
   - Primary task: document pruning operations
   - Runs with 4 threads concurrency

6. **KG Processing Worker** (`kg_processing`)
   - Handles Knowledge Graph processing and clustering
   - Builds relationships between documents
   - Runs clustering algorithms
   - Configurable concurrency

7. **Monitoring Worker** (`monitoring`)
   - System health monitoring and metrics collection
   - Monitors Celery queues, process memory, and system status
   - Single thread (monitoring doesn't need parallelism)
   - Cloud-specific monitoring tasks

8. **User File Processing Worker** (`user_file_processing`)
   - Processes user-uploaded files
   - Handles user file indexing and project synchronization
   - Configurable concurrency

9. **Beat Worker** (`beat`)
   - Celery's scheduler for periodic tasks
   - Uses DynamicTenantScheduler for multi-tenant support
   - Schedules tasks like:
     - Indexing checks (every 15 seconds)
     - Connector deletion checks (every 20 seconds)
     - Vespa sync checks (every 20 seconds)
     - Pruning checks (every 20 seconds)
     - KG processing (every 60 seconds)
     - Monitoring tasks (every 5 minutes)
     - Cleanup tasks (hourly)
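The intervals listed above map naturally onto a Celery beat schedule. A plain-Celery sketch of what such a schedule could look like follows; Onyx actually uses its own `DynamicTenantScheduler`, and the task names below are placeholders, not real Onyx task names:

```python
from celery import Celery

app = Celery("example")

# Illustrative only: plain beat_schedule with the documented intervals; task names are placeholders.
app.conf.beat_schedule = {
    "check-indexing": {"task": "tasks.check_indexing", "schedule": 15.0},
    "check-connector-deletion": {"task": "tasks.check_connector_deletion", "schedule": 20.0},
    "check-vespa-sync": {"task": "tasks.check_vespa_sync", "schedule": 20.0},
    "check-pruning": {"task": "tasks.check_pruning", "schedule": 20.0},
    "kg-processing": {"task": "tasks.kg_processing", "schedule": 60.0},
    "monitoring": {"task": "tasks.monitoring", "schedule": 5 * 60.0},
    "cleanup": {"task": "tasks.cleanup", "schedule": 60 * 60.0},
}
```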
#### Worker Deployment Modes

Onyx supports two deployment modes for background workers, controlled by the `USE_LIGHTWEIGHT_BACKGROUND_WORKER` environment variable:

**Lightweight Mode** (default, `USE_LIGHTWEIGHT_BACKGROUND_WORKER=true`):
- Runs a single consolidated `background` worker that handles all background tasks:
  - Pruning operations (from `heavy` worker)
  - Knowledge graph processing (from `kg_processing` worker)
  - Monitoring tasks (from `monitoring` worker)
  - User file processing (from `user_file_processing` worker)
- Lower resource footprint (single worker process)
- Suitable for smaller deployments or development environments
- Default concurrency: 6 threads

**Standard Mode** (`USE_LIGHTWEIGHT_BACKGROUND_WORKER=false`):
- Runs separate specialized workers as documented above (heavy, kg_processing, monitoring, user_file_processing)
- Better isolation and scalability
- Can scale individual workers independently based on workload
- Suitable for production deployments with higher load

The deployment mode affects:
- **Backend**: Worker processes spawned by supervisord or dev scripts
- **Helm**: Which Kubernetes deployments are created
- **Dev Environment**: Which workers `dev_run_background_jobs.py` spawns

#### Key Features

- **Thread-based Workers**: All workers use thread pools (not processes) for stability
- **Tenant Awareness**: Multi-tenant support with per-tenant task isolation. There is a
  middleware layer that automatically finds the appropriate tenant ID when sending tasks
  via Celery Beat.
- **Task Prioritization**: High, Medium, Low priority queues
- **Monitoring**: Built-in heartbeat and liveness checking
- **Failure Handling**: Automatic retry and failure recovery mechanisms
- **Redis Coordination**: Inter-process communication via Redis
- **PostgreSQL State**: Task state and metadata stored in PostgreSQL

#### Important Notes

**Defining Tasks**:
- Always use `@shared_task` rather than `@celery_app`
- Put tasks under `background/celery/tasks/` or `ee/background/celery/tasks`
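A minimal sketch of the `@shared_task` convention described above. Only the decorator choice and the `background/celery/tasks/` placement come from this document; the file path, task name, and body are hypothetical:

```python
# e.g. background/celery/tasks/example_tasks.py (hypothetical file)
from celery import shared_task


@shared_task
def sync_example_document(document_id: str) -> None:
    # Placeholder body; real tasks live under background/celery/tasks/ or ee/background/celery/tasks.
    print(f"processing {document_id}")
```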
**Defining APIs**:
When creating new FastAPI APIs, do NOT use the `response_model` field. Instead, just type the
function.
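A sketch of the API convention above: use a return type annotation instead of `response_model`, and let FastAPI infer the response schema. The router, path, and model here are hypothetical:

```python
from fastapi import APIRouter
from pydantic import BaseModel

router = APIRouter()


class PersonaSnapshot(BaseModel):  # hypothetical model
    id: int
    name: str


# Preferred: no response_model kwarg; FastAPI derives the schema from the return annotation.
@router.get("/persona/{persona_id}")
def get_persona(persona_id: int) -> PersonaSnapshot:
    return PersonaSnapshot(id=persona_id, name="example")
```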
**Testing Updates**:
|
||||
If you make any updates to a celery worker and you want to test these changes, you will need
|
||||
to ask me to restart the celery worker. There is no auto-restart on code-change mechanism.
|
||||
|
||||
### Code Quality
|
||||
```bash
|
||||
# Install and run pre-commit hooks
|
||||
pre-commit install
|
||||
pre-commit run --all-files
|
||||
```
|
||||
|
||||
NOTE: Always make sure everything is strictly typed (both in Python and Typescript).
|
||||
|
||||
## Architecture Overview
|
||||
|
||||
### Technology Stack
|
||||
- **Backend**: Python 3.11, FastAPI, SQLAlchemy, Alembic, Celery
|
||||
- **Frontend**: Next.js 15+, React 18, TypeScript, Tailwind CSS
|
||||
- **Database**: PostgreSQL with Redis caching
|
||||
- **Search**: Vespa vector database
|
||||
- **Auth**: OAuth2, SAML, multi-provider support
|
||||
- **AI/ML**: LangChain, LiteLLM, multiple embedding models
|
||||
|
||||
### Directory Structure
|
||||
|
||||
```
|
||||
backend/
|
||||
├── onyx/
|
||||
│ ├── auth/ # Authentication & authorization
|
||||
│ ├── chat/ # Chat functionality & LLM interactions
|
||||
│ ├── connectors/ # Data source connectors
|
||||
│ ├── db/ # Database models & operations
|
||||
│ ├── document_index/ # Vespa integration
|
||||
│ ├── federated_connectors/ # External search connectors
|
||||
│ ├── llm/ # LLM provider integrations
|
||||
│ └── server/ # API endpoints & routers
|
||||
├── ee/ # Enterprise Edition features
|
||||
├── alembic/ # Database migrations
|
||||
└── tests/ # Test suites
|
||||
|
||||
web/
|
||||
├── src/app/ # Next.js app router pages
|
||||
├── src/components/ # Reusable React components
|
||||
└── src/lib/ # Utilities & business logic
|
||||
```
|
||||
|
||||
## Frontend Standards
|
||||
|
||||
### 1. Import Standards
|
||||
|
||||
**Always use absolute imports with the `@` prefix.**
|
||||
|
||||
**Reason:** Moving files around becomes easier since you don't also have to update those import statements. This makes modifications to the codebase much nicer.
|
||||
|
||||
```typescript
|
||||
// ✅ Good
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { useAuth } from "@/hooks/useAuth";
|
||||
import { Text } from "@/refresh-components/texts/Text";
|
||||
|
||||
// ❌ Bad
|
||||
import { Button } from "../../../components/ui/button";
|
||||
import { useAuth } from "./hooks/useAuth";
|
||||
```
|
||||
|
||||
### 2. React Component Functions
|
||||
|
||||
**Prefer regular functions over arrow functions for React components.**
|
||||
|
||||
**Reason:** Functions just become easier to read.
|
||||
|
||||
```typescript
|
||||
// ✅ Good
|
||||
function UserProfile({ userId }: UserProfileProps) {
|
||||
return <div>User Profile</div>
|
||||
}
|
||||
|
||||
// ❌ Bad
|
||||
const UserProfile = ({ userId }: UserProfileProps) => {
|
||||
return <div>User Profile</div>
|
||||
}
|
||||
```
|
||||
|
||||
### 3. Props Interface Extraction
|
||||
|
||||
**Extract prop types into their own interface definitions.**
|
||||
|
||||
**Reason:** Functions just become easier to read.
|
||||
|
||||
```typescript
|
||||
// ✅ Good
|
||||
interface UserCardProps {
|
||||
user: User
|
||||
showActions?: boolean
|
||||
onEdit?: (userId: string) => void
|
||||
}
|
||||
|
||||
function UserCard({ user, showActions = false, onEdit }: UserCardProps) {
|
||||
return <div>User Card</div>
|
||||
}
|
||||
|
||||
// ❌ Bad
|
||||
function UserCard({
|
||||
user,
|
||||
showActions = false,
|
||||
onEdit
|
||||
}: {
|
||||
user: User
|
||||
showActions?: boolean
|
||||
onEdit?: (userId: string) => void
|
||||
}) {
|
||||
return <div>User Card</div>
|
||||
}
|
||||
```
|
||||
|
||||
### 4. Spacing Guidelines
|
||||
|
||||
**Prefer padding over margins for spacing.**
|
||||
|
||||
**Reason:** We want to consolidate usage to paddings instead of margins.
|
||||
|
||||
```typescript
|
||||
// ✅ Good
|
||||
<div className="p-4 space-y-2">
|
||||
<div className="p-2">Content</div>
|
||||
</div>
|
||||
|
||||
// ❌ Bad
|
||||
<div className="m-4 space-y-2">
|
||||
<div className="m-2">Content</div>
|
||||
</div>
|
||||
```
|
||||
|
||||
### 5. Tailwind Dark Mode

**Strictly forbid using the `dark:` modifier in Tailwind classes, except for logo icon handling.**

**Reason:** The `colors.css` file already, VERY CAREFULLY, defines what the exact opposite colour of each light-mode colour is. Overriding this behaviour is VERY bad and will lead to horrible UI breakages.

**Exception:** The `createLogoIcon` helper in `web/src/components/icons/icons.tsx` uses `dark:` modifiers (`dark:invert`, `dark:hidden`, `dark:block`) to handle third-party logo icons that cannot automatically adapt through `colors.css`. This is the ONLY acceptable use of dark mode modifiers.

```typescript
// ✅ Good - Standard components use `web/tailwind-themes/tailwind.config.js` / `web/src/app/css/colors.css`
<div className="bg-background-neutral-03 text-text-02">
  Content
</div>

// ✅ Good - Logo icons with dark mode handling via createLogoIcon
export const GithubIcon = createLogoIcon(githubLightIcon, {
  monochromatic: true, // Will apply dark:invert internally
});

export const GitbookIcon = createLogoIcon(gitbookLightIcon, {
  darkSrc: gitbookDarkIcon, // Will use dark:hidden/dark:block internally
});

// ❌ Bad - Manual dark mode overrides
<div className="bg-white dark:bg-black text-black dark:text-white">
  Content
</div>
```

### 6. Class Name Utilities

**Use the `cn` utility instead of raw string formatting for classNames.**

**Reason:** `cn` is easier to read. It also handles more complex inputs (e.g., string arrays, which it flattens) and filters out falsy values, so a conditional like `myCondition && "some-tailwind-class"` is simply dropped when `myCondition` is `false`.

```typescript
import { cn } from '@/lib/utils'

// ✅ Good
<div className={cn(
  'base-class',
  isActive && 'active-class',
  className
)}>
  Content
</div>

// ❌ Bad
<div className={`base-class ${isActive ? 'active-class' : ''} ${className}`}>
  Content
</div>
```

### 7. Custom Hooks Organization

**Follow a "hook-per-file" layout. Each hook should live in its own file within `web/src/hooks`.**

**Reason:** This is just a layout preference. Keeps code clean.

```typescript
// web/src/hooks/useUserData.ts
export function useUserData(userId: string) {
  // hook implementation
}

// web/src/hooks/useLocalStorage.ts
export function useLocalStorage<T>(key: string, initialValue: T) {
  // hook implementation
}
```

### 8. Icon Usage

**ONLY use icons from the `web/src/icons` directory. Do NOT use icons from `react-icons`, `lucide`, or other external libraries.**

**Reason:** We have a very carefully curated selection of icons that match our Onyx guidelines. We do NOT want to muddy those up with different aesthetic stylings.

```typescript
// ✅ Good
import SvgX from "@/icons/x";
import SvgMoreHorizontal from "@/icons/more-horizontal";

// ❌ Bad
import { User } from "lucide-react";
import { FiSearch } from "react-icons/fi";
```

**Missing Icons**: If an icon is needed but doesn't exist in the `web/src/icons` directory, import it from Figma using the Figma MCP tool and add it to the icons directory. If you need help with this step, reach out to `raunak@onyx.app`.

### 9. Text Rendering

**Prefer using the `refresh-components/texts/Text` component for all text rendering. Avoid "naked" text nodes.**

**Reason:** The `Text` component is fully compliant with the stylings provided in Figma. It provides easy utilities to specify the text-colour and font-size in the form of flags. Super duper easy.

```typescript
// ✅ Good
import { Text } from '@/refresh-components/texts/Text'

function UserCard({ name }: { name: string }) {
  return (
    <Text
      // The `text03` flag colours the rendered text with the 3rd-scale grey
      text03
      // The `mainAction` flag applies the "main-action" font, line-height, and weight described in the Figma
      mainAction
    >
      {name}
    </Text>
  )
}

// ❌ Bad
function UserCard({ name }: { name: string }) {
  return (
    <div>
      <h2>{name}</h2>
      <p>User details</p>
    </div>
  )
}
```

### 10. Component Usage

**Heavily avoid raw HTML input components. Always use components from the `web/src/refresh-components` or `web/lib/opal/src` directories.**

**Reason:** We've put in a lot of effort to unify the components that are rendered in the Onyx app. Using raw components breaks the entire UI of the application, and leaves it in a muddier state than before.

```typescript
// ✅ Good
import Button from '@/refresh-components/buttons/Button'
import InputTypeIn from '@/refresh-components/inputs/InputTypeIn'
import SvgPlusCircle from '@/icons/plus-circle'

function ContactForm() {
  return (
    <form>
      <InputTypeIn placeholder="Search..." />
      <Button type="submit" leftIcon={SvgPlusCircle}>Submit</Button>
    </form>
  )
}

// ❌ Bad
function ContactForm() {
  return (
    <form>
      <input placeholder="Name" />
      <textarea placeholder="Message" />
      <button type="submit">Submit</button>
    </form>
  )
}
```

### 11. Colors

**Always use custom overrides for colors and borders rather than built-in Tailwind CSS colors. These overrides live in `web/tailwind-themes/tailwind.config.js`.**

**Reason:** Our custom color system uses CSS variables that automatically handle dark mode and maintain design consistency across the app. Standard Tailwind colors bypass this system.

**Available color categories:**
- **Text:** `text-01` through `text-05`, `text-inverted-XX`
- **Backgrounds:** `background-neutral-XX`, `background-tint-XX` (and inverted variants)
- **Borders:** `border-01` through `border-05`, `border-inverted-XX`
- **Actions:** `action-link-XX`, `action-danger-XX`
- **Status:** `status-info-XX`, `status-success-XX`, `status-warning-XX`, `status-error-XX`
- **Theme:** `theme-primary-XX`, `theme-red-XX`, `theme-blue-XX`, etc.

```typescript
// ✅ Good - Use custom Onyx color classes
<div className="bg-background-neutral-01 border border-border-02" />
<div className="bg-background-tint-02 border border-border-01" />
<div className="bg-status-success-01" />
<div className="bg-action-link-01" />
<div className="bg-theme-primary-05" />

// ❌ Bad - Do NOT use standard Tailwind colors
<div className="bg-gray-100 border border-gray-300 text-gray-600" />
<div className="bg-white border border-slate-200" />
<div className="bg-green-100 text-green-700" />
<div className="bg-blue-100 text-blue-600" />
<div className="bg-indigo-500" />
```

### 12. Data Fetching

**Prefer using `useSWR` for data fetching. Data should generally be fetched on the client side. Components that need data should display a loader / placeholder while waiting for that data. Prefer loading data within the component that needs it rather than at the top level and passing it down.**

**Reason:** Client-side fetching allows us to load the skeleton of the page without waiting for data to load, leading to a snappier UX. Loading data where needed reduces dependencies between a component and its parent component(s).
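
A minimal sketch of this pattern is shown below. The endpoint path, the `Persona` shape, and the `fetcher` helper are illustrative assumptions, not the actual Onyx API.

```typescript
import useSWR from "swr";
import { Text } from "@/refresh-components/texts/Text";

// Hypothetical response shape + endpoint, used only for illustration.
interface Persona {
  id: number;
  name: string;
}

const fetcher = (url: string) => fetch(url).then((res) => res.json());

function PersonaList() {
  // Fetch inside the component that needs the data, not in a parent.
  const { data, error, isLoading } = useSWR<Persona[]>("/api/persona", fetcher);

  // Show a placeholder while loading so the page skeleton renders immediately.
  if (isLoading) return <Text text03>Loading personas...</Text>;
  if (error || !data) return <Text text03>Failed to load personas.</Text>;

  return (
    <div className="p-2">
      {data.map((persona) => (
        <Text key={persona.id} mainAction>
          {persona.name}
        </Text>
      ))}
    </div>
  );
}
```
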
## Database & Migrations

### Running Migrations
```bash
# Standard migrations
alembic upgrade head

# Multi-tenant (Enterprise)
alembic -n schema_private upgrade head
```

### Creating Migrations
```bash
# Create migration
alembic revision -m "description"

# Multi-tenant migration
alembic -n schema_private revision -m "description"
```

Write the migration manually and place it in the file that alembic creates when running the above command.

## Testing Strategy

There are 4 main types of tests within Onyx:

### Unit Tests
These should not assume any Onyx/external services are available to be called. Interactions with the outside world should be mocked using `unittest.mock`. Generally, only write these for complex, isolated modules, e.g. `citation_processing.py`.

To run them:

```bash
python -m dotenv -f .vscode/.env run -- pytest -xv backend/tests/unit
```

### External Dependency Unit Tests
These tests assume that all external dependencies of Onyx are available and callable (e.g. Postgres, Redis, MinIO/S3, and Vespa are running, OpenAI can be called, any request to the internet is fine, etc.).

However, the actual Onyx containers are not running; these tests call the function under test directly. We can also mock components/calls at will.

The goal with these tests is to minimize mocking while giving some flexibility to mock things that are flaky, need strictly controlled behavior, or need to have their internal behavior validated (e.g. verifying that a function is called with certain args, something that would be impossible with proper integration tests).

A great example of this type of test is `backend/tests/external_dependency_unit/connectors/confluence/test_confluence_group_sync.py`.

To run them:

```bash
python -m dotenv -f .vscode/.env run -- pytest backend/tests/external_dependency_unit
```

### Integration Tests
Standard integration tests. Every test in `backend/tests/integration` runs against a real Onyx deployment. We cannot mock anything in these tests. Prefer writing integration tests (or External Dependency Unit Tests if mocking/internal verification is necessary) over any other type of test.

Tests are parallelized at a directory level.

When writing integration tests, make sure to check the root `conftest.py` for useful fixtures and the `backend/tests/integration/common_utils` directory for utilities. Prefer (if one exists) calling the appropriate Manager class in the utils over directly calling the APIs with a library like `requests`. Prefer using fixtures rather than calling the utilities directly (e.g. do NOT create admin users with `admin_user = UserManager.create(name="admin_user")`; instead use the `admin_user` fixture).

A great example of this type of test is `backend/tests/integration/dev_apis/test_simple_chat_api.py`.

To run them:

```bash
python -m dotenv -f .vscode/.env run -- pytest backend/tests/integration
```

### Playwright (E2E) Tests
These tests are an even more complete version of the Integration Tests mentioned above: all services of Onyx are running, *including* the Web Server.

Use these tests for anything that requires significant frontend <-> backend coordination.

Tests are located at `web/tests/e2e` and are written in TypeScript.

To run them:

```bash
npx playwright test <TEST_NAME>
```
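
For orientation, a minimal Playwright test might look like the sketch below. The route and the element it asserts on are placeholders, not actual Onyx pages or copy.

```typescript
import { test, expect } from "@playwright/test";

// Hypothetical route and assertion, for illustration only.
test("chat page shows the message input", async ({ page }) => {
  await page.goto("http://localhost:3000/chat");

  // Wait for the main chat input to be visible before asserting anything else.
  const input = page.getByRole("textbox").first();
  await expect(input).toBeVisible();
});
```
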

## Logs

When (1) writing integration tests or (2) doing live tests (e.g. curl / playwright) you can get access to logs via the `backend/log/<service_name>_debug.log` file. All Onyx services (api_server, web_server, celery_X) will be tailing their logs to this file.

## Security Considerations

- Never commit API keys or secrets to the repository
- Use encrypted credential storage for connector credentials
- Follow RBAC patterns for new features
- Implement proper input validation with Pydantic models
- Use parameterized queries to prevent SQL injection

## AI/LLM Integration

- Multiple LLM providers supported via LiteLLM
- Configurable models per feature (chat, search, embeddings)
- Streaming support for real-time responses
- Token management and rate limiting
- Custom prompts and agent actions

## Creating a Plan
When creating a plan in the `plans` directory, make sure to include at least these elements:

**Issues to Address**
What the change is meant to do.

**Important Notes**
Things you come across in your research that are important to the implementation.

**Implementation strategy**
How you are going to make the changes happen. High level approach.

**Tests**
What unit (use rarely), external dependency unit, integration, and playwright tests you plan to write to verify the correct behavior. Don't overtest. Usually, a given change only needs one type of test.

Do NOT include these: *Timeline*, *Rollback plan*

This is a minimal list - feel free to include more. Do NOT write code as part of your plan. Keep it high level. You can reference certain files or functions though.

Before writing your plan, make sure to do research. Explore the relevant sections in the codebase.

@@ -149,6 +149,11 @@ RUN if [ "$ENABLE_CRAFT" = "true" ]; then \
ENABLE_CRAFT=true /app/scripts/setup_craft_templates.sh; \
fi

# Set Craft template paths to the in-image locations
# These match the paths where setup_craft_templates.sh creates the templates
ENV OUTPUTS_TEMPLATE_PATH=/app/onyx/server/features/build/sandbox/kubernetes/docker/templates/outputs
ENV VENV_TEMPLATE_PATH=/app/onyx/server/features/build/sandbox/kubernetes/docker/templates/venv

# Put logo in assets
COPY --chown=onyx:onyx ./assets /app/assets

@@ -48,6 +48,7 @@ WORKDIR /app
# Utils used by model server
COPY ./onyx/utils/logger.py /app/onyx/utils/logger.py
COPY ./onyx/utils/middleware.py /app/onyx/utils/middleware.py
COPY ./onyx/utils/tenant.py /app/onyx/utils/tenant.py

# Place to fetch version information
COPY ./onyx/__init__.py /app/onyx/__init__.py

@@ -57,7 +57,7 @@ if USE_IAM_AUTH:


def include_object(
    object: SchemaItem,
    object: SchemaItem,  # noqa: ARG001
    name: str | None,
    type_: Literal[
        "schema",
@@ -67,8 +67,8 @@ def include_object(
        "unique_constraint",
        "foreign_key_constraint",
    ],
    reflected: bool,
    compare_to: SchemaItem | None,
    reflected: bool,  # noqa: ARG001
    compare_to: SchemaItem | None,  # noqa: ARG001
) -> bool:
    if type_ == "table" and name in EXCLUDE_TABLES:
        return False
@@ -244,7 +244,7 @@ def do_run_migrations(


def provide_iam_token_for_alembic(
    dialect: Any, conn_rec: Any, cargs: Any, cparams: Any
    dialect: Any, conn_rec: Any, cargs: Any, cparams: Any  # noqa: ARG001
) -> None:
    if USE_IAM_AUTH:
        # Database connection settings

backend/alembic/run_multitenant_migrations.py (Executable file, 343 lines)
@@ -0,0 +1,343 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Parallel Alembic Migration Runner
|
||||
|
||||
Upgrades tenant schemas to head in batched, parallel alembic subprocesses.
|
||||
Each subprocess handles a batch of schemas (via ``-x schemas=a,b,c``),
|
||||
reducing per-process overhead compared to one-schema-per-process.
|
||||
|
||||
Usage examples::
|
||||
|
||||
# defaults: 6 workers, 50 schemas/batch
|
||||
python alembic/run_multitenant_migrations.py
|
||||
|
||||
# custom settings
|
||||
python alembic/run_multitenant_migrations.py -j 8 -b 100
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import List, NamedTuple
|
||||
|
||||
from alembic.config import Config
|
||||
from alembic.script import ScriptDirectory
|
||||
from sqlalchemy import text
|
||||
|
||||
from onyx.db.engine.sql_engine import is_valid_schema_name
|
||||
from onyx.db.engine.sql_engine import SqlEngine
|
||||
from onyx.db.engine.tenant_utils import get_all_tenant_ids
|
||||
from shared_configs.configs import TENANT_ID_PREFIX
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Data types
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class Args(NamedTuple):
|
||||
jobs: int
|
||||
batch_size: int
|
||||
|
||||
|
||||
class BatchResult(NamedTuple):
|
||||
schemas: list[str]
|
||||
success: bool
|
||||
output: str
|
||||
elapsed_sec: float
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core functions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def run_alembic_for_batch(schemas: list[str]) -> BatchResult:
|
||||
"""Run ``alembic upgrade head`` for a batch of schemas in one subprocess.
|
||||
|
||||
If the batch fails, it is automatically retried with ``-x continue=true``
|
||||
so that the remaining schemas in the batch still get migrated. The retry
|
||||
output (which contains alembic's per-schema error messages) is returned
|
||||
for diagnosis.
|
||||
"""
|
||||
csv = ",".join(schemas)
|
||||
base_cmd = ["alembic", "-x", f"schemas={csv}"]
|
||||
|
||||
start = time.monotonic()
|
||||
result = subprocess.run(
|
||||
[*base_cmd, "upgrade", "head"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
text=True,
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
elapsed = time.monotonic() - start
|
||||
return BatchResult(schemas, True, result.stdout or "", elapsed)
|
||||
|
||||
# At least one schema failed. Print the initial error output, then
|
||||
# re-run with continue=true so the remaining schemas still get migrated.
|
||||
if result.stdout:
|
||||
print(f"Initial error output:\n{result.stdout}", file=sys.stderr, flush=True)
|
||||
print(
|
||||
f"Batch failed (exit {result.returncode}), retrying with 'continue=true'...",
|
||||
file=sys.stderr,
|
||||
flush=True,
|
||||
)
|
||||
|
||||
retry = subprocess.run(
|
||||
[*base_cmd, "-x", "continue=true", "upgrade", "head"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
text=True,
|
||||
)
|
||||
elapsed = time.monotonic() - start
|
||||
return BatchResult(schemas, False, retry.stdout or "", elapsed)
|
||||
|
||||
|
||||
def get_head_revision() -> str | None:
|
||||
"""Get the head revision from the alembic script directory."""
|
||||
alembic_cfg = Config("alembic.ini")
|
||||
script = ScriptDirectory.from_config(alembic_cfg)
|
||||
return script.get_current_head()
|
||||
|
||||
|
||||
def get_schemas_needing_migration(
|
||||
tenant_schemas: List[str], head_rev: str
|
||||
) -> List[str]:
|
||||
"""Return only schemas whose current alembic version is not at head."""
|
||||
if not tenant_schemas:
|
||||
return []
|
||||
|
||||
engine = SqlEngine.get_engine()
|
||||
|
||||
with engine.connect() as conn:
|
||||
# Find which schemas actually have an alembic_version table
|
||||
rows = conn.execute(
|
||||
text(
|
||||
"SELECT table_schema FROM information_schema.tables "
|
||||
"WHERE table_name = 'alembic_version' "
|
||||
"AND table_schema = ANY(:schemas)"
|
||||
),
|
||||
{"schemas": tenant_schemas},
|
||||
)
|
||||
schemas_with_table = set(row[0] for row in rows)
|
||||
|
||||
# Schemas without the table definitely need migration
|
||||
needs_migration = [s for s in tenant_schemas if s not in schemas_with_table]
|
||||
|
||||
if not schemas_with_table:
|
||||
return needs_migration
|
||||
|
||||
# Validate schema names before interpolating into SQL
|
||||
for schema in schemas_with_table:
|
||||
if not is_valid_schema_name(schema):
|
||||
raise ValueError(f"Invalid schema name: {schema}")
|
||||
|
||||
# Single query to get every schema's current revision at once.
|
||||
# Use integer tags instead of interpolating schema names into
|
||||
# string literals to avoid quoting issues.
|
||||
schema_list = list(schemas_with_table)
|
||||
union_parts = [
|
||||
f'SELECT {i} AS idx, version_num FROM "{schema}".alembic_version'
|
||||
for i, schema in enumerate(schema_list)
|
||||
]
|
||||
rows = conn.execute(text(" UNION ALL ".join(union_parts)))
|
||||
version_by_schema = {schema_list[row[0]]: row[1] for row in rows}
|
||||
|
||||
needs_migration.extend(
|
||||
s for s in schemas_with_table if version_by_schema.get(s) != head_rev
|
||||
)
|
||||
|
||||
return needs_migration
|
||||
|
||||
|
||||
def run_migrations_parallel(
|
||||
schemas: list[str],
|
||||
max_workers: int,
|
||||
batch_size: int,
|
||||
) -> bool:
|
||||
"""Chunk *schemas* into batches and run them in parallel.
|
||||
|
||||
A background monitor thread prints a status line every 60 s listing
|
||||
which batches are still in-flight, making it easy to spot hung tenants.
|
||||
"""
|
||||
batches = [schemas[i : i + batch_size] for i in range(0, len(schemas), batch_size)]
|
||||
total_batches = len(batches)
|
||||
print(
|
||||
f"{len(schemas)} schemas in {total_batches} batch(es) "
|
||||
f"with {max_workers} workers (batch size: {batch_size})...",
|
||||
flush=True,
|
||||
)
|
||||
all_success = True
|
||||
|
||||
# Thread-safe tracking of in-flight batches for the monitor thread.
|
||||
in_flight: dict[int, list[str]] = {}
|
||||
prev_in_flight: set[int] = set()
|
||||
lock = threading.Lock()
|
||||
stop_event = threading.Event()
|
||||
|
||||
def _monitor() -> None:
|
||||
"""Print a status line every 60 s listing batches still in-flight.
|
||||
|
||||
Only prints batches that were also present in the previous tick,
|
||||
making it easy to spot batches that are stuck.
|
||||
"""
|
||||
nonlocal prev_in_flight
|
||||
while not stop_event.wait(60):
|
||||
with lock:
|
||||
if not in_flight:
|
||||
prev_in_flight = set()
|
||||
continue
|
||||
current = set(in_flight)
|
||||
stuck = current & prev_in_flight
|
||||
prev_in_flight = current
|
||||
|
||||
if not stuck:
|
||||
continue
|
||||
|
||||
schemas = [s for idx in sorted(stuck) for s in in_flight[idx]]
|
||||
print(
|
||||
f"⏳ batch(es) still running since last check "
|
||||
f"({', '.join(str(i + 1) for i in sorted(stuck))}): "
|
||||
+ ", ".join(schemas),
|
||||
flush=True,
|
||||
)
|
||||
|
||||
monitor_thread = threading.Thread(target=_monitor, daemon=True)
|
||||
monitor_thread.start()
|
||||
|
||||
try:
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
|
||||
def _run(batch_idx: int, batch: list[str]) -> BatchResult:
|
||||
with lock:
|
||||
in_flight[batch_idx] = batch
|
||||
print(
|
||||
f"Batch {batch_idx + 1}/{total_batches} started "
|
||||
f"({len(batch)} schemas): {', '.join(batch)}",
|
||||
flush=True,
|
||||
)
|
||||
result = run_alembic_for_batch(batch)
|
||||
with lock:
|
||||
in_flight.pop(batch_idx, None)
|
||||
return result
|
||||
|
||||
future_to_idx = {
|
||||
executor.submit(_run, i, b): i for i, b in enumerate(batches)
|
||||
}
|
||||
|
||||
for future in as_completed(future_to_idx):
|
||||
batch_idx = future_to_idx[future]
|
||||
try:
|
||||
result = future.result()
|
||||
status = "✓" if result.success else "✗"
|
||||
|
||||
print(
|
||||
f"Batch {batch_idx + 1}/{total_batches} "
|
||||
f"{status} {len(result.schemas)} schemas "
|
||||
f"in {result.elapsed_sec:.1f}s",
|
||||
flush=True,
|
||||
)
|
||||
|
||||
if not result.success:
|
||||
# Print last 20 lines of retry output for diagnosis
|
||||
tail = result.output.strip().splitlines()[-20:]
|
||||
for line in tail:
|
||||
print(f" {line}", flush=True)
|
||||
all_success = False
|
||||
|
||||
except Exception as e:
|
||||
print(
|
||||
f"Batch {batch_idx + 1}/{total_batches} " f"✗ exception: {e}",
|
||||
flush=True,
|
||||
)
|
||||
all_success = False
|
||||
finally:
|
||||
stop_event.set()
|
||||
monitor_thread.join(timeout=2)
|
||||
|
||||
return all_success
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def parse_args() -> Args:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run alembic migrations for all tenant schemas in parallel"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-j",
|
||||
"--jobs",
|
||||
type=int,
|
||||
default=6,
|
||||
metavar="N",
|
||||
help="Number of parallel alembic processes (default: 6)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-b",
|
||||
"--batch-size",
|
||||
type=int,
|
||||
default=50,
|
||||
metavar="N",
|
||||
help="Schemas per alembic process (default: 50)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
if args.jobs < 1:
|
||||
parser.error("--jobs must be >= 1")
|
||||
if args.batch_size < 1:
|
||||
parser.error("--batch-size must be >= 1")
|
||||
return Args(jobs=args.jobs, batch_size=args.batch_size)
|
||||
|
||||
|
||||
def main() -> int:
|
||||
args = parse_args()
|
||||
|
||||
head_rev = get_head_revision()
|
||||
if head_rev is None:
|
||||
print("Could not determine head revision.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
with SqlEngine.scoped_engine(pool_size=5, max_overflow=2):
|
||||
tenant_ids = get_all_tenant_ids()
|
||||
tenant_schemas = [tid for tid in tenant_ids if tid.startswith(TENANT_ID_PREFIX)]
|
||||
|
||||
if not tenant_schemas:
|
||||
print(
|
||||
"No tenant schemas found. Is MULTI_TENANT=true set?",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 1
|
||||
|
||||
schemas_to_migrate = get_schemas_needing_migration(tenant_schemas, head_rev)
|
||||
|
||||
if not schemas_to_migrate:
|
||||
print(
|
||||
f"All {len(tenant_schemas)} tenants are already at head "
|
||||
f"revision ({head_rev})."
|
||||
)
|
||||
return 0
|
||||
|
||||
print(
|
||||
f"{len(schemas_to_migrate)}/{len(tenant_schemas)} tenants need "
|
||||
f"migration (head: {head_rev})."
|
||||
)
|
||||
|
||||
success = run_migrations_parallel(
|
||||
schemas_to_migrate,
|
||||
max_workers=args.jobs,
|
||||
batch_size=args.batch_size,
|
||||
)
|
||||
|
||||
print(f"\n{'All migrations successful' if success else 'Some migrations failed'}")
|
||||
return 0 if success else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -1,58 +0,0 @@
|
||||
"""LLMProvider deprecated fields are nullable
|
||||
|
||||
Revision ID: 001984c88745
|
||||
Revises: 01f8e6d95a33
|
||||
Create Date: 2026-02-01 22:24:34.171100
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "001984c88745"
|
||||
down_revision = "01f8e6d95a33"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Make default_model_name nullable (was NOT NULL)
|
||||
op.alter_column(
|
||||
"llm_provider",
|
||||
"default_model_name",
|
||||
existing_type=sa.String(),
|
||||
nullable=True,
|
||||
)
|
||||
|
||||
# Remove server_default from is_default_vision_provider (was server_default=false())
|
||||
op.alter_column(
|
||||
"llm_provider",
|
||||
"is_default_vision_provider",
|
||||
existing_type=sa.Boolean(),
|
||||
server_default=None,
|
||||
)
|
||||
|
||||
# is_default_provider and default_vision_model are already nullable with no server_default
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Restore default_model_name to NOT NULL (set empty string for any NULLs first)
|
||||
op.execute(
|
||||
"UPDATE llm_provider SET default_model_name = '' WHERE default_model_name IS NULL"
|
||||
)
|
||||
op.alter_column(
|
||||
"llm_provider",
|
||||
"default_model_name",
|
||||
existing_type=sa.String(),
|
||||
nullable=False,
|
||||
)
|
||||
|
||||
# Restore server_default for is_default_vision_provider
|
||||
op.alter_column(
|
||||
"llm_provider",
|
||||
"is_default_vision_provider",
|
||||
existing_type=sa.Boolean(),
|
||||
server_default=sa.false(),
|
||||
)
|
||||
@@ -1,7 +1,7 @@
|
||||
"""Populate flow mapping data
|
||||
|
||||
Revision ID: 01f8e6d95a33
|
||||
Revises: f220515df7b4
|
||||
Revises: d5c86e2c6dc6
|
||||
Create Date: 2026-01-31 17:37:10.485558
|
||||
|
||||
"""
|
||||
@@ -11,7 +11,7 @@ from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "01f8e6d95a33"
|
||||
down_revision = "f220515df7b4"
|
||||
down_revision = "d5c86e2c6dc6"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
@@ -23,7 +23,7 @@ def upgrade() -> None:
|
||||
"""
|
||||
INSERT INTO llm_model_flow (llm_model_flow_type, is_default, model_configuration_id)
|
||||
SELECT
|
||||
'chat' AS llm_model_flow_type,
|
||||
'CHAT' AS llm_model_flow_type,
|
||||
COALESCE(
|
||||
(lp.is_default_provider IS TRUE AND lp.default_model_name = mc.name),
|
||||
FALSE
|
||||
@@ -44,7 +44,7 @@ def upgrade() -> None:
|
||||
"""
|
||||
INSERT INTO llm_model_flow (llm_model_flow_type, is_default, model_configuration_id)
|
||||
SELECT
|
||||
'vision' AS llm_model_flow_type,
|
||||
'VISION' AS llm_model_flow_type,
|
||||
COALESCE(
|
||||
(lp.is_default_vision_provider IS TRUE AND lp.default_vision_model = mc.name),
|
||||
FALSE
|
||||
@@ -68,7 +68,7 @@ def downgrade() -> None:
|
||||
default_vision_model = mc.name
|
||||
FROM llm_model_flow mf
|
||||
JOIN model_configuration mc ON mc.id = mf.model_configuration_id
|
||||
WHERE mf.llm_model_flow_type = 'vision'
|
||||
WHERE mf.llm_model_flow_type = 'VISION'
|
||||
AND mf.is_default = TRUE
|
||||
AND mc.llm_provider_id = lp.id;
|
||||
"""
|
||||
@@ -83,7 +83,7 @@ def downgrade() -> None:
|
||||
default_model_name = mc.name
|
||||
FROM llm_model_flow mf
|
||||
JOIN model_configuration mc ON mc.id = mf.model_configuration_id
|
||||
WHERE mf.llm_model_flow_type = 'chat'
|
||||
WHERE mf.llm_model_flow_type = 'CHAT'
|
||||
AND mf.is_default = TRUE
|
||||
AND mc.llm_provider_id = lp.id;
|
||||
"""
|
||||
@@ -100,7 +100,7 @@ def downgrade() -> None:
|
||||
FROM model_configuration mc
|
||||
JOIN llm_model_flow mf ON mf.model_configuration_id = mc.id
|
||||
WHERE mc.llm_provider_id = lp.id
|
||||
AND mf.llm_model_flow_type = 'chat'
|
||||
AND mf.llm_model_flow_type = 'CHAT'
|
||||
ORDER BY mc.is_visible DESC, mc.id ASC
|
||||
LIMIT 1
|
||||
)
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
"""add_user_preferences
|
||||
|
||||
Revision ID: 175ea04c7087
|
||||
Revises: d56ffa94ca32
|
||||
Create Date: 2026-02-04 18:16:24.830873
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "175ea04c7087"
|
||||
down_revision = "d56ffa94ca32"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"user",
|
||||
sa.Column("user_preferences", sa.Text(), nullable=True),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("user", "user_preferences")
|
||||
@@ -0,0 +1,36 @@
|
||||
"""add_chat_compression_fields
|
||||
|
||||
Revision ID: 90b409d06e50
|
||||
Revises: f220515df7b4
|
||||
Create Date: 2026-01-26 09:13:09.635427
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "90b409d06e50"
|
||||
down_revision = "f220515df7b4"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Add last_summarized_message_id to chat_message
|
||||
# This field marks a message as a summary and indicates the last message it covers.
|
||||
# Summaries are branch-aware via their parent_message_id pointing to the branch.
|
||||
op.add_column(
|
||||
"chat_message",
|
||||
sa.Column(
|
||||
"last_summarized_message_id",
|
||||
sa.Integer(),
|
||||
sa.ForeignKey("chat_message.id", ondelete="SET NULL"),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("chat_message", "last_summarized_message_id")
|
||||
backend/alembic/versions/d56ffa94ca32_add_file_content.py (Normal file, 35 lines)
@@ -0,0 +1,35 @@
|
||||
"""add_file_content
|
||||
|
||||
Revision ID: d56ffa94ca32
|
||||
Revises: 01f8e6d95a33
|
||||
Create Date: 2026-02-06 15:29:34.192960
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "d56ffa94ca32"
|
||||
down_revision = "01f8e6d95a33"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_table(
|
||||
"file_content",
|
||||
sa.Column(
|
||||
"file_id",
|
||||
sa.String(),
|
||||
sa.ForeignKey("file_record.file_id", ondelete="CASCADE"),
|
||||
primary_key=True,
|
||||
),
|
||||
sa.Column("lobj_oid", sa.BigInteger(), nullable=False),
|
||||
sa.Column("file_size", sa.BigInteger(), nullable=False, server_default="0"),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_table("file_content")
|
||||
@@ -0,0 +1,35 @@
|
||||
"""add_cascade_delete_to_search_query_user_id
|
||||
|
||||
Revision ID: d5c86e2c6dc6
|
||||
Revises: 90b409d06e50
|
||||
Create Date: 2026-02-04 16:05:04.749804
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "d5c86e2c6dc6"
|
||||
down_revision = "90b409d06e50"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.drop_constraint("search_query_user_id_fkey", "search_query", type_="foreignkey")
|
||||
op.create_foreign_key(
|
||||
"search_query_user_id_fkey",
|
||||
"search_query",
|
||||
"user",
|
||||
["user_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_constraint("search_query_user_id_fkey", "search_query", type_="foreignkey")
|
||||
op.create_foreign_key(
|
||||
"search_query_user_id_fkey", "search_query", "user", ["user_id"], ["id"]
|
||||
)
|
||||
@@ -39,7 +39,7 @@ EXCLUDE_TABLES = {"kombu_queue", "kombu_message"}
|
||||
|
||||
|
||||
def include_object(
|
||||
object: SchemaItem,
|
||||
object: SchemaItem, # noqa: ARG001
|
||||
name: str | None,
|
||||
type_: Literal[
|
||||
"schema",
|
||||
@@ -49,8 +49,8 @@ def include_object(
|
||||
"unique_constraint",
|
||||
"foreign_key_constraint",
|
||||
],
|
||||
reflected: bool,
|
||||
compare_to: SchemaItem | None,
|
||||
reflected: bool, # noqa: ARG001
|
||||
compare_to: SchemaItem | None, # noqa: ARG001
|
||||
) -> bool:
|
||||
if type_ == "table" and name in EXCLUDE_TABLES:
|
||||
return False
|
||||
|
||||
@@ -951,7 +951,7 @@ class PermissionSyncCallback(IndexingHeartbeatInterface):
|
||||
|
||||
return False
|
||||
|
||||
def progress(self, tag: str, amount: int) -> None:
|
||||
def progress(self, tag: str, amount: int) -> None: # noqa: ARG002
|
||||
try:
|
||||
self.redis_connector.permissions.set_active()
|
||||
|
||||
@@ -982,7 +982,7 @@ class PermissionSyncCallback(IndexingHeartbeatInterface):
|
||||
|
||||
|
||||
def monitor_ccpair_permissions_taskset(
|
||||
tenant_id: str, key_bytes: bytes, r: Redis, db_session: Session
|
||||
tenant_id: str, key_bytes: bytes, r: Redis, db_session: Session # noqa: ARG001
|
||||
) -> None:
|
||||
fence_key = key_bytes.decode("utf-8")
|
||||
cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key)
|
||||
|
||||
@@ -259,7 +259,7 @@ def check_for_external_group_sync(self: Task, *, tenant_id: str) -> bool | None:
|
||||
def try_creating_external_group_sync_task(
|
||||
app: Celery,
|
||||
cc_pair_id: int,
|
||||
r: Redis,
|
||||
r: Redis, # noqa: ARG001
|
||||
tenant_id: str,
|
||||
) -> str | None:
|
||||
"""Returns an int if syncing is needed. The int represents the number of sync tasks generated.
|
||||
@@ -344,7 +344,7 @@ def try_creating_external_group_sync_task(
|
||||
bind=True,
|
||||
)
|
||||
def connector_external_group_sync_generator_task(
|
||||
self: Task,
|
||||
self: Task, # noqa: ARG001
|
||||
cc_pair_id: int,
|
||||
tenant_id: str,
|
||||
) -> None:
|
||||
@@ -590,8 +590,8 @@ def _perform_external_group_sync(
|
||||
|
||||
def validate_external_group_sync_fences(
|
||||
tenant_id: str,
|
||||
celery_app: Celery,
|
||||
r: Redis,
|
||||
celery_app: Celery, # noqa: ARG001
|
||||
r: Redis, # noqa: ARG001
|
||||
r_replica: Redis,
|
||||
r_celery: Redis,
|
||||
lock_beat: RedisLock,
|
||||
|
||||
@@ -40,7 +40,7 @@ def export_query_history_task(
|
||||
end: datetime,
|
||||
start_time: datetime,
|
||||
# Need to include the tenant_id since the TenantAwareTask needs this
|
||||
tenant_id: str,
|
||||
tenant_id: str, # noqa: ARG001
|
||||
) -> None:
|
||||
if not self.request.id:
|
||||
raise RuntimeError("No task id defined for this task; cannot identify it")
|
||||
|
||||
@@ -43,7 +43,7 @@ _TENANT_PROVISIONING_TIME_LIMIT = 60 * 10 # 10 minutes
|
||||
trail=False,
|
||||
bind=True,
|
||||
)
|
||||
def check_available_tenants(self: Task) -> None:
|
||||
def check_available_tenants(self: Task) -> None: # noqa: ARG001
|
||||
"""
|
||||
Check if we have enough pre-provisioned tenants available.
|
||||
If not, trigger the pre-provisioning of new tenants.
|
||||
|
||||
@@ -21,9 +21,9 @@ logger = setup_logger()
|
||||
trail=False,
|
||||
)
|
||||
def generate_usage_report_task(
|
||||
self: Task,
|
||||
self: Task, # noqa: ARG001
|
||||
*,
|
||||
tenant_id: str,
|
||||
tenant_id: str, # noqa: ARG001
|
||||
user_id: str | None = None,
|
||||
period_from: str | None = None,
|
||||
period_to: str | None = None,
|
||||
|
||||
@@ -7,7 +7,7 @@ QUERY_HISTORY_TASK_NAME_PREFIX = OnyxCeleryTask.EXPORT_QUERY_HISTORY_TASK
|
||||
|
||||
|
||||
def name_chat_ttl_task(
|
||||
retention_limit_days: float, tenant_id: str | None = None
|
||||
retention_limit_days: float, tenant_id: str | None = None # noqa: ARG001
|
||||
) -> str:
|
||||
return f"chat_ttl_{retention_limit_days}_days"
|
||||
|
||||
|
||||
@@ -134,7 +134,7 @@ GATED_TENANTS_KEY = "gated_tenants"
|
||||
|
||||
# License enforcement - when True, blocks API access for gated/expired licenses
|
||||
LICENSE_ENFORCEMENT_ENABLED = (
|
||||
os.environ.get("LICENSE_ENFORCEMENT_ENABLED", "").lower() == "true"
|
||||
os.environ.get("LICENSE_ENFORCEMENT_ENABLED", "true").lower() == "true"
|
||||
)
|
||||
|
||||
# Cloud data plane URL - self-hosted instances call this to reach cloud proxy endpoints
|
||||
|
||||
@@ -54,7 +54,7 @@ def delete_document_set_privacy__no_commit(
|
||||
def fetch_document_sets(
|
||||
user_id: UUID | None,
|
||||
db_session: Session,
|
||||
include_outdated: bool = True, # Parameter only for versioned implementation, unused
|
||||
include_outdated: bool = True, # Parameter only for versioned implementation, unused # noqa: ARG001
|
||||
) -> list[tuple[DocumentSet, list[ConnectorCredentialPair]]]:
|
||||
assert user_id is not None
|
||||
|
||||
|
||||
@@ -5,8 +5,10 @@ It filters hierarchy nodes based on user email and external group membership.
|
||||
"""
|
||||
|
||||
from sqlalchemy import any_
|
||||
from sqlalchemy import cast
|
||||
from sqlalchemy import or_
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import String
|
||||
from sqlalchemy.dialects import postgresql
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy.sql.elements import ColumnElement
|
||||
@@ -32,7 +34,7 @@ def _build_hierarchy_access_filter(
|
||||
if external_group_ids:
|
||||
access_filters.append(
|
||||
HierarchyNode.external_user_group_ids.overlap(
|
||||
postgresql.array(external_group_ids)
|
||||
cast(postgresql.array(external_group_ids), postgresql.ARRAY(String))
|
||||
)
|
||||
)
|
||||
return or_(*access_filters)
|
||||
|
||||
@@ -11,6 +11,7 @@ from ee.onyx.server.license.models import LicenseMetadata
|
||||
from ee.onyx.server.license.models import LicensePayload
|
||||
from ee.onyx.server.license.models import LicenseSource
|
||||
from onyx.auth.schemas import UserRole
|
||||
from onyx.configs.constants import ANONYMOUS_USER_EMAIL
|
||||
from onyx.db.models import License
|
||||
from onyx.db.models import User
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
@@ -107,7 +108,8 @@ def get_used_seats(tenant_id: str | None = None) -> int:
|
||||
Get current seat usage directly from database.
|
||||
|
||||
For multi-tenant: counts users in UserTenantMapping for this tenant.
|
||||
For self-hosted: counts all active users (excludes EXT_PERM_USER role).
|
||||
For self-hosted: counts all active users (excludes EXT_PERM_USER role
|
||||
and the anonymous system user).
|
||||
|
||||
TODO: Exclude API key dummy users from seat counting. API keys create
|
||||
users with emails like `__DANSWER_API_KEY_*` that should not count toward
|
||||
@@ -127,6 +129,7 @@ def get_used_seats(tenant_id: str | None = None) -> int:
|
||||
.where(
|
||||
User.is_active == True, # type: ignore # noqa: E712
|
||||
User.role != UserRole.EXT_PERM_USER,
|
||||
User.email != ANONYMOUS_USER_EMAIL, # type: ignore
|
||||
)
|
||||
)
|
||||
return result.scalar() or 0
|
||||
|
||||
@@ -643,7 +643,7 @@ def add_users_to_user_group(
|
||||
|
||||
def update_user_group(
|
||||
db_session: Session,
|
||||
user: User,
|
||||
user: User, # noqa: ARG001
|
||||
user_group_id: int,
|
||||
user_group_update: UserGroupUpdate,
|
||||
) -> UserGroup:
|
||||
|
||||
@@ -25,7 +25,7 @@ CONFLUENCE_DOC_SYNC_LABEL = "confluence_doc_sync"
|
||||
|
||||
def confluence_doc_sync(
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction, # noqa: ARG001
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
|
||||
callback: IndexingHeartbeatInterface | None,
|
||||
) -> Generator[ElementExternalAccess, None, None]:
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
from typing import Any
|
||||
|
||||
from onyx.access.models import ExternalAccess
|
||||
from onyx.access.utils import build_ext_group_name_for_onyx
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.confluence.onyx_confluence import (
|
||||
get_user_email_from_username__server,
|
||||
)
|
||||
@@ -72,6 +74,7 @@ def get_page_restrictions(
|
||||
page_id: str,
|
||||
page_restrictions: dict[str, Any],
|
||||
ancestors: list[dict[str, Any]],
|
||||
add_prefix: bool = False,
|
||||
) -> ExternalAccess | None:
|
||||
"""
|
||||
This function gets the restrictions for a page. In Confluence, a child can have
|
||||
@@ -79,6 +82,9 @@ def get_page_restrictions(
|
||||
|
||||
If no restrictions are found anywhere, then return None, indicating that the page
|
||||
should inherit the space's restrictions.
|
||||
|
||||
add_prefix: When True, prefix group IDs with source type (for indexing path).
|
||||
When False (default), leave unprefixed (for permission sync path).
|
||||
"""
|
||||
found_user_emails: set[str] = set()
|
||||
found_group_names: set[str] = set()
|
||||
@@ -92,13 +98,22 @@ def get_page_restrictions(
|
||||
restrictions=page_restrictions,
|
||||
)
|
||||
)
|
||||
|
||||
def _maybe_prefix_groups(group_names: set[str]) -> set[str]:
|
||||
if add_prefix:
|
||||
return {
|
||||
build_ext_group_name_for_onyx(g, DocumentSource.CONFLUENCE)
|
||||
for g in group_names
|
||||
}
|
||||
return group_names
|
||||
|
||||
# if there are individual page-level restrictions, then this is the accurate
|
||||
# restriction for the page. You cannot both have page-level restrictions AND
|
||||
# inherit restrictions from the parent.
|
||||
if found_any_page_level_restriction:
|
||||
return ExternalAccess(
|
||||
external_user_emails=found_user_emails,
|
||||
external_user_group_ids=found_group_names,
|
||||
external_user_group_ids=_maybe_prefix_groups(found_group_names),
|
||||
is_public=False,
|
||||
)
|
||||
|
||||
@@ -125,7 +140,7 @@ def get_page_restrictions(
|
||||
)
|
||||
return ExternalAccess(
|
||||
external_user_emails=ancestor_user_emails,
|
||||
external_user_group_ids=ancestor_group_names,
|
||||
external_user_group_ids=_maybe_prefix_groups(ancestor_group_names),
|
||||
is_public=False,
|
||||
)
|
||||
|
||||
|
||||
@@ -3,6 +3,8 @@ from ee.onyx.external_permissions.confluence.constants import ALL_CONF_EMAILS_GR
|
||||
from ee.onyx.external_permissions.confluence.constants import REQUEST_PAGINATION_LIMIT
|
||||
from ee.onyx.external_permissions.confluence.constants import VIEWSPACE_PERMISSION_TYPE
|
||||
from onyx.access.models import ExternalAccess
|
||||
from onyx.access.utils import build_ext_group_name_for_onyx
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.confluence.onyx_confluence import (
|
||||
get_user_email_from_username__server,
|
||||
)
|
||||
@@ -112,6 +114,7 @@ def get_space_permission(
|
||||
confluence_client: OnyxConfluence,
|
||||
space_key: str,
|
||||
is_cloud: bool,
|
||||
add_prefix: bool = False,
|
||||
) -> ExternalAccess:
|
||||
if is_cloud:
|
||||
space_permissions = _get_cloud_space_permissions(confluence_client, space_key)
|
||||
@@ -130,13 +133,32 @@ def get_space_permission(
|
||||
f"permissions for space '{space_key}'"
|
||||
)
|
||||
|
||||
# Prefix group IDs with source type if requested (for indexing path)
|
||||
if add_prefix and space_permissions.external_user_group_ids:
|
||||
prefixed_groups = {
|
||||
build_ext_group_name_for_onyx(g, DocumentSource.CONFLUENCE)
|
||||
for g in space_permissions.external_user_group_ids
|
||||
}
|
||||
return ExternalAccess(
|
||||
external_user_emails=space_permissions.external_user_emails,
|
||||
external_user_group_ids=prefixed_groups,
|
||||
is_public=space_permissions.is_public,
|
||||
)
|
||||
|
||||
return space_permissions
|
||||
|
||||
|
||||
def get_all_space_permissions(
|
||||
confluence_client: OnyxConfluence,
|
||||
is_cloud: bool,
|
||||
add_prefix: bool = False,
|
||||
) -> dict[str, ExternalAccess]:
|
||||
"""
|
||||
Get access permissions for all spaces in Confluence.
|
||||
|
||||
add_prefix: When True, prefix group IDs with source type (for indexing path).
|
||||
When False (default), leave unprefixed (for permission sync path).
|
||||
"""
|
||||
logger.debug("Getting space permissions")
|
||||
# Gets all the spaces in the Confluence instance
|
||||
all_space_keys = [
|
||||
@@ -151,7 +173,9 @@ def get_all_space_permissions(
|
||||
logger.debug(f"Got {len(all_space_keys)} spaces from confluence")
|
||||
space_permissions_by_space_key: dict[str, ExternalAccess] = {}
|
||||
for space_key in all_space_keys:
|
||||
space_permissions = get_space_permission(confluence_client, space_key, is_cloud)
|
||||
space_permissions = get_space_permission(
|
||||
confluence_client, space_key, is_cloud, add_prefix
|
||||
)
|
||||
|
||||
# Stores the permissions for each space
|
||||
space_permissions_by_space_key[space_key] = space_permissions
|
||||
|
||||
@@ -34,7 +34,7 @@ GITHUB_DOC_SYNC_LABEL = "github_doc_sync"
|
||||
def github_doc_sync(
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction, # noqa: ARG001
|
||||
callback: IndexingHeartbeatInterface | None = None,
|
||||
) -> Generator[DocExternalAccess, None, None]:
|
||||
"""
|
||||
@@ -50,7 +50,12 @@ def github_doc_sync(
|
||||
**cc_pair.connector.connector_specific_config
|
||||
)
|
||||
|
||||
github_connector.load_credentials(cc_pair.credential.credential_json)
|
||||
credential_json = (
|
||||
cc_pair.credential.credential_json.get_value(apply_mask=False)
|
||||
if cc_pair.credential.credential_json
|
||||
else {}
|
||||
)
|
||||
github_connector.load_credentials(credential_json)
|
||||
logger.info("GitHub connector credentials loaded successfully")
|
||||
|
||||
if not github_connector.github_client:
|
||||
|
||||
@@ -12,13 +12,18 @@ logger = setup_logger()
|
||||
|
||||
|
||||
def github_group_sync(
|
||||
tenant_id: str,
|
||||
tenant_id: str, # noqa: ARG001
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
) -> Generator[ExternalUserGroup, None, None]:
|
||||
github_connector: GithubConnector = GithubConnector(
|
||||
**cc_pair.connector.connector_specific_config
|
||||
)
|
||||
github_connector.load_credentials(cc_pair.credential.credential_json)
|
||||
credential_json = (
|
||||
cc_pair.credential.credential_json.get_value(apply_mask=False)
|
||||
if cc_pair.credential.credential_json
|
||||
else {}
|
||||
)
|
||||
github_connector.load_credentials(credential_json)
|
||||
if not github_connector.github_client:
|
||||
raise ValueError("github_client is required")
|
||||
|
||||
|
||||
@@ -91,7 +91,7 @@ class TeamInfo(BaseModel):
|
||||
|
||||
|
||||
def _fetch_organization_members(
|
||||
github_client: Github, org_name: str, retry_count: int = 0
|
||||
github_client: Github, org_name: str, retry_count: int = 0 # noqa: ARG001
|
||||
) -> List[UserInfo]:
|
||||
"""Fetch all organization members including owners and regular members."""
|
||||
org_members: List[UserInfo] = []
|
||||
@@ -124,7 +124,7 @@ def _fetch_organization_members(
|
||||
|
||||
|
||||
def _fetch_repository_teams_detailed(
|
||||
repo: Repository, github_client: Github, retry_count: int = 0
|
||||
repo: Repository, github_client: Github, retry_count: int = 0 # noqa: ARG001
|
||||
) -> List[TeamInfo]:
|
||||
"""Fetch teams with access to the repository and their members."""
|
||||
teams_data: List[TeamInfo] = []
|
||||
@@ -167,7 +167,7 @@ def _fetch_repository_teams_detailed(
|
||||
|
||||
|
||||
def fetch_repository_team_slugs(
|
||||
repo: Repository, github_client: Github, retry_count: int = 0
|
||||
repo: Repository, github_client: Github, retry_count: int = 0 # noqa: ARG001
|
||||
) -> List[str]:
|
||||
"""Fetch team slugs with access to the repository."""
|
||||
logger.info(f"Fetching team slugs for repository {repo.full_name}")
|
||||
|
||||
@@ -39,8 +39,8 @@ def _get_slim_doc_generator(
|
||||
|
||||
def gmail_doc_sync(
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction, # noqa: ARG001
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction, # noqa: ARG001
|
||||
callback: IndexingHeartbeatInterface | None,
|
||||
) -> Generator[ElementExternalAccess, None, None]:
|
||||
"""
|
||||
@@ -50,7 +50,12 @@ def gmail_doc_sync(
|
||||
already populated.
|
||||
"""
|
||||
gmail_connector = GmailConnector(**cc_pair.connector.connector_specific_config)
|
||||
gmail_connector.load_credentials(cc_pair.credential.credential_json)
|
||||
credential_json = (
|
||||
cc_pair.credential.credential_json.get_value(apply_mask=False)
|
||||
if cc_pair.credential.credential_json
|
||||
else {}
|
||||
)
|
||||
gmail_connector.load_credentials(credential_json)
|
||||
|
||||
slim_doc_generator = _get_slim_doc_generator(
|
||||
cc_pair, gmail_connector, callback=callback
|
||||
|
||||
@@ -13,6 +13,7 @@ from onyx.access.models import DocExternalAccess
|
||||
from onyx.access.models import ElementExternalAccess
|
||||
from onyx.access.models import ExternalAccess
|
||||
from onyx.access.models import NodeExternalAccess
|
||||
from onyx.access.utils import build_ext_group_name_for_onyx
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.google_drive.connector import GoogleDriveConnector
|
||||
from onyx.connectors.google_drive.models import GoogleDriveFileType
|
||||
@@ -67,11 +68,17 @@ def get_external_access_for_raw_gdrive_file(
|
||||
company_domain: str,
|
||||
retriever_drive_service: GoogleDriveService | None,
|
||||
admin_drive_service: GoogleDriveService,
|
||||
add_prefix: bool = False,
|
||||
) -> ExternalAccess:
|
||||
"""
|
||||
Get the external access for a raw Google Drive file.
|
||||
|
||||
Assumes the file we retrieved has EITHER `permissions` or `permission_ids`
|
||||
|
||||
add_prefix: When this method is called during the initial indexing via the connector,
|
||||
set add_prefix to True so group IDs are prefixed with the source type.
|
||||
When invoked from doc_sync (permission sync), use the default (False)
|
||||
since upsert_document_external_perms handles prefixing.
|
||||
"""
|
||||
doc_id = file.get("id")
|
||||
if not doc_id:
|
||||
@@ -164,6 +171,13 @@ def get_external_access_for_raw_gdrive_file(
|
||||
| ({drive_id} if drive_id is not None else set())
|
||||
)
|
||||
|
||||
# Prefix group IDs with source type if requested (for indexing path)
|
||||
if add_prefix:
|
||||
group_ids = {
|
||||
build_ext_group_name_for_onyx(group_id, DocumentSource.GOOGLE_DRIVE)
|
||||
for group_id in group_ids
|
||||
}
|
||||
|
||||
return ExternalAccess(
|
||||
external_user_emails=user_emails,
|
||||
external_user_group_ids=group_ids,
|
||||
@@ -175,6 +189,7 @@ def get_external_access_for_folder(
|
||||
folder: GoogleDriveFileType,
|
||||
google_domain: str,
|
||||
drive_service: GoogleDriveService,
|
||||
add_prefix: bool = False,
|
||||
) -> ExternalAccess:
|
||||
"""
|
||||
Extract ExternalAccess from a folder's permissions.
|
||||
@@ -186,6 +201,8 @@ def get_external_access_for_folder(
|
||||
folder: The folder metadata from Google Drive API (must include permissionIds field)
|
||||
google_domain: The company's Google Workspace domain (e.g., "company.com")
|
||||
drive_service: Google Drive service for fetching permission details
|
||||
add_prefix: When True, prefix group IDs with source type (for indexing path).
|
||||
When False (default), leave unprefixed (for permission sync path).
|
||||
|
||||
Returns:
|
||||
ExternalAccess with extracted permission info
|
||||
@@ -248,17 +265,25 @@ def get_external_access_for_folder(
|
||||
# If allowFileDiscovery is False, it's "link only" access
|
||||
is_public = permission.allow_file_discovery is not False
|
||||
|
||||
# Prefix group IDs with source type if requested (for indexing path)
|
||||
group_ids: set[str] = group_emails
|
||||
if add_prefix:
|
||||
group_ids = {
|
||||
build_ext_group_name_for_onyx(group_id, DocumentSource.GOOGLE_DRIVE)
|
||||
for group_id in group_emails
|
||||
}
|
||||
|
||||
return ExternalAccess(
|
||||
external_user_emails=user_emails,
|
||||
external_user_group_ids=group_emails,
|
||||
external_user_group_ids=group_ids,
|
||||
is_public=is_public,
|
||||
)
|
||||
|
||||
|
||||
def gdrive_doc_sync(
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction, # noqa: ARG001
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction, # noqa: ARG001
|
||||
callback: IndexingHeartbeatInterface | None,
|
||||
) -> Generator[ElementExternalAccess, None, None]:
|
||||
"""
|
||||
@@ -270,7 +295,12 @@ def gdrive_doc_sync(
|
||||
google_drive_connector = GoogleDriveConnector(
|
||||
**cc_pair.connector.connector_specific_config
|
||||
)
|
||||
google_drive_connector.load_credentials(cc_pair.credential.credential_json)
|
||||
credential_json = (
|
||||
cc_pair.credential.credential_json.get_value(apply_mask=False)
|
||||
if cc_pair.credential.credential_json
|
||||
else {}
|
||||
)
|
||||
google_drive_connector.load_credentials(credential_json)
|
||||
|
||||
slim_doc_generator = _get_slim_doc_generator(cc_pair, google_drive_connector)
|
||||
|
||||
|
||||
@@ -384,14 +384,19 @@ def _build_onyx_groups(
|
||||
|
||||
|
||||
def gdrive_group_sync(
|
||||
tenant_id: str,
|
||||
tenant_id: str, # noqa: ARG001
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
) -> Generator[ExternalUserGroup, None, None]:
|
||||
# Initialize connector and build credential/service objects
|
||||
google_drive_connector = GoogleDriveConnector(
|
||||
**cc_pair.connector.connector_specific_config
|
||||
)
|
||||
google_drive_connector.load_credentials(cc_pair.credential.credential_json)
|
||||
credential_json = (
|
||||
cc_pair.credential.credential_json.get_value(apply_mask=False)
|
||||
if cc_pair.credential.credential_json
|
||||
else {}
|
||||
)
|
||||
google_drive_connector.load_credentials(credential_json)
|
||||
admin_service = get_admin_service(
|
||||
google_drive_connector.creds, google_drive_connector.primary_admin_email
|
||||
)
|
||||
|
||||
@@ -17,14 +17,19 @@ JIRA_DOC_SYNC_TAG = "jira_doc_sync"
|
||||
|
||||
def jira_doc_sync(
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction, # noqa: ARG001
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
|
||||
callback: IndexingHeartbeatInterface | None = None,
|
||||
) -> Generator[ElementExternalAccess, None, None]:
|
||||
jira_connector = JiraConnector(
|
||||
**cc_pair.connector.connector_specific_config,
|
||||
)
|
||||
jira_connector.load_credentials(cc_pair.credential.credential_json)
|
||||
credential_json = (
|
||||
cc_pair.credential.credential_json.get_value(apply_mask=False)
|
||||
if cc_pair.credential.credential_json
|
||||
else {}
|
||||
)
|
||||
jira_connector.load_credentials(credential_json)
|
||||
|
||||
yield from generic_doc_sync(
|
||||
cc_pair=cc_pair,
|
||||
|
||||
@@ -102,7 +102,7 @@ def _build_group_member_email_map(
|
||||
|
||||
|
||||
def jira_group_sync(
|
||||
tenant_id: str,
|
||||
tenant_id: str, # noqa: ARG001
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
) -> Generator[ExternalUserGroup, None, None]:
|
||||
"""
|
||||
@@ -119,8 +119,13 @@ def jira_group_sync(
|
||||
if not jira_base_url:
|
||||
raise ValueError("No jira_base_url found in connector config")
|
||||
|
||||
credential_json = (
|
||||
cc_pair.credential.credential_json.get_value(apply_mask=False)
|
||||
if cc_pair.credential.credential_json
|
||||
else {}
|
||||
)
|
||||
jira_client = build_jira_client(
|
||||
credentials=cc_pair.credential.credential_json,
|
||||
credentials=credential_json,
|
||||
jira_base=jira_base_url,
|
||||
scoped_token=scoped_token,
|
||||
)
|
||||
|
||||
@@ -8,6 +8,8 @@ from ee.onyx.external_permissions.jira.models import Holder
|
||||
from ee.onyx.external_permissions.jira.models import Permission
|
||||
from ee.onyx.external_permissions.jira.models import User
|
||||
from onyx.access.models import ExternalAccess
|
||||
from onyx.access.utils import build_ext_group_name_for_onyx
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
HolderMap = dict[str, list[Holder]]
|
||||
@@ -252,7 +254,14 @@ def _build_external_access_from_holder_map(
|
||||
def get_project_permissions(
|
||||
jira_client: JIRA,
|
||||
jira_project: str,
|
||||
add_prefix: bool = False,
|
||||
) -> ExternalAccess | None:
|
||||
"""
|
||||
Get project permissions from Jira.
|
||||
|
||||
add_prefix: When True, prefix group IDs with source type (for indexing path).
|
||||
When False (default), leave unprefixed (for permission sync path).
|
||||
"""
|
||||
project_permissions: PermissionScheme = jira_client.project_permissionscheme(
|
||||
project=jira_project
|
||||
)
|
||||
@@ -267,6 +276,20 @@ def get_project_permissions(
|
||||
|
||||
holder_map = _build_holder_map(permissions=project_permissions.permissions)
|
||||
|
||||
return _build_external_access_from_holder_map(
|
||||
external_access = _build_external_access_from_holder_map(
|
||||
jira_client=jira_client, jira_project=jira_project, holder_map=holder_map
|
||||
)
|
||||
|
||||
# Prefix group IDs with source type if requested (for indexing path)
|
||||
if add_prefix and external_access and external_access.external_user_group_ids:
|
||||
prefixed_groups = {
|
||||
build_ext_group_name_for_onyx(g, DocumentSource.JIRA)
|
||||
for g in external_access.external_user_group_ids
|
||||
}
|
||||
return ExternalAccess(
|
||||
external_user_emails=external_access.external_user_emails,
|
||||
external_user_group_ids=prefixed_groups,
|
||||
is_public=external_access.is_public,
|
||||
)
|
||||
|
||||
return external_access
|
||||
|
||||
@@ -23,7 +23,7 @@ ContentRange = tuple[int, int | None] # (start_index, end_index) None means to
|
||||
|
||||
# NOTE: Used for testing timing
|
||||
def _get_dummy_object_access_map(
|
||||
object_ids: set[str], user_email: str, chunks: list[InferenceChunk]
|
||||
object_ids: set[str], user_email: str, chunks: list[InferenceChunk] # noqa: ARG001
|
||||
) -> dict[str, bool]:
|
||||
time.sleep(0.15)
|
||||
# return {object_id: True for object_id in object_ids}
|
||||
|
||||
@@ -30,7 +30,11 @@ def get_any_salesforce_client_for_doc_id(
|
||||
if _ANY_SALESFORCE_CLIENT is None:
|
||||
cc_pairs = get_cc_pairs_for_document(db_session, doc_id)
|
||||
first_cc_pair = cc_pairs[0]
|
||||
credential_json = first_cc_pair.credential.credential_json
|
||||
credential_json = (
|
||||
first_cc_pair.credential.credential_json.get_value(apply_mask=False)
|
||||
if first_cc_pair.credential.credential_json
|
||||
else {}
|
||||
)
|
||||
_ANY_SALESFORCE_CLIENT = Salesforce(
|
||||
username=credential_json["sf_username"],
|
||||
password=credential_json["sf_password"],
|
||||
@@ -158,7 +162,11 @@ def _get_salesforce_client_for_doc_id(db_session: Session, doc_id: str) -> Sales
|
||||
)
|
||||
if cc_pair is None:
|
||||
raise ValueError(f"CC pair {cc_pair_id} not found")
|
||||
credential_json = cc_pair.credential.credential_json
|
||||
credential_json = (
|
||||
cc_pair.credential.credential_json.get_value(apply_mask=False)
|
||||
if cc_pair.credential.credential_json
|
||||
else {}
|
||||
)
|
||||
_CC_PAIR_ID_SALESFORCE_CLIENT_MAP[cc_pair_id] = Salesforce(
|
||||
username=credential_json["sf_username"],
|
||||
password=credential_json["sf_password"],
|
||||
|
||||
@@ -17,14 +17,19 @@ SHAREPOINT_DOC_SYNC_TAG = "sharepoint_doc_sync"
|
||||
|
||||
def sharepoint_doc_sync(
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction, # noqa: ARG001
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
|
||||
callback: IndexingHeartbeatInterface | None = None,
|
||||
) -> Generator[ElementExternalAccess, None, None]:
|
||||
sharepoint_connector = SharepointConnector(
|
||||
**cc_pair.connector.connector_specific_config,
|
||||
)
|
||||
sharepoint_connector.load_credentials(cc_pair.credential.credential_json)
|
||||
credential_json = (
|
||||
cc_pair.credential.credential_json.get_value(apply_mask=False)
|
||||
if cc_pair.credential.credential_json
|
||||
else {}
|
||||
)
|
||||
sharepoint_connector.load_credentials(credential_json)
|
||||
|
||||
yield from generic_doc_sync(
|
||||
cc_pair=cc_pair,
|
||||
|
||||
@@ -15,7 +15,7 @@ logger = setup_logger()
|
||||
|
||||
|
||||
def sharepoint_group_sync(
|
||||
tenant_id: str,
|
||||
tenant_id: str, # noqa: ARG001
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
) -> Generator[ExternalUserGroup, None, None]:
|
||||
"""Sync SharePoint groups and their members"""
|
||||
@@ -25,7 +25,12 @@ def sharepoint_group_sync(
|
||||
|
||||
# Create SharePoint connector instance and load credentials
|
||||
connector = SharepointConnector(**connector_config)
|
||||
connector.load_credentials(cc_pair.credential.credential_json)
|
||||
credential_json = (
|
||||
cc_pair.credential.credential_json.get_value(apply_mask=False)
|
||||
if cc_pair.credential.credential_json
|
||||
else {}
|
||||
)
|
||||
connector.load_credentials(credential_json)
|
||||
|
||||
if not connector.msal_app:
|
||||
raise RuntimeError("MSAL app not initialized in connector")
|
||||
|
||||
@@ -103,7 +103,7 @@ def _fetch_channel_permissions(
|
||||
|
||||
def _get_slack_document_access(
|
||||
slack_connector: SlackConnector,
|
||||
channel_permissions: dict[str, ExternalAccess],
|
||||
channel_permissions: dict[str, ExternalAccess], # noqa: ARG001
|
||||
callback: IndexingHeartbeatInterface | None,
|
||||
) -> Generator[DocExternalAccess, None, None]:
|
||||
slim_doc_generator = slack_connector.retrieve_all_slim_docs_perm_sync(
|
||||
@@ -136,8 +136,8 @@ def _get_slack_document_access(
|
||||
|
||||
def slack_doc_sync(
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction, # noqa: ARG001
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction, # noqa: ARG001
|
||||
callback: IndexingHeartbeatInterface | None,
|
||||
) -> Generator[DocExternalAccess, None, None]:
|
||||
"""
|
||||
@@ -151,9 +151,14 @@ def slack_doc_sync(
|
||||
tenant_id = get_current_tenant_id()
|
||||
provider = OnyxDBCredentialsProvider(tenant_id, "slack", cc_pair.credential.id)
|
||||
r = get_redis_client(tenant_id=tenant_id)
|
||||
credential_json = (
|
||||
cc_pair.credential.credential_json.get_value(apply_mask=False)
|
||||
if cc_pair.credential.credential_json
|
||||
else {}
|
||||
)
|
||||
slack_client = SlackConnector.make_slack_web_client(
|
||||
provider.get_provider_key(),
|
||||
cc_pair.credential.credential_json["slack_bot_token"],
|
||||
credential_json["slack_bot_token"],
|
||||
SlackConnector.MAX_RETRIES,
|
||||
r,
|
||||
)
|
||||
|
||||
@@ -63,9 +63,14 @@ def slack_group_sync(
|
||||
|
||||
provider = OnyxDBCredentialsProvider(tenant_id, "slack", cc_pair.credential.id)
|
||||
r = get_redis_client(tenant_id=tenant_id)
|
||||
credential_json = (
|
||||
cc_pair.credential.credential_json.get_value(apply_mask=False)
|
||||
if cc_pair.credential.credential_json
|
||||
else {}
|
||||
)
|
||||
slack_client = SlackConnector.make_slack_web_client(
|
||||
provider.get_provider_key(),
|
||||
cc_pair.credential.credential_json["slack_bot_token"],
|
||||
credential_json["slack_bot_token"],
|
||||
SlackConnector.MAX_RETRIES,
|
||||
r,
|
||||
)
|
||||
|
||||
@@ -72,10 +72,10 @@ class SyncConfig(BaseModel):
|
||||
|
||||
# Mock doc sync function for testing (no-op)
|
||||
def mock_doc_sync(
|
||||
cc_pair: "ConnectorCredentialPair",
|
||||
fetch_all_docs_fn: FetchAllDocumentsFunction,
|
||||
fetch_all_docs_ids_fn: FetchAllDocumentsIdsFunction,
|
||||
callback: Optional["IndexingHeartbeatInterface"],
|
||||
cc_pair: "ConnectorCredentialPair", # noqa: ARG001
|
||||
fetch_all_docs_fn: FetchAllDocumentsFunction, # noqa: ARG001
|
||||
fetch_all_docs_ids_fn: FetchAllDocumentsIdsFunction, # noqa: ARG001
|
||||
callback: Optional["IndexingHeartbeatInterface"], # noqa: ARG001
|
||||
) -> Generator["DocExternalAccess", None, None]:
|
||||
"""Mock doc sync function for testing - returns empty list since permissions are fetched during indexing"""
|
||||
yield from []
|
||||
|
||||
@@ -18,14 +18,19 @@ TEAMS_DOC_SYNC_LABEL = "teams_doc_sync"
|
||||
|
||||
def teams_doc_sync(
|
||||
cc_pair: ConnectorCredentialPair,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction,
|
||||
fetch_all_existing_docs_fn: FetchAllDocumentsFunction, # noqa: ARG001
|
||||
fetch_all_existing_docs_ids_fn: FetchAllDocumentsIdsFunction,
|
||||
callback: IndexingHeartbeatInterface | None,
|
||||
) -> Generator[ElementExternalAccess, None, None]:
|
||||
teams_connector = TeamsConnector(
|
||||
**cc_pair.connector.connector_specific_config,
|
||||
)
|
||||
teams_connector.load_credentials(cc_pair.credential.credential_json)
|
||||
credential_json = (
|
||||
cc_pair.credential.credential_json.get_value(apply_mask=False)
|
||||
if cc_pair.credential.credential_json
|
||||
else {}
|
||||
)
|
||||
teams_connector.load_credentials(credential_json)
|
||||
|
||||
yield from generic_doc_sync(
|
||||
cc_pair=cc_pair,
|
||||
|
||||
@@ -32,6 +32,7 @@ from sqlalchemy.orm import Session
|
||||
|
||||
from ee.onyx.auth.users import current_admin_user
|
||||
from ee.onyx.db.license import get_license
|
||||
from ee.onyx.db.license import get_used_seats
|
||||
from ee.onyx.server.billing.models import BillingInformationResponse
|
||||
from ee.onyx.server.billing.models import CreateCheckoutSessionRequest
|
||||
from ee.onyx.server.billing.models import CreateCheckoutSessionResponse
|
||||
@@ -164,6 +165,16 @@ async def create_checkout_session(
|
||||
seats = request.seats if request else None
|
||||
email = request.email if request else None
|
||||
|
||||
# Validate that requested seats is not less than current used seats
|
||||
if seats is not None:
|
||||
used_seats = get_used_seats(tenant_id)
|
||||
if seats < used_seats:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Cannot subscribe with fewer seats than current usage. "
|
||||
f"You have {used_seats} active users/integrations but requested {seats} seats.",
|
||||
)
|
||||
|
||||
# Build redirect URL for after checkout completion
|
||||
redirect_url = f"{WEB_DOMAIN}/admin/billing?checkout=success"
|
||||
|
||||
@@ -265,6 +276,15 @@ async def update_seats(
|
||||
if not MULTI_TENANT and not license_data:
|
||||
raise HTTPException(status_code=400, detail="No license found")
|
||||
|
||||
# Validate that new seat count is not less than current used seats
|
||||
used_seats = get_used_seats(tenant_id)
|
||||
if request.new_seat_count < used_seats:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Cannot reduce seats below current usage. "
|
||||
f"You have {used_seats} active users/integrations but requested {request.new_seat_count} seats.",
|
||||
)
|
||||
|
||||
try:
|
||||
result = await update_seat_service(
|
||||
new_seat_count=request.new_seat_count,
|
||||
|
||||
@@ -139,7 +139,7 @@ def put_logo(
|
||||
upload_logo(file=file, is_logotype=is_logotype)
|
||||
|
||||
|
||||
def fetch_logo_helper(db_session: Session) -> Response:
|
||||
def fetch_logo_helper(db_session: Session) -> Response: # noqa: ARG001
|
||||
try:
|
||||
file_store = get_default_file_store()
|
||||
onyx_file = file_store.get_file_with_mime_type(get_logo_filename())
|
||||
@@ -155,7 +155,7 @@ def fetch_logo_helper(db_session: Session) -> Response:
|
||||
return Response(content=onyx_file.data, media_type=onyx_file.mime_type)
|
||||
|
||||
|
||||
def fetch_logotype_helper(db_session: Session) -> Response:
|
||||
def fetch_logotype_helper(db_session: Session) -> Response: # noqa: ARG001
|
||||
try:
|
||||
file_store = get_default_file_store()
|
||||
onyx_file = file_store.get_file_with_mime_type(get_logotype_filename())
|
||||
|
||||
@@ -17,7 +17,7 @@ router = APIRouter(prefix="/evals")
|
||||
@router.post("/eval_run", response_model=EvalRunAck)
|
||||
def eval_run(
|
||||
request: EvalConfigurationOptions,
|
||||
user: User = Depends(current_cloud_superuser),
|
||||
user: User = Depends(current_cloud_superuser), # noqa: ARG001
|
||||
) -> EvalRunAck:
|
||||
"""
|
||||
Run an evaluation with the given message and optional dataset.
|
||||
|
||||
@@ -42,6 +42,20 @@ logger = setup_logger()

router = APIRouter(prefix="/license")

# PEM-style delimiters used in license file format
_PEM_BEGIN = "-----BEGIN ONYX LICENSE-----"
_PEM_END = "-----END ONYX LICENSE-----"


def _strip_pem_delimiters(content: str) -> str:
    """Strip PEM-style delimiters from license content if present."""
    content = content.strip()
    if content.startswith(_PEM_BEGIN) and content.endswith(_PEM_END):
        # Remove first and last lines (the delimiters)
        lines = content.split("\n")
        return "\n".join(lines[1:-1]).strip()
    return content


@router.get("")
async def get_license_status(
@@ -106,6 +120,11 @@ async def claim_license(
    - Updating seats via the billing API
    - Returning from the Stripe customer portal
    - Any operation that regenerates the license on control plane
    Claim a license from the control plane (self-hosted only).

    Two modes:
    1. With session_id: After Stripe checkout, exchange session_id for license
    2. Without session_id: Re-claim using existing license for auth
    """
    if MULTI_TENANT:
        raise HTTPException(
@@ -210,6 +229,10 @@ async def upload_license(
    try:
        content = await license_file.read()
        license_data = content.decode("utf-8").strip()
        # Strip PEM-style delimiters if present (used in .lic file format)
        license_data = _strip_pem_delimiters(license_data)
        # Remove any stray whitespace/newlines from user input
        license_data = license_data.strip()
    except UnicodeDecodeError:
        raise HTTPException(status_code=400, detail="Invalid license file format")
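The upload path above strips PEM-style delimiters before validating the license payload. A self-contained round trip of that convention; `wrap_license` is illustrative only and is not part of the diff:

```python
_PEM_BEGIN = "-----BEGIN ONYX LICENSE-----"
_PEM_END = "-----END ONYX LICENSE-----"


def wrap_license(payload: str) -> str:
    # Illustrative writer for the .lic format; the diff only ever strips.
    return f"{_PEM_BEGIN}\n{payload.strip()}\n{_PEM_END}\n"


def strip_pem_delimiters(content: str) -> str:
    content = content.strip()
    if content.startswith(_PEM_BEGIN) and content.endswith(_PEM_END):
        lines = content.split("\n")
        return "\n".join(lines[1:-1]).strip()
    return content


assert strip_pem_delimiters(wrap_license("abc123")) == "abc123"
assert strip_pem_delimiters("abc123") == "abc123"  # bare payloads pass through unchanged
```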
|
||||
|
||||
@@ -260,7 +260,7 @@ def confluence_oauth_accessible_resources(
|
||||
credential_id: int,
|
||||
user: User = Depends(current_admin_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
tenant_id: str | None = Depends(get_current_tenant_id),
|
||||
tenant_id: str | None = Depends(get_current_tenant_id), # noqa: ARG001
|
||||
) -> JSONResponse:
|
||||
"""Atlassian's API is weird and does not supply us with enough info to be in a
|
||||
usable state after authorizing. All API's require a cloud id. We have to list
|
||||
@@ -270,7 +270,11 @@ def confluence_oauth_accessible_resources(
|
||||
if not credential:
|
||||
raise HTTPException(400, f"Credential {credential_id} not found.")
|
||||
|
||||
credential_dict = credential.credential_json
|
||||
credential_dict = (
|
||||
credential.credential_json.get_value(apply_mask=False)
|
||||
if credential.credential_json
|
||||
else {}
|
||||
)
|
||||
access_token = credential_dict["confluence_access_token"]
|
||||
|
||||
try:
|
||||
@@ -323,7 +327,7 @@ def confluence_oauth_finalize(
|
||||
cloud_url: str,
|
||||
user: User = Depends(current_admin_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
tenant_id: str | None = Depends(get_current_tenant_id),
|
||||
tenant_id: str | None = Depends(get_current_tenant_id), # noqa: ARG001
|
||||
) -> JSONResponse:
|
||||
"""Saves the info for the selected cloud site to the credential.
|
||||
This is the final step in the confluence oauth flow where after the traditional
|
||||
@@ -337,7 +341,12 @@ def confluence_oauth_finalize(
|
||||
detail=f"Confluence Cloud OAuth failed - credential {credential_id} not found.",
|
||||
)
|
||||
|
||||
new_credential_json: dict[str, Any] = dict(credential.credential_json)
|
||||
existing_credential_json = (
|
||||
credential.credential_json.get_value(apply_mask=False)
|
||||
if credential.credential_json
|
||||
else {}
|
||||
)
|
||||
new_credential_json: dict[str, Any] = dict(existing_credential_json)
|
||||
new_credential_json["cloud_id"] = cloud_id
|
||||
new_credential_json["cloud_name"] = cloud_name
|
||||
new_credential_json["wiki_base"] = cloud_url
|
||||
|
||||
@@ -78,7 +78,7 @@ def fetch_and_process_chat_session_history(
|
||||
db_session: Session,
|
||||
start: datetime,
|
||||
end: datetime,
|
||||
limit: int | None = 500,
|
||||
limit: int | None = 500, # noqa: ARG001
|
||||
) -> Generator[ChatSessionSnapshot]:
|
||||
PAGE_SIZE = 100
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ def generate_report(
|
||||
def read_usage_report(
|
||||
report_name: str,
|
||||
_: User = Depends(current_admin_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
db_session: Session = Depends(get_session), # noqa: ARG001
|
||||
) -> Response:
|
||||
try:
|
||||
file = get_usage_report_data(report_name)
|
||||
|
||||
@@ -123,14 +123,9 @@ def _seed_llms(
|
||||
upsert_llm_provider(llm_upsert_request, db_session)
|
||||
for llm_upsert_request in llm_upsert_requests
|
||||
]
|
||||
|
||||
if len(seeded_providers[0].model_configurations) > 0:
|
||||
default_model = seeded_providers[0].model_configurations[0].name
|
||||
update_default_provider(
|
||||
provider_id=seeded_providers[0].id,
|
||||
model_name=default_model,
|
||||
db_session=db_session,
|
||||
)
|
||||
update_default_provider(
|
||||
provider_id=seeded_providers[0].id, db_session=db_session
|
||||
)
|
||||
|
||||
|
||||
def _seed_personas(db_session: Session, personas: list[PersonaUpsertRequest]) -> None:
|
||||
|
||||
@@ -58,26 +58,42 @@ def apply_license_status_to_settings(settings: Settings) -> Settings:
|
||||
For self-hosted, looks up license metadata and overrides application_status
|
||||
if the license indicates GATED_ACCESS (fully expired).
|
||||
|
||||
Also sets ee_features_enabled based on license status to control
|
||||
visibility of EE features in the UI.
|
||||
|
||||
For multi-tenant (cloud), the settings already have the correct status
|
||||
from the control plane, so no override is needed.
|
||||
|
||||
If LICENSE_ENFORCEMENT_ENABLED is false, settings are returned unchanged,
|
||||
allowing the product to function normally without license checks.
|
||||
If LICENSE_ENFORCEMENT_ENABLED is false, ee_features_enabled is set to True
|
||||
(since EE code was loaded via ENABLE_PAID_ENTERPRISE_EDITION_FEATURES).
|
||||
"""
|
||||
if not LICENSE_ENFORCEMENT_ENABLED:
|
||||
# License enforcement disabled - EE code is loaded via
|
||||
# ENABLE_PAID_ENTERPRISE_EDITION_FEATURES, so EE features are on
|
||||
settings.ee_features_enabled = True
|
||||
return settings
|
||||
|
||||
if MULTI_TENANT:
|
||||
# Cloud mode - EE features always available (gating handled by is_tenant_gated)
|
||||
settings.ee_features_enabled = True
|
||||
return settings
|
||||
|
||||
tenant_id = get_current_tenant_id()
|
||||
try:
|
||||
metadata = get_cached_license_metadata(tenant_id)
|
||||
if metadata and metadata.status == _BLOCKING_STATUS:
|
||||
settings.application_status = metadata.status
|
||||
# No license = user hasn't purchased yet, allow access for upgrade flow
|
||||
# GRACE_PERIOD/PAYMENT_REMINDER don't block - they're for notifications
|
||||
if metadata:
|
||||
if metadata.status == _BLOCKING_STATUS:
|
||||
settings.application_status = metadata.status
|
||||
settings.ee_features_enabled = False
|
||||
else:
|
||||
# Has a valid license (GRACE_PERIOD/PAYMENT_REMINDER still allow EE features)
|
||||
settings.ee_features_enabled = True
|
||||
else:
|
||||
# No license = community edition, disable EE features
|
||||
settings.ee_features_enabled = False
|
||||
except RedisError as e:
|
||||
logger.warning(f"Failed to check license metadata for settings: {e}")
|
||||
# Fail closed - disable EE features if we can't verify license
|
||||
settings.ee_features_enabled = False
|
||||
|
||||
return settings
|
||||
|
||||
@@ -19,6 +19,7 @@ logger = setup_logger()
def fetch_stripe_checkout_session(
    tenant_id: str,
    billing_period: Literal["monthly", "annual"] = "monthly",
    seats: int | None = None,
) -> str:
    token = generate_data_plane_token()
    headers = {
@@ -29,10 +30,23 @@ def fetch_stripe_checkout_session(
    payload = {
        "tenant_id": tenant_id,
        "billing_period": billing_period,
        "seats": seats,
    }
    response = requests.post(url, headers=headers, json=payload)
    response.raise_for_status()
    return response.json()["sessionId"]
    if not response.ok:
        try:
            data = response.json()
            error_msg = (
                data.get("error")
                or f"Request failed with status {response.status_code}"
            )
        except (ValueError, requests.exceptions.JSONDecodeError):
            error_msg = f"Request failed with status {response.status_code}: {response.text[:200]}"
        raise Exception(error_msg)
    data = response.json()
    if data.get("error"):
        raise Exception(data["error"])
    return data["sessionId"]
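The rewritten call above replaces a bare `raise_for_status()` with explicit error extraction so the control plane's own error message reaches the caller. A generic sketch of the same pattern against a placeholder endpoint (the URL, payload, and exception type here are not taken from the diff):

```python
import requests


def post_and_extract(url: str, payload: dict, headers: dict | None = None) -> dict:
    """Prefer the API's own "error" field, fall back to the HTTP status, and
    never assume the failure body is JSON."""
    response = requests.post(url, headers=headers, json=payload, timeout=30)
    if not response.ok:
        try:
            error_msg = response.json().get("error") or (
                f"Request failed with status {response.status_code}"
            )
        except ValueError:  # body was not JSON
            error_msg = (
                f"Request failed with status {response.status_code}: {response.text[:200]}"
            )
        raise RuntimeError(error_msg)
    data = response.json()
    if data.get("error"):
        raise RuntimeError(data["error"])
    return data
```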
|
||||
|
||||
def fetch_tenant_stripe_information(tenant_id: str) -> dict:
|
||||
@@ -51,7 +65,6 @@ def fetch_tenant_stripe_information(tenant_id: str) -> dict:
|
||||
def fetch_billing_information(
|
||||
tenant_id: str,
|
||||
) -> BillingInformation | SubscriptionStatusResponse:
|
||||
logger.info("Fetching billing information")
|
||||
token = generate_data_plane_token()
|
||||
headers = {
|
||||
"Authorization": f"Bearer {token}",
|
||||
|
||||
@@ -29,6 +29,7 @@ from ee.onyx.server.tenants.billing import fetch_billing_information
|
||||
from ee.onyx.server.tenants.billing import fetch_customer_portal_session
|
||||
from ee.onyx.server.tenants.billing import fetch_stripe_checkout_session
|
||||
from ee.onyx.server.tenants.models import BillingInformation
|
||||
from ee.onyx.server.tenants.models import CreateCheckoutSessionRequest
|
||||
from ee.onyx.server.tenants.models import CreateSubscriptionSessionRequest
|
||||
from ee.onyx.server.tenants.models import ProductGatingFullSyncRequest
|
||||
from ee.onyx.server.tenants.models import ProductGatingRequest
|
||||
@@ -114,12 +115,30 @@ async def create_customer_portal_session(
|
||||
|
||||
try:
|
||||
portal_url = fetch_customer_portal_session(tenant_id, return_url)
|
||||
return {"url": portal_url}
|
||||
return {"stripe_customer_portal_url": portal_url}
|
||||
except Exception as e:
|
||||
logger.exception("Failed to create customer portal session")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/create-checkout-session")
|
||||
async def create_checkout_session(
|
||||
request: CreateCheckoutSessionRequest | None = None,
|
||||
_: User = Depends(current_admin_user),
|
||||
) -> dict:
|
||||
"""Create a Stripe checkout session via the control plane."""
|
||||
tenant_id = get_current_tenant_id()
|
||||
billing_period = request.billing_period if request else "monthly"
|
||||
seats = request.seats if request else None
|
||||
|
||||
try:
|
||||
checkout_url = fetch_stripe_checkout_session(tenant_id, billing_period, seats)
|
||||
return {"stripe_checkout_url": checkout_url}
|
||||
except Exception as e:
|
||||
logger.exception("Failed to create checkout session")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/create-subscription-session")
|
||||
async def create_subscription_session(
|
||||
request: CreateSubscriptionSessionRequest | None = None,
|
||||
|
||||
@@ -42,6 +42,12 @@ class BillingInformation(BaseModel):
|
||||
payment_method_enabled: bool
|
||||
|
||||
|
||||
class CreateCheckoutSessionRequest(BaseModel):
|
||||
billing_period: Literal["monthly", "annual"] = "monthly"
|
||||
seats: int | None = None
|
||||
email: str | None = None
|
||||
|
||||
|
||||
class CheckoutSessionCreationResponse(BaseModel):
|
||||
id: str
|
||||
|
||||
|
||||
@@ -121,7 +121,9 @@ async def get_or_provision_tenant(
|
||||
)
|
||||
|
||||
|
||||
async def create_tenant(email: str, referral_source: str | None = None) -> str:
|
||||
async def create_tenant(
|
||||
email: str, referral_source: str | None = None # noqa: ARG001
|
||||
) -> str:
|
||||
"""
|
||||
Create a new tenant on-demand when no pre-provisioned tenants are available.
|
||||
This is the fallback method when we can't use a pre-provisioned tenant.
|
||||
@@ -300,12 +302,12 @@ def configure_default_api_keys(db_session: Session) -> None:
|
||||
|
||||
has_set_default_provider = False
|
||||
|
||||
def _upsert(request: LLMProviderUpsertRequest, default_model: str) -> None:
|
||||
def _upsert(request: LLMProviderUpsertRequest) -> None:
|
||||
nonlocal has_set_default_provider
|
||||
try:
|
||||
provider = upsert_llm_provider(request, db_session)
|
||||
if not has_set_default_provider:
|
||||
update_default_provider(provider.id, default_model, db_session)
|
||||
update_default_provider(provider.id, db_session)
|
||||
has_set_default_provider = True
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to configure {request.provider} provider: {e}")
|
||||
@@ -323,13 +325,14 @@ def configure_default_api_keys(db_session: Session) -> None:
|
||||
name="OpenAI",
|
||||
provider=OPENAI_PROVIDER_NAME,
|
||||
api_key=OPENAI_DEFAULT_API_KEY,
|
||||
default_model_name=default_model_name,
|
||||
model_configurations=_build_model_configuration_upsert_requests(
|
||||
OPENAI_PROVIDER_NAME, recommendations
|
||||
),
|
||||
api_key_changed=True,
|
||||
is_auto_mode=True,
|
||||
)
|
||||
_upsert(openai_provider, default_model_name)
|
||||
_upsert(openai_provider)
|
||||
|
||||
# Create default image generation config using the OpenAI API key
|
||||
try:
|
||||
@@ -358,13 +361,14 @@ def configure_default_api_keys(db_session: Session) -> None:
|
||||
name="Anthropic",
|
||||
provider=ANTHROPIC_PROVIDER_NAME,
|
||||
api_key=ANTHROPIC_DEFAULT_API_KEY,
|
||||
default_model_name=default_model_name,
|
||||
model_configurations=_build_model_configuration_upsert_requests(
|
||||
ANTHROPIC_PROVIDER_NAME, recommendations
|
||||
),
|
||||
api_key_changed=True,
|
||||
is_auto_mode=True,
|
||||
)
|
||||
_upsert(anthropic_provider, default_model_name)
|
||||
_upsert(anthropic_provider)
|
||||
else:
|
||||
logger.info(
|
||||
"ANTHROPIC_DEFAULT_API_KEY not set, skipping Anthropic provider configuration"
|
||||
@@ -389,13 +393,14 @@ def configure_default_api_keys(db_session: Session) -> None:
|
||||
name="Google Vertex AI",
|
||||
provider=VERTEXAI_PROVIDER_NAME,
|
||||
custom_config=custom_config,
|
||||
default_model_name=default_model_name,
|
||||
model_configurations=_build_model_configuration_upsert_requests(
|
||||
VERTEXAI_PROVIDER_NAME, recommendations
|
||||
),
|
||||
api_key_changed=True,
|
||||
is_auto_mode=True,
|
||||
)
|
||||
_upsert(vertexai_provider, default_model_name)
|
||||
_upsert(vertexai_provider)
|
||||
else:
|
||||
logger.info(
|
||||
"VERTEXAI_DEFAULT_CREDENTIALS not set, skipping Vertex AI provider configuration"
|
||||
@@ -427,11 +432,12 @@ def configure_default_api_keys(db_session: Session) -> None:
|
||||
name="OpenRouter",
|
||||
provider=OPENROUTER_PROVIDER_NAME,
|
||||
api_key=OPENROUTER_DEFAULT_API_KEY,
|
||||
default_model_name=default_model_name,
|
||||
model_configurations=model_configurations,
|
||||
api_key_changed=True,
|
||||
is_auto_mode=True,
|
||||
)
|
||||
_upsert(openrouter_provider, default_model_name)
|
||||
_upsert(openrouter_provider)
|
||||
else:
|
||||
logger.info(
|
||||
"OPENROUTER_DEFAULT_API_KEY not set, skipping OpenRouter provider configuration"
|
||||
@@ -671,7 +677,7 @@ async def setup_tenant(tenant_id: str) -> None:
|
||||
|
||||
|
||||
async def assign_tenant_to_user(
|
||||
tenant_id: str, email: str, referral_source: str | None = None
|
||||
tenant_id: str, email: str, referral_source: str | None = None # noqa: ARG001
|
||||
) -> None:
|
||||
"""
|
||||
Assign a tenant to a user and perform necessary operations.
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from types import SimpleNamespace
|
||||
|
||||
from sqlalchemy import text
|
||||
@@ -10,9 +11,30 @@ from alembic import command
|
||||
from alembic.config import Config
|
||||
from onyx.db.engine.sql_engine import build_connection_string
|
||||
from onyx.db.engine.sql_engine import get_sqlalchemy_engine
|
||||
from shared_configs.configs import TENANT_ID_PREFIX
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Regex pattern for valid tenant IDs:
# - UUID format: tenant_xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
# - AWS instance ID format: tenant_i-xxxxxxxxxxxxxxxxx
# Also useful for not accidentally dropping `public` schema
TENANT_ID_PATTERN = re.compile(
    rf"^{re.escape(TENANT_ID_PREFIX)}("
    r"[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}"  # UUID
    r"|i-[a-f0-9]+"  # AWS instance ID
    r")$"
)


def validate_tenant_id(tenant_id: str) -> bool:
    """Validate that tenant_id matches expected format.

    This is important for SQL injection prevention since schema names
    cannot be parameterized in SQL and must be formatted directly.
    """
    return bool(TENANT_ID_PATTERN.match(tenant_id))


def run_alembic_migrations(schema_name: str) -> None:
    logger.info(f"Starting Alembic migrations for schema: {schema_name}")
@@ -67,13 +89,18 @@ def create_schema_if_not_exists(tenant_id: str) -> bool:


def drop_schema(tenant_id: str) -> None:
    if not tenant_id.isidentifier():
        raise ValueError("Invalid tenant_id.")
    """Drop a tenant's schema.

    Uses strict regex validation to reject unexpected formats early,
    preventing SQL injection since schema names cannot be parameterized.
    """
    if not validate_tenant_id(tenant_id):
        raise ValueError(f"Invalid tenant_id format: {tenant_id}")

    with get_sqlalchemy_engine().connect() as connection:
        connection.execute(
            text("DROP SCHEMA IF EXISTS %(schema_name)s CASCADE"),
            {"schema_name": tenant_id},
        )
        with connection.begin():
            # Use string formatting with validated tenant_id (safe after validation)
            connection.execute(text(f'DROP SCHEMA IF EXISTS "{tenant_id}" CASCADE'))
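A self-contained version of the validation above, with the prefix hard-coded to `tenant_` purely for illustration (the real value comes from shared configs), plus a few usage checks:

```python
import re

TENANT_ID_PREFIX = "tenant_"  # assumption for this sketch

TENANT_ID_PATTERN = re.compile(
    rf"^{re.escape(TENANT_ID_PREFIX)}("
    r"[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}"  # UUID
    r"|i-[a-f0-9]+"  # AWS instance ID
    r")$"
)


def validate_tenant_id(tenant_id: str) -> bool:
    return bool(TENANT_ID_PATTERN.match(tenant_id))


assert validate_tenant_id("tenant_123e4567-e89b-42d3-a456-426614174000")
assert validate_tenant_id("tenant_i-0abc123def456")
assert not validate_tenant_id("public")  # never matches, so the public schema stays safe
assert not validate_tenant_id('tenant_x"; DROP SCHEMA public CASCADE; --')
```

Only after this check passes is the tenant ID interpolated into the `DROP SCHEMA` statement, since schema names cannot be bound as query parameters.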
|
||||
|
||||
def get_current_alembic_version(tenant_id: str) -> str:
|
||||
|
||||
@@ -319,11 +319,13 @@ def get_tenant_count(tenant_id: str) -> int:
|
||||
A user counts toward the seat count if:
|
||||
1. They have an active mapping to this tenant (UserTenantMapping.active == True)
|
||||
2. AND the User is active (User.is_active == True)
|
||||
3. AND the User is not the anonymous system user
|
||||
|
||||
TODO: Exclude API key dummy users from seat counting. API keys create
|
||||
users with emails like `__DANSWER_API_KEY_*` that should not count toward
|
||||
seat limits. See: https://linear.app/onyx-app/issue/ENG-3518
|
||||
"""
|
||||
from onyx.configs.constants import ANONYMOUS_USER_EMAIL
|
||||
from onyx.db.models import User
|
||||
|
||||
# First get all emails with active mappings to this tenant
|
||||
@@ -333,6 +335,7 @@ def get_tenant_count(tenant_id: str) -> int:
|
||||
.filter(
|
||||
UserTenantMapping.tenant_id == tenant_id,
|
||||
UserTenantMapping.active == True, # noqa: E712
|
||||
UserTenantMapping.email != ANONYMOUS_USER_EMAIL,
|
||||
)
|
||||
.all()
|
||||
)
|
||||
|
||||
@@ -96,7 +96,7 @@ def get_access_for_documents(
|
||||
return versioned_get_access_for_documents_fn(document_ids, db_session)
|
||||
|
||||
|
||||
def _get_acl_for_user(user: User, db_session: Session) -> set[str]:
|
||||
def _get_acl_for_user(user: User, db_session: Session) -> set[str]: # noqa: ARG001
|
||||
"""Returns a list of ACL entries that the user has access to. This is meant to be
|
||||
used downstream to filter out documents that the user does not have access to. The
|
||||
user should have access to a document if at least one entry in the document's ACL
|
||||
|
||||
@@ -4,7 +4,9 @@ from onyx.db.models import User
|
||||
from onyx.utils.variable_functionality import fetch_versioned_implementation
|
||||
|
||||
|
||||
def _get_user_external_group_ids(db_session: Session, user: User) -> list[str]:
|
||||
def _get_user_external_group_ids(
|
||||
db_session: Session, user: User # noqa: ARG001
|
||||
) -> list[str]:
|
||||
return []
|
||||
|
||||
|
||||
|
||||
@@ -30,7 +30,7 @@ REFRESH_ENDPOINTS = {
|
||||
async def _test_expire_oauth_token(
|
||||
user: User,
|
||||
oauth_account: OAuthAccount,
|
||||
db_session: AsyncSession,
|
||||
db_session: AsyncSession, # noqa: ARG001
|
||||
user_manager: BaseUserManager[User, Any],
|
||||
expire_in_seconds: int = 10,
|
||||
) -> bool:
|
||||
@@ -59,7 +59,7 @@ async def _test_expire_oauth_token(
|
||||
async def refresh_oauth_token(
|
||||
user: User,
|
||||
oauth_account: OAuthAccount,
|
||||
db_session: AsyncSession,
|
||||
db_session: AsyncSession, # noqa: ARG001
|
||||
user_manager: BaseUserManager[User, Any],
|
||||
) -> bool:
|
||||
"""
|
||||
@@ -182,7 +182,7 @@ async def check_and_refresh_oauth_tokens(
|
||||
|
||||
|
||||
async def check_oauth_account_has_refresh_token(
|
||||
user: User,
|
||||
user: User, # noqa: ARG001
|
||||
oauth_account: OAuthAccount,
|
||||
) -> bool:
|
||||
"""
|
||||
|
||||
@@ -11,6 +11,7 @@ from onyx.db.models import OAuthUserToken
|
||||
from onyx.db.oauth_config import get_user_oauth_token
|
||||
from onyx.db.oauth_config import upsert_user_oauth_token
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.sensitive import SensitiveValue
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
@@ -33,7 +34,10 @@ class OAuthTokenManager:
|
||||
if not user_token:
|
||||
return None
|
||||
|
||||
token_data = user_token.token_data
|
||||
if not user_token.token_data:
|
||||
return None
|
||||
|
||||
token_data = self._unwrap_token_data(user_token.token_data)
|
||||
|
||||
# Check if token is expired
|
||||
if OAuthTokenManager.is_token_expired(token_data):
|
||||
@@ -51,7 +55,10 @@ class OAuthTokenManager:
|
||||
|
||||
def refresh_token(self, user_token: OAuthUserToken) -> str:
|
||||
"""Refresh access token using refresh token"""
|
||||
token_data = user_token.token_data
|
||||
if not user_token.token_data:
|
||||
raise ValueError("No token data available for refresh")
|
||||
|
||||
token_data = self._unwrap_token_data(user_token.token_data)
|
||||
|
||||
response = requests.post(
|
||||
self.oauth_config.token_url,
|
||||
@@ -153,3 +160,11 @@ class OAuthTokenManager:
|
||||
separator = "&" if "?" in oauth_config.authorization_url else "?"
|
||||
|
||||
return f"{oauth_config.authorization_url}{separator}{urlencode(params)}"
|
||||
|
||||
@staticmethod
|
||||
def _unwrap_token_data(
|
||||
token_data: SensitiveValue[dict[str, Any]] | dict[str, Any],
|
||||
) -> dict[str, Any]:
|
||||
if isinstance(token_data, SensitiveValue):
|
||||
return token_data.get_value(apply_mask=False)
|
||||
return token_data
|
||||
|
||||
@@ -58,3 +58,4 @@ class UserUpdate(schemas.BaseUserUpdate):
|
||||
class AuthBackend(str, Enum):
|
||||
REDIS = "redis"
|
||||
POSTGRES = "postgres"
|
||||
JWT = "jwt"
|
||||
|
||||
@@ -38,6 +38,7 @@ from fastapi_users import schemas
|
||||
from fastapi_users import UUIDIDMixin
|
||||
from fastapi_users.authentication import AuthenticationBackend
|
||||
from fastapi_users.authentication import CookieTransport
|
||||
from fastapi_users.authentication import JWTStrategy
|
||||
from fastapi_users.authentication import RedisStrategy
|
||||
from fastapi_users.authentication import Strategy
|
||||
from fastapi_users.authentication.strategy.db import AccessTokenDatabase
|
||||
@@ -780,7 +781,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
)
|
||||
|
||||
async def on_after_forgot_password(
|
||||
self, user: User, token: str, request: Optional[Request] = None
|
||||
self, user: User, token: str, request: Optional[Request] = None # noqa: ARG002
|
||||
) -> None:
|
||||
if not EMAIL_CONFIGURED:
|
||||
logger.error(
|
||||
@@ -799,7 +800,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
send_forgot_password_email(user.email, tenant_id=tenant_id, token=token)
|
||||
|
||||
async def on_after_request_verify(
|
||||
self, user: User, token: str, request: Optional[Request] = None
|
||||
self, user: User, token: str, request: Optional[Request] = None # noqa: ARG002
|
||||
) -> None:
|
||||
verify_email_domain(user.email)
|
||||
|
||||
@@ -983,7 +984,7 @@ class TenantAwareRedisStrategy(RedisStrategy[User, uuid.UUID]):
|
||||
except (exceptions.UserNotExists, exceptions.InvalidID, KeyError):
|
||||
return None
|
||||
|
||||
async def destroy_token(self, token: str, user: User) -> None:
|
||||
async def destroy_token(self, token: str, user: User) -> None: # noqa: ARG002
|
||||
"""Properly delete the token from async redis."""
|
||||
redis = await get_async_redis_connection()
|
||||
await redis.delete(f"{self.key_prefix}{token}")
|
||||
@@ -1046,6 +1047,61 @@ class RefreshableDatabaseStrategy(DatabaseStrategy[User, uuid.UUID, AccessToken]
|
||||
return token
|
||||
|
||||
|
||||
class SingleTenantJWTStrategy(JWTStrategy[User, uuid.UUID]):
    """Stateless JWT strategy for single-tenant deployments.

    Tokens are self-contained and verified via signature — no Redis or DB
    lookup required per request. An ``iat`` claim is embedded so that
    downstream code can determine when the token was created without
    querying an external store.

    Refresh is implemented by issuing a brand-new JWT (the old one remains
    valid until its natural expiry). ``destroy_token`` is a no-op because
    JWTs cannot be server-side invalidated.
    """

    def __init__(
        self,
        secret: SecretType,
        lifetime_seconds: int | None = SESSION_EXPIRE_TIME_SECONDS,
        token_audience: list[str] | None = None,
        algorithm: str = "HS256",
        public_key: SecretType | None = None,
    ):
        super().__init__(
            secret=secret,
            lifetime_seconds=lifetime_seconds,
            token_audience=token_audience or ["fastapi-users:auth"],
            algorithm=algorithm,
            public_key=public_key,
        )

    async def write_token(self, user: User) -> str:
        data = {
            "sub": str(user.id),
            "aud": self.token_audience,
            "iat": int(datetime.now(timezone.utc).timestamp()),
        }
        return generate_jwt(
            data, self.encode_key, self.lifetime_seconds, algorithm=self.algorithm
        )

    async def destroy_token(self, token: str, user: User) -> None:  # noqa: ARG002
        # JWTs are stateless — nothing to invalidate server-side.
        # NOTE: a compromise that makes JWT auth stateful but revocable
        # is to include a token_version claim in the JWT payload. The token_version
        # is incremented whenever the user logs out (or gets login revoked). Whenever
        # the JWT is used, it is only valid if the token_version claim is the same as the one
        # in the db. If not, the JWT is invalid and the user needs to login again.
        return

    async def refresh_token(
        self, token: Optional[str], user: User  # noqa: ARG002
    ) -> str:
        """Issue a fresh JWT with a new expiry."""
        return await self.write_token(user)
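The `destroy_token` comment above sketches a token_version compromise for making stateless JWTs revocable. A hedged illustration of that idea using PyJWT, which is an assumption here (the class itself goes through fastapi-users' JWT helpers):

```python
from datetime import datetime, timedelta, timezone

import jwt  # PyJWT, used here only for illustration

SECRET = "change-me"
AUDIENCE = "fastapi-users:auth"


def issue_token(user_id: str, token_version: int, lifetime_seconds: int = 3600) -> str:
    now = datetime.now(timezone.utc)
    payload = {
        "sub": user_id,
        "aud": AUDIENCE,
        "iat": int(now.timestamp()),
        "exp": int((now + timedelta(seconds=lifetime_seconds)).timestamp()),
        "token_version": token_version,
    }
    return jwt.encode(payload, SECRET, algorithm="HS256")


def is_token_valid(token: str, current_version_in_db: int) -> bool:
    try:
        claims = jwt.decode(token, SECRET, algorithms=["HS256"], audience=AUDIENCE)
    except jwt.InvalidTokenError:
        return False
    # Bumping the stored version on logout invalidates every previously issued JWT.
    return claims.get("token_version") == current_version_in_db


token = issue_token("user-123", token_version=1)
assert is_token_valid(token, current_version_in_db=1)
assert not is_token_valid(token, current_version_in_db=2)  # "logged out" -> revoked
```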
|
||||
|
||||
def get_redis_strategy() -> TenantAwareRedisStrategy:
|
||||
return TenantAwareRedisStrategy()
|
||||
|
||||
@@ -1058,6 +1114,22 @@ def get_database_strategy(
|
||||
)
|
||||
|
||||
|
||||
def get_jwt_strategy() -> SingleTenantJWTStrategy:
|
||||
return SingleTenantJWTStrategy(
|
||||
secret=USER_AUTH_SECRET,
|
||||
lifetime_seconds=SESSION_EXPIRE_TIME_SECONDS,
|
||||
)
|
||||
|
||||
|
||||
if AUTH_BACKEND == AuthBackend.JWT:
|
||||
if MULTI_TENANT or AUTH_TYPE == AuthType.CLOUD:
|
||||
raise ValueError(
|
||||
"JWT auth backend is only supported for single-tenant, self-hosted deployments. "
|
||||
"Use 'redis' or 'postgres' instead."
|
||||
)
|
||||
if not USER_AUTH_SECRET:
|
||||
raise ValueError("USER_AUTH_SECRET is required for JWT auth backend.")
|
||||
|
||||
if AUTH_BACKEND == AuthBackend.REDIS:
|
||||
auth_backend = AuthenticationBackend(
|
||||
name="redis", transport=cookie_transport, get_strategy=get_redis_strategy
|
||||
@@ -1066,6 +1138,10 @@ elif AUTH_BACKEND == AuthBackend.POSTGRES:
|
||||
auth_backend = AuthenticationBackend(
|
||||
name="postgres", transport=cookie_transport, get_strategy=get_database_strategy
|
||||
)
|
||||
elif AUTH_BACKEND == AuthBackend.JWT:
|
||||
auth_backend = AuthenticationBackend(
|
||||
name="jwt", transport=cookie_transport, get_strategy=get_jwt_strategy
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Invalid auth backend: {AUTH_BACKEND}")
|
||||
|
||||
@@ -1328,14 +1404,6 @@ async def optional_user(
|
||||
user: User | None = Depends(optional_fastapi_current_user),
|
||||
) -> User | None:
|
||||
|
||||
tenant_id = get_current_tenant_id()
|
||||
if (
|
||||
user is not None
|
||||
and user.is_anonymous
|
||||
and anonymous_user_enabled(tenant_id=tenant_id)
|
||||
):
|
||||
return get_anonymous_user()
|
||||
|
||||
if user := await _check_for_saml_and_jwt(request, user, async_db_session):
|
||||
# If user is already set, _check_for_saml_and_jwt returns the same user object
|
||||
return user
|
||||
|
||||
@@ -43,7 +43,7 @@ from onyx.redis.redis_connector_prune import RedisConnectorPrune
|
||||
from onyx.redis.redis_document_set import RedisDocumentSet
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.redis.redis_usergroup import RedisUserGroup
|
||||
from onyx.tracing.braintrust_tracing import setup_braintrust_if_creds_available
|
||||
from onyx.tracing.setup import setup_tracing
|
||||
from onyx.utils.logger import ColoredFormatter
|
||||
from onyx.utils.logger import LoggerContextVars
|
||||
from onyx.utils.logger import PlainFormatter
|
||||
@@ -93,12 +93,12 @@ class TenantAwareTask(Task):
|
||||
|
||||
@task_prerun.connect
|
||||
def on_task_prerun(
|
||||
sender: Any | None = None,
|
||||
task_id: str | None = None,
|
||||
task: Task | None = None,
|
||||
args: tuple[Any, ...] | None = None,
|
||||
kwargs: dict[str, Any] | None = None,
|
||||
**other_kwargs: Any,
|
||||
sender: Any | None = None, # noqa: ARG001
|
||||
task_id: str | None = None, # noqa: ARG001
|
||||
task: Task | None = None, # noqa: ARG001
|
||||
args: tuple[Any, ...] | None = None, # noqa: ARG001
|
||||
kwargs: dict[str, Any] | None = None, # noqa: ARG001
|
||||
**other_kwargs: Any, # noqa: ARG001
|
||||
) -> None:
|
||||
# Reset any per-task logging context so that prefixes (e.g. pruning_ctx)
|
||||
# from a previous task executed in the same worker process do not leak
|
||||
@@ -110,14 +110,14 @@ def on_task_prerun(
|
||||
|
||||
|
||||
def on_task_postrun(
|
||||
sender: Any | None = None,
|
||||
sender: Any | None = None, # noqa: ARG001
|
||||
task_id: str | None = None,
|
||||
task: Task | None = None,
|
||||
args: tuple | None = None,
|
||||
args: tuple | None = None, # noqa: ARG001
|
||||
kwargs: dict[str, Any] | None = None,
|
||||
retval: Any | None = None,
|
||||
retval: Any | None = None, # noqa: ARG001
|
||||
state: str | None = None,
|
||||
**kwds: Any,
|
||||
**kwds: Any, # noqa: ARG001
|
||||
) -> None:
|
||||
"""We handle this signal in order to remove completed tasks
|
||||
from their respective tasksets. This allows us to track the progress of document set
|
||||
@@ -209,7 +209,9 @@ def on_task_postrun(
|
||||
return
|
||||
|
||||
|
||||
def on_celeryd_init(sender: str, conf: Any = None, **kwargs: Any) -> None:
|
||||
def on_celeryd_init(
|
||||
sender: str, conf: Any = None, **kwargs: Any # noqa: ARG001
|
||||
) -> None:
|
||||
"""The first signal sent on celery worker startup"""
|
||||
|
||||
# NOTE(rkuo): start method "fork" is unsafe and we really need it to be "spawn"
|
||||
@@ -238,11 +240,11 @@ def on_celeryd_init(sender: str, conf: Any = None, **kwargs: Any) -> None:
|
||||
f"Multiprocessing selected start method: {multiprocessing.get_start_method()}"
|
||||
)
|
||||
|
||||
# Initialize Braintrust tracing in workers if credentials are available.
|
||||
setup_braintrust_if_creds_available()
|
||||
# Initialize tracing in workers if credentials are available.
|
||||
setup_tracing()
|
||||
|
||||
|
||||
def wait_for_redis(sender: Any, **kwargs: Any) -> None:
|
||||
def wait_for_redis(sender: Any, **kwargs: Any) -> None: # noqa: ARG001
|
||||
"""Waits for redis to become ready subject to a hardcoded timeout.
|
||||
Will raise WorkerShutdown to kill the celery worker if the timeout
|
||||
is reached."""
|
||||
@@ -285,7 +287,7 @@ def wait_for_redis(sender: Any, **kwargs: Any) -> None:
|
||||
return
|
||||
|
||||
|
||||
def wait_for_db(sender: Any, **kwargs: Any) -> None:
|
||||
def wait_for_db(sender: Any, **kwargs: Any) -> None: # noqa: ARG001
|
||||
"""Waits for the db to become ready subject to a hardcoded timeout.
|
||||
Will raise WorkerShutdown to kill the celery worker if the timeout is reached."""
|
||||
|
||||
@@ -327,7 +329,7 @@ def wait_for_db(sender: Any, **kwargs: Any) -> None:
|
||||
return
|
||||
|
||||
|
||||
def on_secondary_worker_init(sender: Any, **kwargs: Any) -> None:
|
||||
def on_secondary_worker_init(sender: Any, **kwargs: Any) -> None: # noqa: ARG001
|
||||
logger.info(f"Running as a secondary celery worker: pid={os.getpid()}")
|
||||
|
||||
# Set up variables for waiting on primary worker
|
||||
@@ -359,7 +361,7 @@ def on_secondary_worker_init(sender: Any, **kwargs: Any) -> None:
|
||||
return
|
||||
|
||||
|
||||
def on_worker_ready(sender: Any, **kwargs: Any) -> None:
|
||||
def on_worker_ready(sender: Any, **kwargs: Any) -> None: # noqa: ARG001
|
||||
task_logger.info("worker_ready signal received.")
|
||||
|
||||
# file based way to do readiness/liveness probes
|
||||
@@ -372,7 +374,7 @@ def on_worker_ready(sender: Any, **kwargs: Any) -> None:
|
||||
logger.info(f"Readiness signal touched at {path}.")
|
||||
|
||||
|
||||
def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
|
||||
def on_worker_shutdown(sender: Any, **kwargs: Any) -> None: # noqa: ARG001
|
||||
HttpxPool.close_all()
|
||||
|
||||
hostname: str = cast(str, sender.hostname)
|
||||
@@ -405,9 +407,9 @@ def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
|
||||
def on_setup_logging(
|
||||
loglevel: int,
|
||||
logfile: str | None,
|
||||
format: str,
|
||||
colorize: bool,
|
||||
**kwargs: Any,
|
||||
format: str, # noqa: ARG001
|
||||
colorize: bool, # noqa: ARG001
|
||||
**kwargs: Any, # noqa: ARG001
|
||||
) -> None:
|
||||
# TODO: could unhardcode format and colorize and accept these as options from
|
||||
# celery's config
|
||||
@@ -508,18 +510,18 @@ class TenantContextFilter(logging.Filter):
|
||||
|
||||
@task_postrun.connect
|
||||
def reset_tenant_id(
|
||||
sender: Any | None = None,
|
||||
task_id: str | None = None,
|
||||
task: Task | None = None,
|
||||
args: tuple[Any, ...] | None = None,
|
||||
kwargs: dict[str, Any] | None = None,
|
||||
**other_kwargs: Any,
|
||||
sender: Any | None = None, # noqa: ARG001
|
||||
task_id: str | None = None, # noqa: ARG001
|
||||
task: Task | None = None, # noqa: ARG001
|
||||
args: tuple[Any, ...] | None = None, # noqa: ARG001
|
||||
kwargs: dict[str, Any] | None = None, # noqa: ARG001
|
||||
**other_kwargs: Any, # noqa: ARG001
|
||||
) -> None:
|
||||
"""Signal handler to reset tenant ID in context var after task ends."""
|
||||
CURRENT_TENANT_ID_CONTEXTVAR.set(POSTGRES_DEFAULT_SCHEMA)
|
||||
|
||||
|
||||
def wait_for_vespa_or_shutdown(sender: Any, **kwargs: Any) -> None:
|
||||
def wait_for_vespa_or_shutdown(sender: Any, **kwargs: Any) -> None: # noqa: ARG001
|
||||
"""Waits for Vespa to become ready subject to a timeout.
|
||||
Raises WorkerShutdown if the timeout is reached."""
|
||||
|
||||
@@ -553,12 +555,12 @@ class LivenessProbe(bootsteps.StartStopStep):
|
||||
priority=10,
|
||||
)
|
||||
|
||||
def stop(self, worker: Any) -> None:
|
||||
def stop(self, worker: Any) -> None: # noqa: ARG002
|
||||
self.path.unlink(missing_ok=True)
|
||||
if self.task_tref:
|
||||
self.task_tref.cancel()
|
||||
|
||||
def update_liveness_file(self, worker: Any) -> None:
|
||||
def update_liveness_file(self, worker: Any) -> None: # noqa: ARG002
|
||||
self.path.touch()
|
||||
|
||||
|
||||
|
||||
@@ -102,7 +102,7 @@ def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
|
||||
|
||||
|
||||
@worker_process_init.connect
|
||||
def init_worker(**kwargs: Any) -> None:
|
||||
def init_worker(**kwargs: Any) -> None: # noqa: ARG001
|
||||
SqlEngine.reset_engine()
|
||||
|
||||
|
||||
|
||||
@@ -91,7 +91,7 @@ def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
|
||||
|
||||
|
||||
@worker_process_init.connect
|
||||
def init_worker(**kwargs: Any) -> None:
|
||||
def init_worker(**kwargs: Any) -> None: # noqa: ARG001
|
||||
SqlEngine.reset_engine()
|
||||
|
||||
|
||||
|
||||
@@ -244,7 +244,7 @@ class HubPeriodicTask(bootsteps.StartStopStep):
|
||||
# it's unclear to me whether using the hub's timer or the bootstep timer is better
|
||||
requires = {"celery.worker.components:Hub"}
|
||||
|
||||
def __init__(self, worker: Any, **kwargs: Any) -> None:
|
||||
def __init__(self, worker: Any, **kwargs: Any) -> None: # noqa: ARG002
|
||||
self.interval = CELERY_PRIMARY_WORKER_LOCK_TIMEOUT / 8 # Interval in seconds
|
||||
self.task_tref = None
|
||||
|
||||
@@ -300,7 +300,7 @@ class HubPeriodicTask(bootsteps.StartStopStep):
|
||||
except Exception:
|
||||
task_logger.exception("Periodic task failed.")
|
||||
|
||||
def stop(self, worker: Any) -> None:
|
||||
def stop(self, worker: Any) -> None: # noqa: ARG002
|
||||
# Cancel the scheduled task when the worker stops
|
||||
if self.task_tref:
|
||||
self.task_tref.cancel()
|
||||
|
||||
@@ -91,7 +91,7 @@ def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
|
||||
|
||||
|
||||
@worker_process_init.connect
|
||||
def init_worker(**kwargs: Any) -> None:
|
||||
def init_worker(**kwargs: Any) -> None: # noqa: ARG001
|
||||
SqlEngine.reset_engine()
|
||||
|
||||
|
||||
|
||||
@@ -217,9 +217,11 @@ if ENABLE_OPENSEARCH_INDEXING_FOR_ONYX:
|
||||
{
|
||||
"name": "check-for-documents-for-opensearch-migration",
|
||||
"task": OnyxCeleryTask.CHECK_FOR_DOCUMENTS_FOR_OPENSEARCH_MIGRATION_TASK,
|
||||
# Try to enqueue an invocation of this task with this frequency.
|
||||
"schedule": timedelta(seconds=120), # 2 minutes
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.LOW,
|
||||
# If the task was not dequeued in this time, revoke it.
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
},
|
||||
}
|
||||
@@ -227,10 +229,18 @@ if ENABLE_OPENSEARCH_INDEXING_FOR_ONYX:
|
||||
beat_task_templates.append(
|
||||
{
|
||||
"name": "migrate-documents-from-vespa-to-opensearch",
|
||||
"task": OnyxCeleryTask.MIGRATE_DOCUMENT_FROM_VESPA_TO_OPENSEARCH_TASK,
|
||||
"task": OnyxCeleryTask.MIGRATE_DOCUMENTS_FROM_VESPA_TO_OPENSEARCH_TASK,
|
||||
# Try to enqueue an invocation of this task with this frequency.
|
||||
# NOTE: If MIGRATION_TASK_SOFT_TIME_LIMIT_S is greater than this
|
||||
# value and the task is maximally busy, we can expect to see some
|
||||
# enqueued tasks be revoked over time. This is ok; by erring on the
|
||||
# side of "there will probably always be at least one task of this
|
||||
# type in the queue", we are minimizing this task's idleness while
|
||||
# still giving chances for other tasks to execute.
|
||||
"schedule": timedelta(seconds=120), # 2 minutes
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.LOW,
|
||||
# If the task was not dequeued in this time, revoke it.
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -366,7 +366,7 @@ def try_generate_document_cc_pair_cleanup_tasks(
|
||||
|
||||
|
||||
def monitor_connector_deletion_taskset(
tenant_id: str, key_bytes: bytes, r: Redis
tenant_id: str, key_bytes: bytes, r: Redis # noqa: ARG001
) -> None:
fence_key = key_bytes.decode("utf-8")
cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key)

@@ -1071,7 +1071,7 @@ def check_for_checkpoint_cleanup(self: Task, *, tenant_id: str) -> None:
bind=True,
)
def cleanup_checkpoint_task(
self: Task, *, index_attempt_id: int, tenant_id: str | None
self: Task, *, index_attempt_id: int, tenant_id: str | None # noqa: ARG001
) -> None:
"""Clean up a checkpoint for a given index attempt"""

@@ -1160,7 +1160,7 @@ def check_for_index_attempt_cleanup(self: Task, *, tenant_id: str) -> None:
bind=True,
)
def cleanup_index_attempt_task(
self: Task, *, index_attempt_ids: list[int], tenant_id: str
self: Task, *, index_attempt_ids: list[int], tenant_id: str # noqa: ARG001
) -> None:
"""Clean up an index attempt"""
start = time.monotonic()
@@ -1266,7 +1266,7 @@ def _resolve_indexing_document_errors(
bind=True,
)
def docprocessing_task(
self: Task,
self: Task, # noqa: ARG001
index_attempt_id: int,
cc_pair_id: int,
tenant_id: str,

@@ -57,7 +57,7 @@ class IndexingCallbackBase(IndexingHeartbeatInterface):
# TODO: Pass index_attempt_id to the callback and check cancellation using the db
return bool(self.redis_connector.stop.fenced)

def progress(self, tag: str, amount: int) -> None:
def progress(self, tag: str, amount: int) -> None: # noqa: ARG002
"""Amount isn't used yet."""

# rkuo: this shouldn't be necessary yet because we spawn the process this runs inside

@@ -26,7 +26,7 @@ logger = setup_logger()
trail=False,
)
def eval_run_task(
self: Task,
self: Task, # noqa: ARG001
*,
configuration_dict: dict[str, Any],
) -> None:
@@ -48,7 +48,7 @@ def eval_run_task(
bind=True,
trail=False,
)
def scheduled_eval_task(self: Task, **kwargs: Any) -> None:
def scheduled_eval_task(self: Task, **kwargs: Any) -> None: # noqa: ARG001
"""
Scheduled task to run evaluations on configured datasets.
Runs weekly on Sunday at midnight UTC.

@@ -322,7 +322,7 @@ def _run_hierarchy_extraction(
bind=True,
)
def connector_hierarchy_fetching_task(
self: Task,
self: Task, # noqa: ARG001
*,
cc_pair_id: int,
tenant_id: str,

@@ -17,7 +17,9 @@ from onyx.llm.well_known_providers.auto_update_service import (
trail=False,
bind=True,
)
def check_for_auto_llm_updates(self: Task, *, tenant_id: str) -> bool | None:
def check_for_auto_llm_updates(
self: Task, *, tenant_id: str # noqa: ARG001
) -> bool | None:
"""Periodic task to fetch LLM model updates from GitHub
and sync them to providers in Auto mode.

@@ -871,7 +871,7 @@ def cloud_monitor_celery_queues(

@shared_task(name=OnyxCeleryTask.MONITOR_CELERY_QUEUES, ignore_result=True, bind=True)
def monitor_celery_queues(self: Task, *, tenant_id: str) -> None:
def monitor_celery_queues(self: Task, *, tenant_id: str) -> None: # noqa: ARG001
return monitor_celery_queues_helper(self)

@@ -952,7 +952,7 @@ def _get_cmdline_for_process(process: psutil.Process) -> str | None:
queue=OnyxCeleryQueues.MONITORING,
bind=True,
)
def monitor_process_memory(self: Task, *, tenant_id: str) -> None:
def monitor_process_memory(self: Task, *, tenant_id: str) -> None: # noqa: ARG001
"""
Task to monitor memory usage of supervisor-managed processes.
This periodically checks the memory usage of processes and logs information
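The hunks above all make the same change: parameters such as self and tenant_id are required by the Celery task signatures but unused in the bodies, so Ruff's unused-argument rules (ARG001 for functions, ARG002 for methods) are suppressed inline. A minimal illustration of the pattern, with a made-up task name rather than one taken from this diff:

from celery import Task, shared_task


@shared_task(name="example_noop_task", bind=True)  # illustrative name only
def example_noop_task(self: Task, *, tenant_id: str) -> None:  # noqa: ARG001
    # self and tenant_id are part of the required task signature but are not
    # used here, so ARG001 (unused function argument) is silenced on this line.
    return None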
@@ -0,0 +1,43 @@
# Tasks are expected to cease execution and do cleanup after the soft time
# limit. In principle they are also forcibly terminated after the hard time
# limit; in practice this does not happen since we use threadpools for Celery
# task execution, and we simply hope that the total task time plus cleanup does
# not exceed this. Therefore tasks should regularly check their timeout and lock
# status. The lock timeout is the maximum time the lock manager (Redis in this
# case) will enforce the lock, independent of what is happening in the task. To
# reduce the chances that a task is still doing work while a lock has expired,
# make the lock timeout well above the task timeouts. In practice we should
# never see locks be held for this long anyway because a task should release the
# lock after its cleanup which happens at most after its soft timeout.

# Constants corresponding to migrate_documents_from_vespa_to_opensearch_task.
MIGRATION_TASK_SOFT_TIME_LIMIT_S = 60 * 5 # 5 minutes.
MIGRATION_TASK_TIME_LIMIT_S = 60 * 6 # 6 minutes.
# The maximum time the lock can be held for. Will automatically be released
# after this time.
MIGRATION_TASK_LOCK_TIMEOUT_S = 60 * 7 # 7 minutes.
assert (
MIGRATION_TASK_SOFT_TIME_LIMIT_S < MIGRATION_TASK_TIME_LIMIT_S
), "The soft time limit must be less than the time limit."
assert (
MIGRATION_TASK_TIME_LIMIT_S < MIGRATION_TASK_LOCK_TIMEOUT_S
), "The time limit must be less than the lock timeout."
# Time to wait to acquire the lock.
MIGRATION_TASK_LOCK_BLOCKING_TIMEOUT_S = 60 * 2 # 2 minutes.

# Constants corresponding to check_for_documents_for_opensearch_migration_task.
CHECK_FOR_DOCUMENTS_TASK_SOFT_TIME_LIMIT_S = 60 # 60 seconds / 1 minute.
CHECK_FOR_DOCUMENTS_TASK_TIME_LIMIT_S = 90 # 90 seconds.
# The maximum time the lock can be held for. Will automatically be released
# after this time.
CHECK_FOR_DOCUMENTS_TASK_LOCK_TIMEOUT_S = 120 # 120 seconds / 2 minutes.
assert (
CHECK_FOR_DOCUMENTS_TASK_SOFT_TIME_LIMIT_S < CHECK_FOR_DOCUMENTS_TASK_TIME_LIMIT_S
), "The soft time limit must be less than the time limit."
assert (
CHECK_FOR_DOCUMENTS_TASK_TIME_LIMIT_S < CHECK_FOR_DOCUMENTS_TASK_LOCK_TIMEOUT_S
), "The time limit must be less than the lock timeout."
# Time to wait to acquire the lock.
CHECK_FOR_DOCUMENTS_TASK_LOCK_BLOCKING_TIMEOUT_S = 30 # 30 seconds.

TOTAL_ALLOWABLE_DOC_MIGRATION_ATTEMPTS_BEFORE_PERMANENT_FAILURE = 15
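The comment block above implies a specific loop shape: a task should stop starting new batches once it crosses its own soft time limit or loses the Redis lock. A minimal sketch of that shape, matching the pattern the tasks below use (the helper name and callable are illustrative, not part of the diff):

import time
from typing import Callable

from redis.lock import Lock as RedisLock


def run_batches_within_limits(
    do_one_batch: Callable[[], None],  # hypothetical unit of batched work
    lock: RedisLock,
    soft_time_limit_s: int,
) -> None:
    # Keep doing bounded batches only while we are inside the soft time limit
    # and still own the lock; both checks mirror the while-loops further down.
    start = time.monotonic()
    while time.monotonic() - start < soft_time_limit_s and lock.owned():
        do_one_batch()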
@@ -1,5 +1,6 @@
"""Celery tasks for migrating documents from Vespa to OpenSearch."""

import time
import traceback
from datetime import datetime
from datetime import timezone
@@ -10,6 +11,30 @@ from celery import Task
from redis.lock import Lock as RedisLock

from onyx.background.celery.apps.app_base import task_logger
from onyx.background.celery.tasks.opensearch_migration.constants import (
CHECK_FOR_DOCUMENTS_TASK_LOCK_BLOCKING_TIMEOUT_S,
)
from onyx.background.celery.tasks.opensearch_migration.constants import (
CHECK_FOR_DOCUMENTS_TASK_LOCK_TIMEOUT_S,
)
from onyx.background.celery.tasks.opensearch_migration.constants import (
CHECK_FOR_DOCUMENTS_TASK_SOFT_TIME_LIMIT_S,
)
from onyx.background.celery.tasks.opensearch_migration.constants import (
CHECK_FOR_DOCUMENTS_TASK_TIME_LIMIT_S,
)
from onyx.background.celery.tasks.opensearch_migration.constants import (
MIGRATION_TASK_LOCK_BLOCKING_TIMEOUT_S,
)
from onyx.background.celery.tasks.opensearch_migration.constants import (
MIGRATION_TASK_LOCK_TIMEOUT_S,
)
from onyx.background.celery.tasks.opensearch_migration.constants import (
MIGRATION_TASK_SOFT_TIME_LIMIT_S,
)
from onyx.background.celery.tasks.opensearch_migration.constants import (
MIGRATION_TASK_TIME_LIMIT_S,
)
from onyx.background.celery.tasks.opensearch_migration.transformer import (
transform_vespa_chunks_to_opensearch_chunks,
)
@@ -31,6 +56,9 @@ from onyx.db.opensearch_migration import (
increment_num_times_observed_no_additional_docs_to_populate_migration_table_with_commit,
)
from onyx.db.opensearch_migration import should_document_migration_be_permanently_failed
from onyx.db.opensearch_migration import (
try_insert_opensearch_tenant_migration_record_with_commit,
)
from onyx.db.search_settings import get_current_search_settings
from onyx.document_index.interfaces_new import TenantState
from onyx.document_index.opensearch.opensearch_document_index import (
@@ -72,7 +100,9 @@ def _migrate_single_document(
raise RuntimeError(f"No chunks found for document {document_id} in Vespa.")

opensearch_document_chunks: list[DocumentChunk] = (
transform_vespa_chunks_to_opensearch_chunks(vespa_document_chunks, tenant_state)
transform_vespa_chunks_to_opensearch_chunks(
vespa_document_chunks, tenant_state, document_id
)
)
if len(opensearch_document_chunks) != len(vespa_document_chunks):
raise RuntimeError(
@@ -90,22 +120,30 @@ def _migrate_single_document(
name=OnyxCeleryTask.CHECK_FOR_DOCUMENTS_FOR_OPENSEARCH_MIGRATION_TASK,
# Does not store the task's return value in the result backend.
ignore_result=True,
# When exceeded celery will raise a SoftTimeLimitExceeded in the task.
soft_time_limit=60 * 5, # 5 minutes.
# When exceeded the task will be forcefully terminated.
time_limit=60 * 6, # 6 minutes.
# WARNING: This is here just for rigor but since we use threads for Celery
# this config is not respected and timeout logic must be implemented in the
# task.
soft_time_limit=CHECK_FOR_DOCUMENTS_TASK_SOFT_TIME_LIMIT_S,
# WARNING: This is here just for rigor but since we use threads for Celery
# this config is not respected and timeout logic must be implemented in the
# task.
time_limit=CHECK_FOR_DOCUMENTS_TASK_TIME_LIMIT_S,
# Passed in self to the task to get task metadata.
bind=True,
)
def check_for_documents_for_opensearch_migration_task(
self: Task, *, tenant_id: str
self: Task, *, tenant_id: str # noqa: ARG001
) -> bool | None:
"""
Periodic task to check for and add documents to the OpenSearch migration
table.

Should not execute meaningful logic at the same time as
migrate_document_from_vespa_to_opensearch_task.
migrate_documents_from_vespa_to_opensearch_task.

Effectively tries to populate as many migration records as possible within
CHECK_FOR_DOCUMENTS_TASK_SOFT_TIME_LIMIT_S seconds. Does so in batches of
1000 documents.

Returns:
None if OpenSearch migration is not enabled, or if the lock could not be
@@ -119,29 +157,33 @@ def check_for_documents_for_opensearch_migration_task(
return None

task_logger.info("Checking for documents for OpenSearch migration.")

task_start_time = time.monotonic()
r = get_redis_client()

# Use a lock to prevent overlapping tasks. Only this task or
# migrate_document_from_vespa_to_opensearch_task can interact with the
# migrate_documents_from_vespa_to_opensearch_task can interact with the
# OpenSearchMigration table at once.
lock_beat: RedisLock = r.lock(
lock: RedisLock = r.lock(
name=OnyxRedisLocks.OPENSEARCH_MIGRATION_BEAT_LOCK,
# The maximum time the lock can be held for. Will automatically be
# released after this time.
timeout=60 * 6, # 6 minutes, same as the time limit for this task.
timeout=CHECK_FOR_DOCUMENTS_TASK_LOCK_TIMEOUT_S,
# .acquire will block until the lock is acquired.
blocking=True,
# Wait for 2 minutes trying to acquire the lock.
blocking_timeout=60 * 2, # 2 minutes.
# Time to wait to acquire the lock.
blocking_timeout=CHECK_FOR_DOCUMENTS_TASK_LOCK_BLOCKING_TIMEOUT_S,
)

if not lock_beat.acquire():
if not lock.acquire():
task_logger.warning(
"The OpenSearch migration check task timed out waiting for the lock."
)
return None
else:
task_logger.info(
f"Acquired the OpenSearch migration check lock. Took {time.monotonic() - task_start_time:.3f} seconds. "
f"Token: {lock.local.token}"
)
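For reference, the redis-py lock parameters used here split responsibilities as follows: timeout caps how long the lock may be held before Redis force-releases it, blocking_timeout caps how long acquire() will wait, and owned() verifies the local token before releasing. A small standalone sketch; the lock name and the local Redis instance are assumptions, not taken from this diff:

from redis import Redis

r = Redis()  # assumes a reachable local Redis instance
lock = r.lock(
    name="example-migration-lock",  # illustrative name, not the real OnyxRedisLocks key
    timeout=120,  # Redis force-releases the lock after 120 seconds
    blocking=True,
    blocking_timeout=30,  # acquire() gives up after waiting 30 seconds
)
if lock.acquire():
    try:
        pass  # do work while holding the lock
    finally:
        # Only release if our token still holds the lock; it may have expired.
        if lock.owned():
            lock.release()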
num_documents_found_for_record_creation = 0
try:
# Double check that tenant info is correct.
if tenant_id != get_current_tenant_id():
@@ -151,65 +193,89 @@ def check_for_documents_for_opensearch_migration_task(
)
task_logger.error(err_str)
return False
with get_session_with_current_tenant() as db_session:
# For pagination, get the last ID we've inserted into
# OpenSearchMigration.
last_opensearch_migration_document_id = (
get_last_opensearch_migration_document_id(db_session)
)
# Now get the next batch of doc IDs starting after the last ID.
document_ids = get_paginated_document_batch(
db_session,
prev_ending_document_id=last_opensearch_migration_document_id,
)

if not document_ids:
task_logger.info(
"No more documents to insert for OpenSearch migration."
while (
time.monotonic() - task_start_time
< CHECK_FOR_DOCUMENTS_TASK_SOFT_TIME_LIMIT_S
and lock.owned()
):
with get_session_with_current_tenant() as db_session:
# For pagination, get the last ID we've inserted into
# OpenSearchMigration.
last_opensearch_migration_document_id = (
get_last_opensearch_migration_document_id(db_session)
)
increment_num_times_observed_no_additional_docs_to_populate_migration_table_with_commit(
db_session
# Now get the next batch of doc IDs starting after the last ID.
# We'll do 1000 documents per transaction/timeout check.
document_ids = get_paginated_document_batch(
db_session,
limit=1000,
prev_ending_document_id=last_opensearch_migration_document_id,
)
# TODO(andrei): Once we've done this enough times and the number
# of documents matches the number of migration records, we can
# be done with this task and update
# document_migration_record_table_population_status.
return True

# Create the migration records for the next batch of documents with
# status PENDING.
create_opensearch_migration_records_with_commit(db_session, document_ids)
task_logger.info(
f"Created {len(document_ids)} migration records for the next batch of documents."
)
if not document_ids:
task_logger.info(
"No more documents to insert for OpenSearch migration."
)
increment_num_times_observed_no_additional_docs_to_populate_migration_table_with_commit(
db_session
)
# TODO(andrei): Once we've done this enough times and the
# number of documents matches the number of migration
# records, we can be done with this task and update
# document_migration_record_table_population_status.
return True

# Create the migration records for the next batch of documents
# with status PENDING.
create_opensearch_migration_records_with_commit(
db_session, document_ids
)
num_documents_found_for_record_creation += len(document_ids)

# Try to create the singleton row in
# OpenSearchTenantMigrationRecord if it doesn't already exist.
# This is a reasonable place to put it because we already have a
# lock, a session, and error handling, at the cost of running
# this small set of logic for every batch.
try_insert_opensearch_tenant_migration_record_with_commit(db_session)
except Exception:
task_logger.exception("Error in the OpenSearch migration check task.")
return False
finally:
if lock_beat.owned():
lock_beat.release()
if lock.owned():
lock.release()
else:
task_logger.warning(
"The OpenSearch migration lock was not owned on completion of the check task."
)

task_logger.info(
f"Finished checking for documents for OpenSearch migration. Found {num_documents_found_for_record_creation} documents "
f"to create migration records for in {time.monotonic() - task_start_time:.3f} seconds. However, this may include "
"documents for which there already exist records."
)

return True
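The batching above is keyset pagination: each iteration asks for document IDs strictly greater than the last ID already turned into a migration record, so no offset scan is ever repeated. A rough sketch of what a helper like get_paginated_document_batch could look like in SQLAlchemy 2.0 style; the Document model below is a stand-in for illustration, not the real Onyx model, and the real helper's signature may differ:

from sqlalchemy import String, select
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


class Document(Base):  # stand-in model for illustration only
    __tablename__ = "document"
    id: Mapped[str] = mapped_column(String, primary_key=True)


def get_paginated_document_batch(
    db_session: Session,
    prev_ending_document_id: str | None = None,
    limit: int = 1000,
) -> list[str]:
    # Keyset pagination: fetch IDs strictly after the last one recorded, in
    # ascending order, so the next call resumes where this batch ended.
    stmt = select(Document.id).order_by(Document.id).limit(limit)
    if prev_ending_document_id is not None:
        stmt = stmt.where(Document.id > prev_ending_document_id)
    return list(db_session.scalars(stmt))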
# shared_task allows this task to be shared across celery app instances.
@shared_task(
name=OnyxCeleryTask.MIGRATE_DOCUMENT_FROM_VESPA_TO_OPENSEARCH_TASK,
name=OnyxCeleryTask.MIGRATE_DOCUMENTS_FROM_VESPA_TO_OPENSEARCH_TASK,
# Does not store the task's return value in the result backend.
ignore_result=True,
# When exceeded celery will raise a SoftTimeLimitExceeded in the task.
soft_time_limit=60 * 5, # 5 minutes.
# When exceeded the task will be forcefully terminated.
time_limit=60 * 6, # 6 minutes.
# WARNING: This is here just for rigor but since we use threads for Celery
# this config is not respected and timeout logic must be implemented in the
# task.
soft_time_limit=MIGRATION_TASK_SOFT_TIME_LIMIT_S,
# WARNING: This is here just for rigor but since we use threads for Celery
# this config is not respected and timeout logic must be implemented in the
# task.
time_limit=MIGRATION_TASK_TIME_LIMIT_S,
# Passed in self to the task to get task metadata.
bind=True,
)
def migrate_documents_from_vespa_to_opensearch_task(
self: Task,
self: Task, # noqa: ARG001
*,
tenant_id: str,
) -> bool | None:
@@ -218,10 +284,13 @@ def migrate_documents_from_vespa_to_opensearch_task(
Should not execute meaningful logic at the same time as
check_for_documents_for_opensearch_migration_task.

Effectively tries to migrate as many documents as possible within
MIGRATION_TASK_SOFT_TIME_LIMIT_S seconds. Does so in batches of 5 documents.

Returns:
None if OpenSearch migration is not enabled, or if the lock could not be
acquired; effectively a no-op. True if the task completed
successfully. False if the task failed.
successfully. False if the task errored.
"""
if not ENABLE_OPENSEARCH_INDEXING_FOR_ONYX:
task_logger.warning(
@@ -229,30 +298,36 @@ def migrate_documents_from_vespa_to_opensearch_task(
)
return None

task_logger.info("Trying to migrate documents from Vespa to OpenSearch.")

task_logger.info("Trying a migration batch from Vespa to OpenSearch.")
task_start_time = time.monotonic()
r = get_redis_client()

# Use a lock to prevent overlapping tasks. Only this task or
# check_for_documents_for_opensearch_migration_task can interact with the
# OpenSearchMigration table at once.
lock_beat: RedisLock = r.lock(
lock: RedisLock = r.lock(
name=OnyxRedisLocks.OPENSEARCH_MIGRATION_BEAT_LOCK,
# The maximum time the lock can be held for. Will automatically be
# released after this time.
timeout=60 * 6, # 6 minutes, same as the time limit for this task.
timeout=MIGRATION_TASK_LOCK_TIMEOUT_S,
# .acquire will block until the lock is acquired.
blocking=True,
# Wait for 2 minutes trying to acquire the lock.
blocking_timeout=60 * 2, # 2 minutes.
# Time to wait to acquire the lock.
blocking_timeout=MIGRATION_TASK_LOCK_BLOCKING_TIMEOUT_S,
)

if not lock_beat.acquire():
if not lock.acquire():
task_logger.warning(
"The OpenSearch migration task timed out waiting for the lock."
)
return None
else:
task_logger.info(
f"Acquired the OpenSearch migration lock. Took {time.monotonic() - task_start_time:.3f} seconds. "
f"Token: {lock.local.token}"
)

num_documents_migrated = 0
num_chunks_migrated = 0
num_documents_failed = 0
try:
# Double check that tenant info is correct.
if tenant_id != get_current_tenant_id():
@@ -262,98 +337,111 @@ def migrate_documents_from_vespa_to_opensearch_task(
)
task_logger.error(err_str)
return False
with get_session_with_current_tenant() as db_session:
records_needing_migration = (
get_opensearch_migration_records_needing_migration(db_session)
)
if not records_needing_migration:
task_logger.info(
"No documents found that need to be migrated from Vespa to OpenSearch."
)
increment_num_times_observed_no_additional_docs_to_migrate_with_commit(
db_session
)
# TODO(andrei): Once we've done this enough times and
# document_migration_record_table_population_status is done, we
# can be done with this task and update
# overall_document_migration_status accordingly. Note that this
# includes marking connectors as needing reindexing if some
# migrations failed.
return True

search_settings = get_current_search_settings(db_session)
tenant_state = TenantState(tenant_id=tenant_id, multitenant=MULTI_TENANT)

opensearch_document_index = OpenSearchDocumentIndex(
index_name=search_settings.index_name, tenant_state=tenant_state
)
vespa_document_index = VespaDocumentIndex(
index_name=search_settings.index_name,
tenant_state=tenant_state,
large_chunks_enabled=False,
)

task_logger.info(
f"Trying to migrate {len(records_needing_migration)} documents from Vespa to OpenSearch."
)

for record in records_needing_migration:
try:
# If the Document's chunk count is not known, it was
# probably just indexed so fail here to give it a chance to
# sync. In the rare event that this Document has not been
# re-indexed in a very long time and is still under the
# "old" embedding/indexing logic where chunk count was never
# stored, we will eventually permanently fail and thus force
# a re-index of this doc, which is a desirable outcome.
if record.document.chunk_count is None:
raise RuntimeError(
f"Document {record.document_id} has no chunk count."
)

chunks_migrated = _migrate_single_document(
document_id=record.document_id,
opensearch_document_index=opensearch_document_index,
vespa_document_index=vespa_document_index,
tenant_state=tenant_state,
while (
time.monotonic() - task_start_time < MIGRATION_TASK_SOFT_TIME_LIMIT_S
and lock.owned()
):
with get_session_with_current_tenant() as db_session:
# We'll do 5 documents per transaction/timeout check.
records_needing_migration = (
get_opensearch_migration_records_needing_migration(
db_session, limit=5
)
)

# If the number of chunks in Vespa is not in sync with the
# Document table for this doc let's not consider this
# completed and let's let a subsequent run take care of it.
if chunks_migrated != record.document.chunk_count:
raise RuntimeError(
f"Number of chunks migrated ({chunks_migrated}) does not match number of expected chunks in Vespa "
f"({record.document.chunk_count}) for document {record.document_id}."
)

record.status = OpenSearchDocumentMigrationStatus.COMPLETED
except Exception:
record.status = OpenSearchDocumentMigrationStatus.FAILED
record.error_message = f"Attempt {record.attempts_count + 1}:\n{traceback.format_exc()}"
task_logger.exception(
f"Error migrating document {record.document_id} from Vespa to OpenSearch."
)
if not records_needing_migration:
task_logger.info(
"No documents found that need to be migrated from Vespa to OpenSearch."
)
finally:
record.attempts_count += 1
record.last_attempt_at = datetime.now(timezone.utc)
if should_document_migration_be_permanently_failed(record):
record.status = (
OpenSearchDocumentMigrationStatus.PERMANENTLY_FAILED
)
# TODO(andrei): Not necessarily here but if this happens
# we'll need to mark the connector as needing reindex.
increment_num_times_observed_no_additional_docs_to_migrate_with_commit(
db_session
)
# TODO(andrei): Once we've done this enough times and
# document_migration_record_table_population_status is done, we
# can be done with this task and update
# overall_document_migration_status accordingly. Note that this
# includes marking connectors as needing reindexing if some
# migrations failed.
return True

db_session.commit()
search_settings = get_current_search_settings(db_session)
tenant_state = TenantState(
tenant_id=tenant_id, multitenant=MULTI_TENANT
)
opensearch_document_index = OpenSearchDocumentIndex(
index_name=search_settings.index_name, tenant_state=tenant_state
)
vespa_document_index = VespaDocumentIndex(
index_name=search_settings.index_name,
tenant_state=tenant_state,
large_chunks_enabled=False,
)

for record in records_needing_migration:
try:
# If the Document's chunk count is not known, it was
# probably just indexed so fail here to give it a chance to
# sync. In the rare event that this Document has not been
# re-indexed in a very long time and is still under the
# "old" embedding/indexing logic where chunk count was never
# stored, we will eventually permanently fail and thus force
# a re-index of this doc, which is a desirable outcome.
if record.document.chunk_count is None:
raise RuntimeError(
f"Document {record.document_id} has no chunk count."
)

chunks_migrated = _migrate_single_document(
document_id=record.document_id,
opensearch_document_index=opensearch_document_index,
vespa_document_index=vespa_document_index,
tenant_state=tenant_state,
)

# If the number of chunks in Vespa is not in sync with the
# Document table for this doc let's not consider this
# completed and let's let a subsequent run take care of it.
if chunks_migrated != record.document.chunk_count:
raise RuntimeError(
f"Number of chunks migrated ({chunks_migrated}) does not match number of expected chunks "
f"in Vespa ({record.document.chunk_count}) for document {record.document_id}."
)

record.status = OpenSearchDocumentMigrationStatus.COMPLETED
num_documents_migrated += 1
num_chunks_migrated += chunks_migrated
except Exception:
record.status = OpenSearchDocumentMigrationStatus.FAILED
record.error_message = f"Attempt {record.attempts_count + 1}:\n{traceback.format_exc()}"
task_logger.exception(
f"Error migrating document {record.document_id} from Vespa to OpenSearch."
)
num_documents_failed += 1
finally:
record.attempts_count += 1
record.last_attempt_at = datetime.now(timezone.utc)
if should_document_migration_be_permanently_failed(record):
record.status = (
OpenSearchDocumentMigrationStatus.PERMANENTLY_FAILED
)
# TODO(andrei): Not necessarily here but if this happens
# we'll need to mark the connector as needing reindex.

db_session.commit()
except Exception:
task_logger.exception("Error in the OpenSearch migration task.")
return False
finally:
if lock_beat.owned():
lock_beat.release()
if lock.owned():
lock.release()
else:
task_logger.warning(
"The OpenSearch migration lock was not owned on completion of the migration task."
)

task_logger.info(
f"Finished a migration batch from Vespa to OpenSearch. Migrated {num_chunks_migrated} chunks "
f"from {num_documents_migrated} documents in {time.monotonic() - task_start_time:.3f} seconds. "
f"Failed to migrate {num_documents_failed} documents."
)

return True
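The retry bookkeeping above (attempts_count, last_attempt_at, and the PERMANENTLY_FAILED transition) is driven by should_document_migration_be_permanently_failed together with TOTAL_ALLOWABLE_DOC_MIGRATION_ATTEMPTS_BEFORE_PERMANENT_FAILURE from the new constants file. A plausible sketch of that predicate; the real implementation in onyx.db.opensearch_migration may differ:

from typing import Any

TOTAL_ALLOWABLE_DOC_MIGRATION_ATTEMPTS_BEFORE_PERMANENT_FAILURE = 15


def should_document_migration_be_permanently_failed(record: Any) -> bool:
    # Give up once a record has exhausted its retry budget; the caller then
    # marks it PERMANENTLY_FAILED, which eventually forces a re-index.
    return (
        record.attempts_count
        >= TOTAL_ALLOWABLE_DOC_MIGRATION_ATTEMPTS_BEFORE_PERMANENT_FAILURE
    )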
@@ -30,8 +30,11 @@ from onyx.document_index.vespa_constants import TENANT_ID
from onyx.document_index.vespa_constants import TITLE
from onyx.document_index.vespa_constants import TITLE_EMBEDDING
from onyx.document_index.vespa_constants import USER_PROJECT
from onyx.utils.logger import setup_logger
from shared_configs.configs import MULTI_TENANT

logger = setup_logger(__name__)

def _extract_content_vector(embeddings: Any) -> list[float]:
"""Extracts the full chunk embedding vector from Vespa's embeddings tensor.
@@ -150,13 +153,25 @@ def _transform_vespa_acl_to_opensearch_acl(
def transform_vespa_chunks_to_opensearch_chunks(
vespa_chunks: list[dict[str, Any]],
tenant_state: TenantState,
document_id: str,
) -> list[DocumentChunk]:
result: list[DocumentChunk] = []
for vespa_chunk in vespa_chunks:
# This should exist; fail loudly if it does not.
document_id: str = vespa_chunk[DOCUMENT_ID]
if not document_id:
vespa_document_id: str = vespa_chunk[DOCUMENT_ID]
if not vespa_document_id:
raise ValueError("Missing document_id in Vespa chunk.")
# Vespa doc IDs were sanitized using replace_invalid_doc_id_characters.
# This was a poor design choice and we don't want this in OpenSearch;
# whatever restrictions there may be on indexed chunk ID should have no
# bearing on the chunk's document ID field, even if document ID is an
# argument to the chunk ID. Deliberately choose to use the real doc ID
# supplied to this function.
if vespa_document_id != document_id:
logger.warning(
f"Vespa document ID {vespa_document_id} does not match the document ID supplied {document_id}. "
"The Vespa ID will be discarded."
)

# This should exist; fail loudly if it does not.
chunk_index: int = vespa_chunk[CHUNK_ID]
@@ -236,6 +251,8 @@ def transform_vespa_chunks_to_opensearch_chunks(
)

opensearch_chunk = DocumentChunk(
# We deliberately choose to use the doc ID supplied to this function
# over the Vespa doc ID.
document_id=document_id,
chunk_index=chunk_index,
title=title,

@@ -24,7 +24,7 @@ from onyx.db.engine.sql_engine import get_session_with_current_tenant
bind=True,
base=AbortableTask,
)
def kombu_message_cleanup_task(self: Any, tenant_id: str) -> int:
def kombu_message_cleanup_task(self: Any, tenant_id: str) -> int: # noqa: ARG001
"""Runs periodically to clean up the kombu_message table"""

# we will select messages older than this amount to clean up

Some files were not shown because too many files have changed in this diff.