Mirror of https://github.com/onyx-dot-app/onyx.git (synced 2026-04-06 15:32:43 +00:00)

Compare commits: v2.10.7 ... add-featur (180 commits)
| SHA1 |
|---|
| 1c992229dc |
| 0f39deb779 |
| f7c51338a6 |
| acd16cc20f |
| 186ce13c24 |
| 2c16436a1b |
| 0ba23c4465 |
| b2fb6a31d0 |
| 2250a83f35 |
| 3c3750a922 |
| cc80712533 |
| 1f544875e3 |
| 52bfe467e3 |
| 1c5ab38a31 |
| 4ba8e38988 |
| db603e8b6f |
| 9631e1aec4 |
| 7ca15aac82 |
| 70708e90c5 |
| 9e72970b78 |
| cd53ffdfd6 |
| 74a0bb3a8f |
| 76a6d7d597 |
| 153706cdf5 |
| 6bd2ffb99c |
| e447c73e85 |
| 6819d5359a |
| 58d01c6e59 |
| f87b794fb5 |
| 40ff76d32b |
| 4e9f693a6e |
| 8ebdeaba69 |
| 4072b6dd20 |
| c1c4fa299a |
| 75edbc6a71 |
| 95cf65031c |
| 21e4100276 |
| 7dacdf083a |
| bb79a67fc8 |
| 3f8cefaa68 |
| 3c64b4507f |
| 744c28dadd |
| a175a5c393 |
| 31db112de9 |
| a3e2da2c51 |
| f4d33bcc0d |
| 464d957494 |
| be12de9a44 |
| fac0ec03b9 |
| 4ce69fc885 |
| 3e4a1f8a09 |
| 0190401c63 |
| 12bb2e3c51 |
| af9b7826ab |
| 79d6f9167e |
| cb16eb13fc |
| 0fd082bcca |
| fbcf4b452e |
| 4f1bdf1de1 |
| 96345db5a7 |
| 20a73bdd2e |
| 8562ab3466 |
| 85cc2b99b7 |
| 1208a3ee2b |
| 84dffa0414 |
| 8ae896847d |
| 900fcef9dd |
| d4ed25753b |
| 0ee58333b4 |
| 11b7e0d571 |
| a35831f328 |
| 048a6d5259 |
| 2d4d00b506 |
| e4bdb15910 |
| 3517d59286 |
| c8057e1257 |
| 5f641dac35 |
| 4bc08e5d88 |
| ff101313b6 |
| 4bd080cf62 |
| 5bf35f3c36 |
| b0a8625ffc |
| f94baf6143 |
| 9e1867638a |
| ba8beb40d4 |
| 5b6d7c9f0d |
| 1115e515a8 |
| e5dcf31f10 |
| 8ca06ef3e7 |
| 6897dbd610 |
| 7063fa354b |
| d3f19294db |
| 3df1cb0759 |
| 7f3cb77466 |
| 267042a5aa |
| d02b3ae6ac |
| 683c3f7a7e |
| 06c561b3fd |
| 008b4d2288 |
| 8be261405a |
| 7358bb4bc2 |
| 61f2c48ebc |
| a16e65d5cb |
| dbde2e6d6d |
| 0cd4ad921e |
| 2860136214 |
| 49ec5994d3 |
| 8d5fb67f0f |
| 15d02f6e3c |
| e58974c419 |
| 28194a58b5 |
| 6b66c07952 |
| cae058a3ac |
| aa3b21a191 |
| ed6e134b28 |
| 2f2d65a950 |
| 2d436fb207 |
| c0d9c17312 |
| 7a07a78696 |
| a8db236e37 |
| 5462460ae7 |
| fa44774ef3 |
| 8a2e4ed36f |
| 216f2c95a7 |
| 67081efe08 |
| 9d40b8336f |
| 23f0033302 |
| 9011b76eb0 |
| e2bd32e405 |
| 45e436bafc |
| 010bc36d61 |
| 468e488bdb |
| 9104c0ffce |
| d36a6bd0b4 |
| 74431ea0a8 |
| 9ed9c95007 |
| b06cf041d7 |
| a3603c498c |
| 8f274e34c9 |
| 5c256760ff |
| fe2cc230b5 |
| 544bcf8e7f |
| 258e1372b3 |
| 83a543a265 |
| f9719d199d |
| 1c7bb6e56a |
| 982ad7d329 |
| f94292808b |
| 293553a2e2 |
| b818709b7d |
| ba906ae6fa |
| c84c7a354e |
| 2187b0dd82 |
| d88a417bf9 |
| f2d32b0b3b |
| 3ccf71c5ee |
| e8cac79b96 |
| 81967d3507 |
| 6c5197cb74 |
| 6d3e4809e3 |
| f61a275aa5 |
| 3715d1ed75 |
| 6e3fa6fbac |
| 537bdf75a9 |
| f706a411f1 |
| e54c7c377a |
| 9371cbecb4 |
| 1ae7e4feee |
| 2649876f64 |
| 4628b711be |
| 1dd8e07310 |
| 295de1268e |
| f8ef7d5ea5 |
| 343cd5cb1b |
| 08228e1a10 |
| 95df31aa01 |
| ba45f9bc77 |
| be088a0964 |
| 67bb013063 |
| 9140c4c449 |
.github/workflows/deployment.yml (vendored, 411 changed lines)
@@ -8,7 +8,9 @@ on:

# Set restrictive default permissions for all jobs. Jobs that need more permissions
# should explicitly declare them.
permissions: {}
permissions:
# Required for OIDC authentication with AWS
id-token: write # zizmor: ignore[excessive-permissions]

env:
EDGE_TAG: ${{ startsWith(github.ref_name, 'nightly-latest') }}
@@ -150,16 +152,30 @@ jobs:
if: always() && needs.check-version-tag.result == 'failure' && github.event_name != 'workflow_dispatch'
runs-on: ubuntu-slim
timeout-minutes: 10
environment: release
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
with:
persist-credentials: false

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
MONITOR_DEPLOYMENTS_WEBHOOK, deploy/monitor-deployments-webhook
parse-json-secrets: true

- name: Send Slack notification
uses: ./.github/actions/slack-notify
with:
webhook-url: ${{ secrets.MONITOR_DEPLOYMENTS_WEBHOOK }}
webhook-url: ${{ env.MONITOR_DEPLOYMENTS_WEBHOOK }}
failed-jobs: "• check-version-tag"
title: "🚨 Version Tag Check Failed"
ref-name: ${{ github.ref_name }}
@@ -168,6 +184,7 @@ jobs:
needs: determine-builds
if: needs.determine-builds.outputs.build-desktop == 'true'
permissions:
id-token: write
contents: write
actions: read
strategy:
@@ -185,12 +202,33 @@ jobs:

runs-on: ${{ matrix.platform }}
timeout-minutes: 90
environment: release
steps:
- uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6.0.1
with:
# NOTE: persist-credentials is needed for tauri-action to create GitHub releases.
persist-credentials: true # zizmor: ignore[artipacked]

- name: Configure AWS credentials
if: startsWith(matrix.platform, 'macos-')
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
if: startsWith(matrix.platform, 'macos-')
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
APPLE_ID, deploy/apple-id
APPLE_PASSWORD, deploy/apple-password
APPLE_CERTIFICATE, deploy/apple-certificate
APPLE_CERTIFICATE_PASSWORD, deploy/apple-certificate-password
KEYCHAIN_PASSWORD, deploy/keychain-password
APPLE_TEAM_ID, deploy/apple-team-id
parse-json-secrets: true

- name: install dependencies (ubuntu only)
if: startsWith(matrix.platform, 'ubuntu-')
run: |
@@ -285,15 +323,40 @@ jobs:

Write-Host "Versions set to: $VERSION"

- name: Import Apple Developer Certificate
if: startsWith(matrix.platform, 'macos-')
run: |
echo $APPLE_CERTIFICATE | base64 --decode > certificate.p12
security create-keychain -p "$KEYCHAIN_PASSWORD" build.keychain
security default-keychain -s build.keychain
security unlock-keychain -p "$KEYCHAIN_PASSWORD" build.keychain
security set-keychain-settings -t 3600 -u build.keychain
security import certificate.p12 -k build.keychain -P "$APPLE_CERTIFICATE_PASSWORD" -T /usr/bin/codesign
security set-key-partition-list -S apple-tool:,apple:,codesign: -s -k "$KEYCHAIN_PASSWORD" build.keychain
security find-identity -v -p codesigning build.keychain

- name: Verify Certificate
if: startsWith(matrix.platform, 'macos-')
run: |
CERT_INFO=$(security find-identity -v -p codesigning build.keychain | grep -E "(Developer ID Application|Apple Distribution|Apple Development)" | head -n 1)
CERT_ID=$(echo "$CERT_INFO" | awk -F'"' '{print $2}')
echo "CERT_ID=$CERT_ID" >> $GITHUB_ENV
echo "Certificate imported."

- uses: tauri-apps/tauri-action@73fb865345c54760d875b94642314f8c0c894afa # ratchet:tauri-apps/tauri-action@action-v0.6.1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
APPLE_ID: ${{ env.APPLE_ID }}
APPLE_PASSWORD: ${{ env.APPLE_PASSWORD }}
APPLE_SIGNING_IDENTITY: ${{ env.CERT_ID }}
APPLE_TEAM_ID: ${{ env.APPLE_TEAM_ID }}
with:
tagName: ${{ needs.determine-builds.outputs.is-test-run != 'true' && 'v__VERSION__' || format('v0.0.0-dev+{0}', needs.determine-builds.outputs.short-sha) }}
releaseName: ${{ needs.determine-builds.outputs.is-test-run != 'true' && 'v__VERSION__' || format('v0.0.0-dev+{0}', needs.determine-builds.outputs.short-sha) }}
releaseBody: "See the assets to download this version and install."
releaseDraft: true
prerelease: false
assetNamePattern: "[name]_[arch][ext]"
args: ${{ matrix.args }}

build-web-amd64:
@@ -305,6 +368,7 @@ jobs:
- run-id=${{ github.run_id }}-web-amd64
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -317,6 +381,20 @@ jobs:
with:
persist-credentials: false

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -326,13 +404,13 @@ jobs:
latest=false

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}

- name: Build and push AMD64
id: build
@@ -363,6 +441,7 @@ jobs:
- run-id=${{ github.run_id }}-web-arm64
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -375,6 +454,20 @@ jobs:
with:
persist-credentials: false

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -384,13 +477,13 @@ jobs:
latest=false

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}

- name: Build and push ARM64
id: build
@@ -423,19 +516,34 @@ jobs:
- run-id=${{ github.run_id }}-merge-web
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: onyxdotapp/onyx-web-server
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}

- name: Docker meta
id: meta
@@ -471,6 +579,7 @@ jobs:
- run-id=${{ github.run_id }}-web-cloud-amd64
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -483,6 +592,20 @@ jobs:
with:
persist-credentials: false

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -492,13 +615,13 @@ jobs:
latest=false

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}

- name: Build and push AMD64
id: build
@@ -537,6 +660,7 @@ jobs:
- run-id=${{ github.run_id }}-web-cloud-arm64
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -549,6 +673,20 @@ jobs:
with:
persist-credentials: false

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -558,13 +696,13 @@ jobs:
latest=false

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}

- name: Build and push ARM64
id: build
@@ -605,19 +743,34 @@ jobs:
- run-id=${{ github.run_id }}-merge-web-cloud
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: onyxdotapp/onyx-web-server-cloud
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}

- name: Docker meta
id: meta
@@ -650,6 +803,7 @@ jobs:
- run-id=${{ github.run_id }}-backend-amd64
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -662,6 +816,20 @@ jobs:
with:
persist-credentials: false

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -671,13 +839,13 @@ jobs:
latest=false

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}

- name: Build and push AMD64
id: build
@@ -707,6 +875,7 @@ jobs:
- run-id=${{ github.run_id }}-backend-arm64
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -719,6 +888,20 @@ jobs:
with:
persist-credentials: false

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -728,13 +911,13 @@ jobs:
latest=false

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}

- name: Build and push ARM64
id: build
@@ -766,19 +949,34 @@ jobs:
- run-id=${{ github.run_id }}-merge-backend
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'onyxdotapp/onyx-backend-cloud' || 'onyxdotapp/onyx-backend' }}
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}

- name: Docker meta
id: meta
@@ -815,6 +1013,7 @@ jobs:
- volume=40gb
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -827,6 +1026,20 @@ jobs:
with:
persist-credentials: false

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -836,15 +1049,15 @@ jobs:
latest=false

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
with:
buildkitd-flags: ${{ vars.DOCKER_DEBUG == 'true' && '--debug' || '' }}

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}

- name: Build and push AMD64
id: build
@@ -879,6 +1092,7 @@ jobs:
- volume=40gb
- extras=ecr-cache
timeout-minutes: 90
environment: release
outputs:
digest: ${{ steps.build.outputs.digest }}
env:
@@ -891,6 +1105,20 @@ jobs:
with:
persist-credentials: false

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Docker meta
id: meta
uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # ratchet:docker/metadata-action@v5
@@ -900,15 +1128,15 @@ jobs:
latest=false

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
with:
buildkitd-flags: ${{ vars.DOCKER_DEBUG == 'true' && '--debug' || '' }}

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}

- name: Build and push ARM64
id: build
@@ -944,19 +1172,34 @@ jobs:
- run-id=${{ github.run_id }}-merge-model-server
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'onyxdotapp/onyx-model-server-cloud' || 'onyxdotapp/onyx-model-server' }}
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
username: ${{ env.DOCKER_USERNAME }}
password: ${{ env.DOCKER_TOKEN }}

- name: Docker meta
id: meta
@@ -994,11 +1237,26 @@ jobs:
- run-id=${{ github.run_id }}-trivy-scan-web
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: onyxdotapp/onyx-web-server
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Run Trivy vulnerability scanner
uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # ratchet:nick-fields/retry@v3
with:
@@ -1014,8 +1272,8 @@ jobs:
docker run --rm -v $HOME/.cache/trivy:/root/.cache/trivy \
-e TRIVY_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-db:2" \
-e TRIVY_JAVA_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-java-db:1" \
-e TRIVY_USERNAME="${{ secrets.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ secrets.DOCKER_TOKEN }}" \
-e TRIVY_USERNAME="${{ env.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ env.DOCKER_TOKEN }}" \
aquasec/trivy@sha256:a22415a38938a56c379387a8163fcb0ce38b10ace73e593475d3658d578b2436 \
image \
--skip-version-check \
@@ -1034,11 +1292,26 @@ jobs:
- run-id=${{ github.run_id }}-trivy-scan-web-cloud
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: onyxdotapp/onyx-web-server-cloud
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Run Trivy vulnerability scanner
uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # ratchet:nick-fields/retry@v3
with:
@@ -1054,8 +1327,8 @@ jobs:
docker run --rm -v $HOME/.cache/trivy:/root/.cache/trivy \
-e TRIVY_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-db:2" \
-e TRIVY_JAVA_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-java-db:1" \
-e TRIVY_USERNAME="${{ secrets.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ secrets.DOCKER_TOKEN }}" \
-e TRIVY_USERNAME="${{ env.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ env.DOCKER_TOKEN }}" \
aquasec/trivy@sha256:a22415a38938a56c379387a8163fcb0ce38b10ace73e593475d3658d578b2436 \
image \
--skip-version-check \
@@ -1074,6 +1347,7 @@ jobs:
- run-id=${{ github.run_id }}-trivy-scan-backend
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'onyxdotapp/onyx-backend-cloud' || 'onyxdotapp/onyx-backend' }}
steps:
@@ -1084,6 +1358,20 @@ jobs:
with:
persist-credentials: false

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Run Trivy vulnerability scanner
uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # ratchet:nick-fields/retry@v3
with:
@@ -1100,8 +1388,8 @@ jobs:
-v ${{ github.workspace }}/backend/.trivyignore:/tmp/.trivyignore:ro \
-e TRIVY_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-db:2" \
-e TRIVY_JAVA_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-java-db:1" \
-e TRIVY_USERNAME="${{ secrets.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ secrets.DOCKER_TOKEN }}" \
-e TRIVY_USERNAME="${{ env.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ env.DOCKER_TOKEN }}" \
aquasec/trivy@sha256:a22415a38938a56c379387a8163fcb0ce38b10ace73e593475d3658d578b2436 \
image \
--skip-version-check \
@@ -1121,11 +1409,26 @@ jobs:
- run-id=${{ github.run_id }}-trivy-scan-model-server
- extras=ecr-cache
timeout-minutes: 90
environment: release
env:
REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'onyxdotapp/onyx-model-server-cloud' || 'onyxdotapp/onyx-model-server' }}
steps:
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
DOCKER_USERNAME, deploy/docker-username
DOCKER_TOKEN, deploy/docker-token
parse-json-secrets: true

- name: Run Trivy vulnerability scanner
uses: nick-fields/retry@ce71cc2ab81d554ebbe88c79ab5975992d79ba08 # ratchet:nick-fields/retry@v3
with:
@@ -1141,8 +1444,8 @@ jobs:
docker run --rm -v $HOME/.cache/trivy:/root/.cache/trivy \
-e TRIVY_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-db:2" \
-e TRIVY_JAVA_DB_REPOSITORY="public.ecr.aws/aquasecurity/trivy-java-db:1" \
-e TRIVY_USERNAME="${{ secrets.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ secrets.DOCKER_TOKEN }}" \
-e TRIVY_USERNAME="${{ env.DOCKER_USERNAME }}" \
-e TRIVY_PASSWORD="${{ env.DOCKER_TOKEN }}" \
aquasec/trivy@sha256:a22415a38938a56c379387a8163fcb0ce38b10ace73e593475d3658d578b2436 \
image \
--skip-version-check \
@@ -1170,12 +1473,26 @@ jobs:
# NOTE: Github-hosted runners have about 20s faster queue times and are preferred here.
runs-on: ubuntu-slim
timeout-minutes: 90
environment: release
steps:
- name: Checkout
uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # ratchet:actions/checkout@v6
with:
persist-credentials: false

- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
with:
role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
aws-region: us-east-2

- name: Get AWS Secrets
uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802
with:
secret-ids: |
MONITOR_DEPLOYMENTS_WEBHOOK, deploy/monitor-deployments-webhook
parse-json-secrets: true

- name: Determine failed jobs
id: failed-jobs
shell: bash
@@ -1241,7 +1558,7 @@ jobs:
- name: Send Slack notification
uses: ./.github/actions/slack-notify
with:
webhook-url: ${{ secrets.MONITOR_DEPLOYMENTS_WEBHOOK }}
webhook-url: ${{ env.MONITOR_DEPLOYMENTS_WEBHOOK }}
failed-jobs: ${{ steps.failed-jobs.outputs.jobs }}
title: "🚨 Deployment Workflow Failed"
ref-name: ${{ github.ref_name }}
.github/workflows/docker-tag-beta.yml (vendored, 2 changed lines)
@@ -21,7 +21,7 @@ jobs:
timeout-minutes: 45
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
.github/workflows/docker-tag-latest.yml (vendored, 2 changed lines)
@@ -21,7 +21,7 @@ jobs:
timeout-minutes: 45
steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3
.github/workflows/helm-chart-releases.yml (vendored, 1 changed line)
@@ -29,6 +29,7 @@ jobs:
run: |
helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
helm repo add onyx-vespa https://onyx-dot-app.github.io/vespa-helm-charts
helm repo add opensearch https://opensearch-project.github.io/helm-charts
helm repo add cloudnative-pg https://cloudnative-pg.github.io/charts
helm repo add ot-container-kit https://ot-container-kit.github.io/helm-charts
helm repo add minio https://charts.min.io/
.github/workflows/nightly-scan-licenses.yml (vendored, 2 changed lines)
@@ -94,7 +94,7 @@ jobs:

steps:
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

- name: Login to Docker Hub
uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # ratchet:docker/login-action@v3

@@ -45,6 +45,9 @@ env:
# TODO: debug why this is failing and enable
CODE_INTERPRETER_BASE_URL: http://localhost:8000

# OpenSearch
OPENSEARCH_ADMIN_PASSWORD: "StrongPassword123!"

jobs:
discover-test-dirs:
# NOTE: Github-hosted runners have about 20s faster queue times and are preferred here.
@@ -125,11 +128,13 @@ jobs:
docker compose \
-f docker-compose.yml \
-f docker-compose.dev.yml \
-f docker-compose.opensearch.yml \
up -d \
minio \
relational_db \
cache \
index \
opensearch \
code-interpreter

- name: Run migrations
@@ -158,7 +163,7 @@ jobs:
cd deployment/docker_compose

# Get list of running containers
containers=$(docker compose -f docker-compose.yml -f docker-compose.dev.yml ps -q)
containers=$(docker compose -f docker-compose.yml -f docker-compose.dev.yml -f docker-compose.opensearch.yml ps -q)

# Collect logs from each container
for container in $containers; do
.github/workflows/pr-helm-chart-testing.yml (vendored, 8 changed lines)
@@ -88,6 +88,7 @@ jobs:
echo "=== Adding Helm repositories ==="
helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
helm repo add vespa https://onyx-dot-app.github.io/vespa-helm-charts
helm repo add opensearch https://opensearch-project.github.io/helm-charts
helm repo add cloudnative-pg https://cloudnative-pg.github.io/charts
helm repo add ot-container-kit https://ot-container-kit.github.io/helm-charts
helm repo add minio https://charts.min.io/
@@ -180,6 +181,11 @@ jobs:
trap cleanup EXIT

# Run the actual installation with detailed logging
# Note that opensearch.enabled is true whereas others in this install
# are false. There is some work that needs to be done to get this
# entire step working in CI, enabling opensearch here is a small step
# in that direction. If this is causing issues, disabling it in this
# step should be ok in the short term.
echo "=== Starting ct install ==="
set +e
ct install --all \
@@ -187,6 +193,8 @@ jobs:
--set=nginx.enabled=false \
--set=minio.enabled=false \
--set=vespa.enabled=false \
--set=opensearch.enabled=true \
--set=auth.opensearch.enabled=true \
--set=slackbot.enabled=false \
--set=postgresql.enabled=true \
--set=postgresql.nameOverride=cloudnative-pg \
.github/workflows/pr-integration-tests.yml (vendored, 6 changed lines)
@@ -103,7 +103,7 @@ jobs:
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

# needed for pulling Vespa, Redis, Postgres, and Minio images
# otherwise, we hit the "Unauthenticated users" limit
@@ -163,7 +163,7 @@ jobs:
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

# needed for pulling Vespa, Redis, Postgres, and Minio images
# otherwise, we hit the "Unauthenticated users" limit
@@ -208,7 +208,7 @@ jobs:
persist-credentials: false

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

# needed for pulling openapitools/openapi-generator-cli
# otherwise, we hit the "Unauthenticated users" limit

@@ -95,7 +95,7 @@ jobs:
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

# needed for pulling Vespa, Redis, Postgres, and Minio images
# otherwise, we hit the "Unauthenticated users" limit
@@ -155,7 +155,7 @@ jobs:
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

# needed for pulling Vespa, Redis, Postgres, and Minio images
# otherwise, we hit the "Unauthenticated users" limit
@@ -214,7 +214,7 @@ jobs:
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

# needed for pulling openapitools/openapi-generator-cli
# otherwise, we hit the "Unauthenticated users" limit
.github/workflows/pr-playwright-tests.yml (vendored, 6 changed lines)
@@ -85,7 +85,7 @@ jobs:
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

# needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
# https://docs.docker.com/docker-hub/usage/
@@ -146,7 +146,7 @@ jobs:
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

# needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
# https://docs.docker.com/docker-hub/usage/
@@ -207,7 +207,7 @@ jobs:
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # ratchet:docker/setup-buildx-action@v3
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3

# needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
# https://docs.docker.com/docker-hub/usage/
.github/workflows/pr-python-model-tests.yml (vendored, 2 changed lines)
@@ -70,7 +70,7 @@ jobs:
password: ${{ secrets.DOCKER_TOKEN }}

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f

- name: Build and load
uses: docker/bake-action@5be5f02ff8819ecd3092ea6b2e6261c31774f2b4 # ratchet:docker/bake-action@v6
.gitignore (vendored, 3 changed lines)
@@ -1,5 +1,8 @@
# editors
.vscode
!/.vscode/env_template.txt
!/.vscode/launch.json
!/.vscode/tasks.template.jsonc
.zed
.cursor

@@ -66,7 +66,8 @@ repos:
- id: uv-run
name: Check lazy imports
args: ["--active", "--with=onyx-devtools", "ods", "check-lazy-imports"]
files: ^backend/(?!\.venv/).*\.py$
pass_filenames: true
files: ^backend/(?!\.venv/|scripts/).*\.py$
# NOTE: This takes ~6s on a single, large module which is prohibitively slow.
# - id: uv-run
# name: mypy
@@ -74,6 +75,13 @@ repos:
# pass_filenames: true
# files: ^backend/.*\.py$

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: 3e8a8703264a2f4a69428a0aa4dcb512790b2c8c # frozen: v6.0.0
hooks:
- id: check-added-large-files
name: Check for added large files
args: ["--maxkb=1500"]

- repo: https://github.com/rhysd/actionlint
rev: a443f344ff32813837fa49f7aa6cbc478d770e62 # frozen: v1.7.9
hooks:
.vscode/launch.template.jsonc → .vscode/launch.json (vendored, 116 changed lines)
@@ -1,5 +1,3 @@
/* Copy this file into '.vscode/launch.json' or merge its contents into your existing configurations. */

{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
@@ -24,7 +22,7 @@
"Slack Bot",
"Celery primary",
"Celery light",
"Celery background",
"Celery heavy",
"Celery docfetching",
"Celery docprocessing",
"Celery beat"
@@ -399,7 +397,6 @@
"onyx.background.celery.versioned_apps.docfetching",
"worker",
"--pool=threads",
"--concurrency=1",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=docfetching@%n",
@@ -430,7 +427,6 @@
"onyx.background.celery.versioned_apps.docprocessing",
"worker",
"--pool=threads",
"--concurrency=6",
"--prefetch-multiplier=1",
"--loglevel=INFO",
"--hostname=docprocessing@%n",
@@ -579,6 +575,116 @@
"group": "3"
}
},
{
"name": "Build Sandbox Templates",
"type": "debugpy",
"request": "launch",
"module": "onyx.server.features.build.sandbox.build_templates",
"cwd": "${workspaceFolder}/backend",
"envFile": "${workspaceFolder}/.vscode/.env",
"env": {
"PYTHONUNBUFFERED": "1",
"PYTHONPATH": "."
},
"console": "integratedTerminal",
"presentation": {
"group": "3"
},
"consoleTitle": "Build Sandbox Templates"
},
{
// Dummy entry used to label the group
"name": "--- Database ---",
"type": "node",
"request": "launch",
"presentation": {
"group": "4",
"order": 0
}
},
{
"name": "Clean restore seeded database dump (destructive)",
"type": "node",
"request": "launch",
"runtimeExecutable": "uv",
"runtimeArgs": [
"run",
"--with",
"onyx-devtools",
"ods",
"db",
"restore",
"--fetch-seeded",
"--clean",
"--yes"
],
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"presentation": {
"group": "4"
}
},
{
"name": "Create database snapshot",
"type": "node",
"request": "launch",
"runtimeExecutable": "uv",
"runtimeArgs": [
"run",
"--with",
"onyx-devtools",
"ods",
"db",
"dump",
"backup.dump"
],
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"presentation": {
"group": "4"
}
},
{
"name": "Clean restore database snapshot (destructive)",
"type": "node",
"request": "launch",
"runtimeExecutable": "uv",
"runtimeArgs": [
"run",
"--with",
"onyx-devtools",
"ods",
"db",
"restore",
"--clean",
"--yes",
"backup.dump"
],
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"presentation": {
"group": "4"
}
},
{
"name": "Upgrade database to head revision",
"type": "node",
"request": "launch",
"runtimeExecutable": "uv",
"runtimeArgs": [
"run",
"--with",
"onyx-devtools",
"ods",
"db",
"upgrade"
],
"cwd": "${workspaceFolder}",
"console": "integratedTerminal",
"presentation": {
"group": "4"
}
},
{
// script to generate the openapi schema
"name": "Onyx OpenAPI Schema Generator",
BUILD_CONNECTOR_AUDIT.md (new file, 91 lines)
@@ -0,0 +1,91 @@
# Build Mode Connector Audit Report

## Connectors in Build Mode
1. GoogleDrive
2. Gmail
3. Notion
4. GitHub
5. Slack
6. Linear
7. Fireflies
8. Hubspot

## Issues Found

### Issue 1: Missing `groups` parameter in connector creation
**Location**: `/web/src/app/build/v1/configure/utils/createBuildConnector.ts:52-61`

**Problem**: The `createConnector` call passes `access_type: "private"` but doesn't include `groups: []`. While the backend defaults to an empty list, it's better to be explicit.

**Status**: ✅ Will fix

### Issue 2: Step flow logic - Advanced values only connectors show as 2-step
**Location**: `/web/src/app/build/v1/configure/components/ConfigureConnectorModal.tsx:24-36`

**Problem**: The `connectorNeedsConfigStep` function checks both `values` and `advanced_values`. Connectors that only have `advanced_values` (like Slack) incorrectly show as 2-step when they should be 1-step.

**Connectors affected**:
- **Slack**: `values: []`, `advanced_values: [channels, channel_regex_enabled]` → Currently 2-step, should be 1-step

**Status**: ✅ Will fix
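A minimal sketch of what the Issue 2 fix could look like; the `ConnectorUIConfig` shape and field names below are illustrative assumptions for this audit, not the actual types used in `ConfigureConnectorModal.tsx`:

```typescript
// Hypothetical sketch of the adjusted check (the repo's real types may differ).
interface ConnectorUIConfig {
  values: unknown[]; // fields rendered on the main configuration step
  advanced_values: unknown[]; // optional/advanced fields only
}

// Before: a connector with only advanced_values (e.g. Slack) was treated as 2-step.
// After: only a non-empty `values` list forces the extra configuration step.
function connectorNeedsConfigStep(config: ConnectorUIConfig): boolean {
  return config.values.length > 0;
}
```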
## Connector-by-Connector Analysis

### 1. GoogleDrive ✅
- **Config**: Has `values` (indexing_scope tab with fields) and `advanced_values` (specific_user_emails, exclude_domain_link_only)
- **Step Flow**: 2-step ✅ CORRECT
- **access_type**: ✅ Included in createConnector call
- **groups**: ❌ Missing (will add)

### 2. Gmail ✅
- **Config**: `values: []`, `advanced_values: []`
- **Step Flow**: 1-step ✅ CORRECT
- **access_type**: ✅ Included in createConnector call
- **groups**: ❌ Missing (will add)

### 3. Notion ✅
- **Config**: Has `values` (root_page_id), `advanced_values: []`
- **Step Flow**: 2-step ✅ CORRECT
- **access_type**: ✅ Included in createConnector call
- **groups**: ❌ Missing (will add)

### 4. GitHub ✅
- **Config**: Has `values` (repo_owner, github_mode, include_prs, include_issues), `advanced_values: []`
- **Step Flow**: 2-step ✅ CORRECT
- **access_type**: ✅ Included in createConnector call
- **groups**: ❌ Missing (will add)

### 5. Slack ❌
- **Config**: `values: []`, `advanced_values: [channels, channel_regex_enabled]`
- **Step Flow**: 2-step ❌ INCORRECT (should be 1-step)
- **access_type**: ✅ Included in createConnector call
- **groups**: ❌ Missing (will add)
- **Fix**: Update `connectorNeedsConfigStep` to only check `values`, not `advanced_values`

### 6. Linear ✅
- **Config**: `values: []`, `advanced_values: []`
- **Step Flow**: 1-step ✅ CORRECT
- **access_type**: ✅ Included in createConnector call
- **groups**: ❌ Missing (will add)

### 7. Fireflies ✅
- **Config**: `values: []`, `advanced_values: []`
- **Step Flow**: 1-step ✅ CORRECT
- **access_type**: ✅ Included in createConnector call
- **groups**: ❌ Missing (will add)

### 8. Hubspot ✅
- **Config**: Has `values` (object_types), `advanced_values: []`
- **Step Flow**: 2-step ✅ CORRECT
- **access_type**: ✅ Included in createConnector call
- **groups**: ❌ Missing (will add)

## Summary
- **Total connectors**: 8
- **Step flow issues**: 1 (Slack)
- **access_type issues**: 0 (all connectors include it)
- **groups issues**: 8 (all missing, but backend defaults to [])

## Fixes Required
1. Add `groups: []` to `createConnector` call in `createBuildConnector.ts`
2. Update `connectorNeedsConfigStep` to only check `values`, not `advanced_values`
|
||||
@@ -37,10 +37,6 @@ CVE-2023-50868
|
||||
CVE-2023-52425
|
||||
CVE-2024-28757
|
||||
|
||||
# sqlite, only used by NLTK library to grab word lemmatizer and stopwords
|
||||
# No impact in our settings
|
||||
CVE-2023-7104
|
||||
|
||||
# libharfbuzz0b, O(n^2) growth, worst case is denial of service
|
||||
# Accept the risk
|
||||
CVE-2023-25193
|
||||
|
||||
@@ -89,12 +89,6 @@ RUN uv pip install --system --no-cache-dir --upgrade \
|
||||
RUN python -c "from tokenizers import Tokenizer; \
|
||||
Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"
|
||||
|
||||
# Pre-downloading NLTK for setups with limited egress
|
||||
RUN python -c "import nltk; \
|
||||
nltk.download('stopwords', quiet=True); \
|
||||
nltk.download('punkt_tab', quiet=True);"
|
||||
# nltk.download('wordnet', quiet=True); introduce this back if lemmatization is needed
|
||||
|
||||
# Pre-downloading tiktoken for setups with limited egress
|
||||
RUN python -c "import tiktoken; \
|
||||
tiktoken.get_encoding('cl100k_base')"
|
||||
|
||||
150
backend/Dockerfile.sandbox-templates
Normal file
@@ -0,0 +1,150 @@
|
||||
FROM python:3.11.7-slim-bookworm
|
||||
|
||||
LABEL com.danswer.maintainer="founders@onyx.app"
|
||||
LABEL com.danswer.description="This image is the sandbox-templates container of Onyx which \
|
||||
contains code for both the Community and Enterprise editions of Onyx. If you do not \
|
||||
have a contract or agreement with DanswerAI, you are not permitted to use the Enterprise \
|
||||
Edition features outside of personal development or testing purposes. Please reach out to \
|
||||
founders@onyx.app for more information. Please visit https://github.com/onyx-dot-app/onyx"
|
||||
|
||||
# DO_NOT_TRACK is used to disable telemetry for Unstructured
|
||||
ENV DANSWER_RUNNING_IN_DOCKER="true" \
|
||||
DO_NOT_TRACK="true" \
|
||||
PLAYWRIGHT_BROWSERS_PATH="/app/.cache/ms-playwright"
|
||||
|
||||
# Create non-root user for security best practices
|
||||
RUN groupadd -g 1001 onyx && \
|
||||
useradd -u 1001 -g onyx -m -s /bin/bash onyx && \
|
||||
mkdir -p /var/log/onyx && \
|
||||
chmod 755 /var/log/onyx && \
|
||||
chown onyx:onyx /var/log/onyx
|
||||
|
||||
COPY --from=ghcr.io/astral-sh/uv:0.9.9 /uv /uvx /bin/
|
||||
|
||||
# Install system dependencies
|
||||
# cmake needed for psycopg (postgres)
|
||||
# libpq-dev needed for psycopg (postgres)
|
||||
# curl included just for users' convenience
|
||||
# zip for Vespa step further down
|
||||
# ca-certificates for HTTPS
|
||||
# nodejs and npm needed for building Next.js template
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
cmake \
|
||||
curl \
|
||||
zip \
|
||||
ca-certificates \
|
||||
libgnutls30 \
|
||||
libblkid1 \
|
||||
libmount1 \
|
||||
libsmartcols1 \
|
||||
libuuid1 \
|
||||
libxmlsec1-dev \
|
||||
pkg-config \
|
||||
gcc \
|
||||
nano \
|
||||
vim \
|
||||
nodejs \
|
||||
npm && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
apt-get clean
|
||||
|
||||
|
||||
|
||||
# Install Python dependencies
|
||||
# Remove py which is pulled in by retry, py is not needed and is a CVE
|
||||
COPY ./requirements/default.txt /tmp/requirements.txt
|
||||
COPY ./requirements/ee.txt /tmp/ee-requirements.txt
|
||||
RUN uv pip install --system --no-cache-dir --upgrade \
|
||||
-r /tmp/requirements.txt \
|
||||
-r /tmp/ee-requirements.txt && \
|
||||
pip uninstall -y py && \
|
||||
playwright install chromium && \
|
||||
playwright install-deps chromium && \
|
||||
chown -R onyx:onyx /app && \
|
||||
ln -s /usr/local/bin/supervisord /usr/bin/supervisord && \
|
||||
# Cleanup for CVEs and size reduction
|
||||
# https://github.com/tornadoweb/tornado/issues/3107
|
||||
# xserver-common and xvfb included by playwright installation but not needed after
|
||||
# perl-base is part of the base Python Debian image but not needed for Onyx functionality
|
||||
# perl-base could only be removed with --allow-remove-essential
|
||||
apt-get update && \
|
||||
apt-get remove -y --allow-remove-essential \
|
||||
perl-base \
|
||||
xserver-common \
|
||||
xvfb \
|
||||
cmake \
|
||||
libldap-2.5-0 \
|
||||
libxmlsec1-dev \
|
||||
pkg-config \
|
||||
gcc && \
|
||||
# Install here to avoid some packages being cleaned up above
|
||||
apt-get install -y \
|
||||
libxmlsec1-openssl \
|
||||
# Install postgresql-client for easy manual tests
|
||||
postgresql-client && \
|
||||
apt-get autoremove -y && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
rm -rf ~/.cache/uv /tmp/*.txt && \
|
||||
rm -f /usr/local/lib/python3.11/site-packages/tornado/test/test.key
|
||||
|
||||
# Build sandbox templates
|
||||
# Copy web template from the backend package (versioned with code)
|
||||
COPY --chown=onyx:onyx ./onyx/server/features/build/templates/outputs/web /templates/outputs/web
|
||||
|
||||
# Build Python venv template
|
||||
COPY --chown=onyx:onyx ./onyx/server/features/build/sandbox/kubernetes/docker/initial-requirements.txt /tmp/sandbox-requirements.txt
|
||||
COPY --chown=onyx:onyx ./onyx/server/features/build/sandbox/util/build_venv_template.py /tmp/build_venv_template.py
|
||||
RUN mkdir -p /templates && \
|
||||
python /tmp/build_venv_template.py --requirements /tmp/sandbox-requirements.txt && \
|
||||
chown -R onyx:onyx /templates && \
|
||||
rm -f /tmp/build_venv_template.py /tmp/sandbox-requirements.txt
|
||||
|
||||
# Pre-downloading models for setups with limited egress
|
||||
RUN python -c "from tokenizers import Tokenizer; \
|
||||
Tokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1')"
|
||||
|
||||
# Pre-downloading NLTK for setups with limited egress
|
||||
RUN python -c "import nltk; \
|
||||
nltk.download('stopwords', quiet=True); \
|
||||
nltk.download('punkt_tab', quiet=True);"
|
||||
# nltk.download('wordnet', quiet=True); introduce this back if lemmatization is needed
|
||||
|
||||
# Pre-downloading tiktoken for setups with limited egress
|
||||
RUN python -c "import tiktoken; \
|
||||
tiktoken.get_encoding('cl100k_base')"
|
||||
|
||||
# Set up application files
|
||||
WORKDIR /app
|
||||
|
||||
# Enterprise Version Files
|
||||
COPY --chown=onyx:onyx ./ee /app/ee
|
||||
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
|
||||
|
||||
# Set up application files
|
||||
COPY --chown=onyx:onyx ./onyx /app/onyx
|
||||
COPY --chown=onyx:onyx ./shared_configs /app/shared_configs
|
||||
COPY --chown=onyx:onyx ./alembic /app/alembic
|
||||
COPY --chown=onyx:onyx ./alembic_tenants /app/alembic_tenants
|
||||
COPY --chown=onyx:onyx ./alembic.ini /app/alembic.ini
|
||||
COPY supervisord.conf /usr/etc/supervisord.conf
|
||||
COPY --chown=onyx:onyx ./static /app/static
|
||||
|
||||
# Escape hatch scripts
|
||||
COPY --chown=onyx:onyx ./scripts/debugging /app/scripts/debugging
|
||||
COPY --chown=onyx:onyx ./scripts/force_delete_connector_by_id.py /app/scripts/force_delete_connector_by_id.py
|
||||
COPY --chown=onyx:onyx ./scripts/supervisord_entrypoint.sh /app/scripts/supervisord_entrypoint.sh
|
||||
RUN chmod +x /app/scripts/supervisord_entrypoint.sh
|
||||
|
||||
# Put logo in assets
|
||||
COPY --chown=onyx:onyx ./assets /app/assets
|
||||
|
||||
ENV PYTHONPATH=/app
|
||||
|
||||
# Default ONYX_VERSION, typically overridden during builds by GitHub Actions.
|
||||
ARG ONYX_VERSION=0.0.0-dev
|
||||
ENV ONYX_VERSION=${ONYX_VERSION}
|
||||
|
||||
# Default command which does nothing
|
||||
# This container is used by api server and background which specify their own CMD
|
||||
CMD ["tail", "-f", "/dev/null"]
|
||||
@@ -0,0 +1,33 @@
|
||||
"""add_processing_mode_to_connector_credential_pair
|
||||
|
||||
Revision ID: 0ab5805121ef
|
||||
Revises: 7cd906f37fc6
|
||||
Create Date: 2026-01-20 15:49:44.136116
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "0ab5805121ef"
|
||||
down_revision = "7cd906f37fc6"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"connector_credential_pair",
|
||||
sa.Column(
|
||||
"processing_mode",
|
||||
sa.String(),
|
||||
nullable=False,
|
||||
server_default="REGULAR",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("connector_credential_pair", "processing_mode")
|
||||
@@ -0,0 +1,121 @@
|
||||
"""snapshot_use_sandbox_id_foreign_key
|
||||
|
||||
Revision ID: 111d7192d457
|
||||
Revises: 0ab5805121ef
|
||||
Create Date: 2026-01-22 16:21:41.711611
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "111d7192d457"
|
||||
down_revision = "0ab5805121ef"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Add sandbox_id column (nullable initially for data migration)
|
||||
op.add_column(
|
||||
"snapshot",
|
||||
sa.Column(
|
||||
"sandbox_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
|
||||
# Populate sandbox_id from sandbox table via session_id
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE snapshot
|
||||
SET sandbox_id = sandbox.id
|
||||
FROM sandbox
|
||||
WHERE snapshot.session_id = sandbox.session_id
|
||||
"""
|
||||
)
|
||||
|
||||
# Make sandbox_id not nullable
|
||||
op.alter_column("snapshot", "sandbox_id", nullable=False)
|
||||
|
||||
# Add foreign key constraint for sandbox_id
|
||||
op.create_foreign_key(
|
||||
"snapshot_sandbox_id_fkey",
|
||||
"snapshot",
|
||||
"sandbox",
|
||||
["sandbox_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
# Drop the old index that used session_id
|
||||
op.drop_index("ix_snapshot_session_created", table_name="snapshot")
|
||||
|
||||
# Drop the foreign key constraint on session_id
|
||||
op.drop_constraint("snapshot_session_id_fkey", "snapshot", type_="foreignkey")
|
||||
|
||||
# Drop the session_id column
|
||||
op.drop_column("snapshot", "session_id")
|
||||
|
||||
# Create new index using sandbox_id
|
||||
op.create_index(
|
||||
"ix_snapshot_sandbox_created",
|
||||
"snapshot",
|
||||
["sandbox_id", sa.text("created_at DESC")],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop the new index
|
||||
op.drop_index("ix_snapshot_sandbox_created", table_name="snapshot")
|
||||
|
||||
# Add session_id column back (nullable initially for data migration)
|
||||
op.add_column(
|
||||
"snapshot",
|
||||
sa.Column(
|
||||
"session_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
|
||||
# Populate session_id from sandbox table
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE snapshot
|
||||
SET session_id = sandbox.session_id
|
||||
FROM sandbox
|
||||
WHERE snapshot.sandbox_id = sandbox.id
|
||||
"""
|
||||
)
|
||||
|
||||
# Make session_id not nullable
|
||||
op.alter_column("snapshot", "session_id", nullable=False)
|
||||
|
||||
# Add foreign key constraint for session_id
|
||||
op.create_foreign_key(
|
||||
"snapshot_session_id_fkey",
|
||||
"snapshot",
|
||||
"build_session",
|
||||
["session_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
# Recreate the old index
|
||||
op.create_index(
|
||||
"ix_snapshot_session_created",
|
||||
"snapshot",
|
||||
["session_id", sa.text("created_at DESC")],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
# Drop the foreign key constraint on sandbox_id
|
||||
op.drop_constraint("snapshot_sandbox_id_fkey", "snapshot", type_="foreignkey")
|
||||
|
||||
# Drop the sandbox_id column
|
||||
op.drop_column("snapshot", "sandbox_id")
|
||||
@@ -0,0 +1,89 @@
|
||||
"""create_build_message_table
|
||||
|
||||
Revision ID: 26b589bf8be7
|
||||
Revises: df6cbd9a37cc
|
||||
Create Date: 2026-01-19 17:51:08.289325
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "26b589bf8be7"
|
||||
down_revision = "df6cbd9a37cc"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Reuse existing messagetype enum from chat_message table
|
||||
# Build messages only use: USER, ASSISTANT, SYSTEM
|
||||
# Note: The existing enum has uppercase values but MessageType in code uses lowercase
|
||||
# This works because SQLAlchemy handles the conversion when native_enum=False
|
||||
|
||||
# Create build_message table
|
||||
# Schema notes:
|
||||
# - turn_index: 0-indexed user message number; groups all assistant responses
|
||||
# (tool calls, thoughts, messages, plans) under the user prompt they respond to
|
||||
# - message_metadata: Required JSONB field containing the raw ACP packet JSON
|
||||
# - No content column: all data is stored in message_metadata
|
||||
op.create_table(
|
||||
"build_message",
|
||||
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column(
|
||||
"session_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("build_session.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"turn_index",
|
||||
sa.Integer(),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"type",
|
||||
sa.Enum(
|
||||
"SYSTEM",
|
||||
"USER",
|
||||
"ASSISTANT",
|
||||
"DANSWER",
|
||||
name="messagetype",
|
||||
create_type=False,
|
||||
native_enum=False,
|
||||
),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"message_metadata",
|
||||
postgresql.JSONB(),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
|
||||
# Create composite index for efficient turn-based queries
|
||||
# Orders by session_id, turn_index, then created_at for proper message ordering
|
||||
op.create_index(
|
||||
"ix_build_message_session_turn",
|
||||
"build_message",
|
||||
["session_id", "turn_index", sa.text("created_at ASC")],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop index
|
||||
op.drop_index("ix_build_message_session_turn", table_name="build_message")
|
||||
|
||||
# Drop table
|
||||
op.drop_table("build_message")
|
||||
@@ -0,0 +1,42 @@
|
||||
"""add_unique_constraint_to_inputprompt_prompt_user_id
|
||||
|
||||
Revision ID: 2c2430828bdf
|
||||
Revises: fb80bdd256de
|
||||
Create Date: 2026-01-20 16:01:54.314805
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "2c2430828bdf"
|
||||
down_revision = "fb80bdd256de"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Create unique constraint on (prompt, user_id) for user-owned prompts
|
||||
# This ensures each user can only have one shortcut with a given name
|
||||
op.create_unique_constraint(
|
||||
"uq_inputprompt_prompt_user_id",
|
||||
"inputprompt",
|
||||
["prompt", "user_id"],
|
||||
)
|
||||
|
||||
# Create partial unique index for public prompts (where user_id IS NULL)
|
||||
# PostgreSQL unique constraints don't enforce uniqueness for NULL values,
|
||||
# so we need a partial index to ensure public prompt names are also unique
|
||||
op.execute(
|
||||
"""
|
||||
CREATE UNIQUE INDEX uq_inputprompt_prompt_public
|
||||
ON inputprompt (prompt)
|
||||
WHERE user_id IS NULL
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.execute("DROP INDEX IF EXISTS uq_inputprompt_prompt_public")
|
||||
op.drop_constraint("uq_inputprompt_prompt_user_id", "inputprompt", type_="unique")
|
||||
@@ -0,0 +1,29 @@
|
||||
"""remove default prompt shortcuts
|
||||
|
||||
Revision ID: 41fa44bef321
|
||||
Revises: 2c2430828bdf
|
||||
Create Date: 2025-01-21
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "41fa44bef321"
|
||||
down_revision = "2c2430828bdf"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Delete any user associations for the default prompts first (foreign key constraint)
|
||||
op.execute(
|
||||
"DELETE FROM inputprompt__user WHERE input_prompt_id IN (SELECT id FROM inputprompt WHERE id < 0)"
|
||||
)
|
||||
# Delete the pre-seeded default prompt shortcuts (they have negative IDs)
|
||||
op.execute("DELETE FROM inputprompt WHERE id < 0")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# We don't restore the default prompts on downgrade
|
||||
pass
|
||||
@@ -0,0 +1,84 @@
|
||||
"""create_sandbox_table
|
||||
|
||||
Revision ID: 484b9fa1ac89
|
||||
Revises: 96086064c5db
|
||||
Create Date: 2026-01-19 14:47:52.829749
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "484b9fa1ac89"
|
||||
down_revision = "96086064c5db"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Create sandbox status enum
|
||||
sandbox_status_enum = sa.Enum(
|
||||
"provisioning",
|
||||
"running",
|
||||
"idle",
|
||||
"terminated",
|
||||
name="sandboxstatus",
|
||||
native_enum=False,
|
||||
)
|
||||
|
||||
# Create sandbox table
|
||||
op.create_table(
|
||||
"sandbox",
|
||||
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column(
|
||||
"session_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("build_session.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
unique=True,
|
||||
),
|
||||
sa.Column("container_id", sa.String(), nullable=True),
|
||||
sa.Column(
|
||||
"status",
|
||||
sandbox_status_enum,
|
||||
nullable=False,
|
||||
server_default="provisioning",
|
||||
),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("last_heartbeat", sa.DateTime(timezone=True), nullable=True),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
|
||||
# Create indexes for sandbox
|
||||
op.create_index(
|
||||
"ix_sandbox_status",
|
||||
"sandbox",
|
||||
["status"],
|
||||
unique=False,
|
||||
)
|
||||
op.create_index(
|
||||
"ix_sandbox_container_id",
|
||||
"sandbox",
|
||||
["container_id"],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop indexes
|
||||
op.drop_index("ix_sandbox_container_id", table_name="sandbox")
|
||||
op.drop_index("ix_sandbox_status", table_name="sandbox")
|
||||
|
||||
# Drop table
|
||||
op.drop_table("sandbox")
|
||||
|
||||
# Drop enum
|
||||
sa.Enum(name="sandboxstatus").drop(op.get_bind(), checkfirst=True)
|
||||
@@ -0,0 +1,268 @@
|
||||
"""User shared sandbox - Phase 1 schema changes
|
||||
|
||||
Changes Sandbox from session-owned to user-owned (one sandbox per user),
|
||||
and changes Snapshot from sandbox-linked to session-linked.
|
||||
|
||||
Sandbox table:
|
||||
- Remove: session_id (unique FK to BuildSession)
|
||||
- Add: user_id (FK to User, NOT NULL, unique)
|
||||
|
||||
Snapshot table:
|
||||
- Remove: sandbox_id (FK to Sandbox)
|
||||
- Add: session_id (FK to BuildSession, NOT NULL, ondelete=CASCADE)
|
||||
- Update index: ix_snapshot_sandbox_created -> ix_snapshot_session_created
|
||||
|
||||
Revision ID: 6db00b8237e5
|
||||
Revises: 111d7192d457
|
||||
Create Date: 2026-01-23 12:00:00.000000
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "6db00b8237e5"
|
||||
down_revision = "111d7192d457"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# ==========================================================================
|
||||
# SNAPSHOT: Change from sandbox_id to session_id
|
||||
# Must be done BEFORE dropping sandbox.session_id (needed for data migration)
|
||||
# ==========================================================================
|
||||
|
||||
# 1. Add session_id column (nullable initially for data migration)
|
||||
op.add_column(
|
||||
"snapshot",
|
||||
sa.Column(
|
||||
"session_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
|
||||
# 2. Populate session_id from sandbox.session_id via snapshot.sandbox_id
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE snapshot
|
||||
SET session_id = sandbox.session_id
|
||||
FROM sandbox
|
||||
WHERE snapshot.sandbox_id = sandbox.id
|
||||
"""
|
||||
)
|
||||
|
||||
# 3. Make session_id not nullable
|
||||
op.alter_column("snapshot", "session_id", nullable=False)
|
||||
|
||||
# 4. Add FK constraint for session_id
|
||||
op.create_foreign_key(
|
||||
"snapshot_session_id_fkey",
|
||||
"snapshot",
|
||||
"build_session",
|
||||
["session_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
# 5. Drop old index
|
||||
op.drop_index("ix_snapshot_sandbox_created", table_name="snapshot")
|
||||
|
||||
# 6. Drop FK constraint on sandbox_id
|
||||
op.drop_constraint("snapshot_sandbox_id_fkey", "snapshot", type_="foreignkey")
|
||||
|
||||
# 7. Drop sandbox_id column
|
||||
op.drop_column("snapshot", "sandbox_id")
|
||||
|
||||
# 8. Create new index
|
||||
op.create_index(
|
||||
"ix_snapshot_session_created",
|
||||
"snapshot",
|
||||
["session_id", sa.text("created_at DESC")],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
# ==========================================================================
|
||||
# SANDBOX: Change from session_id to user_id
|
||||
# ==========================================================================
|
||||
|
||||
# 1. Add user_id column (nullable initially for data migration)
|
||||
op.add_column(
|
||||
"sandbox",
|
||||
sa.Column(
|
||||
"user_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
|
||||
# 2. Populate user_id from build_session.user_id via sandbox.session_id
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE sandbox
|
||||
SET user_id = build_session.user_id
|
||||
FROM build_session
|
||||
WHERE sandbox.session_id = build_session.id
|
||||
"""
|
||||
)
|
||||
|
||||
# 3. Delete any sandboxes that couldn't be mapped (orphaned data)
|
||||
# This handles sandboxes whose sessions have NULL user_id or were deleted
|
||||
op.execute(
|
||||
"""
|
||||
DELETE FROM sandbox WHERE user_id IS NULL
|
||||
"""
|
||||
)
|
||||
|
||||
# 4. Make user_id not nullable
|
||||
op.alter_column("sandbox", "user_id", nullable=False)
|
||||
|
||||
# 5. Drop the unique constraint on session_id
|
||||
op.drop_constraint("sandbox_session_id_key", "sandbox", type_="unique")
|
||||
|
||||
# 6. Drop FK constraint on session_id
|
||||
op.drop_constraint("sandbox_session_id_fkey", "sandbox", type_="foreignkey")
|
||||
|
||||
# 7. Drop session_id column
|
||||
op.drop_column("sandbox", "session_id")
|
||||
|
||||
# 8. Add FK constraint for user_id
|
||||
op.create_foreign_key(
|
||||
"sandbox_user_id_fkey",
|
||||
"sandbox",
|
||||
"user",
|
||||
["user_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
# 9. Add unique constraint on user_id (one sandbox per user)
|
||||
op.create_unique_constraint("sandbox_user_id_key", "sandbox", ["user_id"])
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# ==========================================================================
|
||||
# SANDBOX: Change back from user_id to session_id
|
||||
# ==========================================================================
|
||||
|
||||
# 1. Drop unique constraint on user_id
|
||||
op.drop_constraint("sandbox_user_id_key", "sandbox", type_="unique")
|
||||
|
||||
# 2. Drop FK constraint on user_id
|
||||
op.drop_constraint("sandbox_user_id_fkey", "sandbox", type_="foreignkey")
|
||||
|
||||
# 3. Add session_id column back (nullable initially)
|
||||
op.add_column(
|
||||
"sandbox",
|
||||
sa.Column(
|
||||
"session_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
|
||||
# 4. NOTE: Cannot reliably restore session_id data since the relationship
|
||||
# is now one-to-many (user can have multiple sessions).
|
||||
# Best-effort mapping: point each sandbox at the user's most recent build session.
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE sandbox
|
||||
SET session_id = (
|
||||
SELECT build_session.id
|
||||
FROM build_session
|
||||
WHERE build_session.user_id = sandbox.user_id
|
||||
ORDER BY build_session.created_at DESC
|
||||
LIMIT 1
|
||||
)
|
||||
"""
|
||||
)
|
||||
|
||||
# 5. Delete sandboxes that couldn't be mapped
|
||||
op.execute(
|
||||
"""
|
||||
DELETE FROM sandbox WHERE session_id IS NULL
|
||||
"""
|
||||
)
|
||||
|
||||
# 6. Make session_id not nullable
|
||||
op.alter_column("sandbox", "session_id", nullable=False)
|
||||
|
||||
# 7. Add FK constraint for session_id
|
||||
op.create_foreign_key(
|
||||
"sandbox_session_id_fkey",
|
||||
"sandbox",
|
||||
"build_session",
|
||||
["session_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
# 8. Add unique constraint on session_id
|
||||
op.create_unique_constraint("sandbox_session_id_key", "sandbox", ["session_id"])
|
||||
|
||||
# 9. Drop user_id column
|
||||
op.drop_column("sandbox", "user_id")
|
||||
|
||||
# ==========================================================================
|
||||
# SNAPSHOT: Change back from session_id to sandbox_id
|
||||
# ==========================================================================
|
||||
|
||||
# 1. Drop new index
|
||||
op.drop_index("ix_snapshot_session_created", table_name="snapshot")
|
||||
|
||||
# 2. Add sandbox_id column back (nullable initially)
|
||||
op.add_column(
|
||||
"snapshot",
|
||||
sa.Column(
|
||||
"sandbox_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
|
||||
# 3. Populate sandbox_id from the newly restored sandbox.session_id
|
||||
op.execute(
|
||||
"""
|
||||
UPDATE snapshot
|
||||
SET sandbox_id = sandbox.id
|
||||
FROM sandbox
|
||||
WHERE snapshot.session_id = sandbox.session_id
|
||||
"""
|
||||
)
|
||||
|
||||
# 4. Delete snapshots that couldn't be mapped
|
||||
op.execute(
|
||||
"""
|
||||
DELETE FROM snapshot WHERE sandbox_id IS NULL
|
||||
"""
|
||||
)
|
||||
|
||||
# 5. Make sandbox_id not nullable
|
||||
op.alter_column("snapshot", "sandbox_id", nullable=False)
|
||||
|
||||
# 6. Add FK constraint for sandbox_id
|
||||
op.create_foreign_key(
|
||||
"snapshot_sandbox_id_fkey",
|
||||
"snapshot",
|
||||
"sandbox",
|
||||
["sandbox_id"],
|
||||
["id"],
|
||||
ondelete="CASCADE",
|
||||
)
|
||||
|
||||
# 7. Create old index
|
||||
op.create_index(
|
||||
"ix_snapshot_sandbox_created",
|
||||
"snapshot",
|
||||
["sandbox_id", sa.text("created_at DESC")],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
# 8. Drop FK constraint on session_id
|
||||
op.drop_constraint("snapshot_session_id_fkey", "snapshot", type_="foreignkey")
|
||||
|
||||
# 9. Drop session_id column
|
||||
op.drop_column("snapshot", "session_id")
|
||||
@@ -0,0 +1,37 @@
|
||||
"""move nextjs_port from sandbox to build_session
|
||||
|
||||
Revision ID: 76dd1eb17d31
|
||||
Revises: 6db00b8237e5
|
||||
Create Date: 2026-01-23 16:24:36.965851
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "76dd1eb17d31"
|
||||
down_revision = "6db00b8237e5"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Add nextjs_port column to build_session (per-session port allocation)
|
||||
op.add_column(
|
||||
"build_session",
|
||||
sa.Column("nextjs_port", sa.Integer(), nullable=True),
|
||||
)
|
||||
# Drop nextjs_port column from sandbox (no longer needed at sandbox level)
|
||||
op.drop_column("sandbox", "nextjs_port")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Add nextjs_port back to sandbox
|
||||
op.add_column(
|
||||
"sandbox",
|
||||
sa.Column("nextjs_port", sa.Integer(), nullable=True),
|
||||
)
|
||||
# Drop nextjs_port from build_session
|
||||
op.drop_column("build_session", "nextjs_port")
|
||||
@@ -0,0 +1,27 @@
|
||||
"""add nextjs_port to sandbox
|
||||
|
||||
Revision ID: 7cd906f37fc6
|
||||
Revises: 26b589bf8be7
|
||||
Create Date: 2026-01-20
|
||||
|
||||
"""
|
||||
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "7cd906f37fc6"
|
||||
down_revision: Union[str, None] = "26b589bf8be7"
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column("sandbox", sa.Column("nextjs_port", sa.Integer(), nullable=True))
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("sandbox", "nextjs_port")
|
||||
@@ -0,0 +1,86 @@
|
||||
"""create_build_session_table
|
||||
|
||||
Revision ID: 96086064c5db
|
||||
Revises: 41fa44bef321
|
||||
Create Date: 2026-01-19 14:47:38.156803
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "96086064c5db"
|
||||
down_revision = "41fa44bef321"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Create build_session status enum
|
||||
build_session_status_enum = sa.Enum(
|
||||
"active",
|
||||
"idle",
|
||||
name="buildsessionstatus",
|
||||
native_enum=False,
|
||||
)
|
||||
|
||||
# Create build_session table
|
||||
op.create_table(
|
||||
"build_session",
|
||||
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column(
|
||||
"user_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("user.id", ondelete="CASCADE"),
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column("name", sa.String(), nullable=True),
|
||||
sa.Column(
|
||||
"status",
|
||||
build_session_status_enum,
|
||||
nullable=False,
|
||||
server_default="active",
|
||||
),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"last_activity_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
|
||||
# Create indexes for build_session
|
||||
op.create_index(
|
||||
"ix_build_session_user_created",
|
||||
"build_session",
|
||||
["user_id", sa.text("created_at DESC")],
|
||||
unique=False,
|
||||
)
|
||||
op.create_index(
|
||||
"ix_build_session_status",
|
||||
"build_session",
|
||||
["status"],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop indexes
|
||||
op.drop_index("ix_build_session_status", table_name="build_session")
|
||||
op.drop_index("ix_build_session_user_created", table_name="build_session")
|
||||
|
||||
# Drop table
|
||||
op.drop_table("build_session")
|
||||
|
||||
# Drop enum
|
||||
sa.Enum(name="buildsessionstatus").drop(op.get_bind(), checkfirst=True)
|
||||
@@ -0,0 +1,86 @@
|
||||
"""create_artifact_table
|
||||
|
||||
Revision ID: a441232d9c5a
|
||||
Revises: 484b9fa1ac89
|
||||
Create Date: 2026-01-19 14:47:57.226496
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "a441232d9c5a"
|
||||
down_revision = "484b9fa1ac89"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Create artifact type enum
|
||||
artifact_type_enum = sa.Enum(
|
||||
"web_app",
|
||||
"pptx",
|
||||
"docx",
|
||||
"markdown",
|
||||
"excel",
|
||||
"image",
|
||||
name="artifacttype",
|
||||
native_enum=False,
|
||||
)
|
||||
|
||||
# Create artifact table
|
||||
op.create_table(
|
||||
"artifact",
|
||||
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column(
|
||||
"session_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("build_session.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("type", artifact_type_enum, nullable=False),
|
||||
sa.Column("path", sa.String(), nullable=False),
|
||||
sa.Column("name", sa.String(), nullable=False),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"updated_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
|
||||
# Create indexes for artifact
|
||||
op.create_index(
|
||||
"ix_artifact_session_created",
|
||||
"artifact",
|
||||
["session_id", sa.text("created_at DESC")],
|
||||
unique=False,
|
||||
)
|
||||
op.create_index(
|
||||
"ix_artifact_type",
|
||||
"artifact",
|
||||
["type"],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop indexes
|
||||
op.drop_index("ix_artifact_type", table_name="artifact")
|
||||
op.drop_index("ix_artifact_session_created", table_name="artifact")
|
||||
|
||||
# Drop table
|
||||
op.drop_table("artifact")
|
||||
|
||||
# Drop enum
|
||||
sa.Enum(name="artifacttype").drop(op.get_bind(), checkfirst=True)
|
||||
@@ -0,0 +1,57 @@
|
||||
"""create_snapshot_table
|
||||
|
||||
Revision ID: df6cbd9a37cc
|
||||
Revises: a441232d9c5a
|
||||
Create Date: 2026-01-19 14:48:00.757530
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "df6cbd9a37cc"
|
||||
down_revision = "a441232d9c5a"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# Create snapshot table (no enum needed)
|
||||
op.create_table(
|
||||
"snapshot",
|
||||
sa.Column("id", postgresql.UUID(as_uuid=True), primary_key=True),
|
||||
sa.Column(
|
||||
"session_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
sa.ForeignKey("build_session.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("storage_path", sa.String(), nullable=False),
|
||||
sa.Column("size_bytes", sa.BigInteger(), nullable=False, server_default="0"),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
|
||||
# Create index for snapshot
|
||||
op.create_index(
|
||||
"ix_snapshot_session_created",
|
||||
"snapshot",
|
||||
["session_id", sa.text("created_at DESC")],
|
||||
unique=False,
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
# Drop index
|
||||
op.drop_index("ix_snapshot_session_created", table_name="snapshot")
|
||||
|
||||
# Drop table
|
||||
op.drop_table("snapshot")
|
||||
@@ -0,0 +1,31 @@
|
||||
"""add chat_background to user
|
||||
|
||||
Revision ID: fb80bdd256de
|
||||
Revises: 8b5ce697290e
|
||||
Create Date: 2026-01-16 16:15:59.222617
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "fb80bdd256de"
|
||||
down_revision = "8b5ce697290e"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"user",
|
||||
sa.Column(
|
||||
"chat_background",
|
||||
sa.String(),
|
||||
nullable=True,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("user", "chat_background")
|
||||
@@ -17,7 +17,8 @@ from onyx.context.search.models import InferenceChunk
|
||||
from onyx.context.search.pipeline import merge_individual_chunks
|
||||
from onyx.context.search.pipeline import search_pipeline
|
||||
from onyx.db.models import User
|
||||
from onyx.document_index.factory import get_current_primary_default_document_index
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.interfaces import DocumentIndex
|
||||
from onyx.llm.factory import get_default_llm
|
||||
from onyx.secondary_llm_flows.document_filter import select_sections_for_expansion
|
||||
@@ -42,11 +43,13 @@ def _run_single_search(
|
||||
document_index: DocumentIndex,
|
||||
user: User | None,
|
||||
db_session: Session,
|
||||
num_hits: int | None = None,
|
||||
) -> list[InferenceChunk]:
|
||||
"""Execute a single search query and return chunks."""
|
||||
chunk_search_request = ChunkSearchRequest(
|
||||
query=query,
|
||||
user_selected_filters=filters,
|
||||
limit=num_hits,
|
||||
)
|
||||
|
||||
return search_pipeline(
|
||||
@@ -72,7 +75,9 @@ def stream_search_query(
|
||||
Used by both streaming and non-streaming endpoints.
|
||||
"""
|
||||
# Get document index
|
||||
document_index = get_current_primary_default_document_index(db_session)
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
# This flow is for search so we do not get all indices.
|
||||
document_index = get_default_document_index(search_settings, None)
|
||||
|
||||
# Determine queries to execute
|
||||
original_query = request.search_query
|
||||
@@ -114,6 +119,7 @@ def stream_search_query(
|
||||
document_index=document_index,
|
||||
user=user,
|
||||
db_session=db_session,
|
||||
num_hits=request.num_hits,
|
||||
)
|
||||
else:
|
||||
# Multiple queries - run in parallel and merge with RRF
|
||||
@@ -121,7 +127,14 @@ def stream_search_query(
|
||||
search_functions = [
|
||||
(
|
||||
_run_single_search,
|
||||
(query, request.filters, document_index, user, db_session),
|
||||
(
|
||||
query,
|
||||
request.filters,
|
||||
document_index,
|
||||
user,
|
||||
db_session,
|
||||
request.num_hits,
|
||||
),
|
||||
)
|
||||
for query in all_executed_queries
|
||||
]
|
||||
@@ -168,6 +181,9 @@ def stream_search_query(
|
||||
# Merge chunks into sections
|
||||
sections = merge_individual_chunks(chunks)
|
||||
|
||||
# Truncate to the requested number of hits
|
||||
sections = sections[: request.num_hits]
|
||||
|
||||
# Apply LLM document selection if requested
|
||||
# num_docs_fed_to_llm_selection specifies how many sections to feed to the LLM for selection
|
||||
# The LLM will always try to select TARGET_NUM_SECTIONS_FOR_LLM_SELECTION sections from those fed to it
|
||||
|
||||
@@ -10,6 +10,8 @@ EE_PUBLIC_ENDPOINT_SPECS = PUBLIC_ENDPOINT_SPECS + [
|
||||
("/enterprise-settings/logo", {"GET"}),
|
||||
("/enterprise-settings/logotype", {"GET"}),
|
||||
("/enterprise-settings/custom-analytics-script", {"GET"}),
|
||||
# Stripe publishable key is safe to expose publicly
|
||||
("/tenants/stripe-publishable-key", {"GET"}),
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ class SendSearchQueryRequest(BaseModel):
|
||||
filters: BaseFilters | None = None
|
||||
num_docs_fed_to_llm_selection: int | None = None
|
||||
run_query_expansion: bool = False
|
||||
num_hits: int = 50
|
||||
|
||||
include_content: bool = False
|
||||
stream: bool = False
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
import asyncio
|
||||
|
||||
import httpx
|
||||
from fastapi import APIRouter
|
||||
from fastapi import Depends
|
||||
from fastapi import HTTPException
|
||||
@@ -12,11 +15,14 @@ from ee.onyx.server.tenants.models import CreateSubscriptionSessionRequest
|
||||
from ee.onyx.server.tenants.models import ProductGatingFullSyncRequest
|
||||
from ee.onyx.server.tenants.models import ProductGatingRequest
|
||||
from ee.onyx.server.tenants.models import ProductGatingResponse
|
||||
from ee.onyx.server.tenants.models import StripePublishableKeyResponse
|
||||
from ee.onyx.server.tenants.models import SubscriptionSessionResponse
|
||||
from ee.onyx.server.tenants.models import SubscriptionStatusResponse
|
||||
from ee.onyx.server.tenants.product_gating import overwrite_full_gated_set
|
||||
from ee.onyx.server.tenants.product_gating import store_product_gating
|
||||
from onyx.auth.users import User
|
||||
from onyx.configs.app_configs import STRIPE_PUBLISHABLE_KEY_OVERRIDE
|
||||
from onyx.configs.app_configs import STRIPE_PUBLISHABLE_KEY_URL
|
||||
from onyx.configs.app_configs import WEB_DOMAIN
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
|
||||
@@ -26,6 +32,10 @@ logger = setup_logger()
|
||||
|
||||
router = APIRouter(prefix="/tenants")
|
||||
|
||||
# Cache for Stripe publishable key to avoid hitting S3 on every request
|
||||
_stripe_publishable_key_cache: str | None = None
|
||||
_stripe_key_lock = asyncio.Lock()
|
||||
|
||||
|
||||
@router.post("/product-gating")
|
||||
def gate_product(
|
||||
@@ -113,3 +123,67 @@ async def create_subscription_session(
|
||||
except Exception as e:
|
||||
logger.exception("Failed to create subscription session")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.get("/stripe-publishable-key")
|
||||
async def get_stripe_publishable_key() -> StripePublishableKeyResponse:
|
||||
"""
|
||||
Fetch the Stripe publishable key.
|
||||
Priority: env var override (for testing) > S3 bucket (production).
|
||||
This endpoint is public (no auth required) since publishable keys are safe to expose.
|
||||
The key is cached in memory to avoid hitting S3 on every request.
|
||||
"""
|
||||
global _stripe_publishable_key_cache
|
||||
|
||||
# Fast path: return cached value without lock
|
||||
if _stripe_publishable_key_cache:
|
||||
return StripePublishableKeyResponse(
|
||||
publishable_key=_stripe_publishable_key_cache
|
||||
)
|
||||
|
||||
# Use lock to prevent concurrent S3 requests
|
||||
async with _stripe_key_lock:
|
||||
# Double-check after acquiring lock (another request may have populated cache)
|
||||
if _stripe_publishable_key_cache:
|
||||
return StripePublishableKeyResponse(
|
||||
publishable_key=_stripe_publishable_key_cache
|
||||
)
|
||||
|
||||
# Check for env var override first (for local testing with pk_test_* keys)
|
||||
if STRIPE_PUBLISHABLE_KEY_OVERRIDE:
|
||||
key = STRIPE_PUBLISHABLE_KEY_OVERRIDE.strip()
|
||||
if not key.startswith("pk_"):
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail="Invalid Stripe publishable key format",
|
||||
)
|
||||
_stripe_publishable_key_cache = key
|
||||
return StripePublishableKeyResponse(publishable_key=key)
|
||||
|
||||
# Fall back to S3 bucket
|
||||
if not STRIPE_PUBLISHABLE_KEY_URL:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail="Stripe publishable key is not configured",
|
||||
)
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.get(STRIPE_PUBLISHABLE_KEY_URL)
|
||||
response.raise_for_status()
|
||||
key = response.text.strip()
|
||||
|
||||
# Validate key format
|
||||
if not key.startswith("pk_"):
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail="Invalid Stripe publishable key format",
|
||||
)
|
||||
|
||||
_stripe_publishable_key_cache = key
|
||||
return StripePublishableKeyResponse(publishable_key=key)
|
||||
except httpx.HTTPError:
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail="Failed to fetch Stripe publishable key",
|
||||
)
|
||||
|
||||
@@ -105,3 +105,7 @@ class PendingUserSnapshot(BaseModel):
|
||||
|
||||
class ApproveUserRequest(BaseModel):
|
||||
email: str
|
||||
|
||||
|
||||
class StripePublishableKeyResponse(BaseModel):
|
||||
publishable_key: str
|
||||
|
||||
@@ -11,6 +11,7 @@ from typing import Any
|
||||
from typing import cast
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
from typing import Literal
|
||||
from typing import Optional
|
||||
from typing import Protocol
|
||||
from typing import Tuple
|
||||
@@ -1456,6 +1457,9 @@ def get_default_admin_user_emails_() -> list[str]:
|
||||
|
||||
|
||||
STATE_TOKEN_AUDIENCE = "fastapi-users:oauth-state"
|
||||
STATE_TOKEN_LIFETIME_SECONDS = 3600
|
||||
CSRF_TOKEN_KEY = "csrftoken"
|
||||
CSRF_TOKEN_COOKIE_NAME = "fastapiusersoauthcsrf"
|
||||
|
||||
|
||||
class OAuth2AuthorizeResponse(BaseModel):
|
||||
@@ -1463,13 +1467,19 @@ class OAuth2AuthorizeResponse(BaseModel):
|
||||
|
||||
|
||||
def generate_state_token(
|
||||
data: Dict[str, str], secret: SecretType, lifetime_seconds: int = 3600
|
||||
data: Dict[str, str],
|
||||
secret: SecretType,
|
||||
lifetime_seconds: int = STATE_TOKEN_LIFETIME_SECONDS,
|
||||
) -> str:
|
||||
data["aud"] = STATE_TOKEN_AUDIENCE
|
||||
|
||||
return generate_jwt(data, secret, lifetime_seconds)
|
||||
|
||||
|
||||
def generate_csrf_token() -> str:
|
||||
return secrets.token_urlsafe(32)
|
||||
|
||||
|
||||
# refer to https://github.com/fastapi-users/fastapi-users/blob/42ddc241b965475390e2bce887b084152ae1a2cd/fastapi_users/fastapi_users.py#L91
|
||||
def create_onyx_oauth_router(
|
||||
oauth_client: BaseOAuth2,
|
||||
@@ -1498,6 +1508,13 @@ def get_oauth_router(
|
||||
redirect_url: Optional[str] = None,
|
||||
associate_by_email: bool = False,
|
||||
is_verified_by_default: bool = False,
|
||||
*,
|
||||
csrf_token_cookie_name: str = CSRF_TOKEN_COOKIE_NAME,
|
||||
csrf_token_cookie_path: str = "/",
|
||||
csrf_token_cookie_domain: Optional[str] = None,
|
||||
csrf_token_cookie_secure: Optional[bool] = None,
|
||||
csrf_token_cookie_httponly: bool = True,
|
||||
csrf_token_cookie_samesite: Optional[Literal["lax", "strict", "none"]] = "lax",
|
||||
) -> APIRouter:
|
||||
"""Generate a router with the OAuth routes."""
|
||||
router = APIRouter()
|
||||
@@ -1514,6 +1531,9 @@ def get_oauth_router(
|
||||
route_name=callback_route_name,
|
||||
)
|
||||
|
||||
if csrf_token_cookie_secure is None:
|
||||
csrf_token_cookie_secure = WEB_DOMAIN.startswith("https")
|
||||
|
||||
@router.get(
|
||||
"/authorize",
|
||||
name=f"oauth:{oauth_client.name}.{backend.name}.authorize",
|
||||
@@ -1521,8 +1541,10 @@ def get_oauth_router(
|
||||
)
|
||||
async def authorize(
|
||||
request: Request,
|
||||
response: Response,
|
||||
redirect: bool = Query(False),
|
||||
scopes: List[str] = Query(None),
|
||||
) -> OAuth2AuthorizeResponse:
|
||||
) -> Response | OAuth2AuthorizeResponse:
|
||||
referral_source = request.cookies.get("referral_source", None)
|
||||
|
||||
if redirect_url is not None:
|
||||
@@ -1532,9 +1554,11 @@ def get_oauth_router(
|
||||
|
||||
next_url = request.query_params.get("next", "/")
|
||||
|
||||
csrf_token = generate_csrf_token()
|
||||
state_data: Dict[str, str] = {
|
||||
"next_url": next_url,
|
||||
"referral_source": referral_source or "default_referral",
|
||||
CSRF_TOKEN_KEY: csrf_token,
|
||||
}
|
||||
state = generate_state_token(state_data, state_secret)
|
||||
|
||||
@@ -1551,6 +1575,31 @@ def get_oauth_router(
|
||||
authorization_url, {"access_type": "offline", "prompt": "consent"}
|
||||
)
|
||||
|
||||
if redirect:
|
||||
redirect_response = RedirectResponse(authorization_url, status_code=302)
|
||||
redirect_response.set_cookie(
|
||||
key=csrf_token_cookie_name,
|
||||
value=csrf_token,
|
||||
max_age=STATE_TOKEN_LIFETIME_SECONDS,
|
||||
path=csrf_token_cookie_path,
|
||||
domain=csrf_token_cookie_domain,
|
||||
secure=csrf_token_cookie_secure,
|
||||
httponly=csrf_token_cookie_httponly,
|
||||
samesite=csrf_token_cookie_samesite,
|
||||
)
|
||||
return redirect_response
|
||||
|
||||
response.set_cookie(
|
||||
key=csrf_token_cookie_name,
|
||||
value=csrf_token,
|
||||
max_age=STATE_TOKEN_LIFETIME_SECONDS,
|
||||
path=csrf_token_cookie_path,
|
||||
domain=csrf_token_cookie_domain,
|
||||
secure=csrf_token_cookie_secure,
|
||||
httponly=csrf_token_cookie_httponly,
|
||||
samesite=csrf_token_cookie_samesite,
|
||||
)
|
||||
|
||||
return OAuth2AuthorizeResponse(authorization_url=authorization_url)
|
||||
|
||||
@log_function_time(print_only=True)
|
||||
@@ -1600,7 +1649,33 @@ def get_oauth_router(
|
||||
try:
|
||||
state_data = decode_jwt(state, state_secret, [STATE_TOKEN_AUDIENCE])
|
||||
except jwt.DecodeError:
|
||||
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST)
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=getattr(
|
||||
ErrorCode, "ACCESS_TOKEN_DECODE_ERROR", "ACCESS_TOKEN_DECODE_ERROR"
|
||||
),
|
||||
)
|
||||
except jwt.ExpiredSignatureError:
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=getattr(
|
||||
ErrorCode,
|
||||
"ACCESS_TOKEN_ALREADY_EXPIRED",
|
||||
"ACCESS_TOKEN_ALREADY_EXPIRED",
|
||||
),
|
||||
)
|
||||
|
||||
cookie_csrf_token = request.cookies.get(csrf_token_cookie_name)
|
||||
state_csrf_token = state_data.get(CSRF_TOKEN_KEY)
|
||||
if (
|
||||
not cookie_csrf_token
|
||||
or not state_csrf_token
|
||||
or not secrets.compare_digest(cookie_csrf_token, state_csrf_token)
|
||||
):
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
detail=getattr(ErrorCode, "OAUTH_INVALID_STATE", "OAUTH_INVALID_STATE"),
|
||||
)
|
||||
|
||||
next_url = state_data.get("next_url", "/")
|
||||
referral_source = state_data.get("referral_source", None)
|
||||
|
||||
@@ -26,10 +26,13 @@ from onyx.background.celery.celery_utils import celery_is_worker_primary
|
||||
from onyx.background.celery.celery_utils import make_probe_path
|
||||
from onyx.background.celery.tasks.vespa.document_sync import DOCUMENT_SYNC_PREFIX
|
||||
from onyx.background.celery.tasks.vespa.document_sync import DOCUMENT_SYNC_TASKSET_KEY
|
||||
from onyx.configs.app_configs import ENABLE_OPENSEARCH_FOR_ONYX
|
||||
from onyx.configs.app_configs import ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
|
||||
from onyx.configs.constants import ONYX_CLOUD_CELERY_TASK_PREFIX
|
||||
from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.db.engine.sql_engine import get_sqlalchemy_engine
|
||||
from onyx.document_index.opensearch.client import (
|
||||
wait_for_opensearch_with_timeout,
|
||||
)
|
||||
from onyx.document_index.vespa.shared_utils.utils import wait_for_vespa_with_timeout
|
||||
from onyx.httpx.httpx_pool import HttpxPool
|
||||
from onyx.redis.redis_connector import RedisConnector
|
||||
@@ -516,15 +519,17 @@ def wait_for_vespa_or_shutdown(sender: Any, **kwargs: Any) -> None:
|
||||
"""Waits for Vespa to become ready subject to a timeout.
|
||||
Raises WorkerShutdown if the timeout is reached."""
|
||||
|
||||
if ENABLE_OPENSEARCH_FOR_ONYX:
|
||||
# TODO(andrei): Do some similar liveness checking for OpenSearch.
|
||||
return
|
||||
|
||||
if not wait_for_vespa_with_timeout():
|
||||
msg = "Vespa: Readiness probe did not succeed within the timeout. Exiting..."
|
||||
msg = "[Vespa] Readiness probe did not succeed within the timeout. Exiting..."
|
||||
logger.error(msg)
|
||||
raise WorkerShutdown(msg)
|
||||
|
||||
if ENABLE_OPENSEARCH_INDEXING_FOR_ONYX:
|
||||
if not wait_for_opensearch_with_timeout():
|
||||
msg = "[OpenSearch] Readiness probe did not succeed within the timeout. Exiting..."
|
||||
logger.error(msg)
|
||||
raise WorkerShutdown(msg)
|
||||
|
||||
|
||||
# File for validating worker liveness
|
||||
class LivenessProbe(bootsteps.StartStopStep):
|
||||
|
||||
@@ -134,5 +134,7 @@ celery_app.autodiscover_tasks(
|
||||
"onyx.background.celery.tasks.docprocessing",
|
||||
# Docfetching worker tasks
|
||||
"onyx.background.celery.tasks.docfetching",
|
||||
# Sandbox cleanup tasks (isolated in build feature)
|
||||
"onyx.server.features.build.sandbox.tasks",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -116,5 +116,7 @@ celery_app.autodiscover_tasks(
|
||||
"onyx.background.celery.tasks.connector_deletion",
|
||||
"onyx.background.celery.tasks.doc_permission_syncing",
|
||||
"onyx.background.celery.tasks.docprocessing",
|
||||
# Sandbox cleanup tasks (isolated in build feature)
|
||||
"onyx.server.features.build.sandbox.tasks",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -139,6 +139,27 @@ beat_task_templates: list[dict] = [
|
||||
"queue": OnyxCeleryQueues.MONITORING,
|
||||
},
|
||||
},
|
||||
# Sandbox cleanup tasks
|
||||
{
|
||||
"name": "cleanup-idle-sandboxes",
|
||||
"task": OnyxCeleryTask.CLEANUP_IDLE_SANDBOXES,
|
||||
"schedule": timedelta(minutes=1),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.LOW,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
"queue": OnyxCeleryQueues.SANDBOX,
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "cleanup-old-snapshots",
|
||||
"task": OnyxCeleryTask.CLEANUP_OLD_SNAPSHOTS,
|
||||
"schedule": timedelta(hours=24),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.LOW,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
"queue": OnyxCeleryQueues.SANDBOX,
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
if ENTERPRISE_EDITION_ENABLED:
|
||||
|
||||
@@ -87,7 +87,7 @@ from onyx.db.models import SearchSettings
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.db.search_settings import get_secondary_search_settings
|
||||
from onyx.db.swap_index import check_and_perform_index_swap
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.factory import get_all_document_indices
|
||||
from onyx.file_store.document_batch_storage import DocumentBatchStorage
|
||||
from onyx.file_store.document_batch_storage import get_document_batch_storage
|
||||
from onyx.httpx.httpx_pool import HttpxPool
|
||||
@@ -1436,7 +1436,7 @@ def _docprocessing_task(
|
||||
callback=callback,
|
||||
)
|
||||
|
||||
document_index = get_default_document_index(
|
||||
document_indices = get_all_document_indices(
|
||||
index_attempt.search_settings,
|
||||
None,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
@@ -1473,7 +1473,7 @@ def _docprocessing_task(
|
||||
# real work happens here!
|
||||
index_pipeline_result = run_indexing_pipeline(
|
||||
embedder=embedding_model,
|
||||
document_index=document_index,
|
||||
document_indices=document_indices,
|
||||
ignore_time_skip=True, # Documents are already filtered during extraction
|
||||
db_session=db_session,
|
||||
tenant_id=tenant_id,
|
||||
|
||||
@@ -25,7 +25,7 @@ from onyx.db.document_set import fetch_document_sets_for_document
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.db.relationships import delete_document_references_from_kg
|
||||
from onyx.db.search_settings import get_active_search_settings
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.factory import get_all_document_indices
|
||||
from onyx.document_index.interfaces import VespaDocumentFields
|
||||
from onyx.httpx.httpx_pool import HttpxPool
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
@@ -97,13 +97,17 @@ def document_by_cc_pair_cleanup_task(
action = "skip"

active_search_settings = get_active_search_settings(db_session)
doc_index = get_default_document_index(
# This flow is for updates and deletion so we get all indices.
document_indices = get_all_document_indices(
active_search_settings.primary,
active_search_settings.secondary,
httpx_client=HttpxPool.get("vespa"),
)

retry_index = RetryDocumentIndex(doc_index)
retry_document_indices: list[RetryDocumentIndex] = [
RetryDocumentIndex(document_index)
for document_index in document_indices
]

count = get_document_connector_count(db_session, document_id)
if count == 1:
@@ -113,11 +117,12 @@ def document_by_cc_pair_cleanup_task(

chunk_count = fetch_chunk_count_for_document(document_id, db_session)

_ = retry_index.delete_single(
document_id,
tenant_id=tenant_id,
chunk_count=chunk_count,
)
for retry_document_index in retry_document_indices:
_ = retry_document_index.delete_single(
document_id,
tenant_id=tenant_id,
chunk_count=chunk_count,
)

delete_document_references_from_kg(
db_session=db_session,
@@ -155,14 +160,18 @@ def document_by_cc_pair_cleanup_task(
hidden=doc.hidden,
)

# update Vespa. OK if doc doesn't exist. Raises exception otherwise.
retry_index.update_single(
document_id,
tenant_id=tenant_id,
chunk_count=doc.chunk_count,
fields=fields,
user_fields=None,
)
for retry_document_index in retry_document_indices:
# TODO(andrei): Previously there was a comment here saying
# it was ok if a doc did not exist in the document index. I
# don't agree with that claim, so keep an eye on this task
# to see if this raises.
retry_document_index.update_single(
document_id,
tenant_id=tenant_id,
chunk_count=doc.chunk_count,
fields=fields,
user_fields=None,
)

# there are still other cc_pair references to the doc, so just resync to Vespa
delete_document_by_connector_credential_pair__no_commit(

@@ -32,7 +32,7 @@ from onyx.db.enums import UserFileStatus
|
||||
from onyx.db.models import UserFile
|
||||
from onyx.db.search_settings import get_active_search_settings
|
||||
from onyx.db.search_settings import get_active_search_settings_list
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.factory import get_all_document_indices
|
||||
from onyx.document_index.interfaces import VespaDocumentUserFields
|
||||
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
|
||||
from onyx.file_store.file_store import get_default_file_store
|
||||
@@ -244,7 +244,8 @@ def process_single_user_file(self: Task, *, user_file_id: str, tenant_id: str) -
|
||||
search_settings=current_search_settings,
|
||||
)
|
||||
|
||||
document_index = get_default_document_index(
|
||||
# This flow is for indexing so we get all indices.
|
||||
document_indices = get_all_document_indices(
|
||||
current_search_settings,
|
||||
None,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
@@ -258,7 +259,7 @@ def process_single_user_file(self: Task, *, user_file_id: str, tenant_id: str) -
|
||||
# real work happens here!
|
||||
index_pipeline_result = run_indexing_pipeline(
|
||||
embedder=embedding_model,
|
||||
document_index=document_index,
|
||||
document_indices=document_indices,
|
||||
ignore_time_skip=True,
|
||||
db_session=db_session,
|
||||
tenant_id=tenant_id,
|
||||
@@ -412,12 +413,16 @@ def process_single_user_file_delete(
|
||||
httpx_init_vespa_pool(20)
|
||||
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
document_index = get_default_document_index(
|
||||
# This flow is for deletion so we get all indices.
|
||||
document_indices = get_all_document_indices(
|
||||
search_settings=active_search_settings.primary,
|
||||
secondary_search_settings=active_search_settings.secondary,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
)
|
||||
retry_index = RetryDocumentIndex(document_index)
|
||||
retry_document_indices: list[RetryDocumentIndex] = [
|
||||
RetryDocumentIndex(document_index)
|
||||
for document_index in document_indices
|
||||
]
|
||||
index_name = active_search_settings.primary.index_name
|
||||
selection = f"{index_name}.document_id=='{user_file_id}'"
|
||||
|
||||
@@ -438,11 +443,12 @@ def process_single_user_file_delete(
|
||||
else:
|
||||
chunk_count = user_file.chunk_count
|
||||
|
||||
retry_index.delete_single(
|
||||
doc_id=user_file_id,
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=chunk_count,
|
||||
)
|
||||
for retry_document_index in retry_document_indices:
|
||||
retry_document_index.delete_single(
|
||||
doc_id=user_file_id,
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=chunk_count,
|
||||
)
|
||||
|
||||
# 2) Delete the user-uploaded file content from filestore (blob + metadata)
|
||||
file_store = get_default_file_store()
|
||||
@@ -564,12 +570,16 @@ def process_single_user_file_project_sync(
|
||||
httpx_init_vespa_pool(20)
|
||||
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
doc_index = get_default_document_index(
|
||||
# This flow is for updates so we get all indices.
|
||||
document_indices = get_all_document_indices(
|
||||
search_settings=active_search_settings.primary,
|
||||
secondary_search_settings=active_search_settings.secondary,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
)
|
||||
retry_index = RetryDocumentIndex(doc_index)
|
||||
retry_document_indices: list[RetryDocumentIndex] = [
|
||||
RetryDocumentIndex(document_index)
|
||||
for document_index in document_indices
|
||||
]
|
||||
|
||||
user_file = db_session.get(UserFile, _as_uuid(user_file_id))
|
||||
if not user_file:
|
||||
@@ -579,13 +589,14 @@ def process_single_user_file_project_sync(
|
||||
return None
|
||||
|
||||
project_ids = [project.id for project in user_file.projects]
|
||||
retry_index.update_single(
|
||||
doc_id=str(user_file.id),
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=user_file.chunk_count,
|
||||
fields=None,
|
||||
user_fields=VespaDocumentUserFields(user_projects=project_ids),
|
||||
)
|
||||
for retry_document_index in retry_document_indices:
|
||||
retry_document_index.update_single(
|
||||
doc_id=str(user_file.id),
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=user_file.chunk_count,
|
||||
fields=None,
|
||||
user_fields=VespaDocumentUserFields(user_projects=project_ids),
|
||||
)
|
||||
|
||||
task_logger.info(
|
||||
f"process_single_user_file_project_sync - User file id={user_file_id}"
|
||||
|
||||
@@ -49,7 +49,7 @@ from onyx.db.search_settings import get_active_search_settings
|
||||
from onyx.db.sync_record import cleanup_sync_records
|
||||
from onyx.db.sync_record import insert_sync_record
|
||||
from onyx.db.sync_record import update_sync_record_status
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.factory import get_all_document_indices
|
||||
from onyx.document_index.interfaces import VespaDocumentFields
|
||||
from onyx.httpx.httpx_pool import HttpxPool
|
||||
from onyx.redis.redis_document_set import RedisDocumentSet
|
||||
@@ -70,6 +70,8 @@ logger = setup_logger()
|
||||
|
||||
# celery auto associates tasks created inside another task,
|
||||
# which bloats the result metadata considerably. trail=False prevents this.
|
||||
# TODO(andrei): Rename all these kinds of functions from *vespa* to a more
|
||||
# generic *document_index*.
|
||||
@shared_task(
|
||||
name=OnyxCeleryTask.CHECK_FOR_VESPA_SYNC_TASK,
|
||||
ignore_result=True,
|
||||
@@ -465,13 +467,17 @@ def vespa_metadata_sync_task(self: Task, document_id: str, *, tenant_id: str) ->
|
||||
try:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
active_search_settings = get_active_search_settings(db_session)
|
||||
doc_index = get_default_document_index(
|
||||
# This flow is for updates so we get all indices.
|
||||
document_indices = get_all_document_indices(
|
||||
search_settings=active_search_settings.primary,
|
||||
secondary_search_settings=active_search_settings.secondary,
|
||||
httpx_client=HttpxPool.get("vespa"),
|
||||
)
|
||||
|
||||
retry_index = RetryDocumentIndex(doc_index)
|
||||
retry_document_indices: list[RetryDocumentIndex] = [
|
||||
RetryDocumentIndex(document_index)
|
||||
for document_index in document_indices
|
||||
]
|
||||
|
||||
doc = get_document(document_id, db_session)
|
||||
if not doc:
|
||||
@@ -500,14 +506,18 @@ def vespa_metadata_sync_task(self: Task, document_id: str, *, tenant_id: str) ->
|
||||
# aggregated_boost_factor=doc.aggregated_boost_factor,
|
||||
)
|
||||
|
||||
# update Vespa. OK if doc doesn't exist. Raises exception otherwise.
|
||||
retry_index.update_single(
|
||||
document_id,
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=doc.chunk_count,
|
||||
fields=fields,
|
||||
user_fields=None,
|
||||
)
|
||||
for retry_document_index in retry_document_indices:
|
||||
# TODO(andrei): Previously there was a comment here saying
|
||||
# it was ok if a doc did not exist in the document index. I
|
||||
# don't agree with that claim, so keep an eye on this task
|
||||
# to see if this raises.
|
||||
retry_document_index.update_single(
|
||||
document_id,
|
||||
tenant_id=tenant_id,
|
||||
chunk_count=doc.chunk_count,
|
||||
fields=fields,
|
||||
user_fields=None,
|
||||
)
|
||||
|
||||
# update db last. Worst case = we crash right before this and
|
||||
# the sync might repeat again later
|
||||
|
||||
@@ -31,17 +31,20 @@ from onyx.connectors.interfaces import CheckpointedConnector
|
||||
from onyx.connectors.models import ConnectorFailure
|
||||
from onyx.connectors.models import ConnectorStopSignal
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import IndexAttemptMetadata
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.db.connector import mark_ccpair_with_indexing_trigger
|
||||
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
|
||||
from onyx.db.connector_credential_pair import get_last_successful_attempt_poll_range_end
|
||||
from onyx.db.connector_credential_pair import update_connector_credential_pair
|
||||
from onyx.db.constants import CONNECTOR_VALIDATION_ERROR_MESSAGE_PREFIX
|
||||
from onyx.db.document import mark_document_as_indexed_for_cc_pair__no_commit
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.db.enums import AccessType
|
||||
from onyx.db.enums import ConnectorCredentialPairStatus
|
||||
from onyx.db.enums import IndexingStatus
|
||||
from onyx.db.enums import IndexModelStatus
|
||||
from onyx.db.enums import ProcessingMode
|
||||
from onyx.db.index_attempt import create_index_attempt_error
|
||||
from onyx.db.index_attempt import get_index_attempt
|
||||
from onyx.db.index_attempt import get_recent_completed_attempts_for_cc_pair
|
||||
@@ -53,7 +56,10 @@ from onyx.db.models import IndexAttempt
|
||||
from onyx.file_store.document_batch_storage import DocumentBatchStorage
|
||||
from onyx.file_store.document_batch_storage import get_document_batch_storage
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
|
||||
from onyx.indexing.persistent_document_writer import get_persistent_document_writer
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.middleware import make_randomized_onyx_request_id
|
||||
from onyx.utils.variable_functionality import global_version
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
from shared_configs.contextvars import INDEX_ATTEMPT_INFO_CONTEXTVAR
|
||||
@@ -367,6 +373,7 @@ def connector_document_extraction(
|
||||
|
||||
db_connector = index_attempt.connector_credential_pair.connector
|
||||
db_credential = index_attempt.connector_credential_pair.credential
|
||||
processing_mode = index_attempt.connector_credential_pair.processing_mode
|
||||
is_primary = index_attempt.search_settings.status == IndexModelStatus.PRESENT
|
||||
|
||||
from_beginning = index_attempt.from_beginning
|
||||
@@ -600,34 +607,100 @@ def connector_document_extraction(
|
||||
logger.debug(f"Indexing batch of documents: {batch_description}")
|
||||
memory_tracer.increment_and_maybe_trace()
|
||||
|
||||
# Store documents in storage
|
||||
batch_storage.store_batch(batch_num, doc_batch_cleaned)
|
||||
# cc4a
|
||||
if processing_mode == ProcessingMode.FILE_SYSTEM:
|
||||
# File system only - write directly to persistent storage,
|
||||
# skip chunking/embedding/Vespa but still track documents in DB
|
||||
|
||||
# Create processing task data
|
||||
processing_batch_data = {
|
||||
"index_attempt_id": index_attempt_id,
|
||||
"cc_pair_id": cc_pair_id,
|
||||
"tenant_id": tenant_id,
|
||||
"batch_num": batch_num, # 0-indexed
|
||||
}
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
# Create metadata for the batch
|
||||
index_attempt_metadata = IndexAttemptMetadata(
|
||||
attempt_id=index_attempt_id,
|
||||
connector_id=db_connector.id,
|
||||
credential_id=db_credential.id,
|
||||
request_id=make_randomized_onyx_request_id("FSI"),
|
||||
structured_id=f"{tenant_id}:{cc_pair_id}:{index_attempt_id}:{batch_num}",
|
||||
batch_num=batch_num,
|
||||
)
|
||||
|
||||
# Queue document processing task
|
||||
app.send_task(
|
||||
OnyxCeleryTask.DOCPROCESSING_TASK,
|
||||
kwargs=processing_batch_data,
|
||||
queue=OnyxCeleryQueues.DOCPROCESSING,
|
||||
priority=docprocessing_priority,
|
||||
)
|
||||
# Upsert documents to PostgreSQL (document table + cc_pair relationship)
|
||||
# This is a subset of what docprocessing does - just DB tracking, no chunking/embedding
|
||||
index_doc_batch_prepare(
|
||||
documents=doc_batch_cleaned,
|
||||
index_attempt_metadata=index_attempt_metadata,
|
||||
db_session=db_session,
|
||||
ignore_time_skip=True, # Documents already filtered during extraction
|
||||
)
|
||||
|
||||
batch_num += 1
|
||||
total_doc_batches_queued += 1
|
||||
# Mark documents as indexed for the CC pair
|
||||
mark_document_as_indexed_for_cc_pair__no_commit(
|
||||
connector_id=db_connector.id,
|
||||
credential_id=db_credential.id,
|
||||
document_ids=[doc.id for doc in doc_batch_cleaned],
|
||||
db_session=db_session,
|
||||
)
|
||||
db_session.commit()
|
||||
|
||||
logger.info(
|
||||
f"Queued document processing batch: "
|
||||
f"batch_num={batch_num} "
|
||||
f"docs={len(doc_batch_cleaned)} "
|
||||
f"attempt={index_attempt_id}"
|
||||
)
|
||||
# Write documents to persistent file system
|
||||
# Use creator_id for user-segregated storage paths (sandbox isolation)
|
||||
creator_id = index_attempt.connector_credential_pair.creator_id
|
||||
if creator_id is None:
|
||||
raise ValueError(
|
||||
f"ConnectorCredentialPair {index_attempt.connector_credential_pair.id} "
|
||||
"must have a creator_id for persistent document storage"
|
||||
)
|
||||
user_id_str: str = str(creator_id)
|
||||
writer = get_persistent_document_writer(user_id=user_id_str)
|
||||
written_paths = writer.write_documents(doc_batch_cleaned)
|
||||
|
||||
# Update coordination directly (no docprocessing task)
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
IndexingCoordination.update_batch_completion_and_docs(
|
||||
db_session=db_session,
|
||||
index_attempt_id=index_attempt_id,
|
||||
total_docs_indexed=len(doc_batch_cleaned),
|
||||
new_docs_indexed=len(doc_batch_cleaned),
|
||||
total_chunks=0, # No chunks for file system mode
|
||||
)
|
||||
|
||||
batch_num += 1
|
||||
total_doc_batches_queued += 1
|
||||
|
||||
logger.info(
|
||||
f"Wrote documents to file system: "
|
||||
f"batch_num={batch_num} "
|
||||
f"docs={len(written_paths)} "
|
||||
f"attempt={index_attempt_id}"
|
||||
)
|
||||
else:
|
||||
# REGULAR mode (default): Full pipeline - store and queue docprocessing
|
||||
batch_storage.store_batch(batch_num, doc_batch_cleaned)
|
||||
|
||||
# Create processing task data
|
||||
processing_batch_data = {
|
||||
"index_attempt_id": index_attempt_id,
|
||||
"cc_pair_id": cc_pair_id,
|
||||
"tenant_id": tenant_id,
|
||||
"batch_num": batch_num, # 0-indexed
|
||||
}
|
||||
|
||||
# Queue document processing task
|
||||
app.send_task(
|
||||
OnyxCeleryTask.DOCPROCESSING_TASK,
|
||||
kwargs=processing_batch_data,
|
||||
queue=OnyxCeleryQueues.DOCPROCESSING,
|
||||
priority=docprocessing_priority,
|
||||
)
|
||||
|
||||
batch_num += 1
|
||||
total_doc_batches_queued += 1
|
||||
|
||||
logger.info(
|
||||
f"Queued document processing batch: "
|
||||
f"batch_num={batch_num} "
|
||||
f"docs={len(doc_batch_cleaned)} "
|
||||
f"attempt={index_attempt_id}"
|
||||
)
|
||||
|
||||
# Check checkpoint size periodically
|
||||
CHECKPOINT_SIZE_CHECK_INTERVAL = 100
|
||||
|
||||
@@ -7,6 +7,7 @@ from typing import Any

from onyx.chat.citation_processor import CitationMapping
from onyx.chat.emitter import Emitter
from onyx.context.search.models import SearchDoc
from onyx.server.query_and_chat.placement import Placement
from onyx.server.query_and_chat.streaming_models import OverallStop
from onyx.server.query_and_chat.streaming_models import Packet
@@ -15,6 +16,11 @@ from onyx.tools.models import ToolCallInfo
from onyx.utils.threadpool_concurrency import run_in_background
from onyx.utils.threadpool_concurrency import wait_on_background

# Type alias for search doc deduplication key
# Simple key: just document_id (str)
# Full key: (document_id, chunk_ind, match_highlights)
SearchDocKey = str | tuple[str, int, tuple[str, ...]]


class ChatStateContainer:
"""Container for accumulating state during LLM loop execution.
@@ -40,6 +46,10 @@ class ChatStateContainer:
# True if this turn is a clarification question (deep research flow)
self.is_clarification: bool = False
# Note: LLM cost tracking is now handled in multi_llm.py
# Search doc collection - maps dedup key to SearchDoc for all docs from tool calls
self._all_search_docs: dict[SearchDocKey, SearchDoc] = {}
# Track which citation numbers were actually emitted during streaming
self._emitted_citations: set[int] = set()

def add_tool_call(self, tool_call: ToolCallInfo) -> None:
"""Add a tool call to the accumulated state."""
@@ -91,6 +101,54 @@ class ChatStateContainer:
with self._lock:
return self.is_clarification

@staticmethod
def create_search_doc_key(
search_doc: SearchDoc, use_simple_key: bool = True
) -> SearchDocKey:
"""Create a unique key for a SearchDoc for deduplication.

Args:
search_doc: The SearchDoc to create a key for
use_simple_key: If True (default), use only document_id for deduplication.
If False, include chunk_ind and match_highlights so that the same
document/chunk with different highlights are stored separately.
"""
if use_simple_key:
return search_doc.document_id
match_highlights_tuple = tuple(sorted(search_doc.match_highlights or []))
return (search_doc.document_id, search_doc.chunk_ind, match_highlights_tuple)
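
Editor's note: the two key shapes look like this in practice. A minimal sketch using a stand-in object rather than the real SearchDoc model:

from dataclasses import dataclass


@dataclass
class FakeSearchDoc:  # stand-in for SearchDoc, for illustration only
    document_id: str
    chunk_ind: int
    match_highlights: list[str] | None


doc = FakeSearchDoc("doc-1", 3, ["<hi>foo</hi>"])

# Simple key: document-level dedup collapses every chunk of the same document.
simple_key = doc.document_id  # "doc-1"

# Full key: chunk- and highlight-aware, so the same chunk returned by two
# different queries (different highlights) is kept as two separate entries.
full_key = (
    doc.document_id,
    doc.chunk_ind,
    tuple(sorted(doc.match_highlights or [])),
)  # ("doc-1", 3, ("<hi>foo</hi>",))
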

def add_search_docs(
self, search_docs: list[SearchDoc], use_simple_key: bool = True
) -> None:
"""Add search docs to the accumulated collection with deduplication.

Args:
search_docs: List of SearchDoc objects to add
use_simple_key: If True (default), deduplicate by document_id only.
If False, deduplicate by document_id + chunk_ind + match_highlights.
"""
with self._lock:
for doc in search_docs:
key = self.create_search_doc_key(doc, use_simple_key)
if key not in self._all_search_docs:
self._all_search_docs[key] = doc

def get_all_search_docs(self) -> dict[SearchDocKey, SearchDoc]:
"""Thread-safe getter for all accumulated search docs (returns a copy)."""
with self._lock:
return self._all_search_docs.copy()

def add_emitted_citation(self, citation_num: int) -> None:
"""Add a citation number that was actually emitted during streaming."""
with self._lock:
self._emitted_citations.add(citation_num)

def get_emitted_citations(self) -> set[int]:
"""Thread-safe getter for emitted citations (returns a copy)."""
with self._lock:
return self._emitted_citations.copy()


def run_chat_loop_with_state_containers(
func: Callable[..., None],

@@ -53,6 +53,50 @@ from onyx.db.document_set import fetch_document_sets_for_document
citation_processor.update_citation_mapping(citation_to_doc)


def extract_citation_order_from_text(text: str) -> list[int]:
"""Extract citation numbers from text in order of first appearance.

Parses citation patterns like [1], [1, 2], [[1]], 【1】 etc. and returns
the citation numbers in the order they first appear in the text.

Args:
text: The text containing citations

Returns:
List of citation numbers in order of first appearance (no duplicates)
"""
# Same pattern used in collapse_citations and DynamicCitationProcessor
# Group 2 captures the number in double bracket format: [[1]], 【【1】】
# Group 4 captures the numbers in single bracket format: [1], [1, 2]
citation_pattern = re.compile(
r"([\[【[]{2}(\d+)[\]】]]{2})|([\[【[]([\d]+(?: *, *\d+)*)[\]】]])"
)
seen: set[int] = set()
order: list[int] = []

for match in citation_pattern.finditer(text):
# Group 2 is for double bracket single number, group 4 is for single bracket
if match.group(2):
nums_str = match.group(2)
elif match.group(4):
nums_str = match.group(4)
else:
continue

for num_str in nums_str.split(","):
num_str = num_str.strip()
if num_str:
try:
num = int(num_str)
if num not in seen:
seen.add(num)
order.append(num)
except ValueError:
continue

return order
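
Editor's note: the intended behaviour can be illustrated with a simplified stand-in pattern (ASCII brackets only; the real pattern above also covers CJK-style brackets). This sketch is not the module's code:

import re

# Simplified stand-in for the citation pattern: single or double ASCII
# brackets around a number or a comma-separated list of numbers.
toy_pattern = re.compile(r"\[{1,2}(\d+(?: *, *\d+)*)\]{1,2}")

sample = "Latency improved [2], throughput doubled [1, 2], details in [[3]]."
seen: set[int] = set()
order: list[int] = []
for m in toy_pattern.finditer(sample):
    for num_str in m.group(1).split(","):
        num = int(num_str.strip())
        if num not in seen:
            seen.add(num)
            order.append(num)

print(order)  # [2, 1, 3] -- order of first appearance, no duplicates
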
|
||||
|
||||
|
||||
def collapse_citations(
|
||||
answer_text: str,
|
||||
existing_citation_mapping: CitationMapping,
|
||||
|
||||
@@ -45,6 +45,7 @@ from onyx.tools.tool_implementations.images.models import (
|
||||
FinalImageGenerationResponse,
|
||||
)
|
||||
from onyx.tools.tool_implementations.search.search_tool import SearchTool
|
||||
from onyx.tools.tool_implementations.web_search.utils import extract_url_snippet_map
|
||||
from onyx.tools.tool_implementations.web_search.web_search_tool import WebSearchTool
|
||||
from onyx.tools.tool_runner import run_tool_calls
|
||||
from onyx.tracing.framework.create import trace
|
||||
@@ -453,12 +454,16 @@ def run_llm_loop(
|
||||
|
||||
# The section below calculates the available tokens for history a bit more accurately
|
||||
# now that project files are loaded in.
|
||||
if persona and persona.replace_base_system_prompt and persona.system_prompt:
|
||||
if persona and persona.replace_base_system_prompt:
|
||||
# Handles the case where user has checked off the "Replace base system prompt" checkbox
|
||||
system_prompt = ChatMessageSimple(
|
||||
message=persona.system_prompt,
|
||||
token_count=token_counter(persona.system_prompt),
|
||||
message_type=MessageType.SYSTEM,
|
||||
system_prompt = (
|
||||
ChatMessageSimple(
|
||||
message=persona.system_prompt,
|
||||
token_count=token_counter(persona.system_prompt),
|
||||
message_type=MessageType.SYSTEM,
|
||||
)
|
||||
if persona.system_prompt
|
||||
else None
|
||||
)
|
||||
custom_agent_prompt_msg = None
|
||||
else:
|
||||
@@ -612,6 +617,7 @@ def run_llm_loop(
|
||||
next_citation_num=citation_processor.get_next_citation_number(),
|
||||
max_concurrent_tools=None,
|
||||
skip_search_query_expansion=has_called_search_tool,
|
||||
url_snippet_map=extract_url_snippet_map(gathered_documents or []),
|
||||
)
|
||||
tool_responses = parallel_tool_call_results.tool_responses
|
||||
citation_mapping = parallel_tool_call_results.updated_citation_mapping
|
||||
@@ -650,8 +656,15 @@ def run_llm_loop(
|
||||
|
||||
# Extract search_docs if this is a search tool response
|
||||
search_docs = None
|
||||
displayed_docs = None
|
||||
if isinstance(tool_response.rich_response, SearchDocsResponse):
|
||||
search_docs = tool_response.rich_response.search_docs
|
||||
displayed_docs = tool_response.rich_response.displayed_docs
|
||||
|
||||
# Add ALL search docs to state container for DB persistence
|
||||
if search_docs:
|
||||
state_container.add_search_docs(search_docs)
|
||||
|
||||
if gathered_documents:
|
||||
gathered_documents.extend(search_docs)
|
||||
else:
|
||||
@@ -685,7 +698,7 @@ def run_llm_loop(
|
||||
reasoning_tokens=llm_step_result.reasoning, # All tool calls from this loop share the same reasoning
|
||||
tool_call_arguments=tool_call.tool_args,
|
||||
tool_call_response=saved_response,
|
||||
search_docs=search_docs,
|
||||
search_docs=displayed_docs or search_docs,
|
||||
generated_images=generated_images,
|
||||
)
|
||||
# Add to state container for partial save support
|
||||
|
||||
@@ -14,6 +14,7 @@ from onyx.chat.emitter import Emitter
|
||||
from onyx.chat.models import ChatMessageSimple
|
||||
from onyx.chat.models import LlmStepResult
|
||||
from onyx.configs.app_configs import LOG_ONYX_MODEL_INTERACTIONS
|
||||
from onyx.configs.app_configs import PROMPT_CACHE_CHAT_HISTORY
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.context.search.models import SearchDoc
|
||||
from onyx.file_store.models import ChatFileType
|
||||
@@ -432,7 +433,7 @@ def translate_history_to_llm_format(
|
||||
|
||||
for idx, msg in enumerate(history):
|
||||
# if the message is being added to the history
|
||||
if msg.message_type in [
|
||||
if PROMPT_CACHE_CHAT_HISTORY and msg.message_type in [
|
||||
MessageType.SYSTEM,
|
||||
MessageType.USER,
|
||||
MessageType.ASSISTANT,
|
||||
@@ -859,6 +860,11 @@ def run_llm_step_pkt_generator(
|
||||
),
|
||||
obj=result,
|
||||
)
|
||||
# Track emitted citation for saving
|
||||
if state_container:
|
||||
state_container.add_emitted_citation(
|
||||
result.citation_number
|
||||
)
|
||||
else:
|
||||
# When citation_processor is None, use delta.content directly without modification
|
||||
accumulated_answer += delta.content
|
||||
@@ -985,6 +991,9 @@ def run_llm_step_pkt_generator(
|
||||
),
|
||||
obj=result,
|
||||
)
|
||||
# Track emitted citation for saving
|
||||
if state_container:
|
||||
state_container.add_emitted_citation(result.citation_number)
|
||||
|
||||
# Note: Content (AgentResponseDelta) doesn't need an explicit end packet - OverallStop handles it
|
||||
# Tool calls are handled by tool execution code and emit their own packets (e.g., SectionEnd)
|
||||
|
||||
@@ -42,7 +42,6 @@ from onyx.configs.constants import DocumentSource
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.configs.constants import MilestoneRecordType
|
||||
from onyx.context.search.models import BaseFilters
|
||||
from onyx.context.search.models import CitationDocInfo
|
||||
from onyx.context.search.models import SearchDoc
|
||||
from onyx.db.chat import create_new_chat_message
|
||||
from onyx.db.chat import get_chat_session_by_id
|
||||
@@ -744,27 +743,16 @@ def llm_loop_completion_handle(
|
||||
else:
|
||||
final_answer = "The generation was stopped by the user."
|
||||
|
||||
# Build citation_docs_info from accumulated citations in state container
|
||||
citation_docs_info: list[CitationDocInfo] = []
|
||||
seen_citation_nums: set[int] = set()
|
||||
for citation_num, search_doc in state_container.citation_to_doc.items():
|
||||
if citation_num not in seen_citation_nums:
|
||||
seen_citation_nums.add(citation_num)
|
||||
citation_docs_info.append(
|
||||
CitationDocInfo(
|
||||
search_doc=search_doc,
|
||||
citation_number=citation_num,
|
||||
)
|
||||
)
|
||||
|
||||
save_chat_turn(
|
||||
message_text=final_answer,
|
||||
reasoning_tokens=state_container.reasoning_tokens,
|
||||
citation_docs_info=citation_docs_info,
|
||||
citation_to_doc=state_container.citation_to_doc,
|
||||
tool_calls=state_container.tool_calls,
|
||||
all_search_docs=state_container.get_all_search_docs(),
|
||||
db_session=db_session,
|
||||
assistant_message=assistant_message,
|
||||
is_clarification=state_container.is_clarification,
|
||||
emitted_citations=state_container.get_emitted_citations(),
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -2,8 +2,9 @@ import json
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.chat.chat_state import ChatStateContainer
|
||||
from onyx.chat.chat_state import SearchDocKey
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.context.search.models import CitationDocInfo
|
||||
from onyx.context.search.models import SearchDoc
|
||||
from onyx.db.chat import add_search_docs_to_chat_message
|
||||
from onyx.db.chat import add_search_docs_to_tool_call
|
||||
@@ -19,22 +20,6 @@ from onyx.utils.logger import setup_logger
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def _create_search_doc_key(search_doc: SearchDoc) -> tuple[str, int, tuple[str, ...]]:
|
||||
"""
|
||||
Create a unique key for a SearchDoc that accounts for different versions of the same
|
||||
document/chunk with different match_highlights.
|
||||
|
||||
Args:
|
||||
search_doc: The SearchDoc pydantic model to create a key for
|
||||
|
||||
Returns:
|
||||
A tuple of (document_id, chunk_ind, sorted match_highlights) that uniquely identifies
|
||||
this specific version of the document
|
||||
"""
|
||||
match_highlights_tuple = tuple(sorted(search_doc.match_highlights or []))
|
||||
return (search_doc.document_id, search_doc.chunk_ind, match_highlights_tuple)
|
||||
|
||||
|
||||
def _create_and_link_tool_calls(
|
||||
tool_calls: list[ToolCallInfo],
|
||||
assistant_message: ChatMessage,
|
||||
@@ -154,38 +139,36 @@ def save_chat_turn(
|
||||
message_text: str,
|
||||
reasoning_tokens: str | None,
|
||||
tool_calls: list[ToolCallInfo],
|
||||
citation_docs_info: list[CitationDocInfo],
|
||||
citation_to_doc: dict[int, SearchDoc],
|
||||
all_search_docs: dict[SearchDocKey, SearchDoc],
|
||||
db_session: Session,
|
||||
assistant_message: ChatMessage,
|
||||
is_clarification: bool = False,
|
||||
emitted_citations: set[int] | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Save a chat turn by populating the assistant_message and creating related entities.
|
||||
|
||||
This function:
|
||||
1. Updates the ChatMessage with text, reasoning tokens, and token count
|
||||
2. Creates SearchDoc entries from ToolCall search_docs (for tool calls that returned documents)
|
||||
3. Collects all unique SearchDocs from all tool calls and links them to ChatMessage
|
||||
4. Builds citation mapping from citation_docs_info
|
||||
5. Links all unique SearchDocs from tool calls to the ChatMessage
|
||||
2. Creates DB SearchDoc entries from pre-deduplicated all_search_docs
|
||||
3. Builds tool_call -> search_doc mapping for displayed docs
|
||||
4. Builds citation mapping from citation_to_doc
|
||||
5. Links all unique SearchDocs to the ChatMessage
|
||||
6. Creates ToolCall entries and links SearchDocs to them
|
||||
7. Builds the citations mapping for the ChatMessage
|
||||
|
||||
Deduplication Logic:
|
||||
- SearchDocs are deduplicated using (document_id, chunk_ind, match_highlights) as the key
|
||||
- This ensures that the same document/chunk with different match_highlights (from different
|
||||
queries) are stored as separate SearchDoc entries
|
||||
- Each ToolCall and ChatMessage will map to the correct version of the SearchDoc that
|
||||
matches its specific query highlights
|
||||
|
||||
Args:
|
||||
message_text: The message content to save
|
||||
reasoning_tokens: Optional reasoning tokens for the message
|
||||
tool_calls: List of tool call information to create ToolCall entries (may include search_docs)
|
||||
citation_docs_info: List of citation document information for building citations mapping
|
||||
citation_to_doc: Mapping from citation number to SearchDoc for building citations
|
||||
all_search_docs: Pre-deduplicated search docs from ChatStateContainer
|
||||
db_session: Database session for persistence
|
||||
assistant_message: The ChatMessage object to populate (should already exist in DB)
|
||||
is_clarification: Whether this assistant message is a clarification question (deep research flow)
|
||||
emitted_citations: Set of citation numbers that were actually emitted during streaming.
|
||||
If provided, only citations in this set will be saved; others are filtered out.
|
||||
"""
|
||||
# 1. Update ChatMessage with message content, reasoning tokens, and token count
|
||||
assistant_message.message = message_text
|
||||
@@ -200,53 +183,53 @@ def save_chat_turn(
|
||||
else:
|
||||
assistant_message.token_count = 0
|
||||
|
||||
# 2. Create SearchDoc entries from tool_calls
|
||||
# Build mapping from SearchDoc to DB SearchDoc ID
|
||||
# Use (document_id, chunk_ind, match_highlights) as key to avoid duplicates
|
||||
# while ensuring different versions with different highlights are stored separately
|
||||
search_doc_key_to_id: dict[tuple[str, int, tuple[str, ...]], int] = {}
|
||||
tool_call_to_search_doc_ids: dict[str, list[int]] = {}
|
||||
# 2. Create DB SearchDoc entries from pre-deduplicated all_search_docs
|
||||
search_doc_key_to_id: dict[SearchDocKey, int] = {}
|
||||
for key, search_doc_py in all_search_docs.items():
|
||||
db_search_doc = create_db_search_doc(
|
||||
server_search_doc=search_doc_py,
|
||||
db_session=db_session,
|
||||
commit=False,
|
||||
)
|
||||
search_doc_key_to_id[key] = db_search_doc.id
|
||||
|
||||
# Process tool calls and their search docs
|
||||
# 3. Build tool_call -> search_doc mapping (for displayed docs in each tool call)
|
||||
tool_call_to_search_doc_ids: dict[str, list[int]] = {}
|
||||
for tool_call_info in tool_calls:
|
||||
if tool_call_info.search_docs:
|
||||
search_doc_ids_for_tool: list[int] = []
|
||||
for search_doc_py in tool_call_info.search_docs:
|
||||
# Create a unique key for this SearchDoc version
|
||||
search_doc_key = _create_search_doc_key(search_doc_py)
|
||||
|
||||
# Check if we've already created this exact SearchDoc version
|
||||
if search_doc_key in search_doc_key_to_id:
|
||||
search_doc_ids_for_tool.append(search_doc_key_to_id[search_doc_key])
|
||||
key = ChatStateContainer.create_search_doc_key(search_doc_py)
|
||||
if key in search_doc_key_to_id:
|
||||
search_doc_ids_for_tool.append(search_doc_key_to_id[key])
|
||||
else:
|
||||
# Create new DB SearchDoc entry
|
||||
# Displayed doc not in all_search_docs - create it
|
||||
# This can happen if displayed_docs contains docs not in search_docs
|
||||
db_search_doc = create_db_search_doc(
|
||||
server_search_doc=search_doc_py,
|
||||
db_session=db_session,
|
||||
commit=False,
|
||||
)
|
||||
search_doc_key_to_id[search_doc_key] = db_search_doc.id
|
||||
search_doc_key_to_id[key] = db_search_doc.id
|
||||
search_doc_ids_for_tool.append(db_search_doc.id)
|
||||
|
||||
tool_call_to_search_doc_ids[tool_call_info.tool_call_id] = list(
|
||||
set(search_doc_ids_for_tool)
|
||||
)
|
||||
|
||||
# 3. Collect all unique SearchDoc IDs from all tool calls to link to ChatMessage
|
||||
# Use a set to deduplicate by ID (since we've already deduplicated by key above)
|
||||
all_search_doc_ids_set: set[int] = set()
|
||||
for search_doc_ids in tool_call_to_search_doc_ids.values():
|
||||
all_search_doc_ids_set.update(search_doc_ids)
|
||||
# Collect all search doc IDs for ChatMessage linking
|
||||
all_search_doc_ids_set: set[int] = set(search_doc_key_to_id.values())
|
||||
|
||||
# 4. Build citation mapping from citation_docs_info
|
||||
# 4. Build a citation mapping from the citation number to the saved DB SearchDoc ID
|
||||
# Only include citations that were actually emitted during streaming
|
||||
citation_number_to_search_doc_id: dict[int, int] = {}
|
||||
|
||||
for citation_doc_info in citation_docs_info:
|
||||
# Extract SearchDoc pydantic model
|
||||
search_doc_py = citation_doc_info.search_doc
|
||||
for citation_num, search_doc_py in citation_to_doc.items():
|
||||
# Skip citations that weren't actually emitted (if emitted_citations is provided)
|
||||
if emitted_citations is not None and citation_num not in emitted_citations:
|
||||
continue
|
||||
|
||||
# Create the unique key for this SearchDoc version
|
||||
search_doc_key = _create_search_doc_key(search_doc_py)
|
||||
search_doc_key = ChatStateContainer.create_search_doc_key(search_doc_py)
|
||||
|
||||
# Get the search doc ID (should already exist from processing tool_calls)
|
||||
if search_doc_key in search_doc_key_to_id:
|
||||
@@ -283,10 +266,7 @@ def save_chat_turn(
|
||||
all_search_doc_ids_set.add(db_search_doc_id)
|
||||
|
||||
# Build mapping from citation number to search doc ID
|
||||
if citation_doc_info.citation_number is not None:
|
||||
citation_number_to_search_doc_id[citation_doc_info.citation_number] = (
|
||||
db_search_doc_id
|
||||
)
|
||||
citation_number_to_search_doc_id[citation_num] = db_search_doc_id
|
||||
|
||||
# 5. Link all unique SearchDocs (from both tool calls and citations) to ChatMessage
|
||||
final_search_doc_ids: list[int] = list(all_search_doc_ids_set)
|
||||
@@ -306,23 +286,10 @@ def save_chat_turn(
|
||||
tool_call_to_search_doc_ids=tool_call_to_search_doc_ids,
|
||||
)
|
||||
|
||||
# 7. Build citations mapping from citation_docs_info
|
||||
# Any citation_doc_info with a citation_number appeared in the text and should be mapped
|
||||
citations: dict[int, int] = {}
|
||||
for citation_doc_info in citation_docs_info:
|
||||
if citation_doc_info.citation_number is not None:
|
||||
search_doc_id = citation_number_to_search_doc_id.get(
|
||||
citation_doc_info.citation_number
|
||||
)
|
||||
if search_doc_id is not None:
|
||||
citations[citation_doc_info.citation_number] = search_doc_id
|
||||
else:
|
||||
logger.warning(
|
||||
f"Citation number {citation_doc_info.citation_number} found in citation_docs_info "
|
||||
f"but no matching search doc ID in mapping"
|
||||
)
|
||||
|
||||
assistant_message.citations = citations if citations else None
|
||||
# 7. Build citations mapping - use the mapping we already built in step 4
|
||||
assistant_message.citations = (
|
||||
citation_number_to_search_doc_id if citation_number_to_search_doc_id else None
|
||||
)
|
||||
|
||||
# Finally save the messages, tool calls, and docs
|
||||
db_session.commit()
|
||||
|
||||
@@ -208,8 +208,19 @@ OPENSEARCH_REST_API_PORT = int(os.environ.get("OPENSEARCH_REST_API_PORT") or 920
OPENSEARCH_ADMIN_USERNAME = os.environ.get("OPENSEARCH_ADMIN_USERNAME", "admin")
OPENSEARCH_ADMIN_PASSWORD = os.environ.get("OPENSEARCH_ADMIN_PASSWORD", "")

ENABLE_OPENSEARCH_FOR_ONYX = (
os.environ.get("ENABLE_OPENSEARCH_FOR_ONYX", "").lower() == "true"
# This is the "base" config for now; the idea is that, at least for our dev
# environments, we always want to be dual indexing into both OpenSearch and Vespa
# to stress test the new codepaths. Only enable this if there is some instance
# of OpenSearch running for the relevant Onyx instance.
ENABLE_OPENSEARCH_INDEXING_FOR_ONYX = (
os.environ.get("ENABLE_OPENSEARCH_INDEXING_FOR_ONYX", "").lower() == "true"
)
# Given that the "base" config above is true, this controls whether we retrieve
# from OpenSearch or from Vespa. We want to be able to quickly toggle this
# in the event we see issues with OpenSearch retrieval in our dev environments.
ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX = (
ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
and os.environ.get("ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX", "").lower() == "true"
)
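
Editor's note: a small sketch of how the two flags compose (retrieval can never be on without indexing). The helper below is illustrative only and is not part of the configuration module:

def opensearch_flags(env: dict[str, str]) -> tuple[bool, bool]:
    # Mirrors the composition above.
    indexing = env.get("ENABLE_OPENSEARCH_INDEXING_FOR_ONYX", "").lower() == "true"
    retrieval = (
        indexing
        and env.get("ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX", "").lower() == "true"
    )
    return indexing, retrieval

print(opensearch_flags({}))  # (False, False)
print(opensearch_flags({"ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX": "true"}))  # (False, False)
print(opensearch_flags({"ENABLE_OPENSEARCH_INDEXING_FOR_ONYX": "true"}))  # (True, False)
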
|
||||
|
||||
VESPA_HOST = os.environ.get("VESPA_HOST") or "localhost"
|
||||
@@ -738,6 +749,10 @@ JOB_TIMEOUT = 60 * 60 * 6 # 6 hours default
|
||||
LOG_ONYX_MODEL_INTERACTIONS = (
|
||||
os.environ.get("LOG_ONYX_MODEL_INTERACTIONS", "").lower() == "true"
|
||||
)
|
||||
|
||||
PROMPT_CACHE_CHAT_HISTORY = (
|
||||
os.environ.get("PROMPT_CACHE_CHAT_HISTORY", "").lower() == "true"
|
||||
)
|
||||
# If set to `true` will enable additional logs about Vespa query performance
|
||||
# (time spent on finding the right docs + time spent fetching summaries from disk)
|
||||
LOG_VESPA_TIMING_INFORMATION = (
|
||||
@@ -1016,3 +1031,25 @@ INSTANCE_TYPE = (
|
||||
## Discord Bot Configuration
|
||||
DISCORD_BOT_TOKEN = os.environ.get("DISCORD_BOT_TOKEN")
|
||||
DISCORD_BOT_INVOKE_CHAR = os.environ.get("DISCORD_BOT_INVOKE_CHAR", "!")
|
||||
|
||||
|
||||
## Stripe Configuration
|
||||
# URL to fetch the Stripe publishable key from a public S3 bucket.
|
||||
# Publishable keys are safe to expose publicly - they can only initialize
|
||||
# Stripe.js and tokenize payment info, not make charges or access data.
|
||||
STRIPE_PUBLISHABLE_KEY_URL = (
|
||||
"https://onyx-stripe-public.s3.amazonaws.com/publishable-key.txt"
|
||||
)
|
||||
# Override for local testing with Stripe test keys (pk_test_*)
|
||||
STRIPE_PUBLISHABLE_KEY_OVERRIDE = os.environ.get("STRIPE_PUBLISHABLE_KEY")
|
||||
# Persistent Document Storage Configuration
|
||||
# When enabled, indexed documents are written to local filesystem with hierarchical structure
|
||||
PERSISTENT_DOCUMENT_STORAGE_ENABLED = (
|
||||
os.environ.get("PERSISTENT_DOCUMENT_STORAGE_ENABLED", "").lower() == "true"
|
||||
)
|
||||
|
||||
# Base directory path for persistent document storage (local filesystem)
|
||||
# Example: /var/onyx/indexed-docs or /app/indexed-docs
|
||||
PERSISTENT_DOCUMENT_STORAGE_PATH = os.environ.get(
|
||||
"PERSISTENT_DOCUMENT_STORAGE_PATH", "/app/indexed-docs"
|
||||
)
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import os
|
||||
|
||||
INPUT_PROMPT_YAML = "./onyx/seeding/input_prompts.yaml"
|
||||
PROMPTS_YAML = "./onyx/seeding/prompts.yaml"
|
||||
PERSONAS_YAML = "./onyx/seeding/personas.yaml"
|
||||
NUM_RETURNED_HITS = 50
|
||||
|
||||
@@ -241,6 +241,7 @@ class NotificationType(str, Enum):
|
||||
TRIAL_ENDS_TWO_DAYS = "two_day_trial_ending" # 2 days left in trial
|
||||
RELEASE_NOTES = "release_notes"
|
||||
ASSISTANT_FILES_READY = "assistant_files_ready"
|
||||
FEATURE_ANNOUNCEMENT = "feature_announcement"
|
||||
|
||||
|
||||
class BlobType(str, Enum):
|
||||
@@ -327,6 +328,7 @@ class FileOrigin(str, Enum):
|
||||
PLAINTEXT_CACHE = "plaintext_cache"
|
||||
OTHER = "other"
|
||||
QUERY_HISTORY_CSV = "query_history_csv"
|
||||
SANDBOX_SNAPSHOT = "sandbox_snapshot"
|
||||
USER_FILE = "user_file"
|
||||
|
||||
|
||||
@@ -344,6 +346,7 @@ class MilestoneRecordType(str, Enum):
|
||||
MULTIPLE_ASSISTANTS = "multiple_assistants"
|
||||
CREATED_ASSISTANT = "created_assistant"
|
||||
CREATED_ONYX_BOT = "created_onyx_bot"
|
||||
REQUESTED_CONNECTOR = "requested_connector"
|
||||
|
||||
|
||||
class PostgresAdvisoryLocks(Enum):
|
||||
@@ -383,6 +386,9 @@ class OnyxCeleryQueues:
|
||||
# KG processing queue
|
||||
KG_PROCESSING = "kg_processing"
|
||||
|
||||
# Sandbox processing queue
|
||||
SANDBOX = "sandbox"
|
||||
|
||||
|
||||
class OnyxRedisLocks:
|
||||
PRIMARY_WORKER = "da_lock:primary_worker"
|
||||
@@ -431,6 +437,10 @@ class OnyxRedisLocks:
|
||||
# Release notes
|
||||
RELEASE_NOTES_FETCH_LOCK = "da_lock:release_notes_fetch"
|
||||
|
||||
# Sandbox cleanup
|
||||
CLEANUP_IDLE_SANDBOXES_BEAT_LOCK = "da_lock:cleanup_idle_sandboxes_beat"
|
||||
CLEANUP_OLD_SNAPSHOTS_BEAT_LOCK = "da_lock:cleanup_old_snapshots_beat"
|
||||
|
||||
|
||||
class OnyxRedisSignals:
|
||||
BLOCK_VALIDATE_INDEXING_FENCES = "signal:block_validate_indexing_fences"
|
||||
@@ -556,6 +566,10 @@ class OnyxCeleryTask:
|
||||
CHECK_KG_PROCESSING_CLUSTERING_ONLY = "check_kg_processing_clustering_only"
|
||||
KG_RESET_SOURCE_INDEX = "kg_reset_source_index"
|
||||
|
||||
# Sandbox cleanup
|
||||
CLEANUP_IDLE_SANDBOXES = "cleanup_idle_sandboxes"
|
||||
CLEANUP_OLD_SNAPSHOTS = "cleanup_old_snapshots"
|
||||
|
||||
|
||||
# this needs to correspond to the matching entry in supervisord
|
||||
ONYX_CELERY_BEAT_HEARTBEAT_KEY = "onyx:celery:beat:heartbeat"
|
||||
|
||||
@@ -89,6 +89,9 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None:
|
||||
meeting_date_unix = transcript["date"]
|
||||
meeting_date = datetime.fromtimestamp(meeting_date_unix / 1000, tz=timezone.utc)
|
||||
|
||||
# Build hierarchy based on meeting date (year-month)
|
||||
year_month = meeting_date.strftime("%Y-%m")
|
||||
|
||||
meeting_organizer_email = transcript["organizer_email"]
|
||||
organizer_email_user_info = [BasicExpertInfo(email=meeting_organizer_email)]
|
||||
|
||||
@@ -102,6 +105,14 @@ def _create_doc_from_transcript(transcript: dict) -> Document | None:
|
||||
sections=cast(list[TextSection | ImageSection], sections),
|
||||
source=DocumentSource.FIREFLIES,
|
||||
semantic_identifier=meeting_title,
|
||||
doc_metadata={
|
||||
"hierarchy": {
|
||||
"source_path": [year_month],
|
||||
"year_month": year_month,
|
||||
"meeting_title": meeting_title,
|
||||
"organizer_email": meeting_organizer_email,
|
||||
}
|
||||
},
|
||||
metadata={
|
||||
k: str(v)
|
||||
for k, v in {
|
||||
|
||||
@@ -240,8 +240,21 @@ def _get_userinfo(user: NamedUser) -> dict[str, str]:
|
||||
def _convert_pr_to_document(
|
||||
pull_request: PullRequest, repo_external_access: ExternalAccess | None
|
||||
) -> Document:
|
||||
repo_name = pull_request.base.repo.full_name if pull_request.base else ""
|
||||
doc_metadata = DocMetadata(repo=repo_name)
|
||||
repo_full_name = pull_request.base.repo.full_name if pull_request.base else ""
|
||||
# Split full_name (e.g., "owner/repo") into owner and repo
|
||||
parts = repo_full_name.split("/", 1)
|
||||
owner_name = parts[0] if parts else ""
|
||||
repo_name = parts[1] if len(parts) > 1 else repo_full_name
|
||||
|
||||
doc_metadata = {
|
||||
"repo": repo_full_name,
|
||||
"hierarchy": {
|
||||
"source_path": [owner_name, repo_name, "pull_requests"],
|
||||
"owner": owner_name,
|
||||
"repo": repo_name,
|
||||
"object_type": "pull_request",
|
||||
},
|
||||
}
|
||||
return Document(
|
||||
id=pull_request.html_url,
|
||||
sections=[
|
||||
@@ -259,7 +272,7 @@ def _convert_pr_to_document(
|
||||
else None
|
||||
),
|
||||
# this metadata is used in perm sync
|
||||
doc_metadata=doc_metadata.model_dump(),
|
||||
doc_metadata=doc_metadata,
|
||||
metadata={
|
||||
k: [str(vi) for vi in v] if isinstance(v, list) else str(v)
|
||||
for k, v in {
|
||||
@@ -316,8 +329,21 @@ def _fetch_issue_comments(issue: Issue) -> str:
|
||||
def _convert_issue_to_document(
|
||||
issue: Issue, repo_external_access: ExternalAccess | None
|
||||
) -> Document:
|
||||
repo_name = issue.repository.full_name if issue.repository else ""
|
||||
doc_metadata = DocMetadata(repo=repo_name)
|
||||
repo_full_name = issue.repository.full_name if issue.repository else ""
|
||||
# Split full_name (e.g., "owner/repo") into owner and repo
|
||||
parts = repo_full_name.split("/", 1)
|
||||
owner_name = parts[0] if parts else ""
|
||||
repo_name = parts[1] if len(parts) > 1 else repo_full_name
|
||||
|
||||
doc_metadata = {
|
||||
"repo": repo_full_name,
|
||||
"hierarchy": {
|
||||
"source_path": [owner_name, repo_name, "issues"],
|
||||
"owner": owner_name,
|
||||
"repo": repo_name,
|
||||
"object_type": "issue",
|
||||
},
|
||||
}
|
||||
return Document(
|
||||
id=issue.html_url,
|
||||
sections=[TextSection(link=issue.html_url, text=issue.body or "")],
|
||||
@@ -327,7 +353,7 @@ def _convert_issue_to_document(
|
||||
# updated_at is UTC time but is timezone unaware
|
||||
doc_updated_at=issue.updated_at.replace(tzinfo=timezone.utc),
|
||||
# this metadata is used in perm sync
|
||||
doc_metadata=doc_metadata.model_dump(),
|
||||
doc_metadata=doc_metadata,
|
||||
metadata={
|
||||
k: [str(vi) for vi in v] if isinstance(v, list) else str(v)
|
||||
for k, v in {
|
||||
|
||||
@@ -390,7 +390,9 @@ class GmailConnector(
|
||||
"""
|
||||
List all user emails if we are on a Google Workspace domain.
|
||||
If the domain is gmail.com, or if we attempt to call the Admin SDK and
|
||||
get a 404, fall back to using the single user.
|
||||
get a 404 or 403, fall back to using the single user.
|
||||
A 404 indicates a personal Gmail account with no Workspace domain.
|
||||
A 403 indicates insufficient permissions (e.g., OAuth user without admin privileges).
|
||||
"""
|
||||
|
||||
try:
|
||||
@@ -413,6 +415,13 @@ class GmailConnector(
|
||||
"with no Workspace domain. Falling back to single user."
|
||||
)
|
||||
return [self.primary_admin_email]
|
||||
elif e.resp.status == 403:
|
||||
logger.warning(
|
||||
"Received 403 from Admin SDK; this may indicate insufficient permissions "
|
||||
"(e.g., OAuth user without admin privileges or service account without "
|
||||
"domain-wide delegation). Falling back to single user."
|
||||
)
|
||||
return [self.primary_admin_email]
|
||||
raise
|
||||
|
||||
def _fetch_threads_impl(
|
||||
|
||||
@@ -46,6 +46,138 @@ from onyx.utils.variable_functionality import noop_fallback
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
# Cache for folder path lookups to avoid redundant API calls
|
||||
# Maps folder_id -> (folder_name, parent_id)
|
||||
_folder_cache: dict[str, tuple[str, str | None]] = {}
|
||||
|
||||
|
||||
def _get_folder_info(
|
||||
service: GoogleDriveService, folder_id: str
|
||||
) -> tuple[str, str | None]:
|
||||
"""Fetch folder name and parent ID, with caching."""
|
||||
if folder_id in _folder_cache:
|
||||
return _folder_cache[folder_id]
|
||||
|
||||
try:
|
||||
folder = (
|
||||
service.files()
|
||||
.get(
|
||||
fileId=folder_id,
|
||||
fields="name, parents",
|
||||
supportsAllDrives=True,
|
||||
)
|
||||
.execute()
|
||||
)
|
||||
folder_name = folder.get("name", "Unknown")
|
||||
parents = folder.get("parents", [])
|
||||
parent_id = parents[0] if parents else None
|
||||
_folder_cache[folder_id] = (folder_name, parent_id)
|
||||
return folder_name, parent_id
|
||||
except HttpError as e:
|
||||
logger.warning(f"Failed to get folder info for {folder_id}: {e}")
|
||||
_folder_cache[folder_id] = ("Unknown", None)
|
||||
return "Unknown", None
|
||||
|
||||
|
||||
def _get_drive_name(service: GoogleDriveService, drive_id: str) -> str:
|
||||
"""Fetch shared drive name."""
|
||||
cache_key = f"drive_{drive_id}"
|
||||
if cache_key in _folder_cache:
|
||||
return _folder_cache[cache_key][0]
|
||||
|
||||
try:
|
||||
drive = service.drives().get(driveId=drive_id).execute()
|
||||
drive_name = drive.get("name", f"Shared Drive {drive_id}")
|
||||
_folder_cache[cache_key] = (drive_name, None)
|
||||
return drive_name
|
||||
except HttpError as e:
|
||||
logger.warning(f"Failed to get drive name for {drive_id}: {e}")
|
||||
_folder_cache[cache_key] = (f"Shared Drive {drive_id}", None)
|
||||
return f"Shared Drive {drive_id}"
|
||||
|
||||
|
||||
def build_folder_path(
|
||||
file: GoogleDriveFileType,
|
||||
service: GoogleDriveService,
|
||||
drive_id: str | None = None,
|
||||
user_email: str | None = None,
|
||||
) -> list[str]:
|
||||
"""
|
||||
Build the full folder path for a file by walking up the parent chain.
|
||||
Returns a list of folder names from root to immediate parent.
|
||||
|
||||
Args:
|
||||
file: The Google Drive file object
|
||||
service: Google Drive service instance
|
||||
drive_id: Optional drive ID (will be extracted from file if not provided)
|
||||
user_email: Optional user email to check ownership for "My Drive" vs "Shared with me"
|
||||
"""
|
||||
path_parts: list[str] = []
|
||||
|
||||
# Get drive_id from file if not provided
|
||||
if drive_id is None:
|
||||
drive_id = file.get("driveId")
|
||||
|
||||
# Check if file is owned by the user (for distinguishing "My Drive" vs "Shared with me")
|
||||
is_owned_by_user = False
|
||||
if user_email:
|
||||
owners = file.get("owners", [])
|
||||
is_owned_by_user = any(
|
||||
owner.get("emailAddress", "").lower() == user_email.lower()
|
||||
for owner in owners
|
||||
)
|
||||
|
||||
# Get the file's parent folder ID
|
||||
parents = file.get("parents", [])
|
||||
if not parents:
|
||||
# File is at root level
|
||||
if drive_id:
|
||||
return [_get_drive_name(service, drive_id)]
|
||||
# If not in a shared drive, check if it's owned by the user
|
||||
if is_owned_by_user:
|
||||
return ["My Drive"]
|
||||
else:
|
||||
return ["Shared with me"]
|
||||
|
||||
parent_id: str | None = parents[0]
|
||||
|
||||
# Walk up the folder hierarchy (limit to 50 levels to prevent infinite loops)
|
||||
visited: set[str] = set()
|
||||
for _ in range(50):
|
||||
if not parent_id or parent_id in visited:
|
||||
break
|
||||
visited.add(parent_id)
|
||||
|
||||
folder_name, next_parent = _get_folder_info(service, parent_id)
|
||||
|
||||
# Check if we've reached the root (parent is the drive itself or no parent)
|
||||
if next_parent is None:
|
||||
# This folder's name is either the drive root, My Drive, or Shared with me
|
||||
if drive_id:
|
||||
path_parts.insert(0, _get_drive_name(service, drive_id))
|
||||
else:
|
||||
# Not in a shared drive - determine if it's "My Drive" or "Shared with me"
|
||||
if is_owned_by_user:
|
||||
path_parts.insert(0, "My Drive")
|
||||
else:
|
||||
path_parts.insert(0, "Shared with me")
|
||||
break
|
||||
else:
|
||||
path_parts.insert(0, folder_name)
|
||||
parent_id = next_parent
|
||||
|
||||
# If we didn't find a root, determine the root based on ownership and drive
|
||||
if not path_parts:
|
||||
if drive_id:
|
||||
return [_get_drive_name(service, drive_id)]
|
||||
elif is_owned_by_user:
|
||||
return ["My Drive"]
|
||||
else:
|
||||
return ["Shared with me"]
|
||||
|
||||
return path_parts
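
Editor's note: a toy version of this parent-chain walk, on plain dicts instead of the Drive API, shows the intended output shape. Folder names and IDs below are made up:

folders = {
    "q3": ("Q3", "finance"),
    "finance": ("Finance", "root"),
    "root": ("My Drive", None),  # the My Drive root folder has no parent
}

def toy_folder_path(parent_id: str | None, owned_by_user: bool) -> list[str]:
    parts: list[str] = []
    visited: set[str] = set()
    while parent_id and parent_id not in visited:
        visited.add(parent_id)
        name, next_parent = folders[parent_id]
        if next_parent is None:
            # Reached the root: label it instead of using the raw folder name.
            parts.insert(0, "My Drive" if owned_by_user else "Shared with me")
            break
        parts.insert(0, name)
        parent_id = next_parent
    return parts

print(toy_folder_path("q3", owned_by_user=True))  # ['My Drive', 'Finance', 'Q3']
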
|
||||
|
||||
|
||||
# This is not a standard valid unicode char, it is used by the docs advanced API to
|
||||
# represent smart chips (elements like dates and doc links).
|
||||
SMART_CHIP_CHAR = "\ue907"
|
||||
@@ -526,12 +658,33 @@ def _convert_drive_item_to_document(
|
||||
else None
|
||||
)
|
||||
|
||||
# Build doc_metadata with hierarchy information
|
||||
file_name = file.get("name", "")
|
||||
mime_type = file.get("mimeType", "")
|
||||
drive_id = file.get("driveId")
|
||||
|
||||
# Build full folder path by walking up the parent chain
|
||||
# Pass retriever_email to determine if file is in "My Drive" vs "Shared with me"
|
||||
source_path = build_folder_path(
|
||||
file, _get_drive_service(), drive_id, retriever_email
|
||||
)
|
||||
|
||||
doc_metadata = {
|
||||
"hierarchy": {
|
||||
"source_path": source_path,
|
||||
"drive_id": drive_id,
|
||||
"file_name": file_name,
|
||||
"mime_type": mime_type,
|
||||
}
|
||||
}
|
||||
|
||||
# Create the document
|
||||
return Document(
|
||||
id=doc_id,
|
||||
sections=sections,
|
||||
source=DocumentSource.GOOGLE_DRIVE,
|
||||
semantic_identifier=file.get("name", ""),
|
||||
semantic_identifier=file_name,
|
||||
doc_metadata=doc_metadata,
|
||||
metadata={
|
||||
"owner_names": ", ".join(
|
||||
owner.get("displayName", "") for owner in file.get("owners", [])
|
||||
|
||||
@@ -39,11 +39,11 @@ PERMISSION_FULL_DESCRIPTION = (
|
||||
"permissions(id, emailAddress, type, domain, allowFileDiscovery, permissionDetails)"
|
||||
)
|
||||
FILE_FIELDS = (
|
||||
"nextPageToken, files(mimeType, id, name, "
|
||||
"nextPageToken, files(mimeType, id, name, driveId, parents, "
|
||||
"modifiedTime, webViewLink, shortcutDetails, owners(emailAddress), size)"
|
||||
)
|
||||
FILE_FIELDS_WITH_PERMISSIONS = (
|
||||
f"nextPageToken, files(mimeType, id, name, {PERMISSION_FULL_DESCRIPTION}, permissionIds, "
|
||||
f"nextPageToken, files(mimeType, id, name, driveId, parents, {PERMISSION_FULL_DESCRIPTION}, permissionIds, "
|
||||
"modifiedTime, webViewLink, shortcutDetails, owners(emailAddress), size)"
|
||||
)
|
||||
SLIM_FILE_FIELDS = (
|
||||
|
||||
@@ -490,6 +490,13 @@ class HubSpotConnector(LoadConnector, PollConnector):
|
||||
semantic_identifier=title,
|
||||
doc_updated_at=ticket.updated_at.replace(tzinfo=timezone.utc),
|
||||
metadata=metadata,
|
||||
doc_metadata={
|
||||
"hierarchy": {
|
||||
"source_path": ["Tickets"],
|
||||
"object_type": "ticket",
|
||||
"object_id": ticket.id,
|
||||
}
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
@@ -615,6 +622,13 @@ class HubSpotConnector(LoadConnector, PollConnector):
|
||||
semantic_identifier=title,
|
||||
doc_updated_at=company.updated_at.replace(tzinfo=timezone.utc),
|
||||
metadata=metadata,
|
||||
doc_metadata={
|
||||
"hierarchy": {
|
||||
"source_path": ["Companies"],
|
||||
"object_type": "company",
|
||||
"object_id": company.id,
|
||||
}
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
@@ -738,6 +752,13 @@ class HubSpotConnector(LoadConnector, PollConnector):
|
||||
semantic_identifier=title,
|
||||
doc_updated_at=deal.updated_at.replace(tzinfo=timezone.utc),
|
||||
metadata=metadata,
|
||||
doc_metadata={
|
||||
"hierarchy": {
|
||||
"source_path": ["Deals"],
|
||||
"object_type": "deal",
|
||||
"object_id": deal.id,
|
||||
}
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
@@ -881,6 +902,13 @@ class HubSpotConnector(LoadConnector, PollConnector):
                    semantic_identifier=title,
                    doc_updated_at=contact.updated_at.replace(tzinfo=timezone.utc),
                    metadata=metadata,
                    doc_metadata={
                        "hierarchy": {
                            "source_path": ["Contacts"],
                            "object_type": "contact",
                            "object_id": contact.id,
                        }
                    },
                )
            )
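Note: the four HubSpot object types above repeat the same doc_metadata shape. A hypothetical helper (not part of the diff) that captures the pattern:

def _hubspot_hierarchy(object_type: str, object_id: str) -> dict:
    """Hypothetical helper, for illustration only: builds the hierarchy payload
    used for HubSpot documents. source_path is the pluralized collection name."""
    collection = {
        "ticket": "Tickets",
        "company": "Companies",
        "deal": "Deals",
        "contact": "Contacts",
    }[object_type]
    return {
        "hierarchy": {
            "source_path": [collection],
            "object_type": object_type,
            "object_id": object_id,
        }
    }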
@@ -274,6 +274,10 @@ class LinearConnector(LoadConnector, PollConnector, OAuthConnector):
                # Cast the sections list to the expected type
                typed_sections = cast(list[TextSection | ImageSection], sections)

                # Extract team name for hierarchy
                team_name = (node.get("team") or {}).get("name") or "Unknown Team"
                identifier = node.get("identifier", node["id"])

                documents.append(
                    Document(
                        id=node["id"],
@@ -282,6 +286,13 @@ class LinearConnector(LoadConnector, PollConnector, OAuthConnector):
                        semantic_identifier=f"[{node['identifier']}] {node['title']}",
                        title=node["title"],
                        doc_updated_at=time_str_to_utc(node["updatedAt"]),
                        doc_metadata={
                            "hierarchy": {
                                "source_path": [team_name],
                                "team_name": team_name,
                                "identifier": identifier,
                            }
                        },
                        metadata={
                            k: str(v)
                            for k, v in {
@@ -234,6 +234,8 @@ def thread_to_doc(
        "\n", " "
    )

    channel_name = channel["name"]

    return Document(
        id=_build_doc_id(channel_id=channel_id, thread_ts=thread[0]["ts"]),
        sections=[
@@ -247,7 +249,14 @@ def thread_to_doc(
        semantic_identifier=doc_sem_id,
        doc_updated_at=get_latest_message_time(thread),
        primary_owners=valid_experts,
        metadata={"Channel": channel["name"]},
        doc_metadata={
            "hierarchy": {
                "source_path": [channel_name],
                "channel_name": channel_name,
                "channel_id": channel_id,
            }
        },
        metadata={"Channel": channel_name},
        external_access=channel_access,
    )
@@ -15,6 +15,7 @@ from onyx.federated_connectors.slack.models import SlackEntities
from onyx.llm.interfaces import LLM
from onyx.llm.models import UserMessage
from onyx.llm.utils import llm_response_to_string
from onyx.natural_language_processing.english_stopwords import ENGLISH_STOPWORDS_SET
from onyx.onyxbot.slack.models import ChannelType
from onyx.prompts.federated_search import SLACK_DATE_EXTRACTION_PROMPT
from onyx.prompts.federated_search import SLACK_QUERY_EXPANSION_PROMPT

@@ -113,7 +114,7 @@ def is_recency_query(query: str) -> bool:
    if not has_recency_keyword:
        return False

    # Get combined stop words (NLTK + Slack-specific)
    # Get combined stop words (English + Slack-specific)
    all_stop_words = _get_combined_stop_words()

    # Extract content words (excluding stop words)

@@ -488,7 +489,7 @@ def build_channel_override_query(channel_references: set[str], time_filter: str)
    return f"__CHANNEL_OVERRIDE__ {channel_filter}{time_filter}"


# Slack-specific stop words (in addition to standard NLTK stop words)
# Slack-specific stop words (in addition to standard English stop words)
# These include Slack-specific terms and temporal/recency keywords
SLACK_SPECIFIC_STOP_WORDS = frozenset(
    RECENCY_KEYWORDS

@@ -508,27 +509,16 @@ SLACK_SPECIFIC_STOP_WORDS = frozenset(
)


def _get_combined_stop_words() -> set[str]:
    """Get combined NLTK + Slack-specific stop words.
def _get_combined_stop_words() -> frozenset[str]:
    """Get combined English + Slack-specific stop words.

    Returns a set of stop words for filtering content words.
    Falls back to just Slack-specific stop words if NLTK is unavailable.
    Returns a frozenset of stop words for filtering content words.

    Note: Currently only supports English stop words. Non-English queries
    may have suboptimal content word extraction. Future enhancement could
    detect query language and load appropriate stop words.
    """
    try:
        from nltk.corpus import stopwords  # type: ignore

        # TODO: Support multiple languages - currently hardcoded to English
        # Could detect language or allow configuration
        nltk_stop_words = set(stopwords.words("english"))
    except Exception:
        # Fallback if NLTK not available
        nltk_stop_words = set()

    return nltk_stop_words | SLACK_SPECIFIC_STOP_WORDS
    return ENGLISH_STOPWORDS_SET | SLACK_SPECIFIC_STOP_WORDS


def extract_content_words_from_recency_query(

@@ -536,7 +526,7 @@ def extract_content_words_from_recency_query(
) -> list[str]:
    """Extract meaningful content words from a recency query.

    Filters out NLTK stop words, Slack-specific terms, channel references, and proper nouns.
    Filters out English stop words, Slack-specific terms, channel references, and proper nouns.

    Args:
        query_text: The user's query text

@@ -545,7 +535,7 @@ def extract_content_words_from_recency_query(
    Returns:
        List of content words (up to MAX_CONTENT_WORDS)
    """
    # Get combined stop words (NLTK + Slack-specific)
    # Get combined stop words (English + Slack-specific)
    all_stop_words = _get_combined_stop_words()

    words = query_text.split()

@@ -567,6 +557,23 @@ def extract_content_words_from_recency_query(
    return content_words_filtered[:MAX_CONTENT_WORDS]


def _is_valid_keyword_query(line: str) -> bool:
    """Check if a line looks like a valid keyword query vs explanatory text.

    Returns False for lines that appear to be LLM explanations rather than keywords.
    """
    # Reject lines that start with parentheses (explanatory notes)
    if line.startswith("("):
        return False

    # Reject lines that are too long (likely sentences, not keywords)
    # Keywords should be short - reject if > 50 chars or > 6 words
    if len(line) > 50 or len(line.split()) > 6:
        return False

    return True


def expand_query_with_llm(query_text: str, llm: LLM) -> list[str]:
    """Use LLM to expand query into multiple search variations.

@@ -589,10 +596,18 @@ def expand_query_with_llm(query_text: str, llm: LLM) -> list[str]:
    response_clean = _parse_llm_code_block_response(response)

    # Split into lines and filter out empty lines
    rephrased_queries = [
    raw_queries = [
        line.strip() for line in response_clean.split("\n") if line.strip()
    ]

    # Filter out lines that look like explanatory text rather than keywords
    rephrased_queries = [q for q in raw_queries if _is_valid_keyword_query(q)]

    # Log if we filtered out garbage
    if len(raw_queries) != len(rephrased_queries):
        filtered_out = set(raw_queries) - set(rephrased_queries)
        logger.warning(f"Filtered out non-keyword LLM responses: {filtered_out}")

    # If no queries generated, use empty query
    if not rephrased_queries:
        logger.debug("No content keywords extracted from query expansion")
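Note: a quick illustration of how _is_valid_keyword_query prunes LLM output before it is used as search queries; the sample response text is invented.

sample_llm_output = """
deployment outage postmortem
kubernetes rollback
(Note: these are keyword variations of the original question)
This query is asking about the most recent production incident in the payments service.
"""

raw = [line.strip() for line in sample_llm_output.split("\n") if line.strip()]
kept = [line for line in raw if _is_valid_keyword_query(line)]
# kept == ["deployment outage postmortem", "kubernetes rollback"]
# The parenthetical note and the long explanatory sentence are filtered out.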
@@ -144,10 +144,6 @@ class BasicChunkRequest(BaseModel):
    # In case some queries favor recency more than other queries.
    recency_bias_multiplier: float = 1.0

    # Sometimes we may want to extract specific keywords from a more semantic query for
    # a better keyword search.
    query_keywords: list[str] | None = None  # Not used currently

    limit: int | None = None
    offset: int | None = None  # This one is not set currently

@@ -166,6 +162,8 @@ class ChunkIndexRequest(BasicChunkRequest):
    # Calculated final filters
    filters: IndexFilters

    query_keywords: list[str] | None = None


class ContextExpansionType(str, Enum):
    NOT_RELEVANT = "not_relevant"
@@ -372,6 +370,10 @@ class SearchDocsResponse(BaseModel):
    # document id is the most staightforward way.
    citation_mapping: dict[int, str]

    # For cases where the frontend only needs to display a subset of the search docs
    # The whole list is typically still needed for later steps but this set should be saved separately
    displayed_docs: list[SearchDoc] | None = None


class SavedSearchDoc(SearchDoc):
    db_doc_id: int

@@ -430,11 +432,6 @@ class SavedSearchDoc(SearchDoc):
        return self_score < other_score


class CitationDocInfo(BaseModel):
    search_doc: SearchDoc
    citation_number: int | None


class SavedSearchDocWithContent(SavedSearchDoc):
    """Used for endpoints that need to return the actual contents of the retrieved
    section in addition to the match_highlights."""
@@ -19,6 +19,7 @@ from onyx.db.models import Persona
from onyx.db.models import User
from onyx.document_index.interfaces import DocumentIndex
from onyx.llm.interfaces import LLM
from onyx.natural_language_processing.english_stopwords import strip_stopwords
from onyx.secondary_llm_flows.source_filter import extract_source_filter
from onyx.secondary_llm_flows.time_filter import extract_time_filter
from onyx.utils.logger import setup_logger

@@ -278,12 +279,16 @@ def search_pipeline(
        bypass_acl=chunk_search_request.bypass_acl,
    )

    query_keywords = strip_stopwords(chunk_search_request.query)

    query_request = ChunkIndexRequest(
        query=chunk_search_request.query,
        hybrid_alpha=chunk_search_request.hybrid_alpha,
        recency_bias_multiplier=chunk_search_request.recency_bias_multiplier,
        query_keywords=chunk_search_request.query_keywords,
        query_keywords=query_keywords,
        filters=filters,
        limit=chunk_search_request.limit,
        offset=chunk_search_request.offset,
    )

    retrieved_chunks = search_chunks(
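Note: a small sketch of the new keyword derivation, assuming strip_stopwords simply drops English stopwords and returns the remaining tokens (its exact implementation is not shown in this diff).

from onyx.natural_language_processing.english_stopwords import ENGLISH_STOPWORDS_SET

def strip_stopwords_sketch(query: str) -> list[str]:
    """Rough approximation of strip_stopwords, for illustration only; the real
    helper lives in onyx.natural_language_processing.english_stopwords."""
    return [w for w in query.split() if w.lower() not in ENGLISH_STOPWORDS_SET]

print(strip_stopwords_sketch("what were the latest changes to the billing service"))
# e.g. ["latest", "changes", "billing", "service"] -- these tokens become
# query_keywords on ChunkIndexRequest so the BM25 side of hybrid search is
# not diluted by stopwords.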
@@ -23,45 +23,6 @@ from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def _dedupe_chunks(
|
||||
chunks: list[InferenceChunk],
|
||||
) -> list[InferenceChunk]:
|
||||
used_chunks: dict[tuple[str, int], InferenceChunk] = {}
|
||||
for chunk in chunks:
|
||||
key = (chunk.document_id, chunk.chunk_id)
|
||||
if key not in used_chunks:
|
||||
used_chunks[key] = chunk
|
||||
else:
|
||||
stored_chunk_score = used_chunks[key].score or 0
|
||||
this_chunk_score = chunk.score or 0
|
||||
if stored_chunk_score < this_chunk_score:
|
||||
used_chunks[key] = chunk
|
||||
|
||||
return list(used_chunks.values())
|
||||
|
||||
|
||||
def download_nltk_data() -> None:
|
||||
import nltk # type: ignore[import-untyped]
|
||||
|
||||
resources = {
|
||||
"stopwords": "corpora/stopwords",
|
||||
# "wordnet": "corpora/wordnet", # Not in use
|
||||
"punkt_tab": "tokenizers/punkt_tab",
|
||||
}
|
||||
|
||||
for resource_name, resource_path in resources.items():
|
||||
try:
|
||||
nltk.data.find(resource_path)
|
||||
logger.info(f"{resource_name} is already downloaded.")
|
||||
except LookupError:
|
||||
try:
|
||||
logger.info(f"Downloading {resource_name}...")
|
||||
nltk.download(resource_name, quiet=True)
|
||||
logger.info(f"{resource_name} downloaded successfully.")
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download {resource_name}. Error: {e}")
|
||||
|
||||
|
||||
def combine_retrieval_results(
|
||||
chunk_sets: list[list[InferenceChunk]],
|
||||
) -> list[InferenceChunk]:
|
||||
|
||||
@@ -22,6 +22,7 @@ from onyx.db.credentials import fetch_credential_by_id_for_user
|
||||
from onyx.db.engine.sql_engine import get_session_with_current_tenant
|
||||
from onyx.db.enums import AccessType
|
||||
from onyx.db.enums import ConnectorCredentialPairStatus
|
||||
from onyx.db.enums import ProcessingMode
|
||||
from onyx.db.models import Connector
|
||||
from onyx.db.models import ConnectorCredentialPair
|
||||
from onyx.db.models import Credential
|
||||
@@ -116,7 +117,14 @@ def get_connector_credential_pairs_for_user(
|
||||
eager_load_user: bool = False,
|
||||
order_by_desc: bool = False,
|
||||
source: DocumentSource | None = None,
|
||||
processing_mode: ProcessingMode | None = ProcessingMode.REGULAR,
|
||||
) -> list[ConnectorCredentialPair]:
|
||||
"""Get connector credential pairs for a user.
|
||||
|
||||
Args:
|
||||
processing_mode: Filter by processing mode. Defaults to REGULAR to hide
|
||||
FILE_SYSTEM connectors from standard admin UI. Pass None to get all.
|
||||
"""
|
||||
if eager_load_user:
|
||||
assert (
|
||||
eager_load_credential
|
||||
@@ -142,6 +150,9 @@ def get_connector_credential_pairs_for_user(
|
||||
if ids:
|
||||
stmt = stmt.where(ConnectorCredentialPair.id.in_(ids))
|
||||
|
||||
if processing_mode is not None:
|
||||
stmt = stmt.where(ConnectorCredentialPair.processing_mode == processing_mode)
|
||||
|
||||
if order_by_desc:
|
||||
stmt = stmt.order_by(desc(ConnectorCredentialPair.id))
|
||||
|
||||
@@ -160,6 +171,7 @@ def get_connector_credential_pairs_for_user_parallel(
|
||||
eager_load_user: bool = False,
|
||||
order_by_desc: bool = False,
|
||||
source: DocumentSource | None = None,
|
||||
processing_mode: ProcessingMode | None = ProcessingMode.REGULAR,
|
||||
) -> list[ConnectorCredentialPair]:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
return get_connector_credential_pairs_for_user(
|
||||
@@ -172,6 +184,7 @@ def get_connector_credential_pairs_for_user_parallel(
|
||||
eager_load_user=eager_load_user,
|
||||
order_by_desc=order_by_desc,
|
||||
source=source,
|
||||
processing_mode=processing_mode,
|
||||
)
|
||||
|
||||
|
||||
@@ -501,6 +514,7 @@ def add_credential_to_connector(
|
||||
initial_status: ConnectorCredentialPairStatus = ConnectorCredentialPairStatus.SCHEDULED,
|
||||
last_successful_index_time: datetime | None = None,
|
||||
seeding_flow: bool = False,
|
||||
processing_mode: ProcessingMode = ProcessingMode.REGULAR,
|
||||
) -> StatusResponse:
|
||||
connector = fetch_connector_by_id(connector_id, db_session)
|
||||
|
||||
@@ -566,6 +580,7 @@ def add_credential_to_connector(
|
||||
access_type=access_type,
|
||||
auto_sync_options=auto_sync_options,
|
||||
last_successful_index_time=last_successful_index_time,
|
||||
processing_mode=processing_mode,
|
||||
)
|
||||
db_session.add(association)
|
||||
db_session.flush() # make sure the association has an id
|
||||
|
||||
@@ -56,6 +56,13 @@ class IndexingMode(str, PyEnum):
    REINDEX = "reindex"


class ProcessingMode(str, PyEnum):
    """Determines how documents are processed after fetching."""

    REGULAR = "regular"  # Full pipeline: chunk → embed → Vespa
    FILE_SYSTEM = "file_system"  # Write to file system only


class SyncType(str, PyEnum):
    DOCUMENT_SET = "document_set"
    USER_GROUP = "user_group"

@@ -194,3 +201,39 @@ class SwitchoverType(str, PyEnum):
    REINDEX = "reindex"
    ACTIVE_ONLY = "active_only"
    INSTANT = "instant"


# Onyx Build Mode Enums
class BuildSessionStatus(str, PyEnum):
    ACTIVE = "active"
    IDLE = "idle"


class SandboxStatus(str, PyEnum):
    PROVISIONING = "provisioning"
    RUNNING = "running"
    IDLE = "idle"
    SLEEPING = "sleeping"  # Pod terminated, snapshots saved to S3
    TERMINATED = "terminated"
    FAILED = "failed"

    def is_active(self) -> bool:
        """Check if sandbox is in an active state (running or idle)."""
        return self in (SandboxStatus.RUNNING, SandboxStatus.IDLE)

    def is_terminal(self) -> bool:
        """Check if sandbox is in a terminal state."""
        return self in (SandboxStatus.TERMINATED, SandboxStatus.FAILED)

    def is_sleeping(self) -> bool:
        """Check if sandbox is sleeping (pod terminated but can be restored)."""
        return self == SandboxStatus.SLEEPING


class ArtifactType(str, PyEnum):
    WEB_APP = "web_app"
    PPTX = "pptx"
    DOCX = "docx"
    IMAGE = "image"
    MARKDOWN = "markdown"
    EXCEL = "excel"
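Note: a minimal sketch of how these new enums are meant to be consumed; the call sites below are hypothetical, only the enum definitions come from the diff.

from onyx.db.enums import ProcessingMode, SandboxStatus

# FILE_SYSTEM connectors skip chunk/embed/Vespa and only write fetched
# documents to disk for the CLI agent sandbox.
mode = ProcessingMode.FILE_SYSTEM
assert mode.value == "file_system"

status = SandboxStatus.SLEEPING
assert status.is_sleeping()
assert not status.is_active() and not status.is_terminal()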
@@ -3,6 +3,8 @@ from uuid import UUID
|
||||
from fastapi import HTTPException
|
||||
from sqlalchemy import or_
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
from sqlalchemy.orm import aliased
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
@@ -18,45 +20,6 @@ from onyx.utils.logger import setup_logger
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def insert_input_prompt_if_not_exists(
|
||||
user: User | None,
|
||||
input_prompt_id: int | None,
|
||||
prompt: str,
|
||||
content: str,
|
||||
active: bool,
|
||||
is_public: bool,
|
||||
db_session: Session,
|
||||
commit: bool = True,
|
||||
) -> InputPrompt:
|
||||
if input_prompt_id is not None:
|
||||
input_prompt = (
|
||||
db_session.query(InputPrompt).filter_by(id=input_prompt_id).first()
|
||||
)
|
||||
else:
|
||||
query = db_session.query(InputPrompt).filter(InputPrompt.prompt == prompt)
|
||||
if user:
|
||||
query = query.filter(InputPrompt.user_id == user.id)
|
||||
else:
|
||||
query = query.filter(InputPrompt.user_id.is_(None))
|
||||
input_prompt = query.first()
|
||||
|
||||
if input_prompt is None:
|
||||
input_prompt = InputPrompt(
|
||||
id=input_prompt_id,
|
||||
prompt=prompt,
|
||||
content=content,
|
||||
active=active,
|
||||
is_public=is_public or user is None,
|
||||
user_id=user.id if user else None,
|
||||
)
|
||||
db_session.add(input_prompt)
|
||||
|
||||
if commit:
|
||||
db_session.commit()
|
||||
|
||||
return input_prompt
|
||||
|
||||
|
||||
def insert_input_prompt(
|
||||
prompt: str,
|
||||
content: str,
|
||||
@@ -64,16 +27,41 @@ def insert_input_prompt(
|
||||
user: User | None,
|
||||
db_session: Session,
|
||||
) -> InputPrompt:
|
||||
input_prompt = InputPrompt(
|
||||
user_id = user.id if user else None
|
||||
|
||||
# Use atomic INSERT ... ON CONFLICT DO NOTHING with RETURNING
|
||||
# to avoid race conditions with the uniqueness check
|
||||
stmt = pg_insert(InputPrompt).values(
|
||||
prompt=prompt,
|
||||
content=content,
|
||||
active=True,
|
||||
is_public=is_public,
|
||||
user_id=user.id if user is not None else None,
|
||||
user_id=user_id,
|
||||
)
|
||||
db_session.add(input_prompt)
|
||||
db_session.commit()
|
||||
|
||||
# Use the appropriate constraint based on whether this is a user-owned or public prompt
|
||||
if user_id is not None:
|
||||
stmt = stmt.on_conflict_do_nothing(constraint="uq_inputprompt_prompt_user_id")
|
||||
else:
|
||||
# Partial unique indexes cannot be targeted by constraint name;
|
||||
# must use index_elements + index_where
|
||||
stmt = stmt.on_conflict_do_nothing(
|
||||
index_elements=[InputPrompt.prompt],
|
||||
index_where=InputPrompt.user_id.is_(None),
|
||||
)
|
||||
|
||||
stmt = stmt.returning(InputPrompt)
|
||||
|
||||
result = db_session.execute(stmt)
|
||||
input_prompt = result.scalar_one_or_none()
|
||||
|
||||
if input_prompt is None:
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail=f"A prompt shortcut with the name '{prompt}' already exists",
|
||||
)
|
||||
|
||||
db_session.commit()
|
||||
return input_prompt
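Note: the split between constraint= and index_elements=/index_where= above is required by Postgres: a named UNIQUE constraint can be targeted directly, but a partial unique index has no constraint name, so on_conflict_do_nothing must describe it by its indexed column plus its WHERE predicate. A self-contained sketch of the same pattern (table and constraint names here are generic, not from the codebase):

from sqlalchemy.dialects.postgresql import insert as pg_insert

def insert_ignore_duplicates(session, table, values: dict, user_scoped: bool):
    """Generic sketch, for illustration only: insert a row, silently skipping
    duplicates. user_scoped rows collide on a named UNIQUE(prompt, user_id)
    constraint; global rows collide on a partial unique index WHERE user_id IS NULL."""
    stmt = pg_insert(table).values(**values)
    if user_scoped:
        stmt = stmt.on_conflict_do_nothing(constraint="uq_example_prompt_user_id")
    else:
        stmt = stmt.on_conflict_do_nothing(
            index_elements=[table.c.prompt],
            index_where=table.c.user_id.is_(None),
        )
    result = session.execute(stmt.returning(table.c.id))
    return result.scalar_one_or_none()  # None -> a duplicate already existed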
|
||||
|
||||
|
||||
@@ -98,23 +86,40 @@ def update_input_prompt(
|
||||
input_prompt.content = content
|
||||
input_prompt.active = active
|
||||
|
||||
db_session.commit()
|
||||
try:
|
||||
db_session.commit()
|
||||
except IntegrityError:
|
||||
db_session.rollback()
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail=f"A prompt shortcut with the name '{prompt}' already exists",
|
||||
)
|
||||
|
||||
return input_prompt
|
||||
|
||||
|
||||
def validate_user_prompt_authorization(
|
||||
user: User | None, input_prompt: InputPrompt
|
||||
) -> bool:
|
||||
"""
|
||||
Check if the user is authorized to modify the given input prompt.
|
||||
Returns True only if the user owns the prompt.
|
||||
Returns False for public prompts (only admins can modify those),
|
||||
unless auth is disabled (then anyone can manage public prompts).
|
||||
"""
|
||||
prompt = InputPromptSnapshot.from_model(input_prompt=input_prompt)
|
||||
|
||||
if prompt.user_id is not None:
|
||||
if user is None:
|
||||
return False
|
||||
# Public prompts cannot be modified via the user API (unless auth is disabled)
|
||||
if prompt.is_public or prompt.user_id is None:
|
||||
return AUTH_TYPE == AuthType.DISABLED
|
||||
|
||||
user_details = UserInfo.from_model(user)
|
||||
if str(user_details.id) != str(prompt.user_id):
|
||||
return False
|
||||
return True
|
||||
# User must be logged in
|
||||
if user is None:
|
||||
return False
|
||||
|
||||
# User must own the prompt
|
||||
user_details = UserInfo.from_model(user)
|
||||
return str(user_details.id) == str(prompt.user_id)
|
||||
|
||||
|
||||
def remove_public_input_prompt(input_prompt_id: int, db_session: Session) -> None:
|
||||
|
||||
@@ -9,6 +9,9 @@ def get_memories(user: User | None, db_session: Session) -> list[str]:
    if user is None:
        return []

    if not user.use_memories:
        return []

    user_info = [
        f"User's name: {user.personal_name}" if user.personal_name else "",
        f"User's role: {user.personal_role}" if user.personal_role else "",
@@ -11,6 +11,7 @@ from typing_extensions import TypedDict # noreorder
|
||||
from uuid import UUID
|
||||
from pydantic import ValidationError
|
||||
|
||||
from sqlalchemy.dialects.postgresql import JSONB as PGJSONB
|
||||
from sqlalchemy.dialects.postgresql import UUID as PGUUID
|
||||
|
||||
from fastapi_users_db_sqlalchemy import SQLAlchemyBaseOAuthAccountTableUUID
|
||||
@@ -55,8 +56,12 @@ from onyx.configs.constants import FileOrigin
|
||||
from onyx.configs.constants import MessageType
|
||||
from onyx.db.enums import (
|
||||
AccessType,
|
||||
ArtifactType,
|
||||
BuildSessionStatus,
|
||||
EmbeddingPrecision,
|
||||
IndexingMode,
|
||||
ProcessingMode,
|
||||
SandboxStatus,
|
||||
SyncType,
|
||||
SyncStatus,
|
||||
MCPAuthenticationType,
|
||||
@@ -188,6 +193,7 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
|
||||
nullable=True,
|
||||
default=None,
|
||||
)
|
||||
chat_background: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
# personalization fields are exposed via the chat user settings "Personalization" tab
|
||||
personal_name: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
personal_role: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
@@ -608,6 +614,16 @@ class ConnectorCredentialPair(Base):
|
||||
Enum(IndexingMode, native_enum=False), nullable=True
|
||||
)
|
||||
|
||||
# Determines how documents are processed after fetching:
|
||||
# REGULAR: Full pipeline (chunk → embed → Vespa)
|
||||
# FILE_SYSTEM: Write to file system only (for CLI agent sandbox)
|
||||
processing_mode: Mapped[ProcessingMode] = mapped_column(
|
||||
Enum(ProcessingMode, native_enum=False),
|
||||
nullable=False,
|
||||
default=ProcessingMode.REGULAR,
|
||||
server_default="regular",
|
||||
)
|
||||
|
||||
connector: Mapped["Connector"] = relationship(
|
||||
"Connector", back_populates="credentials"
|
||||
)
|
||||
@@ -3626,6 +3642,18 @@ class InputPrompt(Base):
|
||||
ForeignKey("user.id", ondelete="CASCADE"), nullable=True
|
||||
)
|
||||
|
||||
__table_args__ = (
|
||||
# Unique constraint on (prompt, user_id) for user-owned prompts
|
||||
UniqueConstraint("prompt", "user_id", name="uq_inputprompt_prompt_user_id"),
|
||||
# Partial unique index for public prompts (user_id IS NULL)
|
||||
Index(
|
||||
"uq_inputprompt_prompt_public",
|
||||
"prompt",
|
||||
unique=True,
|
||||
postgresql_where=text("user_id IS NULL"),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
class InputPrompt__User(Base):
|
||||
__tablename__ = "inputprompt__user"
|
||||
@@ -3634,7 +3662,7 @@ class InputPrompt__User(Base):
|
||||
ForeignKey("inputprompt.id"), primary_key=True
|
||||
)
|
||||
user_id: Mapped[UUID | None] = mapped_column(
|
||||
ForeignKey("inputprompt.id"), primary_key=True
|
||||
ForeignKey("user.id"), primary_key=True
|
||||
)
|
||||
disabled: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False)
|
||||
|
||||
@@ -4129,3 +4157,202 @@ class TenantUsage(Base):
|
||||
# Ensure only one row per window start (tenant_id is in the schema name)
|
||||
UniqueConstraint("window_start", name="uq_tenant_usage_window"),
|
||||
)
|
||||
|
||||
|
||||
"""Tables related to Build Mode (CLI Agent Platform)"""
|
||||
|
||||
|
||||
class BuildSession(Base):
|
||||
"""Stores metadata about CLI agent build sessions."""
|
||||
|
||||
__tablename__ = "build_session"
|
||||
|
||||
id: Mapped[UUID] = mapped_column(
|
||||
PGUUID(as_uuid=True), primary_key=True, default=uuid4
|
||||
)
|
||||
user_id: Mapped[UUID | None] = mapped_column(
|
||||
PGUUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=True
|
||||
)
|
||||
name: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
status: Mapped[BuildSessionStatus] = mapped_column(
|
||||
Enum(BuildSessionStatus, native_enum=False, name="buildsessionstatus"),
|
||||
nullable=False,
|
||||
default=BuildSessionStatus.ACTIVE,
|
||||
)
|
||||
created_at: Mapped[datetime.datetime] = mapped_column(
|
||||
DateTime(timezone=True), server_default=func.now(), nullable=False
|
||||
)
|
||||
last_activity_at: Mapped[datetime.datetime] = mapped_column(
|
||||
DateTime(timezone=True),
|
||||
server_default=func.now(),
|
||||
onupdate=func.now(),
|
||||
nullable=False,
|
||||
)
|
||||
nextjs_port: Mapped[int | None] = mapped_column(Integer, nullable=True)
|
||||
|
||||
# Relationships
|
||||
user: Mapped[User | None] = relationship("User", foreign_keys=[user_id])
|
||||
artifacts: Mapped[list["Artifact"]] = relationship(
|
||||
"Artifact", back_populates="session", cascade="all, delete-orphan"
|
||||
)
|
||||
messages: Mapped[list["BuildMessage"]] = relationship(
|
||||
"BuildMessage", back_populates="session", cascade="all, delete-orphan"
|
||||
)
|
||||
snapshots: Mapped[list["Snapshot"]] = relationship(
|
||||
"Snapshot", back_populates="session", cascade="all, delete-orphan"
|
||||
)
|
||||
|
||||
__table_args__ = (
|
||||
Index("ix_build_session_user_created", "user_id", desc("created_at")),
|
||||
Index("ix_build_session_status", "status"),
|
||||
)
|
||||
|
||||
|
||||
class Sandbox(Base):
|
||||
"""Stores sandbox container metadata for users (one sandbox per user)."""
|
||||
|
||||
__tablename__ = "sandbox"
|
||||
|
||||
id: Mapped[UUID] = mapped_column(
|
||||
PGUUID(as_uuid=True), primary_key=True, default=uuid4
|
||||
)
|
||||
user_id: Mapped[UUID] = mapped_column(
|
||||
PGUUID(as_uuid=True),
|
||||
ForeignKey("user.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
unique=True,
|
||||
)
|
||||
container_id: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
status: Mapped[SandboxStatus] = mapped_column(
|
||||
Enum(SandboxStatus, native_enum=False, name="sandboxstatus"),
|
||||
nullable=False,
|
||||
default=SandboxStatus.PROVISIONING,
|
||||
)
|
||||
created_at: Mapped[datetime.datetime] = mapped_column(
|
||||
DateTime(timezone=True), server_default=func.now(), nullable=False
|
||||
)
|
||||
last_heartbeat: Mapped[datetime.datetime | None] = mapped_column(
|
||||
DateTime(timezone=True), nullable=True
|
||||
)
|
||||
|
||||
# Relationships
|
||||
user: Mapped[User] = relationship("User")
|
||||
|
||||
__table_args__ = (
|
||||
Index("ix_sandbox_status", "status"),
|
||||
Index("ix_sandbox_container_id", "container_id"),
|
||||
)
|
||||
|
||||
|
||||
class Artifact(Base):
|
||||
"""Stores metadata about artifacts generated by CLI agents."""
|
||||
|
||||
__tablename__ = "artifact"
|
||||
|
||||
id: Mapped[UUID] = mapped_column(
|
||||
PGUUID(as_uuid=True), primary_key=True, default=uuid4
|
||||
)
|
||||
session_id: Mapped[UUID] = mapped_column(
|
||||
PGUUID(as_uuid=True),
|
||||
ForeignKey("build_session.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
)
|
||||
type: Mapped[ArtifactType] = mapped_column(
|
||||
Enum(ArtifactType, native_enum=False, name="artifacttype"), nullable=False
|
||||
)
|
||||
# path of artifact in sandbox relative to outputs/
|
||||
path: Mapped[str] = mapped_column(String, nullable=False)
|
||||
name: Mapped[str] = mapped_column(String, nullable=False)
|
||||
created_at: Mapped[datetime.datetime] = mapped_column(
|
||||
DateTime(timezone=True), server_default=func.now(), nullable=False
|
||||
)
|
||||
updated_at: Mapped[datetime.datetime] = mapped_column(
|
||||
DateTime(timezone=True),
|
||||
server_default=func.now(),
|
||||
onupdate=func.now(),
|
||||
nullable=False,
|
||||
)
|
||||
|
||||
# Relationships
|
||||
session: Mapped[BuildSession] = relationship(
|
||||
"BuildSession", back_populates="artifacts"
|
||||
)
|
||||
|
||||
__table_args__ = (
|
||||
Index("ix_artifact_session_created", "session_id", desc("created_at")),
|
||||
Index("ix_artifact_type", "type"),
|
||||
)
|
||||
|
||||
|
||||
class Snapshot(Base):
|
||||
"""Stores metadata about session output snapshots."""
|
||||
|
||||
__tablename__ = "snapshot"
|
||||
|
||||
id: Mapped[UUID] = mapped_column(
|
||||
PGUUID(as_uuid=True), primary_key=True, default=uuid4
|
||||
)
|
||||
session_id: Mapped[UUID] = mapped_column(
|
||||
PGUUID(as_uuid=True),
|
||||
ForeignKey("build_session.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
)
|
||||
storage_path: Mapped[str] = mapped_column(String, nullable=False)
|
||||
size_bytes: Mapped[int] = mapped_column(BigInteger, nullable=False, default=0)
|
||||
created_at: Mapped[datetime.datetime] = mapped_column(
|
||||
DateTime(timezone=True), server_default=func.now(), nullable=False
|
||||
)
|
||||
|
||||
# Relationships
|
||||
session: Mapped[BuildSession] = relationship(
|
||||
"BuildSession", back_populates="snapshots"
|
||||
)
|
||||
|
||||
__table_args__ = (
|
||||
Index("ix_snapshot_session_created", "session_id", desc("created_at")),
|
||||
)
|
||||
|
||||
|
||||
class BuildMessage(Base):
|
||||
"""Stores messages exchanged in build sessions.
|
||||
|
||||
All message data is stored in message_metadata as JSON (the raw ACP packet).
|
||||
The turn_index groups all assistant responses under the user prompt they respond to.
|
||||
|
||||
Packet types stored in message_metadata:
|
||||
- user_message: {type: "user_message", content: {...}}
|
||||
- agent_message: {type: "agent_message", content: {...}} (accumulated from chunks)
|
||||
- agent_thought: {type: "agent_thought", content: {...}} (accumulated from chunks)
|
||||
- tool_call_progress: {type: "tool_call_progress", status: "completed", ...} (only completed)
|
||||
- agent_plan_update: {type: "agent_plan_update", entries: [...]} (upserted, latest only)
|
||||
"""
|
||||
|
||||
__tablename__ = "build_message"
|
||||
|
||||
id: Mapped[UUID] = mapped_column(
|
||||
PGUUID(as_uuid=True), primary_key=True, default=uuid4
|
||||
)
|
||||
session_id: Mapped[UUID] = mapped_column(
|
||||
PGUUID(as_uuid=True),
|
||||
ForeignKey("build_session.id", ondelete="CASCADE"),
|
||||
nullable=False,
|
||||
)
|
||||
turn_index: Mapped[int] = mapped_column(Integer, nullable=False)
|
||||
type: Mapped[MessageType] = mapped_column(
|
||||
Enum(MessageType, native_enum=False, name="messagetype"), nullable=False
|
||||
)
|
||||
message_metadata: Mapped[dict[str, Any]] = mapped_column(PGJSONB, nullable=False)
|
||||
created_at: Mapped[datetime.datetime] = mapped_column(
|
||||
DateTime(timezone=True), server_default=func.now(), nullable=False
|
||||
)
|
||||
|
||||
# Relationships
|
||||
session: Mapped[BuildSession] = relationship(
|
||||
"BuildSession", back_populates="messages"
|
||||
)
|
||||
|
||||
__table_args__ = (
|
||||
Index(
|
||||
"ix_build_message_session_turn", "session_id", "turn_index", "created_at"
|
||||
),
|
||||
)
|
||||
|
||||
@@ -20,7 +20,7 @@ from onyx.db.models import SearchSettings
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.db.search_settings import get_secondary_search_settings
|
||||
from onyx.db.search_settings import update_search_settings_status
|
||||
from onyx.document_index.factory import get_default_document_index
|
||||
from onyx.document_index.factory import get_all_document_indices
|
||||
from onyx.key_value_store.factory import get_kv_store
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
@@ -80,39 +80,43 @@ def _perform_index_swap(
|
||||
db_session=db_session,
|
||||
)
|
||||
|
||||
# remove the old index from the vector db
|
||||
document_index = get_default_document_index(new_search_settings, None)
|
||||
# This flow is for checking and possibly creating an index so we get all
|
||||
# indices.
|
||||
document_indices = get_all_document_indices(new_search_settings, None, None)
|
||||
|
||||
WAIT_SECONDS = 5
|
||||
|
||||
success = False
|
||||
for x in range(VESPA_NUM_ATTEMPTS_ON_STARTUP):
|
||||
try:
|
||||
logger.notice(
|
||||
f"Vespa index swap (attempt {x+1}/{VESPA_NUM_ATTEMPTS_ON_STARTUP})..."
|
||||
)
|
||||
document_index.ensure_indices_exist(
|
||||
primary_embedding_dim=new_search_settings.final_embedding_dim,
|
||||
primary_embedding_precision=new_search_settings.embedding_precision,
|
||||
# just finished swap, no more secondary index
|
||||
secondary_index_embedding_dim=None,
|
||||
secondary_index_embedding_precision=None,
|
||||
)
|
||||
for document_index in document_indices:
|
||||
success = False
|
||||
for x in range(VESPA_NUM_ATTEMPTS_ON_STARTUP):
|
||||
try:
|
||||
logger.notice(
|
||||
f"Document index {document_index.__class__.__name__} swap (attempt {x+1}/{VESPA_NUM_ATTEMPTS_ON_STARTUP})..."
|
||||
)
|
||||
document_index.ensure_indices_exist(
|
||||
primary_embedding_dim=new_search_settings.final_embedding_dim,
|
||||
primary_embedding_precision=new_search_settings.embedding_precision,
|
||||
# just finished swap, no more secondary index
|
||||
secondary_index_embedding_dim=None,
|
||||
secondary_index_embedding_precision=None,
|
||||
)
|
||||
|
||||
logger.notice("Vespa index swap complete.")
|
||||
success = True
|
||||
break
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Vespa index swap did not succeed. The Vespa service may not be ready yet. Retrying in {WAIT_SECONDS} seconds."
|
||||
)
|
||||
time.sleep(WAIT_SECONDS)
|
||||
logger.notice("Document index swap complete.")
|
||||
success = True
|
||||
break
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Document index swap for {document_index.__class__.__name__} did not succeed. "
|
||||
f"The document index services may not be ready yet. Retrying in {WAIT_SECONDS} seconds."
|
||||
)
|
||||
time.sleep(WAIT_SECONDS)
|
||||
|
||||
if not success:
|
||||
logger.error(
|
||||
f"Vespa index swap did not succeed. Attempt limit reached. ({VESPA_NUM_ATTEMPTS_ON_STARTUP})"
|
||||
)
|
||||
return None
|
||||
if not success:
|
||||
logger.error(
|
||||
f"Document index swap for {document_index.__class__.__name__} did not succeed. "
|
||||
f"Attempt limit reached. ({VESPA_NUM_ATTEMPTS_ON_STARTUP})"
|
||||
)
|
||||
return None
|
||||
|
||||
return current_search_settings
|
||||
|
||||
|
||||
@@ -139,6 +139,20 @@ def update_user_theme_preference(
    db_session.commit()


def update_user_chat_background(
    user_id: UUID,
    chat_background: str | None,
    db_session: Session,
) -> None:
    """Update user's chat background setting."""
    db_session.execute(
        update(User)
        .where(User.id == user_id)  # type: ignore
        .values(chat_background=chat_background)
    )
    db_session.commit()


def update_user_personalization(
    user_id: UUID,
    *,
@@ -2,13 +2,18 @@ from onyx.configs.app_configs import BLURB_SIZE
from onyx.configs.constants import RETURN_SEPARATOR
from onyx.context.search.models import InferenceChunk
from onyx.context.search.models import InferenceChunkUncleaned
from onyx.indexing.models import DocAwareChunk
from onyx.indexing.models import DocMetadataAwareIndexChunk


def generate_enriched_content_for_chunk(chunk: DocMetadataAwareIndexChunk) -> str:
def generate_enriched_content_for_chunk_text(chunk: DocMetadataAwareIndexChunk) -> str:
    return f"{chunk.title_prefix}{chunk.doc_summary}{chunk.content}{chunk.chunk_context}{chunk.metadata_suffix_keyword}"


def generate_enriched_content_for_chunk_embedding(chunk: DocAwareChunk) -> str:
    return f"{chunk.title_prefix}{chunk.doc_summary}{chunk.content}{chunk.chunk_context}{chunk.metadata_suffix_semantic}"


def cleanup_content_for_chunks(
    chunks: list[InferenceChunkUncleaned],
) -> list[InferenceChunk]:
@@ -1,9 +1,8 @@
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.configs.app_configs import ENABLE_OPENSEARCH_FOR_ONYX
|
||||
from onyx.configs.app_configs import ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
|
||||
from onyx.configs.app_configs import ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX
|
||||
from onyx.db.models import SearchSettings
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.document_index.interfaces import DocumentIndex
|
||||
from onyx.document_index.opensearch.opensearch_document_index import (
|
||||
OpenSearchOldDocumentIndex,
|
||||
@@ -17,17 +16,24 @@ def get_default_document_index(
|
||||
secondary_search_settings: SearchSettings | None,
|
||||
httpx_client: httpx.Client | None = None,
|
||||
) -> DocumentIndex:
|
||||
"""Primary index is the index that is used for querying/updating etc.
|
||||
Secondary index is for when both the currently used index and the upcoming
|
||||
index both need to be updated, updates are applied to both indices"""
|
||||
"""Gets the default document index from env vars.
|
||||
|
||||
To be used for retrieval only. Indexing should be done through both indices
|
||||
until Vespa is deprecated.
|
||||
|
||||
Pre-existing docstring for this function, although secondary indices are not
|
||||
currently supported:
|
||||
Primary index is the index that is used for querying/updating etc. Secondary
|
||||
index is for when both the currently used index and the upcoming index both
|
||||
need to be updated, updates are applied to both indices.
|
||||
"""
|
||||
secondary_index_name: str | None = None
|
||||
secondary_large_chunks_enabled: bool | None = None
|
||||
if secondary_search_settings:
|
||||
secondary_index_name = secondary_search_settings.index_name
|
||||
secondary_large_chunks_enabled = secondary_search_settings.large_chunks_enabled
|
||||
|
||||
if ENABLE_OPENSEARCH_FOR_ONYX:
|
||||
if ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX:
|
||||
return OpenSearchOldDocumentIndex(
|
||||
index_name=search_settings.index_name,
|
||||
secondary_index_name=secondary_index_name,
|
||||
@@ -47,12 +53,48 @@ def get_default_document_index(
|
||||
)
|
||||
|
||||
|
||||
def get_current_primary_default_document_index(db_session: Session) -> DocumentIndex:
|
||||
def get_all_document_indices(
|
||||
search_settings: SearchSettings,
|
||||
secondary_search_settings: SearchSettings | None,
|
||||
httpx_client: httpx.Client | None = None,
|
||||
) -> list[DocumentIndex]:
|
||||
"""Gets all document indices.
|
||||
|
||||
NOTE: Will only return an OpenSearch index interface if
|
||||
ENABLE_OPENSEARCH_INDEXING_FOR_ONYX is True. This is so we don't break flows
|
||||
where we know it won't be enabled.
|
||||
|
||||
Used for indexing only. Until Vespa is deprecated we will index into both
|
||||
document indices. Retrieval is done through only one index however.
|
||||
|
||||
Large chunks and secondary indices are not currently supported so we
|
||||
hardcode appropriate values.
|
||||
"""
|
||||
TODO: Use redis to cache this or something
|
||||
"""
|
||||
search_settings = get_current_search_settings(db_session)
|
||||
return get_default_document_index(
|
||||
search_settings,
|
||||
None,
|
||||
vespa_document_index = VespaIndex(
|
||||
index_name=search_settings.index_name,
|
||||
secondary_index_name=(
|
||||
secondary_search_settings.index_name if secondary_search_settings else None
|
||||
),
|
||||
large_chunks_enabled=search_settings.large_chunks_enabled,
|
||||
secondary_large_chunks_enabled=(
|
||||
secondary_search_settings.large_chunks_enabled
|
||||
if secondary_search_settings
|
||||
else None
|
||||
),
|
||||
multitenant=MULTI_TENANT,
|
||||
httpx_client=httpx_client,
|
||||
)
|
||||
opensearch_document_index: OpenSearchOldDocumentIndex | None = None
|
||||
if ENABLE_OPENSEARCH_INDEXING_FOR_ONYX:
|
||||
opensearch_document_index = OpenSearchOldDocumentIndex(
|
||||
index_name=search_settings.index_name,
|
||||
secondary_index_name=None,
|
||||
large_chunks_enabled=False,
|
||||
secondary_large_chunks_enabled=None,
|
||||
multitenant=MULTI_TENANT,
|
||||
httpx_client=httpx_client,
|
||||
)
|
||||
result: list[DocumentIndex] = [vespa_document_index]
|
||||
if opensearch_document_index:
|
||||
result.append(opensearch_document_index)
|
||||
return result
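Note: a sketch of the intended write path, assuming callers fan indexing out over every returned index while retrieval keeps using get_default_document_index. The helper below is hypothetical; it mirrors the loop added to index_doc_batch later in this diff.

def write_to_all_indices(
    document_indices: list[DocumentIndex],
    chunks,              # DocMetadataAwareIndexChunk list in the real pipeline
    index_batch_params,  # IndexBatchParams in the real pipeline
) -> None:
    """Hypothetical helper, for illustration only: index the same chunk batch
    into every configured backend (Vespa always; OpenSearch only when
    ENABLE_OPENSEARCH_INDEXING_FOR_ONYX is set)."""
    for document_index in document_indices:
        write_chunks_to_vector_db_with_backoff(
            document_index=document_index,
            chunks=chunks,
            index_batch_params=index_batch_params,
        )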
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import time
|
||||
from typing import Any
|
||||
from typing import Generic
|
||||
from typing import TypeVar
|
||||
@@ -569,6 +570,9 @@ class OpenSearchClient:
|
||||
def close(self) -> None:
|
||||
"""Closes the client.
|
||||
|
||||
TODO(andrei): Can we have some way to auto close when the client no
|
||||
longer has any references?
|
||||
|
||||
Raises:
|
||||
Exception: There was an error closing the client.
|
||||
"""
|
||||
@@ -596,3 +600,55 @@ class OpenSearchClient:
|
||||
)
|
||||
hits_second_layer: list[Any] = hits_first_layer.get("hits", [])
|
||||
return hits_second_layer
|
||||
|
||||
|
||||
def wait_for_opensearch_with_timeout(
|
||||
wait_interval_s: int = 5,
|
||||
wait_limit_s: int = 60,
|
||||
client: OpenSearchClient | None = None,
|
||||
) -> bool:
|
||||
"""Waits for OpenSearch to become ready subject to a timeout.
|
||||
|
||||
Will create a new dummy client if no client is provided. Will close this
|
||||
client at the end of the function. Will not close the client if it was
|
||||
supplied.
|
||||
|
||||
Args:
|
||||
wait_interval_s: The interval in seconds to wait between checks.
|
||||
Defaults to 5.
|
||||
wait_limit_s: The total timeout in seconds to wait for OpenSearch to
|
||||
become ready. Defaults to 60.
|
||||
client: The OpenSearch client to use for pinging. If None, a new dummy
|
||||
client will be created. Defaults to None.
|
||||
|
||||
Returns:
|
||||
True if OpenSearch is ready, False otherwise.
|
||||
"""
|
||||
made_client = False
|
||||
try:
|
||||
if client is None:
|
||||
# NOTE: index_name does not matter because we are only using this object
|
||||
# to ping.
|
||||
# TODO(andrei): Make this better.
|
||||
client = OpenSearchClient(index_name="")
|
||||
made_client = True
|
||||
time_start = time.monotonic()
|
||||
while True:
|
||||
if client.ping():
|
||||
logger.info("[OpenSearch] Readiness probe succeeded. Continuing...")
|
||||
return True
|
||||
time_elapsed = time.monotonic() - time_start
|
||||
if time_elapsed > wait_limit_s:
|
||||
logger.info(
|
||||
f"[OpenSearch] Readiness probe did not succeed within the timeout "
|
||||
f"({wait_limit_s} seconds)."
|
||||
)
|
||||
return False
|
||||
logger.info(
|
||||
f"[OpenSearch] Readiness probe ongoing. elapsed={time_elapsed:.1f} timeout={wait_limit_s:.1f}"
|
||||
)
|
||||
time.sleep(wait_interval_s)
|
||||
finally:
|
||||
if made_client:
|
||||
assert client is not None
|
||||
client.close()
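Note: a minimal usage sketch for the new readiness helper; the startup call site shown here is hypothetical, only the function signature comes from the diff.

# Hypothetical startup guard: bail out early if OpenSearch never comes up.
if not wait_for_opensearch_with_timeout(wait_interval_s=5, wait_limit_s=120):
    raise RuntimeError("OpenSearch did not become ready within 120 seconds")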
|
||||
|
||||
@@ -17,7 +17,7 @@ from onyx.db.enums import EmbeddingPrecision
|
||||
from onyx.db.models import DocumentSource
|
||||
from onyx.document_index.chunk_content_enrichment import cleanup_content_for_chunks
|
||||
from onyx.document_index.chunk_content_enrichment import (
|
||||
generate_enriched_content_for_chunk,
|
||||
generate_enriched_content_for_chunk_text,
|
||||
)
|
||||
from onyx.document_index.interfaces import DocumentIndex as OldDocumentIndex
|
||||
from onyx.document_index.interfaces import (
|
||||
@@ -140,9 +140,12 @@ def _convert_onyx_chunk_to_opensearch_document(
|
||||
return DocumentChunk(
|
||||
document_id=chunk.source_document.id,
|
||||
chunk_index=chunk.chunk_id,
|
||||
title=chunk.source_document.title,
|
||||
# Use get_title_for_document_index to match the logic used when creating
|
||||
# the title_embedding in the embedder. This method falls back to
|
||||
# semantic_identifier when title is None (but not empty string).
|
||||
title=chunk.source_document.get_title_for_document_index(),
|
||||
title_vector=chunk.title_embedding,
|
||||
content=generate_enriched_content_for_chunk(chunk),
|
||||
content=generate_enriched_content_for_chunk_text(chunk),
|
||||
content_vector=chunk.embeddings.full_embedding,
|
||||
source_type=chunk.source_document.source.value,
|
||||
metadata_list=chunk.source_document.get_metadata_str_attributes(),
|
||||
@@ -421,6 +424,24 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
def verify_and_create_index_if_necessary(
|
||||
self, embedding_dim: int, embedding_precision: EmbeddingPrecision
|
||||
) -> None:
|
||||
"""Verifies and creates the index if necessary.
|
||||
|
||||
Also puts the desired search pipeline state, creating the pipelines if
|
||||
they do not exist and updating them otherwise.
|
||||
|
||||
Args:
|
||||
embedding_dim: Vector dimensionality for the vector similarity part
|
||||
of the search.
|
||||
embedding_precision: Precision of the values of the vectors for the
|
||||
similarity part of the search.
|
||||
|
||||
Raises:
|
||||
RuntimeError: There was an error verifying or creating the index or
|
||||
search pipelines.
|
||||
"""
|
||||
logger.debug(
|
||||
f"[OpenSearchDocumentIndex] Verifying and creating index {self._index_name} if necessary."
|
||||
)
|
||||
expected_mappings = DocumentSchema.get_document_schema(
|
||||
embedding_dim, self._tenant_state.multitenant
|
||||
)
|
||||
@@ -450,6 +471,9 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
chunks: list[DocMetadataAwareIndexChunk],
|
||||
indexing_metadata: IndexingMetadata,
|
||||
) -> list[DocumentInsertionRecord]:
|
||||
logger.debug(
|
||||
f"[OpenSearchDocumentIndex] Indexing {len(chunks)} chunks for index {self._index_name}."
|
||||
)
|
||||
# Set of doc IDs.
|
||||
unique_docs_to_be_indexed: set[str] = set()
|
||||
document_indexing_results: list[DocumentInsertionRecord] = []
|
||||
@@ -494,6 +518,8 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
def delete(self, document_id: str, chunk_count: int | None = None) -> int:
|
||||
"""Deletes all chunks for a given document.
|
||||
|
||||
Does nothing if the specified document ID does not exist.
|
||||
|
||||
TODO(andrei): Make this method require supplying source type.
|
||||
TODO(andrei): Consider implementing this method to delete on document
|
||||
chunk IDs vs querying for matching document chunks.
|
||||
@@ -510,6 +536,9 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
Returns:
|
||||
The number of chunks successfully deleted.
|
||||
"""
|
||||
logger.debug(
|
||||
f"[OpenSearchDocumentIndex] Deleting document {document_id} from index {self._index_name}."
|
||||
)
|
||||
query_body = DocumentQuery.delete_from_document_id_query(
|
||||
document_id=document_id,
|
||||
tenant_state=self._tenant_state,
|
||||
@@ -523,6 +552,7 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
) -> None:
|
||||
"""Updates some set of chunks.
|
||||
|
||||
NOTE: Will raise if the specified document chunks do not exist.
|
||||
NOTE: Requires document chunk count be known; will raise if it is not.
|
||||
NOTE: Each update request must have some field to update; if not it is
|
||||
assumed there is a bug in the caller and this will raise.
|
||||
@@ -539,6 +569,9 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
RuntimeError: Failed to update some or all of the chunks for the
|
||||
specified documents.
|
||||
"""
|
||||
logger.debug(
|
||||
f"[OpenSearchDocumentIndex] Updating {len(update_requests)} chunks for index {self._index_name}."
|
||||
)
|
||||
for update_request in update_requests:
|
||||
properties_to_update: dict[str, Any] = dict()
|
||||
# TODO(andrei): Nit but consider if we can use DocumentChunk
|
||||
@@ -604,6 +637,9 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
TODO(andrei): Consider implementing this method to retrieve on document
|
||||
chunk IDs vs querying for matching document chunks.
|
||||
"""
|
||||
logger.debug(
|
||||
f"[OpenSearchDocumentIndex] Retrieving {len(chunk_requests)} chunks for index {self._index_name}."
|
||||
)
|
||||
results: list[InferenceChunk] = []
|
||||
for chunk_request in chunk_requests:
|
||||
search_hits: list[SearchHit[DocumentChunk]] = []
|
||||
@@ -643,6 +679,9 @@ class OpenSearchDocumentIndex(DocumentIndex):
|
||||
num_to_retrieve: int,
|
||||
offset: int = 0,
|
||||
) -> list[InferenceChunk]:
|
||||
logger.debug(
|
||||
f"[OpenSearchDocumentIndex] Hybrid retrieving {num_to_retrieve} chunks for index {self._index_name}."
|
||||
)
|
||||
query_body = DocumentQuery.get_hybrid_search_query(
|
||||
query_text=query,
|
||||
query_vector=query_embedding,
|
||||
|
||||
@@ -17,7 +17,7 @@ from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
    get_experts_stores_representations,
)
from onyx.document_index.chunk_content_enrichment import (
    generate_enriched_content_for_chunk,
    generate_enriched_content_for_chunk_text,
)
from onyx.document_index.document_index_utils import get_uuid_from_chunk
from onyx.document_index.document_index_utils import get_uuid_from_chunk_info_old

@@ -186,7 +186,7 @@ def _index_vespa_chunk(
        # For the BM25 index, the keyword suffix is used, the vector is already generated with the more
        # natural language representation of the metadata section
        CONTENT: remove_invalid_unicode_chars(
            generate_enriched_content_for_chunk(chunk)
            generate_enriched_content_for_chunk_text(chunk)
        ),
        # This duplication of `content` is needed for keyword highlighting
        # Note that it's not exactly the same as the actual content
@@ -1,3 +1,4 @@
from onyx.configs.app_configs import DEV_MODE
from onyx.feature_flags.interface import FeatureFlagProvider
from onyx.feature_flags.interface import NoOpFeatureFlagProvider
from onyx.utils.variable_functionality import (

@@ -19,7 +20,7 @@ def get_default_feature_flag_provider() -> FeatureFlagProvider:
    Returns:
        FeatureFlagProvider: The configured feature flag provider instance
    """
    if MULTI_TENANT:
    if MULTI_TENANT or DEV_MODE:
        return fetch_versioned_implementation_with_fallback(
            module="onyx.feature_flags.factory",
            attribute="get_posthog_feature_flag_provider",
@@ -2,3 +2,7 @@
Feature flag keys used throughout the application.
Centralizes feature flag key definitions to avoid magic strings.
"""

# Build Mode feature flag - controls access to /build routes and features
# When disabled via PostHog, all build routes return 404
BUILD_MODE_ENABLED = "build-mode-enabled"
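Note: a sketch of how the key is meant to be used. Both the module path of the key and the provider method name below are assumptions for illustration; the actual FeatureFlagProvider interface is not shown in this diff.

from onyx.feature_flags.factory import get_default_feature_flag_provider

provider = get_default_feature_flag_provider()
# Method name is assumed; check FeatureFlagProvider for the real API.
if not provider.is_feature_enabled(BUILD_MODE_ENABLED):
    # Build Mode routes are expected to return 404 when the flag is off.
    pass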
@@ -7,6 +7,9 @@ from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import ConnectorStopSignal
from onyx.connectors.models import DocumentFailure
from onyx.db.models import SearchSettings
from onyx.document_index.chunk_content_enrichment import (
    generate_enriched_content_for_chunk_embedding,
)
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.indexing.models import ChunkEmbedding
from onyx.indexing.models import DocAwareChunk

@@ -126,7 +129,7 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
            if chunk.large_chunk_reference_ids:
                large_chunks_present = True
            chunk_text = (
                f"{chunk.title_prefix}{chunk.doc_summary}{chunk.content}{chunk.chunk_context}{chunk.metadata_suffix_semantic}"
                generate_enriched_content_for_chunk_embedding(chunk)
            ) or chunk.source_document.get_title_for_document_index()

            if not chunk_text:
@@ -37,6 +37,7 @@ from onyx.document_index.document_index_utils import (
|
||||
get_multipass_config,
|
||||
)
|
||||
from onyx.document_index.interfaces import DocumentIndex
|
||||
from onyx.document_index.interfaces import DocumentInsertionRecord
|
||||
from onyx.document_index.interfaces import DocumentMetadata
|
||||
from onyx.document_index.interfaces import IndexBatchParams
|
||||
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
|
||||
@@ -163,7 +164,7 @@ def index_doc_batch_with_handler(
|
||||
*,
|
||||
chunker: Chunker,
embedder: IndexingEmbedder,
document_index: DocumentIndex,
document_indices: list[DocumentIndex],
document_batch: list[Document],
request_id: str | None,
tenant_id: str,
@@ -176,7 +177,7 @@ def index_doc_batch_with_handler(
index_pipeline_result = index_doc_batch(
chunker=chunker,
embedder=embedder,
document_index=document_index,
document_indices=document_indices,
document_batch=document_batch,
request_id=request_id,
tenant_id=tenant_id,
@@ -627,7 +628,7 @@ def index_doc_batch(
document_batch: list[Document],
chunker: Chunker,
embedder: IndexingEmbedder,
document_index: DocumentIndex,
document_indices: list[DocumentIndex],
request_id: str | None,
tenant_id: str,
adapter: IndexingBatchAdapter,
@@ -743,47 +744,57 @@ def index_doc_batch(
short_descriptor_log = str(short_descriptor_list)[:1024]
logger.debug(f"Indexing the following chunks: {short_descriptor_log}")

# A document will not be spread across different batches, so all the
# documents with chunks in this set, are fully represented by the chunks
# in this set
(
insertion_records,
vector_db_write_failures,
) = write_chunks_to_vector_db_with_backoff(
document_index=document_index,
chunks=result.chunks,
index_batch_params=IndexBatchParams(
doc_id_to_previous_chunk_cnt=result.doc_id_to_previous_chunk_cnt,
doc_id_to_new_chunk_cnt=result.doc_id_to_new_chunk_cnt,
tenant_id=tenant_id,
large_chunks_enabled=chunker.enable_large_chunks,
),
)
primary_doc_idx_insertion_records: list[DocumentInsertionRecord] | None = None
primary_doc_idx_vector_db_write_failures: list[ConnectorFailure] | None = None
for document_index in document_indices:
# A document will not be spread across different batches, so all the
# documents with chunks in this set, are fully represented by the chunks
# in this set
(
insertion_records,
vector_db_write_failures,
) = write_chunks_to_vector_db_with_backoff(
document_index=document_index,
chunks=result.chunks,
index_batch_params=IndexBatchParams(
doc_id_to_previous_chunk_cnt=result.doc_id_to_previous_chunk_cnt,
doc_id_to_new_chunk_cnt=result.doc_id_to_new_chunk_cnt,
tenant_id=tenant_id,
large_chunks_enabled=chunker.enable_large_chunks,
),
)

all_returned_doc_ids = (
{record.document_id for record in insertion_records}
.union(
{
record.failed_document.document_id
for record in vector_db_write_failures
if record.failed_document
}
)
.union(
{
record.failed_document.document_id
for record in embedding_failures
if record.failed_document
}
)
)
if all_returned_doc_ids != set(updatable_ids):
raise RuntimeError(
f"Some documents were not successfully indexed. "
f"Updatable IDs: {updatable_ids}, "
f"Returned IDs: {all_returned_doc_ids}. "
"This should never happen."
all_returned_doc_ids: set[str] = (
{record.document_id for record in insertion_records}
.union(
{
record.failed_document.document_id
for record in vector_db_write_failures
if record.failed_document
}
)
.union(
{
record.failed_document.document_id
for record in embedding_failures
if record.failed_document
}
)
)
if all_returned_doc_ids != set(updatable_ids):
raise RuntimeError(
f"Some documents were not successfully indexed. "
f"Updatable IDs: {updatable_ids}, "
f"Returned IDs: {all_returned_doc_ids}. "
"This should never happen."
f"This occured for document index {document_index.__class__.__name__}"
|
||||
)
# We treat the first document index we got as the primary one used
# for reporting the state of indexing.
if primary_doc_idx_insertion_records is None:
primary_doc_idx_insertion_records = insertion_records
if primary_doc_idx_vector_db_write_failures is None:
primary_doc_idx_vector_db_write_failures = vector_db_write_failures

adapter.post_index(
context=context,
@@ -792,11 +803,15 @@ def index_doc_batch(
result=result,
)

assert primary_doc_idx_insertion_records is not None
assert primary_doc_idx_vector_db_write_failures is not None
return IndexingPipelineResult(
new_docs=len([r for r in insertion_records if not r.already_existed]),
new_docs=len(
[r for r in primary_doc_idx_insertion_records if not r.already_existed]
),
total_docs=len(filtered_documents),
total_chunks=len(chunks_with_embeddings),
failures=vector_db_write_failures + embedding_failures,
failures=primary_doc_idx_vector_db_write_failures + embedding_failures,
)


@@ -805,7 +820,7 @@ def run_indexing_pipeline(
document_batch: list[Document],
request_id: str | None,
embedder: IndexingEmbedder,
document_index: DocumentIndex,
document_indices: list[DocumentIndex],
db_session: Session,
tenant_id: str,
adapter: IndexingBatchAdapter,
@@ -846,7 +861,7 @@ def run_indexing_pipeline(
return index_doc_batch_with_handler(
chunker=chunker,
embedder=embedder,
document_index=document_index,
document_indices=document_indices,
document_batch=document_batch,
request_id=request_id,
tenant_id=tenant_id,

168
backend/onyx/indexing/persistent_document_writer.py
Normal file
@@ -0,0 +1,168 @@
"""
Persistent Document Writer for writing indexed documents to local filesystem with
hierarchical directory structure that mirrors the source organization.
"""

import hashlib
import json
from pathlib import Path

from onyx.connectors.models import Document
from onyx.server.features.build.configs import PERSISTENT_DOCUMENT_STORAGE_PATH
from onyx.utils.logger import setup_logger

logger = setup_logger()


class PersistentDocumentWriter:
    """Writes indexed documents to local filesystem with hierarchical structure.

    Documents are stored in user-segregated paths:
    {base_path}/{user_id}/{source}/{hierarchy}/document.json

    This enables per-user isolation for sandbox access control.
    """

    def __init__(
        self,
        base_path: str,
        user_id: str,
    ):
        self.base_path = Path(base_path)
        self.user_id = user_id

    def write_documents(self, documents: list[Document]) -> list[str]:
        """Write documents to local filesystem, returns written file paths"""
        written_paths = []

        # Build a map of base filenames to detect duplicates
        # Key: (directory_path, base_filename) -> list of docs with that name
        filename_map: dict[tuple[Path, str], list[Document]] = {}

        for doc in documents:
            dir_path = self._build_directory_path(doc)
            base_filename = self._get_base_filename(doc)
            key = (dir_path, base_filename)
            if key not in filename_map:
                filename_map[key] = []
            filename_map[key].append(doc)

        # Now write documents, appending ID if there are duplicates
        for (dir_path, base_filename), docs in filename_map.items():
            has_duplicates = len(docs) > 1
            for doc in docs:
                try:
                    if has_duplicates:
                        # Append sanitized ID to disambiguate
                        id_suffix = self._sanitize_path_component(doc.id)
                        if len(id_suffix) > 50:
                            id_suffix = hashlib.sha256(doc.id.encode()).hexdigest()[:16]
                        filename = f"{base_filename}_{id_suffix}.json"
                    else:
                        filename = f"{base_filename}.json"

                    path = dir_path / filename
                    self._write_document(doc, path)
                    written_paths.append(str(path))
                except Exception as e:
                    logger.warning(
                        f"Failed to write document {doc.id} to persistent storage: {e}"
                    )

        return written_paths

    def _build_directory_path(self, doc: Document) -> Path:
        """Build directory path from document metadata.

        Documents are stored under user-segregated paths:
        {base_path}/{user_id}/{source}/{hierarchy}/

        This enables per-user isolation for sandbox access control.
        """
        parts: list[str] = []

        # Add user_id as the first path component for user segregation
        parts.append(self.user_id)

        parts.append(doc.source.value)

        # Get hierarchy from doc_metadata
        hierarchy = doc.doc_metadata.get("hierarchy", {}) if doc.doc_metadata else {}
        source_path = hierarchy.get("source_path", [])

        if source_path:
            parts.extend([self._sanitize_path_component(p) for p in source_path])

        return self.base_path / "/".join(parts)

    def _get_base_filename(self, doc: Document) -> str:
        """Get base filename from semantic identifier, falling back to ID"""
        # Prefer semantic_identifier, fall back to title, then ID
        name = doc.semantic_identifier or doc.title or doc.id
        return self._sanitize_filename(name)

    def _sanitize_path_component(self, component: str) -> str:
        """Sanitize a path component for file system safety"""
        # Replace spaces with underscores
        sanitized = component.replace(" ", "_")
        # Replace other problematic characters
        sanitized = sanitized.replace("/", "_").replace("\\", "_").replace(":", "_")
        sanitized = sanitized.replace("<", "_").replace(">", "_").replace("|", "_")
        sanitized = sanitized.replace('"', "_").replace("?", "_").replace("*", "_")
        # Also handle null bytes and other control characters
        sanitized = "".join(c for c in sanitized if ord(c) >= 32)
        return sanitized.strip() or "unnamed"

    def _sanitize_filename(self, name: str) -> str:
        """Sanitize name for use as filename"""
        sanitized = self._sanitize_path_component(name)
        if len(sanitized) > 200:
            # Keep first 150 chars + hash suffix for uniqueness
            hash_suffix = hashlib.sha256(name.encode()).hexdigest()[:16]
            return f"{sanitized[:150]}_{hash_suffix}"
        return sanitized

    def _write_document(self, doc: Document, path: Path) -> None:
        """Serialize and write document to filesystem"""
        content = {
            "id": doc.id,
            "semantic_identifier": doc.semantic_identifier,
            "title": doc.title,
            "source": doc.source.value,
            "doc_updated_at": (
                doc.doc_updated_at.isoformat() if doc.doc_updated_at else None
            ),
            "metadata": doc.metadata,
            "doc_metadata": doc.doc_metadata,
            "sections": [
                {"text": s.text if hasattr(s, "text") else None, "link": s.link}
                for s in doc.sections
            ],
            "primary_owners": [o.model_dump() for o in (doc.primary_owners or [])],
            "secondary_owners": [o.model_dump() for o in (doc.secondary_owners or [])],
        }

        # Create parent directories if they don't exist
        path.parent.mkdir(parents=True, exist_ok=True)

        # Write the JSON file
        with open(path, "w", encoding="utf-8") as f:
            json.dump(content, f, indent=2, default=str)

        logger.debug(f"Wrote document to {path}")


def get_persistent_document_writer(
    user_id: str,
) -> PersistentDocumentWriter:
    """Factory function to create a PersistentDocumentWriter with default configuration.

    Args:
        user_id: User ID for user-segregated storage paths.
            Documents are stored under {base_path}/{user_id}/...
            for sandbox access control isolation.
    """
    return PersistentDocumentWriter(
        base_path=PERSISTENT_DOCUMENT_STORAGE_PATH,
        user_id=user_id,
    )
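
A minimal usage sketch of the new writer (an editorial illustration, not part of the diff): the factory and write_documents come from the module above, while the user id value and the documents list are assumed to be supplied by the caller, for example the indexing pipeline's current batch.

from onyx.indexing.persistent_document_writer import get_persistent_document_writer

# "user-123" is a placeholder user id; `documents` is an assumed, already-built list[Document]
writer = get_persistent_document_writer(user_id="user-123")
written_paths = writer.write_documents(documents)
# Each returned path has the shape {base_path}/user-123/{source}/{hierarchy}/<sanitized name>.json
print(written_paths)
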
@@ -41,6 +41,11 @@ alphanum_regex = re.compile(r"[^a-z0-9]+")
rem_email_regex = re.compile(r"(?<=\S)@([a-z0-9-]+)\.([a-z]{2,6})$")


def _ngrams(sequence: str, n: int) -> list[tuple[str, ...]]:
"""Generate n-grams from a sequence."""
return [tuple(sequence[i : i + n]) for i in range(len(sequence) - n + 1)]


def _clean_name(entity_name: str) -> str:
"""
Clean an entity string by removing non-alphanumeric characters and email addresses.
@@ -58,8 +63,6 @@ def _normalize_one_entity(
attributes: dict[str, str],
allowed_docs_temp_view_name: str | None = None,
) -> str | None:
from nltk import ngrams  # type: ignore

"""
Matches a single entity to the best matching entity of the same type.
"""
@@ -150,16 +153,16 @@ def _normalize_one_entity(

# step 2: do a weighted ngram analysis and damerau levenshtein distance to rerank
n1, n2, n3 = (
set(ngrams(cleaned_entity, 1)),
set(ngrams(cleaned_entity, 2)),
set(ngrams(cleaned_entity, 3)),
set(_ngrams(cleaned_entity, 1)),
set(_ngrams(cleaned_entity, 2)),
set(_ngrams(cleaned_entity, 3)),
)
for i, (candidate_id_name, candidate_name, _) in enumerate(candidates):
cleaned_candidate = _clean_name(candidate_name)
h_n1, h_n2, h_n3 = (
set(ngrams(cleaned_candidate, 1)),
set(ngrams(cleaned_candidate, 2)),
set(ngrams(cleaned_candidate, 3)),
set(_ngrams(cleaned_candidate, 1)),
set(_ngrams(cleaned_candidate, 2)),
set(_ngrams(cleaned_candidate, 3)),
)

# compute ngram overlap, renormalize scores if the names are too short for larger ngrams

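For reference, the new local _ngrams helper is a drop-in replacement for the removed nltk.ngrams import at the call sites above; a tiny illustration of its output (values chosen for this note, not taken from the diff):

_ngrams("onyx", 2)  # -> [('o', 'n'), ('n', 'y'), ('y', 'x')]
_ngrams("onyx", 3)  # -> [('o', 'n', 'y'), ('n', 'y', 'x')]
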
@@ -54,11 +54,6 @@
"model_vendor": "amazon",
"model_version": "v1:0"
},
"anthropic.claude-3-5-haiku-20241022-v1:0": {
"display_name": "Claude Haiku 3.5",
"model_vendor": "anthropic",
"model_version": "20241022-v1:0"
},
"anthropic.claude-3-5-sonnet-20240620-v1:0": {
"display_name": "Claude Sonnet 3.5",
"model_vendor": "anthropic",
@@ -1465,11 +1460,6 @@
"model_vendor": "mistral",
"model_version": "v0:1"
},
"bedrock/us.anthropic.claude-3-5-haiku-20241022-v1:0": {
"display_name": "Claude Haiku 3.5",
"model_vendor": "anthropic",
"model_version": "20241022-v1:0"
},
"chat-bison": {
"display_name": "Chat Bison",
"model_vendor": "google",
@@ -1500,16 +1490,6 @@
"model_vendor": "openai",
"model_version": "latest"
},
"claude-3-5-haiku-20241022": {
"display_name": "Claude Haiku 3.5",
"model_vendor": "anthropic",
"model_version": "20241022"
},
"claude-3-5-haiku-latest": {
"display_name": "Claude Haiku 3.5",
"model_vendor": "anthropic",
"model_version": "latest"
},
"claude-3-5-sonnet-20240620": {
"display_name": "Claude Sonnet 3.5",
"model_vendor": "anthropic",
@@ -1715,11 +1695,6 @@
"model_vendor": "amazon",
"model_version": "v1:0"
},
"eu.anthropic.claude-3-5-haiku-20241022-v1:0": {
"display_name": "Claude Haiku 3.5",
"model_vendor": "anthropic",
"model_version": "20241022-v1:0"
},
"eu.anthropic.claude-3-5-sonnet-20240620-v1:0": {
"display_name": "Claude Sonnet 3.5",
"model_vendor": "anthropic",
@@ -3251,15 +3226,6 @@
"model_vendor": "anthropic",
"model_version": "latest"
},
"openrouter/anthropic/claude-3-5-haiku": {
"display_name": "Claude Haiku 3.5",
"model_vendor": "anthropic"
},
"openrouter/anthropic/claude-3-5-haiku-20241022": {
"display_name": "Claude Haiku 3.5",
"model_vendor": "anthropic",
"model_version": "20241022"
},
"openrouter/anthropic/claude-3-haiku": {
"display_name": "Claude Haiku 3",
"model_vendor": "anthropic"
@@ -3774,11 +3740,6 @@
"model_vendor": "amazon",
"model_version": "1:0"
},
"us.anthropic.claude-3-5-haiku-20241022-v1:0": {
"display_name": "Claude Haiku 3.5",
"model_vendor": "anthropic",
"model_version": "20241022"
},
"us.anthropic.claude-3-5-sonnet-20240620-v1:0": {
"display_name": "Claude Sonnet 3.5",
"model_vendor": "anthropic",
@@ -3899,15 +3860,6 @@
"model_vendor": "twelvelabs",
"model_version": "v1:0"
},
"vertex_ai/claude-3-5-haiku": {
"display_name": "Claude Haiku 3.5",
"model_vendor": "anthropic"
},
"vertex_ai/claude-3-5-haiku@20241022": {
"display_name": "Claude Haiku 3.5",
"model_vendor": "anthropic",
"model_version": "20241022"
},
"vertex_ai/claude-3-5-sonnet": {
"display_name": "Claude Sonnet 3.5",
"model_vendor": "anthropic"

@@ -301,6 +301,12 @@ class LitellmLLM(LLM):
)
is_ollama = self._model_provider == LlmProviderNames.OLLAMA_CHAT
is_mistral = self._model_provider == LlmProviderNames.MISTRAL
is_vertex_ai = self._model_provider == LlmProviderNames.VERTEX_AI
# Vertex Anthropic Opus 4.5 rejects output_config (LiteLLM maps reasoning_effort).
# Keep this guard until LiteLLM/Vertex accept the field for this model.
is_vertex_opus_4_5 = (
is_vertex_ai and "claude-opus-4-5" in self.config.model_name.lower()
)

#########################
# Build arguments
@@ -331,12 +337,16 @@ class LitellmLLM(LLM):
# Temperature
temperature = 1 if is_reasoning else self._temperature

if stream:
if stream and not is_vertex_opus_4_5:
optional_kwargs["stream_options"] = {"include_usage": True}

# Use configured default if not provided (if not set in env, low)
reasoning_effort = reasoning_effort or ReasoningEffort(DEFAULT_REASONING_EFFORT)
if is_reasoning and reasoning_effort != ReasoningEffort.OFF:
if (
is_reasoning
and reasoning_effort != ReasoningEffort.OFF
and not is_vertex_opus_4_5
):
if is_openai_model:
# OpenAI API does not accept reasoning params for GPT 5 chat models
# (neither reasoning nor reasoning_effort are accepted)

@@ -738,7 +738,7 @@ def model_is_reasoning_model(model_name: str, model_provider: str) -> bool:

# Fallback: try using litellm.supports_reasoning() for newer models
try:
logger.debug("Falling back to `litellm.supports_reasoning`")
# logger.debug("Falling back to `litellm.supports_reasoning`")
full_model_name = (
f"{model_provider}/{model_name}"
if model_provider not in model_name

@@ -63,6 +63,9 @@ from onyx.server.documents.connector import router as connector_router
from onyx.server.documents.credential import router as credential_router
from onyx.server.documents.document import router as document_router
from onyx.server.documents.standard_oauth import router as standard_oauth_router
from onyx.server.features.build.api.api import build_status_router
from onyx.server.features.build.api.api import nextjs_assets_router
from onyx.server.features.build.api.api import router as build_router
from onyx.server.features.default_assistant.api import (
router as default_assistant_router,
)
@@ -376,6 +379,9 @@ def get_application(lifespan_override: Lifespan | None = None) -> FastAPI:
include_router_with_global_prefix_prepended(application, admin_input_prompt_router)
include_router_with_global_prefix_prepended(application, cc_pair_router)
include_router_with_global_prefix_prepended(application, projects_router)
include_router_with_global_prefix_prepended(application, build_router)
include_router_with_global_prefix_prepended(application, build_status_router)
include_router_with_global_prefix_prepended(application, nextjs_assets_router)
include_router_with_global_prefix_prepended(application, document_set_router)
include_router_with_global_prefix_prepended(application, search_settings_router)
include_router_with_global_prefix_prepended(

225
backend/onyx/natural_language_processing/english_stopwords.py
Normal file
@@ -0,0 +1,225 @@
import re

ENGLISH_STOPWORDS = [
    "a",
    "about",
    "above",
    "after",
    "again",
    "against",
    "ain",
    "all",
    "am",
    "an",
    "and",
    "any",
    "are",
    "aren",
    "aren't",
    "as",
    "at",
    "be",
    "because",
    "been",
    "before",
    "being",
    "below",
    "between",
    "both",
    "but",
    "by",
    "can",
    "couldn",
    "couldn't",
    "d",
    "did",
    "didn",
    "didn't",
    "do",
    "does",
    "doesn",
    "doesn't",
    "doing",
    "don",
    "don't",
    "down",
    "during",
    "each",
    "few",
    "for",
    "from",
    "further",
    "had",
    "hadn",
    "hadn't",
    "has",
    "hasn",
    "hasn't",
    "have",
    "haven",
    "haven't",
    "having",
    "he",
    "he'd",
    "he'll",
    "he's",
    "her",
    "here",
    "hers",
    "herself",
    "him",
    "himself",
    "his",
    "how",
    "i",
    "i'd",
    "i'll",
    "i'm",
    "i've",
    "if",
    "in",
    "into",
    "is",
    "isn",
    "isn't",
    "it",
    "it'd",
    "it'll",
    "it's",
    "its",
    "itself",
    "just",
    "ll",
    "m",
    "ma",
    "me",
    "mightn",
    "mightn't",
    "more",
    "most",
    "mustn",
    "mustn't",
    "my",
    "myself",
    "needn",
    "needn't",
    "no",
    "nor",
    "not",
    "now",
    "o",
    "of",
    "off",
    "on",
    "once",
    "only",
    "or",
    "other",
    "our",
    "ours",
    "ourselves",
    "out",
    "over",
    "own",
    "re",
    "s",
    "same",
    "shan",
    "shan't",
    "she",
    "she'd",
    "she'll",
    "she's",
    "should",
    "should've",
    "shouldn",
    "shouldn't",
    "so",
    "some",
    "such",
    "t",
    "than",
    "that",
    "that'll",
    "the",
    "their",
    "theirs",
    "them",
    "themselves",
    "then",
    "there",
    "these",
    "they",
    "they'd",
    "they'll",
    "they're",
    "they've",
    "this",
    "those",
    "through",
    "to",
    "too",
    "under",
    "until",
    "up",
    "ve",
    "very",
    "was",
    "wasn",
    "wasn't",
    "we",
    "we'd",
    "we'll",
    "we're",
    "we've",
    "were",
    "weren",
    "weren't",
    "what",
    "when",
    "where",
    "which",
    "while",
    "who",
    "whom",
    "why",
    "will",
    "with",
    "won",
    "won't",
    "wouldn",
    "wouldn't",
    "y",
    "you",
    "you'd",
    "you'll",
    "you're",
    "you've",
    "your",
    "yours",
    "yourself",
    "yourselves",
]

ENGLISH_STOPWORDS_SET = frozenset(ENGLISH_STOPWORDS)


def strip_stopwords(text: str) -> list[str]:
    """Remove English stopwords from text.

    Matching is case-insensitive and ignores leading/trailing punctuation
    on each word. Internal punctuation (like apostrophes in contractions)
    is preserved for matching, so "you're" matches the stopword "you're"
    but "youre" would not.
    """
    words = text.split()
    result = []

    for word in words:
        # Strip leading/trailing punctuation to get the core word for comparison
        # This preserves internal punctuation like apostrophes
        core = re.sub(r"^[^\w']+|[^\w']+$", "", word)
        if core.lower() not in ENGLISH_STOPWORDS_SET:
            result.append(word)

    return result
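
A quick illustration of the helper's behavior (example input and output written for this note, not taken from the diff):

strip_stopwords("What were the main topics in eng-general this week?")
# -> ['main', 'topics', 'eng-general', 'week?']
# "What", "were", "the", "in", and "this" are stopwords and are dropped; kept tokens are
# returned verbatim, so trailing punctuation like the "?" survives.
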
@@ -32,9 +32,6 @@ from onyx.configs.constants import MessageType
from onyx.configs.constants import OnyxRedisLocks
from onyx.configs.onyxbot_configs import NOTIFY_SLACKBOT_NO_ANSWER
from onyx.connectors.slack.utils import expert_info_from_slack_id
from onyx.context.search.retrieval.search_runner import (
download_nltk_data,
)
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import get_session_with_tenant
from onyx.db.engine.sql_engine import SqlEngine
@@ -1129,9 +1126,6 @@ if __name__ == "__main__":

set_is_ee_based_on_env_variable()

logger.info("Verifying query preprocessing (NLTK) data is downloaded")
download_nltk_data()

try:
# Keep the main thread alive
while tenant_handler.running:

@@ -96,7 +96,7 @@ ADDITIONAL_INFO = "\n\nAdditional Information:\n\t- {datetime_info}."

CHAT_NAMING_SYSTEM_PROMPT = """
Given the conversation history, provide a SHORT name for the conversation. Focus the name on the important keywords to convey the topic of the conversation. \
Make sure the name is in the same language as the user's language.
Make sure the name is in the same language as the user's first message.

IMPORTANT: DO NOT OUTPUT ANYTHING ASIDE FROM THE NAME. MAKE IT AS CONCISE AS POSSIBLE. NEVER USE MORE THAN 5 WORDS, LESS IS FINE.
""".strip()

@@ -19,7 +19,7 @@ If you need to ask questions, follow these guidelines:
- Be concise and do not ask more than 5 questions.
- If there are ambiguous terms or questions, ask the user to clarify.
- Your questions should be a numbered list for clarity.
- Respond in the user's language.
- Respond in the same language as the user's query.
- Make sure to gather all the information needed to carry out the research task in a concise, well-structured manner.{{internal_search_clarification_guidance}}
- Wrap up with a quick sentence on what the clarification will help with, it's ok to reference the user query closely here.
""".strip()
@@ -44,9 +44,9 @@ For context, the date is {current_datetime}.

The research plan should be formatted as a numbered list of steps and have 6 or less individual steps.

Each step should be a standalone exploration question or topic that can be researched independently but may build on previous steps.
Each step should be a standalone exploration question or topic that can be researched independently but may build on previous steps. The plan should be in the same language as the user's query.

Output only the numbered list of steps with no additional prefix or suffix. Respond in the user's language.
Output only the numbered list of steps with no additional prefix or suffix.
""".strip()


@@ -76,10 +76,11 @@ You have currently used {{current_cycle_count}} of {{max_cycles}} max research c

## {RESEARCH_AGENT_TOOL_NAME}
The research task provided to the {RESEARCH_AGENT_TOOL_NAME} should be reasonably high level with a clear direction for investigation. \
It should not be a single short query, rather it should be 1 (or 2 if necessary) descriptive sentences that outline the direction of the investigation.
It should not be a single short query, rather it should be 1 (or 2 if necessary) descriptive sentences that outline the direction of the investigation. \
The research task should be in the same language as the overall research plan.

CRITICAL - the {RESEARCH_AGENT_TOOL_NAME} only receives the task and has no additional context about the user's query, research plan, other research agents, or message history. \
You absolutely must provide all of the context needed to complete the task in the argument to the {RESEARCH_AGENT_TOOL_NAME}. The research task should be in the user's language.{{internal_search_research_task_guidance}}
You absolutely must provide all of the context needed to complete the task in the argument to the {RESEARCH_AGENT_TOOL_NAME}.{{internal_search_research_task_guidance}}

You should call the {RESEARCH_AGENT_TOOL_NAME} MANY times before completing with the {GENERATE_REPORT_TOOL_NAME} tool.

@@ -129,7 +130,7 @@ For context, the date is {current_datetime}.

Users have explicitly selected the deep research mode and will expect a long and detailed answer. It is ok and encouraged that your response is several pages long.

You use different text styles and formatting to make the response easier to read. You may use markdown rarely when necessary to make the response more digestible. Respond in the user's language.
You use different text styles and formatting to make the response easier to read. You may use markdown rarely when necessary to make the response more digestible.

Not every fact retrieved will be relevant to the user's query.

@@ -165,10 +166,11 @@ You have currently used {{current_cycle_count}} of {{max_cycles}} max research c

## {RESEARCH_AGENT_TOOL_NAME}
The research task provided to the {RESEARCH_AGENT_TOOL_NAME} should be reasonably high level with a clear direction for investigation. \
It should not be a single short query, rather it should be 1 (or 2 if necessary) descriptive sentences that outline the direction of the investigation.
It should not be a single short query, rather it should be 1 (or 2 if necessary) descriptive sentences that outline the direction of the investigation. \
The research task should be in the same language as the overall research plan.

CRITICAL - the {RESEARCH_AGENT_TOOL_NAME} only receives the task and has no additional context about the user's query, research plan, or message history. \
You absolutely must provide all of the context needed to complete the task in the argument to the {RESEARCH_AGENT_TOOL_NAME}. The research task should be in the user's language.{{internal_search_research_task_guidance}}
You absolutely must provide all of the context needed to complete the task in the argument to the {RESEARCH_AGENT_TOOL_NAME}.{{internal_search_research_task_guidance}}

You should call the {RESEARCH_AGENT_TOOL_NAME} MANY times before completing with the {GENERATE_REPORT_TOOL_NAME} tool.

@@ -1,30 +1,39 @@
from onyx.configs.app_configs import MAX_SLACK_QUERY_EXPANSIONS

SLACK_QUERY_EXPANSION_PROMPT = f"""
Rewrite the user's query and, if helpful, split it into at most {MAX_SLACK_QUERY_EXPANSIONS} \
keyword-only queries, so that Slack's keyword search yields the best matches.
Rewrite the user's query into at most {MAX_SLACK_QUERY_EXPANSIONS} keyword-only queries for Slack's keyword search.

Keep in mind the Slack's search behavior:
- Pure keyword AND search (no semantics).
- Word order matters.
- More words = fewer matches, so keep each query concise.
- IMPORTANT: Prefer simple 1-2 word queries over longer multi-word queries.
Slack search behavior:
- Pure keyword AND search (no semantics)
- More words = fewer matches, so keep queries concise (1-3 words)

Critical: Extract ONLY keywords that would actually appear in Slack message content.
ALWAYS include:
- Person names (e.g., "Sarah Chen", "Mike Johnson") - people search for messages from/about specific people
- Project/product names, technical terms, proper nouns
- Actual content words: "performance", "bug", "deployment", "API", "error"

DO NOT include:
- Meta-words: "topics", "conversations", "discussed", "summary", "messages", "big", "main", "talking"
- Temporal: "today", "yesterday", "week", "month", "recent", "past", "last"
- Channels/Users: "general", "eng-general", "engineering", "@username"

DO include:
- Actual content: "performance", "bug", "deployment", "API", "database", "error", "feature"
- Meta-words: "topics", "conversations", "discussed", "summary", "messages"
- Temporal: "today", "yesterday", "week", "month", "recent", "last"
- Channel names: "general", "eng-general", "random"

Examples:

Query: "what are the big topics in eng-general this week?"
Output:

Query: "messages with Sarah about the deployment"
Output:
Sarah deployment
Sarah
deployment

Query: "what did Mike say about the budget?"
Output:
Mike budget
Mike
budget

Query: "performance issues in eng-general"
Output:
performance issues
@@ -41,7 +50,7 @@ Now process this query:

{{query}}

Output:
Output (keywords only, one per line, NO explanations or commentary):
"""

SLACK_DATE_EXTRACTION_PROMPT = """

@@ -48,7 +48,7 @@ Do not use the "site:" operator in your web search queries.
OPEN_URLS_GUIDANCE = """

## open_url
Use the `open_url` tool to read the content of one or more URLs. Use this tool to access the contents of the most promising web pages from your searches.
Use the `open_url` tool to read the content of one or more URLs. Use this tool to access the contents of the most promising web pages from your web searches or user specified URLs.
You can open many URLs at once by passing multiple URLs in the array if multiple pages seem promising. Prioritize the most promising pages and reputable sources.
You should almost always use open_url after a web_search call. Use this tool when a user asks about a specific provided URL.
"""

@@ -1,24 +0,0 @@
input_prompts:
- id: -5
prompt: "Elaborate"
content: "Elaborate on the above, give me a more in depth explanation."
active: true
is_public: true

- id: -4
prompt: "Reword"
content: "Help me rewrite the following politely and concisely for professional communication:\n"
active: true
is_public: true

- id: -3
prompt: "Email"
content: "Write a professional email for me including a subject line, signature, etc. Template the parts that need editing with [ ]. The email should cover the following points:\n"
active: true
is_public: true

- id: -2
prompt: "Debug"
content: "Provide step-by-step troubleshooting instructions for the following issue:\n"
active: true
is_public: true
@@ -1,40 +0,0 @@
import yaml
from sqlalchemy.orm import Session

from onyx.configs.chat_configs import INPUT_PROMPT_YAML
from onyx.db.input_prompt import insert_input_prompt_if_not_exists
from onyx.utils.logger import setup_logger


logger = setup_logger()


def load_input_prompts_from_yaml(
db_session: Session, input_prompts_yaml: str = INPUT_PROMPT_YAML
) -> None:
with open(input_prompts_yaml, "r") as file:
data = yaml.safe_load(file)

all_input_prompts = data.get("input_prompts", [])
for input_prompt in all_input_prompts:
# If these prompts are deleted (which is a hard delete in the DB), on server startup
# they will be recreated, but the user can always just deactivate them, just a light inconvenience

insert_input_prompt_if_not_exists(
user=None,
input_prompt_id=input_prompt.get("id"),
prompt=input_prompt["prompt"],
content=input_prompt["content"],
is_public=input_prompt["is_public"],
active=input_prompt.get("active", True),
db_session=db_session,
commit=True,
)


def load_chat_yamls(
db_session: Session,
input_prompts_yaml: str = INPUT_PROMPT_YAML,
) -> None:
"""Load all chat-related YAML configurations (such as the prompt shortcuts which are called input prompts on the backend)"""
load_input_prompts_from_yaml(db_session, input_prompts_yaml)
@@ -564,6 +564,7 @@ def associate_credential_to_connector(
access_type=metadata.access_type,
auto_sync_options=metadata.auto_sync_options,
groups=metadata.groups,
processing_mode=metadata.processing_mode,
)

# trigger indexing immediately

@@ -20,6 +20,7 @@ from google.oauth2.credentials import Credentials
from pydantic import BaseModel
from sqlalchemy.orm import Session

from onyx.auth.email_utils import send_email
from onyx.auth.users import current_admin_user
from onyx.auth.users import current_chat_accessible_user
from onyx.auth.users import current_curator_or_admin_user
@@ -29,6 +30,7 @@ from onyx.background.celery.tasks.pruning.tasks import (
)
from onyx.background.celery.versioned_apps.client import app as client_app
from onyx.configs.app_configs import DISABLE_AUTH
from onyx.configs.app_configs import EMAIL_CONFIGURED
from onyx.configs.app_configs import ENABLED_CONNECTOR_TYPES
from onyx.configs.app_configs import MOCK_CONNECTOR_FILE_PATH
from onyx.configs.constants import DocumentSource
@@ -125,6 +127,7 @@ from onyx.server.documents.models import ConnectorFileInfo
from onyx.server.documents.models import ConnectorFilesResponse
from onyx.server.documents.models import ConnectorIndexingStatusLite
from onyx.server.documents.models import ConnectorIndexingStatusLiteResponse
from onyx.server.documents.models import ConnectorRequestSubmission
from onyx.server.documents.models import ConnectorSnapshot
from onyx.server.documents.models import ConnectorStatus
from onyx.server.documents.models import ConnectorUpdateRequest
@@ -1759,6 +1762,86 @@ def get_connector_by_id(
)


@router.post("/connector-request")
def submit_connector_request(
request_data: ConnectorRequestSubmission,
user: User | None = Depends(current_user),
) -> StatusResponse:
"""
Submit a connector request for Cloud deployments.
Tracks via PostHog telemetry and sends email to hello@onyx.app.
"""
tenant_id = get_current_tenant_id()
connector_name = request_data.connector_name.strip()

if not connector_name:
raise HTTPException(status_code=400, detail="Connector name cannot be empty")

# Get user identifier for telemetry
user_email = user.email if user else None
distinct_id = user_email or tenant_id

# Track connector request via PostHog telemetry (Cloud only)
from shared_configs.configs import MULTI_TENANT

if MULTI_TENANT:
mt_cloud_telemetry(
tenant_id=tenant_id,
distinct_id=distinct_id,
event=MilestoneRecordType.REQUESTED_CONNECTOR,
properties={
"connector_name": connector_name,
"user_email": user_email,
},
)

# Send email notification (if email is configured)
if EMAIL_CONFIGURED:
try:
subject = "Onyx Craft Connector Request"
email_body_text = f"""A new connector request has been submitted:

Connector Name: {connector_name}
User Email: {user_email or 'Not provided (anonymous user)'}
Tenant ID: {tenant_id}
"""
email_body_html = f"""<html>
<body>
<p>A new connector request has been submitted:</p>
<ul>
<li><strong>Connector Name:</strong> {connector_name}</li>
<li><strong>User Email:</strong> {user_email or 'Not provided (anonymous user)'}</li>
<li><strong>Tenant ID:</strong> {tenant_id}</li>
</ul>
</body>
</html>"""

send_email(
user_email="hello@onyx.app",
subject=subject,
html_body=email_body_html,
text_body=email_body_text,
)
logger.info(
f"Connector request email sent to hello@onyx.app for connector: {connector_name}"
)
except Exception as e:
# Log error but don't fail the request if email fails
logger.error(
f"Failed to send connector request email for {connector_name}: {e}"
)

logger.info(
f"Connector request submitted: {connector_name} by user {user_email or 'anonymous'} "
f"(tenant: {tenant_id})"
)

return StatusResponse(
success=True,
message="Connector request submitted successfully. We'll prioritize popular requests!",
)


class BasicCCPairInfo(BaseModel):
has_successful_run: bool
source: DocumentSource

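A rough sketch of how a client might call the new endpoint (illustrative only: the URL prefix depends on how connector_router is mounted and the auth header is an assumption; only the "connector-request" path suffix and the connector_name field come from the diff above):

import requests

resp = requests.post(
    "<BASE_URL>/connector-request",  # <BASE_URL> plus the router's prefix is a placeholder
    json={"connector_name": "Linear"},  # "Linear" is an arbitrary example value
    headers={"Authorization": "Bearer <API_KEY>"},  # assumed auth scheme
)
print(resp.json())  # expected to be a StatusResponse payload with success and message fields
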
@@ -32,6 +32,7 @@ def get_document_info(
db_session: Session = Depends(get_session),
) -> DocumentInfo:
search_settings = get_current_search_settings(db_session)
# This flow is for search so we do not get all indices.
document_index = get_default_document_index(search_settings, None)

user_acl_filters = build_access_filters_for_user(user, db_session)
@@ -76,6 +77,7 @@ def get_chunk_info(
db_session: Session = Depends(get_session),
) -> ChunkInfo:
search_settings = get_current_search_settings(db_session)
# This flow is for search so we do not get all indices.
document_index = get_default_document_index(search_settings, None)

user_acl_filters = build_access_filters_for_user(user, db_session)

Some files were not shown because too many files have changed in this diff.