Compare commits

..

78 Commits
v3.0.1 ... main

Author SHA1 Message Date
Nikolas Garza
c57ea65d42 fix(db): avoid SQLAlchemy sentinel mismatch in batch user insert (#9300) 2026-03-12 06:56:45 +00:00
Nikolas Garza
c1ce180b72 feat(admin): add role, group, and status filters to Users table - 4/9 (#9179) 2026-03-11 21:56:19 -07:00
Jamison Lahman
b5474dc127 chore(devtools): upgrade ods: 0.6.3->0.7.0 (#9297) 2026-03-11 20:30:55 -07:00
Nikolas Garza
e1df3f533a feat(admin): add Users table with DataTable and server-side pagination - 3/9 (#9178) 2026-03-11 20:26:07 -07:00
Jamison Lahman
df5252db05 chore(devtools): ods backend api (#9295)
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2026-03-11 20:07:23 -07:00
Nikolas Garza
f01f210af8 fix(slackbot): resolve channel references and filter search by channel tags (#9256) 2026-03-11 19:37:03 -07:00
Jamison Lahman
781219cf18 chore(models): rm claude-3-5-sonnet-v2 metadata (#9285) 2026-03-12 02:17:09 +00:00
Nikolas Garza
ca39da7de9 feat(admin): add user timestamps and enrich FullUserSnapshot - 2/9 (#9183) 2026-03-11 19:07:45 -07:00
dependabot[bot]
abf76cd747 chore(deps): bump tornado from 6.5.2 to 6.5.5 (#9290)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-03-12 01:41:01 +00:00
Jamison Lahman
a78607f1b5 fix(fe): InputComboBox resets filter value on open (#9287) 2026-03-12 01:06:02 +00:00
roshan
e213853f63 fix(craft): rename webapp download endpoint to avoid route conflict (#9283)
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
Co-authored-by: Wenxi <wenxi@onyx.app>
2026-03-11 23:19:38 +00:00
Wenxi
8dc379c6fd feat(ods): use release-tag to print highest stable semver that should receive the latest tag (#9278)
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2026-03-11 22:18:13 +00:00
dependabot[bot]
787f117e17 chore(deps): bump pypdf from 6.7.5 to 6.8.0 (#9260)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-03-11 21:59:35 +00:00
Jamison Lahman
665640fac8 chore(opensearch): unset container ulimits in dev (#9277)
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2026-03-11 21:58:43 +00:00
Danelegend
d2d44c1e68 fix(indexing): Stop deep-copy during indexing (#9275) 2026-03-11 21:24:15 +00:00
Nikolas Garza
ffe04ab91f fix(tests): remove deprecated o1-preview and o1-mini model tests (#9280) 2026-03-11 20:32:51 +00:00
Raunak Bhagat
6499b21235 feat(opal): add Card and EmptyMessageCard components (#9271) 2026-03-11 13:14:17 -07:00
Nikolas Garza
c5bfd5a152 feat(admin): add Users page shell with stats bar and SCIM card - 1/9 (#9079) 2026-03-11 16:28:47 +00:00
Justin Tahara
a0329161b0 feat(litellm): Adding FE Provider workflow (#9264) 2026-03-11 03:45:08 +00:00
Raunak Bhagat
334b7a6d2f feat(opal): add foldable support to OpenButton + fix MessageToolbar (#9265) 2026-03-11 03:00:51 +00:00
dependabot[bot]
36196373a8 chore(deps): bump hono from 4.12.5 to 4.12.7 in /backend/onyx/server/features/build/sandbox/kubernetes/docker/templates/outputs/web (#9263)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-03-10 18:54:17 -07:00
Jamison Lahman
533aa8eff8 chore(release): upgrade release-tag (#9257) 2026-03-11 00:50:55 +00:00
Raunak Bhagat
ecbb267f80 fix: Consolidate search state-machine (#9234) 2026-03-11 00:42:39 +00:00
Danelegend
66023dbb6d feat(llm-provider): fetch litellm models (#8418) 2026-03-10 23:48:56 +00:00
Wenxi
f97466e4de chore: redeclare cache_okay for EncryptedBase children (#9253) 2026-03-10 23:44:51 +00:00
Evan Lohn
2cc8303e5f chore: sharepoint dedupe (#9254) 2026-03-10 23:41:51 +00:00
Wenxi
a92ff61f64 chore: add cache_okay to EncryptedJson (#9252) 2026-03-10 22:18:39 +00:00
acaprau
17551a907e fix(opensearch): Update should clear projects and personas when they are empty (#8845) 2026-03-10 21:49:55 +00:00
Jamison Lahman
9e42951fa4 fix(fe): increase responsive breakpoint for centering modals (#9250) 2026-03-10 21:45:23 +00:00
acaprau
dcb18c2411 chore(opensearch): Followup for #9243 (#9247) 2026-03-10 14:31:44 -07:00
Jamison Lahman
2f628e39d3 fix(fe): correctly parse comma literals in CSVs (#9245) 2026-03-10 21:03:47 +00:00
Nikolas Garza
fd200d46f8 fix(storybook): case-sensitivity, icon rename, and story fixes (#9244) 2026-03-10 20:05:32 +00:00
Evan Lohn
ec7482619b fix: update jira group sync endpoint (#9241) 2026-03-10 19:57:01 +00:00
Jamison Lahman
9d1a357533 fix(fe): make CSV inline display responsive (#9242) 2026-03-10 19:42:23 +00:00
acaprau
fbe823b551 chore(opensearch): Allow configuring num hits from hybrid subquery from env var (#9243) 2026-03-10 19:27:36 +00:00
acaprau
1608e2f274 fix(opensearch): Allow configuring the page size of chunks we get from Vespa during migration (#9239) 2026-03-10 17:51:52 +00:00
Jamison Lahman
4dbb1fa606 chore(tests): fix nightly model-server tests (#9236) 2026-03-10 17:49:08 +00:00
Jessica Singh
19b33e4d93 chore(auth): deployment docker cleanup (#8587) 2026-03-10 16:48:27 +00:00
Jamison Lahman
e56fa57c21 chore(release): run playwright on release pushes (#9233) 2026-03-10 16:35:30 +00:00
SubashMohan
5cdeb84164 feat(custom-tools): enhance custom tool error handling and timeline UI (#9189) 2026-03-10 10:50:32 +00:00
Danelegend
5b5100a07a fix: Prevent the removal and hiding of default model (#9131) 2026-03-10 07:34:00 +00:00
Evan Lohn
77f58fbad5 feat: prune hierarchynodes (#9066) 2026-03-10 05:29:26 +00:00
Evan Lohn
cf74afc65e fix: assistant file transfer (#9163) 2026-03-10 05:02:31 +00:00
Jamison Lahman
a887bc616c fix(fe): preview modal fade matches code bg color (#9221) 2026-03-10 04:32:44 +00:00
Jamison Lahman
fef1fd093e feat(fe): increase preview file type support & replace TextViewModal with PreviewModal variant (#9212)
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2026-03-10 03:05:32 +00:00
Nikolas Garza
8d085a4ccf ci: add Storybook deploy workflow - 3/3 (#9205) 2026-03-10 02:40:53 +00:00
Nikolas Garza
28310b9138 feat(storybook): add stories for all components - 2/3 (#9194) 2026-03-09 19:16:42 -07:00
Nikolas Garza
f71fab580c fix: use detail instead of message in OnyxError response shape (#9214) 2026-03-10 02:03:54 +00:00
Jessica Singh
89593b353f chore(auth): backend cleanup (#8558)
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-10 01:05:29 +00:00
Jamison Lahman
91e24ae63a fix(code-interpreter): set default CODE_INTERPRETER_BASE_URL w/ docke… (#9215) 2026-03-10 00:50:10 +00:00
Jamison Lahman
d2b37724d1 fix(fe): fix chat content padding (#9216) 2026-03-10 00:48:53 +00:00
acaprau
87f0849330 feat(opensearch): Enable by default (#9211) 2026-03-09 17:30:32 -07:00
Bo-Onyx
2ec7526772 fix(api memory): replace glibc with jemalloc for memory allocating (#9196) 2026-03-10 00:02:31 +00:00
Wenxi
bbd68e2795 fix: impropoer kv store strings (#9213) 2026-03-09 23:48:44 +00:00
Nikolas Garza
e74c36001a feat(storybook): add Storybook infrastructure - 1/3 (#9195) 2026-03-09 15:55:05 -07:00
Jamison Lahman
fe593a15da fix(safari): Search results dont shrink (#9126) 2026-03-09 21:04:20 +00:00
Wenxi
27df690a8d fix: discord connector async resource cleanup (#9203) 2026-03-09 20:46:58 +00:00
Wenxi
edbe569edd fix: don't fetch mcp tools when no llms are configured (#9173) 2026-03-09 20:45:55 +00:00
Jamison Lahman
5118193d16 fix(fe): move app padding inside overflow container (#9206) 2026-03-09 20:38:47 +00:00
Raunak Bhagat
63d3efd380 refactor: default width from w-auto to w-fit (#9146)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 19:58:16 +00:00
Wenxi
ec978d9a3f fix(mcp): use CE-compatible chat endpoint for search_indexed_documents (#9193)
Co-authored-by: Fizza-Mukhtar <fizzamukhtar01@gmail.com>
2026-03-09 19:44:08 +00:00
dependabot[bot]
d4d98a6cd0 chore(deps): bump hashicorp/setup-terraform from 3.1.2 to 4.0.0 (#9198)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-03-09 12:42:30 -07:00
dependabot[bot]
dc40e86dac chore(deps): bump actions/download-artifact from 7.0.0 to 8.0.0 (#9199)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-03-09 12:41:21 -07:00
dependabot[bot]
e495f7a13e chore(deps): bump astral-sh/setup-uv from 7.2.0 to 7.3.1 (#9200)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-03-09 12:40:58 -07:00
Wenxi
4761e4b132 fix: fallback doc access when drive item is externally owned (#9053) 2026-03-09 17:58:14 +00:00
Raunak Bhagat
6b5ab54b85 feat: add LineItemButton component (#9137)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 17:49:37 +00:00
Wenxi
959cf444f8 fix: set event hook for wrapping values into SensitiveValue (#9177) 2026-03-09 17:37:33 +00:00
Wenxi
2ebccea6d6 fix: move available context tokens to useChatController and remove arbitrary 50% cap (#9174) 2026-03-09 16:32:28 +00:00
Wenxi
5fe7a474db chore: update decryption utility (#9176) 2026-03-09 16:32:14 +00:00
Wenxi
9d7dc3da21 fix: ph ssl upgrade on redirect for local development (#9175) 2026-03-08 23:35:59 +00:00
Wenxi
2899be4c5e fix: remove unnecessary multitenant check in migration (#9172)
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2026-03-08 20:53:11 +00:00
Nikolas Garza
64ee7fc23f fix(fe): fix broken slack bot admin pages (#9168) 2026-03-08 20:11:17 +00:00
Justin Tahara
e07764285d chore(llm): Adding Integration test for Model state cache 2/2 (#9142) 2026-03-08 19:07:11 +00:00
Justin Tahara
cc2e6ffa8a fix(user files): Add frontend precheck for oversized user uploads 3/3 (#9159) 2026-03-08 18:47:25 +00:00
Justin Tahara
d3ee5c9b59 fix(user files): Enforce user upload file size limit in projects/chat upload path 2/3 (#9158) 2026-03-08 17:42:44 +00:00
Justin Tahara
dfa0efc093 fix(user files): Add configurable user file max upload size setting 1/3 (#9157) 2026-03-08 17:01:55 +00:00
Danelegend
9aad4077f1 feat: Tool call arg streaming (#9095) 2026-03-07 09:02:39 +00:00
Wenxi
29d9ebf7b3 feat: rotate encryption key utility (#9162) 2026-03-07 06:17:21 +00:00
309 changed files with 18123 additions and 1963 deletions

View File

@@ -151,7 +151,7 @@ jobs:
fetch-depth: 0
- name: Setup uv
uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # ratchet:astral-sh/setup-uv@v7
uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # ratchet:astral-sh/setup-uv@v7
with:
version: "0.9.9"
# NOTE: This isn't caching much and zizmor suggests this could be poisoned, so disable.

View File

@@ -70,7 +70,7 @@ jobs:
- name: Install the latest version of uv
if: steps.gate.outputs.should_cherrypick == 'true'
uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # ratchet:astral-sh/setup-uv@v7
uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # ratchet:astral-sh/setup-uv@v7
with:
enable-cache: false
version: "0.9.9"

View File

@@ -471,7 +471,7 @@ jobs:
- name: Install the latest version of uv
if: always()
uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # ratchet:astral-sh/setup-uv@v7
uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # ratchet:astral-sh/setup-uv@v7
with:
enable-cache: false
version: "0.9.9"
@@ -710,7 +710,7 @@ jobs:
pull-requests: write
steps:
- name: Download visual diff summaries
uses: actions/download-artifact@37930b1c2abaa49bbe596cd826c3c89aef350131
uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3
with:
pattern: screenshot-diff-summary-*
path: summaries/

View File

@@ -28,7 +28,7 @@ jobs:
with:
python-version: "3.11"
- name: Setup Terraform
uses: hashicorp/setup-terraform@b9cd54a3c349d3f38e8881555d616ced269862dd # ratchet:hashicorp/setup-terraform@v3
uses: hashicorp/setup-terraform@5e8dbf3c6d9deaf4193ca7a8fb23f2ac83bb6c85 # ratchet:hashicorp/setup-terraform@v4.0.0
- name: Setup node
uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238 # ratchet:actions/setup-node@v6
with: # zizmor: ignore[cache-poisoning]

View File

@@ -26,7 +26,7 @@ jobs:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # ratchet:actions/checkout@v6
with:
persist-credentials: false
- uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # ratchet:astral-sh/setup-uv@v7
- uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # ratchet:astral-sh/setup-uv@v7
with:
enable-cache: false
version: "0.9.9"

69
.github/workflows/storybook-deploy.yml vendored Normal file
View File

@@ -0,0 +1,69 @@
# Builds Storybook from the web workspace and publishes it to the production
# Vercel project. Runs on pushes to main that touch Storybook-related paths,
# or on manual dispatch.
name: Storybook Deploy

env:
  # Non-secret settings shared by the workflow. The Vercel secrets are scoped
  # to the deploy step below so that `npm ci` / the Storybook build (which run
  # third-party code) never see them.
  VERCEL_PROJECT_ID: prj_sG49mVsA25UsxIPhN2pmBJlikJZM
  VERCEL_CLI: vercel@50.14.1

concurrency:
  # Only one production deploy at a time; a newer push supersedes an older run.
  group: storybook-deploy-production
  cancel-in-progress: true

on:
  workflow_dispatch:
  push:
    branches:
      - main
    paths:
      - "web/lib/opal/**"
      - "web/src/refresh-components/**"
      - "web/.storybook/**"
      - "web/package.json"
      - "web/package-lock.json"

permissions:
  contents: read

jobs:
  Deploy-Storybook:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      # The ratchet comments name the tag the pinned SHA was resolved from;
      # they match the labels used for the same SHAs in the other workflows.
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # ratchet:actions/checkout@v6
        with:
          persist-credentials: false

      - name: Setup node
        uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238 # ratchet:actions/setup-node@v6
        with:
          node-version: 22
          cache: "npm"
          cache-dependency-path: ./web/package-lock.json

      - name: Install dependencies
        working-directory: web
        run: npm ci

      - name: Build Storybook
        working-directory: web
        run: npm run storybook:build

      - name: Deploy to Vercel (Production)
        working-directory: web
        env:
          # Secrets are exposed only to the step that invokes the Vercel CLI.
          VERCEL_ORG_ID: ${{ secrets.VERCEL_ORG_ID }}
          VERCEL_TOKEN: ${{ secrets.VERCEL_TOKEN }}
        run: npx --yes "$VERCEL_CLI" deploy storybook-static/ --prod --yes --token="$VERCEL_TOKEN"

  notify-slack-on-failure:
    needs: Deploy-Storybook
    # always() lets this job be evaluated even though its dependency failed.
    if: always() && needs.Deploy-Storybook.result == 'failure'
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # ratchet:actions/checkout@v6
        with:
          persist-credentials: false
          # Only the local composite action is needed for the notification.
          sparse-checkout: .github/actions/slack-notify

      - name: Send Slack notification
        uses: ./.github/actions/slack-notify
        with:
          webhook-url: ${{ secrets.MONITOR_DEPLOYMENTS_WEBHOOK }}
          failed-jobs: "• Deploy-Storybook"
          title: "🚨 Storybook Deploy Failed"

View File

@@ -24,7 +24,7 @@ jobs:
persist-credentials: false
- name: Install the latest version of uv
uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # ratchet:astral-sh/setup-uv@v7
uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # ratchet:astral-sh/setup-uv@v7
with:
enable-cache: false
version: "0.9.9"

View File

@@ -7,6 +7,9 @@
AUTH_TYPE=basic
# Recommended for basic auth - used for signing password reset and verification tokens
# Generate a secure value with: openssl rand -hex 32
USER_AUTH_SECRET=""
DEV_MODE=true

View File

@@ -46,7 +46,9 @@ RUN apt-get update && \
pkg-config \
gcc \
nano \
vim && \
vim \
libjemalloc2 \
&& \
rm -rf /var/lib/apt/lists/* && \
apt-get clean
@@ -141,6 +143,7 @@ COPY --chown=onyx:onyx ./scripts/debugging /app/scripts/debugging
COPY --chown=onyx:onyx ./scripts/force_delete_connector_by_id.py /app/scripts/force_delete_connector_by_id.py
COPY --chown=onyx:onyx ./scripts/supervisord_entrypoint.sh /app/scripts/supervisord_entrypoint.sh
COPY --chown=onyx:onyx ./scripts/setup_craft_templates.sh /app/scripts/setup_craft_templates.sh
COPY --chown=onyx:onyx ./scripts/reencrypt_secrets.py /app/scripts/reencrypt_secrets.py
RUN chmod +x /app/scripts/supervisord_entrypoint.sh /app/scripts/setup_craft_templates.sh
# Run Craft template setup at build time when ENABLE_CRAFT=true
@@ -164,6 +167,13 @@ ENV PYTHONPATH=/app
ARG ONYX_VERSION=0.0.0-dev
ENV ONYX_VERSION=${ONYX_VERSION}
# Use jemalloc instead of glibc malloc to reduce memory fragmentation
# in long-running Python processes (API server, Celery workers).
# The soname is architecture-independent; the dynamic linker resolves
# the correct path from standard library directories.
# Placed after all RUN steps so build-time processes are unaffected.
ENV LD_PRELOAD=libjemalloc.so.2
# Default command which does nothing
# This container is used by api server and background which specify their own CMD
CMD ["tail", "-f", "/dev/null"]

View File

@@ -0,0 +1,43 @@
"""add timestamps to user table
Revision ID: 27fb147a843f
Revises: b5c4d7e8f9a1
Create Date: 2026-03-08 17:18:40.828644
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "27fb147a843f"
down_revision = "b5c4d7e8f9a1"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Add ``created_at`` and ``updated_at`` columns to the ``user`` table.

    Both columns are timezone-aware, non-nullable, and carry a server-side
    default of ``now()``.
    """
    # created_at is added first, then updated_at (downgrade reverses this).
    for column_name in ("created_at", "updated_at"):
        timestamp_column = sa.Column(
            column_name,
            sa.DateTime(timezone=True),
            server_default=sa.func.now(),
            nullable=False,
        )
        op.add_column("user", timestamp_column)
def downgrade() -> None:
    """Remove the timestamp columns from ``user``, newest column first."""
    for column_name in ("updated_at", "created_at"):
        op.drop_column("user", column_name)

View File

@@ -0,0 +1,51 @@
"""add hierarchy_node_by_connector_credential_pair table
Revision ID: b5c4d7e8f9a1
Revises: a3b8d9e2f1c4
Create Date: 2026-03-04
"""
import sqlalchemy as sa
from alembic import op
revision = "b5c4d7e8f9a1"
down_revision = "a3b8d9e2f1c4"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Create the ``hierarchy_node_by_connector_credential_pair`` join table.

    Each row links one hierarchy node to one connector/credential pair. Both
    foreign keys cascade on delete, and a secondary index covers the
    ``(connector_id, credential_id)`` columns.
    """
    table_name = "hierarchy_node_by_connector_credential_pair"
    cc_pair_columns = ("connector_id", "credential_id")

    # Rows disappear automatically when the referenced hierarchy node is deleted.
    hierarchy_node_fk = sa.ForeignKeyConstraint(
        ["hierarchy_node_id"],
        ["hierarchy_node.id"],
        ondelete="CASCADE",
    )
    # Composite FK onto the connector_credential_pair table, also cascading.
    cc_pair_fk = sa.ForeignKeyConstraint(
        list(cc_pair_columns),
        [
            "connector_credential_pair.connector_id",
            "connector_credential_pair.credential_id",
        ],
        ondelete="CASCADE",
    )

    op.create_table(
        table_name,
        sa.Column("hierarchy_node_id", sa.Integer(), nullable=False),
        sa.Column("connector_id", sa.Integer(), nullable=False),
        sa.Column("credential_id", sa.Integer(), nullable=False),
        hierarchy_node_fk,
        cc_pair_fk,
        sa.PrimaryKeyConstraint("hierarchy_node_id", *cc_pair_columns),
    )

    op.create_index(
        "ix_hierarchy_node_cc_pair_connector_credential",
        table_name,
        list(cc_pair_columns),
    )
def downgrade() -> None:
    """Drop the join table's secondary index, then the table itself."""
    table_name = "hierarchy_node_by_connector_credential_pair"
    op.drop_index(
        "ix_hierarchy_node_cc_pair_connector_credential",
        table_name=table_name,
    )
    op.drop_table(table_name)

View File

@@ -11,7 +11,6 @@ from sqlalchemy import text
from alembic import op
from onyx.configs.app_configs import DB_READONLY_PASSWORD
from onyx.configs.app_configs import DB_READONLY_USER
from shared_configs.configs import MULTI_TENANT
# revision identifiers, used by Alembic.
@@ -22,59 +21,52 @@ depends_on = None
def upgrade() -> None:
if MULTI_TENANT:
# Enable pg_trgm extension if not already enabled
op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")
# Enable pg_trgm extension if not already enabled
op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")
# Create the read-only db user if it does not already exist.
if not (DB_READONLY_USER and DB_READONLY_PASSWORD):
raise Exception("DB_READONLY_USER or DB_READONLY_PASSWORD is not set")
# Create read-only db user here only in multi-tenant mode. For single-tenant mode,
# the user is created in the standard migration.
if not (DB_READONLY_USER and DB_READONLY_PASSWORD):
raise Exception("DB_READONLY_USER or DB_READONLY_PASSWORD is not set")
op.execute(
text(
f"""
DO $$
BEGIN
-- Check if the read-only user already exists
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
-- Create the read-only user with the specified password
EXECUTE format('CREATE USER %I WITH PASSWORD %L', '{DB_READONLY_USER}', '{DB_READONLY_PASSWORD}');
-- First revoke all privileges to ensure a clean slate
EXECUTE format('REVOKE ALL ON DATABASE %I FROM %I', current_database(), '{DB_READONLY_USER}');
-- Grant only the CONNECT privilege to allow the user to connect to the database
-- but not perform any operations without additional specific grants
EXECUTE format('GRANT CONNECT ON DATABASE %I TO %I', current_database(), '{DB_READONLY_USER}');
END IF;
END
$$;
"""
)
)
def downgrade() -> None:
if MULTI_TENANT:
# Drop read-only db user here only in single tenant mode. For multi-tenant mode,
# the user is dropped in the alembic_tenants migration.
op.execute(
text(
f"""
op.execute(
text(
f"""
DO $$
BEGIN
IF EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
-- First revoke all privileges from the database
-- Check if the read-only user already exists
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
-- Create the read-only user with the specified password
EXECUTE format('CREATE USER %I WITH PASSWORD %L', '{DB_READONLY_USER}', '{DB_READONLY_PASSWORD}');
-- First revoke all privileges to ensure a clean slate
EXECUTE format('REVOKE ALL ON DATABASE %I FROM %I', current_database(), '{DB_READONLY_USER}');
-- Then revoke all privileges from the public schema
EXECUTE format('REVOKE ALL ON SCHEMA public FROM %I', '{DB_READONLY_USER}');
-- Then drop the user
EXECUTE format('DROP USER %I', '{DB_READONLY_USER}');
-- Grant only the CONNECT privilege to allow the user to connect to the database
-- but not perform any operations without additional specific grants
EXECUTE format('GRANT CONNECT ON DATABASE %I TO %I', current_database(), '{DB_READONLY_USER}');
END IF;
END
$$;
"""
)
"""
)
op.execute(text("DROP EXTENSION IF EXISTS pg_trgm"))
)
def downgrade() -> None:
op.execute(
text(
f"""
DO $$
BEGIN
IF EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{DB_READONLY_USER}') THEN
-- First revoke all privileges from the database
EXECUTE format('REVOKE ALL ON DATABASE %I FROM %I', current_database(), '{DB_READONLY_USER}');
-- Then revoke all privileges from the public schema
EXECUTE format('REVOKE ALL ON SCHEMA public FROM %I', '{DB_READONLY_USER}');
-- Then drop the user
EXECUTE format('DROP USER %I', '{DB_READONLY_USER}');
END IF;
END
$$;
"""
)
)
op.execute(text("DROP EXTENSION IF EXISTS pg_trgm"))

View File

@@ -9,12 +9,15 @@ from onyx.access.access import (
_get_access_for_documents as get_access_for_documents_without_groups,
)
from onyx.access.access import _get_acl_for_user as get_acl_for_user_without_groups
from onyx.access.access import collect_user_file_access
from onyx.access.models import DocumentAccess
from onyx.access.utils import prefix_external_group
from onyx.access.utils import prefix_user_group
from onyx.db.document import get_document_sources
from onyx.db.document import get_documents_by_ids
from onyx.db.models import User
from onyx.db.models import UserFile
from onyx.db.user_file import fetch_user_files_with_access_relationships
from onyx.utils.logger import setup_logger
@@ -116,6 +119,68 @@ def _get_access_for_documents(
return access_map
def _collect_user_file_group_names(user_file: UserFile) -> set[str]:
    """Return the names of every user group attached to the file's personas.

    Walks ``user_file.assistants`` and each persona's ``groups`` relationship
    (callers are expected to have loaded these already). Personas flagged as
    ``deleted`` contribute no group names.
    """
    return {
        group.name
        for persona in user_file.assistants
        if not persona.deleted
        for group in persona.groups
    }
def get_access_for_user_files_impl(
    user_file_ids: list[str],
    db_session: Session,
) -> dict[str, DocumentAccess]:
    """EE variant of the user-file ACL lookup, keyed by user file id.

    Loads the requested user files through
    ``fetch_user_files_with_access_relationships`` with
    ``eager_load_groups=True`` so that the group relationships needed by the
    EE path are fetched together with the ones the MIT path needs, then
    delegates to ``build_access_for_user_files_impl``.

    NOTE: is imported in onyx.access.access by `fetch_versioned_implementation`
    DO NOT REMOVE.
    """
    return build_access_for_user_files_impl(
        fetch_user_files_with_access_relationships(
            user_file_ids, db_session, eager_load_groups=True
        )
    )
def build_access_for_user_files_impl(
    user_files: list[UserFile],
) -> dict[str, DocumentAccess]:
    """EE variant: compute access for already-loaded UserFile objects.

    Returns a mapping from ``str(user_file.id)`` to its ``DocumentAccess``.
    Persona.groups is expected to be eagerly loaded by the caller.

    NOTE: is imported in onyx.access.access by `fetch_versioned_implementation`
    DO NOT REMOVE.
    """
    access_by_file_id: dict[str, DocumentAccess] = {}

    for user_file in user_files:
        if user_file.user is None:
            # A file without an owning user is public and has no explicit
            # email or group entries.
            emails: list[str] = []
            groups: list[str] = []
            public = True
        else:
            collected_emails, public = collect_user_file_access(user_file)
            emails = list(collected_emails)
            groups = list(_collect_user_file_group_names(user_file))

        access_by_file_id[str(user_file.id)] = DocumentAccess.build(
            user_emails=emails,
            user_groups=groups,
            is_public=public,
            external_user_emails=[],
            external_user_group_ids=[],
        )

    return access_by_file_id
def _get_acl_for_user(user: User, db_session: Session) -> set[str]:
"""Returns a list of ACL entries that the user has access to. This is meant to be
used downstream to filter out documents that the user does not have access to. The

View File

@@ -1,3 +1,4 @@
import os
from datetime import datetime
import jwt
@@ -20,7 +21,13 @@ logger = setup_logger()
def verify_auth_setting() -> None:
# All the Auth flows are valid for EE version
# All the Auth flows are valid for EE version, but warn about deprecated 'disabled'
raw_auth_type = (os.environ.get("AUTH_TYPE") or "").lower()
if raw_auth_type == "disabled":
logger.warning(
"AUTH_TYPE='disabled' is no longer supported. "
"Using 'basic' instead. Please update your configuration."
)
logger.notice(f"Using Auth Type: {AUTH_TYPE.value}")

View File

@@ -18,7 +18,7 @@ from onyx.db.models import HierarchyNode
def _build_hierarchy_access_filter(
user_email: str | None,
user_email: str,
external_group_ids: list[str],
) -> ColumnElement[bool]:
"""Build SQLAlchemy filter for hierarchy node access.
@@ -43,7 +43,7 @@ def _build_hierarchy_access_filter(
def _get_accessible_hierarchy_nodes_for_source(
db_session: Session,
source: DocumentSource,
user_email: str | None,
user_email: str,
external_group_ids: list[str],
) -> list[HierarchyNode]:
"""

View File

@@ -7,6 +7,7 @@ from onyx.db.models import Persona
from onyx.db.models import Persona__User
from onyx.db.models import Persona__UserGroup
from onyx.db.notification import create_notification
from onyx.db.persona import mark_persona_user_files_for_sync
from onyx.server.features.persona.models import PersonaSharedNotificationData
@@ -26,7 +27,9 @@ def update_persona_access(
NOTE: Callers are responsible for committing."""
needs_sync = False
if is_public is not None:
needs_sync = True
persona = db_session.query(Persona).filter(Persona.id == persona_id).first()
if persona:
persona.is_public = is_public
@@ -35,6 +38,7 @@ def update_persona_access(
# and a non-empty list means "replace with these shares".
if user_ids is not None:
needs_sync = True
db_session.query(Persona__User).filter(
Persona__User.persona_id == persona_id
).delete(synchronize_session="fetch")
@@ -54,6 +58,7 @@ def update_persona_access(
)
if group_ids is not None:
needs_sync = True
db_session.query(Persona__UserGroup).filter(
Persona__UserGroup.persona_id == persona_id
).delete(synchronize_session="fetch")
@@ -63,3 +68,7 @@ def update_persona_access(
db_session.add(
Persona__UserGroup(persona_id=persona_id, user_group_id=group_id)
)
# When sharing changes, user file ACLs need to be updated in the vector DB
if needs_sync:
mark_persona_user_files_for_sync(persona_id, db_session)

View File

@@ -14,67 +14,91 @@ from onyx.utils.variable_functionality import fetch_versioned_implementation
logger = setup_logger()
@lru_cache(maxsize=1)
@lru_cache(maxsize=2)
def _get_trimmed_key(key: str) -> bytes:
encoded_key = key.encode()
key_length = len(encoded_key)
if key_length < 16:
raise RuntimeError("Invalid ENCRYPTION_KEY_SECRET - too short")
elif key_length > 32:
key = key[:32]
elif key_length not in (16, 24, 32):
valid_lengths = [16, 24, 32]
key = key[: min(valid_lengths, key=lambda x: abs(x - key_length))]
return encoded_key
# Trim to the largest valid AES key size that fits
valid_lengths = [32, 24, 16]
for size in valid_lengths:
if key_length >= size:
return encoded_key[:size]
raise AssertionError("unreachable")
def _encrypt_string(input_str: str) -> bytes:
if not ENCRYPTION_KEY_SECRET:
def _encrypt_string(input_str: str, key: str | None = None) -> bytes:
effective_key = key if key is not None else ENCRYPTION_KEY_SECRET
if not effective_key:
return input_str.encode()
key = _get_trimmed_key(ENCRYPTION_KEY_SECRET)
trimmed = _get_trimmed_key(effective_key)
iv = urandom(16)
padder = padding.PKCS7(algorithms.AES.block_size).padder()
padded_data = padder.update(input_str.encode()) + padder.finalize()
cipher = Cipher(algorithms.AES(key), modes.CBC(iv), backend=default_backend())
cipher = Cipher(algorithms.AES(trimmed), modes.CBC(iv), backend=default_backend())
encryptor = cipher.encryptor()
encrypted_data = encryptor.update(padded_data) + encryptor.finalize()
return iv + encrypted_data
def _decrypt_bytes(input_bytes: bytes) -> str:
if not ENCRYPTION_KEY_SECRET:
def _decrypt_bytes(input_bytes: bytes, key: str | None = None) -> str:
effective_key = key if key is not None else ENCRYPTION_KEY_SECRET
if not effective_key:
return input_bytes.decode()
key = _get_trimmed_key(ENCRYPTION_KEY_SECRET)
iv = input_bytes[:16]
encrypted_data = input_bytes[16:]
trimmed = _get_trimmed_key(effective_key)
try:
iv = input_bytes[:16]
encrypted_data = input_bytes[16:]
cipher = Cipher(algorithms.AES(key), modes.CBC(iv), backend=default_backend())
decryptor = cipher.decryptor()
decrypted_padded_data = decryptor.update(encrypted_data) + decryptor.finalize()
cipher = Cipher(
algorithms.AES(trimmed), modes.CBC(iv), backend=default_backend()
)
decryptor = cipher.decryptor()
decrypted_padded_data = decryptor.update(encrypted_data) + decryptor.finalize()
unpadder = padding.PKCS7(algorithms.AES.block_size).unpadder()
decrypted_data = unpadder.update(decrypted_padded_data) + unpadder.finalize()
unpadder = padding.PKCS7(algorithms.AES.block_size).unpadder()
decrypted_data = unpadder.update(decrypted_padded_data) + unpadder.finalize()
return decrypted_data.decode()
return decrypted_data.decode()
except (ValueError, UnicodeDecodeError):
if key is not None:
# Explicit key was provided — don't fall back silently
raise
# Read path: attempt raw UTF-8 decode as a fallback for legacy data.
# Does NOT handle data encrypted with a different key — that
# ciphertext is not valid UTF-8 and will raise below.
logger.warning(
"AES decryption failed — falling back to raw decode. "
"Run the re-encrypt secrets script to rotate to the current key."
)
try:
return input_bytes.decode()
except UnicodeDecodeError:
raise ValueError(
"Data is not valid UTF-8 — likely encrypted with a different key. "
"Run the re-encrypt secrets script to rotate to the current key."
) from None
def encrypt_string_to_bytes(input_str: str) -> bytes:
def encrypt_string_to_bytes(input_str: str, key: str | None = None) -> bytes:
versioned_encryption_fn = fetch_versioned_implementation(
"onyx.utils.encryption", "_encrypt_string"
)
return versioned_encryption_fn(input_str)
return versioned_encryption_fn(input_str, key=key)
def decrypt_bytes_to_string(input_bytes: bytes) -> str:
def decrypt_bytes_to_string(input_bytes: bytes, key: str | None = None) -> str:
versioned_decryption_fn = fetch_versioned_implementation(
"onyx.utils.encryption", "_decrypt_bytes"
)
return versioned_decryption_fn(input_bytes)
return versioned_decryption_fn(input_bytes, key=key)
def test_encryption() -> None:

View File

@@ -1,7 +1,6 @@
from collections.abc import Callable
from typing import cast
from sqlalchemy.orm import joinedload
from sqlalchemy.orm import Session
from onyx.access.models import DocumentAccess
@@ -12,6 +11,7 @@ from onyx.db.document import get_access_info_for_document
from onyx.db.document import get_access_info_for_documents
from onyx.db.models import User
from onyx.db.models import UserFile
from onyx.db.user_file import fetch_user_files_with_access_relationships
from onyx.utils.variable_functionality import fetch_ee_implementation_or_noop
from onyx.utils.variable_functionality import fetch_versioned_implementation
@@ -132,19 +132,61 @@ def get_access_for_user_files(
user_file_ids: list[str],
db_session: Session,
) -> dict[str, DocumentAccess]:
user_files = (
db_session.query(UserFile)
.options(joinedload(UserFile.user)) # Eager load the user relationship
.filter(UserFile.id.in_(user_file_ids))
.all()
versioned_fn = fetch_versioned_implementation(
"onyx.access.access", "get_access_for_user_files_impl"
)
return {
str(user_file.id): DocumentAccess.build(
user_emails=[user_file.user.email] if user_file.user else [],
return versioned_fn(user_file_ids, db_session)
def get_access_for_user_files_impl(
    user_file_ids: list[str],
    db_session: Session,
) -> dict[str, DocumentAccess]:
    """Load the requested user files and compute their document access.

    The files are fetched through ``fetch_user_files_with_access_relationships``
    and passed straight to ``build_access_for_user_files_impl``; the mapping
    that function produces is returned unchanged.
    """
    return build_access_for_user_files_impl(
        fetch_user_files_with_access_relationships(user_file_ids, db_session)
    )
def build_access_for_user_files(
    user_files: list[UserFile],
) -> dict[str, DocumentAccess]:
    """Compute access from UserFile objects that are already loaded.

    Resolves ``build_access_for_user_files_impl`` through
    ``fetch_versioned_implementation`` and delegates to it.

    Callers must ensure UserFile.user, Persona.users, and Persona.user are
    eagerly loaded (and Persona.groups for the EE path).
    """
    build_impl = fetch_versioned_implementation(
        "onyx.access.access", "build_access_for_user_files_impl"
    )
    return build_impl(user_files)
def build_access_for_user_files_impl(
user_files: list[UserFile],
) -> dict[str, DocumentAccess]:
result: dict[str, DocumentAccess] = {}
for user_file in user_files:
emails, is_public = collect_user_file_access(user_file)
result[str(user_file.id)] = DocumentAccess.build(
user_emails=list(emails),
user_groups=[],
is_public=True if user_file.user is None else False,
is_public=is_public,
external_user_emails=[],
external_user_group_ids=[],
)
for user_file in user_files
}
return result
def collect_user_file_access(user_file: UserFile) -> tuple[set[str], bool]:
    """Collect all user emails that should have access to this user file.

    Includes the owner (when one is set) plus any users who have access via
    non-deleted personas the file is attached to: each persona's own user
    and every user the persona is shared with.

    Args:
        user_file: The file to inspect. ``user``, ``assistants`` and each
            persona's ``user`` / ``users`` are read, so they should already
            be loaded.

    Returns:
        ``(emails, is_public)`` where ``is_public`` is True when at least one
        non-deleted attached persona is public.
    """
    emails: set[str] = set()

    # A file without an owner contributes no owner email instead of raising
    # AttributeError and aborting access computation for the whole batch.
    owner = user_file.user
    if owner is not None:
        emails.add(owner.email)

    is_public = False
    for persona in user_file.assistants:
        # Deleted personas grant nothing.
        if persona.deleted:
            continue
        if persona.is_public:
            is_public = True
        if persona.user_id is not None and persona.user:
            emails.add(persona.user.email)
        emails.update(shared_user.email for shared_user in persona.users)
    return emails, is_public

View File

@@ -1,4 +1,5 @@
import json
import os
import random
import secrets
import string
@@ -145,10 +146,22 @@ def is_user_admin(user: User) -> bool:
def verify_auth_setting() -> None:
if AUTH_TYPE == AuthType.CLOUD:
"""Log warnings for AUTH_TYPE issues.
This only runs on app startup not during migrations/scripts.
"""
raw_auth_type = (os.environ.get("AUTH_TYPE") or "").lower()
if raw_auth_type == "cloud":
raise ValueError(
f"{AUTH_TYPE.value} is not a valid auth type for self-hosted deployments."
"'cloud' is not a valid auth type for self-hosted deployments."
)
if raw_auth_type == "disabled":
logger.warning(
"AUTH_TYPE='disabled' is no longer supported. "
"Using 'basic' instead. Please update your configuration."
)
logger.notice(f"Using Auth Type: {AUTH_TYPE.value}")

View File

@@ -115,8 +115,6 @@ def _extract_from_batch(
for item in doc_list:
if isinstance(item, HierarchyNode):
hierarchy_nodes.append(item)
if item.raw_node_id not in ids:
ids[item.raw_node_id] = None
elif isinstance(item, ConnectorFailure):
failed_id = _get_failure_id(item)
if failed_id:
@@ -125,8 +123,7 @@ def _extract_from_batch(
f"Failed to retrieve document {failed_id}: " f"{item.failure_message}"
)
else:
parent_raw = getattr(item, "parent_hierarchy_raw_node_id", None)
ids[item.id] = parent_raw
ids[item.id] = item.parent_hierarchy_raw_node_id
return BatchResult(raw_id_to_parent=ids, hierarchy_nodes=hierarchy_nodes)
@@ -192,9 +189,7 @@ def extract_ids_from_runnable_connector(
batch_ids = batch_result.raw_id_to_parent
batch_nodes = batch_result.hierarchy_nodes
doc_batch_processing_func(batch_ids)
for k, v in batch_ids.items():
if v is not None or k not in all_raw_id_to_parent:
all_raw_id_to_parent[k] = v
all_raw_id_to_parent.update(batch_ids)
all_hierarchy_nodes.extend(batch_nodes)
if callback:

View File

@@ -40,6 +40,7 @@ from onyx.db.connector_credential_pair import get_connector_credential_pair_from
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.enums import AccessType
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.hierarchy import upsert_hierarchy_node_cc_pair_entries
from onyx.db.hierarchy import upsert_hierarchy_nodes_batch
from onyx.db.models import ConnectorCredentialPair
from onyx.redis.redis_hierarchy import cache_hierarchy_nodes_batch
@@ -289,6 +290,14 @@ def _run_hierarchy_extraction(
is_connector_public=is_connector_public,
)
upsert_hierarchy_node_cc_pair_entries(
db_session=db_session,
hierarchy_node_ids=[n.id for n in upserted_nodes],
connector_id=cc_pair.connector_id,
credential_id=cc_pair.credential_id,
commit=True,
)
# Cache in Redis for fast ancestor resolution
cache_entries = [
HierarchyNodeCacheEntry.from_db_model(node) for node in upserted_nodes

View File

@@ -11,6 +11,9 @@
# lock after its cleanup which happens at most after its soft timeout.
# Constants corresponding to migrate_documents_from_vespa_to_opensearch_task.
from onyx.configs.app_configs import OPENSEARCH_MIGRATION_GET_VESPA_CHUNKS_PAGE_SIZE
MIGRATION_TASK_SOFT_TIME_LIMIT_S = 60 * 5 # 5 minutes.
MIGRATION_TASK_TIME_LIMIT_S = 60 * 6 # 6 minutes.
# The maximum time the lock can be held for. Will automatically be released
@@ -44,7 +47,7 @@ TOTAL_ALLOWABLE_DOC_MIGRATION_ATTEMPTS_BEFORE_PERMANENT_FAILURE = 15
# WARNING: Do not change these values without knowing what changes also need to
# be made to OpenSearchTenantMigrationRecord.
GET_VESPA_CHUNKS_PAGE_SIZE = 500
GET_VESPA_CHUNKS_PAGE_SIZE = OPENSEARCH_MIGRATION_GET_VESPA_CHUNKS_PAGE_SIZE
GET_VESPA_CHUNKS_SLICE_COUNT = 4
# String used to indicate in the vespa_visit_continuation_token mapping that the

View File

@@ -48,10 +48,15 @@ from onyx.db.enums import AccessType
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.enums import SyncStatus
from onyx.db.enums import SyncType
from onyx.db.hierarchy import delete_orphaned_hierarchy_nodes
from onyx.db.hierarchy import link_hierarchy_nodes_to_documents
from onyx.db.hierarchy import remove_stale_hierarchy_node_cc_pair_entries
from onyx.db.hierarchy import reparent_orphaned_hierarchy_nodes
from onyx.db.hierarchy import update_document_parent_hierarchy_nodes
from onyx.db.hierarchy import upsert_hierarchy_node_cc_pair_entries
from onyx.db.hierarchy import upsert_hierarchy_nodes_batch
from onyx.db.models import ConnectorCredentialPair
from onyx.db.models import HierarchyNode as DBHierarchyNode
from onyx.db.sync_record import insert_sync_record
from onyx.db.sync_record import update_sync_record_status
from onyx.db.tag import delete_orphan_tags__no_commit
@@ -60,6 +65,7 @@ from onyx.redis.redis_connector_prune import RedisConnectorPrune
from onyx.redis.redis_connector_prune import RedisConnectorPrunePayload
from onyx.redis.redis_hierarchy import cache_hierarchy_nodes_batch
from onyx.redis.redis_hierarchy import ensure_source_node_exists
from onyx.redis.redis_hierarchy import evict_hierarchy_nodes_from_cache
from onyx.redis.redis_hierarchy import get_node_id_from_raw_id
from onyx.redis.redis_hierarchy import get_source_node_id_from_cache
from onyx.redis.redis_hierarchy import HierarchyNodeCacheEntry
@@ -579,11 +585,12 @@ def connector_pruning_generator_task(
source = cc_pair.connector.source
redis_client = get_redis_client(tenant_id=tenant_id)
ensure_source_node_exists(redis_client, db_session, source)
upserted_nodes: list[DBHierarchyNode] = []
if extraction_result.hierarchy_nodes:
is_connector_public = cc_pair.access_type == AccessType.PUBLIC
ensure_source_node_exists(redis_client, db_session, source)
upserted_nodes = upsert_hierarchy_nodes_batch(
db_session=db_session,
nodes=extraction_result.hierarchy_nodes,
@@ -592,6 +599,14 @@ def connector_pruning_generator_task(
is_connector_public=is_connector_public,
)
upsert_hierarchy_node_cc_pair_entries(
db_session=db_session,
hierarchy_node_ids=[n.id for n in upserted_nodes],
connector_id=connector_id,
credential_id=credential_id,
commit=True,
)
cache_entries = [
HierarchyNodeCacheEntry.from_db_model(node)
for node in upserted_nodes
@@ -607,7 +622,6 @@ def connector_pruning_generator_task(
f"hierarchy nodes for cc_pair={cc_pair_id}"
)
ensure_source_node_exists(redis_client, db_session, source)
# Resolve parent_hierarchy_raw_node_id → parent_hierarchy_node_id
# and bulk-update documents, mirroring the docfetching resolution
_resolve_and_update_document_parents(
@@ -664,6 +678,43 @@ def connector_pruning_generator_task(
)
redis_connector.prune.generator_complete = tasks_generated
# --- Hierarchy node pruning ---
live_node_ids = {n.id for n in upserted_nodes}
stale_removed = remove_stale_hierarchy_node_cc_pair_entries(
db_session=db_session,
connector_id=connector_id,
credential_id=credential_id,
live_hierarchy_node_ids=live_node_ids,
commit=True,
)
deleted_raw_ids = delete_orphaned_hierarchy_nodes(
db_session=db_session,
source=source,
commit=True,
)
reparented_nodes = reparent_orphaned_hierarchy_nodes(
db_session=db_session,
source=source,
commit=True,
)
if deleted_raw_ids:
evict_hierarchy_nodes_from_cache(redis_client, source, deleted_raw_ids)
if reparented_nodes:
reparented_cache_entries = [
HierarchyNodeCacheEntry.from_db_model(node)
for node in reparented_nodes
]
cache_hierarchy_nodes_batch(
redis_client, source, reparented_cache_entries
)
if stale_removed or deleted_raw_ids or reparented_nodes:
task_logger.info(
f"Hierarchy node pruning: cc_pair={cc_pair_id} "
f"stale_entries_removed={stale_removed} "
f"nodes_deleted={len(deleted_raw_ids)} "
f"nodes_reparented={len(reparented_nodes)}"
)
except Exception as e:
task_logger.exception(
f"Pruning exceptioned: cc_pair={cc_pair_id} "

View File

@@ -12,9 +12,9 @@ from redis import Redis
from redis.lock import Lock as RedisLock
from retry import retry
from sqlalchemy import select
from sqlalchemy.orm import selectinload
from sqlalchemy.orm import Session
from onyx.access.access import build_access_for_user_files
from onyx.background.celery.apps.app_base import task_logger
from onyx.background.celery.celery_redis import celery_get_queue_length
from onyx.background.celery.celery_utils import httpx_init_vespa_pool
@@ -43,7 +43,9 @@ from onyx.db.enums import UserFileStatus
from onyx.db.models import UserFile
from onyx.db.search_settings import get_active_search_settings
from onyx.db.search_settings import get_active_search_settings_list
from onyx.db.user_file import fetch_user_files_with_access_relationships
from onyx.document_index.factory import get_all_document_indices
from onyx.document_index.interfaces import VespaDocumentFields
from onyx.document_index.interfaces import VespaDocumentUserFields
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
from onyx.file_store.file_store import get_default_file_store
@@ -54,6 +56,7 @@ from onyx.indexing.adapters.user_file_indexing_adapter import UserFileIndexingAd
from onyx.indexing.embedder import DefaultIndexingEmbedder
from onyx.indexing.indexing_pipeline import run_indexing_pipeline
from onyx.redis.redis_pool import get_redis_client
from onyx.utils.variable_functionality import global_version
def _as_uuid(value: str | UUID) -> UUID:
@@ -791,11 +794,12 @@ def project_sync_user_file_impl(
try:
with get_session_with_current_tenant() as db_session:
user_file = db_session.execute(
select(UserFile)
.where(UserFile.id == _as_uuid(user_file_id))
.options(selectinload(UserFile.assistants))
).scalar_one_or_none()
user_files = fetch_user_files_with_access_relationships(
[user_file_id],
db_session,
eager_load_groups=global_version.is_ee_version(),
)
user_file = user_files[0] if user_files else None
if not user_file:
task_logger.info(
f"project_sync_user_file_impl - User file not found id={user_file_id}"
@@ -823,12 +827,21 @@ def project_sync_user_file_impl(
project_ids = [project.id for project in user_file.projects]
persona_ids = [p.id for p in user_file.assistants if not p.deleted]
file_id_str = str(user_file.id)
access_map = build_access_for_user_files([user_file])
access = access_map.get(file_id_str)
for retry_document_index in retry_document_indices:
retry_document_index.update_single(
doc_id=str(user_file.id),
doc_id=file_id_str,
tenant_id=tenant_id,
chunk_count=user_file.chunk_count,
fields=None,
fields=(
VespaDocumentFields(access=access)
if access is not None
else None
),
user_fields=VespaDocumentUserFields(
user_projects=project_ids,
personas=persona_ids,

View File

@@ -45,6 +45,7 @@ from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.enums import IndexingStatus
from onyx.db.enums import IndexModelStatus
from onyx.db.enums import ProcessingMode
from onyx.db.hierarchy import upsert_hierarchy_node_cc_pair_entries
from onyx.db.hierarchy import upsert_hierarchy_nodes_batch
from onyx.db.index_attempt import create_index_attempt_error
from onyx.db.index_attempt import get_index_attempt
@@ -587,6 +588,14 @@ def connector_document_extraction(
is_connector_public=is_connector_public,
)
upsert_hierarchy_node_cc_pair_entries(
db_session=db_session,
hierarchy_node_ids=[n.id for n in upserted_nodes],
connector_id=db_connector.id,
credential_id=db_credential.id,
commit=True,
)
# Cache in Redis for fast ancestor resolution during doc processing
redis_client = get_redis_client(tenant_id=tenant_id)
cache_entries = [

View File

@@ -15,6 +15,7 @@ from onyx.chat.citation_processor import DynamicCitationProcessor
from onyx.chat.emitter import Emitter
from onyx.chat.models import ChatMessageSimple
from onyx.chat.models import LlmStepResult
from onyx.chat.tool_call_args_streaming import maybe_emit_argument_delta
from onyx.configs.app_configs import LOG_ONYX_MODEL_INTERACTIONS
from onyx.configs.app_configs import PROMPT_CACHE_CHAT_HISTORY
from onyx.configs.constants import MessageType
@@ -54,6 +55,7 @@ from onyx.server.query_and_chat.streaming_models import ReasoningStart
from onyx.tools.models import ToolCallKickoff
from onyx.tracing.framework.create import generation_span
from onyx.utils.b64 import get_image_type_from_bytes
from onyx.utils.jsonriver import Parser
from onyx.utils.logger import setup_logger
from onyx.utils.postgres_sanitization import sanitize_string
from onyx.utils.text_processing import find_all_json_objects
@@ -1009,6 +1011,7 @@ def run_llm_step_pkt_generator(
)
id_to_tool_call_map: dict[int, dict[str, Any]] = {}
arg_parsers: dict[int, Parser] = {}
reasoning_start = False
answer_start = False
accumulated_reasoning = ""
@@ -1215,7 +1218,14 @@ def run_llm_step_pkt_generator(
yield from _close_reasoning_if_active()
for tool_call_delta in delta.tool_calls:
# maybe_emit_argument_delta depends on _update_tool_call_with_delta being called first so the delta is already attached
_update_tool_call_with_delta(id_to_tool_call_map, tool_call_delta)
yield from maybe_emit_argument_delta(
tool_calls_in_progress=id_to_tool_call_map,
tool_call_delta=tool_call_delta,
placement=_current_placement(),
parsers=arg_parsers,
)
# Flush any tail text buffered while checking for split "<function_calls" markers.
filtered_content_tail = xml_tool_call_content_filter.flush()

View File

@@ -0,0 +1,77 @@
from collections.abc import Generator
from collections.abc import Mapping
from typing import Any
from typing import Type
from onyx.llm.model_response import ChatCompletionDeltaToolCall
from onyx.server.query_and_chat.placement import Placement
from onyx.server.query_and_chat.streaming_models import Packet
from onyx.server.query_and_chat.streaming_models import ToolCallArgumentDelta
from onyx.tools.built_in_tools import TOOL_NAME_TO_CLASS
from onyx.tools.interface import Tool
from onyx.utils.jsonriver import Parser
def _get_tool_class(
    tool_calls_in_progress: Mapping[int, Mapping[str, Any]],
    tool_call_delta: ChatCompletionDeltaToolCall,
) -> Type[Tool] | None:
    """Resolve the Tool subclass that a streaming tool-call delta belongs to.

    Returns None when no call is tracked at the delta's index, when the
    tracked call has no (non-empty) "name" yet, or when the name is not a key
    of ``TOOL_NAME_TO_CLASS``.
    """
    call_index = tool_call_delta.index
    if call_index not in tool_calls_in_progress:
        return None

    name = tool_calls_in_progress[call_index].get("name")
    if name:
        return TOOL_NAME_TO_CLASS.get(name)
    return None
def maybe_emit_argument_delta(
    tool_calls_in_progress: Mapping[int, Mapping[str, Any]],
    tool_call_delta: ChatCompletionDeltaToolCall,
    placement: Placement,
    parsers: dict[int, Parser],
) -> Generator[Packet, None, None]:
    """Stream decoded tool-call argument deltas to the frontend.

    One ``jsonriver.Parser`` is kept per tool-call index and fed each new
    fragment of the JSON argument string; the string-valued content it
    reports is concatenated per argument name and emitted as a single
    ``ToolCallArgumentDelta`` packet.

    NOTE: Non-string arguments (numbers, booleans, null, arrays, objects)
    are skipped — they are available in the final tool-call kickoff packet.

    ``parsers`` is mutated in place: a ``Parser`` is created and stored the
    first time an index is seen.

    Nothing is yielded when the tool class cannot be resolved or opts out via
    ``should_emit_argument_deltas()``, when the delta carries no argument
    fragment, or when the fragment produced no string content.
    """
    tool_cls = _get_tool_class(tool_calls_in_progress, tool_call_delta)
    if not tool_cls:
        return
    if not tool_cls.should_emit_argument_deltas():
        return

    function_delta = tool_call_delta.function
    fragment = function_delta.arguments if function_delta else None
    if not fragment:
        return

    call_index = tool_call_delta.index
    if call_index not in parsers:
        parsers[call_index] = Parser()
    arg_parser = parsers[call_index]

    # Merge everything the parser reports for this fragment, per argument.
    appended: dict[str, str] = {}
    for parsed in arg_parser.feed(fragment):
        if not isinstance(parsed, dict):
            continue
        for arg_name, arg_value in parsed.items():
            if isinstance(arg_value, str):
                appended[arg_name] = appended.get(arg_name, "") + arg_value

    if not appended:
        return

    call_state = tool_calls_in_progress[call_index]
    yield Packet(
        placement=placement,
        obj=ToolCallArgumentDelta(
            tool_type=call_state.get("name", ""),
            argument_deltas=appended,
        ),
    )

View File

@@ -68,6 +68,10 @@ FILE_TOKEN_COUNT_THRESHOLD = int(
os.environ.get("FILE_TOKEN_COUNT_THRESHOLD", str(_DEFAULT_FILE_TOKEN_LIMIT))
)
# Maximum upload size for a single user file (chat/projects) in MB.
USER_FILE_MAX_UPLOAD_SIZE_MB = int(os.environ.get("USER_FILE_MAX_UPLOAD_SIZE_MB") or 50)
USER_FILE_MAX_UPLOAD_SIZE_BYTES = USER_FILE_MAX_UPLOAD_SIZE_MB * 1024 * 1024
# If set to true, will show extra/uncommon connectors in the "Other" category
SHOW_EXTRA_CONNECTORS = os.environ.get("SHOW_EXTRA_CONNECTORS", "").lower() == "true"
@@ -92,19 +96,12 @@ WEB_DOMAIN = os.environ.get("WEB_DOMAIN") or "http://localhost:3000"
#####
# Auth Configs
#####
# Upgrades users from disabled auth to basic auth and shows warning.
_auth_type_str = (os.environ.get("AUTH_TYPE") or "basic").lower()
if _auth_type_str == "disabled":
logger.warning(
"AUTH_TYPE='disabled' is no longer supported. "
"Defaulting to 'basic'. Please update your configuration. "
"Your existing data will be migrated automatically."
)
_auth_type_str = AuthType.BASIC.value
try:
# Silently default to basic - warnings/errors logged in verify_auth_setting()
# which only runs on app startup, not during migrations/scripts
_auth_type_str = (os.environ.get("AUTH_TYPE") or "").lower()
if _auth_type_str in [auth_type.value for auth_type in AuthType]:
AUTH_TYPE = AuthType(_auth_type_str)
except ValueError:
logger.error(f"Invalid AUTH_TYPE: {_auth_type_str}. Defaulting to 'basic'.")
else:
AUTH_TYPE = AuthType.BASIC
PASSWORD_MIN_LENGTH = int(os.getenv("PASSWORD_MIN_LENGTH", 8))
@@ -207,6 +204,12 @@ JWT_PUBLIC_KEY_URL: str | None = os.getenv("JWT_PUBLIC_KEY_URL", None)
USER_AUTH_SECRET = os.environ.get("USER_AUTH_SECRET", "")
if AUTH_TYPE == AuthType.BASIC and not USER_AUTH_SECRET:
logger.warning(
"USER_AUTH_SECRET is not set. This is required for secure password reset "
"and email verification tokens. Please set USER_AUTH_SECRET in production."
)
# Duration (in seconds) for which the FastAPI Users JWT token remains valid in the user's browser.
# By default, this is set to match the Redis expiry time for consistency.
AUTH_COOKIE_EXPIRE_TIME_SECONDS = int(
@@ -308,6 +311,12 @@ VERIFY_CREATE_OPENSEARCH_INDEX_ON_INIT_MT = (
os.environ.get("VERIFY_CREATE_OPENSEARCH_INDEX_ON_INIT_MT", "true").lower()
== "true"
)
OPENSEARCH_MIGRATION_GET_VESPA_CHUNKS_PAGE_SIZE = int(
os.environ.get("OPENSEARCH_MIGRATION_GET_VESPA_CHUNKS_PAGE_SIZE") or 500
)
OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES = int(
os.environ.get("OPENSEARCH_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES") or 0
)
VESPA_HOST = os.environ.get("VESPA_HOST") or "localhost"
# NOTE: this is used if and only if the vespa config server is accessible via a

View File

@@ -44,6 +44,7 @@ from onyx.connectors.google_utils.shared_constants import (
from onyx.db.credentials import update_credential_json
from onyx.db.models import User
from onyx.key_value_store.factory import get_kv_store
from onyx.key_value_store.interface import unwrap_str
from onyx.server.documents.models import CredentialBase
from onyx.server.documents.models import GoogleAppCredentials
from onyx.server.documents.models import GoogleServiceAccountKey
@@ -89,7 +90,7 @@ def _get_current_oauth_user(creds: OAuthCredentials, source: DocumentSource) ->
def verify_csrf(credential_id: int, state: str) -> None:
csrf = get_kv_store().load(KV_CRED_KEY.format(str(credential_id)))
csrf = unwrap_str(get_kv_store().load(KV_CRED_KEY.format(str(credential_id))))
if csrf != state:
raise PermissionError(
"State from Google Drive Connector callback does not match expected"
@@ -178,7 +179,9 @@ def get_auth_url(credential_id: int, source: DocumentSource) -> str:
params = parse_qs(parsed_url.query)
get_kv_store().store(
KV_CRED_KEY.format(credential_id), params.get("state", [None])[0], encrypt=True
KV_CRED_KEY.format(credential_id),
{"value": params.get("state", [None])[0]},
encrypt=True,
)
return str(auth_url)

View File

@@ -258,6 +258,10 @@ class SharepointConnectorCheckpoint(ConnectorCheckpoint):
# Track yielded hierarchy nodes by their raw_node_id (URLs) to avoid duplicates
seen_hierarchy_node_raw_ids: set[str] = Field(default_factory=set)
# Track yielded document IDs to avoid processing the same document twice.
# The Microsoft Graph delta API can return the same item on multiple pages.
seen_document_ids: set[str] = Field(default_factory=set)
class SharepointAuthMethod(Enum):
CLIENT_SECRET = "client_secret"
@@ -1557,6 +1561,7 @@ class SharepointConnector(
checkpoint.current_drive_id = None
checkpoint.current_drive_web_url = None
checkpoint.current_drive_delta_next_link = None
checkpoint.seen_document_ids.clear()
def _fetch_slim_documents_from_sharepoint(self) -> GenerateSlimDocumentOutput:
site_descriptors = self.site_descriptors or self.fetch_sites()
@@ -2137,6 +2142,14 @@ class SharepointConnector(
item_count = 0
for driveitem in driveitems:
item_count += 1
if driveitem.id and driveitem.id in checkpoint.seen_document_ids:
logger.debug(
f"Skipping duplicate document {driveitem.id} "
f"({driveitem.name})"
)
continue
driveitem_extension = get_file_ext(driveitem.name)
if driveitem_extension not in OnyxFileExtensions.ALL_ALLOWED_EXTENSIONS:
logger.warning(
@@ -2189,11 +2202,13 @@ class SharepointConnector(
if isinstance(doc_or_failure, Document):
if doc_or_failure.sections:
checkpoint.seen_document_ids.add(doc_or_failure.id)
yield doc_or_failure
elif should_yield_if_empty:
doc_or_failure.sections = [
TextSection(link=driveitem.web_url, text="")
]
checkpoint.seen_document_ids.add(doc_or_failure.id)
yield doc_or_failure
else:
logger.warning(

View File

@@ -2,7 +2,10 @@
from collections import defaultdict
from sqlalchemy import delete
from sqlalchemy import select
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.engine import CursorResult
from sqlalchemy.orm import Session
from onyx.configs.constants import DocumentSource
@@ -10,6 +13,7 @@ from onyx.connectors.models import HierarchyNode as PydanticHierarchyNode
from onyx.db.enums import HierarchyNodeType
from onyx.db.models import Document
from onyx.db.models import HierarchyNode
from onyx.db.models import HierarchyNodeByConnectorCredentialPair
from onyx.utils.logger import setup_logger
from onyx.utils.variable_functionality import fetch_versioned_implementation
@@ -458,7 +462,7 @@ def get_all_hierarchy_nodes_for_source(
def _get_accessible_hierarchy_nodes_for_source(
db_session: Session,
source: DocumentSource,
user_email: str | None, # noqa: ARG001
user_email: str, # noqa: ARG001
external_group_ids: list[str], # noqa: ARG001
) -> list[HierarchyNode]:
"""
@@ -485,7 +489,7 @@ def _get_accessible_hierarchy_nodes_for_source(
def get_accessible_hierarchy_nodes_for_source(
db_session: Session,
source: DocumentSource,
user_email: str | None,
user_email: str,
external_group_ids: list[str],
) -> list[HierarchyNode]:
"""
@@ -620,3 +624,154 @@ def update_hierarchy_node_permissions(
db_session.flush()
return True
def upsert_hierarchy_node_cc_pair_entries(
    db_session: Session,
    hierarchy_node_ids: list[int],
    connector_id: int,
    credential_id: int,
    commit: bool = True,
) -> None:
    """Insert rows into HierarchyNodeByConnectorCredentialPair, ignoring conflicts.

    This records that the given cc_pair "owns" these hierarchy nodes. Used by
    indexing, pruning, and hierarchy-fetching paths.

    Does nothing (no execute, no commit, no flush) when ``hierarchy_node_ids``
    is empty. Otherwise a single multi-row insert is executed, followed by a
    commit when ``commit`` is true and a flush when it is false.
    """
    if not hierarchy_node_ids:
        return

    link = HierarchyNodeByConnectorCredentialPair
    rows = [
        {
            link.hierarchy_node_id: node_id,
            link.connector_id: connector_id,
            link.credential_id: credential_id,
        }
        for node_id in hierarchy_node_ids
    ]
    db_session.execute(pg_insert(link).values(rows).on_conflict_do_nothing())

    if not commit:
        db_session.flush()
        return
    db_session.commit()
def remove_stale_hierarchy_node_cc_pair_entries(
    db_session: Session,
    connector_id: int,
    credential_id: int,
    live_hierarchy_node_ids: set[int],
    commit: bool = True,
) -> int:
    """Delete join-table rows for this cc_pair that are NOT in the live set.

    If ``live_hierarchy_node_ids`` is empty ALL rows for the cc_pair are deleted
    (i.e. the connector no longer has any hierarchy nodes). Callers that want a
    no-op when there are no live nodes must guard before calling.

    With ``commit`` true the session is committed; otherwise it is flushed,
    but only when at least one row was deleted.

    Returns the number of deleted rows.
    """
    link = HierarchyNodeByConnectorCredentialPair

    conditions = [
        link.connector_id == connector_id,
        link.credential_id == credential_id,
    ]
    # Only narrow to "not live" when there is a live set; an empty set means
    # every row of the cc_pair is stale.
    if live_hierarchy_node_ids:
        conditions.append(link.hierarchy_node_id.notin_(live_hierarchy_node_ids))

    result: CursorResult = db_session.execute(  # type: ignore[assignment]
        delete(link).where(*conditions)
    )
    removed_count = result.rowcount

    if commit:
        db_session.commit()
    elif removed_count:
        db_session.flush()
    return removed_count
def delete_orphaned_hierarchy_nodes(
    db_session: Session,
    source: DocumentSource,
    commit: bool = True,
) -> list[str]:
    """Delete hierarchy nodes for a source that have zero cc_pair associations.

    SOURCE-type nodes are excluded (they are synthetic roots).

    When nothing is orphaned the function returns early without committing or
    flushing. Otherwise the rows are deleted and the session is committed
    (``commit`` true) or flushed (``commit`` false).

    Returns the list of raw_node_ids that were deleted (for cache eviction).
    """
    link = HierarchyNodeByConnectorCredentialPair

    # Outer join + IS NULL on the join column selects nodes with no link rows.
    unreferenced = db_session.execute(
        select(HierarchyNode.id, HierarchyNode.raw_node_id)
        .outerjoin(link, HierarchyNode.id == link.hierarchy_node_id)
        .where(
            HierarchyNode.source == source,
            HierarchyNode.node_type != HierarchyNodeType.SOURCE,
            link.hierarchy_node_id.is_(None),
        )
    ).all()
    if not unreferenced:
        return []

    node_ids = []
    raw_node_ids = []
    for node_id, raw_node_id in unreferenced:
        node_ids.append(node_id)
        raw_node_ids.append(raw_node_id)

    db_session.execute(delete(HierarchyNode).where(HierarchyNode.id.in_(node_ids)))

    if not commit:
        db_session.flush()
    else:
        db_session.commit()
    return raw_node_ids
def reparent_orphaned_hierarchy_nodes(
    db_session: Session,
    source: DocumentSource,
    commit: bool = True,
) -> list[HierarchyNode]:
    """Re-parent hierarchy nodes whose parent_id is NULL to the SOURCE node.

    After pruning deletes stale nodes, their former children get parent_id=NULL
    via the SET NULL cascade. This function points them back to the SOURCE root.

    Returns an empty list, without committing or flushing, when the source has
    no SOURCE node or no non-SOURCE node with a NULL parent_id.

    Returns the reparented HierarchyNode objects (with updated parent_id)
    so callers can refresh downstream caches.
    """
    root = get_source_hierarchy_node(db_session, source)
    if not root:
        return []

    parentless = list(
        db_session.execute(
            select(HierarchyNode).where(
                HierarchyNode.source == source,
                HierarchyNode.parent_id.is_(None),
                HierarchyNode.node_type != HierarchyNodeType.SOURCE,
            )
        )
        .scalars()
        .all()
    )
    if not parentless:
        return []

    root_id = root.id
    for node in parentless:
        node.parent_id = root_id

    if not commit:
        db_session.flush()
    else:
        db_session.commit()
    return parentless

View File

@@ -25,6 +25,7 @@ from sqlalchemy import desc
from sqlalchemy import Enum
from sqlalchemy import Float
from sqlalchemy import ForeignKey
from sqlalchemy import ForeignKeyConstraint
from sqlalchemy import func
from sqlalchemy import Index
from sqlalchemy import Integer
@@ -36,9 +37,11 @@ from sqlalchemy import Text
from sqlalchemy import text
from sqlalchemy import UniqueConstraint
from sqlalchemy.dialects import postgresql
from sqlalchemy import event
from sqlalchemy.engine.interfaces import Dialect
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.orm import Mapped
from sqlalchemy.orm import Mapper
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship
from sqlalchemy.types import LargeBinary
@@ -117,10 +120,52 @@ class Base(DeclarativeBase):
__abstract__ = True
class EncryptedString(TypeDecorator):
class _EncryptedBase(TypeDecorator):
    """Shared base for encrypted column types whose values are SensitiveValue.

    Subclasses set ``_is_json`` to choose between the str and dict (JSON)
    flavours and supply their own bind/result processing.
    """

    impl = LargeBinary
    # This type's behavior is fully deterministic and doesn't depend on any external factors.
    cache_ok = True
    _is_json: bool = False

    def wrap_raw(self, value: Any) -> SensitiveValue:
        """Encrypt a raw value and wrap it in SensitiveValue.

        Called by the attribute set event so the Python-side type is always
        SensitiveValue, regardless of whether the value was loaded from the DB
        or assigned in application code.

        Raises:
            TypeError: if ``value`` is not a dict (JSON columns) or not a str
                (string columns).
        """
        if not self._is_json:
            if not isinstance(value, str):
                raise TypeError(
                    f"EncryptedString column expected str, got {type(value).__name__}"
                )
            plaintext = value
        else:
            if not isinstance(value, dict):
                raise TypeError(
                    f"EncryptedJson column expected dict, got {type(value).__name__}"
                )
            plaintext = json.dumps(value)

        return SensitiveValue(
            encrypted_bytes=encrypt_string_to_bytes(plaintext),
            decrypt_fn=decrypt_bytes_to_string,
            is_json=self._is_json,
        )

    def compare_values(self, x: Any, y: Any) -> bool:
        """Compare two column values by their unmasked underlying content.

        If either side is None the operands are compared directly; otherwise
        any SensitiveValue operand is unwrapped with ``apply_mask=False``
        before the equality check.
        """
        if x is None or y is None:
            return x == y
        left = x.get_value(apply_mask=False) if isinstance(x, SensitiveValue) else x
        right = y.get_value(apply_mask=False) if isinstance(y, SensitiveValue) else y
        return left == right
class EncryptedString(_EncryptedBase):
# Must redeclare cache_ok in this child class since we explicitly redeclare _is_json
cache_ok = True
_is_json: bool = False
def process_bind_param(
self, value: str | SensitiveValue[str] | None, dialect: Dialect # noqa: ARG002
@@ -144,20 +189,10 @@ class EncryptedString(TypeDecorator):
)
return None
def compare_values(self, x: Any, y: Any) -> bool:
if x is None or y is None:
return x == y
if isinstance(x, SensitiveValue):
x = x.get_value(apply_mask=False)
if isinstance(y, SensitiveValue):
y = y.get_value(apply_mask=False)
return x == y
class EncryptedJson(TypeDecorator):
impl = LargeBinary
# This type's behavior is fully deterministic and doesn't depend on any external factors.
class EncryptedJson(_EncryptedBase):
cache_ok = True
_is_json: bool = True
def process_bind_param(
self,
@@ -165,9 +200,7 @@ class EncryptedJson(TypeDecorator):
dialect: Dialect, # noqa: ARG002
) -> bytes | None:
if value is not None:
# Handle both raw dicts and SensitiveValue wrappers
if isinstance(value, SensitiveValue):
# Get raw value for storage
value = value.get_value(apply_mask=False)
json_str = json.dumps(value)
return encrypt_string_to_bytes(json_str)
@@ -184,14 +217,40 @@ class EncryptedJson(TypeDecorator):
)
return None
def compare_values(self, x: Any, y: Any) -> bool:
if x is None or y is None:
return x == y
if isinstance(x, SensitiveValue):
x = x.get_value(apply_mask=False)
if isinstance(y, SensitiveValue):
y = y.get_value(apply_mask=False)
return x == y
# Keys of the form "<module>.<qualname>.<attribute>" for every mapped attribute
# that already has a "set" listener attached by the hook below.
_REGISTERED_ATTRS: set[str] = set()


@event.listens_for(Mapper, "mapper_configured")
def _register_sensitive_value_set_events(
    mapper: Mapper,
    class_: type,
) -> None:
    """Auto-wrap raw values in SensitiveValue when assigned to encrypted columns.

    Runs for each configured mapper. For every column attribute whose column
    type is an _EncryptedBase, a "set" listener is attached that replaces a
    raw (non-None, non-SensitiveValue) assigned value with the result of the
    column type's wrap_raw().

    Args:
        mapper: The mapper that was just configured.
        class_: The mapped class belonging to that mapper.
    """
    for prop in mapper.column_attrs:
        for col in prop.columns:
            if isinstance(col.type, _EncryptedBase):
                col_type = col.type
                attr = getattr(class_, prop.key)

                # Guard against double-registration (e.g. if mapper is
                # re-configured in test setups). The key includes the module
                # so that two mapped classes sharing a name in different
                # modules do not shadow each other and silently lose the
                # listener.
                attr_key = f"{class_.__module__}.{class_.__qualname__}.{prop.key}"
                if attr_key in _REGISTERED_ATTRS:
                    continue
                _REGISTERED_ATTRS.add(attr_key)

                @event.listens_for(attr, "set", retval=True)
                def _wrap_value(
                    target: Any,  # noqa: ARG001
                    value: Any,
                    oldvalue: Any,  # noqa: ARG001
                    initiator: Any,  # noqa: ARG001
                    # Bound as a default so each listener keeps its own column
                    # type instead of the loop's last value.
                    _col_type: _EncryptedBase = col_type,
                ) -> Any:
                    if value is not None and not isinstance(value, SensitiveValue):
                        return _col_type.wrap_raw(value)
                    return value
class NullFilteredString(TypeDecorator):
@@ -280,6 +339,16 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
TIMESTAMPAware(timezone=True), nullable=True
)
created_at: Mapped[datetime.datetime] = mapped_column(
DateTime(timezone=True), server_default=func.now(), nullable=False
)
updated_at: Mapped[datetime.datetime] = mapped_column(
DateTime(timezone=True),
server_default=func.now(),
onupdate=func.now(),
nullable=False,
)
default_model: Mapped[str] = mapped_column(Text, nullable=True)
# organized in typical structured fashion
# formatted as `displayName__provider__modelName`
@@ -2370,6 +2439,38 @@ class SyncRecord(Base):
)
class HierarchyNodeByConnectorCredentialPair(Base):
    """Tracks which cc_pairs reference each hierarchy node.

    During pruning, stale entries are removed for the current cc_pair.
    Hierarchy nodes with zero remaining entries are then deleted.
    """

    __tablename__ = "hierarchy_node_by_connector_credential_pair"

    # The three columns below together form the composite primary key.
    # Deleting the referenced hierarchy node cascades to its rows here.
    hierarchy_node_id: Mapped[int] = mapped_column(
        ForeignKey("hierarchy_node.id", ondelete="CASCADE"), primary_key=True
    )
    connector_id: Mapped[int] = mapped_column(primary_key=True)
    credential_id: Mapped[int] = mapped_column(primary_key=True)

    __table_args__ = (
        # Composite foreign key to the cc_pair; deleting the cc_pair cascades
        # to the rows that reference it.
        ForeignKeyConstraint(
            ["connector_id", "credential_id"],
            [
                "connector_credential_pair.connector_id",
                "connector_credential_pair.credential_id",
            ],
            ondelete="CASCADE",
        ),
        # Secondary index on the cc_pair columns alone (presumably to serve
        # per-cc_pair lookups such as pruning — confirm against query usage).
        Index(
            "ix_hierarchy_node_cc_pair_connector_credential",
            "connector_id",
            "credential_id",
        ),
    )
class DocumentByConnectorCredentialPair(Base):
"""Represents an indexing of a document by a specific connector / credential pair"""

View File

@@ -205,7 +205,9 @@ def update_persona_access(
NOTE: Callers are responsible for committing."""
needs_sync = False
if is_public is not None:
needs_sync = True
persona = db_session.query(Persona).filter(Persona.id == persona_id).first()
if persona:
persona.is_public = is_public
@@ -213,6 +215,7 @@ def update_persona_access(
# NOTE: For user-ids and group-ids, `None` means "leave unchanged", `[]` means "clear all shares",
# and a non-empty list means "replace with these shares".
if user_ids is not None:
needs_sync = True
db_session.query(Persona__User).filter(
Persona__User.persona_id == persona_id
).delete(synchronize_session="fetch")
@@ -233,6 +236,7 @@ def update_persona_access(
# MIT doesn't support group-based sharing, so we allow clearing (no-op since
# there shouldn't be any) but raise an error if trying to add actual groups.
if group_ids is not None:
needs_sync = True
db_session.query(Persona__UserGroup).filter(
Persona__UserGroup.persona_id == persona_id
).delete(synchronize_session="fetch")
@@ -240,6 +244,10 @@ def update_persona_access(
if group_ids:
raise NotImplementedError("Onyx MIT does not support group-based sharing")
# When sharing changes, user file ACLs need to be updated in the vector DB
if needs_sync:
mark_persona_user_files_for_sync(persona_id, db_session)
def create_update_persona(
persona_id: int | None,
@@ -851,6 +859,24 @@ def update_personas_display_priority(
db_session.commit()
def mark_persona_user_files_for_sync(
    persona_id: int,
    db_session: Session,
) -> None:
    """Flag every user file attached to a persona as needing a sync.

    Called when the persona's sharing changes so that the files' ACLs get
    updated in the vector DB. Does nothing if the persona does not exist.
    """
    persona_query = (
        db_session.query(Persona)
        .options(selectinload(Persona.user_files))
        .filter(Persona.id == persona_id)
    )
    persona = persona_query.first()
    if persona:
        _mark_files_need_persona_sync(
            db_session, [user_file.id for user_file in persona.user_files]
        )
def _mark_files_need_persona_sync(
db_session: Session,
user_file_ids: list[UUID],

View File

@@ -0,0 +1,161 @@
"""Rotate encryption key for all encrypted columns.
Dynamically discovers all columns using EncryptedString / EncryptedJson,
decrypts each value with the old key, and re-encrypts with the current
ENCRYPTION_KEY_SECRET.
The operation is idempotent: rows already encrypted with the current key
are skipped. Commits are made in batches so a crash mid-rotation can be
safely resumed by re-running.
"""
import json
from typing import Any
from sqlalchemy import LargeBinary
from sqlalchemy import select
from sqlalchemy import update
from sqlalchemy.orm import Session
from onyx.configs.app_configs import ENCRYPTION_KEY_SECRET
from onyx.db.models import Base
from onyx.db.models import EncryptedJson
from onyx.db.models import EncryptedString
from onyx.utils.encryption import decrypt_bytes_to_string
from onyx.utils.logger import setup_logger
from onyx.utils.variable_functionality import global_version
logger = setup_logger()
_BATCH_SIZE = 500
def _can_decrypt_with_current_key(data: bytes) -> bool:
    """Report whether ``data`` decrypts under the current ENCRYPTION_KEY_SECRET.

    The key is passed explicitly so the fallback-to-raw-decode path in
    _decrypt_bytes is NOT triggered, giving a clean success/failure signal.
    Any exception raised during decryption is treated as "cannot decrypt".
    """
    try:
        decrypt_bytes_to_string(data, key=ENCRYPTION_KEY_SECRET)
    except Exception:
        return False
    return True
def _discover_encrypted_columns() -> list[tuple[type, str, list[str], bool]]:
    """Find every ORM column typed as EncryptedString or EncryptedJson.

    Iterates all mappers registered on Base and inspects each column
    attribute's columns.

    Returns:
        A list of (ModelClass, column_attr_name, [pk_attr_names], is_json)
        tuples, where is_json is True for EncryptedJson columns and False
        for EncryptedString columns.
    """
    discovered: list[tuple[type, str, list[str], bool]] = []

    for mapper in Base.registry.mappers:
        model_cls = mapper.class_
        pk_names = [pk_col.key for pk_col in mapper.primary_key]

        for prop in mapper.column_attrs:
            for col in prop.columns:
                column_type = col.type
                # EncryptedJson is checked first, mirroring the precedence of
                # the two encrypted types.
                if isinstance(column_type, EncryptedJson):
                    is_json = True
                elif isinstance(column_type, EncryptedString):
                    is_json = False
                else:
                    continue
                discovered.append((model_cls, prop.key, pk_names, is_json))

    return discovered
def rotate_encryption_key(
    db_session: Session,
    old_key: str | None,
    dry_run: bool = False,
) -> dict[str, int]:
    """Decrypt all encrypted columns with old_key and re-encrypt with the current key.

    Args:
        db_session: Active database session.
        old_key: The previous encryption key. Pass None or "" if values were
            not previously encrypted with a key.
        dry_run: If True, count rows that need rotation without modifying data.

    Returns:
        Dict of "table.column" -> number of rows re-encrypted (or would be).
        Columns with no rows needing rotation are omitted.

    Raises:
        RuntimeError: If EE mode is not enabled, or ENCRYPTION_KEY_SECRET is
            not set.

    Commits every _BATCH_SIZE rows so that locks are held briefly and progress
    is preserved on crash. Already-rotated rows are detected and skipped,
    making the operation safe to re-run.
    """
    if not global_version.is_ee_version():
        raise RuntimeError("EE mode is not enabled — rotation requires EE encryption.")

    if not ENCRYPTION_KEY_SECRET:
        raise RuntimeError(
            "ENCRYPTION_KEY_SECRET is not set — cannot rotate. "
            "Set the target encryption key in the environment before running."
        )

    encrypted_columns = _discover_encrypted_columns()
    totals: dict[str, int] = {}

    for model_cls, col_name, pk_names, is_json in encrypted_columns:
        table_name: str = model_cls.__tablename__  # type: ignore[attr-defined]
        col_attr = getattr(model_cls, col_name)
        pk_attrs = [getattr(model_cls, pk) for pk in pk_names]

        # Read raw bytes directly, bypassing the TypeDecorator
        raw_col = col_attr.property.columns[0]
        stmt = select(*pk_attrs, raw_col.cast(LargeBinary)).where(col_attr.is_not(None))
        # All matching rows are fetched up front; each row is the primary-key
        # values followed by the raw encrypted bytes as the last element.
        rows = db_session.execute(stmt).all()

        # reencrypted counts rows needing rotation (also in dry-run mode);
        # batch_pending counts updates issued since the last commit.
        reencrypted = 0
        batch_pending = 0
        for row in rows:
            raw_bytes: bytes | None = row[-1]
            if raw_bytes is None:
                continue

            # Idempotency: anything the current key can already decrypt is
            # treated as rotated and left alone.
            if _can_decrypt_with_current_key(raw_bytes):
                continue

            try:
                if not old_key:
                    # No previous key: the stored bytes are taken to be plain
                    # UTF-8 text.
                    decrypted_str = raw_bytes.decode("utf-8")
                else:
                    decrypted_str = decrypt_bytes_to_string(raw_bytes, key=old_key)

                # For EncryptedJson, parse back to dict so the TypeDecorator
                # can json.dumps() it cleanly (avoids double-encoding).
                value: Any = json.loads(decrypted_str) if is_json else decrypted_str
            except (ValueError, UnicodeDecodeError) as e:
                # Best effort: a row that cannot be decrypted or parsed is
                # logged and skipped rather than aborting the whole rotation.
                pk_vals = [row[i] for i in range(len(pk_names))]
                logger.warning(
                    f"Could not decrypt/parse {table_name}.{col_name} "
                    f"row {pk_vals} — skipping: {e}"
                )
                continue

            if not dry_run:
                # Target exactly this row by matching every primary-key column.
                pk_filters = [pk_attr == row[i] for i, pk_attr in enumerate(pk_attrs)]
                update_stmt = (
                    update(model_cls).where(*pk_filters).values({col_name: value})
                )
                db_session.execute(update_stmt)
                batch_pending += 1

                if batch_pending >= _BATCH_SIZE:
                    db_session.commit()
                    batch_pending = 0
            reencrypted += 1

        # Flush remaining rows in this column
        if batch_pending > 0:
            db_session.commit()

        if reencrypted > 0:
            totals[f"{table_name}.{col_name}"] = reencrypted
            logger.info(
                f"{'[DRY RUN] Would re-encrypt' if dry_run else 'Re-encrypted'} "
                f"{reencrypted} value(s) in {table_name}.{col_name}"
            )

    return totals

View File

@@ -3,9 +3,11 @@ from uuid import UUID
from sqlalchemy import func
from sqlalchemy import select
from sqlalchemy.orm import joinedload
from sqlalchemy.orm import selectinload
from sqlalchemy.orm import Session
from onyx.db.models import Persona
from onyx.db.models import Project__UserFile
from onyx.db.models import UserFile
@@ -118,3 +120,31 @@ def get_file_ids_by_user_file_ids(
) -> list[str]:
user_files = db_session.query(UserFile).filter(UserFile.id.in_(user_file_ids)).all()
return [user_file.file_id for user_file in user_files]
def fetch_user_files_with_access_relationships(
    user_file_ids: list[str],
    db_session: Session,
    eager_load_groups: bool = False,
) -> list[UserFile]:
    """Load user files together with the relationships used for access control.

    The owning user and the attached assistants (with each assistant's users
    and owner) are eagerly loaded. When eager_load_groups is True,
    Persona.groups is loaded as well so callers can read user-group names
    without a second DB round-trip.
    """
    persona_loaders = [
        selectinload(Persona.users),
        selectinload(Persona.user),
    ]
    if eager_load_groups:
        persona_loaders.append(selectinload(Persona.groups))

    owner_loader = joinedload(UserFile.user)
    assistants_loader = selectinload(UserFile.assistants).options(*persona_loaders)

    query = (
        db_session.query(UserFile)
        .options(owner_loader, assistants_loader)
        .filter(UserFile.id.in_(user_file_ids))
    )
    return query.all()

View File

@@ -4,6 +4,7 @@ from uuid import UUID
from fastapi import HTTPException
from fastapi_users.password import PasswordHelper
from sqlalchemy import case
from sqlalchemy import func
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
@@ -11,6 +12,7 @@ from sqlalchemy.orm import Session
from sqlalchemy.sql import expression
from sqlalchemy.sql.elements import ColumnElement
from sqlalchemy.sql.elements import KeyedColumnElement
from sqlalchemy.sql.expression import or_
from onyx.auth.invited_users import remove_user_from_invited_users
from onyx.auth.schemas import UserRole
@@ -24,6 +26,7 @@ from onyx.db.models import Persona__User
from onyx.db.models import SamlAccount
from onyx.db.models import User
from onyx.db.models import User__UserGroup
from onyx.db.models import UserGroup
from onyx.utils.variable_functionality import fetch_ee_implementation_or_noop
@@ -162,7 +165,13 @@ def _get_accepted_user_where_clause(
where_clause.append(User.role != UserRole.EXT_PERM_USER)
if email_filter_string is not None:
where_clause.append(email_col.ilike(f"%{email_filter_string}%"))
personal_name_col: KeyedColumnElement[Any] = User.__table__.c.personal_name
where_clause.append(
or_(
email_col.ilike(f"%{email_filter_string}%"),
personal_name_col.ilike(f"%{email_filter_string}%"),
)
)
if roles_filter:
where_clause.append(User.role.in_(roles_filter))
@@ -173,6 +182,21 @@ def _get_accepted_user_where_clause(
return where_clause
def get_all_accepted_users(
    db_session: Session,
    include_external: bool = False,
) -> Sequence[User]:
    """Return every accepted user, ordered by email, without pagination.

    Applies the same base filtering as the paginated endpoint but passes no
    search, role, or active filters.
    """
    accepted_filters = _get_accepted_user_where_clause(
        include_external=include_external,
    )
    stmt = select(User).where(*accepted_filters).order_by(User.email)
    return db_session.scalars(stmt).unique().all()
def get_page_of_filtered_users(
db_session: Session,
page_size: int,
@@ -218,6 +242,41 @@ def get_total_filtered_users_count(
return db_session.scalar(total_count_stmt) or 0
def get_user_counts_by_role_and_status(
    db_session: Session,
) -> dict[str, dict[str, int]]:
    """Count accepted users per role and per active/inactive status.

    Users are restricted by the default accepted-user where clause. A single
    grouped query with conditional aggregation produces all the numbers.

    Returns:
        {"role_counts": {role: total}, "status_counts": {"active": n,
        "inactive": m}}.
    """
    user_table = User.__table__
    role_col = user_table.c.role
    is_active_col = user_table.c.is_active

    # 1/0 flags so SUM() yields the number of active / inactive users per role.
    active_flag = case((is_active_col.is_(True), 1), else_=0)
    inactive_flag = case((is_active_col.is_(False), 1), else_=0)

    stmt = (
        select(
            role_col,
            func.count().label("total"),
            func.sum(active_flag).label("active"),
            func.sum(inactive_flag).label("inactive"),
        )
        .where(*_get_accepted_user_where_clause())
        .group_by(role_col)
    )

    role_counts: dict[str, int] = {}
    active_total = 0
    inactive_total = 0
    for role_val, total, active, inactive in db_session.execute(stmt).all():
        # The role may come back as an enum member or as a plain value.
        role_key = role_val.value if hasattr(role_val, "value") else str(role_val)
        role_counts[role_key] = total
        active_total += active or 0
        inactive_total += inactive or 0

    return {
        "role_counts": role_counts,
        "status_counts": {"active": active_total, "inactive": inactive_total},
    }
def get_user_by_email(email: str, db_session: Session) -> User | None:
user = (
db_session.query(User)
@@ -294,24 +353,23 @@ def batch_add_ext_perm_user_if_not_exists(
lower_emails = [email.lower() for email in emails]
found_users, missing_lower_emails = _get_users_by_emails(db_session, lower_emails)
new_users: list[User] = []
# Use savepoints (begin_nested) so that a failed insert only rolls back
# that single user, not the entire transaction. A plain rollback() would
# discard all previously flushed users in the same transaction.
# We also avoid add_all() because SQLAlchemy 2.0's insertmanyvalues
# batch path hits a UUID sentinel mismatch with server_default columns.
for email in missing_lower_emails:
new_users.append(_generate_ext_permissioned_user(email=email))
user = _generate_ext_permissioned_user(email=email)
savepoint = db_session.begin_nested()
try:
db_session.add(user)
savepoint.commit()
except IntegrityError:
savepoint.rollback()
if not continue_on_error:
raise
try:
db_session.add_all(new_users)
db_session.commit()
except IntegrityError:
db_session.rollback()
if not continue_on_error:
raise
for user in new_users:
try:
db_session.add(user)
db_session.commit()
except IntegrityError:
db_session.rollback()
continue
db_session.commit()
# Fetch all users again to ensure we have the most up-to-date list
all_users, _ = _get_users_by_emails(db_session, lower_emails)
return all_users
@@ -358,3 +416,28 @@ def delete_user_from_db(
# NOTE: edge case may exist with race conditions
# with this `invited user` scheme generally.
remove_user_from_invited_users(user_to_delete.email)
def batch_get_user_groups(
    db_session: Session,
    user_ids: list[UUID],
) -> dict[UUID, list[tuple[int, str]]]:
    """Look up group memberships for many users with one query.

    Returns:
        A mapping of user_id -> list of (group_id, group_name) tuples. Every
        requested user id is present as a key; users without groups map to an
        empty list. An empty ``user_ids`` yields an empty dict without
        touching the database.
    """
    if not user_ids:
        return {}

    membership_stmt = (
        select(
            User__UserGroup.user_id,
            UserGroup.id,
            UserGroup.name,
        )
        .join(UserGroup, UserGroup.id == User__UserGroup.user_group_id)
        .where(User__UserGroup.user_id.in_(user_ids))
    )
    membership_rows = db_session.execute(membership_stmt).all()

    # Pre-seed so users with no memberships still appear in the result.
    groups_by_user: dict[UUID, list[tuple[int, str]]] = {}
    for requested_id in user_ids:
        groups_by_user[requested_id] = []

    for member_id, group_id, group_name in membership_rows:
        groups_by_user[member_id].append((group_id, group_name))
    return groups_by_user

View File

@@ -1,5 +1,10 @@
# Default value for the maximum number of tokens a chunk can hold, if none is
# specified when creating an index.
from onyx.configs.app_configs import (
OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
)
DEFAULT_MAX_CHUNK_SIZE = 512
# Size of the dynamic list used to consider elements during kNN graph creation.
@@ -10,27 +15,43 @@ EF_CONSTRUCTION = 256
# quality but increase memory footprint. Values typically range between 12 - 48.
M = 32 # Set relatively high for better accuracy.
# When performing hybrid search, we need to consider more candidates than the number of results to be returned.
# This is because the scoring is hybrid and the results are reordered due to the hybrid scoring.
# Higher = more candidates for hybrid fusion = better retrieval accuracy, but results in more computation per query.
# Imagine a simple case with a single keyword query and a single vector query and we want 10 final docs.
# If we only fetch 10 candidates from each of keyword and vector, they would have to have perfect overlap to get a good hybrid
# ranking for the 10 results. If we fetch 1000 candidates from each, we have a much higher chance of all 10 of the final desired
# docs showing up and getting scored. In worse situations, the final 10 docs don't even show up as the final 10 (worse than just
# a miss at the reranking step).
DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES = 750
# When performing hybrid search, we need to consider more candidates than the
# number of results to be returned. This is because the scoring is hybrid and
# the results are reordered due to the hybrid scoring. Higher = more candidates
# for hybrid fusion = better retrieval accuracy, but results in more computation
# per query. Imagine a simple case with a single keyword query and a single
# vector query and we want 10 final docs. If we only fetch 10 candidates from
# each of keyword and vector, they would have to have perfect overlap to get a
# good hybrid ranking for the 10 results. If we fetch 1000 candidates from each,
# we have a much higher chance of all 10 of the final desired docs showing up
# and getting scored. In worse situations, the final 10 docs don't even show up
# as the final 10 (worse than just a miss at the reranking step).
DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES = (
OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES
if OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES > 0
else 750
)
# Number of vectors to examine for top k neighbors for the HNSW method.
# Number of vectors to examine to decide the top k neighbors for the HNSW
# method.
# NOTE: "When creating a search query, you must specify k. If you provide both k
# and ef_search, then the larger value is passed to the engine. If ef_search is
# larger than k, you can provide the size parameter to limit the final number of
# results to k." from
# https://docs.opensearch.org/latest/query-dsl/specialized/k-nn/index/#ef_search
EF_SEARCH = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES
# Since the titles are included in the contents, they are heavily downweighted as they act as a boost
# rather than an independent scoring component.
# Since the titles are included in the contents, the embedding matches are
# heavily downweighted as they act as a boost rather than an independent scoring
# component.
SEARCH_TITLE_VECTOR_WEIGHT = 0.1
SEARCH_CONTENT_VECTOR_WEIGHT = 0.45
# Single keyword weight for both title and content (merged from former title keyword + content keyword).
# Single keyword weight for both title and content (merged from former title
# keyword + content keyword).
SEARCH_KEYWORD_WEIGHT = 0.45
# NOTE: it is critical that the order of these weights matches the order of the sub-queries in the hybrid search.
# NOTE: It is critical that the order of these weights matches the order of the
# sub-queries in the hybrid search.
HYBRID_SEARCH_NORMALIZATION_WEIGHTS = [
SEARCH_TITLE_VECTOR_WEIGHT,
SEARCH_CONTENT_VECTOR_WEIGHT,

View File

@@ -433,12 +433,16 @@ class OpenSearchOldDocumentIndex(OldDocumentIndex):
hidden=fields.hidden if fields else None,
project_ids=(
set(user_fields.user_projects)
if user_fields and user_fields.user_projects
# NOTE: Empty user_projects is semantically different from None
# user_projects.
if user_fields and user_fields.user_projects is not None
else None
),
persona_ids=(
set(user_fields.personas)
if user_fields and user_fields.personas
# NOTE: Empty personas is semantically different from None
# personas.
if user_fields and user_fields.personas is not None
else None
),
)

View File

@@ -255,8 +255,12 @@ class DocumentQuery:
f"result window ({DEFAULT_OPENSEARCH_MAX_RESULT_WINDOW})."
)
# TODO(andrei, yuhong): We can tune this more dynamically based on
# num_hits.
max_results_per_subquery = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES
hybrid_search_subqueries = DocumentQuery._get_hybrid_search_subqueries(
query_text, query_vector
query_text, query_vector, vector_candidates=max_results_per_subquery
)
hybrid_search_filters = DocumentQuery._get_search_filters(
tenant_state=tenant_state,
@@ -285,13 +289,16 @@ class DocumentQuery:
hybrid_search_query: dict[str, Any] = {
"hybrid": {
"queries": hybrid_search_subqueries,
# Max results per subquery per shard before aggregation. Ensures keyword and vector
# subqueries contribute equally to the candidate pool for hybrid fusion.
# Max results per subquery per shard before aggregation. Ensures
# keyword and vector subqueries contribute equally to the
# candidate pool for hybrid fusion.
# Sources:
# https://docs.opensearch.org/latest/vector-search/ai-search/hybrid-search/pagination/
# https://opensearch.org/blog/navigating-pagination-in-hybrid-queries-with-the-pagination_depth-parameter/
"pagination_depth": DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
# Applied to all the sub-queries independently (this avoids having subqueries having a lot of results thrown out).
"pagination_depth": max_results_per_subquery,
# Applied to all the sub-queries independently (this avoids
# subqueries having a lot of results thrown out during
# aggregation).
# Sources:
# https://docs.opensearch.org/latest/query-dsl/compound/hybrid/
# https://opensearch.org/blog/introducing-common-filter-support-for-hybrid-search-queries
@@ -374,9 +381,10 @@ class DocumentQuery:
def _get_hybrid_search_subqueries(
query_text: str,
query_vector: list[float],
# The default number of neighbors to consider for knn vector similarity search.
# This is higher than the number of results because the scoring is hybrid.
# for a detailed breakdown, see where the default value is set.
# The default number of neighbors to consider for knn vector similarity
# search. This is higher than the number of results because the scoring
# is hybrid. For a detailed breakdown, see where the default value is
# set.
vector_candidates: int = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
) -> list[dict[str, Any]]:
"""Returns subqueries for hybrid search.
@@ -400,20 +408,27 @@ class DocumentQuery:
in a single hybrid query. Source:
https://docs.opensearch.org/latest/query-dsl/compound/hybrid/
NOTE: Each query is independent during the search phase, there is no backfilling of scores for missing query components.
What this means is that if a document was a good vector match but did not show up for keyword, it gets a score of 0 for
the keyword component of the hybrid scoring. This is not as bad as just disregarding a score though as there is
normalization applied after. So really it is "increasing" the missing score compared to if it was included and the range
was renormalized. This does however mean that between docs that have high scores for say the vector field, the keyword
scores between them are completely ignored unless they also showed up in the keyword query as a reasonably high match.
TLDR, this is a bit of unique funky behavior but it seems ok.
NOTE: Each query is independent during the search phase, there is no
backfilling of scores for missing query components. What this means is
that if a document was a good vector match but did not show up for
keyword, it gets a score of 0 for the keyword component of the hybrid
scoring. This is not as bad as just disregarding a score though as there
is normalization applied after. So really it is "increasing" the missing
score compared to if it was included and the range was renormalized.
This does however mean that between docs that have high scores for say
the vector field, the keyword scores between them are completely ignored
unless they also showed up in the keyword query as a reasonably high
match. TLDR, this is a bit of unique funky behavior but it seems ok.
NOTE: Options considered and rejected:
- minimum_should_match: Since it's hybrid search and users often provide semantic queries, there is often a lot of terms,
and very low number of meaningful keywords (and a low ratio of keywords).
- fuzziness AUTO: typo tolerance (0/1/2 edit distance by term length). It's mostly for typos as the analyzer ("english by
default") already does some stemming and tokenization. In testing datasets, this makes recall slightly worse. It also is
less performant so not really any reason to do it.
- minimum_should_match: Since it's hybrid search and users often provide
semantic queries, there is often a lot of terms, and very low number
of meaningful keywords (and a low ratio of keywords).
- fuzziness AUTO: Typo tolerance (0/1/2 edit distance by term length).
It's mostly for typos as the analyzer ("english" by default) already
does some stemming and tokenization. In testing datasets, this makes
recall slightly worse. It also is less performant so not really any
reason to do it.
Args:
query_text: The text of the query to search for.
@@ -723,14 +738,13 @@ class DocumentQuery:
# document's metadata list.
filter_clauses.append(_get_tag_filter(tags))
# Knowledge scope: explicit knowledge attachments restrict what
# an assistant can see. When none are set the assistant
# searches everything.
# Knowledge scope: explicit knowledge attachments restrict what an
# assistant can see. When none are set the assistant searches
# everything.
#
# project_id / persona_id are additive: they make overflowing
# user files findable but must NOT trigger the restriction on
# their own (an agent with no explicit knowledge should search
# everything).
# project_id / persona_id are additive: they make overflowing user files
# findable but must NOT trigger the restriction on their own (an agent
# with no explicit knowledge should search everything).
has_knowledge_scope = (
attached_document_ids
or hierarchy_node_ids
@@ -758,9 +772,8 @@ class DocumentQuery:
knowledge_filter["bool"]["should"].append(
_get_document_set_filter(document_sets)
)
# Additive: widen scope to also cover overflowing user
# files, but only when an explicit restriction is already
# in effect.
# Additive: widen scope to also cover overflowing user files, but
# only when an explicit restriction is already in effect.
if project_id is not None:
knowledge_filter["bool"]["should"].append(
_get_user_project_filter(project_id)

View File

@@ -690,9 +690,12 @@ class VespaIndex(DocumentIndex):
)
project_ids: set[int] | None = None
# NOTE: Empty user_projects is semantically different from None
# user_projects.
if user_fields is not None and user_fields.user_projects is not None:
project_ids = set(user_fields.user_projects)
persona_ids: set[int] | None = None
# NOTE: Empty personas is semantically different from None personas.
if user_fields is not None and user_fields.personas is not None:
persona_ids = set(user_fields.personas)
update_request = MetadataUpdateRequest(

View File

@@ -19,12 +19,16 @@ class OnyxMimeTypes:
PLAIN_TEXT_MIME_TYPE,
"text/markdown",
"text/x-markdown",
"text/x-log",
"text/x-config",
"text/tab-separated-values",
"application/json",
"application/xml",
"text/xml",
"application/x-yaml",
"application/yaml",
"text/yaml",
"text/x-yaml",
}
DOCUMENT_MIME_TYPES = {
PDF_MIME_TYPE,

View File

@@ -123,15 +123,11 @@ class DocumentIndexingBatchAdapter:
}
doc_id_to_new_chunk_cnt: dict[str, int] = {
document_id: len(
[
chunk
for chunk in chunks_with_embeddings
if chunk.source_document.id == document_id
]
)
for document_id in updatable_ids
doc_id: 0 for doc_id in updatable_ids
}
for chunk in chunks_with_embeddings:
if chunk.source_document.id in doc_id_to_new_chunk_cnt:
doc_id_to_new_chunk_cnt[chunk.source_document.id] += 1
# Get ancestor hierarchy node IDs for each document
doc_id_to_ancestor_ids = self._get_ancestor_ids_for_documents(

View File

@@ -16,6 +16,7 @@ from onyx.indexing.models import DocAwareChunk
from onyx.indexing.models import IndexChunk
from onyx.natural_language_processing.search_nlp_models import EmbeddingModel
from onyx.utils.logger import setup_logger
from onyx.utils.pydantic_util import shallow_model_dump
from onyx.utils.timing import log_function_time
from shared_configs.configs import INDEXING_MODEL_SERVER_HOST
from shared_configs.configs import INDEXING_MODEL_SERVER_PORT
@@ -210,8 +211,8 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
)[0]
title_embed_dict[title] = title_embedding
new_embedded_chunk = IndexChunk(
**chunk.model_dump(),
new_embedded_chunk = IndexChunk.model_construct(
**shallow_model_dump(chunk),
embeddings=ChunkEmbedding(
full_embedding=chunk_embeddings[0],
mini_chunk_embeddings=chunk_embeddings[1:],

View File

@@ -12,6 +12,7 @@ from onyx.connectors.models import Document
from onyx.db.enums import EmbeddingPrecision
from onyx.db.enums import SwitchoverType
from onyx.utils.logger import setup_logger
from onyx.utils.pydantic_util import shallow_model_dump
from shared_configs.enums import EmbeddingProvider
from shared_configs.model_server_models import Embedding
@@ -133,9 +134,8 @@ class DocMetadataAwareIndexChunk(IndexChunk):
tenant_id: str,
ancestor_hierarchy_node_ids: list[int] | None = None,
) -> "DocMetadataAwareIndexChunk":
index_chunk_data = index_chunk.model_dump()
return cls(
**index_chunk_data,
return cls.model_construct(
**shallow_model_dump(index_chunk),
access=access,
document_sets=document_sets,
user_project=user_project,

View File

@@ -1,4 +1,5 @@
import abc
from typing import cast
from onyx.utils.special_types import JSON_ro
@@ -7,6 +8,19 @@ class KvKeyNotFoundError(Exception):
pass
def unwrap_str(val: JSON_ro) -> str:
    """Unwrap a string stored as {"value": str} in the encrypted KV store.

    Also handles legacy plain-string values cached in Redis: anything that is
    not a dict is returned as-is (cast to str for the type checker only; no
    runtime conversion or validation is performed).

    Args:
        val: Either a dict carrying the string under the "value" key, or the
            legacy bare value.

    Returns:
        The unwrapped value.

    Raises:
        ValueError: If ``val`` is a dict without a "value" key.
    """
    if isinstance(val, dict):
        try:
            return cast(str, val["value"])
        except KeyError:
            # The message already lists the keys that were present, so the
            # KeyError adds nothing; suppress it for a clean traceback.
            raise ValueError(
                f"Expected dict with 'value' key, got keys: {list(val.keys())}"
            ) from None
    return cast(str, val)
class KeyValueStore:
# In the Multi Tenant case, the tenant context is picked up automatically, it does not need to be passed in
# It's read from the global thread level variable

View File

@@ -3782,16 +3782,6 @@
"display_name": "Claude Sonnet 3.5",
"model_vendor": "anthropic"
},
"vertex_ai/claude-3-5-sonnet-v2": {
"display_name": "Claude Sonnet 3.5",
"model_vendor": "anthropic",
"model_version": "v2"
},
"vertex_ai/claude-3-5-sonnet-v2@20241022": {
"display_name": "Claude Sonnet 3.5 v2",
"model_vendor": "anthropic",
"model_version": "20241022"
},
"vertex_ai/claude-3-5-sonnet@20240620": {
"display_name": "Claude Sonnet 3.5",
"model_vendor": "anthropic",

View File

@@ -1,5 +1,9 @@
import re
from enum import Enum
# Matches Slack channel references like <#C097NBWMY8Y> or <#C097NBWMY8Y|channel-name>
SLACK_CHANNEL_REF_PATTERN = re.compile(r"<#([A-Z0-9]+)(?:\|([^>]+))?>")
LIKE_BLOCK_ACTION_ID = "feedback-like"
DISLIKE_BLOCK_ACTION_ID = "feedback-dislike"
SHOW_EVERYONE_ACTION_ID = "show-everyone"

View File

@@ -18,15 +18,18 @@ from onyx.configs.onyxbot_configs import ONYX_BOT_DISPLAY_ERROR_MSGS
from onyx.configs.onyxbot_configs import ONYX_BOT_NUM_RETRIES
from onyx.configs.onyxbot_configs import ONYX_BOT_REACT_EMOJI
from onyx.context.search.models import BaseFilters
from onyx.context.search.models import Tag
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.models import SlackChannelConfig
from onyx.db.models import User
from onyx.db.persona import get_persona_by_id
from onyx.db.users import get_user_by_email
from onyx.onyxbot.slack.blocks import build_slack_response_blocks
from onyx.onyxbot.slack.constants import SLACK_CHANNEL_REF_PATTERN
from onyx.onyxbot.slack.handlers.utils import send_team_member_message
from onyx.onyxbot.slack.models import SlackMessageInfo
from onyx.onyxbot.slack.models import ThreadMessage
from onyx.onyxbot.slack.utils import get_channel_from_id
from onyx.onyxbot.slack.utils import get_channel_name_from_id
from onyx.onyxbot.slack.utils import respond_in_thread_or_channel
from onyx.onyxbot.slack.utils import SlackRateLimiter
@@ -41,6 +44,51 @@ srl = SlackRateLimiter()
RT = TypeVar("RT") # return type
def resolve_channel_references(
    message: str,
    client: WebClient,
    logger: OnyxLoggingAdapter,
) -> tuple[str, list[Tag]]:
    """Parse Slack channel references from a message, resolve IDs to names,
    replace the raw markup with readable #channel-name, and return channel tags
    for search filtering.

    Every occurrence of a resolved channel is rewritten, regardless of whether
    it appears as ``<#ID>`` or ``<#ID|name>``. References whose name cannot be
    determined are left untouched and produce no tag.

    Args:
        message: Raw message text that may contain channel references.
        client: Slack client passed to ``get_channel_from_id`` when a
            reference carries no name in its markup.
        logger: Logger used to report failed lookups.

    Returns:
        A tuple of the rewritten message and one ``Channel`` tag per distinct
        resolved channel ID, in order of first appearance.
    """
    # Channel ID -> name, keyed in order of first appearance. A name embedded
    # in any occurrence of the markup is preferred over an API lookup.
    names_by_id: dict[str, str | None] = {}
    for channel_id, name_from_markup in SLACK_CHANNEL_REF_PATTERN.findall(message):
        if not names_by_id.get(channel_id):
            names_by_id[channel_id] = name_from_markup or None

    # Look up the remaining IDs. Iterate over a snapshot of the keys because
    # values are updated in the loop.
    for channel_id in list(names_by_id):
        if names_by_id[channel_id]:
            continue
        try:
            channel_info = get_channel_from_id(client=client, channel_id=channel_id)
            names_by_id[channel_id] = channel_info.get("name") or None
        except Exception:
            # Best effort: an unresolved reference is simply left as-is.
            logger.warning(f"Failed to resolve channel name for ID: {channel_id}")

    resolved = {cid: name for cid, name in names_by_id.items() if name}
    if not resolved:
        return message, []

    # A single regex pass rewrites every form of every resolved reference;
    # unresolved references are returned verbatim via group(0).
    message = SLACK_CHANNEL_REF_PATTERN.sub(
        lambda m: f"#{resolved[m.group(1)]}" if m.group(1) in resolved else m.group(0),
        message,
    )
    tags = [Tag(tag_key="Channel", tag_value=name) for name in resolved.values()]
    return message, tags
def rate_limits(
client: WebClient, channel: str, thread_ts: Optional[str]
) -> Callable[[Callable[..., RT]], Callable[..., RT]]:
@@ -157,6 +205,20 @@ def handle_regular_answer(
user_message = messages[-1]
history_messages = messages[:-1]
# Resolve any <#CHANNEL_ID> references in the user message to readable
# channel names and extract channel tags for search filtering
resolved_message, channel_tags = resolve_channel_references(
message=user_message.message,
client=client,
logger=logger,
)
user_message = ThreadMessage(
message=resolved_message,
sender=user_message.sender,
role=user_message.role,
)
channel_name, _ = get_channel_name_from_id(
client=client,
channel_id=channel,
@@ -207,6 +269,7 @@ def handle_regular_answer(
source_type=None,
document_set=document_set_names,
time_cutoff=None,
tags=channel_tags if channel_tags else None,
)
new_message_request = SendMessageRequest(
@@ -231,6 +294,16 @@ def handle_regular_answer(
slack_context_str=slack_context_str,
)
# If a channel filter was applied but no results were found, override
# the LLM response to avoid hallucinated answers about unindexed channels
if channel_tags and not answer.citation_info and not answer.top_documents:
channel_names = ", ".join(f"#{tag.tag_value}" for tag in channel_tags)
answer.answer = (
f"No indexed data found for {channel_names}. "
"This channel may not be indexed, or there may be no messages "
"matching your query within it."
)
except Exception as e:
logger.exception(
f"Unable to process message - did not successfully answer "
@@ -285,6 +358,7 @@ def handle_regular_answer(
only_respond_if_citations
and not answer.citation_info
and not message_info.bypass_filters
and not channel_tags
):
logger.error(
f"Unable to find citations to answer: '{answer.answer}' - not answering!"

View File

@@ -16,6 +16,7 @@ Cache Strategy:
using only the SOURCE-type node as the ancestor
"""
from typing import cast
from typing import TYPE_CHECKING
from pydantic import BaseModel
@@ -204,6 +205,30 @@ def cache_hierarchy_nodes_batch(
redis_client.expire(raw_id_key, HIERARCHY_CACHE_TTL_SECONDS)
def evict_hierarchy_nodes_from_cache(
    redis_client: Redis,
    source: DocumentSource,
    raw_node_ids: list[str],
) -> None:
    """Evict the given hierarchy nodes from the Redis cache for ``source``.

    The raw IDs are first looked up in the raw_id→node_id hash; the node IDs
    found there are deleted from the parent-chain hash, and then the raw IDs
    themselves are deleted from the raw_id→node_id hash. Does nothing when
    ``raw_node_ids`` is empty.
    """
    if not raw_node_ids:
        return

    parent_chain_key = _cache_key(source)
    raw_to_node_key = _raw_id_cache_key(source)

    # The lookup must happen before the raw IDs are deleted below, otherwise
    # the node IDs needed for the parent-chain hash would be gone.
    looked_up = cast(
        list[str | None], redis_client.hmget(raw_to_node_key, raw_node_ids)
    )
    cached_node_ids: list[str] = []
    for node_id in looked_up:
        # Entries that come back as None have nothing to delete.
        if node_id is not None:
            cached_node_ids.append(node_id)

    if cached_node_ids:
        redis_client.hdel(parent_chain_key, *cached_node_ids)

    redis_client.hdel(raw_to_node_key, *raw_node_ids)
def get_node_id_from_raw_id(
redis_client: Redis,
source: DocumentSource,

View File

@@ -1905,7 +1905,7 @@ def get_connector_by_id(
@router.post("/connector-request")
def submit_connector_request(
request_data: ConnectorRequestSubmission,
user: User | None = Depends(current_user),
user: User = Depends(current_user),
) -> StatusResponse:
"""
Submit a connector request for Cloud deployments.
@@ -1918,7 +1918,7 @@ def submit_connector_request(
raise HTTPException(status_code=400, detail="Connector name cannot be empty")
# Get user identifier for telemetry
user_email = user.email if user else None
user_email = user.email
distinct_id = user_email or tenant_id
# Track connector request via PostHog telemetry (Cloud only)

View File

@@ -57,9 +57,6 @@ def list_messages(
db_session: Session = Depends(get_session),
) -> MessageListResponse:
"""Get all messages for a build session."""
if user is None:
raise HTTPException(status_code=401, detail="Authentication required")
session_manager = SessionManager(db_session)
messages = session_manager.list_messages(session_id, user.id)

View File

@@ -732,7 +732,7 @@ def get_webapp_info(
return WebappInfo(**webapp_info)
@router.get("/{session_id}/webapp/download")
@router.get("/{session_id}/webapp-download")
def download_webapp(
session_id: UUID,
user: User = Depends(current_user),

View File

@@ -7424,9 +7424,9 @@
}
},
"node_modules/hono": {
"version": "4.12.5",
"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.5.tgz",
"integrity": "sha512-3qq+FUBtlTHhtYxbxheZgY8NIFnkkC/MR8u5TTsr7YZ3wixryQ3cCwn3iZbg8p8B88iDBBAYSfZDS75t8MN7Vg==",
"version": "4.12.7",
"resolved": "https://registry.npmjs.org/hono/-/hono-4.12.7.tgz",
"integrity": "sha512-jq9l1DM0zVIvsm3lv9Nw9nlJnMNPOcAtsbsgiUhWcFzPE99Gvo6yRTlszSLLYacMeQ6quHD6hMfId8crVHvexw==",
"license": "MIT",
"engines": {
"node": ">=16.9.0"

View File

@@ -54,18 +54,14 @@ def _require_opensearch(db_session: Session) -> None:
)
def _get_user_access_info(
user: User | None, db_session: Session
) -> tuple[str | None, list[str]]:
if not user:
return None, []
def _get_user_access_info(user: User, db_session: Session) -> tuple[str, list[str]]:
return user.email, get_user_external_group_ids(db_session, user)
@router.get(HIERARCHY_NODES_LIST_PATH)
def list_accessible_hierarchy_nodes(
source: DocumentSource,
user: User | None = Depends(current_user),
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> HierarchyNodesResponse:
_require_opensearch(db_session)
@@ -92,7 +88,7 @@ def list_accessible_hierarchy_nodes(
@router.post(HIERARCHY_NODE_DOCUMENTS_PATH)
def list_accessible_hierarchy_node_documents(
documents_request: HierarchyNodeDocumentsRequest,
user: User | None = Depends(current_user),
user: User = Depends(current_user),
db_session: Session = Depends(get_session),
) -> HierarchyNodeDocumentsResponse:
_require_opensearch(db_session)

View File

@@ -1013,7 +1013,7 @@ def get_mcp_servers_for_assistant(
@router.get("/servers", response_model=MCPServersResponse)
def get_mcp_servers_for_user(
db: Session = Depends(get_session),
user: User | None = Depends(current_user),
user: User = Depends(current_user),
) -> MCPServersResponse:
"""List all MCP servers for use in agent configuration and chat UI.

View File

@@ -10,6 +10,8 @@ from pydantic import Field
from sqlalchemy.orm import Session
from onyx.configs.app_configs import FILE_TOKEN_COUNT_THRESHOLD
from onyx.configs.app_configs import USER_FILE_MAX_UPLOAD_SIZE_BYTES
from onyx.configs.app_configs import USER_FILE_MAX_UPLOAD_SIZE_MB
from onyx.db.llm import fetch_default_llm_model
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.extract_file_text import get_file_ext
@@ -35,6 +37,38 @@ def get_safe_filename(upload: UploadFile) -> str:
return upload.filename
def get_upload_size_bytes(upload: UploadFile) -> int | None:
"""Best-effort file size in bytes without consuming the stream."""
if upload.size is not None:
return upload.size
try:
current_pos = upload.file.tell()
upload.file.seek(0, 2)
size = upload.file.tell()
upload.file.seek(current_pos)
return size
except Exception as e:
logger.warning(
"Could not determine upload size via stream seek "
f"(filename='{get_safe_filename(upload)}', "
f"error_type={type(e).__name__}, error={e})"
)
return None
def is_upload_too_large(upload: UploadFile, max_bytes: int) -> bool:
    """Tell whether an upload's size is known and strictly above ``max_bytes``.

    When the size cannot be determined the check is skipped: a warning is
    logged and the upload is treated as not too large.
    """
    measured = get_upload_size_bytes(upload)
    if measured is not None:
        return measured > max_bytes

    logger.warning(
        "Could not determine upload size; skipping size-limit check for "
        f"'{get_safe_filename(upload)}'"
    )
    return False
# Guard against extremely large images
Image.MAX_IMAGE_PIXELS = 12000 * 12000
@@ -159,6 +193,18 @@ def categorize_uploaded_files(
for upload in files:
try:
filename = get_safe_filename(upload)
# Size limit is a hard safety cap and is enforced even when token
# threshold checks are skipped via SKIP_USERFILE_THRESHOLD settings.
if is_upload_too_large(upload, USER_FILE_MAX_UPLOAD_SIZE_BYTES):
results.rejected.append(
RejectedFile(
filename=filename,
reason=f"Exceeds {USER_FILE_MAX_UPLOAD_SIZE_MB} MB file size limit",
)
)
continue
extension = get_file_ext(filename)
# If image, estimate tokens via dedicated method first

View File

@@ -5,6 +5,7 @@ from datetime import datetime
from datetime import timedelta
from datetime import timezone
from typing import cast
from uuid import UUID
import jwt
from email_validator import EmailNotValidError
@@ -18,6 +19,7 @@ from fastapi import Query
from fastapi import Request
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from sqlalchemy import select
from sqlalchemy.orm import Session
from onyx.auth.anonymous_user import fetch_anonymous_user_info
@@ -67,11 +69,14 @@ from onyx.db.user_preferences import update_user_role
from onyx.db.user_preferences import update_user_shortcut_enabled
from onyx.db.user_preferences import update_user_temperature_override_enabled
from onyx.db.user_preferences import update_user_theme_preference
from onyx.db.users import batch_get_user_groups
from onyx.db.users import delete_user_from_db
from onyx.db.users import get_all_accepted_users
from onyx.db.users import get_all_users
from onyx.db.users import get_page_of_filtered_users
from onyx.db.users import get_total_filtered_users_count
from onyx.db.users import get_user_by_email
from onyx.db.users import get_user_counts_by_role_and_status
from onyx.db.users import validate_user_role_update
from onyx.key_value_store.factory import get_kv_store
from onyx.redis.redis_pool import get_raw_redis_client
@@ -98,6 +103,7 @@ from onyx.server.manage.models import UserSpecificAssistantPreferences
from onyx.server.models import FullUserSnapshot
from onyx.server.models import InvitedUserSnapshot
from onyx.server.models import MinimalUserSnapshot
from onyx.server.models import UserGroupInfo
from onyx.server.usage_limits import is_tenant_on_trial_fn
from onyx.server.utils import BasicAuthenticationError
from onyx.utils.logger import setup_logger
@@ -203,14 +209,91 @@ def list_accepted_users(
total_items=0,
)
user_ids = [user.id for user in filtered_accepted_users]
groups_by_user = batch_get_user_groups(db_session, user_ids)
# Batch-fetch SCIM mappings to mark synced users
scim_synced_ids: set[UUID] = set()
try:
from onyx.db.models import ScimUserMapping
scim_mappings = db_session.scalars(
select(ScimUserMapping.user_id).where(ScimUserMapping.user_id.in_(user_ids))
).all()
scim_synced_ids = set(scim_mappings)
except Exception:
logger.warning(
"Failed to fetch SCIM mappings; marking all users as non-synced",
exc_info=True,
)
return PaginatedReturn(
items=[
FullUserSnapshot.from_user_model(user) for user in filtered_accepted_users
FullUserSnapshot.from_user_model(
user,
groups=[
UserGroupInfo(id=gid, name=gname)
for gid, gname in groups_by_user.get(user.id, [])
],
is_scim_synced=user.id in scim_synced_ids,
)
for user in filtered_accepted_users
],
total_items=total_accepted_users_count,
)
@router.get("/manage/users/accepted/all", tags=PUBLIC_API_TAGS)
def list_all_accepted_users(
    _: User = Depends(current_admin_user),
    db_session: Session = Depends(get_session),
) -> list[FullUserSnapshot]:
    """Return every accepted user in one unpaginated list.

    Used by the admin Users page for client-side filtering/sorting. Each
    snapshot is enriched with the user's groups and a flag telling whether a
    SCIM mapping exists for the user.
    """
    users = get_all_accepted_users(db_session=db_session)
    if not users:
        return []

    user_ids = [user.id for user in users]
    groups_by_user = batch_get_user_groups(db_session, user_ids)

    # One query for all SCIM mappings. On any failure fall back to an empty
    # set, which marks every user as non-synced instead of failing the request.
    scim_synced_ids: set[UUID] = set()
    try:
        from onyx.db.models import ScimUserMapping

        mapping_query = select(ScimUserMapping.user_id).where(
            ScimUserMapping.user_id.in_(user_ids)
        )
        scim_synced_ids = set(db_session.scalars(mapping_query).all())
    except Exception:
        logger.warning(
            "Failed to fetch SCIM mappings; marking all users as non-synced",
            exc_info=True,
        )

    snapshots: list[FullUserSnapshot] = []
    for user in users:
        group_infos = [
            UserGroupInfo(id=gid, name=gname)
            for gid, gname in groups_by_user.get(user.id, [])
        ]
        snapshots.append(
            FullUserSnapshot.from_user_model(
                user,
                groups=group_infos,
                is_scim_synced=user.id in scim_synced_ids,
            )
        )
    return snapshots
@router.get("/manage/users/counts")
def get_user_counts(
    _: User = Depends(current_admin_user),
    db_session: Session = Depends(get_session),
) -> dict[str, dict[str, int]]:
    """Admin-only endpoint exposing ``get_user_counts_by_role_and_status``.

    The nested mapping is returned exactly as produced by that helper.
    """
    counts = get_user_counts_by_role_and_status(db_session)
    return counts
@router.get("/manage/users/invited", tags=PUBLIC_API_TAGS)
def list_invited_users(
_: User = Depends(current_admin_user),
@@ -269,24 +352,10 @@ def list_all_users(
if accepted_page is None or invited_page is None or slack_users_page is None:
return AllUsersResponse(
accepted=[
FullUserSnapshot(
id=user.id,
email=user.email,
role=user.role,
is_active=user.is_active,
password_configured=user.password_configured,
)
for user in accepted_users
FullUserSnapshot.from_user_model(user) for user in accepted_users
],
slack_users=[
FullUserSnapshot(
id=user.id,
email=user.email,
role=user.role,
is_active=user.is_active,
password_configured=user.password_configured,
)
for user in slack_users
FullUserSnapshot.from_user_model(user) for user in slack_users
],
invited=[InvitedUserSnapshot(email=email) for email in invited_emails],
accepted_pages=1,
@@ -296,26 +365,10 @@ def list_all_users(
# Otherwise, return paginated results
return AllUsersResponse(
accepted=[
FullUserSnapshot(
id=user.id,
email=user.email,
role=user.role,
is_active=user.is_active,
password_configured=user.password_configured,
)
for user in accepted_users
][accepted_page * USERS_PAGE_SIZE : (accepted_page + 1) * USERS_PAGE_SIZE],
slack_users=[
FullUserSnapshot(
id=user.id,
email=user.email,
role=user.role,
is_active=user.is_active,
password_configured=user.password_configured,
)
for user in slack_users
][
accepted=[FullUserSnapshot.from_user_model(user) for user in accepted_users][
accepted_page * USERS_PAGE_SIZE : (accepted_page + 1) * USERS_PAGE_SIZE
],
slack_users=[FullUserSnapshot.from_user_model(user) for user in slack_users][
slack_users_page
* USERS_PAGE_SIZE : (slack_users_page + 1)
* USERS_PAGE_SIZE

View File

@@ -1,3 +1,4 @@
import datetime
from typing import Generic
from typing import Optional
from typing import TypeVar
@@ -31,21 +32,41 @@ class MinimalUserSnapshot(BaseModel):
email: str
class UserGroupInfo(BaseModel):
    """Minimal id/name pair describing a user group."""

    # Group identifier.
    id: int
    # Group name.
    name: str
class FullUserSnapshot(BaseModel):
id: UUID
email: str
role: UserRole
is_active: bool
password_configured: bool
personal_name: str | None
created_at: datetime.datetime
updated_at: datetime.datetime
groups: list[UserGroupInfo]
is_scim_synced: bool
@classmethod
def from_user_model(cls, user: User) -> "FullUserSnapshot":
def from_user_model(
cls,
user: User,
groups: list[UserGroupInfo] | None = None,
is_scim_synced: bool = False,
) -> "FullUserSnapshot":
return cls(
id=user.id,
email=user.email,
role=user.role,
is_active=user.is_active,
password_configured=user.password_configured,
personal_name=user.personal_name,
created_at=user.created_at,
updated_at=user.updated_at,
groups=groups or [],
is_scim_synced=is_scim_synced,
)

View File

@@ -42,6 +42,7 @@ class StreamingType(Enum):
REASONING_DONE = "reasoning_done"
CITATION_INFO = "citation_info"
TOOL_CALL_DEBUG = "tool_call_debug"
TOOL_CALL_ARGUMENT_DELTA = "tool_call_argument_delta"
MEMORY_TOOL_START = "memory_tool_start"
MEMORY_TOOL_DELTA = "memory_tool_delta"
@@ -276,6 +277,15 @@ class CustomToolDelta(BaseObj):
error: CustomToolErrorInfo | None = None
class ToolCallArgumentDelta(BaseObj):
    """Packet object tagged ``tool_call_argument_delta``."""

    # Discriminator, fixed to the TOOL_CALL_ARGUMENT_DELTA streaming type value.
    type: Literal["tool_call_argument_delta"] = (
        StreamingType.TOOL_CALL_ARGUMENT_DELTA.value
    )
    # NOTE(review): presumably identifies the tool whose arguments are being
    # streamed — confirm against the emitter.
    tool_type: str
    # NOTE(review): presumably maps argument name -> newly streamed fragment —
    # confirm against the emitter.
    argument_deltas: dict[str, Any]
################################################
# File Reader Packets
################################################
@@ -397,6 +407,7 @@ PacketObj = Union[
# Citation Packets
CitationInfo,
ToolCallDebug,
ToolCallArgumentDelta,
# Deep Research Packets
DeepResearchPlanStart,
DeepResearchPlanDelta,

View File

@@ -78,6 +78,7 @@ class Settings(BaseModel):
# User Knowledge settings
user_knowledge_enabled: bool | None = True
user_file_max_upload_size_mb: int | None = None
# Connector settings
show_extra_connectors: bool | None = True

View File

@@ -3,6 +3,7 @@ from onyx.configs.app_configs import DISABLE_USER_KNOWLEDGE
from onyx.configs.app_configs import ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
from onyx.configs.app_configs import ONYX_QUERY_HISTORY_TYPE
from onyx.configs.app_configs import SHOW_EXTRA_CONNECTORS
from onyx.configs.app_configs import USER_FILE_MAX_UPLOAD_SIZE_MB
from onyx.configs.constants import KV_SETTINGS_KEY
from onyx.configs.constants import OnyxRedisLocks
from onyx.key_value_store.factory import get_kv_store
@@ -50,6 +51,7 @@ def load_settings() -> Settings:
if DISABLE_USER_KNOWLEDGE:
settings.user_knowledge_enabled = False
settings.user_file_max_upload_size_mb = USER_FILE_MAX_UPLOAD_SIZE_MB
settings.show_extra_connectors = SHOW_EXTRA_CONNECTORS
settings.opensearch_indexing_enabled = ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
return settings

View File

@@ -56,3 +56,23 @@ def get_built_in_tool_ids() -> list[str]:
def get_built_in_tool_by_id(in_code_tool_id: str) -> Type[BUILT_IN_TOOL_TYPES]:
    """Look up a built-in tool class in ``BUILT_IN_TOOL_MAP`` by its in-code ID.

    Raises:
        KeyError: If ``in_code_tool_id`` is not a key of the map.
    """
    tool_cls = BUILT_IN_TOOL_MAP[in_code_tool_id]
    return tool_cls
def _build_tool_name_to_class() -> dict[str, Type[BUILT_IN_TOOL_TYPES]]:
    """Build a mapping from LLM-facing tool name to tool class.

    Only a ``name`` defined directly on the class (in its own ``__dict__``) is
    considered: either a plain string attribute, or a property whose getter is
    invoked with the class itself standing in for the instance.

    Raises:
        ValueError: If a built-in tool class defines neither form of ``name``.
    """
    name_to_class: dict[str, Type[BUILT_IN_TOOL_TYPES]] = {}
    for tool_cls in BUILT_IN_TOOL_MAP.values():
        declared = tool_cls.__dict__.get("name")
        if isinstance(declared, str):
            llm_name = declared
        elif isinstance(declared, property) and declared.fget is not None:
            # No instance exists here, so the class is passed as ``self``.
            llm_name = declared.fget(tool_cls)
        else:
            raise ValueError(
                f"Built-in tool {tool_cls.__name__} must define a valid LLM-facing tool name"
            )
        name_to_class[llm_name] = tool_cls
    return name_to_class
TOOL_NAME_TO_CLASS: dict[str, Type[BUILT_IN_TOOL_TYPES]] = _build_tool_name_to_class()

View File

@@ -92,3 +92,7 @@ class Tool(abc.ABC, Generic[TOverride]):
**llm_kwargs: Any,
) -> ToolResponse:
raise NotImplementedError
@classmethod
def should_emit_argument_deltas(cls) -> bool:
    """Report whether this tool class opts in to argument-delta emission.

    This implementation always returns False.
    NOTE(review): presumably consulted before streaming tool-call argument
    deltas — confirm against the caller.
    """
    return False

View File

@@ -376,3 +376,8 @@ class PythonTool(Tool[PythonToolOverrideKwargs]):
rich_response=None,
llm_facing_response=llm_response,
)
@classmethod
@override
def should_emit_argument_deltas(cls) -> bool:
    """Opt this tool in to argument-delta emission by returning True."""
    return True

View File

@@ -11,16 +11,20 @@ logger = setup_logger()
# IMPORTANT DO NOT DELETE, THIS IS USED BY fetch_versioned_implementation
def _encrypt_string(input_str: str) -> bytes:
def _encrypt_string(input_str: str, key: str | None = None) -> bytes: # noqa: ARG001
if ENCRYPTION_KEY_SECRET:
logger.warning("MIT version of Onyx does not support encryption of secrets.")
elif key is not None:
logger.debug("MIT encrypt called with explicit key — key ignored.")
return input_str.encode()
# IMPORTANT DO NOT DELETE, THIS IS USED BY fetch_versioned_implementation
def _decrypt_bytes(input_bytes: bytes) -> str:
# No need to double warn. If you wish to learn more about encryption features
# refer to the Onyx EE code
def _decrypt_bytes(input_bytes: bytes, key: str | None = None) -> str: # noqa: ARG001
if ENCRYPTION_KEY_SECRET:
logger.warning("MIT version of Onyx does not support decryption of secrets.")
elif key is not None:
logger.debug("MIT decrypt called with explicit key — key ignored.")
return input_bytes.decode()
@@ -86,15 +90,15 @@ def _mask_list(items: list[Any]) -> list[Any]:
return masked
def encrypt_string_to_bytes(intput_str: str) -> bytes:
def encrypt_string_to_bytes(intput_str: str, key: str | None = None) -> bytes:
versioned_encryption_fn = fetch_versioned_implementation(
"onyx.utils.encryption", "_encrypt_string"
)
return versioned_encryption_fn(intput_str)
return versioned_encryption_fn(intput_str, key=key)
def decrypt_bytes_to_string(intput_bytes: bytes) -> str:
def decrypt_bytes_to_string(intput_bytes: bytes, key: str | None = None) -> str:
versioned_decryption_fn = fetch_versioned_implementation(
"onyx.utils.encryption", "_decrypt_bytes"
)
return versioned_decryption_fn(intput_bytes)
return versioned_decryption_fn(intput_bytes, key=key)

View File

@@ -0,0 +1,13 @@
from typing import Any
from pydantic import BaseModel
def shallow_model_dump(model_instance: BaseModel) -> dict[str, Any]:
    """Collect a model's declared fields into a dict without copying values.

    Unlike model_dump(), each value is the very object held by the instance
    (a reference, not a deep copy). Pair with model_construct() to build
    subclass instances without duplicating field data in memory.
    """
    field_values: dict[str, Any] = {}
    # Field names come from the class; values are read from the instance.
    for field_name in model_instance.__class__.model_fields:
        field_values[field_name] = getattr(model_instance, field_name)
    return field_values

View File

@@ -128,6 +128,8 @@ class SensitiveValue(Generic[T]):
value = self._decrypt()
if not apply_mask:
# Callers must not mutate the returned dict — doing so would
# desync the cache from the encrypted bytes and the DB.
return value
# Apply masking
@@ -174,18 +176,20 @@ class SensitiveValue(Generic[T]):
)
def __eq__(self, other: Any) -> bool:
"""Prevent direct comparison which might expose value."""
if isinstance(other, SensitiveValue):
# Compare encrypted bytes for equality check
return self._encrypted_bytes == other._encrypted_bytes
raise SensitiveAccessError(
"Cannot compare SensitiveValue with non-SensitiveValue. "
"Use .get_value(apply_mask=True/False) to access the value for comparison."
)
"""Compare SensitiveValues by their decrypted content."""
# NOTE: if you attempt to compare a string/dict to a SensitiveValue,
# this comparison will return NotImplemented, which then evaluates to False.
# This is the convention and required for SQLAlchemy's attribute tracking.
if not isinstance(other, SensitiveValue):
return NotImplemented
return self._decrypt() == other._decrypt()
def __hash__(self) -> int:
"""Allow hashing based on encrypted bytes."""
return hash(self._encrypted_bytes)
"""Hash based on decrypted content."""
value = self._decrypt()
if isinstance(value, dict):
return hash(json.dumps(value, sort_keys=True))
return hash(value)
# Prevent JSON serialization
def __json__(self) -> Any:

View File

@@ -2,7 +2,6 @@ import contextvars
import threading
import uuid
from enum import Enum
from typing import cast
import requests
@@ -15,6 +14,7 @@ from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.models import User
from onyx.key_value_store.factory import get_kv_store
from onyx.key_value_store.interface import KvKeyNotFoundError
from onyx.key_value_store.interface import unwrap_str
from onyx.utils.logger import setup_logger
from onyx.utils.variable_functionality import (
fetch_versioned_implementation_with_fallback,
@@ -25,6 +25,7 @@ from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()
_DANSWER_TELEMETRY_ENDPOINT = "https://telemetry.onyx.app/anonymous_telemetry"
_CACHED_UUID: str | None = None
_CACHED_INSTANCE_DOMAIN: str | None = None
@@ -62,10 +63,10 @@ def get_or_generate_uuid() -> str:
kv_store = get_kv_store()
try:
_CACHED_UUID = cast(str, kv_store.load(KV_CUSTOMER_UUID_KEY))
_CACHED_UUID = unwrap_str(kv_store.load(KV_CUSTOMER_UUID_KEY))
except KvKeyNotFoundError:
_CACHED_UUID = str(uuid.uuid4())
kv_store.store(KV_CUSTOMER_UUID_KEY, _CACHED_UUID, encrypt=True)
kv_store.store(KV_CUSTOMER_UUID_KEY, {"value": _CACHED_UUID}, encrypt=True)
return _CACHED_UUID
@@ -79,14 +80,16 @@ def _get_or_generate_instance_domain() -> str | None: #
kv_store = get_kv_store()
try:
_CACHED_INSTANCE_DOMAIN = cast(str, kv_store.load(KV_INSTANCE_DOMAIN_KEY))
_CACHED_INSTANCE_DOMAIN = unwrap_str(kv_store.load(KV_INSTANCE_DOMAIN_KEY))
except KvKeyNotFoundError:
with get_session_with_current_tenant() as db_session:
first_user = db_session.query(User).first()
if first_user:
_CACHED_INSTANCE_DOMAIN = first_user.email.split("@")[-1]
kv_store.store(
KV_INSTANCE_DOMAIN_KEY, _CACHED_INSTANCE_DOMAIN, encrypt=True
KV_INSTANCE_DOMAIN_KEY,
{"value": _CACHED_INSTANCE_DOMAIN},
encrypt=True,
)
return _CACHED_INSTANCE_DOMAIN

View File

@@ -750,7 +750,7 @@ pypandoc-binary==1.16.2
# via onyx
pyparsing==3.2.5
# via httplib2
pypdf==6.7.5
pypdf==6.8.0
# via
# onyx
# unstructured-client
@@ -1020,7 +1020,7 @@ toolz==1.1.0
# dask
# distributed
# partd
tornado==6.5.2
tornado==6.5.5
# via distributed
tqdm==4.67.1
# via

View File

@@ -263,7 +263,7 @@ oauthlib==3.2.2
# via
# kubernetes
# requests-oauthlib
onyx-devtools==0.6.3
onyx-devtools==0.7.0
# via onyx
openai==2.14.0
# via
@@ -466,7 +466,7 @@ tokenizers==0.21.4
# via
# cohere
# litellm
tornado==6.5.2
tornado==6.5.5
# via
# ipykernel
# jupyter-client

View File

@@ -1,48 +1,93 @@
"""Decrypt a raw hex-encoded credential value.
Usage:
python -m scripts.decrypt <hex_value>
python -m scripts.decrypt <hex_value> --key "my-encryption-key"
python -m scripts.decrypt <hex_value> --key ""
Pass --key "" to skip decryption and just decode the raw bytes as UTF-8.
Omit --key to use the current ENCRYPTION_KEY_SECRET from the environment.
"""
import argparse
import binascii
import json
import os
import sys
from onyx.utils.encryption import decrypt_bytes_to_string
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
from onyx.utils.encryption import decrypt_bytes_to_string # noqa: E402
from onyx.utils.variable_functionality import global_version # noqa: E402
def decrypt_raw_credential(encrypted_value: str) -> None:
"""Decrypt and display a raw encrypted credential value
def decrypt_raw_credential(encrypted_value: str, key: str | None = None) -> None:
"""Decrypt and display a raw encrypted credential value.
Args:
encrypted_value: The hex encoded encrypted credential value
encrypted_value: The hex-encoded encrypted credential value.
key: Encryption key to use. None means use ENCRYPTION_KEY_SECRET,
empty string means just decode as UTF-8.
"""
# Strip common hex prefixes
if encrypted_value.startswith("\\x"):
encrypted_value = encrypted_value[2:]
elif encrypted_value.startswith("x"):
encrypted_value = encrypted_value[1:]
print(encrypted_value)
try:
# If string starts with 'x', remove it as it's just a prefix indicating hex
if encrypted_value.startswith("x"):
encrypted_value = encrypted_value[1:]
elif encrypted_value.startswith("\\x"):
encrypted_value = encrypted_value[2:]
# Convert hex string to bytes
encrypted_bytes = binascii.unhexlify(encrypted_value)
# Decrypt the bytes
decrypted_str = decrypt_bytes_to_string(encrypted_bytes)
# Parse and pretty print the decrypted JSON
decrypted_json = json.loads(decrypted_str)
print("Decrypted credential value:")
print(json.dumps(decrypted_json, indent=2))
raw_bytes = binascii.unhexlify(encrypted_value)
except binascii.Error:
print("Error: Invalid hex encoded string")
print("Error: Invalid hex-encoded string")
sys.exit(1)
except json.JSONDecodeError as e:
print(f"Decrypted raw value (not JSON): {e}")
if key == "":
# Empty key → just decode as UTF-8, no decryption
try:
decrypted_str = raw_bytes.decode("utf-8")
except UnicodeDecodeError as e:
print(f"Error decoding bytes as UTF-8: {e}")
sys.exit(1)
else:
print(key)
try:
decrypted_str = decrypt_bytes_to_string(raw_bytes, key=key)
except Exception as e:
print(f"Error decrypting value: {e}")
sys.exit(1)
except Exception as e:
print(f"Error decrypting value: {e}")
# Try to pretty-print as JSON, otherwise print raw
try:
parsed = json.loads(decrypted_str)
print(json.dumps(parsed, indent=2))
except json.JSONDecodeError:
print(decrypted_str)
def main() -> None:
    """CLI entry point: parse the arguments and decrypt the given hex value.

    The EE version flag is set for the duration of the decrypt call and unset
    afterwards.
    """
    arg_parser = argparse.ArgumentParser(
        description="Decrypt a hex-encoded credential value."
    )
    arg_parser.add_argument("value", help="Hex-encoded encrypted value to decrypt.")

    key_help = (
        "Encryption key. Omit to use ENCRYPTION_KEY_SECRET from env. "
        'Pass "" (empty) to just decode as UTF-8 without decryption.'
    )
    arg_parser.add_argument("--key", default=None, help=key_help)

    cli_args = arg_parser.parse_args()

    global_version.set_ee()
    decrypt_raw_credential(cli_args.value, key=cli_args.key)
    global_version.unset_ee()
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python decrypt.py <hex_encoded_encrypted_value>")
sys.exit(1)
encrypted_value = sys.argv[1]
decrypt_raw_credential(encrypted_value)
main()

View File

@@ -0,0 +1,107 @@
"""Re-encrypt secrets under the current ENCRYPTION_KEY_SECRET.
Decrypts all encrypted columns using the old key (or raw decode if the old key
is empty), then re-encrypts them with the current ENCRYPTION_KEY_SECRET.
Usage (docker):
docker exec -it onyx-api_server-1 \
python -m scripts.reencrypt_secrets --old-key "previous-key"
Usage (kubernetes):
kubectl exec -it <pod> -- \
python -m scripts.reencrypt_secrets --old-key "previous-key"
Omit --old-key (or pass "") if secrets were not previously encrypted.
For multi-tenant deployments, pass --tenant-id to target a specific tenant,
or --all-tenants to iterate every tenant.
"""
import argparse
import os
import sys
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
from onyx.db.rotate_encryption_key import rotate_encryption_key # noqa: E402
from onyx.db.engine.sql_engine import get_session_with_tenant # noqa: E402
from onyx.db.engine.sql_engine import SqlEngine # noqa: E402
from onyx.db.engine.tenant_utils import get_all_tenant_ids # noqa: E402
from onyx.utils.variable_functionality import global_version # noqa: E402
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA # noqa: E402
def _run_for_tenant(tenant_id: str, old_key: str | None, dry_run: bool = False) -> None:
    """Run the key rotation for one tenant and print a per-column summary.

    Args:
        tenant_id: Tenant whose session is opened for the rotation.
        old_key: Forwarded to ``rotate_encryption_key`` as ``old_key``.
        dry_run: Forwarded to ``rotate_encryption_key``; also switches the
            summary wording to "would be re-encrypted".
    """
    print(f"Re-encrypting secrets for tenant: {tenant_id}")
    with get_session_with_tenant(tenant_id=tenant_id) as db_session:
        results = rotate_encryption_key(db_session, old_key=old_key, dry_run=dry_run)

    if not results:
        print("No rows needed re-encryption.")
        return

    for col, count in results.items():
        print(
            f" {col}: {count} row(s) {'would be ' if dry_run else ''}re-encrypted"
        )
def main() -> None:
    """Command-line entry point for re-encrypting secrets.

    Flags:
        --old-key      previous encryption key; empty string behaves like omitted.
        --dry-run      report what would be re-encrypted without changing anything.
        --tenant-id    run against a single tenant schema.
        --all-tenants  run against every tenant (mutually exclusive with
                       --tenant-id).

    With --all-tenants, a failure for one tenant is printed and the remaining
    tenants are still processed; the process exits with status 1 if any
    tenant failed.
    """
    parser = argparse.ArgumentParser(
        description="Re-encrypt secrets under the current encryption key."
    )
    parser.add_argument(
        "--old-key",
        default=None,
        help="Previous encryption key. Omit or pass empty string if not applicable.",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be re-encrypted without making changes.",
    )

    tenant_group = parser.add_mutually_exclusive_group()
    tenant_group.add_argument(
        "--tenant-id",
        default=None,
        help="Target a specific tenant schema.",
    )
    tenant_group.add_argument(
        "--all-tenants",
        action="store_true",
        help="Iterate all tenants.",
    )

    args = parser.parse_args()
    dry_run = args.dry_run

    # An empty --old-key is normalised to None.
    old_key = args.old_key or None

    global_version.set_ee()
    SqlEngine.init_engine(pool_size=5, max_overflow=2)

    if dry_run:
        print("DRY RUN — no changes will be made")

    if not args.all_tenants:
        # Single-tenant mode: fall back to the default schema when no id given.
        tenant_id = args.tenant_id or POSTGRES_DEFAULT_SCHEMA
        _run_for_tenant(tenant_id, old_key, dry_run=dry_run)
    else:
        tenant_ids = get_all_tenant_ids()
        print(f"Found {len(tenant_ids)} tenant(s)")

        failed_tenants: list[str] = []
        for tid in tenant_ids:
            try:
                _run_for_tenant(tid, old_key, dry_run=dry_run)
            except Exception as e:
                # Keep going so one bad tenant does not block the rest.
                print(f" ERROR for tenant {tid}: {e}")
                failed_tenants.append(tid)

        if failed_tenants:
            print(f"FAILED tenants ({len(failed_tenants)}): {failed_tenants}")
            sys.exit(1)

    print("Done.")
# Run the CLI only when this module is executed as a script
# (e.g. `python -m scripts.reencrypt_secrets`), not when it is imported.
if __name__ == "__main__":
    main()

View File

@@ -19,7 +19,7 @@ from fastapi.testclient import TestClient
from onyx.auth.users import current_admin_user
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import UserRole
from onyx.main import fetch_versioned_implementation
from onyx.main import get_application
from onyx.utils.logger import setup_logger
logger = setup_logger()
@@ -51,11 +51,8 @@ def client() -> Generator[TestClient, None, None]:
# Patch out prometheus metrics setup to avoid "Duplicated timeseries in
# CollectorRegistry" errors when multiple tests each create a new app
# (prometheus registers metrics globally and rejects duplicate names).
get_app = fetch_versioned_implementation(
module="onyx.main", attribute="get_application"
)
with patch("onyx.main.setup_prometheus_metrics"):
app: FastAPI = get_app(lifespan_override=test_lifespan)
app: FastAPI = get_application(lifespan_override=test_lifespan)
# Override the database session dependency with a mock
# (these tests don't actually need DB access)

View File

@@ -48,7 +48,7 @@ def test_gitlab_connector_basic(gitlab_connector: GitlabConnector) -> None:
# --- Specific Document Details to Validate ---
target_mr_id = f"https://{gitlab_base_url}/{project_path}/-/merge_requests/1"
target_issue_id = f"https://{gitlab_base_url}/{project_path}/-/issues/2"
target_issue_id = f"https://{gitlab_base_url}/{project_path}/-/work_items/2"
target_code_file_semantic_id = "README.md"
# ---

View File

@@ -7,6 +7,8 @@ Verifies that:
3. Upserting is idempotent (running twice doesn't duplicate nodes)
4. Document-to-hierarchy-node linkage is updated during pruning
5. link_hierarchy_nodes_to_documents links nodes that are also documents
6. HierarchyNodeByConnectorCredentialPair join table population and pruning
7. Orphaned hierarchy node deletion and re-parenting
Uses a mock SlimConnectorWithPermSync that yields known hierarchy nodes and slim documents,
combined with a real PostgreSQL database for verifying persistence.
@@ -24,16 +26,27 @@ from onyx.connectors.interfaces import GenerateSlimDocumentOutput
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnectorWithPermSync
from onyx.connectors.models import HierarchyNode as PydanticHierarchyNode
from onyx.connectors.models import InputType
from onyx.connectors.models import SlimDocument
from onyx.db.enums import AccessType
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.enums import HierarchyNodeType
from onyx.db.hierarchy import delete_orphaned_hierarchy_nodes
from onyx.db.hierarchy import ensure_source_node_exists
from onyx.db.hierarchy import get_all_hierarchy_nodes_for_source
from onyx.db.hierarchy import get_hierarchy_node_by_raw_id
from onyx.db.hierarchy import link_hierarchy_nodes_to_documents
from onyx.db.hierarchy import remove_stale_hierarchy_node_cc_pair_entries
from onyx.db.hierarchy import reparent_orphaned_hierarchy_nodes
from onyx.db.hierarchy import update_document_parent_hierarchy_nodes
from onyx.db.hierarchy import upsert_hierarchy_node_cc_pair_entries
from onyx.db.hierarchy import upsert_hierarchy_nodes_batch
from onyx.db.models import Connector
from onyx.db.models import ConnectorCredentialPair
from onyx.db.models import Credential
from onyx.db.models import Document as DbDocument
from onyx.db.models import HierarchyNode as DBHierarchyNode
from onyx.db.models import HierarchyNodeByConnectorCredentialPair
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.kg.models import KGStage
@@ -142,13 +155,80 @@ class MockSlimConnectorWithPermSync(SlimConnectorWithPermSync):
# ---------------------------------------------------------------------------
def _create_cc_pair(
    db_session: Session,
    source: DocumentSource = TEST_SOURCE,
) -> ConnectorCredentialPair:
    """Persist a Connector, a Credential and the ConnectorCredentialPair linking them.

    All three rows are created for ``source``; the connector and cc_pair names
    start with "Test ". The session is committed and the refreshed cc_pair is
    returned.
    """
    new_connector = Connector(
        name=f"Test {source.value} Connector",
        source=source,
        input_type=InputType.LOAD_STATE,
        connector_specific_config={},
    )
    db_session.add(new_connector)
    # Flush so the connector row is written before its id is read below.
    db_session.flush()

    new_credential = Credential(
        source=source,
        credential_json={},
        admin_public=True,
    )
    db_session.add(new_credential)
    db_session.flush()
    db_session.expire(new_credential)

    pair = ConnectorCredentialPair(
        connector_id=new_connector.id,
        credential_id=new_credential.id,
        name=f"Test {source.value} CC Pair",
        status=ConnectorCredentialPairStatus.ACTIVE,
        access_type=AccessType.PUBLIC,
    )
    db_session.add(pair)
    db_session.commit()
    db_session.refresh(pair)
    return pair
def _cleanup_test_data(db_session: Session) -> None:
    """Remove all test hierarchy nodes and documents to isolate tests.

    Also removes the join-table rows, cc_pairs, connectors and credentials
    that belong to test connectors (source == TEST_SOURCE, name starting with
    "Test "), then commits.
    """
    for doc_id in SLIM_DOC_IDS:
        db_session.query(DbDocument).filter(DbDocument.id == doc_id).delete()

    # Query for the ids of test connectors; passed to in_() in the filters below.
    test_connector_ids_q = db_session.query(Connector.id).filter(
        Connector.source == TEST_SOURCE,
        Connector.name.like("Test %"),
    )

    # Join-table rows for test connectors are deleted before the hierarchy
    # nodes and cc_pairs they point at.
    db_session.query(HierarchyNodeByConnectorCredentialPair).filter(
        HierarchyNodeByConnectorCredentialPair.connector_id.in_(test_connector_ids_q)
    ).delete(synchronize_session="fetch")
    db_session.query(DBHierarchyNode).filter(
        DBHierarchyNode.source == TEST_SOURCE
    ).delete()
    db_session.flush()

    # Collect credential IDs before deleting cc_pairs (bulk query.delete()
    # bypasses ORM-level cascade, so credentials won't be auto-removed).
    credential_ids = [
        row[0]
        for row in db_session.query(ConnectorCredentialPair.credential_id)
        .filter(ConnectorCredentialPair.connector_id.in_(test_connector_ids_q))
        .all()
    ]
    db_session.query(ConnectorCredentialPair).filter(
        ConnectorCredentialPair.connector_id.in_(test_connector_ids_q)
    ).delete(synchronize_session="fetch")
    db_session.query(Connector).filter(
        Connector.source == TEST_SOURCE,
        Connector.name.like("Test %"),
    ).delete(synchronize_session="fetch")

    # Credentials go last, using the ids captured above.
    if credential_ids:
        db_session.query(Credential).filter(Credential.id.in_(credential_ids)).delete(
            synchronize_session="fetch"
        )
    db_session.commit()
@@ -179,15 +259,8 @@ def test_pruning_extracts_hierarchy_nodes(db_session: Session) -> None: # noqa:
result = extract_ids_from_runnable_connector(connector, callback=None)
# Doc IDs should include both slim doc IDs and hierarchy node raw_node_ids
# (hierarchy node IDs are added to raw_id_to_parent so they aren't pruned)
expected_ids = {
CHANNEL_A_ID,
CHANNEL_B_ID,
CHANNEL_C_ID,
*SLIM_DOC_IDS,
}
assert result.raw_id_to_parent.keys() == expected_ids
# raw_id_to_parent should contain ONLY document IDs, not hierarchy node IDs
assert result.raw_id_to_parent.keys() == set(SLIM_DOC_IDS)
# Hierarchy nodes should be the 3 channels
assert len(result.hierarchy_nodes) == 3
@@ -395,9 +468,9 @@ def test_extraction_preserves_parent_hierarchy_raw_node_id(
result.raw_id_to_parent[doc_id] == expected_parent
), f"raw_id_to_parent[{doc_id}] should be {expected_parent}"
# Hierarchy node entries have None parent (they aren't documents)
# Hierarchy node IDs should NOT be in raw_id_to_parent
for channel_id in [CHANNEL_A_ID, CHANNEL_B_ID, CHANNEL_C_ID]:
assert result.raw_id_to_parent[channel_id] is None
assert channel_id not in result.raw_id_to_parent
def test_update_document_parent_hierarchy_nodes(db_session: Session) -> None:
@@ -565,3 +638,241 @@ def test_link_hierarchy_nodes_skips_non_hierarchy_sources(
commit=False,
)
assert linked == 0
# ---------------------------------------------------------------------------
# Join table + pruning tests
# ---------------------------------------------------------------------------
def test_upsert_hierarchy_node_cc_pair_entries(db_session: Session) -> None:
    """Upserting join-table entries inserts one row per node and is idempotent."""
    _cleanup_test_data(db_session)
    ensure_source_node_exists(db_session, TEST_SOURCE, commit=True)
    cc_pair = _create_cc_pair(db_session)

    upserted = upsert_hierarchy_nodes_batch(
        db_session=db_session,
        nodes=_make_hierarchy_nodes(),
        source=TEST_SOURCE,
        commit=True,
        is_connector_public=False,
    )
    node_ids = [node.id for node in upserted]

    def _join_rows() -> list[HierarchyNodeByConnectorCredentialPair]:
        """Fetch every join-table row belonging to ``cc_pair``."""
        return (
            db_session.query(HierarchyNodeByConnectorCredentialPair)
            .filter(
                HierarchyNodeByConnectorCredentialPair.connector_id
                == cc_pair.connector_id,
                HierarchyNodeByConnectorCredentialPair.credential_id
                == cc_pair.credential_id,
            )
            .all()
        )

    # The first pass inserts the rows; the second pass must leave the count
    # unchanged.
    for _attempt in range(2):
        upsert_hierarchy_node_cc_pair_entries(
            db_session=db_session,
            hierarchy_node_ids=node_ids,
            connector_id=cc_pair.connector_id,
            credential_id=cc_pair.credential_id,
            commit=True,
        )
        assert len(_join_rows()) == 3
def test_remove_stale_entries_and_delete_orphans(db_session: Session) -> None:
    """Stale join-table rows are removed, the nodes they orphan are deleted,
    and the SOURCE node is left in place."""
    _cleanup_test_data(db_session)
    source_node = ensure_source_node_exists(db_session, TEST_SOURCE, commit=True)
    cc_pair = _create_cc_pair(db_session)

    created_nodes = upsert_hierarchy_nodes_batch(
        db_session=db_session,
        nodes=_make_hierarchy_nodes(),
        source=TEST_SOURCE,
        commit=True,
        is_connector_public=False,
    )
    upsert_hierarchy_node_cc_pair_entries(
        db_session=db_session,
        hierarchy_node_ids=[node.id for node in created_nodes],
        connector_id=cc_pair.connector_id,
        credential_id=cc_pair.credential_id,
        commit=True,
    )

    # Pretend a pruning run found only channel A still alive.
    surviving = get_hierarchy_node_by_raw_id(db_session, CHANNEL_A_ID, TEST_SOURCE)
    assert surviving is not None

    removed_count = remove_stale_hierarchy_node_cc_pair_entries(
        db_session=db_session,
        connector_id=cc_pair.connector_id,
        credential_id=cc_pair.credential_id,
        live_hierarchy_node_ids={surviving.id},
        commit=True,
    )
    assert removed_count == 2

    # Channels B and C no longer have any cc_pair entry, so they get deleted.
    deleted_raw_ids = delete_orphaned_hierarchy_nodes(
        db_session=db_session,
        source=TEST_SOURCE,
        commit=True,
    )
    assert set(deleted_raw_ids) == {CHANNEL_B_ID, CHANNEL_C_ID}

    # What is left: channel A plus the SOURCE node.
    leftover_raw_ids = {
        node.raw_node_id
        for node in get_all_hierarchy_nodes_for_source(db_session, TEST_SOURCE)
    }
    assert leftover_raw_ids == {CHANNEL_A_ID, source_node.raw_node_id}
def test_multi_cc_pair_prevents_premature_deletion(db_session: Session) -> None:
    """Nodes referenced by two cc_pairs survive when only one of them drops
    its association."""
    _cleanup_test_data(db_session)
    ensure_source_node_exists(db_session, TEST_SOURCE, commit=True)
    cc_pair_1 = _create_cc_pair(db_session)
    cc_pair_2 = _create_cc_pair(db_session)

    created_nodes = upsert_hierarchy_nodes_batch(
        db_session=db_session,
        nodes=_make_hierarchy_nodes(),
        source=TEST_SOURCE,
        commit=True,
        is_connector_public=False,
    )
    all_ids = [node.id for node in created_nodes]

    # Both cc_pairs own all three nodes (cc_pair 1 first, then cc_pair 2).
    for owner in (cc_pair_1, cc_pair_2):
        upsert_hierarchy_node_cc_pair_entries(
            db_session=db_session,
            hierarchy_node_ids=all_ids,
            connector_id=owner.connector_id,
            credential_id=owner.credential_id,
            commit=True,
        )

    # cc_pair 1 prunes and keeps nothing.
    remove_stale_hierarchy_node_cc_pair_entries(
        db_session=db_session,
        connector_id=cc_pair_1.connector_id,
        credential_id=cc_pair_1.credential_id,
        live_hierarchy_node_ids=set(),
        commit=True,
    )

    # cc_pair 2 still references every node, so nothing counts as orphaned.
    deleted = delete_orphaned_hierarchy_nodes(
        db_session=db_session,
        source=TEST_SOURCE,
        commit=True,
    )
    assert deleted == []

    # Three channel nodes plus SOURCE are still present.
    still_present = get_all_hierarchy_nodes_for_source(db_session, TEST_SOURCE)
    assert len(still_present) == 4
def test_reparent_orphaned_children(db_session: Session) -> None:
    """Children of a deleted parent node end up attached to the SOURCE node."""
    _cleanup_test_data(db_session)
    source_node = ensure_source_node_exists(db_session, TEST_SOURCE, commit=True)
    cc_pair = _create_cc_pair(db_session)

    # A two-level hierarchy: PARENT -> CHILD.
    hierarchy = [
        PydanticHierarchyNode(
            raw_node_id="PARENT",
            raw_parent_id=None,
            display_name="Parent",
            node_type=HierarchyNodeType.CHANNEL,
        ),
        PydanticHierarchyNode(
            raw_node_id="CHILD",
            raw_parent_id="PARENT",
            display_name="Child",
            node_type=HierarchyNodeType.CHANNEL,
        ),
    ]
    created_nodes = upsert_hierarchy_nodes_batch(
        db_session=db_session,
        nodes=hierarchy,
        source=TEST_SOURCE,
        commit=True,
        is_connector_public=False,
    )
    assert len(created_nodes) == 2

    parent_row = get_hierarchy_node_by_raw_id(db_session, "PARENT", TEST_SOURCE)
    child_row = get_hierarchy_node_by_raw_id(db_session, "CHILD", TEST_SOURCE)
    assert parent_row is not None
    assert child_row is not None
    assert child_row.parent_id == parent_row.id

    # Only the child gets a cc_pair entry, which leaves the parent orphaned.
    upsert_hierarchy_node_cc_pair_entries(
        db_session=db_session,
        hierarchy_node_ids=[child_row.id],
        connector_id=cc_pair.connector_id,
        credential_id=cc_pair.credential_id,
        commit=True,
    )

    deleted_raw_ids = delete_orphaned_hierarchy_nodes(
        db_session=db_session,
        source=TEST_SOURCE,
        commit=True,
    )
    assert "PARENT" in deleted_raw_ids

    # After the parent is gone the child's parent_id is NULL (SET NULL cascade).
    db_session.expire_all()
    child_row = get_hierarchy_node_by_raw_id(db_session, "CHILD", TEST_SOURCE)
    assert child_row is not None
    assert child_row.parent_id is None

    # Re-parenting attaches the dangling child to SOURCE.
    reparented = reparent_orphaned_hierarchy_nodes(
        db_session=db_session,
        source=TEST_SOURCE,
        commit=True,
    )
    assert len(reparented) == 1

    db_session.expire_all()
    child_row = get_hierarchy_node_by_raw_id(db_session, "CHILD", TEST_SOURCE)
    assert child_row is not None
    assert child_row.parent_id == source_node.id

View File

@@ -0,0 +1,90 @@
"""Test that Credential with nested JSON round-trips through SensitiveValue correctly.
Exercises the full encrypt → store → read → decrypt → SensitiveValue path
with realistic nested OAuth credential data, and verifies SQLAlchemy dirty
tracking works with nested dict comparison.
Requires a running Postgres instance.
"""
from sqlalchemy.orm import Session
from onyx.configs.constants import DocumentSource
from onyx.db.models import Credential
from onyx.utils.sensitive import SensitiveValue
# NOTE: this is not the real shape of a Drive credential,
# but it is intended to test nested JSON credential handling
_NESTED_CRED_JSON = {
"oauth_tokens": {
"access_token": "ya29.abc123",
"refresh_token": "1//xEg-def456",
},
"scopes": ["read", "write", "admin"],
"client_config": {
"client_id": "123.apps.googleusercontent.com",
"client_secret": "GOCSPX-secret",
},
}
def test_nested_credential_json_round_trip(db_session: Session) -> None:
    """A nested credential dict comes back intact both before and after a DB reload."""
    cred = Credential(
        source=DocumentSource.GOOGLE_DRIVE,
        credential_json=_NESTED_CRED_JSON,
    )
    db_session.add(cred)
    db_session.flush()

    # Read straight after assignment (no DB round-trip): the attribute must
    # already be wrapped in a SensitiveValue.
    assert isinstance(cred.credential_json, SensitiveValue)
    unmasked_before = cred.credential_json.get_value(apply_mask=False)
    assert unmasked_before == _NESTED_CRED_JSON

    # Expire the instance so the next attribute access reloads from the DB.
    db_session.expire(cred)
    reloaded = cred.credential_json
    assert isinstance(reloaded, SensitiveValue)
    unmasked_after = reloaded.get_value(apply_mask=False)
    assert unmasked_after == _NESTED_CRED_JSON

    db_session.rollback()
def test_reassign_same_nested_json_not_dirty(db_session: Session) -> None:
    """Assigning an identical nested dict must leave the instance unmodified."""
    cred = Credential(
        source=DocumentSource.GOOGLE_DRIVE,
        credential_json=_NESTED_CRED_JSON,
    )
    db_session.add(cred)
    db_session.flush()

    # Drop the state left over from the insert, then touch the attribute so
    # it is reloaded.
    db_session.expire(cred)
    _ = cred.credential_json

    # Same value again.
    cred.credential_json = _NESTED_CRED_JSON  # type: ignore[assignment]
    is_dirty = db_session.is_modified(cred)
    assert not is_dirty

    db_session.rollback()
def test_assign_different_nested_json_is_dirty(db_session: Session) -> None:
    """Assigning a nested dict with different content must flag the instance as modified."""
    cred = Credential(
        source=DocumentSource.GOOGLE_DRIVE,
        credential_json=_NESTED_CRED_JSON,
    )
    db_session.add(cred)
    db_session.flush()

    db_session.expire(cred)
    _ = cred.credential_json  # touch the attribute so it is reloaded

    # Same structure, but with a shorter "scopes" list.
    changed_json = {**_NESTED_CRED_JSON, "scopes": ["read"]}
    cred.credential_json = changed_json  # type: ignore[assignment]
    is_dirty = db_session.is_modified(cred)
    assert is_dirty

    db_session.rollback()

View File

@@ -0,0 +1,305 @@
"""Tests for rotate_encryption_key against real Postgres.
Uses real ORM models (Credential, InternetSearchProvider) and the actual
Postgres database. Discovery is mocked in rotation tests to scope mutations
to only the test rows — the real _discover_encrypted_columns walk is tested
separately in TestDiscoverEncryptedColumns.
Requires a running Postgres instance. Run with::
python -m dotenv -f .vscode/.env run -- pytest tests/external_dependency_unit/db/test_rotate_encryption_key.py
"""
import json
from collections.abc import Generator
from unittest.mock import patch
import pytest
from sqlalchemy import LargeBinary
from sqlalchemy import select
from sqlalchemy import text
from sqlalchemy.orm import Session
from ee.onyx.utils.encryption import _decrypt_bytes
from ee.onyx.utils.encryption import _encrypt_string
from ee.onyx.utils.encryption import _get_trimmed_key
from onyx.configs.constants import DocumentSource
from onyx.db.models import Credential
from onyx.db.models import EncryptedJson
from onyx.db.models import EncryptedString
from onyx.db.models import InternetSearchProvider
from onyx.db.rotate_encryption_key import _discover_encrypted_columns
from onyx.db.rotate_encryption_key import rotate_encryption_key
from onyx.utils.variable_functionality import fetch_versioned_implementation
from onyx.utils.variable_functionality import global_version
# Dotted module paths used to build unittest.mock.patch targets in this file.
EE_MODULE = "ee.onyx.utils.encryption"
ROTATE_MODULE = "onyx.db.rotate_encryption_key"
# Two distinct 16-character keys used as the "old" and "new" encryption keys.
OLD_KEY = "o" * 16
NEW_KEY = "n" * 16
@pytest.fixture(autouse=True)
def _enable_ee() -> Generator[None, None, None]:
    """Switch ``global_version`` to EE for each test, then restore the prior flag.

    ``fetch_versioned_implementation``'s cache is cleared on both sides of the
    test so lookups cached under the other setting are not reused.
    """
    was_ee = global_version._is_ee

    global_version.set_ee()
    fetch_versioned_implementation.cache_clear()

    yield

    global_version._is_ee = was_ee
    fetch_versioned_implementation.cache_clear()
@pytest.fixture(autouse=True)
def _clear_key_cache() -> None:
    """Clear the cache on ``_get_trimmed_key`` before every test (autouse)."""
    _get_trimmed_key.cache_clear()
def _raw_credential_bytes(db_session: Session, credential_id: int) -> bytes | None:
    """Return the stored ``credential_json`` bytes for one row, skipping the TypeDecorator.

    The column is selected through the Core table and cast to ``LargeBinary``.
    """
    credential_table = Credential.__table__
    raw_column = credential_table.c.credential_json.cast(LargeBinary)
    query = select(raw_column).where(credential_table.c.id == credential_id)
    return db_session.execute(query).scalar()
def _raw_isp_bytes(db_session: Session, isp_id: int) -> bytes | None:
    """Return the stored ``InternetSearchProvider.api_key`` bytes for one row.

    The column is selected through the Core table and cast to ``LargeBinary``.
    """
    isp_table = InternetSearchProvider.__table__
    raw_column = isp_table.c.api_key.cast(LargeBinary)
    query = select(raw_column).where(isp_table.c.id == isp_id)
    return db_session.execute(query).scalar()
class TestDiscoverEncryptedColumns:
    """Checks on what _discover_encrypted_columns reports for the real models."""

    def test_discovers_credential_json(self) -> None:
        """credential.credential_json is reported as a JSON column."""
        discovered = {
            (model_cls.__tablename__, col_name, is_json)  # type: ignore[attr-defined]
            for model_cls, col_name, _, is_json in _discover_encrypted_columns()
        }
        expected = ("credential", "credential_json", True)
        assert expected in discovered

    def test_discovers_internet_search_provider_api_key(self) -> None:
        """internet_search_provider.api_key is reported as a non-JSON column."""
        discovered = {
            (model_cls.__tablename__, col_name, is_json)  # type: ignore[attr-defined]
            for model_cls, col_name, _, is_json in _discover_encrypted_columns()
        }
        expected = ("internet_search_provider", "api_key", False)
        assert expected in discovered

    def test_all_encrypted_string_columns_are_not_json(self) -> None:
        """Every EncryptedString column must come back with is_json false."""
        for model_cls, col_name, _, is_json in _discover_encrypted_columns():
            column = getattr(model_cls, col_name).property.columns[0]
            if not isinstance(column.type, EncryptedString):
                continue
            assert not is_json, (
                f"{model_cls.__tablename__}.{col_name} is EncryptedString "  # type: ignore[attr-defined]
                f"but is_json={is_json}"
            )

    def test_all_encrypted_json_columns_are_json(self) -> None:
        """Every EncryptedJson column must come back with is_json true."""
        for model_cls, col_name, _, is_json in _discover_encrypted_columns():
            column = getattr(model_cls, col_name).property.columns[0]
            if not isinstance(column.type, EncryptedJson):
                continue
            assert is_json, (
                f"{model_cls.__tablename__}.{col_name} is EncryptedJson "  # type: ignore[attr-defined]
                f"but is_json={is_json}"
            )
class TestRotateCredential:
    """Rotation tests against the real Credential table (EncryptedJson).

    Column discovery is patched to return only the Credential entry so that
    other tables in the test database are not mutated.
    """

    @pytest.fixture(autouse=True)
    def _limit_discovery(self) -> Generator[None, None, None]:
        """Patch discovery to report only credential.credential_json."""
        only_credential = [(Credential, "credential_json", ["id"], True)]
        with patch(
            f"{ROTATE_MODULE}._discover_encrypted_columns",
            return_value=only_credential,
        ):
            yield

    @pytest.fixture()
    def credential_id(
        self, db_session: Session, tenant_context: None  # noqa: ARG002
    ) -> Generator[int, None, None]:
        """Insert a Credential row holding bytes encrypted with OLD_KEY; delete it afterwards."""
        payload = {"api_key": "sk-test-1234", "endpoint": "https://example.com"}
        ciphertext = _encrypt_string(json.dumps(payload), key=OLD_KEY)

        insert_stmt = text(
            "INSERT INTO credential "
            "(source, credential_json, admin_public, curator_public) "
            "VALUES (:source, :cred_json, true, false) "
            "RETURNING id"
        )
        params = {"source": DocumentSource.INGESTION_API.value, "cred_json": ciphertext}
        row_id = db_session.execute(insert_stmt, params).scalar_one()
        db_session.commit()

        yield row_id

        db_session.execute(
            text("DELETE FROM credential WHERE id = :id"), {"id": row_id}
        )
        db_session.commit()

    def test_rotates_credential_json(
        self, db_session: Session, credential_id: int
    ) -> None:
        """After rotation the stored bytes decrypt with NEW_KEY to the original payload."""
        with patch(f"{ROTATE_MODULE}.ENCRYPTION_KEY_SECRET", NEW_KEY), patch(
            f"{EE_MODULE}.ENCRYPTION_KEY_SECRET", NEW_KEY
        ):
            totals = rotate_encryption_key(db_session, old_key=OLD_KEY)

        assert totals.get("credential.credential_json", 0) >= 1

        stored = _raw_credential_bytes(db_session, credential_id)
        assert stored is not None
        payload = json.loads(_decrypt_bytes(stored, key=NEW_KEY))
        assert payload["api_key"] == "sk-test-1234"
        assert payload["endpoint"] == "https://example.com"

    def test_skips_already_rotated(
        self, db_session: Session, credential_id: int
    ) -> None:
        """Running rotation twice still leaves data readable with NEW_KEY."""
        with patch(f"{ROTATE_MODULE}.ENCRYPTION_KEY_SECRET", NEW_KEY), patch(
            f"{EE_MODULE}.ENCRYPTION_KEY_SECRET", NEW_KEY
        ):
            rotate_encryption_key(db_session, old_key=OLD_KEY)
            _ = rotate_encryption_key(db_session, old_key=OLD_KEY)

        stored = _raw_credential_bytes(db_session, credential_id)
        assert stored is not None
        payload = json.loads(_decrypt_bytes(stored, key=NEW_KEY))
        assert payload["api_key"] == "sk-test-1234"

    def test_dry_run_does_not_modify(
        self, db_session: Session, credential_id: int
    ) -> None:
        """A dry run reports the row but leaves the stored bytes untouched."""
        before = _raw_credential_bytes(db_session, credential_id)

        with patch(f"{ROTATE_MODULE}.ENCRYPTION_KEY_SECRET", NEW_KEY), patch(
            f"{EE_MODULE}.ENCRYPTION_KEY_SECRET", NEW_KEY
        ):
            totals = rotate_encryption_key(db_session, old_key=OLD_KEY, dry_run=True)

        assert totals.get("credential.credential_json", 0) >= 1

        after = _raw_credential_bytes(db_session, credential_id)
        assert after == before
class TestRotateInternetSearchProvider:
    """Rotation tests against the real InternetSearchProvider table (EncryptedString).

    Column discovery is patched to return only the InternetSearchProvider
    entry so that other tables in the test database are not mutated.
    """

    @pytest.fixture(autouse=True)
    def _limit_discovery(self) -> Generator[None, None, None]:
        """Patch discovery to report only internet_search_provider.api_key."""
        only_isp = [
            (InternetSearchProvider, "api_key", ["id"], False),
        ]
        with patch(
            f"{ROTATE_MODULE}._discover_encrypted_columns",
            return_value=only_isp,
        ):
            yield

    @pytest.fixture()
    def isp_id(
        self, db_session: Session, tenant_context: None  # noqa: ARG002
    ) -> Generator[int, None, None]:
        """Insert an InternetSearchProvider row encrypted with OLD_KEY; delete it afterwards."""
        ciphertext = _encrypt_string("sk-secret-api-key", key=OLD_KEY)
        insert_stmt = text(
            "INSERT INTO internet_search_provider "
            "(name, provider_type, api_key, is_active) "
            "VALUES (:name, :ptype, :api_key, false) "
            "RETURNING id"
        )
        params = {
            "name": f"test-rotation-{id(self)}",
            "ptype": "test",
            "api_key": ciphertext,
        }
        row_id = db_session.execute(insert_stmt, params).scalar_one()
        db_session.commit()

        yield row_id

        db_session.execute(
            text("DELETE FROM internet_search_provider WHERE id = :id"),
            {"id": row_id},
        )
        db_session.commit()

    def test_rotates_api_key(self, db_session: Session, isp_id: int) -> None:
        """After rotation the stored api_key decrypts with NEW_KEY."""
        with patch(f"{ROTATE_MODULE}.ENCRYPTION_KEY_SECRET", NEW_KEY), patch(
            f"{EE_MODULE}.ENCRYPTION_KEY_SECRET", NEW_KEY
        ):
            totals = rotate_encryption_key(db_session, old_key=OLD_KEY)

        assert totals.get("internet_search_provider.api_key", 0) >= 1

        stored = _raw_isp_bytes(db_session, isp_id)
        assert stored is not None
        assert _decrypt_bytes(stored, key=NEW_KEY) == "sk-secret-api-key"

    def test_rotates_from_unencrypted(
        self, db_session: Session, tenant_context: None  # noqa: ARG002
    ) -> None:
        """Rotation also handles a value that was stored with no encryption key."""
        insert_stmt = text(
            "INSERT INTO internet_search_provider "
            "(name, provider_type, api_key, is_active) "
            "VALUES (:name, :ptype, :api_key, false) "
            "RETURNING id"
        )
        params = {
            "name": f"test-raw-{id(self)}",
            "ptype": "test",
            "api_key": b"raw-api-key",
        }
        row_id = db_session.execute(insert_stmt, params).scalar_one()
        db_session.commit()

        try:
            with patch(f"{ROTATE_MODULE}.ENCRYPTION_KEY_SECRET", NEW_KEY), patch(
                f"{EE_MODULE}.ENCRYPTION_KEY_SECRET", NEW_KEY
            ):
                totals = rotate_encryption_key(db_session, old_key=None)

            assert totals.get("internet_search_provider.api_key", 0) >= 1

            stored = _raw_isp_bytes(db_session, row_id)
            assert stored is not None
            assert _decrypt_bytes(stored, key=NEW_KEY) == "raw-api-key"
        finally:
            # Always remove the row, even when an assertion above fails.
            db_session.execute(
                text("DELETE FROM internet_search_provider WHERE id = :id"),
                {"id": row_id},
            )
            db_session.commit()

View File

@@ -0,0 +1,398 @@
"""External dependency tests for the old DocumentIndex interface.
These tests assume Vespa and OpenSearch are running.
TODO(ENG-3764)(andrei): Consolidate some of these test fixtures.
"""
import os
import time
import uuid
from collections.abc import Generator
from unittest.mock import patch
import httpx
import pytest
from onyx.access.models import DocumentAccess
from onyx.configs.constants import DocumentSource
from onyx.connectors.models import Document
from onyx.context.search.models import IndexFilters
from onyx.db.enums import EmbeddingPrecision
from onyx.document_index.interfaces import DocumentIndex
from onyx.document_index.interfaces import IndexBatchParams
from onyx.document_index.interfaces import VespaChunkRequest
from onyx.document_index.interfaces import VespaDocumentUserFields
from onyx.document_index.opensearch.client import wait_for_opensearch_with_timeout
from onyx.document_index.opensearch.opensearch_document_index import (
OpenSearchOldDocumentIndex,
)
from onyx.document_index.vespa.index import VespaIndex
from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
from onyx.document_index.vespa.shared_utils.utils import wait_for_vespa_with_timeout
from onyx.indexing.models import ChunkEmbedding
from onyx.indexing.models import DocMetadataAwareIndexChunk
from shared_configs.configs import MULTI_TENANT
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
from shared_configs.contextvars import get_current_tenant_id
from tests.external_dependency_unit.constants import TEST_TENANT_ID
@pytest.fixture(scope="module")
def opensearch_available() -> Generator[None, None, None]:
    """Fail the test when the OpenSearch readiness wait does not succeed."""
    is_up = wait_for_opensearch_with_timeout()
    if not is_up:
        pytest.fail("OpenSearch is not available.")

    yield  # Test runs here.
@pytest.fixture(scope="module")
def test_index_name() -> Generator[str, None, None]:
    """Yield an index name of the form ``test_index_<8 hex chars>`` from a fresh UUID."""
    suffix = uuid.uuid4().hex[:8]
    index_name = f"test_index_{suffix}"
    yield index_name  # Test runs here.
@pytest.fixture(scope="module")
def tenant_context() -> Generator[None, None, None]:
    """Set the current-tenant context variable to TEST_TENANT_ID for the module."""
    reset_token = CURRENT_TENANT_ID_CONTEXTVAR.set(TEST_TENANT_ID)
    try:
        yield  # Test runs here.
    finally:
        # Put the context variable back to whatever it held before.
        CURRENT_TENANT_ID_CONTEXTVAR.reset(reset_token)
@pytest.fixture(scope="module")
def httpx_client() -> Generator[httpx.Client, None, None]:
    """Yield the client from ``get_vespa_http_client`` and close it on teardown."""
    vespa_client = get_vespa_http_client()
    try:
        yield vespa_client
    finally:
        vespa_client.close()
@pytest.fixture(scope="module")
def vespa_document_index(
    httpx_client: httpx.Client,
    tenant_context: None,  # noqa: ARG001
    test_index_name: str,
) -> Generator[VespaIndex, None, None]:
    """Create a VespaIndex for the test index and wait until it accepts writes.

    Fails the test if Vespa does not come up within 90 seconds, or if a probe
    document cannot be written within 60 attempts spaced one second apart.
    """
    index = VespaIndex(
        index_name=test_index_name,
        secondary_index_name=None,
        large_chunks_enabled=False,
        secondary_large_chunks_enabled=None,
        multitenant=MULTI_TENANT,
        httpx_client=httpx_client,
    )

    # Make os.getcwd() report the directory three levels above this file
    # while the indices are being created.
    this_dir = os.path.dirname(__file__)
    backend_dir = os.path.abspath(os.path.join(this_dir, "..", "..", ".."))
    with patch("os.getcwd", return_value=backend_dir):
        index.ensure_indices_exist(
            primary_embedding_dim=128,
            primary_embedding_precision=EmbeddingPrecision.FLOAT,
            secondary_index_embedding_dim=None,
            secondary_index_embedding_precision=None,
        )

    # Verify Vespa is running, fails the test if not. Try 90 seconds for testing
    # in CI. We have to do this here because this endpoint only becomes live
    # once we create an index.
    if not wait_for_vespa_with_timeout(wait_limit=90):
        pytest.fail("Vespa is not available.")

    # Wait until the schema is actually ready for writes on content nodes by
    # attempting to write a probe document: 200 means the schema is live,
    # anything else means not yet. This is a workaround because the test is
    # really flaky otherwise; it is only temporary until we entirely move
    # off of Vespa.
    probe_doc = {
        "fields": {
            "document_id": "__probe__",
            "chunk_id": 0,
            "blurb": "",
            "title": "",
            "skip_title": True,
            "content": "",
            "content_summary": "",
            "source_type": "file",
            "source_links": "null",
            "semantic_identifier": "",
            "section_continuation": False,
            "large_chunk_reference_ids": [],
            "metadata": "{}",
            "metadata_list": [],
            "metadata_suffix": "",
            "chunk_context": "",
            "doc_summary": "",
            "embeddings": {"full_chunk": [1.0] + [0.0] * 127},
            "access_control_list": {},
            "document_sets": {},
            "image_file_name": None,
            "user_project": [],
            "personas": [],
            "boost": 0.0,
            "aggregated_chunk_boost_factor": 0.0,
            "primary_owners": [],
            "secondary_owners": [],
        }
    }
    probe_url = (
        f"http://localhost:8081/document/v1/default/{test_index_name}/docid/__probe__"
    )

    for _ in range(60):
        probe_response = httpx_client.post(probe_url, json=probe_doc)
        if probe_response.status_code == 200:
            # Clean up the probe document.
            httpx_client.delete(probe_url)
            break
        time.sleep(1)
    else:
        # The loop ran out of attempts without ever seeing a 200.
        pytest.fail(f"Vespa schema '{test_index_name}' did not become ready in time.")

    yield index  # Test runs here.

    # TODO(ENG-3765)(andrei): Explicitly cleanup index. Not immediately
    # pressing; in CI we should be using fresh instances of dependencies each
    # time anyway.
@pytest.fixture(scope="module")
def opensearch_document_index(
    opensearch_available: None,  # noqa: ARG001
    tenant_context: None,  # noqa: ARG001
    test_index_name: str,
) -> Generator[OpenSearchOldDocumentIndex, None, None]:
    """Yield an OpenSearch-backed document index for ``test_index_name``.

    The index is constructed with a 128-dimensional float embedding, no
    secondary index and large chunks disabled, and ``ensure_indices_exist`` is
    called on it before it is handed to the tests.
    """
    # The same embedding settings are used both to construct the index object
    # and to create the underlying index.
    embedding_dim = 128
    embedding_precision = EmbeddingPrecision.FLOAT

    index = OpenSearchOldDocumentIndex(
        index_name=test_index_name,
        embedding_dim=embedding_dim,
        embedding_precision=embedding_precision,
        secondary_index_name=None,
        secondary_embedding_dim=None,
        secondary_embedding_precision=None,
        large_chunks_enabled=False,
        secondary_large_chunks_enabled=None,
        multitenant=MULTI_TENANT,
    )
    index.ensure_indices_exist(
        primary_embedding_dim=embedding_dim,
        primary_embedding_precision=embedding_precision,
        secondary_index_embedding_dim=None,
        secondary_index_embedding_precision=None,
    )

    yield index  # Test runs here.

    # TODO(ENG-3765)(andrei): Explicitly cleanup index. Not immediately
    # pressing; in CI we should be using fresh instances of dependencies each
    # time anyway.
@pytest.fixture(scope="module")
def document_indices(
    vespa_document_index: VespaIndex,
    opensearch_document_index: OpenSearchOldDocumentIndex,
) -> Generator[list[DocumentIndex], None, None]:
    """Yield every document index implementation under test.

    The list holds the OpenSearch index first, then the Vespa index.
    """
    # Ideally these are parametrized; doing so with pytest fixtures is tricky.
    indices_under_test: list[DocumentIndex] = [
        opensearch_document_index,
        vespa_document_index,
    ]
    yield indices_under_test  # Test runs here.
@pytest.fixture(scope="function")
def chunks(
    tenant_context: None,  # noqa: ARG001
) -> Generator[list[DocMetadataAwareIndexChunk], None, None]:
    """Yield five chunks (chunk ids 0-4) that all belong to document "test_doc".

    Every chunk shares the same public access object, source document,
    embeddings and (initially empty) document-set / user-project / persona
    collections; only ``chunk_id`` differs between them.
    """
    chunk_total = 5
    current_tenant = get_current_tenant_id()
    public_access = DocumentAccess.build(
        user_emails=[],
        user_groups=[],
        external_user_emails=[],
        external_user_group_ids=[],
        is_public=True,
    )
    no_document_sets: set[str] = set()
    no_user_projects: list[int] = []
    no_personas: list[int] = []
    title_vector = [1.0] + [0] * 127
    # Full 0 vectors are not supported for cos similarity.
    chunk_vectors = ChunkEmbedding(
        full_embedding=[1.0] + [0] * 127, mini_chunk_embeddings=[]
    )
    parent_document = Document(
        id="test_doc",
        semantic_identifier="semantic identifier",
        source=DocumentSource.FILE,
        sections=[],
        metadata={},
        title="title",
    )
    links: dict[int, str] = {0: ""}
    no_ancestors: list[int] = []

    yield [
        DocMetadataAwareIndexChunk(
            tenant_id=current_tenant,
            access=public_access,
            document_sets=no_document_sets,
            user_project=no_user_projects,
            personas=no_personas,
            boost=0,
            aggregated_chunk_boost_factor=0,
            ancestor_hierarchy_node_ids=no_ancestors,
            embeddings=chunk_vectors,
            title_embedding=title_vector,
            source_document=parent_document,
            title_prefix="",
            metadata_suffix_keyword="",
            metadata_suffix_semantic="",
            contextual_rag_reserved_tokens=0,
            doc_summary="",
            chunk_context="",
            mini_chunk_texts=None,
            large_chunk_id=None,
            chunk_id=chunk_index,
            blurb="blurb",
            content="content",
            source_links=links,
            image_file_id=None,
            section_continuation=False,
        )
        for chunk_index in range(chunk_total)
    ]  # Test runs here.
@pytest.fixture(scope="function")
def index_batch_params(
    tenant_context: None,  # noqa: ARG001
) -> Generator[IndexBatchParams, None, None]:
    """Yield batch params describing a first-time index of "test_doc" (5 chunks)."""
    # WARNING: the previous chunk count for "test_doc" is hardcoded to 0, which
    # is only correct on the very first index call. The document_indices
    # fixture is scope="module", so the same OpenSearch and Vespa backends
    # persist across all test functions in this module. If a second test
    # function uses this fixture and calls document_index.index(...), the
    # backend already holds 5 chunks for "test_doc" from the previous test,
    # yet these params still claim that no prior chunks exist. This can lead
    # to orphaned/duplicate chunks that make subsequent assertions incorrect.
    # TODO: Whenever adding a second test, either change this or cleanup the
    # index between test cases.
    params = IndexBatchParams(
        doc_id_to_previous_chunk_cnt={"test_doc": 0},
        doc_id_to_new_chunk_cnt={"test_doc": 5},
        tenant_id=get_current_tenant_id(),
        large_chunks_enabled=False,
    )
    yield params
class TestDocumentIndexOld:
    """Tests the old DocumentIndex interface."""

    def test_update_single_can_clear_user_projects_and_personas(
        self,
        document_indices: list[DocumentIndex],
        # This test case assumes all these chunks correspond to one document.
        chunks: list[DocMetadataAwareIndexChunk],
        index_batch_params: IndexBatchParams,
    ) -> None:
        """
        Tests that update_single can clear user_projects and personas.

        For each index: chunks tagged with project 1 / persona 2 are indexed
        and confirmed retrievable through the project/persona filters. After
        update_single is called with empty user fields, the chunks must still
        be retrievable without those filters but must no longer match them.
        """
        for index_under_test in document_indices:
            # Precondition.
            # Give every chunk a non-empty user project and persona so that
            # there is something for update_single to clear.
            for chunk in chunks:
                chunk.user_project = [1]
                chunk.personas = [2]
            index_under_test.index(chunks, index_batch_params)

            doc_id = chunks[0].source_document.id
            chunk_count = len(chunks)
            tenant_id = get_current_tenant_id()
            expected_chunk_ids = list(range(chunk_count))

            # The chunk index range here, together with batch_retrieval=True
            # below, triggers the codepath for Vespa's search API, which uses
            # the expected additive filtering for project_id and persona_id.
            # Otherwise we would use the codepath for the visit API, which
            # does not have this kind of filtering implemented.
            whole_document_request = VespaChunkRequest(
                document_id=doc_id, min_chunk_ind=0, max_chunk_ind=chunk_count - 1
            )
            scoped_filters = IndexFilters(
                access_control_list=None,
                tenant_id=tenant_id,
                project_id=1,
                persona_id=2,
                # We need this even though none of the chunks belong to a
                # document set because project_id and persona_id are only
                # additive filters in the event the agent has knowledge scope;
                # if the agent does not, it is implied that it can see
                # everything it is allowed to.
                document_set=["1"],
            )

            # Not best practice here but the API for refreshing the index to
            # ensure that the latest data is present is not exposed in this
            # class and is not the same for Vespa and OpenSearch, so we just
            # tolerate a sleep for now. As a consequence the number of tests in
            # this suite should be small. We only need to tolerate this for as
            # long as we continue to use Vespa, we can consider exposing
            # something for OpenSearch later.
            time.sleep(1)
            scoped_hits = index_under_test.id_based_retrieval(
                chunk_requests=[whole_document_request],
                filters=scoped_filters,
                batch_retrieval=True,
            )
            assert len(scoped_hits) == chunk_count
            # Sort by chunk id to easily test if we have all chunks.
            scoped_hits_in_order = sorted(scoped_hits, key=lambda hit: hit.chunk_id)
            assert [hit.chunk_id for hit in scoped_hits_in_order] == expected_chunk_ids
            assert all(hit.document_id == doc_id for hit in scoped_hits_in_order)

            # Under test.
            # Explicitly set empty fields here.
            index_under_test.update_single(
                doc_id=doc_id,
                chunk_count=chunk_count,
                tenant_id=tenant_id,
                fields=None,
                user_fields=VespaDocumentUserFields(user_projects=[], personas=[]),
            )

            # Postcondition.
            unscoped_filters = IndexFilters(
                access_control_list=None, tenant_id=tenant_id
            )
            # We should expect to get back all expected chunks with no filters.
            # Again, not best practice here.
            time.sleep(1)
            unscoped_hits = index_under_test.id_based_retrieval(
                chunk_requests=[whole_document_request],
                filters=unscoped_filters,
                batch_retrieval=True,
            )
            assert len(unscoped_hits) == chunk_count
            # Sort by chunk id to easily test if we have all chunks.
            unscoped_hits_in_order = sorted(
                unscoped_hits, key=lambda hit: hit.chunk_id
            )
            assert [
                hit.chunk_id for hit in unscoped_hits_in_order
            ] == expected_chunk_ids
            assert all(hit.document_id == doc_id for hit in unscoped_hits_in_order)

            # Now, we should expect to not get any chunks if we specify the
            # user project and personas filters.
            hits_after_clear = index_under_test.id_based_retrieval(
                chunk_requests=[whole_document_request],
                filters=scoped_filters,
                batch_retrieval=True,
            )
            assert len(hits_after_clear) == 0

View File

@@ -85,7 +85,7 @@ def test_group_overlap_filter(
results = _get_accessible_hierarchy_nodes_for_source(
db_session,
source=DocumentSource.GOOGLE_DRIVE,
user_email=None,
user_email="",
external_group_ids=["group_engineering"],
)
result_ids = {n.raw_node_id for n in results}
@@ -124,7 +124,7 @@ def test_no_credentials_returns_only_public(
results = _get_accessible_hierarchy_nodes_for_source(
db_session,
source=DocumentSource.GOOGLE_DRIVE,
user_email=None,
user_email="",
external_group_ids=[],
)
result_ids = {n.raw_node_id for n in results}

View File

@@ -17,6 +17,9 @@ from unittest.mock import patch
import pytest
from sqlalchemy.orm import Session
from onyx.background.celery.tasks.opensearch_migration.constants import (
GET_VESPA_CHUNKS_SLICE_COUNT,
)
from onyx.background.celery.tasks.opensearch_migration.tasks import (
is_continuation_token_done_for_all_slices,
)
@@ -236,6 +239,8 @@ def full_deployment_setup() -> Generator[None, None, None]:
NOTE: We deliberately duplicate this logic from
backend/tests/external_dependency_unit/conftest.py because we need to set
opensearch_available just for this module, not the entire test session.
TODO(ENG-3764)(andrei): Consolidate some of these test fixtures.
"""
# Patch ENABLE_OPENSEARCH_INDEXING_FOR_ONYX just for this test because we
# don't yet want that enabled for all tests.
@@ -320,9 +325,15 @@ def test_embedding_dimension(db_session: Session) -> Generator[int, None, None]:
@pytest.fixture(scope="function")
def patch_get_vespa_chunks_page_size() -> Generator[int, None, None]:
test_page_size = 5
with patch(
"onyx.background.celery.tasks.opensearch_migration.tasks.GET_VESPA_CHUNKS_PAGE_SIZE",
test_page_size,
with (
patch(
"onyx.background.celery.tasks.opensearch_migration.tasks.GET_VESPA_CHUNKS_PAGE_SIZE",
test_page_size,
),
patch(
"onyx.background.celery.tasks.opensearch_migration.constants.GET_VESPA_CHUNKS_PAGE_SIZE",
test_page_size,
),
):
yield test_page_size # Test runs here.
@@ -582,6 +593,175 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
document_chunks[document.id][opensearch_chunk.chunk_index],
)
def test_chunk_migration_visits_all_chunks_even_when_batch_size_varies(
    self,
    db_session: Session,
    test_documents: list[Document],
    vespa_document_index: VespaDocumentIndex,
    opensearch_client: OpenSearchIndexClient,
    test_embedding_dimension: int,
    clean_migration_tables: None,  # noqa: ARG002
    enable_opensearch_indexing_for_onyx: None,  # noqa: ARG002
) -> None:
    """
    Tests that chunk migration works correctly even when the batch size
    changes halfway through a migration.

    Simulates task time running out by mocking the locking behavior.
    """
    page_size_in_tasks = "onyx.background.celery.tasks.opensearch_migration.tasks.GET_VESPA_CHUNKS_PAGE_SIZE"
    page_size_in_constants = "onyx.background.celery.tasks.opensearch_migration.constants.GET_VESPA_CHUNKS_PAGE_SIZE"
    redis_client_getter = "onyx.background.celery.tasks.opensearch_migration.tasks.get_redis_client"

    def make_redis_client_with_expiring_lock() -> Mock:
        # To simulate partial progress the lock returns True for the first
        # invocation of .owned() and False subsequently.
        lock = Mock()
        lock.owned.side_effect = [True, False, False]
        lock.acquire.return_value = True
        redis_client = Mock()
        redis_client.lock.return_value = lock
        return redis_client

    def reload_tenant_record() -> OpenSearchTenantMigrationRecord:
        # Expire the session cache to see the committed changes from the task.
        db_session.expire_all()
        record = db_session.query(OpenSearchTenantMigrationRecord).first()
        assert record is not None
        return record

    def assert_migration_still_in_progress(
        record: OpenSearchTenantMigrationRecord,
    ) -> None:
        assert record.vespa_visit_continuation_token is not None
        # Slices are not necessarily evenly distributed across all document
        # chunks so we can't test that every token is non-None, but certainly
        # at least one must be.
        assert any(json.loads(record.vespa_visit_continuation_token).values())
        assert record.migration_completed_at is None
        assert record.approx_chunk_count_in_vespa is not None

    # Precondition.
    # Index chunks into Vespa.
    document_chunks: dict[str, list[dict[str, Any]]] = {
        document.id: [
            _create_raw_document_chunk(
                document_id=document.id,
                chunk_index=i,
                content=f"Test content {i} for {document.id}",
                embedding=_generate_test_vector(test_embedding_dimension),
                now=datetime.now(),
                title=f"Test title {document.id}",
                title_embedding=_generate_test_vector(test_embedding_dimension),
            )
            for i in range(CHUNK_COUNT)
        ]
        for document in test_documents
    }
    all_chunks: list[dict[str, Any]] = [
        raw_chunk
        for chunks_of_document in document_chunks.values()
        for raw_chunk in chunks_of_document
    ]
    vespa_document_index.index_raw_chunks(all_chunks)

    # Run the initial batch with a lock that is lost after the first check.
    # NOTE: The batch size is currently set to 5 in
    # patch_get_vespa_chunks_page_size.
    with patch(
        redis_client_getter,
        return_value=make_redis_client_with_expiring_lock(),
    ):
        result_1 = migrate_chunks_from_vespa_to_opensearch_task(
            tenant_id=get_current_tenant_id()
        )
    assert result_1 is True

    # Verify partial progress was saved.
    tenant_record = reload_tenant_record()
    partial_chunks_migrated = tenant_record.total_chunks_migrated
    assert partial_chunks_migrated > 0
    # page_size applies per slice, so one iteration can fetch up to
    # page_size * GET_VESPA_CHUNKS_SLICE_COUNT chunks total.
    assert partial_chunks_migrated <= 5 * GET_VESPA_CHUNKS_SLICE_COUNT
    assert_migration_still_in_progress(tenant_record)

    # Under test.
    # Now patch the batch size to be some other number, like 2.
    with (
        patch(page_size_in_tasks, 2),
        patch(page_size_in_constants, 2),
        patch(
            redis_client_getter,
            return_value=make_redis_client_with_expiring_lock(),
        ),
    ):
        result_2 = migrate_chunks_from_vespa_to_opensearch_task(
            tenant_id=get_current_tenant_id()
        )

    # Postcondition.
    assert result_2 is True

    # Verify next partial progress was saved.
    tenant_record = reload_tenant_record()
    new_partial_chunks_migrated = tenant_record.total_chunks_migrated
    assert new_partial_chunks_migrated > partial_chunks_migrated
    # page_size applies per slice, so one iteration can fetch up to
    # page_size * GET_VESPA_CHUNKS_SLICE_COUNT chunks total.
    assert new_partial_chunks_migrated <= (5 + 2) * GET_VESPA_CHUNKS_SLICE_COUNT
    assert_migration_still_in_progress(tenant_record)

    # Under test.
    # Run the remainder of the migration (no lock mocking this time).
    with (
        patch(page_size_in_tasks, 2),
        patch(page_size_in_constants, 2),
    ):
        result_3 = migrate_chunks_from_vespa_to_opensearch_task(
            tenant_id=get_current_tenant_id()
        )

    # Postcondition.
    assert result_3 is True

    # Verify completion.
    tenant_record = reload_tenant_record()
    assert tenant_record.total_chunks_migrated > new_partial_chunks_migrated
    assert tenant_record.total_chunks_migrated == len(all_chunks)
    # Visit is complete: a continuation token is still stored, but it must
    # report every slice as done.
    assert tenant_record.vespa_visit_continuation_token is not None
    assert is_continuation_token_done_for_all_slices(
        json.loads(tenant_record.vespa_visit_continuation_token)
    )
    assert tenant_record.migration_completed_at is not None
    assert tenant_record.approx_chunk_count_in_vespa == len(all_chunks)

    # Verify chunks were indexed in OpenSearch.
    for document in test_documents:
        opensearch_chunks = _get_document_chunks_from_opensearch(
            opensearch_client, document.id, get_current_tenant_id()
        )
        assert len(opensearch_chunks) == CHUNK_COUNT
        opensearch_chunks.sort(key=lambda x: x.chunk_index)
        for opensearch_chunk in opensearch_chunks:
            _assert_chunk_matches_vespa_chunk(
                opensearch_chunk,
                document_chunks[document.id][opensearch_chunk.chunk_index],
            )
def test_chunk_migration_empty_vespa(
self,
db_session: Session,

View File

@@ -0,0 +1,85 @@
"""Tests that SlackBot CRUD operations return properly typed SensitiveValue fields.
Regression test for the bug where insert_slack_bot/update_slack_bot returned
objects with raw string tokens instead of SensitiveValue wrappers, causing
'str object has no attribute get_value' errors in SlackBot.from_model().
"""
from uuid import uuid4
from sqlalchemy.orm import Session
from onyx.db.slack_bot import insert_slack_bot
from onyx.db.slack_bot import update_slack_bot
from onyx.server.manage.models import SlackBot
from onyx.utils.sensitive import SensitiveValue
def _unique(prefix: str) -> str:
    """Return ``prefix`` plus a dash and the first 8 hex digits of a fresh UUID4."""
    random_suffix = uuid4().hex[:8]
    return f"{prefix}-{random_suffix}"
def test_insert_slack_bot_returns_sensitive_values(db_session: Session) -> None:
    """insert_slack_bot must return tokens wrapped in SensitiveValue.

    Checks that each token on the returned object is a SensitiveValue whose
    unmasked value equals what was passed in, and that SlackBot.from_model
    accepts the object and exposes all three tokens.
    """
    bot_token = _unique("xoxb-insert")
    app_token = _unique("xapp-insert")
    user_token = _unique("xoxp-insert")

    slack_bot = insert_slack_bot(
        db_session=db_session,
        name=_unique("test-bot-insert"),
        enabled=True,
        bot_token=bot_token,
        app_token=app_token,
        user_token=user_token,
    )

    assert isinstance(slack_bot.bot_token, SensitiveValue)
    assert isinstance(slack_bot.app_token, SensitiveValue)
    assert isinstance(slack_bot.user_token, SensitiveValue)

    assert slack_bot.bot_token.get_value(apply_mask=False) == bot_token
    assert slack_bot.app_token.get_value(apply_mask=False) == app_token
    assert slack_bot.user_token.get_value(apply_mask=False) == user_token

    # Verify from_model works without error
    pydantic_bot = SlackBot.from_model(slack_bot)
    assert pydantic_bot.bot_token  # masked, but not empty
    assert pydantic_bot.app_token
    # A user token was supplied on insert, so it must survive the conversion;
    # this mirrors the check made in the update test.
    assert pydantic_bot.user_token is not None
def test_update_slack_bot_returns_sensitive_values(db_session: Session) -> None:
    """update_slack_bot must return tokens wrapped in SensitiveValue.

    A bot is inserted without a user token, then updated with three new
    tokens; every token on the returned object must be a SensitiveValue
    holding the new value, and SlackBot.from_model must accept the object.
    """
    existing_bot = insert_slack_bot(
        db_session=db_session,
        name=_unique("test-bot-update"),
        enabled=True,
        bot_token=_unique("xoxb-update"),
        app_token=_unique("xapp-update"),
    )

    replacement_tokens = {
        "bot_token": _unique("xoxb-update-new"),
        "app_token": _unique("xapp-update-new"),
        "user_token": _unique("xoxp-update-new"),
    }

    updated = update_slack_bot(
        db_session=db_session,
        slack_bot_id=existing_bot.id,
        name=_unique("test-bot-updated"),
        enabled=False,
        bot_token=replacement_tokens["bot_token"],
        app_token=replacement_tokens["app_token"],
        user_token=replacement_tokens["user_token"],
    )

    # Every token must come back wrapped...
    for field_name in replacement_tokens:
        assert isinstance(getattr(updated, field_name), SensitiveValue)
    # ...and unwrap to exactly the value that was written.
    for field_name, expected_plaintext in replacement_tokens.items():
        wrapped = getattr(updated, field_name)
        assert wrapped.get_value(apply_mask=False) == expected_plaintext

    # Verify from_model works without error
    api_model = SlackBot.from_model(updated)
    assert api_model.bot_token
    assert api_model.app_token
    assert api_model.user_token is not None

View File

@@ -148,8 +148,16 @@ class TestOAuthConfigCRUD:
)
# Secrets should be preserved
assert updated_config.client_id == original_client_id
assert updated_config.client_secret == original_client_secret
assert updated_config.client_id is not None
assert original_client_id is not None
assert updated_config.client_id.get_value(
apply_mask=False
) == original_client_id.get_value(apply_mask=False)
assert updated_config.client_secret is not None
assert original_client_secret is not None
assert updated_config.client_secret.get_value(
apply_mask=False
) == original_client_secret.get_value(apply_mask=False)
# But name should be updated
assert updated_config.name == new_name
@@ -173,9 +181,14 @@ class TestOAuthConfigCRUD:
)
# client_id should be cleared (empty string)
assert updated_config.client_id == ""
assert updated_config.client_id is not None
assert updated_config.client_id.get_value(apply_mask=False) == ""
# client_secret should be preserved
assert updated_config.client_secret == original_client_secret
assert updated_config.client_secret is not None
assert original_client_secret is not None
assert updated_config.client_secret.get_value(
apply_mask=False
) == original_client_secret.get_value(apply_mask=False)
def test_update_oauth_config_clear_client_secret(self, db_session: Session) -> None:
"""Test clearing client_secret while preserving client_id"""
@@ -190,9 +203,14 @@ class TestOAuthConfigCRUD:
)
# client_secret should be cleared (empty string)
assert updated_config.client_secret == ""
assert updated_config.client_secret is not None
assert updated_config.client_secret.get_value(apply_mask=False) == ""
# client_id should be preserved
assert updated_config.client_id == original_client_id
assert updated_config.client_id is not None
assert original_client_id is not None
assert updated_config.client_id.get_value(
apply_mask=False
) == original_client_id.get_value(apply_mask=False)
def test_update_oauth_config_clear_both_secrets(self, db_session: Session) -> None:
"""Test clearing both client_id and client_secret"""
@@ -207,8 +225,10 @@ class TestOAuthConfigCRUD:
)
# Both should be cleared (empty strings)
assert updated_config.client_id == ""
assert updated_config.client_secret == ""
assert updated_config.client_id is not None
assert updated_config.client_id.get_value(apply_mask=False) == ""
assert updated_config.client_secret is not None
assert updated_config.client_secret.get_value(apply_mask=False) == ""
def test_update_oauth_config_authorization_url(self, db_session: Session) -> None:
"""Test updating authorization_url"""
@@ -275,7 +295,8 @@ class TestOAuthConfigCRUD:
assert updated_config.token_url == new_token_url
assert updated_config.scopes == new_scopes
assert updated_config.additional_params == new_params
assert updated_config.client_id == new_client_id
assert updated_config.client_id is not None
assert updated_config.client_id.get_value(apply_mask=False) == new_client_id
def test_delete_oauth_config(self, db_session: Session) -> None:
"""Test deleting an OAuth configuration"""
@@ -416,7 +437,8 @@ class TestOAuthUserTokenCRUD:
assert user_token.id is not None
assert user_token.oauth_config_id == oauth_config.id
assert user_token.user_id == user.id
assert user_token.token_data == token_data
assert user_token.token_data is not None
assert user_token.token_data.get_value(apply_mask=False) == token_data
assert user_token.created_at is not None
assert user_token.updated_at is not None
@@ -446,8 +468,13 @@ class TestOAuthUserTokenCRUD:
# Should be the same token record (updated, not inserted)
assert updated_token.id == initial_token_id
assert updated_token.token_data == updated_token_data
assert updated_token.token_data != initial_token_data
assert updated_token.token_data is not None
assert (
updated_token.token_data.get_value(apply_mask=False) == updated_token_data
)
assert (
updated_token.token_data.get_value(apply_mask=False) != initial_token_data
)
def test_get_user_oauth_token(self, db_session: Session) -> None:
"""Test retrieving a user's OAuth token"""
@@ -463,7 +490,8 @@ class TestOAuthUserTokenCRUD:
assert retrieved_token is not None
assert retrieved_token.id == created_token.id
assert retrieved_token.token_data == token_data
assert retrieved_token.token_data is not None
assert retrieved_token.token_data.get_value(apply_mask=False) == token_data
def test_get_user_oauth_token_not_found(self, db_session: Session) -> None:
"""Test retrieving a non-existent user token returns None"""
@@ -519,7 +547,8 @@ class TestOAuthUserTokenCRUD:
retrieved_token = get_user_oauth_token(oauth_config.id, user.id, db_session)
assert retrieved_token is not None
assert retrieved_token.id == updated_token.id
assert retrieved_token.token_data == token_data2
assert retrieved_token.token_data is not None
assert retrieved_token.token_data.get_value(apply_mask=False) == token_data2
def test_cascade_delete_user_tokens_on_config_deletion(
self, db_session: Session

View File

@@ -374,8 +374,14 @@ class TestOAuthTokenManagerCodeExchange:
assert call_args[0][0] == oauth_config.token_url
assert call_args[1]["data"]["grant_type"] == "authorization_code"
assert call_args[1]["data"]["code"] == "auth_code_123"
assert call_args[1]["data"]["client_id"] == oauth_config.client_id
assert call_args[1]["data"]["client_secret"] == oauth_config.client_secret
assert oauth_config.client_id is not None
assert oauth_config.client_secret is not None
assert call_args[1]["data"]["client_id"] == oauth_config.client_id.get_value(
apply_mask=False
)
assert call_args[1]["data"][
"client_secret"
] == oauth_config.client_secret.get_value(apply_mask=False)
assert call_args[1]["data"]["redirect_uri"] == "https://example.com/callback"
@patch("onyx.auth.oauth_token_manager.requests.post")

View File

@@ -950,6 +950,7 @@ from onyx.server.query_and_chat.streaming_models import Packet
from onyx.server.query_and_chat.streaming_models import PythonToolDelta
from onyx.server.query_and_chat.streaming_models import PythonToolStart
from onyx.server.query_and_chat.streaming_models import SectionEnd
from onyx.server.query_and_chat.streaming_models import ToolCallArgumentDelta
from onyx.tools.tool_implementations.python.python_tool import PythonTool
from tests.external_dependency_unit.answer.stream_test_builder import StreamTestBuilder
from tests.external_dependency_unit.answer.stream_test_utils import create_chat_session
@@ -1294,9 +1295,18 @@ def test_code_interpreter_replay_packets_include_code_and_output(
).expect(
Packet(
placement=create_placement(0),
obj=PythonToolStart(code=code),
obj=ToolCallArgumentDelta(
tool_type="python",
argument_deltas={"code": code},
),
),
forward=2,
).expect(
Packet(
placement=create_placement(0),
obj=PythonToolStart(code=code),
),
forward=False,
).expect(
Packet(
placement=create_placement(0),

View File

@@ -64,7 +64,8 @@ class TestBotConfigAPI:
db_session.commit()
assert config is not None
assert config.bot_token == "test_token_123"
assert config.bot_token is not None
assert config.bot_token.get_value(apply_mask=False) == "test_token_123"
# Cleanup
delete_discord_bot_config(db_session)

View File

@@ -0,0 +1,165 @@
"""Tests for EE AES-CBC encryption/decryption with explicit key support.
With EE mode enabled (via conftest), fetch_versioned_implementation resolves
to the EE implementations, so no patching of the MIT layer is needed.
"""
from unittest.mock import patch
import pytest
from ee.onyx.utils.encryption import _decrypt_bytes
from ee.onyx.utils.encryption import _encrypt_string
from ee.onyx.utils.encryption import _get_trimmed_key
from ee.onyx.utils.encryption import decrypt_bytes_to_string
from ee.onyx.utils.encryption import encrypt_string_to_bytes
EE_MODULE = "ee.onyx.utils.encryption"
# Keys must be exactly 16, 24, or 32 bytes for AES
KEY_16 = "a" * 16
KEY_16_ALT = "b" * 16
KEY_24 = "d" * 24
KEY_32 = "c" * 32
@pytest.fixture(autouse=True)
def _clear_key_cache() -> None:
    """Clear the ``_get_trimmed_key`` cache before every test in this module."""
    _get_trimmed_key.cache_clear()
class TestEncryptDecryptRoundTrip:
    """Round-trip checks: whatever _encrypt_string produces, _decrypt_bytes reads back."""

    def test_roundtrip_with_env_key(self) -> None:
        plaintext = "hello world"
        with patch(f"{EE_MODULE}.ENCRYPTION_KEY_SECRET", KEY_16):
            ciphertext = _encrypt_string(plaintext)
            assert ciphertext != b"hello world"
            assert _decrypt_bytes(ciphertext) == plaintext

    def test_roundtrip_with_explicit_key(self) -> None:
        plaintext = "secret data"
        ciphertext = _encrypt_string(plaintext, key=KEY_32)
        assert ciphertext != b"secret data"
        assert _decrypt_bytes(ciphertext, key=KEY_32) == plaintext

    def test_roundtrip_no_key(self) -> None:
        """Without any key, data is raw-encoded (no encryption)."""
        plaintext = "plain text"
        with patch(f"{EE_MODULE}.ENCRYPTION_KEY_SECRET", ""):
            stored = _encrypt_string(plaintext)
            assert stored == b"plain text"
            assert _decrypt_bytes(stored) == plaintext

    def test_explicit_key_overrides_env(self) -> None:
        with patch(f"{EE_MODULE}.ENCRYPTION_KEY_SECRET", KEY_16):
            ciphertext = _encrypt_string("data", key=KEY_16_ALT)
            # The env key must not be able to read data written with the
            # explicitly supplied key.
            with pytest.raises(ValueError):
                _decrypt_bytes(ciphertext, key=KEY_16)
            assert _decrypt_bytes(ciphertext, key=KEY_16_ALT) == "data"

    def test_different_encryptions_produce_different_bytes(self) -> None:
        """Each encryption uses a random IV, so results differ."""
        first_ciphertext = _encrypt_string("same", key=KEY_16)
        second_ciphertext = _encrypt_string("same", key=KEY_16)
        assert first_ciphertext != second_ciphertext

    def test_roundtrip_empty_string(self) -> None:
        ciphertext = _encrypt_string("", key=KEY_16)
        assert ciphertext != b""
        assert _decrypt_bytes(ciphertext, key=KEY_16) == ""

    def test_roundtrip_unicode(self) -> None:
        plaintext = "日本語テスト 🔐 émojis"
        ciphertext = _encrypt_string(plaintext, key=KEY_16)
        assert _decrypt_bytes(ciphertext, key=KEY_16) == plaintext
class TestDecryptFallbackBehavior:
    """Checks when _decrypt_bytes falls back to a raw decode and when it raises."""

    def test_wrong_env_key_falls_back_to_raw_decode(self) -> None:
        """Default key path: AES fails on non-AES data → fallback to raw decode."""
        unencrypted_bytes = "readable text".encode()
        with patch(f"{EE_MODULE}.ENCRYPTION_KEY_SECRET", KEY_16):
            decoded = _decrypt_bytes(unencrypted_bytes)
            assert decoded == "readable text"

    def test_explicit_wrong_key_raises(self) -> None:
        """Explicit key path: AES fails → raises, no fallback."""
        ciphertext = _encrypt_string("secret", key=KEY_16)
        with pytest.raises(ValueError):
            _decrypt_bytes(ciphertext, key=KEY_16_ALT)

    def test_explicit_none_key_with_no_env(self) -> None:
        """key=None with empty env → raw decode."""
        with patch(f"{EE_MODULE}.ENCRYPTION_KEY_SECRET", ""):
            decoded = _decrypt_bytes(b"hello", key=None)
            assert decoded == "hello"

    def test_explicit_empty_string_key(self) -> None:
        """key='' means no encryption."""
        stored = _encrypt_string("test", key="")
        assert stored == b"test"
        assert _decrypt_bytes(stored, key="") == "test"
class TestKeyValidation:
    """Checks which key lengths are accepted and how over-long keys are trimmed."""

    def test_key_too_short_raises(self) -> None:
        with pytest.raises(RuntimeError, match="too short"):
            _encrypt_string("data", key="short")

    def test_16_byte_key(self) -> None:
        ciphertext = _encrypt_string("data", key=KEY_16)
        assert _decrypt_bytes(ciphertext, key=KEY_16) == "data"

    def test_24_byte_key(self) -> None:
        ciphertext = _encrypt_string("data", key=KEY_24)
        assert _decrypt_bytes(ciphertext, key=KEY_24) == "data"

    def test_32_byte_key(self) -> None:
        ciphertext = _encrypt_string("data", key=KEY_32)
        assert _decrypt_bytes(ciphertext, key=KEY_32) == "data"

    def test_long_key_truncated_to_32(self) -> None:
        """Keys longer than 32 bytes are truncated to 32."""
        oversized_key = "e" * 64
        ciphertext = _encrypt_string("data", key=oversized_key)
        assert _decrypt_bytes(ciphertext, key=oversized_key) == "data"

    def test_20_byte_key_trimmed_to_16(self) -> None:
        """A 20-byte key is trimmed to the largest valid AES size that fits (16)."""
        oversized_key = "f" * 20
        ciphertext = _encrypt_string("data", key=oversized_key)
        assert _decrypt_bytes(ciphertext, key=oversized_key) == "data"

        # Verify it was trimmed to 16 by checking that the first 16 bytes
        # of the key can also decrypt it
        trimmed_equivalent = "f" * 16
        assert _decrypt_bytes(ciphertext, key=trimmed_equivalent) == "data"

    def test_25_byte_key_trimmed_to_24(self) -> None:
        """A 25-byte key is trimmed to the largest valid AES size that fits (24)."""
        oversized_key = "g" * 25
        ciphertext = _encrypt_string("data", key=oversized_key)
        assert _decrypt_bytes(ciphertext, key=oversized_key) == "data"

        # The 24-byte prefix of the key must decrypt the same payload.
        trimmed_equivalent = "g" * 24
        assert _decrypt_bytes(ciphertext, key=trimmed_equivalent) == "data"

    def test_30_byte_key_trimmed_to_24(self) -> None:
        """A 30-byte key is trimmed to the largest valid AES size that fits (24)."""
        oversized_key = "h" * 30
        ciphertext = _encrypt_string("data", key=oversized_key)
        assert _decrypt_bytes(ciphertext, key=oversized_key) == "data"

        # The 24-byte prefix of the key must decrypt the same payload.
        trimmed_equivalent = "h" * 24
        assert _decrypt_bytes(ciphertext, key=trimmed_equivalent) == "data"
class TestWrapperFunctions:
    """Test encrypt_string_to_bytes / decrypt_bytes_to_string pass key through.

    With EE mode enabled, the wrappers resolve to EE implementations automatically.
    """

    def test_wrapper_passes_key(self) -> None:
        payload = "payload"
        ciphertext = encrypt_string_to_bytes(payload, key=KEY_16)
        assert decrypt_bytes_to_string(ciphertext, key=KEY_16) == payload

    def test_wrapper_no_key_uses_env(self) -> None:
        payload = "payload"
        with patch(f"{EE_MODULE}.ENCRYPTION_KEY_SECRET", KEY_32):
            ciphertext = encrypt_string_to_bytes(payload)
            assert decrypt_bytes_to_string(ciphertext) == payload

View File

@@ -0,0 +1,163 @@
"""Tests for user file ACL computation, including shared persona access."""
from unittest.mock import MagicMock
from unittest.mock import patch
from uuid import uuid4
from onyx.access.access import collect_user_file_access
from onyx.access.access import get_access_for_user_files_impl
from onyx.access.utils import prefix_user_email
from onyx.configs.constants import PUBLIC_DOC_PAT
def _make_user(email: str) -> MagicMock:
    """Build a mock user carrying the given email and a freshly generated UUID id."""
    # MagicMock keyword arguments are applied as attributes on the new mock.
    return MagicMock(email=email, id=uuid4())
def _make_persona(
*,
owner: MagicMock | None = None,
shared_users: list[MagicMock] | None = None,
is_public: bool = False,
deleted: bool = False,
) -> MagicMock:
persona = MagicMock()
persona.deleted = deleted
persona.is_public = is_public
persona.user_id = owner.id if owner else None
persona.user = owner
persona.users = shared_users or []
return persona
def _make_user_file(
*,
owner: MagicMock,
assistants: list[MagicMock] | None = None,
) -> MagicMock:
uf = MagicMock()
uf.id = uuid4()
uf.user = owner
uf.user_id = owner.id
uf.assistants = assistants or []
return uf
class TestCollectUserFileAccess:
    """Exercise collect_user_file_access against mocked user files and personas."""

    def test_owner_only(self) -> None:
        """A file with no personas yields only its owner's email and is not public."""
        file_owner = _make_user("owner@test.com")
        user_file = _make_user_file(owner=file_owner)

        emails, is_public = collect_user_file_access(user_file)

        assert emails == {"owner@test.com"}
        assert is_public is False

    def test_shared_persona_adds_users(self) -> None:
        """Users a persona is shared with are added next to the file owner."""
        file_owner = _make_user("owner@test.com")
        shared_user = _make_user("shared@test.com")
        persona = _make_persona(owner=file_owner, shared_users=[shared_user])
        user_file = _make_user_file(owner=file_owner, assistants=[persona])

        emails, is_public = collect_user_file_access(user_file)

        assert emails == {"owner@test.com", "shared@test.com"}
        assert is_public is False

    def test_persona_owner_added(self) -> None:
        """Persona owner (different from file owner) gets access too."""
        file_owner = _make_user("file-owner@test.com")
        persona_owner = _make_user("persona-owner@test.com")
        persona = _make_persona(owner=persona_owner)
        user_file = _make_user_file(owner=file_owner, assistants=[persona])

        emails, _ = collect_user_file_access(user_file)

        assert "file-owner@test.com" in emails
        assert "persona-owner@test.com" in emails

    def test_public_persona_makes_file_public(self) -> None:
        """A public persona flips the public flag while the owner stays listed."""
        file_owner = _make_user("owner@test.com")
        persona = _make_persona(owner=file_owner, is_public=True)
        user_file = _make_user_file(owner=file_owner, assistants=[persona])

        emails, is_public = collect_user_file_access(user_file)

        assert is_public is True
        assert "owner@test.com" in emails

    def test_deleted_persona_ignored(self) -> None:
        """A deleted persona contributes neither shared users nor public status."""
        file_owner = _make_user("owner@test.com")
        shared_user = _make_user("shared@test.com")
        persona = _make_persona(
            owner=file_owner, shared_users=[shared_user], deleted=True
        )
        user_file = _make_user_file(owner=file_owner, assistants=[persona])

        emails, is_public = collect_user_file_access(user_file)

        assert emails == {"owner@test.com"}
        assert is_public is False

    def test_multiple_personas_combine(self) -> None:
        """Shared users from every attached persona are merged together."""
        file_owner = _make_user("owner@test.com")
        first_shared = _make_user("a@test.com")
        second_shared = _make_user("b@test.com")
        personas = [
            _make_persona(owner=file_owner, shared_users=[first_shared]),
            _make_persona(owner=file_owner, shared_users=[second_shared]),
        ]
        user_file = _make_user_file(owner=file_owner, assistants=personas)

        emails, _ = collect_user_file_access(user_file)

        assert emails == {"owner@test.com", "a@test.com", "b@test.com"}

    def test_deduplication(self) -> None:
        """A user shared through two personas appears only once."""
        file_owner = _make_user("owner@test.com")
        shared_user = _make_user("shared@test.com")
        personas = [
            _make_persona(owner=file_owner, shared_users=[shared_user]),
            _make_persona(owner=file_owner, shared_users=[shared_user]),
        ]
        user_file = _make_user_file(owner=file_owner, assistants=personas)

        emails, _ = collect_user_file_access(user_file)

        assert emails == {"owner@test.com", "shared@test.com"}
class TestGetAccessForUserFiles:
    """Exercise get_access_for_user_files_impl with the user-file fetch patched out."""

    def test_shared_user_in_acl(self) -> None:
        """Shared persona users should appear in the ACL."""
        file_owner = _make_user("owner@test.com")
        shared_user = _make_user("shared@test.com")
        persona = _make_persona(owner=file_owner, shared_users=[shared_user])
        user_file = _make_user_file(owner=file_owner, assistants=[persona])
        file_key = str(user_file.id)
        db_session = MagicMock()

        # Return the mocked file instead of hitting the database.
        with patch(
            "onyx.access.access.fetch_user_files_with_access_relationships",
            return_value=[user_file],
        ):
            result = get_access_for_user_files_impl([file_key], db_session)

        access = result[file_key]
        acl = access.to_acl()
        assert prefix_user_email("owner@test.com") in acl
        assert prefix_user_email("shared@test.com") in acl
        assert access.is_public is False

    def test_public_persona_sets_public_acl(self) -> None:
        """A public persona marks the access public and adds PUBLIC_DOC_PAT to the ACL."""
        file_owner = _make_user("owner@test.com")
        persona = _make_persona(owner=file_owner, is_public=True)
        user_file = _make_user_file(owner=file_owner, assistants=[persona])
        file_key = str(user_file.id)
        db_session = MagicMock()

        # Return the mocked file instead of hitting the database.
        with patch(
            "onyx.access.access.fetch_user_files_with_access_relationships",
            return_value=[user_file],
        ):
            result = get_access_for_user_files_impl([file_key], db_session)

        access = result[file_key]
        assert access.is_public is True
        assert PUBLIC_DOC_PAT in access.to_acl()

View File

@@ -0,0 +1,54 @@
from unittest.mock import MagicMock
import pytest
import onyx.auth.users as users
from onyx.auth.users import verify_auth_setting
from onyx.configs.constants import AuthType
def test_verify_auth_setting_raises_for_cloud(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Cloud auth type is not valid for self-hosted deployments."""
    monkeypatch.setenv("AUTH_TYPE", "cloud")

    expected_error = "'cloud' is not a valid auth type"
    with pytest.raises(ValueError, match=expected_error):
        verify_auth_setting()
def test_verify_auth_setting_warns_for_disabled(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Disabled auth type logs a deprecation warning."""
    fake_logger = MagicMock()
    monkeypatch.setenv("AUTH_TYPE", "disabled")
    monkeypatch.setattr(users, "logger", fake_logger)
    monkeypatch.setattr(users, "AUTH_TYPE", AuthType.BASIC)

    verify_auth_setting()

    # Exactly one warning, and its first positional argument carries the notice.
    fake_logger.warning.assert_called_once()
    warning_message = fake_logger.warning.call_args.args[0]
    assert "no longer supported" in warning_message
@pytest.mark.parametrize(
    "auth_type",
    [AuthType.BASIC, AuthType.GOOGLE_OAUTH, AuthType.OIDC, AuthType.SAML],
)
def test_verify_auth_setting_valid_auth_types(
    monkeypatch: pytest.MonkeyPatch,
    auth_type: AuthType,
) -> None:
    """Valid auth types work without errors or warnings."""
    fake_logger = MagicMock()
    monkeypatch.setenv("AUTH_TYPE", auth_type.value)
    monkeypatch.setattr(users, "logger", fake_logger)
    monkeypatch.setattr(users, "AUTH_TYPE", auth_type)

    verify_auth_setting()

    fake_logger.warning.assert_not_called()
    expected_notice = f"Using Auth Type: {auth_type.value}"
    fake_logger.notice.assert_called_once_with(expected_notice)

View File

@@ -27,7 +27,6 @@ def _mock_session_returning_none() -> MagicMock:
"""Return a mock session whose .get() returns None (file not found)."""
session = MagicMock()
session.get.return_value = None
session.execute.return_value.scalar_one_or_none.return_value = None
return session
@@ -220,6 +219,10 @@ class TestDeleteUserFileImpl:
# ------------------------------------------------------------------
@patch(
f"{TASKS_MODULE}.fetch_user_files_with_access_relationships",
return_value=[],
)
class TestProjectSyncUserFileImpl:
@patch(f"{TASKS_MODULE}.get_session_with_current_tenant")
@patch(f"{TASKS_MODULE}.get_redis_client")
@@ -227,6 +230,7 @@ class TestProjectSyncUserFileImpl:
self,
mock_get_redis: MagicMock,
mock_get_session: MagicMock,
_mock_fetch: MagicMock,
) -> None:
redis_client = MagicMock()
lock = MagicMock()
@@ -255,6 +259,7 @@ class TestProjectSyncUserFileImpl:
self,
mock_get_redis: MagicMock,
mock_get_session: MagicMock,
_mock_fetch: MagicMock,
) -> None:
redis_client = MagicMock()
lock = MagicMock()
@@ -277,6 +282,7 @@ class TestProjectSyncUserFileImpl:
self,
mock_get_redis: MagicMock,
mock_get_session: MagicMock,
_mock_fetch: MagicMock,
) -> None:
session = _mock_session_returning_none()
mock_get_session.return_value.__enter__.return_value = session

View File

@@ -379,10 +379,13 @@ class TestProjectSyncImplNoVectorDb:
) -> None:
uf = _make_user_file(status=UserFileStatus.COMPLETED)
session = MagicMock()
session.execute.return_value.scalar_one_or_none.return_value = uf
mock_get_session.return_value.__enter__.return_value = session
with (
patch(
f"{TASKS_MODULE}.fetch_user_files_with_access_relationships",
return_value=[uf],
),
patch(f"{TASKS_MODULE}.get_all_document_indices") as mock_get_indices,
patch(f"{TASKS_MODULE}.get_active_search_settings") as mock_get_ss,
patch(f"{TASKS_MODULE}.httpx_init_vespa_pool") as mock_vespa_pool,
@@ -405,14 +408,17 @@ class TestProjectSyncImplNoVectorDb:
) -> None:
uf = _make_user_file(status=UserFileStatus.COMPLETED)
session = MagicMock()
session.execute.return_value.scalar_one_or_none.return_value = uf
mock_get_session.return_value.__enter__.return_value = session
project_sync_user_file_impl(
user_file_id=str(uf.id),
tenant_id="test-tenant",
redis_locking=False,
)
with patch(
f"{TASKS_MODULE}.fetch_user_files_with_access_relationships",
return_value=[uf],
):
project_sync_user_file_impl(
user_file_id=str(uf.id),
tenant_id="test-tenant",
redis_locking=False,
)
assert uf.needs_project_sync is False
assert uf.needs_persona_sync is False

View File

@@ -0,0 +1,630 @@
from typing import Any
from unittest.mock import MagicMock
from unittest.mock import patch
from onyx.chat.tool_call_args_streaming import maybe_emit_argument_delta
from onyx.server.query_and_chat.placement import Placement
from onyx.server.query_and_chat.streaming_models import ToolCallArgumentDelta
from onyx.utils.jsonriver import Parser
def _make_tool_call_delta(
index: int = 0,
name: str | None = None,
arguments: str | None = None,
function_is_none: bool = False,
) -> MagicMock:
"""Create a mock tool_call_delta matching the LiteLLM streaming shape."""
delta = MagicMock()
delta.index = index
if function_is_none:
delta.function = None
else:
delta.function = MagicMock()
delta.function.name = name
delta.function.arguments = arguments
return delta
def _make_placement() -> Placement:
    """Return a Placement at turn index 0 and tab index 0."""
    default_placement = Placement(turn_index=0, tab_index=0)
    return default_placement
def _mock_tool_class(emit: bool = True) -> MagicMock:
    """Build a mock tool class whose should_emit_argument_deltas() returns ``emit``."""
    tool_cls = MagicMock()
    tool_cls.should_emit_argument_deltas.return_value = emit
    return tool_cls
def _collect(
    tc_map: dict[int, dict[str, Any]],
    delta: MagicMock,
    placement: Placement | None = None,
    parsers: dict[int, Parser] | None = None,
) -> list[Any]:
    """Run maybe_emit_argument_delta once and return everything it yields as a list.

    A default placement is built when none is supplied, and a fresh parser
    dict is used when ``parsers`` is None.
    """
    effective_placement = placement or _make_placement()
    # Test against None explicitly so a caller-supplied empty dict is reused
    # (and mutated) rather than replaced.
    effective_parsers = {} if parsers is None else parsers
    packet_stream = maybe_emit_argument_delta(
        tc_map,
        delta,
        effective_placement,
        effective_parsers,
    )
    return list(packet_stream)
def _stream_fragments(
    fragments: list[str],
    tc_map: dict[int, dict[str, Any]],
    placement: Placement | None = None,
) -> list[str]:
    """Feed fragments into maybe_emit_argument_delta one at a time.

    Each fragment is appended to ``tc_map[0]["arguments"]`` before the call,
    and one parser dict is shared across all calls. Returns every emitted
    argument-delta value, in emission order, as a flat list.
    """
    active_placement = placement or _make_placement()
    shared_parsers: dict[int, Parser] = {}
    collected: list[str] = []

    for fragment in fragments:
        tc_map[0]["arguments"] += fragment
        fragment_delta = _make_tool_call_delta(arguments=fragment)
        packets = maybe_emit_argument_delta(
            tc_map, fragment_delta, active_placement, parsers=shared_parsers
        )
        for packet in packets:
            payload = packet.obj
            assert isinstance(payload, ToolCallArgumentDelta)
            collected.extend(payload.argument_deltas.values())

    return collected
class TestMaybeEmitArgumentDeltaGuards:
    """Tests for conditions that cause no packet to be emitted."""

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_no_emission_when_tool_does_not_opt_in(
        self, mock_get_tool: MagicMock
    ) -> None:
        """Tools that return False from should_emit_argument_deltas emit nothing."""
        mock_get_tool.return_value = _mock_tool_class(emit=False)
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": '{"code": "x'}
        }

        packets = _collect(tool_calls, _make_tool_call_delta(arguments="x"))

        assert packets == []

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_no_emission_when_tool_class_unknown(
        self, mock_get_tool: MagicMock
    ) -> None:
        """Nothing is emitted when the tool class lookup returns None."""
        mock_get_tool.return_value = None
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "unknown", "arguments": '{"code": "x'}
        }

        packets = _collect(tool_calls, _make_tool_call_delta(arguments="x"))

        assert packets == []

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_no_emission_when_no_argument_fragment(
        self, mock_get_tool: MagicMock
    ) -> None:
        """A delta whose arguments field is None emits nothing."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": '{"code": "x'}
        }

        packets = _collect(tool_calls, _make_tool_call_delta(arguments=None))

        assert packets == []

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_no_emission_when_key_value_not_started(
        self, mock_get_tool: MagicMock
    ) -> None:
        """Key exists in JSON but its string value hasn't begun yet."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": '{"code":'}
        }

        packets = _collect(tool_calls, _make_tool_call_delta(arguments=":"))

        assert packets == []

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_no_emission_before_any_key(self, mock_get_tool: MagicMock) -> None:
        """Only the opening brace has arrived — no key to stream yet."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": "{"}
        }

        packets = _collect(tool_calls, _make_tool_call_delta(arguments="{"))

        assert packets == []
class TestMaybeEmitArgumentDeltaBasic:
    """Tests for correct packet content and incremental emission."""

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_emits_packet_with_correct_fields(self, mock_get_tool: MagicMock) -> None:
        """Emitted packets carry the tool type and together rebuild the code value."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        placement = _make_placement()
        shared_parsers: dict[int, Parser] = {}

        collected = []
        for fragment in ['{"code": "', "print(1)", '"}']:
            tool_calls[0]["arguments"] += fragment
            collected.extend(
                _collect(
                    tool_calls,
                    _make_tool_call_delta(arguments=fragment),
                    placement,
                    shared_parsers,
                )
            )

        assert len(collected) >= 1

        # Verify packet structure
        first_payload = collected[0].obj
        assert isinstance(first_payload, ToolCallArgumentDelta)
        assert first_payload.tool_type == "python"

        # All emitted content should reconstruct the value
        pieces: list[str] = []
        for packet in collected:
            assert isinstance(packet.obj, ToolCallArgumentDelta)
            pieces.append(packet.obj.argument_deltas.get("code", ""))
        assert "".join(pieces) == "print(1)"

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_emits_only_new_content_on_subsequent_call(
        self, mock_get_tool: MagicMock
    ) -> None:
        """After a first emission, subsequent calls emit only the diff."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        shared_parsers: dict[int, Parser] = {}
        placement = _make_placement()

        # First fragment opens the string
        tool_calls[0]["arguments"] = '{"code": "abc'
        first_batch = _collect(
            tool_calls,
            _make_tool_call_delta(arguments='{"code": "abc'),
            placement,
            shared_parsers,
        )
        first_pieces: list[str] = []
        for packet in first_batch:
            assert isinstance(packet.obj, ToolCallArgumentDelta)
            first_pieces.append(packet.obj.argument_deltas.get("code", ""))
        assert "".join(first_pieces) == "abc"

        # Second fragment appends more
        tool_calls[0]["arguments"] = '{"code": "abcdef'
        second_batch = _collect(
            tool_calls,
            _make_tool_call_delta(arguments="def"),
            placement,
            shared_parsers,
        )
        second_pieces: list[str] = []
        for packet in second_batch:
            assert isinstance(packet.obj, ToolCallArgumentDelta)
            second_pieces.append(packet.obj.argument_deltas.get("code", ""))
        assert "".join(second_pieces) == "def"

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_handles_multiple_keys_sequentially(self, mock_get_tool: MagicMock) -> None:
        """When a second key starts, emissions switch to that key."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        fragments = [
            '{"code": "x',
            '", "output": "hello',
            '"}',
        ]

        combined = "".join(_stream_fragments(fragments, tool_calls))

        assert "x" in combined
        assert "hello" in combined

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_delta_spans_key_boundary(self, mock_get_tool: MagicMock) -> None:
        """A single delta contains the end of one value and the start of the next key."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        fragments = [
            '{"code": "x',
            'y", "lang": "py',
            '"}',
        ]

        combined = "".join(_stream_fragments(fragments, tool_calls))

        assert "xy" in combined
        assert "py" in combined

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_empty_value_emits_nothing(self, mock_get_tool: MagicMock) -> None:
        """An empty string value has nothing to emit."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }

        # Opening quote just arrived, value is empty
        opening = '{"code": "'
        tool_calls[0]["arguments"] = opening
        packets = _collect(tool_calls, _make_tool_call_delta(arguments=opening))

        # No string content yet, so either no packet or empty deltas
        for packet in packets:
            assert isinstance(packet.obj, ToolCallArgumentDelta)
            assert packet.obj.argument_deltas.get("code", "") == ""
class TestMaybeEmitArgumentDeltaDecoding:
    """Tests verifying that JSON escape sequences are properly decoded."""

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_decodes_newlines(self, mock_get_tool: MagicMock) -> None:
        """An escaped newline in the JSON value comes out as a real newline."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }

        decoded = "".join(_stream_fragments(['{"code": "line1\\nline2"}'], tool_calls))

        assert decoded == "line1\nline2"

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_decodes_tabs(self, mock_get_tool: MagicMock) -> None:
        """An escaped tab in the JSON value comes out as a real tab."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }

        decoded = "".join(_stream_fragments(['{"code": "\\tindented"}'], tool_calls))

        assert decoded == "\tindented"

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_decodes_escaped_quotes(self, mock_get_tool: MagicMock) -> None:
        """Escaped double quotes in the JSON value come out as plain quotes."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }

        decoded = "".join(_stream_fragments(['{"code": "say \\"hi\\""}'], tool_calls))

        assert decoded == 'say "hi"'

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_decodes_escaped_backslashes(self, mock_get_tool: MagicMock) -> None:
        """A doubled backslash in the JSON value comes out as a single backslash."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }

        decoded = "".join(_stream_fragments(['{"code": "path\\\\dir"}'], tool_calls))

        assert decoded == "path\\dir"

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_decodes_unicode_escape(self, mock_get_tool: MagicMock) -> None:
        """A four-digit unicode escape in the JSON value is decoded to its character."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }

        decoded = "".join(_stream_fragments(['{"code": "\\u0041"}'], tool_calls))

        assert decoded == "A"

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_incomplete_escape_at_end_decoded_on_next_chunk(
        self, mock_get_tool: MagicMock
    ) -> None:
        """A trailing backslash (incomplete escape) is completed in the next chunk."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        fragments = ['{"code": "hello\\', 'n"}']

        decoded = "".join(_stream_fragments(fragments, tool_calls))

        assert decoded == "hello\n"

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_incomplete_unicode_escape_completed_on_next_chunk(
        self, mock_get_tool: MagicMock
    ) -> None:
        """A partial \\uXX sequence is completed in the next chunk."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        fragments = ['{"code": "hello\\u00', '41"}']

        decoded = "".join(_stream_fragments(fragments, tool_calls))

        assert decoded == "helloA"
class TestArgumentDeltaStreamingE2E:
    """Simulates realistic sequences of LLM argument deltas to verify
    the full pipeline produces correct decoded output."""

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_realistic_python_code_streaming(self, mock_get_tool: MagicMock) -> None:
        """Streams a two-line print script split into many small fragments."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        fragments = [
            '{"',
            "code",
            '": "',
            "print(",
            "'hello')",
            "\\n",
            "print(",
            "'world')",
            '"}',
        ]

        decoded = "".join(_stream_fragments(fragments, tool_calls))

        assert decoded == "print('hello')\nprint('world')"

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_streaming_with_tabs_and_newlines(self, mock_get_tool: MagicMock) -> None:
        """Streams code with tabs and newlines."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        fragments = [
            '{"code": "',
            "if True:",
            "\\n",
            "\\t",
            "pass",
            '"}',
        ]

        decoded = "".join(_stream_fragments(fragments, tool_calls))

        assert decoded == "if True:\n\tpass"

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_split_escape_sequence(self, mock_get_tool: MagicMock) -> None:
        """An escape sequence split across two fragments (backslash in one,
        'n' in the next) should still decode correctly."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        fragments = [
            '{"code": "hello',
            "\\",
            "n",
            'world"}',
        ]

        decoded = "".join(_stream_fragments(fragments, tool_calls))

        assert decoded == "hello\nworld"

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_multiple_newlines_and_indentation(self, mock_get_tool: MagicMock) -> None:
        """Streams a multi-line function with multiple escape sequences."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        fragments = [
            '{"code": "',
            "def foo():",
            "\\n",
            "\\t",
            "x = 1",
            "\\n",
            "\\t",
            "return x",
            '"}',
        ]

        decoded = "".join(_stream_fragments(fragments, tool_calls))

        assert decoded == "def foo():\n\tx = 1\n\treturn x"

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_two_keys_streamed_sequentially(self, mock_get_tool: MagicMock) -> None:
        """Streams code first, then a second key (language) — both decoded."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        fragments = [
            '{"code": "',
            "x = 1",
            '", "language": "',
            "python",
            '"}',
        ]

        # Should have emissions for both keys
        combined = "".join(_stream_fragments(fragments, tool_calls))

        assert "x = 1" in combined
        assert "python" in combined

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_code_containing_dict_literal(self, mock_get_tool: MagicMock) -> None:
        """Python code like `x = {"key": "val"}` contains JSON-like patterns.

        The escaped quotes inside the *outer* JSON value should prevent the
        inner `"key":` from being mistaken for a top-level JSON key.
        """
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        # The LLM sends: {"code": "x = {\"key\": \"val\"}"}
        # The inner quotes are escaped as \" in the JSON value.
        fragments = [
            '{"code": "',
            "x = {",
            '\\"key\\"',
            ": ",
            '\\"val\\"',
            "}",
            '"}',
        ]

        decoded = "".join(_stream_fragments(fragments, tool_calls))

        assert decoded == 'x = {"key": "val"}'

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_code_with_colon_in_value(self, mock_get_tool: MagicMock) -> None:
        """Colons inside the string value should not confuse key detection."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        fragments = [
            '{"code": "',
            "url = ",
            '\\"https://example.com\\"',
            '"}',
        ]

        decoded = "".join(_stream_fragments(fragments, tool_calls))

        assert decoded == 'url = "https://example.com"'
class TestMaybeEmitArgumentDeltaEdgeCases:
    """Edge cases not covered by the standard test classes."""

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_no_emission_when_function_is_none(self, mock_get_tool: MagicMock) -> None:
        """Some delta chunks have function=None (e.g. role-only deltas)."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": '{"code": "x'}
        }
        functionless_delta = _make_tool_call_delta(
            arguments=None, function_is_none=True
        )

        assert _collect(tool_calls, functionless_delta) == []

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_multiple_concurrent_tool_calls(self, mock_get_tool: MagicMock) -> None:
        """Two tool calls streaming at different indices in parallel."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""},
            1: {"id": "tc_2", "name": "python", "arguments": ""},
        }
        shared_parsers: dict[int, Parser] = {}
        placement = _make_placement()

        # Feed full JSON to index 0
        tool_calls[0]["arguments"] = '{"code": "aaa"}'
        first_batch = _collect(
            tool_calls,
            _make_tool_call_delta(index=0, arguments='{"code": "aaa"}'),
            placement,
            shared_parsers,
        )
        first_pieces: list[str] = []
        for packet in first_batch:
            assert isinstance(packet.obj, ToolCallArgumentDelta)
            first_pieces.append(packet.obj.argument_deltas.get("code", ""))
        assert "".join(first_pieces) == "aaa"

        # Feed full JSON to index 1
        tool_calls[1]["arguments"] = '{"code": "bbb"}'
        second_batch = _collect(
            tool_calls,
            _make_tool_call_delta(index=1, arguments='{"code": "bbb"}'),
            placement,
            shared_parsers,
        )
        second_pieces: list[str] = []
        for packet in second_batch:
            assert isinstance(packet.obj, ToolCallArgumentDelta)
            second_pieces.append(packet.obj.argument_deltas.get("code", ""))
        assert "".join(second_pieces) == "bbb"

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_delta_with_four_arguments(self, mock_get_tool: MagicMock) -> None:
        """A single delta contains four complete key-value pairs."""
        mock_get_tool.return_value = _mock_tool_class()
        payload = '{"a": "one", "b": "two", "c": "three", "d": "four"}'
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        tool_calls[0]["arguments"] = payload
        shared_parsers: dict[int, Parser] = {}

        packets = _collect(
            tool_calls,
            _make_tool_call_delta(arguments=payload),
            parsers=shared_parsers,
        )

        # Collect all argument deltas across packets
        merged: dict[str, str] = {}
        for packet in packets:
            assert isinstance(packet.obj, ToolCallArgumentDelta)
            for key, piece in packet.obj.argument_deltas.items():
                merged[key] = merged.get(key, "") + piece

        expected = {
            "a": "one",
            "b": "two",
            "c": "three",
            "d": "four",
        }
        assert merged == expected

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_delta_on_second_arg_after_first_complete(
        self, mock_get_tool: MagicMock
    ) -> None:
        """First argument is fully complete; delta only adds to the second."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }
        fragments = [
            '{"code": "print(1)", "lang": "py',
            '"}',
        ]

        combined = "".join(_stream_fragments(fragments, tool_calls))

        assert "print(1)" in combined
        assert "py" in combined

    @patch("onyx.chat.tool_call_args_streaming._get_tool_class")
    def test_non_string_values_skipped(self, mock_get_tool: MagicMock) -> None:
        """Non-string values (numbers, booleans, null) are skipped — they are
        available in the final tool-call kickoff packet. String arguments
        following them are still emitted."""
        mock_get_tool.return_value = _mock_tool_class()
        tool_calls: dict[int, dict[str, Any]] = {
            0: {"id": "tc_1", "name": "python", "arguments": ""}
        }

        combined = "".join(
            _stream_fragments(['{"timeout": 30, "code": "hello"}'], tool_calls)
        )

        assert combined == "hello"

View File

@@ -6,6 +6,7 @@ Validates that:
- Crash + resume skips already-processed pages
- BFS (folder-scoped) drives process all items in one call
- 410 Gone triggers a full-resync URL in the checkpoint
- Duplicate document IDs across delta pages are deduplicated
"""
from __future__ import annotations
@@ -457,3 +458,228 @@ class TestDeltaPageFetchFailure:
assert final_cp.current_drive_name is None
assert final_cp.current_drive_id is None
assert final_cp.current_drive_delta_next_link is None
class TestDeltaDuplicateDocumentDedup:
"""The Microsoft Graph delta API can return the same item on multiple
pages. Documents already yielded should be skipped via
checkpoint.seen_document_ids."""
def test_duplicate_across_pages_is_skipped(
    self, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Item 'dup' appears on both page 1 and page 2. It should only be
    yielded once."""
    connector = _setup_connector(monkeypatch)
    _mock_convert(monkeypatch)

    # One entry per fetch; the length tells the fake which page to serve.
    fetched_urls: list[str] = []

    def fake_fetch_page(
        self: SharepointConnector,  # noqa: ARG001
        page_url: str,
        drive_id: str,  # noqa: ARG001
        start: datetime | None = None,  # noqa: ARG001
        end: datetime | None = None,  # noqa: ARG001
        page_size: int = 200,  # noqa: ARG001
    ) -> tuple[list[DriveItemData], str | None]:
        fetched_urls.append(page_url)
        if len(fetched_urls) > 1:
            return [_make_item("dup"), _make_item("b")], None
        return [_make_item("a"), _make_item("dup")], "https://next2"

    monkeypatch.setattr(
        SharepointConnector, "_fetch_one_delta_page", fake_fetch_page
    )

    checkpoint = _build_ready_checkpoint()

    # Page 1: yields a, dup
    first_run = connector._load_from_checkpoint(
        _START_TS, _END_TS, checkpoint, include_permissions=False
    )
    yielded, checkpoint = _consume_generator(first_run)
    first_ids = [doc.id for doc in _docs_from(yielded)]
    assert first_ids == ["a", "dup"]
    assert "dup" in checkpoint.seen_document_ids

    # Page 2: dup should be skipped, only b yielded
    second_run = connector._load_from_checkpoint(
        _START_TS, _END_TS, checkpoint, include_permissions=False
    )
    yielded, checkpoint = _consume_generator(second_run)
    second_ids = [doc.id for doc in _docs_from(yielded)]
    assert second_ids == ["b"]
def test_duplicate_within_same_page_is_skipped(
    self, monkeypatch: pytest.MonkeyPatch
) -> None:
    """If the same item appears twice on a single delta page, only the
    first occurrence should be yielded."""
    connector = _setup_connector(monkeypatch)
    _mock_convert(monkeypatch)

    def fake_fetch_page(
        self: SharepointConnector,  # noqa: ARG001
        page_url: str,  # noqa: ARG001
        drive_id: str,  # noqa: ARG001
        start: datetime | None = None,  # noqa: ARG001
        end: datetime | None = None,  # noqa: ARG001
        page_size: int = 200,  # noqa: ARG001
    ) -> tuple[list[DriveItemData], str | None]:
        # A single, final page on which "x" occurs twice.
        page_items = [_make_item(item_id) for item_id in ("x", "x", "y")]
        return page_items, None

    monkeypatch.setattr(
        SharepointConnector, "_fetch_one_delta_page", fake_fetch_page
    )

    checkpoint = _build_ready_checkpoint()
    run = connector._load_from_checkpoint(
        _START_TS, _END_TS, checkpoint, include_permissions=False
    )
    yielded, checkpoint = _consume_generator(run)

    yielded_ids = [doc.id for doc in _docs_from(yielded)]
    assert yielded_ids == ["x", "y"]
def test_seen_ids_survive_checkpoint_serialization(
    self, monkeypatch: pytest.MonkeyPatch
) -> None:
    """seen_document_ids must survive JSON serialization so that
    dedup works across crash + resume."""
    connector = _setup_connector(monkeypatch)
    _mock_convert(monkeypatch)

    # One entry per fetch; shared by both connectors so the second one gets page 2.
    fetched_urls: list[str] = []

    def fake_fetch_page(
        self: SharepointConnector,  # noqa: ARG001
        page_url: str,
        drive_id: str,  # noqa: ARG001
        start: datetime | None = None,  # noqa: ARG001
        end: datetime | None = None,  # noqa: ARG001
        page_size: int = 200,  # noqa: ARG001
    ) -> tuple[list[DriveItemData], str | None]:
        fetched_urls.append(page_url)
        if len(fetched_urls) > 1:
            return [_make_item("a"), _make_item("b")], None
        return [_make_item("a")], "https://next2"

    monkeypatch.setattr(
        SharepointConnector, "_fetch_one_delta_page", fake_fetch_page
    )

    checkpoint = _build_ready_checkpoint()

    # Page 1
    first_run = connector._load_from_checkpoint(
        _START_TS, _END_TS, checkpoint, include_permissions=False
    )
    _, checkpoint = _consume_generator(first_run)
    assert "a" in checkpoint.seen_document_ids

    # Simulate crash: round-trip through JSON
    serialized = checkpoint.model_dump_json()
    restored = SharepointConnectorCheckpoint.model_validate_json(serialized)
    assert "a" in restored.seen_document_ids

    # Page 2 with restored checkpoint: 'a' should be skipped
    resumed_connector = _setup_connector(monkeypatch)
    _mock_convert(monkeypatch)
    monkeypatch.setattr(
        SharepointConnector, "_fetch_one_delta_page", fake_fetch_page
    )
    second_run = resumed_connector._load_from_checkpoint(
        _START_TS, _END_TS, restored, include_permissions=False
    )
    yielded, _ = _consume_generator(second_run)

    resumed_ids = [doc.id for doc in _docs_from(yielded)]
    assert resumed_ids == ["b"]
def test_no_dedup_across_separate_indexing_runs(
    self, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Each new indexing run starts with no remembered document IDs.

    A freshly built checkpoint has an empty seen_document_ids, so a doc
    yielded by an earlier run is yielded again.
    """
    connector = _setup_connector(monkeypatch)
    _mock_convert(monkeypatch)

    def fake_fetch_page(
        self: SharepointConnector,  # noqa: ARG001
        page_url: str,  # noqa: ARG001
        drive_id: str,  # noqa: ARG001
        start: datetime | None = None,  # noqa: ARG001
        end: datetime | None = None,  # noqa: ARG001
        page_size: int = 200,  # noqa: ARG001
    ) -> tuple[list[DriveItemData], str | None]:
        # Always a single, final page containing only item "a".
        only_item = _make_item("a")
        return [only_item], None

    monkeypatch.setattr(
        SharepointConnector, "_fetch_one_delta_page", fake_fetch_page
    )

    # First run
    first_checkpoint = _build_ready_checkpoint()
    first_run = connector._load_from_checkpoint(
        _START_TS, _END_TS, first_checkpoint, include_permissions=False
    )
    first_yielded, _ = _consume_generator(first_run)
    assert len(_docs_from(first_yielded)) == 1

    # Second run with a fresh checkpoint — same doc should appear again
    second_checkpoint = _build_ready_checkpoint()
    assert len(second_checkpoint.seen_document_ids) == 0
    second_run = connector._load_from_checkpoint(
        _START_TS, _END_TS, second_checkpoint, include_permissions=False
    )
    second_yielded, _ = _consume_generator(second_run)
    assert len(_docs_from(second_yielded)) == 1
def test_same_id_across_drives_not_skipped(
    self, monkeypatch: pytest.MonkeyPatch
) -> None:
    """An ID seen in one drive must not suppress the same ID in another.

    Graph item IDs are only unique within a drive, so after drive A is
    finished the item with the identical ID in drive B still has to be
    yielded.
    """
    connector = _setup_connector(monkeypatch)
    _mock_convert(monkeypatch)

    def fake_fetch_page(
        self: SharepointConnector,  # noqa: ARG001
        page_url: str,  # noqa: ARG001
        drive_id: str,  # noqa: ARG001
        start: datetime | None = None,  # noqa: ARG001
        end: datetime | None = None,  # noqa: ARG001
        page_size: int = 200,  # noqa: ARG001
    ) -> tuple[list[DriveItemData], str | None]:
        # Every drive serves one final page holding the same item ID.
        colliding_item = _make_item("shared-id")
        return [colliding_item], None

    monkeypatch.setattr(
        SharepointConnector, "_fetch_one_delta_page", fake_fetch_page
    )

    checkpoint = _build_ready_checkpoint(drive_names=["DriveA", "DriveB"])

    # Drive A: yields the item
    drive_a_gen = connector._load_from_checkpoint(
        _START_TS, _END_TS, checkpoint, include_permissions=False
    )
    yielded, checkpoint = _consume_generator(drive_a_gen)
    drive_a_docs = _docs_from(yielded)
    assert len(drive_a_docs) == 1
    assert drive_a_docs[0].id == "shared-id"

    # seen_document_ids should have been cleared when drive A finished
    assert len(checkpoint.seen_document_ids) == 0

    # Drive B: same ID must be yielded again (different drive)
    drive_b_gen = connector._load_from_checkpoint(
        _START_TS, _END_TS, checkpoint, include_permissions=False
    )
    yielded, checkpoint = _consume_generator(drive_b_gen)
    drive_b_docs = _docs_from(yielded)
    assert len(drive_b_docs) == 1
    assert drive_b_docs[0].id == "shared-id"

View File

@@ -26,14 +26,6 @@ class TestIsTrueOpenAIModel:
"""Test that real OpenAI GPT-4o-mini model is correctly identified."""
assert is_true_openai_model(LlmProviderNames.OPENAI, "gpt-4o-mini") is True
def test_real_openai_o1_preview(self) -> None:
    """The o1-preview model under the OPENAI provider is a true OpenAI model."""
    is_true_openai = is_true_openai_model(LlmProviderNames.OPENAI, "o1-preview")
    assert is_true_openai is True
def test_real_openai_o1_mini(self) -> None:
    """The o1-mini model under the OPENAI provider is a true OpenAI model."""
    is_true_openai = is_true_openai_model(LlmProviderNames.OPENAI, "o1-mini")
    assert is_true_openai is True
def test_openai_with_provider_prefix(self) -> None:
    """A model name carrying the "openai/" prefix is not a true OpenAI model."""
    is_true_openai = is_true_openai_model(LlmProviderNames.OPENAI, "openai/gpt-4")
    assert is_true_openai is False

View File

@@ -0,0 +1,204 @@
"""Tests for Slack channel reference resolution and tag filtering
in handle_regular_answer.py."""
from unittest.mock import MagicMock
from slack_sdk.errors import SlackApiError
from onyx.context.search.models import Tag
from onyx.onyxbot.slack.constants import SLACK_CHANNEL_REF_PATTERN
from onyx.onyxbot.slack.handlers.handle_regular_answer import resolve_channel_references
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _mock_client_with_channels(
    channel_map: dict[str, str],
) -> MagicMock:
    """Build a mock WebClient whose conversations_info maps IDs to names.

    IDs present in ``channel_map`` return a subscriptable response whose
    "channel" entry carries the mapped name (with is_im / is_mpim False).
    Any other ID raises SlackApiError("channel_not_found").
    """

    def _conversations_info(channel: str) -> MagicMock:
        if channel not in channel_map:
            raise SlackApiError("channel_not_found", response=MagicMock())

        def _lookup(_self: MagicMock, key: str) -> dict:
            # Only "channel" is a valid key; anything else raises KeyError.
            payload = {
                "channel": {
                    "name": channel_map[channel],
                    "is_im": False,
                    "is_mpim": False,
                }
            }
            return payload[key]

        response = MagicMock()
        response.validate = MagicMock()
        response.__getitem__ = _lookup
        return response

    mock_client = MagicMock()
    mock_client.conversations_info = _conversations_info
    return mock_client
def _mock_logger() -> MagicMock:
return MagicMock()
# ---------------------------------------------------------------------------
# SLACK_CHANNEL_REF_PATTERN regex tests
# ---------------------------------------------------------------------------
class TestSlackChannelRefPattern:
    """Checks what SLACK_CHANNEL_REF_PATTERN.findall extracts from text."""

    def test_matches_bare_channel_id(self) -> None:
        """A bare <#ID> reference yields the ID and an empty name group."""
        found = SLACK_CHANNEL_REF_PATTERN.findall("<#C097NBWMY8Y>")
        assert found == [("C097NBWMY8Y", "")]

    def test_matches_channel_id_with_name(self) -> None:
        """A <#ID|name> reference yields both the ID and the name."""
        found = SLACK_CHANNEL_REF_PATTERN.findall("<#C097NBWMY8Y|eng-infra>")
        assert found == [("C097NBWMY8Y", "eng-infra")]

    def test_matches_multiple_channels(self) -> None:
        """Every reference in a message is reported, in either format."""
        text = "compare <#C111AAA> and <#C222BBB|general>"
        found = SLACK_CHANNEL_REF_PATTERN.findall(text)
        assert len(found) == 2
        for expected_pair in (("C111AAA", ""), ("C222BBB", "general")):
            assert expected_pair in found

    def test_no_match_on_plain_text(self) -> None:
        """Text without channel markup produces no matches."""
        found = SLACK_CHANNEL_REF_PATTERN.findall("no channels here")
        assert found == []

    def test_no_match_on_user_mention(self) -> None:
        """A user mention (<@...>) is not treated as a channel reference."""
        found = SLACK_CHANNEL_REF_PATTERN.findall("<@U12345>")
        assert found == []
# ---------------------------------------------------------------------------
# resolve_channel_references tests
# ---------------------------------------------------------------------------
class TestResolveChannelReferences:
    """Exercises resolve_channel_references against mocked Slack clients."""

    def test_resolves_bare_channel_id_via_api(self) -> None:
        """A bare <#ID> is replaced by #name looked up through the client."""
        slack_client = _mock_client_with_channels({"C097NBWMY8Y": "eng-infra"})
        fake_logger = _mock_logger()

        resolved, tags = resolve_channel_references(
            message="summary of <#C097NBWMY8Y> this week",
            client=slack_client,
            logger=fake_logger,
        )

        assert resolved == "summary of #eng-infra this week"
        assert len(tags) == 1
        expected_tag = Tag(tag_key="Channel", tag_value="eng-infra")
        assert tags[0] == expected_tag

    def test_uses_name_from_pipe_format_without_api_call(self) -> None:
        """When the markup already carries the name, no API lookup happens."""
        slack_client = MagicMock()
        fake_logger = _mock_logger()

        resolved, tags = resolve_channel_references(
            message="check <#C097NBWMY8Y|eng-infra> for updates",
            client=slack_client,
            logger=fake_logger,
        )

        assert resolved == "check #eng-infra for updates"
        assert tags == [Tag(tag_key="Channel", tag_value="eng-infra")]
        # Should NOT have called the API since name was in the markup
        slack_client.conversations_info.assert_not_called()

    def test_multiple_channels(self) -> None:
        """Two distinct references are both rewritten and both tagged."""
        known_channels = {
            "C111AAA": "eng-infra",
            "C222BBB": "eng-general",
        }
        slack_client = _mock_client_with_channels(known_channels)
        fake_logger = _mock_logger()

        resolved, tags = resolve_channel_references(
            message="compare <#C111AAA> and <#C222BBB>",
            client=slack_client,
            logger=fake_logger,
        )

        assert "#eng-infra" in resolved
        assert "#eng-general" in resolved
        assert "<#" not in resolved
        assert len(tags) == 2
        assert {tag.tag_value for tag in tags} == {"eng-infra", "eng-general"}

    def test_no_channel_references_returns_unchanged(self) -> None:
        """A message with no channel markup comes back as-is with no tags."""
        slack_client = MagicMock()
        fake_logger = _mock_logger()
        plain_text = "just a normal message with no channels"

        resolved, tags = resolve_channel_references(
            message=plain_text,
            client=slack_client,
            logger=fake_logger,
        )

        assert resolved == "just a normal message with no channels"
        assert tags == []

    def test_api_failure_skips_channel_gracefully(self) -> None:
        """A failed lookup leaves the raw markup, adds no tag, and warns once."""
        # Client that fails for all channel lookups
        slack_client = _mock_client_with_channels({})
        fake_logger = _mock_logger()

        resolved, tags = resolve_channel_references(
            message="check <#CBADID123>",
            client=slack_client,
            logger=fake_logger,
        )

        # Message should remain unchanged for the failed channel
        assert "<#CBADID123>" in resolved
        assert tags == []
        fake_logger.warning.assert_called_once()

    def test_partial_failure_resolves_what_it_can(self) -> None:
        """One resolvable and one unresolvable reference are handled apart."""
        # Only one of two channels resolves
        slack_client = _mock_client_with_channels({"C111AAA": "eng-infra"})
        fake_logger = _mock_logger()

        resolved, tags = resolve_channel_references(
            message="compare <#C111AAA> and <#CBADID123>",
            client=slack_client,
            logger=fake_logger,
        )

        assert "#eng-infra" in resolved
        assert "<#CBADID123>" in resolved  # failed one stays raw
        assert len(tags) == 1
        only_tag = tags[0]
        assert only_tag.tag_value == "eng-infra"

    def test_duplicate_channel_produces_single_tag(self) -> None:
        """The same channel mentioned twice is rewritten twice, tagged once."""
        slack_client = _mock_client_with_channels({"C111AAA": "eng-infra"})
        fake_logger = _mock_logger()

        resolved, tags = resolve_channel_references(
            message="summarize <#C111AAA> and compare with <#C111AAA>",
            client=slack_client,
            logger=fake_logger,
        )

        assert resolved == "summarize #eng-infra and compare with #eng-infra"
        assert len(tags) == 1
        only_tag = tags[0]
        assert only_tag.tag_value == "eng-infra"

    def test_mixed_pipe_and_bare_formats(self) -> None:
        """Pipe-format and bare references can be combined in one message."""
        slack_client = _mock_client_with_channels({"C222BBB": "random"})
        fake_logger = _mock_logger()

        resolved, tags = resolve_channel_references(
            message="see <#C111AAA|eng-infra> and <#C222BBB>",
            client=slack_client,
            logger=fake_logger,
        )

        assert "#eng-infra" in resolved
        assert "#random" in resolved
        assert len(tags) == 2

View File

@@ -0,0 +1,43 @@
"""Unit tests for _get_user_access_info helper function.
These tests mock all database operations and don't require a real database.
"""
from unittest.mock import MagicMock
from unittest.mock import patch
from sqlalchemy.orm import Session
from onyx.server.features.hierarchy.api import _get_user_access_info
def test_get_user_access_info_returns_email_and_groups() -> None:
    """The helper returns the user's email plus the patched external group IDs."""
    session = MagicMock(spec=Session)
    user = MagicMock()
    user.email = "test@example.com"

    group_lookup = patch(
        "onyx.server.features.hierarchy.api.get_user_external_group_ids",
        return_value=["group1", "group2"],
    )
    with group_lookup:
        email, groups = _get_user_access_info(user, session)

    assert email == "test@example.com"
    assert groups == ["group1", "group2"]
def test_get_user_access_info_with_no_groups() -> None:
    """When the group lookup yields nothing, an empty list is returned."""
    session = MagicMock(spec=Session)
    user = MagicMock()
    user.email = "solo@example.com"

    group_lookup = patch(
        "onyx.server.features.hierarchy.api.get_user_external_group_ids",
        return_value=[],
    )
    with group_lookup:
        email, groups = _get_user_access_info(user, session)

    assert email == "solo@example.com"
    assert groups == []

View File

@@ -0,0 +1,54 @@
import datetime
from unittest.mock import MagicMock
from uuid import uuid4
from onyx.auth.schemas import UserRole
from onyx.server.models import FullUserSnapshot
from onyx.server.models import UserGroupInfo
def _mock_user(
    personal_name: str | None = "Test User",
    created_at: datetime.datetime | None = None,
    updated_at: datetime.datetime | None = None,
) -> MagicMock:
    """Build a MagicMock user with fixed identity fields for snapshot tests.

    Falsy ``created_at`` / ``updated_at`` fall back to 2025-01-01 and
    2025-06-15 (both UTC) respectively.
    """
    utc = datetime.timezone.utc
    fallback_created = datetime.datetime(2025, 1, 1, tzinfo=utc)
    fallback_updated = datetime.datetime(2025, 6, 15, tzinfo=utc)

    mock = MagicMock()
    mock.id = uuid4()
    mock.email = "test@example.com"
    mock.role = UserRole.BASIC
    mock.is_active = True
    mock.password_configured = True
    mock.personal_name = personal_name
    mock.created_at = created_at if created_at else fallback_created
    mock.updated_at = updated_at if updated_at else fallback_updated
    return mock
def test_from_user_model_includes_new_fields() -> None:
    """The snapshot carries personal_name, both timestamps, and the groups."""
    alice = _mock_user(personal_name="Alice")
    engineering = UserGroupInfo(id=1, name="Engineering")
    groups = [engineering]

    snapshot = FullUserSnapshot.from_user_model(alice, groups=groups)

    assert snapshot.personal_name == "Alice"
    assert snapshot.created_at == alice.created_at
    assert snapshot.updated_at == alice.updated_at
    assert snapshot.groups == groups
def test_from_user_model_defaults_groups_to_empty() -> None:
    """Omitting ``groups`` produces a snapshot with an empty groups list."""
    default_user = _mock_user()
    snapshot = FullUserSnapshot.from_user_model(default_user)
    assert snapshot.groups == []
def test_from_user_model_personal_name_none() -> None:
    """A user with no personal name yields a snapshot whose name is None."""
    nameless_user = _mock_user(personal_name=None)
    snapshot = FullUserSnapshot.from_user_model(nameless_user)
    assert snapshot.personal_name is None

View File

@@ -0,0 +1,188 @@
from io import BytesIO
from unittest.mock import MagicMock
import pytest
from fastapi import UploadFile
from onyx.server.features.projects import projects_file_utils as utils
class _Tokenizer:
def encode(self, text: str) -> list[int]:
return [1] * len(text)
class _NonSeekableFile(BytesIO):
def tell(self) -> int:
raise OSError("tell not supported")
def seek(self, *_args: object, **_kwargs: object) -> int:
raise OSError("seek not supported")
def _make_upload(filename: str, size: int, content: bytes | None = None) -> UploadFile:
    """Build an UploadFile that reports ``size``.

    The body is ``content`` when given, otherwise ``size`` bytes of b"x".
    """
    if content is None:
        content = b"x" * size
    stream = BytesIO(content)
    return UploadFile(filename=filename, file=stream, size=size)
def _make_upload_no_size(filename: str, content: bytes) -> UploadFile:
    """Build an UploadFile over ``content`` whose ``size`` is None."""
    stream = BytesIO(content)
    return UploadFile(filename=filename, file=stream, size=None)
def _patch_common_dependencies(monkeypatch: pytest.MonkeyPatch) -> None:
    """Stub the model lookup, tokenizer, and password check on ``utils``."""
    stubs = {
        "fetch_default_llm_model": lambda _db: None,
        "get_tokenizer": lambda **_kwargs: _Tokenizer(),
        "is_file_password_protected": lambda **_kwargs: False,
    }
    for attr_name, replacement in stubs.items():
        monkeypatch.setattr(utils, attr_name, replacement)
def test_get_upload_size_bytes_falls_back_to_stream_size() -> None:
    """With no declared size, the stream length is used and position restored."""
    stream = BytesIO(b"abcdef")
    upload = UploadFile(filename="example.txt", file=stream, size=None)
    # Start mid-stream so a restored position can be distinguished from 0.
    upload.file.seek(2)

    measured = utils.get_upload_size_bytes(upload)

    assert measured == 6
    assert upload.file.tell() == 2
def test_get_upload_size_bytes_logs_warning_when_stream_size_unavailable(
    caplog: pytest.LogCaptureFixture,
) -> None:
    """A stream that cannot seek/tell gives None and a warning naming the file."""
    broken_stream = _NonSeekableFile()
    upload = UploadFile(filename="non_seekable.txt", file=broken_stream, size=None)
    caplog.set_level("WARNING")

    measured = utils.get_upload_size_bytes(upload)

    assert measured is None
    logged = caplog.text
    assert "Could not determine upload size via stream seek" in logged
    assert "non_seekable.txt" in logged
def test_is_upload_too_large_logs_warning_when_size_unknown(
    monkeypatch: pytest.MonkeyPatch,
    caplog: pytest.LogCaptureFixture,
) -> None:
    """An unknown size is treated as not too large, with a warning logged."""
    upload = _make_upload("size_unknown.txt", size=1)
    # Force the "size cannot be determined" path.
    monkeypatch.setattr(utils, "get_upload_size_bytes", lambda _upload: None)
    caplog.set_level("WARNING")

    verdict = utils.is_upload_too_large(upload, max_bytes=100)

    assert verdict is False
    logged = caplog.text
    assert "Could not determine upload size; skipping size-limit check" in logged
    assert "size_unknown.txt" in logged
def test_categorize_uploaded_files_accepts_size_under_limit(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A 99-byte upload is accepted when the limit is 100 bytes."""
    _patch_common_dependencies(monkeypatch)
    monkeypatch.setattr(utils, "USER_FILE_MAX_UPLOAD_SIZE_BYTES", 100)
    monkeypatch.setattr(utils, "USER_FILE_MAX_UPLOAD_SIZE_MB", 1)
    monkeypatch.setattr(utils, "estimate_image_tokens_for_upload", lambda _upload: 10)

    under_limit = _make_upload("small.png", size=99)
    outcome = utils.categorize_uploaded_files([under_limit], MagicMock())

    assert len(outcome.acceptable) == 1
    assert len(outcome.rejected) == 0
def test_categorize_uploaded_files_uses_seek_fallback_when_upload_size_missing(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An upload with size=None and 99 bytes of content is still accepted."""
    _patch_common_dependencies(monkeypatch)
    monkeypatch.setattr(utils, "USER_FILE_MAX_UPLOAD_SIZE_BYTES", 100)
    monkeypatch.setattr(utils, "USER_FILE_MAX_UPLOAD_SIZE_MB", 1)
    monkeypatch.setattr(utils, "estimate_image_tokens_for_upload", lambda _upload: 10)

    payload = b"x" * 99
    sizeless = _make_upload_no_size("small.png", content=payload)
    outcome = utils.categorize_uploaded_files([sizeless], MagicMock())

    assert len(outcome.acceptable) == 1
    assert len(outcome.rejected) == 0
def test_categorize_uploaded_files_accepts_size_at_limit(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An upload exactly at the 100-byte limit is accepted."""
    _patch_common_dependencies(monkeypatch)
    monkeypatch.setattr(utils, "USER_FILE_MAX_UPLOAD_SIZE_BYTES", 100)
    monkeypatch.setattr(utils, "USER_FILE_MAX_UPLOAD_SIZE_MB", 1)
    monkeypatch.setattr(utils, "estimate_image_tokens_for_upload", lambda _upload: 10)

    at_limit = _make_upload("edge.png", size=100)
    outcome = utils.categorize_uploaded_files([at_limit], MagicMock())

    assert len(outcome.acceptable) == 1
    assert len(outcome.rejected) == 0
def test_categorize_uploaded_files_rejects_size_over_limit_with_reason(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A 101-byte upload is rejected with the MB-based size-limit reason."""
    _patch_common_dependencies(monkeypatch)
    monkeypatch.setattr(utils, "USER_FILE_MAX_UPLOAD_SIZE_BYTES", 100)
    monkeypatch.setattr(utils, "USER_FILE_MAX_UPLOAD_SIZE_MB", 1)
    monkeypatch.setattr(utils, "estimate_image_tokens_for_upload", lambda _upload: 10)

    over_limit = _make_upload("large.png", size=101)
    outcome = utils.categorize_uploaded_files([over_limit], MagicMock())

    assert len(outcome.acceptable) == 0
    assert len(outcome.rejected) == 1
    rejection = outcome.rejected[0]
    assert rejection.reason == "Exceeds 1 MB file size limit"
def test_categorize_uploaded_files_mixed_batch_keeps_valid_and_rejects_oversized(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """In one batch, the small file is kept and the oversized one rejected."""
    _patch_common_dependencies(monkeypatch)
    monkeypatch.setattr(utils, "USER_FILE_MAX_UPLOAD_SIZE_BYTES", 100)
    monkeypatch.setattr(utils, "USER_FILE_MAX_UPLOAD_SIZE_MB", 1)
    monkeypatch.setattr(utils, "estimate_image_tokens_for_upload", lambda _upload: 10)

    batch = [
        _make_upload("small.png", size=50),
        _make_upload("large.png", size=101),
    ]
    outcome = utils.categorize_uploaded_files(batch, MagicMock())

    accepted_names = [accepted.filename for accepted in outcome.acceptable]
    assert accepted_names == ["small.png"]
    assert len(outcome.rejected) == 1
    rejection = outcome.rejected[0]
    assert rejection.filename == "large.png"
    assert rejection.reason == "Exceeds 1 MB file size limit"
def test_categorize_uploaded_files_enforces_size_limit_even_when_threshold_is_skipped(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """SKIP_USERFILE_THRESHOLD=True does not disable the byte-size limit."""
    _patch_common_dependencies(monkeypatch)
    monkeypatch.setattr(utils, "SKIP_USERFILE_THRESHOLD", True)
    monkeypatch.setattr(utils, "USER_FILE_MAX_UPLOAD_SIZE_BYTES", 100)
    monkeypatch.setattr(utils, "USER_FILE_MAX_UPLOAD_SIZE_MB", 1)

    over_limit = _make_upload("oversized.pdf", size=101)
    outcome = utils.categorize_uploaded_files([over_limit], MagicMock())

    assert len(outcome.acceptable) == 0
    assert len(outcome.rejected) == 1
    rejection = outcome.rejected[0]
    assert rejection.reason == "Exceeds 1 MB file size limit"
def test_categorize_uploaded_files_checks_size_before_text_extraction(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An oversized file is rejected without extract_file_text being called."""
    _patch_common_dependencies(monkeypatch)
    monkeypatch.setattr(utils, "USER_FILE_MAX_UPLOAD_SIZE_BYTES", 100)
    monkeypatch.setattr(utils, "USER_FILE_MAX_UPLOAD_SIZE_MB", 1)

    extraction_spy = MagicMock(return_value="this should not run")
    monkeypatch.setattr(utils, "extract_file_text", extraction_spy)

    over_limit = _make_upload("oversized.pdf", size=101)
    outcome = utils.categorize_uploaded_files([over_limit], MagicMock())

    extraction_spy.assert_not_called()
    assert len(outcome.acceptable) == 0
    assert len(outcome.rejected) == 1
    rejection = outcome.rejected[0]
    assert rejection.reason == "Exceeds 1 MB file size limit"

View File

@@ -0,0 +1,32 @@
import pytest
from onyx.key_value_store.interface import KvKeyNotFoundError
from onyx.server.settings import store as settings_store
class _FakeKvStore:
    """Key-value store stub in which every lookup misses."""

    def load(self, _key: str) -> dict:
        """Always raise KvKeyNotFoundError, whatever key is requested."""
        missing = KvKeyNotFoundError()
        raise missing
class _FakeCache:
def __init__(self) -> None:
self._vals: dict[str, bytes] = {}
def get(self, key: str) -> bytes | None:
return self._vals.get(key)
def set(self, key: str, value: str, ex: int | None = None) -> None: # noqa: ARG002
self._vals[key] = value.encode("utf-8")
def test_load_settings_includes_user_file_max_upload_size_mb(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """load_settings reports the patched USER_FILE_MAX_UPLOAD_SIZE_MB value."""
    patched_attrs = {
        "get_kv_store": lambda: _FakeKvStore(),
        "get_cache_backend": lambda: _FakeCache(),
        "USER_FILE_MAX_UPLOAD_SIZE_MB": 77,
    }
    for attr_name, replacement in patched_attrs.items():
        monkeypatch.setattr(settings_store, attr_name, replacement)

    loaded = settings_store.load_settings()

    assert loaded.user_file_max_upload_size_mb == 77

View File

@@ -147,15 +147,18 @@ class TestSensitiveValueString:
)
assert sensitive1 != sensitive2
def test_equality_with_non_sensitive_raises(self) -> None:
"""Test that comparing with non-SensitiveValue raises error."""
def test_equality_with_non_sensitive_returns_not_equal(self) -> None:
"""Test that comparing with non-SensitiveValue is always not-equal.
Returns NotImplemented so Python falls back to identity comparison.
This is required for compatibility with SQLAlchemy's attribute tracking.
"""
sensitive = SensitiveValue(
encrypted_bytes=_encrypt_string("secret"),
decrypt_fn=_decrypt_string,
is_json=False,
)
with pytest.raises(SensitiveAccessError):
_ = sensitive == "secret"
assert not (sensitive == "secret")
class TestSensitiveValueJson:

Some files were not shown because too many files have changed in this diff Show More