Mirror of https://github.com/onyx-dot-app/onyx.git, synced 2026-02-19 08:45:47 +00:00
Compare commits
63 Commits
bg_process
...
nit
| SHA1 |
|---|
| c68602f456 |
| 9d57f34c34 |
| cc2f584321 |
| a1b95df3b8 |
| 9272d6ebfe |
| 4fb65dcf73 |
| 2bbc5d5d07 |
| 950b1c38f2 |
| 99fbfba32f |
| 0a59efe64a |
| cf5d394d39 |
| f6d8f5ca89 |
| 1fb4cdfcc3 |
| ac51469bcb |
| c25f164e28 |
| 813720905b |
| 0c45488ac6 |
| 95d9b33c1a |
| 55919f596c |
| 1d0fb6d012 |
| 2b1dbde829 |
| 2758ffd9d5 |
| 07a1b49b4f |
| 43d8daa5bc |
| faeb9f09f0 |
| 25f5c12750 |
| 2d81710ccc |
| 187a7d2da2 |
| 4b152aa3a7 |
| 06f937cf93 |
| 5a24ed2947 |
| 2372e6a5a5 |
| 3eef4e3992 |
| 467ce4e3f3 |
| ee4b334a0a |
| 4087292001 |
| da6ed5b2b3 |
| 864ac2ac5c |
| 12cb77c80e |
| 583cd14bf4 |
| 001fcb3359 |
| 7ff18e0a93 |
| 9ac256e925 |
| 08600db41d |
| 6bf06ac7f7 |
| 5b06b53a3e |
| afce57b29f |
| 257dbecd1d |
| bd6baf39c3 |
| b2c55ebd71 |
| dea7a8f697 |
| ddae2346ec |
| 9032fb4467 |
| b6ecbbcf45 |
| 1d8e662b79 |
| 2cb33b1fb4 |
| 2cd1e6be00 |
| 8e55566f66 |
| bafb95d920 |
| c6e8bf2d28 |
| c2d04f591d |
| 56c3a5ff5b |
| fac2b100a1 |
@@ -3,61 +3,61 @@ name: Build and Push Backend Image on Tag
 on:
   push:
     tags:
-      - '*'
+      - "*"

 env:
-  REGISTRY_IMAGE: danswer/danswer-backend
+  REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'danswer/danswer-backend-cloud' || 'danswer/danswer-backend' }}
   LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}

 jobs:
   build-and-push:
     # TODO: investigate a matrix build like the web container
     # See https://runs-on.com/runners/linux/
-    runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}"]
+    runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"]

     steps:
       - name: Checkout code
         uses: actions/checkout@v4

       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3

       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_TOKEN }}

       - name: Install build-essential
         run: |
           sudo apt-get update
           sudo apt-get install -y build-essential

       - name: Backend Image Docker Build and Push
         uses: docker/build-push-action@v5
         with:
           context: ./backend
           file: ./backend/Dockerfile
           platforms: linux/amd64,linux/arm64
           push: true
           tags: |
             ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
             ${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
           build-args: |
             DANSWER_VERSION=${{ github.ref_name }}

       # trivy has their own rate limiting issues causing this action to flake
       # we worked around it by hardcoding to different db repos in env
       # can re-enable when they figure it out
       # https://github.com/aquasecurity/trivy/discussions/7538
       # https://github.com/aquasecurity/trivy-action/issues/389
       - name: Run Trivy vulnerability scanner
         uses: aquasecurity/trivy-action@master
         env:
-          TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2'
-          TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1'
+          TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
+          TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
         with:
           # To run locally: trivy image --severity HIGH,CRITICAL danswer/danswer-backend
           image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
-          severity: 'CRITICAL,HIGH'
+          severity: "CRITICAL,HIGH"
           trivyignores: ./backend/.trivyignore
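The new REGISTRY_IMAGE value uses the GitHub Actions `&&`/`||` short-circuit idiom as a ternary: when the pushed tag name contains "cloud", the cloud image repository is selected, otherwise the standard one. A small Python sketch of the same selection logic, with made-up tag names for illustration:

```python
def pick_registry_image(ref_name: str) -> str:
    """Mirror of the workflow's `contains(github.ref_name, 'cloud') && ... || ...` expression."""
    if "cloud" in ref_name:
        return "danswer/danswer-backend-cloud"
    return "danswer/danswer-backend"


# Hypothetical tag names, for illustration only:
assert pick_registry_image("v0.10.0-cloud.1") == "danswer/danswer-backend-cloud"
assert pick_registry_image("v0.10.0") == "danswer/danswer-backend"
```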
@@ -4,12 +4,12 @@ name: Build and Push Cloud Web Image on Tag
 on:
   push:
     tags:
-      - '*'
+      - "*"

 env:
-  REGISTRY_IMAGE: danswer/danswer-cloud-web-server
+  REGISTRY_IMAGE: danswer/danswer-web-server-cloud
   LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}

 jobs:
   build:
     runs-on:

@@ -28,11 +28,11 @@ jobs:
       - name: Prepare
         run: |
           platform=${{ matrix.platform }}
           echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV

       - name: Checkout
         uses: actions/checkout@v4

       - name: Docker meta
         id: meta
         uses: docker/metadata-action@v5

@@ -41,16 +41,16 @@ jobs:
           tags: |
             type=raw,value=${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
             type=raw,value=${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}

       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3

       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_TOKEN }}

       - name: Build and push by digest
         id: build
         uses: docker/build-push-action@v5

@@ -65,17 +65,17 @@ jobs:
             NEXT_PUBLIC_POSTHOG_KEY=${{ secrets.POSTHOG_KEY }}
             NEXT_PUBLIC_POSTHOG_HOST=${{ secrets.POSTHOG_HOST }}
             NEXT_PUBLIC_SENTRY_DSN=${{ secrets.SENTRY_DSN }}
           # needed due to weird interactions with the builds for different platforms
           no-cache: true
           labels: ${{ steps.meta.outputs.labels }}
           outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true

       - name: Export digest
         run: |
           mkdir -p /tmp/digests
           digest="${{ steps.build.outputs.digest }}"
           touch "/tmp/digests/${digest#sha256:}"

       - name: Upload digest
         uses: actions/upload-artifact@v4
         with:

@@ -95,42 +95,42 @@ jobs:
           path: /tmp/digests
           pattern: digests-*
           merge-multiple: true

       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3

       - name: Docker meta
         id: meta
         uses: docker/metadata-action@v5
         with:
           images: ${{ env.REGISTRY_IMAGE }}

       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_TOKEN }}

       - name: Create manifest list and push
         working-directory: /tmp/digests
         run: |
           docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
             $(printf '${{ env.REGISTRY_IMAGE }}@sha256:%s ' *)

       - name: Inspect image
         run: |
           docker buildx imagetools inspect ${{ env.REGISTRY_IMAGE }}:${{ steps.meta.outputs.version }}

       # trivy has their own rate limiting issues causing this action to flake
       # we worked around it by hardcoding to different db repos in env
       # can re-enable when they figure it out
       # https://github.com/aquasecurity/trivy/discussions/7538
       # https://github.com/aquasecurity/trivy-action/issues/389
       - name: Run Trivy vulnerability scanner
         uses: aquasecurity/trivy-action@master
         env:
-          TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2'
-          TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1'
+          TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
+          TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
         with:
           image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
-          severity: 'CRITICAL,HIGH'
+          severity: "CRITICAL,HIGH"
@@ -3,53 +3,53 @@ name: Build and Push Model Server Image on Tag
 on:
   push:
     tags:
-      - '*'
+      - "*"

 env:
-  REGISTRY_IMAGE: danswer/danswer-model-server
+  REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'danswer/danswer-model-server-cloud' || 'danswer/danswer-model-server' }}
   LATEST_TAG: ${{ contains(github.ref_name, 'latest') }}

 jobs:
   build-and-push:
     # See https://runs-on.com/runners/linux/
-    runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}"]
+    runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"]

     steps:
       - name: Checkout code
         uses: actions/checkout@v4

       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3

       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_TOKEN }}

       - name: Model Server Image Docker Build and Push
         uses: docker/build-push-action@v5
         with:
           context: ./backend
           file: ./backend/Dockerfile.model_server
           platforms: linux/amd64,linux/arm64
           push: true
           tags: |
             ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }}
             ${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }}
           build-args: |
             DANSWER_VERSION=${{ github.ref_name }}

       # trivy has their own rate limiting issues causing this action to flake
       # we worked around it by hardcoding to different db repos in env
       # can re-enable when they figure it out
       # https://github.com/aquasecurity/trivy/discussions/7538
       # https://github.com/aquasecurity/trivy-action/issues/389
       - name: Run Trivy vulnerability scanner
         uses: aquasecurity/trivy-action@master
         env:
-          TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2'
-          TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1'
+          TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2"
+          TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1"
         with:
           image-ref: docker.io/danswer/danswer-model-server:${{ github.ref_name }}
-          severity: 'CRITICAL,HIGH'
+          severity: "CRITICAL,HIGH"
11  .github/workflows/pr-Integration-tests.yml  (vendored)

@@ -210,17 +210,18 @@ jobs:
           echo "All integration tests passed successfully."
         fi

-      - name: Stop Docker containers
-        run: |
-          cd deployment/docker_compose
-          docker compose -f docker-compose.dev.yml -p danswer-stack down -v
-
+      # save before stopping the containers so the logs can be captured
       - name: Save Docker logs
         if: success() || failure()
         run: |
           cd deployment/docker_compose
           docker compose -f docker-compose.dev.yml -p danswer-stack logs > docker-compose.log
           mv docker-compose.log ${{ github.workspace }}/docker-compose.log

+      - name: Stop Docker containers
+        run: |
+          cd deployment/docker_compose
+          docker compose -f docker-compose.dev.yml -p danswer-stack down -v
+
       - name: Upload logs
         if: success() || failure()
@@ -1,24 +1,20 @@
-# This workflow is intentionally disabled while we're still working on it
-# It's close to ready, but a race condition needs to be fixed with
-# API server and Vespa startup, and it needs to have a way to build/test against
-# local containers
-
 name: Helm - Lint and Test Charts

 on:
   merge_group:
   pull_request:
     branches: [ main ]

   workflow_dispatch: # Allows manual triggering

 jobs:
-  lint-test:
+  helm-chart-check:
     # See https://runs-on.com/runners/linux/
     runs-on: [runs-on,runner=8cpu-linux-x64,hdd=256,"run-id=${{ github.run_id }}"]

     # fetch-depth 0 is required for helm/chart-testing-action
     steps:
       - name: Checkout code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           fetch-depth: 0

@@ -28,7 +24,7 @@ jobs:
           version: v3.14.4

       - name: Set up Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: '3.11'
           cache: 'pip'

@@ -45,24 +41,31 @@ jobs:
       - name: Set up chart-testing
         uses: helm/chart-testing-action@v2.6.1

+      # even though we specify chart-dirs in ct.yaml, it isn't used by ct for the list-changed command...
       - name: Run chart-testing (list-changed)
         id: list-changed
         run: |
-          changed=$(ct list-changed --target-branch ${{ github.event.repository.default_branch }})
+          echo "default_branch: ${{ github.event.repository.default_branch }}"
+          changed=$(ct list-changed --remote origin --target-branch ${{ github.event.repository.default_branch }} --chart-dirs deployment/helm/charts)
+          echo "list-changed output: $changed"
           if [[ -n "$changed" ]]; then
             echo "changed=true" >> "$GITHUB_OUTPUT"
           fi

+      # lint all charts if any changes were detected
       - name: Run chart-testing (lint)
-        # if: steps.list-changed.outputs.changed == 'true'
-        run: ct lint --all --config ct.yaml --target-branch ${{ github.event.repository.default_branch }}
+        if: steps.list-changed.outputs.changed == 'true'
+        run: ct lint --config ct.yaml --all
+        # the following would lint only changed charts, but linting isn't expensive
+        # run: ct lint --config ct.yaml --target-branch ${{ github.event.repository.default_branch }}

       - name: Create kind cluster
-        # if: steps.list-changed.outputs.changed == 'true'
+        if: steps.list-changed.outputs.changed == 'true'
         uses: helm/kind-action@v1.10.0

       - name: Run chart-testing (install)
-        # if: steps.list-changed.outputs.changed == 'true'
-        run: ct install --all --config ct.yaml
-        # run: ct install --target-branch ${{ github.event.repository.default_branch }}
+        if: steps.list-changed.outputs.changed == 'true'
+        run: ct install --all --helm-extra-set-args="--set=nginx.enabled=false" --debug --config ct.yaml
+        # the following would install only changed charts, but we only have one chart so
+        # don't worry about that for now
+        # run: ct install --target-branch ${{ github.event.repository.default_branch }}
@@ -21,6 +21,8 @@ env:
   # Google
   GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR }}
   GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR }}
+  GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR }}
+  GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR }}

 jobs:
   connectors-check:
@@ -0,0 +1,48 @@
"""remove description from starter messages

Revision ID: b72ed7a5db0e
Revises: 33cb72ea4d80
Create Date: 2024-11-03 15:55:28.944408

"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "b72ed7a5db0e"
down_revision = "33cb72ea4d80"
branch_labels = None
depends_on = None


def upgrade() -> None:
    op.execute(
        sa.text(
            """
            UPDATE persona
            SET starter_messages = (
                SELECT jsonb_agg(elem - 'description')
                FROM jsonb_array_elements(starter_messages) elem
            )
            WHERE starter_messages IS NOT NULL
              AND jsonb_typeof(starter_messages) = 'array'
            """
        )
    )


def downgrade() -> None:
    op.execute(
        sa.text(
            """
            UPDATE persona
            SET starter_messages = (
                SELECT jsonb_agg(elem || '{"description": ""}')
                FROM jsonb_array_elements(starter_messages) elem
            )
            WHERE starter_messages IS NOT NULL
              AND jsonb_typeof(starter_messages) = 'array'
            """
        )
    )
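For a concrete picture of what the upgrade does, here is a small Python illustration (not part of the migration) of the `jsonb_agg(elem - 'description')` and `elem || '{"description": ""}'` transformations; the sample starter messages are invented:

```python
# Hypothetical starter_messages value before the migration.
starter_messages = [
    {"name": "General Information", "description": "Ask about available information", "message": "Hello!"},
    {"name": "Recent Updates", "description": "Inquire about latest additions", "message": "Hi!"},
]

# Equivalent of `jsonb_agg(elem - 'description')`: drop the key from every element.
upgraded = [{k: v for k, v in msg.items() if k != "description"} for msg in starter_messages]

# Equivalent of `elem || '{"description": ""}'` in the downgrade: add the key back, empty.
downgraded = [{**msg, "description": ""} for msg in upgraded]

print(upgraded)    # no 'description' keys remain
print(downgraded)  # 'description' restored as an empty string
```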
@@ -0,0 +1,29 @@
"""add recent assistants

Revision ID: c0fd6e4da83a
Revises: b72ed7a5db0e
Create Date: 2024-11-03 17:28:54.916618

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "c0fd6e4da83a"
down_revision = "b72ed7a5db0e"
branch_labels = None
depends_on = None


def upgrade() -> None:
    op.add_column(
        "user",
        sa.Column(
            "recent_assistants", postgresql.JSONB(), server_default="[]", nullable=False
        ),
    )


def downgrade() -> None:
    op.drop_column("user", "recent_assistants")
@@ -8,7 +8,7 @@ from passlib.hash import sha256_crypt
 from pydantic import BaseModel

 from danswer.auth.schemas import UserRole
-from ee.danswer.configs.app_configs import API_KEY_HASH_ROUNDS
+from danswer.configs.app_configs import API_KEY_HASH_ROUNDS


 _API_KEY_HEADER_NAME = "Authorization"
@@ -48,11 +48,11 @@ from httpx_oauth.integrations.fastapi import OAuth2AuthorizeCallback
 from httpx_oauth.oauth2 import BaseOAuth2
 from httpx_oauth.oauth2 import OAuth2Token
 from pydantic import BaseModel
-from sqlalchemy import select
 from sqlalchemy import text
 from sqlalchemy.orm import attributes
 from sqlalchemy.orm import Session

+from danswer.auth.api_key import get_hashed_api_key_from_request
 from danswer.auth.invited_users import get_invited_users
 from danswer.auth.schemas import UserCreate
 from danswer.auth.schemas import UserRole

@@ -75,6 +75,7 @@ from danswer.configs.constants import AuthType
 from danswer.configs.constants import DANSWER_API_KEY_DUMMY_EMAIL_DOMAIN
 from danswer.configs.constants import DANSWER_API_KEY_PREFIX
 from danswer.configs.constants import UNNAMED_KEY_PLACEHOLDER
+from danswer.db.api_key import fetch_user_for_api_key
 from danswer.db.auth import get_access_token_db
 from danswer.db.auth import get_default_admin_user_emails
 from danswer.db.auth import get_user_count

@@ -83,24 +84,27 @@ from danswer.db.auth import SQLAlchemyUserAdminDB
 from danswer.db.engine import get_async_session_with_tenant
 from danswer.db.engine import get_session
 from danswer.db.engine import get_session_with_tenant
-from danswer.db.engine import get_sqlalchemy_engine
 from danswer.db.models import AccessToken
 from danswer.db.models import OAuthAccount
 from danswer.db.models import User
-from danswer.db.models import UserTenantMapping
 from danswer.db.users import get_user_by_email
 from danswer.utils.logger import setup_logger
 from danswer.utils.telemetry import optional_telemetry
 from danswer.utils.telemetry import RecordType
+from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop
 from danswer.utils.variable_functionality import fetch_versioned_implementation
+from shared_configs.configs import async_return_default_schema
 from shared_configs.configs import MULTI_TENANT
 from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
 from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR


 logger = setup_logger()


+class BasicAuthenticationError(HTTPException):
+    def __init__(self, detail: str):
+        super().__init__(status_code=status.HTTP_403_FORBIDDEN, detail=detail)
+
+
 def is_user_admin(user: User | None) -> bool:
     if AUTH_TYPE == AuthType.DISABLED:
         return True

@@ -190,20 +194,6 @@ def verify_email_domain(email: str) -> None:
     )


-def get_tenant_id_for_email(email: str) -> str:
-    if not MULTI_TENANT:
-        return POSTGRES_DEFAULT_SCHEMA
-    # Implement logic to get tenant_id from the mapping table
-    with Session(get_sqlalchemy_engine()) as db_session:
-        result = db_session.execute(
-            select(UserTenantMapping.tenant_id).where(UserTenantMapping.email == email)
-        )
-        tenant_id = result.scalar_one_or_none()
-    if tenant_id is None:
-        raise exceptions.UserNotExists()
-    return tenant_id
-
-
 def send_user_verification_email(
     user_email: str,
     token: str,

@@ -238,19 +228,13 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
         safe: bool = False,
         request: Optional[Request] = None,
     ) -> User:
-        try:
-            tenant_id = (
-                get_tenant_id_for_email(user_create.email)
-                if MULTI_TENANT
-                else POSTGRES_DEFAULT_SCHEMA
-            )
-        except exceptions.UserNotExists:
-            raise HTTPException(status_code=401, detail="User not found")
-
-        if not tenant_id:
-            raise HTTPException(
-                status_code=401, detail="User does not belong to an organization"
-            )
+        tenant_id = await fetch_ee_implementation_or_noop(
+            "danswer.server.tenants.provisioning",
+            "get_or_create_tenant_id",
+            async_return_default_schema,
+        )(
+            email=user_create.email,
+        )

         async with get_async_session_with_tenant(tenant_id) as db_session:
             token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)

@@ -271,7 +255,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                     user_create.role = UserRole.ADMIN
                 else:
                     user_create.role = UserRole.BASIC
-        user = None
+
         try:
             user = await super().create(user_create, safe=safe, request=request)  # type: ignore
         except exceptions.UserAlreadyExists:

@@ -292,7 +276,9 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
             else:
                 raise exceptions.UserAlreadyExists()

-            CURRENT_TENANT_ID_CONTEXTVAR.reset(token)
+        finally:
+            CURRENT_TENANT_ID_CONTEXTVAR.reset(token)

         return user

     async def oauth_callback(

@@ -308,19 +294,18 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
         associate_by_email: bool = False,
         is_verified_by_default: bool = False,
     ) -> models.UOAP:
-        # Get tenant_id from mapping table
-        try:
-            tenant_id = (
-                get_tenant_id_for_email(account_email)
-                if MULTI_TENANT
-                else POSTGRES_DEFAULT_SCHEMA
-            )
-        except exceptions.UserNotExists:
-            raise HTTPException(status_code=401, detail="User not found")
+        tenant_id = await fetch_ee_implementation_or_noop(
+            "danswer.server.tenants.provisioning",
+            "get_or_create_tenant_id",
+            async_return_default_schema,
+        )(
+            email=account_email,
+        )

         if not tenant_id:
             raise HTTPException(status_code=401, detail="User not found")

         # Proceed with the tenant context
         token = None
         async with get_async_session_with_tenant(tenant_id) as db_session:
             token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)

@@ -371,9 +356,9 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                     # Explicitly set the Postgres schema for this session to ensure
                     # OAuth account creation happens in the correct tenant schema
                     await db_session.execute(text(f'SET search_path = "{tenant_id}"'))
-                    user = await self.user_db.add_oauth_account(
-                        user, oauth_account_dict
-                    )

+                    # Add OAuth account
+                    await self.user_db.add_oauth_account(user, oauth_account_dict)
                     await self.on_after_register(user, request)

                 else:

@@ -453,7 +438,13 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
         email = credentials.username

         # Get tenant_id from mapping table
-        tenant_id = get_tenant_id_for_email(email)
+        tenant_id = await fetch_ee_implementation_or_noop(
+            "danswer.server.tenants.provisioning",
+            "get_or_create_tenant_id",
+            async_return_default_schema,
+        )(
+            email=email,
+        )
         if not tenant_id:
             # User not found in mapping
             self.password_helper.hash(credentials.password)

@@ -477,8 +468,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
         has_web_login = attributes.get_attribute(user, "has_web_login")

         if not has_web_login:
-            raise HTTPException(
-                status_code=status.HTTP_403_FORBIDDEN,
+            raise BasicAuthenticationError(
                 detail="NO_WEB_LOGIN_AND_HAS_NO_PASSWORD",
             )

@@ -511,7 +501,14 @@ cookie_transport = CookieTransport(
 # This strategy is used to add tenant_id to the JWT token
 class TenantAwareJWTStrategy(JWTStrategy):
     async def _create_token_data(self, user: User, impersonate: bool = False) -> dict:
-        tenant_id = get_tenant_id_for_email(user.email)
+        tenant_id = await fetch_ee_implementation_or_noop(
+            "danswer.server.tenants.provisioning",
+            "get_or_create_tenant_id",
+            async_return_default_schema,
+        )(
+            email=user.email,
+        )

         data = {
             "sub": str(user.id),
             "aud": self.token_audience,

@@ -628,14 +625,12 @@ async def double_check_user(
         return None

     if user is None:
-        raise HTTPException(
-            status_code=status.HTTP_403_FORBIDDEN,
+        raise BasicAuthenticationError(
             detail="Access denied. User is not authenticated.",
         )

     if user_needs_to_be_verified() and not user.is_verified:
-        raise HTTPException(
-            status_code=status.HTTP_403_FORBIDDEN,
+        raise BasicAuthenticationError(
             detail="Access denied. User is not verified.",
         )

@@ -644,8 +639,7 @@ async def double_check_user(
         and user.oidc_expiry < datetime.now(timezone.utc)
         and not include_expired
     ):
-        raise HTTPException(
-            status_code=status.HTTP_403_FORBIDDEN,
+        raise BasicAuthenticationError(
             detail="Access denied. User's OIDC token has expired.",
         )

@@ -671,15 +665,13 @@ async def current_curator_or_admin_user(
         return None

     if not user or not hasattr(user, "role"):
-        raise HTTPException(
-            status_code=status.HTTP_403_FORBIDDEN,
+        raise BasicAuthenticationError(
             detail="Access denied. User is not authenticated or lacks role information.",
         )

     allowed_roles = {UserRole.GLOBAL_CURATOR, UserRole.CURATOR, UserRole.ADMIN}
     if user.role not in allowed_roles:
-        raise HTTPException(
-            status_code=status.HTTP_403_FORBIDDEN,
+        raise BasicAuthenticationError(
             detail="Access denied. User is not a curator or admin.",
         )

@@ -691,8 +683,7 @@ async def current_admin_user(user: User | None = Depends(current_user)) -> User
         return None

     if not user or not hasattr(user, "role") or user.role != UserRole.ADMIN:
-        raise HTTPException(
-            status_code=status.HTTP_403_FORBIDDEN,
+        raise BasicAuthenticationError(
             detail="Access denied. User must be an admin to perform this action.",
         )

@@ -885,3 +876,22 @@ def get_oauth_router(
         return redirect_response

     return router
+
+
+def api_key_dep(
+    request: Request, db_session: Session = Depends(get_session)
+) -> User | None:
+    if AUTH_TYPE == AuthType.DISABLED:
+        return None
+
+    hashed_api_key = get_hashed_api_key_from_request(request)
+    if not hashed_api_key:
+        raise HTTPException(status_code=401, detail="Missing API key")
+
+    if hashed_api_key:
+        user = fetch_user_for_api_key(hashed_api_key, db_session)
+
+    if user is None:
+        raise HTTPException(status_code=401, detail="Invalid API key")
+
+    return user
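Every call site above swaps the removed `get_tenant_id_for_email` lookup for `fetch_ee_implementation_or_noop(...)`, which resolves `get_or_create_tenant_id` from the Enterprise Edition provisioning module when it is available and otherwise falls back to returning the default schema. A minimal sketch of that dispatch pattern, assuming a simplified signature (the real helper in danswer.utils.variable_functionality may behave differently):

```python
import importlib
from typing import Any, Callable


def fetch_ee_implementation_or_noop_sketch(
    module: str, attribute: str, noop_return: Callable[..., Any]
) -> Callable[..., Any]:
    """Return the EE implementation if the module can be imported, else the fallback."""
    try:
        return getattr(importlib.import_module(module), attribute)
    except (ImportError, AttributeError):
        return noop_return


# Usage mirroring the diff: resolve a tenant id for an email, falling back to a
# default schema when no EE provisioning module is installed (names illustrative).
async def default_schema(**_: Any) -> str:
    return "public"


get_or_create_tenant_id = fetch_ee_implementation_or_noop_sketch(
    "danswer.server.tenants.provisioning", "get_or_create_tenant_id", default_schema
)
# tenant_id = await get_or_create_tenant_id(email=user_email)
```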
@@ -3,6 +3,7 @@ import multiprocessing
 import time
 from typing import Any

+import requests
 import sentry_sdk
 from celery import Task
 from celery.app import trace

@@ -11,11 +12,15 @@ from celery.states import READY_STATES
 from celery.utils.log import get_task_logger
 from celery.worker import strategy  # type: ignore
 from sentry_sdk.integrations.celery import CeleryIntegration
+from sqlalchemy import text
+from sqlalchemy.orm import Session

 from danswer.background.celery.apps.task_formatters import CeleryTaskColoredFormatter
 from danswer.background.celery.apps.task_formatters import CeleryTaskPlainFormatter
 from danswer.background.celery.celery_utils import celery_is_worker_primary
 from danswer.configs.constants import DanswerRedisLocks
+from danswer.db.engine import get_sqlalchemy_engine
+from danswer.document_index.vespa_constants import VESPA_CONFIG_SERVER_URL
 from danswer.redis.redis_connector import RedisConnector
 from danswer.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
 from danswer.redis.redis_connector_delete import RedisConnectorDelete

@@ -26,7 +31,6 @@ from danswer.redis.redis_usergroup import RedisUserGroup
 from danswer.utils.logger import ColoredFormatter
 from danswer.utils.logger import PlainFormatter
 from danswer.utils.logger import setup_logger
-from shared_configs.configs import MULTI_TENANT
 from shared_configs.configs import SENTRY_DSN

@@ -139,45 +143,136 @@ def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None

 def wait_for_redis(sender: Any, **kwargs: Any) -> None:
+    """Waits for redis to become ready subject to a hardcoded timeout.
+    Will raise WorkerShutdown to kill the celery worker if the timeout is reached."""
+
     r = get_redis_client(tenant_id=None)

     WAIT_INTERVAL = 5
     WAIT_LIMIT = 60

+    ready = False
     time_start = time.monotonic()
-    logger.info("Redis: Readiness check starting.")
+    logger.info("Redis: Readiness probe starting.")
     while True:
         try:
             if r.ping():
+                ready = True
                 break
         except Exception:
             pass

         time_elapsed = time.monotonic() - time_start
-        logger.info(
-            f"Redis: Ping failed. elapsed={time_elapsed:.1f} timeout={WAIT_LIMIT:.1f}"
-        )
         if time_elapsed > WAIT_LIMIT:
-            msg = (
-                f"Redis: Readiness check did not succeed within the timeout "
-                f"({WAIT_LIMIT} seconds). Exiting..."
-            )
-            logger.error(msg)
-            raise WorkerShutdown(msg)
+            break

+        logger.info(
+            f"Redis: Readiness probe ongoing. elapsed={time_elapsed:.1f} timeout={WAIT_LIMIT:.1f}"
+        )
+
         time.sleep(WAIT_INTERVAL)

-    logger.info("Redis: Readiness check succeeded. Continuing...")
+    if not ready:
+        msg = (
+            f"Redis: Readiness probe did not succeed within the timeout "
+            f"({WAIT_LIMIT} seconds). Exiting..."
+        )
+        logger.error(msg)
+        raise WorkerShutdown(msg)
+
+    logger.info("Redis: Readiness probe succeeded. Continuing...")
+    return
+
+
+def wait_for_db(sender: Any, **kwargs: Any) -> None:
+    """Waits for the db to become ready subject to a hardcoded timeout.
+    Will raise WorkerShutdown to kill the celery worker if the timeout is reached."""
+
+    WAIT_INTERVAL = 5
+    WAIT_LIMIT = 60
+
+    ready = False
+    time_start = time.monotonic()
+    logger.info("Database: Readiness probe starting.")
+    while True:
+        try:
+            with Session(get_sqlalchemy_engine()) as db_session:
+                result = db_session.execute(text("SELECT NOW()")).scalar()
+                if result:
+                    ready = True
+                    break
+        except Exception:
+            pass
+
+        time_elapsed = time.monotonic() - time_start
+        if time_elapsed > WAIT_LIMIT:
+            break
+
+        logger.info(
+            f"Database: Readiness probe ongoing. elapsed={time_elapsed:.1f} timeout={WAIT_LIMIT:.1f}"
+        )
+
+        time.sleep(WAIT_INTERVAL)
+
+    if not ready:
+        msg = (
+            f"Database: Readiness probe did not succeed within the timeout "
+            f"({WAIT_LIMIT} seconds). Exiting..."
+        )
+        logger.error(msg)
+        raise WorkerShutdown(msg)
+
+    logger.info("Database: Readiness probe succeeded. Continuing...")
+    return
+
+
+def wait_for_vespa(sender: Any, **kwargs: Any) -> None:
+    """Waits for Vespa to become ready subject to a hardcoded timeout.
+    Will raise WorkerShutdown to kill the celery worker if the timeout is reached."""
+
+    WAIT_INTERVAL = 5
+    WAIT_LIMIT = 60
+
+    ready = False
+    time_start = time.monotonic()
+    logger.info("Vespa: Readiness probe starting.")
+    while True:
+        try:
+            response = requests.get(f"{VESPA_CONFIG_SERVER_URL}/state/v1/health")
+            response.raise_for_status()
+
+            response_dict = response.json()
+            if response_dict["status"]["code"] == "up":
+                ready = True
+                break
+        except Exception:
+            pass
+
+        time_elapsed = time.monotonic() - time_start
+        if time_elapsed > WAIT_LIMIT:
+            break
+
+        logger.info(
+            f"Vespa: Readiness probe ongoing. elapsed={time_elapsed:.1f} timeout={WAIT_LIMIT:.1f}"
+        )
+
+        time.sleep(WAIT_INTERVAL)
+
+    if not ready:
+        msg = (
+            f"Vespa: Readiness probe did not succeed within the timeout "
+            f"({WAIT_LIMIT} seconds). Exiting..."
+        )
+        logger.error(msg)
+        raise WorkerShutdown(msg)
+
+    logger.info("Vespa: Readiness probe succeeded. Continuing...")
+    return
+
+
 def on_secondary_worker_init(sender: Any, **kwargs: Any) -> None:
     logger.info("Running as a secondary celery worker.")

-    # Exit early if multi-tenant since primary worker check not needed
-    if MULTI_TENANT:
-        return
-
     # Set up variables for waiting on primary worker
     WAIT_INTERVAL = 5
     WAIT_LIMIT = 60
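The three new probes share one structure: poll the dependency every WAIT_INTERVAL seconds, stop after WAIT_LIMIT seconds, and raise WorkerShutdown so the Celery worker exits rather than starting against an unavailable Redis, database, or Vespa. A condensed sketch of that shared loop (a hypothetical helper, not code from the diff):

```python
import time
from typing import Callable

from celery.exceptions import WorkerShutdown


def wait_for_dependency(name: str, is_ready: Callable[[], bool],
                        wait_interval: float = 5, wait_limit: float = 60) -> None:
    """Poll `is_ready` until it returns True or `wait_limit` seconds elapse."""
    start = time.monotonic()
    while True:
        try:
            if is_ready():
                return  # dependency is up; let the worker continue booting
        except Exception:
            pass  # treat connection errors the same as "not ready yet"

        if time.monotonic() - start > wait_limit:
            raise WorkerShutdown(f"{name}: not ready within {wait_limit}s. Exiting...")
        time.sleep(wait_interval)


# Example: the Redis probe in the diff is roughly wait_for_dependency("Redis", r.ping).
```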
@@ -12,6 +12,7 @@ from danswer.db.engine import get_all_tenant_ids
 from danswer.db.engine import SqlEngine
 from danswer.utils.logger import setup_logger
 from danswer.utils.variable_functionality import fetch_versioned_implementation
+from shared_configs.configs import MULTI_TENANT

 logger = setup_logger(__name__)

@@ -119,10 +120,10 @@ class DynamicTenantScheduler(PersistentScheduler):
         else:
             logger.info("Schedule is up to date, no changes needed")

-    except (AttributeError, KeyError) as e:
-        logger.exception(f"Failed to process task configuration: {str(e)}")
-    except Exception as e:
-        logger.exception(f"Unexpected error updating tenant tasks: {str(e)}")
+    except (AttributeError, KeyError):
+        logger.exception("Failed to process task configuration")
+    except Exception:
+        logger.exception("Unexpected error updating tenant tasks")

     def _should_update_schedule(
         self, current_schedule: dict, new_schedule: dict

@@ -143,6 +144,11 @@ def on_beat_init(sender: Any, **kwargs: Any) -> None:
     # Celery beat shouldn't touch the db at all. But just setting a low minimum here.
     SqlEngine.set_app_name(POSTGRES_CELERY_BEAT_APP_NAME)
     SqlEngine.init_engine(pool_size=2, max_overflow=0)
+
+    # Startup checks are not needed in multi-tenant case
+    if MULTI_TENANT:
+        return
+
     app_base.wait_for_redis(sender, **kwargs)
@@ -13,6 +13,7 @@ import danswer.background.celery.apps.app_base as app_base
 from danswer.configs.constants import POSTGRES_CELERY_WORKER_HEAVY_APP_NAME
 from danswer.db.engine import SqlEngine
 from danswer.utils.logger import setup_logger
+from shared_configs.configs import MULTI_TENANT


 logger = setup_logger()

@@ -60,7 +61,13 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_HEAVY_APP_NAME)
     SqlEngine.init_engine(pool_size=4, max_overflow=12)

+    # Startup checks are not needed in multi-tenant case
+    if MULTI_TENANT:
+        return
+
     app_base.wait_for_redis(sender, **kwargs)
+    app_base.wait_for_db(sender, **kwargs)
+    app_base.wait_for_vespa(sender, **kwargs)
     app_base.on_secondary_worker_init(sender, **kwargs)
@@ -13,6 +13,7 @@ import danswer.background.celery.apps.app_base as app_base
 from danswer.configs.constants import POSTGRES_CELERY_WORKER_INDEXING_APP_NAME
 from danswer.db.engine import SqlEngine
 from danswer.utils.logger import setup_logger
+from shared_configs.configs import MULTI_TENANT


 logger = setup_logger()

@@ -60,7 +61,13 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME)
     SqlEngine.init_engine(pool_size=8, max_overflow=0)

+    # Startup checks are not needed in multi-tenant case
+    if MULTI_TENANT:
+        return
+
     app_base.wait_for_redis(sender, **kwargs)
+    app_base.wait_for_db(sender, **kwargs)
+    app_base.wait_for_vespa(sender, **kwargs)
     app_base.on_secondary_worker_init(sender, **kwargs)
@@ -13,6 +13,7 @@ import danswer.background.celery.apps.app_base as app_base
 from danswer.configs.constants import POSTGRES_CELERY_WORKER_LIGHT_APP_NAME
 from danswer.db.engine import SqlEngine
 from danswer.utils.logger import setup_logger
+from shared_configs.configs import MULTI_TENANT


 logger = setup_logger()

@@ -59,8 +60,13 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:

     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_LIGHT_APP_NAME)
     SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8)
+    # Startup checks are not needed in multi-tenant case
+    if MULTI_TENANT:
+        return
+
     app_base.wait_for_redis(sender, **kwargs)
+    app_base.wait_for_db(sender, **kwargs)
+    app_base.wait_for_vespa(sender, **kwargs)
     app_base.on_secondary_worker_init(sender, **kwargs)
@@ -19,7 +19,13 @@ from danswer.configs.constants import DanswerRedisLocks
 from danswer.configs.constants import POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME
 from danswer.db.engine import SqlEngine
 from danswer.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
+from danswer.redis.redis_connector_delete import RedisConnectorDelete
+from danswer.redis.redis_connector_index import RedisConnectorIndex
+from danswer.redis.redis_connector_prune import RedisConnectorPrune
+from danswer.redis.redis_connector_stop import RedisConnectorStop
+from danswer.redis.redis_document_set import RedisDocumentSet
 from danswer.redis.redis_pool import get_redis_client
+from danswer.redis.redis_usergroup import RedisUserGroup
 from danswer.utils.logger import setup_logger
 from shared_configs.configs import MULTI_TENANT

@@ -69,13 +75,16 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME)
     SqlEngine.init_engine(pool_size=8, max_overflow=0)

-    app_base.wait_for_redis(sender, **kwargs)
-
-    logger.info("Running as the primary celery worker.")
-
+    # Startup checks are not needed in multi-tenant case
+    if MULTI_TENANT:
+        return
+
+    app_base.wait_for_redis(sender, **kwargs)
+    app_base.wait_for_db(sender, **kwargs)
+    app_base.wait_for_vespa(sender, **kwargs)
+
+    logger.info("Running as the primary celery worker.")
+
     # This is singleton work that should be done on startup exactly once
     # by the primary worker. This is unnecessary in the multi tenant scenario
     r = get_redis_client(tenant_id=None)

@@ -113,6 +122,18 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
     r.delete(RedisConnectorCredentialPair.get_taskset_key())
     r.delete(RedisConnectorCredentialPair.get_fence_key())

+    RedisDocumentSet.reset_all(r)
+
+    RedisUserGroup.reset_all(r)
+
+    RedisConnectorDelete.reset_all(r)
+
+    RedisConnectorPrune.reset_all(r)
+
+    RedisConnectorIndex.reset_all(r)
+
+    RedisConnectorStop.reset_all(r)
+

 @worker_ready.connect
 def on_worker_ready(sender: Any, **kwargs: Any) -> None:
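The block of reset_all calls added above clears leftover Redis coordination state (fences, task sets) for document sets, user groups, and connector delete/prune/index/stop tracking whenever the primary worker boots. A rough sketch of what such a reset helper can look like; the key prefix and scan-based deletion here are illustrative assumptions, not the repository's actual key layout:

```python
import redis


class RedisConnectorDeleteSketch:
    """Illustrative stand-in for one of the Redis* helper classes in the diff."""

    # Hypothetical namespace; the real classes define their own fence/taskset keys.
    PREFIX = "connectordeletion"

    @classmethod
    def reset_all(cls, r: redis.Redis) -> None:
        # Remove every key in this helper's namespace so a freshly started
        # primary worker does not act on stale fences or task sets.
        for key in r.scan_iter(f"{cls.PREFIX}*"):
            r.delete(key)
```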
@@ -4,6 +4,7 @@ from http import HTTPStatus
 from time import sleep

 import redis
+import sentry_sdk
 from celery import Celery
 from celery import shared_task
 from celery import Task

@@ -50,6 +51,7 @@ from danswer.utils.variable_functionality import global_version
 from shared_configs.configs import INDEXING_MODEL_SERVER_HOST
 from shared_configs.configs import INDEXING_MODEL_SERVER_PORT
 from shared_configs.configs import MULTI_TENANT
+from shared_configs.configs import SENTRY_DSN

 logger = setup_logger()

@@ -173,7 +175,9 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
                     )
                     if attempt_id:
                         task_logger.info(
-                            f"Indexing queued: cc_pair={cc_pair.id} index_attempt={attempt_id}"
+                            f"Indexing queued: index_attempt={attempt_id} "
+                            f"cc_pair={cc_pair.id} "
+                            f"search_settings={search_settings_instance.id} "
                         )
                         tasks_created += 1
         except SoftTimeLimitExceeded:

@@ -482,6 +486,18 @@ def connector_indexing_task(
     that the task transitioned to a "READY" state but the generator_complete_key doesn't exist.
     This will cause the primary worker to abort the indexing attempt and clean up.
     """
+
+    # Since connector_indexing_proxy_task spawns a new process using this function as
+    # the entrypoint, we init Sentry here.
+    if SENTRY_DSN:
+        sentry_sdk.init(
+            dsn=SENTRY_DSN,
+            traces_sample_rate=0.1,
+        )
+        logger.info("Sentry initialized")
+    else:
+        logger.debug("Sentry DSN not provided, skipping Sentry initialization")
+
     logger.info(
         f"Indexing spawned task starting: attempt={index_attempt_id} "
         f"tenant={tenant_id} "

@@ -489,7 +505,7 @@ def connector_indexing_task(
         f"search_settings={search_settings_id}"
     )

-    attempt = None
+    attempt_found = False
     n_final_progress: int | None = None

     redis_connector = RedisConnector(tenant_id, cc_pair_id)

@@ -529,6 +545,13 @@ def connector_indexing_task(
             sleep(1)
             continue

+        if payload.index_attempt_id != index_attempt_id:
+            raise ValueError(
+                f"connector_indexing_task - id mismatch. Task may be left over from previous run.: "
+                f"task_index_attempt={index_attempt_id} "
+                f"payload_index_attempt={payload.index_attempt_id}"
+            )
+
         logger.info(
             f"connector_indexing_task - Fence found, continuing...: fence={redis_connector_index.fence_key}"
         )

@@ -557,6 +580,7 @@ def connector_indexing_task(
             raise ValueError(
                 f"Index attempt not found: index_attempt={index_attempt_id}"
             )
+        attempt_found = True

         cc_pair = get_connector_credential_pair_from_id(
             cc_pair_id=cc_pair_id,

@@ -576,32 +600,32 @@ def connector_indexing_task(
             raise ValueError(
                 f"Credential not found: cc_pair={cc_pair_id} credential={cc_pair.credential_id}"
             )

         # define a callback class
         callback = RunIndexingCallback(
             redis_connector.stop.fence_key,
             redis_connector_index.generator_progress_key,
             lock,
             r,
         )

         logger.info(
             f"Indexing spawned task running entrypoint: attempt={index_attempt_id} "
             f"tenant={tenant_id} "
             f"cc_pair={cc_pair_id} "
             f"search_settings={search_settings_id}"
         )

         run_indexing_entrypoint(
             index_attempt_id,
             tenant_id,
             cc_pair_id,
             is_ee,
             callback=callback,
         )

         # get back the total number of indexed docs and return it
         n_final_progress = redis_connector_index.get_progress()
         redis_connector_index.set_generator_complete(HTTPStatus.OK.value)
     except Exception as e:
         logger.exception(
             f"Indexing spawned task failed: attempt={index_attempt_id} "

@@ -609,11 +633,10 @@ def connector_indexing_task(
             f"cc_pair={cc_pair_id} "
             f"search_settings={search_settings_id}"
         )
-        if attempt:
+        if attempt_found:
             with get_session_with_tenant(tenant_id) as db_session:
-                mark_attempt_failed(attempt, db_session, failure_reason=str(e))
+                mark_attempt_failed(index_attempt_id, db_session, failure_reason=str(e))

         redis_connector_index.reset()
         raise e
     finally:
         if lock.owned():

@@ -610,7 +610,7 @@ def monitor_ccpair_indexing_taskset(
             index_attempt = get_index_attempt(db_session, payload.index_attempt_id)
             if index_attempt:
                 mark_attempt_failed(
-                    index_attempt=index_attempt,
+                    index_attempt_id=payload.index_attempt_id,
                     db_session=db_session,
                     failure_reason="Connector indexing aborted or exceptioned.",
                 )

@@ -690,13 +690,18 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool:

     for a in attempts:
         # if attempts exist in the db but we don't detect them in redis, mark them as failed
-        failure_reason = f"Unknown index attempt {a.id}. Might be left over from a process restart."
-        if not r.exists(
-            RedisConnectorIndex.fence_key_with_ids(
-                a.connector_credential_pair_id, a.search_settings_id
-            )
-        ):
-            mark_attempt_failed(a, db_session, failure_reason=failure_reason)
+        fence_key = RedisConnectorIndex.fence_key_with_ids(
+            a.connector_credential_pair_id, a.search_settings_id
+        )
+        if not r.exists(fence_key):
+            failure_reason = (
+                f"Unknown index attempt. Might be left over from a process restart: "
+                f"index_attempt={a.id} "
+                f"cc_pair={a.connector_credential_pair_id} "
+                f"search_settings={a.search_settings_id}"
+            )
+            task_logger.warning(failure_reason)
+            mark_attempt_failed(a.id, db_session, failure_reason=failure_reason)

     lock_beat.reacquire()
     if r.exists(RedisConnectorCredentialPair.get_fence_key()):

@@ -337,7 +337,7 @@ def _run_indexing(
                 or index_attempt.status != IndexingStatus.IN_PROGRESS
             ):
                 mark_attempt_failed(
-                    index_attempt,
+                    index_attempt.id,
                     db_session,
                     failure_reason=str(e),
                     full_exception_trace=traceback.format_exc(),

@@ -372,7 +372,7 @@ def _run_indexing(
                 and index_attempt_md.num_exceptions >= batch_num
             ):
                 mark_attempt_failed(
-                    index_attempt,
+                    index_attempt.id,
                     db_session,
                     failure_reason="All batches exceptioned.",
                 )
backend/danswer/background/task_name_builders.py  (new file, 4 lines)

@@ -0,0 +1,4 @@
def name_sync_external_doc_permissions_task(
    cc_pair_id: int, tenant_id: str | None = None
) -> str:
    return f"sync_external_doc_permissions_task__{cc_pair_id}"
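The builder simply embeds the connector-credential pair id in a fixed task-name template; note that tenant_id is accepted but not used in the name. Example usage with an illustrative id:

```python
task_name = name_sync_external_doc_permissions_task(cc_pair_id=42, tenant_id=None)
print(task_name)  # -> sync_external_doc_permissions_task__42
```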
@@ -156,7 +156,7 @@ class QAResponse(SearchResponse, DanswerAnswer):
     error_msg: str | None = None


-class ImageGenerationDisplay(BaseModel):
+class FileChatDisplay(BaseModel):
     file_ids: list[str]


@@ -170,7 +170,7 @@ AnswerQuestionPossibleReturn = (
     | DanswerQuotes
     | CitationInfo
     | DanswerContexts
-    | ImageGenerationDisplay
+    | FileChatDisplay
     | CustomToolResponse
     | StreamingError
     | StreamStopInfo
@@ -42,18 +42,14 @@ personas:
|
||||
display_priority: 1
|
||||
is_visible: true
|
||||
starter_messages:
|
||||
- name: "General Information"
|
||||
description: "Ask about available information"
|
||||
message: "Hello! I'm interested in learning more about the information available here. Could you give me an overview of the types of data or documents that might be accessible?"
|
||||
- name: "Specific Topic Search"
|
||||
description: "Search for specific information"
|
||||
message: "Hi! I'd like to learn more about a specific topic. Could you help me find relevant documents and information?"
|
||||
- name: "Recent Updates"
|
||||
description: "Inquire about latest additions"
|
||||
message: "Hello! I'm curious about any recent updates or additions to the knowledge base. Can you tell me what new information has been added lately?"
|
||||
- name: "Cross-referencing Information"
|
||||
description: "Connect information from different sources"
|
||||
message: "Hi! I'm working on a project that requires connecting information from multiple sources. How can I effectively cross-reference data across different documents or categories?"
|
||||
- name: "Give me an overview of what's here"
|
||||
message: "Sample some documents and tell me what you find."
|
||||
- name: "Use AI to solve a work related problem"
|
||||
message: "Ask me what problem I would like to solve, then search the knowledge base to help me find a solution."
|
||||
- name: "Find updates on a topic of interest"
|
||||
message: "Once I provide a topic, retrieve related documents and tell me when there was last activity on the topic if available."
|
||||
- name: "Surface contradictions"
|
||||
message: "Have me choose a subject. Once I have provided it, check against the knowledge base and point out any inconsistencies. For all your following responses, focus on identifying contradictions."
|
||||
|
||||
- id: 1
|
||||
name: "General"
|
||||
@@ -71,18 +67,14 @@ personas:
|
||||
display_priority: 0
|
||||
is_visible: true
|
||||
starter_messages:
|
||||
- name: "Open Discussion"
|
||||
description: "Start an open-ended conversation"
|
||||
message: "Hi! Can you help me write a professional email?"
|
||||
- name: "Problem Solving"
|
||||
description: "Get help with a challenge"
|
||||
message: "Hello! I need help managing my daily tasks better. Do you have any simple tips?"
|
||||
- name: "Learn Something New"
|
||||
description: "Explore a new topic"
|
||||
message: "Hi! Could you explain what project management is in simple terms?"
|
||||
- name: "Creative Brainstorming"
|
||||
description: "Generate creative ideas"
|
||||
message: "Hello! I need to brainstorm some team building activities. Do you have any fun suggestions?"
|
||||
- name: "Summarize a document"
|
||||
message: "If I have provided a document please summarize it for me. If not, please ask me to upload a document either by dragging it into the input bar or clicking the +file icon."
|
||||
- name: "Help me with coding"
|
||||
message: 'Write me a "Hello World" script in 5 random languages to show off the functionality.'
|
||||
- name: "Draft a professional email"
|
||||
message: "Help me craft a professional email. Let's establish the context and the anticipated outcomes of the email before proposing a draft."
|
||||
- name: "Learn something new"
|
||||
message: "What is the difference between a Gantt chart, a Burndown chart and a Kanban board?"
|
||||
|
||||
- id: 2
|
||||
name: "Paraphrase"
|
||||
@@ -101,16 +93,12 @@ personas:
|
||||
is_visible: false
|
||||
starter_messages:
|
||||
- name: "Document Search"
|
||||
description: "Find exact information"
|
||||
message: "Hi! Could you help me find information about our team structure and reporting lines from our internal documents?"
|
||||
- name: "Process Verification"
|
||||
description: "Find exact quotes"
|
||||
message: "Hello! I need to understand our project approval process. Could you find the exact steps from our documentation?"
|
||||
- name: "Technical Documentation"
|
||||
description: "Search technical details"
|
||||
message: "Hi there! I'm looking for information about our deployment procedures. Can you find the specific steps from our technical guides?"
|
||||
- name: "Policy Reference"
|
||||
description: "Check official policies"
|
||||
message: "Hello! Could you help me find our official guidelines about client communication? I need the exact wording from our documentation."
|
||||
|
||||
- id: 3
|
||||
@@ -130,15 +118,11 @@ personas:
|
||||
display_priority: 3
|
||||
is_visible: true
|
||||
starter_messages:
|
||||
- name: "Landscape"
|
||||
description: "Generate a landscape image"
|
||||
message: "Create an image of a serene mountain lake at sunset, with snow-capped peaks reflected in the calm water and a small wooden cabin on the shore."
|
||||
- name: "Character"
|
||||
description: "Generate a character image"
|
||||
message: "Generate an image of a futuristic robot with glowing blue eyes, sleek metallic body, and intricate circuitry visible through transparent panels on its chest and arms."
|
||||
- name: "Abstract"
|
||||
description: "Create an abstract image"
|
||||
message: "Create an abstract image representing the concept of time, using swirling clock hands, fragmented hourglasses, and streaks of light to convey the passage of moments and eras."
|
||||
- name: "Urban Scene"
|
||||
description: "Generate an urban landscape"
|
||||
message: "Generate an image of a bustling futuristic cityscape at night, with towering skyscrapers, flying vehicles, holographic advertisements, and a mix of neon and bioluminescent lighting."
|
||||
- name: "Create visuals for a presentation"
|
||||
message: "Generate someone presenting a graph which clearly demonstrates an upwards trajectory."
|
||||
- name: "Find inspiration for a marketing campaign"
|
||||
message: "Generate an image of two happy individuals sipping on a soda drink in a glass bottle."
|
||||
- name: "Visualize a product design"
|
||||
message: "I want to add a search bar to my Iphone app. Generate me generic examples of how other apps implement this."
|
||||
- name: "Generate a humorous image response"
|
||||
message: "My teammate just made a silly mistake and I want to respond with a facepalm. Can you generate me one?"
|
||||
|
||||
@@ -11,8 +11,8 @@ from danswer.chat.models import AllCitations
|
||||
from danswer.chat.models import CitationInfo
|
||||
from danswer.chat.models import CustomToolResponse
|
||||
from danswer.chat.models import DanswerAnswerPiece
|
||||
from danswer.chat.models import FileChatDisplay
|
||||
from danswer.chat.models import FinalUsedContextDocsResponse
|
||||
from danswer.chat.models import ImageGenerationDisplay
|
||||
from danswer.chat.models import LLMRelevanceFilterResponse
|
||||
from danswer.chat.models import MessageResponseIDInfo
|
||||
from danswer.chat.models import MessageSpecificCitations
|
||||
@@ -275,7 +275,7 @@ ChatPacket = (
|
||||
| DanswerAnswerPiece
|
||||
| AllCitations
|
||||
| CitationInfo
|
||||
| ImageGenerationDisplay
|
||||
| FileChatDisplay
|
||||
| CustomToolResponse
|
||||
| MessageSpecificCitations
|
||||
| MessageResponseIDInfo
|
||||
@@ -769,7 +769,6 @@ def stream_chat_message_objects(
|
||||
yield LLMRelevanceFilterResponse(
|
||||
llm_selected_doc_indices=llm_indices
|
||||
)
|
||||
|
||||
elif packet.id == FINAL_CONTEXT_DOCUMENTS_ID:
|
||||
yield FinalUsedContextDocsResponse(
|
||||
final_context_docs=packet.response
|
||||
@@ -787,7 +786,7 @@ def stream_chat_message_objects(
|
||||
FileDescriptor(id=str(file_id), type=ChatFileType.IMAGE)
|
||||
for file_id in file_ids
|
||||
]
|
||||
yield ImageGenerationDisplay(
|
||||
yield FileChatDisplay(
|
||||
file_ids=[str(file_id) for file_id in file_ids]
|
||||
)
|
||||
elif packet.id == INTERNET_SEARCH_RESPONSE_ID:
|
||||
@@ -801,10 +800,30 @@ def stream_chat_message_objects(
|
||||
yield qa_docs_response
|
||||
elif packet.id == CUSTOM_TOOL_RESPONSE_ID:
|
||||
custom_tool_response = cast(CustomToolCallSummary, packet.response)
|
||||
yield CustomToolResponse(
|
||||
response=custom_tool_response.tool_result,
|
||||
tool_name=custom_tool_response.tool_name,
|
||||
)
|
||||
|
||||
if (
|
||||
custom_tool_response.response_type == "image"
|
||||
or custom_tool_response.response_type == "csv"
|
||||
):
|
||||
file_ids = custom_tool_response.tool_result.file_ids
|
||||
ai_message_files = [
|
||||
FileDescriptor(
|
||||
id=str(file_id),
|
||||
type=ChatFileType.IMAGE
|
||||
if custom_tool_response.response_type == "image"
|
||||
else ChatFileType.CSV,
|
||||
)
|
||||
for file_id in file_ids
|
||||
]
|
||||
yield FileChatDisplay(
|
||||
file_ids=[str(file_id) for file_id in file_ids]
|
||||
)
|
||||
else:
|
||||
yield CustomToolResponse(
|
||||
response=custom_tool_response.tool_result,
|
||||
tool_name=custom_tool_response.tool_name,
|
||||
)
|
||||
|
||||
elif isinstance(packet, StreamStopInfo):
|
||||
pass
|
||||
else:
|
||||
|
||||
@@ -163,6 +163,17 @@ try:
|
||||
except ValueError:
|
||||
POSTGRES_POOL_RECYCLE = POSTGRES_POOL_RECYCLE_DEFAULT
|
||||
|
||||
# Experimental setting to control idle transactions
|
||||
POSTGRES_IDLE_SESSIONS_TIMEOUT_DEFAULT = 0 # milliseconds
|
||||
try:
|
||||
POSTGRES_IDLE_SESSIONS_TIMEOUT = int(
|
||||
os.environ.get(
|
||||
"POSTGRES_IDLE_SESSIONS_TIMEOUT", POSTGRES_IDLE_SESSIONS_TIMEOUT_DEFAULT
|
||||
)
|
||||
)
|
||||
except ValueError:
|
||||
POSTGRES_IDLE_SESSIONS_TIMEOUT = POSTGRES_IDLE_SESSIONS_TIMEOUT_DEFAULT
|
||||
|
||||
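For context (added here, not part of the diff): one plausible way a setting like POSTGRES_IDLE_SESSIONS_TIMEOUT could be applied is as Postgres's idle_in_transaction_session_timeout, set on each new connection. The snippet below is only a sketch under that assumption; the connection URL is a placeholder.

from sqlalchemy import create_engine, event

POSTGRES_IDLE_SESSIONS_TIMEOUT = 60000  # example value in milliseconds; 0 would disable the timeout

engine = create_engine("postgresql+psycopg2://user:pass@localhost/danswer")  # placeholder URL

@event.listens_for(engine, "connect")
def _set_idle_session_timeout(dbapi_connection, connection_record):
    # Apply the timeout only when a non-zero value is configured
    if POSTGRES_IDLE_SESSIONS_TIMEOUT:
        cursor = dbapi_connection.cursor()
        cursor.execute(
            f"SET idle_in_transaction_session_timeout = {POSTGRES_IDLE_SESSIONS_TIMEOUT}"
        )
        cursor.close()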
REDIS_SSL = os.getenv("REDIS_SSL", "").lower() == "true"
|
||||
REDIS_HOST = os.environ.get("REDIS_HOST") or "localhost"
|
||||
REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379))
|
||||
@@ -482,3 +493,13 @@ JWT_ALGORITHM = "HS256"
|
||||
# Super Users
|
||||
SUPER_USERS = json.loads(os.environ.get("SUPER_USERS", '["pablo@danswer.ai"]'))
|
||||
SUPER_CLOUD_API_KEY = os.environ.get("SUPER_CLOUD_API_KEY", "api_key")
|
||||
|
||||
|
||||
#####
|
||||
# API Key Configs
|
||||
#####
|
||||
# refers to the rounds described here: https://passlib.readthedocs.io/en/stable/lib/passlib.hash.sha256_crypt.html
|
||||
_API_KEY_HASH_ROUNDS_RAW = os.environ.get("API_KEY_HASH_ROUNDS")
|
||||
API_KEY_HASH_ROUNDS = (
|
||||
int(_API_KEY_HASH_ROUNDS_RAW) if _API_KEY_HASH_ROUNDS_RAW else None
|
||||
)
|
||||
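A minimal sketch (an assumption, not taken from the commit) of how API_KEY_HASH_ROUNDS could feed passlib's sha256_crypt rounds parameter, per the docs linked above; the key string is a placeholder.

from passlib.hash import sha256_crypt

API_KEY_HASH_ROUNDS = 535000  # example; None falls back to passlib's default rounds

hasher = (
    sha256_crypt.using(rounds=API_KEY_HASH_ROUNDS) if API_KEY_HASH_ROUNDS else sha256_crypt
)
hashed = hasher.hash("my-api-key")
assert sha256_crypt.verify("my-api-key", hashed)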
|
||||
@@ -126,6 +126,7 @@ class DocumentSource(str, Enum):
|
||||
XENFORO = "xenforo"
|
||||
NOT_APPLICABLE = "not_applicable"
|
||||
FRESHDESK = "freshdesk"
|
||||
FIREFLIES = "fireflies"
|
||||
|
||||
|
||||
DocumentSourceRequiringTenantContext: list[DocumentSource] = [DocumentSource.FILE]
|
||||
@@ -225,6 +226,9 @@ class DanswerRedisLocks:
|
||||
PRUNING_LOCK_PREFIX = "da_lock:pruning"
|
||||
INDEXING_METADATA_PREFIX = "da_metadata:indexing"
|
||||
|
||||
SLACK_BOT_LOCK = "da_lock:slack_bot"
|
||||
SLACK_BOT_HEARTBEAT_PREFIX = "da_heartbeat:slack_bot"
|
||||
|
||||
|
||||
class DanswerCeleryPriority(int, Enum):
|
||||
HIGHEST = 0
|
||||
|
||||
@@ -16,6 +16,7 @@ from danswer.connectors.discourse.connector import DiscourseConnector
|
||||
from danswer.connectors.document360.connector import Document360Connector
|
||||
from danswer.connectors.dropbox.connector import DropboxConnector
|
||||
from danswer.connectors.file.connector import LocalFileConnector
|
||||
from danswer.connectors.fireflies.connector import FirefliesConnector
|
||||
from danswer.connectors.freshdesk.connector import FreshdeskConnector
|
||||
from danswer.connectors.github.connector import GithubConnector
|
||||
from danswer.connectors.gitlab.connector import GitlabConnector
|
||||
@@ -101,6 +102,7 @@ def identify_connector_class(
|
||||
DocumentSource.OCI_STORAGE: BlobStorageConnector,
|
||||
DocumentSource.XENFORO: XenforoConnector,
|
||||
DocumentSource.FRESHDESK: FreshdeskConnector,
|
||||
DocumentSource.FIREFLIES: FirefliesConnector,
|
||||
}
|
||||
connector_by_source = connector_map.get(source, {})
|
||||
|
||||
|
||||
@@ -123,9 +123,13 @@ def _process_file(
|
||||
"filename",
|
||||
"file_display_name",
|
||||
"title",
|
||||
"connector_type",
|
||||
]
|
||||
}
|
||||
|
||||
source_type_str = all_metadata.get("connector_type")
|
||||
source_type = DocumentSource(source_type_str) if source_type_str else None
|
||||
|
||||
p_owner_names = all_metadata.get("primary_owners")
|
||||
s_owner_names = all_metadata.get("secondary_owners")
|
||||
p_owners = (
|
||||
@@ -145,7 +149,7 @@ def _process_file(
|
||||
sections=[
|
||||
Section(link=all_metadata.get("link"), text=file_content_raw.strip())
|
||||
],
|
||||
source=DocumentSource.FILE,
|
||||
source=source_type or DocumentSource.FILE,
|
||||
semantic_identifier=file_display_name,
|
||||
title=title,
|
||||
doc_updated_at=final_time_updated,
|
||||
|
||||
0	backend/danswer/connectors/fireflies/__init__.py	Normal file
182	backend/danswer/connectors/fireflies/connector.py	Normal file
@@ -0,0 +1,182 @@
|
||||
from collections.abc import Iterator
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from typing import List
|
||||
|
||||
import requests
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.interfaces import PollConnector
|
||||
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from danswer.connectors.models import BasicExpertInfo
|
||||
from danswer.connectors.models import ConnectorMissingCredentialError
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
_FIREFLIES_ID_PREFIX = "FIREFLIES_"
|
||||
|
||||
_FIREFLIES_API_URL = "https://api.fireflies.ai/graphql"
|
||||
|
||||
_FIREFLIES_TRANSCRIPT_QUERY_SIZE = 50 # Max page size is 50
|
||||
|
||||
_FIREFLIES_API_QUERY = """
|
||||
query Transcripts($fromDate: DateTime, $toDate: DateTime, $limit: Int!, $skip: Int!) {
|
||||
transcripts(fromDate: $fromDate, toDate: $toDate, limit: $limit, skip: $skip) {
|
||||
id
|
||||
title
|
||||
host_email
|
||||
participants
|
||||
date
|
||||
transcript_url
|
||||
sentences {
|
||||
text
|
||||
speaker_name
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
def _create_doc_from_transcript(transcript: dict) -> Document | None:
|
||||
meeting_text = ""
|
||||
sentences = transcript.get("sentences", [])
|
||||
if sentences:
|
||||
for sentence in sentences:
|
||||
meeting_text += sentence.get("speaker_name") or "Unknown Speaker"
|
||||
meeting_text += ": " + sentence.get("text", "") + "\n\n"
|
||||
else:
|
||||
return None
|
||||
|
||||
meeting_link = transcript["transcript_url"]
|
||||
|
||||
fireflies_id = _FIREFLIES_ID_PREFIX + transcript["id"]
|
||||
|
||||
meeting_title = transcript["title"] or "No Title"
|
||||
|
||||
meeting_date_unix = transcript["date"]
|
||||
meeting_date = datetime.fromtimestamp(meeting_date_unix / 1000, tz=timezone.utc)
|
||||
|
||||
meeting_host_email = transcript["host_email"]
|
||||
host_email_user_info = [BasicExpertInfo(email=meeting_host_email)]
|
||||
|
||||
meeting_participants_email_list = []
|
||||
for participant in transcript.get("participants", []):
|
||||
if participant != meeting_host_email and participant:
|
||||
meeting_participants_email_list.append(BasicExpertInfo(email=participant))
|
||||
|
||||
return Document(
|
||||
id=fireflies_id,
|
||||
sections=[
|
||||
Section(
|
||||
link=meeting_link,
|
||||
text=meeting_text,
|
||||
)
|
||||
],
|
||||
source=DocumentSource.FIREFLIES,
|
||||
semantic_identifier=meeting_title,
|
||||
metadata={},
|
||||
doc_updated_at=meeting_date,
|
||||
primary_owners=host_email_user_info,
|
||||
secondary_owners=meeting_participants_email_list,
|
||||
)
|
||||
|
||||
|
||||
class FirefliesConnector(PollConnector, LoadConnector):
|
||||
def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None:
|
||||
self.batch_size = batch_size
|
||||
|
||||
def load_credentials(self, credentials: dict[str, str]) -> None:
|
||||
api_key = credentials.get("fireflies_api_key")
|
||||
|
||||
if not isinstance(api_key, str):
|
||||
raise ConnectorMissingCredentialError(
|
||||
"The Fireflies API key must be a string"
|
||||
)
|
||||
|
||||
self.api_key = api_key
|
||||
|
||||
return None
|
||||
|
||||
def _fetch_transcripts(
|
||||
self, start_datetime: str | None = None, end_datetime: str | None = None
|
||||
) -> Iterator[List[dict]]:
|
||||
if self.api_key is None:
|
||||
raise ConnectorMissingCredentialError("Missing API key")
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": "Bearer " + self.api_key,
|
||||
}
|
||||
|
||||
skip = 0
|
||||
variables: dict[str, int | str] = {
|
||||
"limit": _FIREFLIES_TRANSCRIPT_QUERY_SIZE,
|
||||
}
|
||||
|
||||
if start_datetime:
|
||||
variables["fromDate"] = start_datetime
|
||||
if end_datetime:
|
||||
variables["toDate"] = end_datetime
|
||||
|
||||
while True:
|
||||
variables["skip"] = skip
|
||||
response = requests.post(
|
||||
_FIREFLIES_API_URL,
|
||||
headers=headers,
|
||||
json={"query": _FIREFLIES_API_QUERY, "variables": variables},
|
||||
)
|
||||
|
||||
response.raise_for_status()
|
||||
|
||||
if response.status_code == 204:
|
||||
break
|
||||
|
||||
received_transcripts = response.json()
|
||||
parsed_transcripts = received_transcripts.get("data", {}).get(
|
||||
"transcripts", []
|
||||
)
|
||||
|
||||
yield parsed_transcripts
|
||||
|
||||
if len(parsed_transcripts) < _FIREFLIES_TRANSCRIPT_QUERY_SIZE:
|
||||
break
|
||||
|
||||
skip += _FIREFLIES_TRANSCRIPT_QUERY_SIZE
|
||||
|
||||
def _process_transcripts(
|
||||
self, start: str | None = None, end: str | None = None
|
||||
) -> GenerateDocumentsOutput:
|
||||
doc_batch: List[Document] = []
|
||||
|
||||
for transcript_batch in self._fetch_transcripts(start, end):
|
||||
for transcript in transcript_batch:
|
||||
if doc := _create_doc_from_transcript(transcript):
|
||||
doc_batch.append(doc)
|
||||
|
||||
if len(doc_batch) >= self.batch_size:
|
||||
yield doc_batch
|
||||
doc_batch = []
|
||||
|
||||
if doc_batch:
|
||||
yield doc_batch
|
||||
|
||||
def load_from_state(self) -> GenerateDocumentsOutput:
|
||||
return self._process_transcripts()
|
||||
|
||||
def poll_source(
|
||||
self, start_unixtime: SecondsSinceUnixEpoch, end_unixtime: SecondsSinceUnixEpoch
|
||||
) -> GenerateDocumentsOutput:
|
||||
start_datetime = datetime.fromtimestamp(
|
||||
start_unixtime, tz=timezone.utc
|
||||
).strftime("%Y-%m-%dT%H:%M:%S.000Z")
|
||||
end_datetime = datetime.fromtimestamp(end_unixtime, tz=timezone.utc).strftime(
|
||||
"%Y-%m-%dT%H:%M:%S.000Z"
|
||||
)
|
||||
|
||||
yield from self._process_transcripts(start_datetime, end_datetime)
|
||||
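A short usage sketch (added for illustration, not in the commit) of the new FirefliesConnector defined above; reading the API key from an environment variable is just an example.

import os
import time

from danswer.connectors.fireflies.connector import FirefliesConnector

connector = FirefliesConnector(batch_size=10)
connector.load_credentials({"fireflies_api_key": os.environ["FIREFLIES_API_KEY"]})

# Poll the last 7 days of transcripts
end = time.time()
start = end - 7 * 24 * 60 * 60
for doc_batch in connector.poll_source(start, end):
    for doc in doc_batch:
        print(doc.semantic_identifier, doc.doc_updated_at)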
@@ -1,283 +1,360 @@
|
||||
import re
|
||||
import time
|
||||
from base64 import urlsafe_b64decode
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
from typing import Dict
|
||||
|
||||
from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore
|
||||
from google.oauth2.service_account import Credentials as ServiceAccountCredentials # type: ignore
|
||||
from googleapiclient import discovery # type: ignore
|
||||
from googleapiclient.errors import HttpError # type: ignore
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
||||
from danswer.connectors.gmail.connector_auth import (
|
||||
get_gmail_creds_for_authorized_user,
|
||||
)
|
||||
from danswer.connectors.gmail.connector_auth import (
|
||||
get_gmail_creds_for_service_account,
|
||||
)
|
||||
from danswer.connectors.gmail.constants import (
|
||||
DB_CREDENTIALS_DICT_DELEGATED_USER_KEY,
|
||||
)
|
||||
from danswer.connectors.gmail.constants import DB_CREDENTIALS_DICT_TOKEN_KEY
|
||||
from danswer.connectors.gmail.constants import (
|
||||
GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY,
|
||||
from danswer.connectors.google_utils.google_auth import get_google_creds
|
||||
from danswer.connectors.google_utils.google_utils import execute_paginated_retrieval
|
||||
from danswer.connectors.google_utils.resources import get_admin_service
|
||||
from danswer.connectors.google_utils.resources import get_gmail_service
|
||||
from danswer.connectors.google_utils.shared_constants import (
|
||||
DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
|
||||
)
|
||||
from danswer.connectors.google_utils.shared_constants import MISSING_SCOPES_ERROR_STR
|
||||
from danswer.connectors.google_utils.shared_constants import ONYX_SCOPE_INSTRUCTIONS
|
||||
from danswer.connectors.google_utils.shared_constants import SLIM_BATCH_SIZE
|
||||
from danswer.connectors.google_utils.shared_constants import USER_FIELDS
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import GenerateSlimDocumentOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.interfaces import PollConnector
|
||||
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from danswer.connectors.interfaces import SlimConnector
|
||||
from danswer.connectors.models import BasicExpertInfo
|
||||
from danswer.connectors.models import Document
|
||||
from danswer.connectors.models import Section
|
||||
from danswer.connectors.models import SlimDocument
|
||||
from danswer.utils.logger import setup_logger
|
||||
from danswer.utils.retry_wrapper import retry_builder
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
# This is for the initial list call to get the thread ids
|
||||
THREAD_LIST_FIELDS = "nextPageToken, threads(id)"
|
||||
|
||||
def _execute_with_retry(request: Any) -> Any:
|
||||
max_attempts = 10
|
||||
attempt = 0
|
||||
# These are the fields to retrieve using the ID from the initial list call
|
||||
PARTS_FIELDS = "parts(body(data), mimeType)"
|
||||
PAYLOAD_FIELDS = f"payload(headers, {PARTS_FIELDS})"
|
||||
MESSAGES_FIELDS = f"messages(id, {PAYLOAD_FIELDS})"
|
||||
THREADS_FIELDS = f"threads(id, {MESSAGES_FIELDS})"
|
||||
THREAD_FIELDS = f"id, {MESSAGES_FIELDS}"
|
||||
|
||||
while attempt < max_attempts:
|
||||
# Note for reasons unknown, the Google API will sometimes return a 429
|
||||
# and even after waiting the retry period, it will return another 429.
|
||||
# It could be due to a few possibilities:
|
||||
# 1. Other things are also requesting from the Gmail API with the same key
|
||||
# 2. It's a rolling rate limit so the moment we get some amount of requests cleared, we hit it again very quickly
|
||||
# 3. The retry-after has a maximum and we've already hit the limit for the day
|
||||
# or it's something else...
|
||||
try:
|
||||
return request.execute()
|
||||
except HttpError as error:
|
||||
attempt += 1
|
||||
EMAIL_FIELDS = [
|
||||
"cc",
|
||||
"bcc",
|
||||
"from",
|
||||
"to",
|
||||
]
|
||||
|
||||
if error.resp.status == 429:
|
||||
# Attempt to get 'Retry-After' from headers
|
||||
retry_after = error.resp.get("Retry-After")
|
||||
if retry_after:
|
||||
sleep_time = int(retry_after)
|
||||
else:
|
||||
# Extract 'Retry after' timestamp from error message
|
||||
match = re.search(
|
||||
r"Retry after (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z)",
|
||||
str(error),
|
||||
add_retries = retry_builder(tries=50, max_delay=30)
|
||||
|
||||
|
||||
def _build_time_range_query(
|
||||
time_range_start: SecondsSinceUnixEpoch | None = None,
|
||||
time_range_end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> str | None:
|
||||
query = ""
|
||||
if time_range_start is not None and time_range_start != 0:
|
||||
query += f"after:{int(time_range_start)}"
|
||||
if time_range_end is not None and time_range_end != 0:
|
||||
query += f" before:{int(time_range_end)}"
|
||||
query = query.strip()
|
||||
|
||||
if len(query) == 0:
|
||||
return None
|
||||
|
||||
return query
|
||||
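For reference (not part of the diff), the Gmail search string produced by _build_time_range_query looks like the following; the epoch values are arbitrary examples.

assert _build_time_range_query(1690000000, 1700000000) == "after:1690000000 before:1700000000"
assert _build_time_range_query(None, None) is None  # no constraint, so callers can omit the q parameter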
|
||||
|
||||
def _clean_email_and_extract_name(email: str) -> tuple[str, str | None]:
|
||||
email = email.strip()
|
||||
if "<" in email and ">" in email:
|
||||
# Handle format: "Display Name <email@domain.com>"
|
||||
display_name = email[: email.find("<")].strip()
|
||||
email_address = email[email.find("<") + 1 : email.find(">")].strip()
|
||||
return email_address, display_name if display_name else None
|
||||
else:
|
||||
# Handle plain email address
|
||||
return email.strip(), None
|
||||
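A quick illustration (added, not from the source) of the two header formats _clean_email_and_extract_name handles.

assert _clean_email_and_extract_name("Jane Doe <jane@example.com>") == ("jane@example.com", "Jane Doe")
assert _clean_email_and_extract_name("jane@example.com") == ("jane@example.com", None)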
|
||||
|
||||
def _get_owners_from_emails(emails: dict[str, str | None]) -> list[BasicExpertInfo]:
|
||||
owners = []
|
||||
for email, names in emails.items():
|
||||
if names:
|
||||
name_parts = names.split(" ")
|
||||
first_name = " ".join(name_parts[:-1])
|
||||
last_name = name_parts[-1]
|
||||
else:
|
||||
first_name = None
|
||||
last_name = None
|
||||
owners.append(
|
||||
BasicExpertInfo(email=email, first_name=first_name, last_name=last_name)
|
||||
)
|
||||
return owners
|
||||
|
||||
|
||||
def _get_message_body(payload: dict[str, Any]) -> str:
|
||||
parts = payload.get("parts", [])
|
||||
message_body = ""
|
||||
for part in parts:
|
||||
mime_type = part.get("mimeType")
|
||||
body = part.get("body")
|
||||
if mime_type == "text/plain" and body:
|
||||
data = body.get("data", "")
|
||||
text = urlsafe_b64decode(data).decode()
|
||||
message_body += text
|
||||
return message_body
|
||||
|
||||
|
||||
def message_to_section(message: Dict[str, Any]) -> tuple[Section, dict[str, str]]:
|
||||
link = f"https://mail.google.com/mail/u/0/#inbox/{message['id']}"
|
||||
|
||||
payload = message.get("payload", {})
|
||||
headers = payload.get("headers", [])
|
||||
metadata: dict[str, Any] = {}
|
||||
for header in headers:
|
||||
name = header.get("name").lower()
|
||||
value = header.get("value")
|
||||
if name in EMAIL_FIELDS:
|
||||
metadata[name] = value
|
||||
if name == "subject":
|
||||
metadata["subject"] = value
|
||||
if name == "date":
|
||||
metadata["updated_at"] = value
|
||||
|
||||
if labels := message.get("labelIds"):
|
||||
metadata["labels"] = labels
|
||||
|
||||
message_data = ""
|
||||
for name, value in metadata.items():
|
||||
# updated_at isn't super useful for the LLM
|
||||
if name != "updated_at":
|
||||
message_data += f"{name}: {value}\n"
|
||||
|
||||
message_body_text: str = _get_message_body(payload)
|
||||
|
||||
return Section(link=link, text=message_body_text + message_data), metadata
|
||||
|
||||
|
||||
def thread_to_document(full_thread: Dict[str, Any]) -> Document | None:
|
||||
all_messages = full_thread.get("messages", [])
|
||||
if not all_messages:
|
||||
return None
|
||||
|
||||
sections = []
|
||||
semantic_identifier = ""
|
||||
updated_at = None
|
||||
from_emails: dict[str, str | None] = {}
|
||||
other_emails: dict[str, str | None] = {}
|
||||
for message in all_messages:
|
||||
section, message_metadata = message_to_section(message)
|
||||
sections.append(section)
|
||||
|
||||
for name, value in message_metadata.items():
|
||||
if name in EMAIL_FIELDS:
|
||||
email, display_name = _clean_email_and_extract_name(value)
|
||||
if name == "from":
|
||||
from_emails[email] = (
|
||||
display_name if not from_emails.get(email) else None
|
||||
)
|
||||
else:
|
||||
other_emails[email] = (
|
||||
display_name if not other_emails.get(email) else None
|
||||
)
|
||||
if match:
|
||||
retry_after_timestamp = match.group(1)
|
||||
retry_after_dt = datetime.strptime(
|
||||
retry_after_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ"
|
||||
).replace(tzinfo=timezone.utc)
|
||||
current_time = datetime.now(timezone.utc)
|
||||
sleep_time = max(
|
||||
int((retry_after_dt - current_time).total_seconds()),
|
||||
0,
|
||||
)
|
||||
else:
|
||||
logger.error(
|
||||
f"No Retry-After header or timestamp found in error message: {error}"
|
||||
)
|
||||
sleep_time = 60
|
||||
|
||||
sleep_time += 3 # Add a buffer to be safe
|
||||
# If we haven't set the semantic identifier yet, set it to the subject of the first message
|
||||
if not semantic_identifier:
|
||||
semantic_identifier = message_metadata.get("subject", "")
|
||||
|
||||
logger.info(
|
||||
f"Rate limit exceeded. Attempt {attempt}/{max_attempts}. Sleeping for {sleep_time} seconds."
|
||||
)
|
||||
time.sleep(sleep_time)
|
||||
if message_metadata.get("updated_at"):
|
||||
updated_at = message_metadata.get("updated_at")
|
||||
|
||||
else:
|
||||
raise
|
||||
updated_at_datetime = None
|
||||
if updated_at:
|
||||
updated_at_datetime = time_str_to_utc(updated_at)
|
||||
|
||||
# If we've exhausted all attempts
|
||||
raise Exception(f"Failed to execute request after {max_attempts} attempts")
|
||||
id = full_thread.get("id")
|
||||
if not id:
|
||||
raise ValueError("Thread ID is required")
|
||||
|
||||
primary_owners = _get_owners_from_emails(from_emails)
|
||||
secondary_owners = _get_owners_from_emails(other_emails)
|
||||
|
||||
return Document(
|
||||
id=id,
|
||||
semantic_identifier=semantic_identifier,
|
||||
sections=sections,
|
||||
source=DocumentSource.GMAIL,
|
||||
# This is used to perform permission sync
|
||||
primary_owners=primary_owners,
|
||||
secondary_owners=secondary_owners,
|
||||
doc_updated_at=updated_at_datetime,
|
||||
# Not adding emails to metadata because it's already in the sections
|
||||
metadata={},
|
||||
)
|
||||
|
||||
|
||||
class GmailConnector(LoadConnector, PollConnector):
|
||||
class GmailConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None:
|
||||
self.batch_size = batch_size
|
||||
self.creds: OAuthCredentials | ServiceAccountCredentials | None = None
|
||||
|
||||
self._creds: OAuthCredentials | ServiceAccountCredentials | None = None
|
||||
self._primary_admin_email: str | None = None
|
||||
|
||||
@property
|
||||
def primary_admin_email(self) -> str:
|
||||
if self._primary_admin_email is None:
|
||||
raise RuntimeError(
|
||||
"Primary admin email missing, "
|
||||
"should not call this property "
|
||||
"before calling load_credentials"
|
||||
)
|
||||
return self._primary_admin_email
|
||||
|
||||
@property
|
||||
def google_domain(self) -> str:
|
||||
if self._primary_admin_email is None:
|
||||
raise RuntimeError(
|
||||
"Primary admin email missing, "
|
||||
"should not call this property "
|
||||
"before calling load_credentials"
|
||||
)
|
||||
return self._primary_admin_email.split("@")[-1]
|
||||
|
||||
@property
|
||||
def creds(self) -> OAuthCredentials | ServiceAccountCredentials:
|
||||
if self._creds is None:
|
||||
raise RuntimeError(
|
||||
"Creds missing, "
|
||||
"should not call this property "
|
||||
"before calling load_credentials"
|
||||
)
|
||||
return self._creds
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None:
|
||||
"""Checks for two different types of credentials.
|
||||
(1) A credential which holds a token acquired via a user going through
|
||||
the Google OAuth flow.
|
||||
(2) A credential which holds a service account key JSON file, which
|
||||
can then be used to impersonate any user in the workspace.
|
||||
"""
|
||||
creds: OAuthCredentials | ServiceAccountCredentials | None = None
|
||||
new_creds_dict = None
|
||||
if DB_CREDENTIALS_DICT_TOKEN_KEY in credentials:
|
||||
access_token_json_str = cast(
|
||||
str, credentials[DB_CREDENTIALS_DICT_TOKEN_KEY]
|
||||
)
|
||||
creds = get_gmail_creds_for_authorized_user(
|
||||
token_json_str=access_token_json_str
|
||||
)
|
||||
primary_admin_email = credentials[DB_CREDENTIALS_PRIMARY_ADMIN_KEY]
|
||||
self._primary_admin_email = primary_admin_email
|
||||
|
||||
# tell caller to update token stored in DB if it has changed
|
||||
# (e.g. the token has been refreshed)
|
||||
new_creds_json_str = creds.to_json() if creds else ""
|
||||
if new_creds_json_str != access_token_json_str:
|
||||
new_creds_dict = {DB_CREDENTIALS_DICT_TOKEN_KEY: new_creds_json_str}
|
||||
|
||||
if GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY in credentials:
|
||||
service_account_key_json_str = credentials[
|
||||
GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY
|
||||
]
|
||||
creds = get_gmail_creds_for_service_account(
|
||||
service_account_key_json_str=service_account_key_json_str
|
||||
)
|
||||
|
||||
# "Impersonate" a user if one is specified
|
||||
delegated_user_email = cast(
|
||||
str | None, credentials.get(DB_CREDENTIALS_DICT_DELEGATED_USER_KEY)
|
||||
)
|
||||
if delegated_user_email:
|
||||
creds = creds.with_subject(delegated_user_email) if creds else None # type: ignore
|
||||
|
||||
if creds is None:
|
||||
raise PermissionError(
|
||||
"Unable to access Gmail - unknown credential structure."
|
||||
)
|
||||
|
||||
self.creds = creds
|
||||
self._creds, new_creds_dict = get_google_creds(
|
||||
credentials=credentials,
|
||||
source=DocumentSource.GMAIL,
|
||||
)
|
||||
return new_creds_dict
|
||||
|
||||
def _get_email_body(self, payload: dict[str, Any]) -> str:
|
||||
parts = payload.get("parts", [])
|
||||
email_body = ""
|
||||
for part in parts:
|
||||
mime_type = part.get("mimeType")
|
||||
body = part.get("body")
|
||||
if mime_type == "text/plain":
|
||||
data = body.get("data", "")
|
||||
text = urlsafe_b64decode(data).decode()
|
||||
email_body += text
|
||||
return email_body
|
||||
def _get_all_user_emails(self) -> list[str]:
|
||||
admin_service = get_admin_service(self.creds, self.primary_admin_email)
|
||||
emails = []
|
||||
for user in execute_paginated_retrieval(
|
||||
retrieval_function=admin_service.users().list,
|
||||
list_key="users",
|
||||
fields=USER_FIELDS,
|
||||
domain=self.google_domain,
|
||||
):
|
||||
if email := user.get("primaryEmail"):
|
||||
emails.append(email)
|
||||
return emails
|
||||
|
||||
def _email_to_document(self, full_email: Dict[str, Any]) -> Document:
|
||||
email_id = full_email["id"]
|
||||
payload = full_email["payload"]
|
||||
headers = payload.get("headers")
|
||||
labels = full_email.get("labelIds", [])
|
||||
metadata = {}
|
||||
if headers:
|
||||
for header in headers:
|
||||
name = header.get("name").lower()
|
||||
value = header.get("value")
|
||||
if name in ["from", "to", "subject", "date", "cc", "bcc"]:
|
||||
metadata[name] = value
|
||||
email_data = ""
|
||||
for name, value in metadata.items():
|
||||
email_data += f"{name}: {value}\n"
|
||||
metadata["labels"] = labels
|
||||
logger.debug(f"{email_data}")
|
||||
email_body_text: str = self._get_email_body(payload)
|
||||
date_str = metadata.get("date")
|
||||
email_updated_at = time_str_to_utc(date_str) if date_str else None
|
||||
link = f"https://mail.google.com/mail/u/0/#inbox/{email_id}"
|
||||
return Document(
|
||||
id=email_id,
|
||||
sections=[Section(link=link, text=email_data + email_body_text)],
|
||||
source=DocumentSource.GMAIL,
|
||||
title=metadata.get("subject"),
|
||||
semantic_identifier=metadata.get("subject", "Untitled Email"),
|
||||
doc_updated_at=email_updated_at,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _build_time_range_query(
|
||||
time_range_start: SecondsSinceUnixEpoch | None = None,
|
||||
time_range_end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> str | None:
|
||||
query = ""
|
||||
if time_range_start is not None and time_range_start != 0:
|
||||
query += f"after:{int(time_range_start)}"
|
||||
if time_range_end is not None and time_range_end != 0:
|
||||
query += f" before:{int(time_range_end)}"
|
||||
query = query.strip()
|
||||
|
||||
if len(query) == 0:
|
||||
return None
|
||||
|
||||
return query
|
||||
|
||||
def _fetch_mails_from_gmail(
|
||||
def _fetch_threads(
|
||||
self,
|
||||
time_range_start: SecondsSinceUnixEpoch | None = None,
|
||||
time_range_end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> GenerateDocumentsOutput:
|
||||
if self.creds is None:
|
||||
raise PermissionError("Not logged into Gmail")
|
||||
page_token = ""
|
||||
query = GmailConnector._build_time_range_query(time_range_start, time_range_end)
|
||||
service = discovery.build("gmail", "v1", credentials=self.creds)
|
||||
while page_token is not None:
|
||||
result = _execute_with_retry(
|
||||
service.users()
|
||||
.messages()
|
||||
.list(
|
||||
userId="me",
|
||||
pageToken=page_token,
|
||||
q=query,
|
||||
maxResults=self.batch_size,
|
||||
query = _build_time_range_query(time_range_start, time_range_end)
|
||||
doc_batch = []
|
||||
for user_email in self._get_all_user_emails():
|
||||
gmail_service = get_gmail_service(self.creds, user_email)
|
||||
for thread in execute_paginated_retrieval(
|
||||
retrieval_function=gmail_service.users().threads().list,
|
||||
list_key="threads",
|
||||
userId=user_email,
|
||||
fields=THREAD_LIST_FIELDS,
|
||||
q=query,
|
||||
):
|
||||
full_threads = execute_paginated_retrieval(
|
||||
retrieval_function=gmail_service.users().threads().get,
|
||||
list_key=None,
|
||||
userId=user_email,
|
||||
fields=THREAD_FIELDS,
|
||||
id=thread["id"],
|
||||
)
|
||||
)
|
||||
|
||||
page_token = result.get("nextPageToken")
|
||||
messages = result.get("messages", [])
|
||||
doc_batch = []
|
||||
for message in messages:
|
||||
message_id = message["id"]
|
||||
msg = _execute_with_retry(
|
||||
service.users()
|
||||
.messages()
|
||||
.get(userId="me", id=message_id, format="full")
|
||||
)
|
||||
doc = self._email_to_document(msg)
|
||||
# full_threads is an iterator containing a single thread
|
||||
# so we need to convert it to a list and grab the first element
|
||||
full_thread = list(full_threads)[0]
|
||||
doc = thread_to_document(full_thread)
|
||||
if doc is None:
|
||||
continue
|
||||
doc_batch.append(doc)
|
||||
if len(doc_batch) > 0:
|
||||
yield doc_batch
|
||||
if len(doc_batch) > self.batch_size:
|
||||
yield doc_batch
|
||||
doc_batch = []
|
||||
if doc_batch:
|
||||
yield doc_batch
|
||||
|
||||
def _fetch_slim_threads(
|
||||
self,
|
||||
time_range_start: SecondsSinceUnixEpoch | None = None,
|
||||
time_range_end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> GenerateSlimDocumentOutput:
|
||||
query = _build_time_range_query(time_range_start, time_range_end)
|
||||
doc_batch = []
|
||||
for user_email in self._get_all_user_emails():
|
||||
gmail_service = get_gmail_service(self.creds, user_email)
|
||||
for thread in execute_paginated_retrieval(
|
||||
retrieval_function=gmail_service.users().threads().list,
|
||||
list_key="threads",
|
||||
userId=user_email,
|
||||
fields=THREAD_LIST_FIELDS,
|
||||
q=query,
|
||||
):
|
||||
doc_batch.append(
|
||||
SlimDocument(
|
||||
id=thread["id"],
|
||||
perm_sync_data={"user_email": user_email},
|
||||
)
|
||||
)
|
||||
if len(doc_batch) > SLIM_BATCH_SIZE:
|
||||
yield doc_batch
|
||||
doc_batch = []
|
||||
if doc_batch:
|
||||
yield doc_batch
|
||||
|
||||
def load_from_state(self) -> GenerateDocumentsOutput:
|
||||
yield from self._fetch_mails_from_gmail()
|
||||
try:
|
||||
yield from self._fetch_threads()
|
||||
except Exception as e:
|
||||
if MISSING_SCOPES_ERROR_STR in str(e):
|
||||
raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e
|
||||
raise e
|
||||
|
||||
def poll_source(
|
||||
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
|
||||
) -> GenerateDocumentsOutput:
|
||||
yield from self._fetch_mails_from_gmail(start, end)
|
||||
try:
|
||||
yield from self._fetch_threads(start, end)
|
||||
except Exception as e:
|
||||
if MISSING_SCOPES_ERROR_STR in str(e):
|
||||
raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e
|
||||
raise e
|
||||
|
||||
def retrieve_all_slim_documents(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch | None = None,
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> GenerateSlimDocumentOutput:
|
||||
try:
|
||||
yield from self._fetch_slim_threads(start, end)
|
||||
except Exception as e:
|
||||
if MISSING_SCOPES_ERROR_STR in str(e):
|
||||
raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e
|
||||
raise e
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import json
|
||||
import os
|
||||
|
||||
service_account_json_path = os.environ.get("GOOGLE_SERVICE_ACCOUNT_KEY_JSON_PATH")
|
||||
if not service_account_json_path:
|
||||
raise ValueError(
|
||||
"Please set GOOGLE_SERVICE_ACCOUNT_KEY_JSON_PATH environment variable"
|
||||
)
|
||||
with open(service_account_json_path) as f:
|
||||
creds = json.load(f)
|
||||
|
||||
credentials_dict = {
|
||||
DB_CREDENTIALS_DICT_TOKEN_KEY: json.dumps(creds),
|
||||
}
|
||||
delegated_user = os.environ.get("GMAIL_DELEGATED_USER")
|
||||
if delegated_user:
|
||||
credentials_dict[DB_CREDENTIALS_DICT_DELEGATED_USER_KEY] = delegated_user
|
||||
|
||||
connector = GmailConnector()
|
||||
connector.load_credentials(
|
||||
json.loads(credentials_dict[DB_CREDENTIALS_DICT_TOKEN_KEY])
|
||||
)
|
||||
document_batch_generator = connector.load_from_state()
|
||||
for document_batch in document_batch_generator:
|
||||
print(document_batch)
|
||||
break
|
||||
pass
|
||||
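A brief sketch (not part of the commit) of how the new SlimConnector surface on GmailConnector could be driven once load_credentials has succeeded, as in the __main__ block above.

# connector is a GmailConnector whose load_credentials(...) has already been called
for slim_batch in connector.retrieve_all_slim_documents():
    thread_ids = [slim_doc.id for slim_doc in slim_batch]
    print(f"fetched {len(thread_ids)} thread ids for permission sync")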
|
||||
@@ -1,197 +0,0 @@
|
||||
import json
|
||||
from typing import cast
|
||||
from urllib.parse import parse_qs
|
||||
from urllib.parse import ParseResult
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from google.auth.transport.requests import Request # type: ignore
|
||||
from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore
|
||||
from google.oauth2.service_account import Credentials as ServiceAccountCredentials # type: ignore
|
||||
from google_auth_oauthlib.flow import InstalledAppFlow # type: ignore
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.configs.app_configs import WEB_DOMAIN
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.configs.constants import KV_CRED_KEY
|
||||
from danswer.configs.constants import KV_GMAIL_CRED_KEY
|
||||
from danswer.configs.constants import KV_GMAIL_SERVICE_ACCOUNT_KEY
|
||||
from danswer.connectors.gmail.constants import (
|
||||
DB_CREDENTIALS_DICT_DELEGATED_USER_KEY,
|
||||
)
|
||||
from danswer.connectors.gmail.constants import DB_CREDENTIALS_DICT_TOKEN_KEY
|
||||
from danswer.connectors.gmail.constants import (
|
||||
GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY,
|
||||
)
|
||||
from danswer.connectors.gmail.constants import SCOPES
|
||||
from danswer.db.credentials import update_credential_json
|
||||
from danswer.db.models import User
|
||||
from danswer.key_value_store.factory import get_kv_store
|
||||
from danswer.server.documents.models import CredentialBase
|
||||
from danswer.server.documents.models import GoogleAppCredentials
|
||||
from danswer.server.documents.models import GoogleServiceAccountKey
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def _build_frontend_gmail_redirect() -> str:
|
||||
return f"{WEB_DOMAIN}/admin/connectors/gmail/auth/callback"
|
||||
|
||||
|
||||
def get_gmail_creds_for_authorized_user(
|
||||
token_json_str: str,
|
||||
) -> OAuthCredentials | None:
|
||||
creds_json = json.loads(token_json_str)
|
||||
creds = OAuthCredentials.from_authorized_user_info(creds_json, SCOPES)
|
||||
if creds.valid:
|
||||
return creds
|
||||
|
||||
if creds.expired and creds.refresh_token:
|
||||
try:
|
||||
creds.refresh(Request())
|
||||
if creds.valid:
|
||||
logger.notice("Refreshed Gmail tokens.")
|
||||
return creds
|
||||
except Exception as e:
|
||||
logger.exception(f"Failed to refresh gmail access token due to: {e}")
|
||||
return None
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_gmail_creds_for_service_account(
|
||||
service_account_key_json_str: str,
|
||||
) -> ServiceAccountCredentials | None:
|
||||
service_account_key = json.loads(service_account_key_json_str)
|
||||
creds = ServiceAccountCredentials.from_service_account_info(
|
||||
service_account_key, scopes=SCOPES
|
||||
)
|
||||
if not creds.valid or not creds.expired:
|
||||
creds.refresh(Request())
|
||||
return creds if creds.valid else None
|
||||
|
||||
|
||||
def verify_csrf(credential_id: int, state: str) -> None:
|
||||
csrf = get_kv_store().load(KV_CRED_KEY.format(str(credential_id)))
|
||||
if csrf != state:
|
||||
raise PermissionError(
|
||||
"State from Gmail Connector callback does not match expected"
|
||||
)
|
||||
|
||||
|
||||
def get_gmail_auth_url(credential_id: int) -> str:
|
||||
creds_str = str(get_kv_store().load(KV_GMAIL_CRED_KEY))
|
||||
credential_json = json.loads(creds_str)
|
||||
flow = InstalledAppFlow.from_client_config(
|
||||
credential_json,
|
||||
scopes=SCOPES,
|
||||
redirect_uri=_build_frontend_gmail_redirect(),
|
||||
)
|
||||
auth_url, _ = flow.authorization_url(prompt="consent")
|
||||
|
||||
parsed_url = cast(ParseResult, urlparse(auth_url))
|
||||
params = parse_qs(parsed_url.query)
|
||||
|
||||
get_kv_store().store(
|
||||
KV_CRED_KEY.format(credential_id), params.get("state", [None])[0], encrypt=True
|
||||
) # type: ignore
|
||||
return str(auth_url)
|
||||
|
||||
|
||||
def get_auth_url(credential_id: int) -> str:
|
||||
creds_str = str(get_kv_store().load(KV_GMAIL_CRED_KEY))
|
||||
credential_json = json.loads(creds_str)
|
||||
flow = InstalledAppFlow.from_client_config(
|
||||
credential_json,
|
||||
scopes=SCOPES,
|
||||
redirect_uri=_build_frontend_gmail_redirect(),
|
||||
)
|
||||
auth_url, _ = flow.authorization_url(prompt="consent")
|
||||
|
||||
parsed_url = cast(ParseResult, urlparse(auth_url))
|
||||
params = parse_qs(parsed_url.query)
|
||||
|
||||
get_kv_store().store(
|
||||
KV_CRED_KEY.format(credential_id), params.get("state", [None])[0], encrypt=True
|
||||
) # type: ignore
|
||||
return str(auth_url)
|
||||
|
||||
|
||||
def update_gmail_credential_access_tokens(
|
||||
auth_code: str,
|
||||
credential_id: int,
|
||||
user: User,
|
||||
db_session: Session,
|
||||
) -> OAuthCredentials | None:
|
||||
app_credentials = get_google_app_gmail_cred()
|
||||
flow = InstalledAppFlow.from_client_config(
|
||||
app_credentials.model_dump(),
|
||||
scopes=SCOPES,
|
||||
redirect_uri=_build_frontend_gmail_redirect(),
|
||||
)
|
||||
flow.fetch_token(code=auth_code)
|
||||
creds = flow.credentials
|
||||
token_json_str = creds.to_json()
|
||||
new_creds_dict = {DB_CREDENTIALS_DICT_TOKEN_KEY: token_json_str}
|
||||
|
||||
if not update_credential_json(credential_id, new_creds_dict, user, db_session):
|
||||
return None
|
||||
return creds
|
||||
|
||||
|
||||
def build_service_account_creds(
|
||||
delegated_user_email: str | None = None,
|
||||
) -> CredentialBase:
|
||||
service_account_key = get_gmail_service_account_key()
|
||||
|
||||
credential_dict = {
|
||||
GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY: service_account_key.json(),
|
||||
}
|
||||
if delegated_user_email:
|
||||
credential_dict[DB_CREDENTIALS_DICT_DELEGATED_USER_KEY] = delegated_user_email
|
||||
|
||||
return CredentialBase(
|
||||
source=DocumentSource.GMAIL,
|
||||
credential_json=credential_dict,
|
||||
admin_public=True,
|
||||
)
|
||||
|
||||
|
||||
def get_google_app_gmail_cred() -> GoogleAppCredentials:
|
||||
creds_str = str(get_kv_store().load(KV_GMAIL_CRED_KEY))
|
||||
return GoogleAppCredentials(**json.loads(creds_str))
|
||||
|
||||
|
||||
def upsert_google_app_gmail_cred(app_credentials: GoogleAppCredentials) -> None:
|
||||
get_kv_store().store(KV_GMAIL_CRED_KEY, app_credentials.json(), encrypt=True)
|
||||
|
||||
|
||||
def delete_google_app_gmail_cred() -> None:
|
||||
get_kv_store().delete(KV_GMAIL_CRED_KEY)
|
||||
|
||||
|
||||
def get_gmail_service_account_key() -> GoogleServiceAccountKey:
|
||||
creds_str = str(get_kv_store().load(KV_GMAIL_SERVICE_ACCOUNT_KEY))
|
||||
return GoogleServiceAccountKey(**json.loads(creds_str))
|
||||
|
||||
|
||||
def upsert_gmail_service_account_key(
|
||||
service_account_key: GoogleServiceAccountKey,
|
||||
) -> None:
|
||||
get_kv_store().store(
|
||||
KV_GMAIL_SERVICE_ACCOUNT_KEY, service_account_key.json(), encrypt=True
|
||||
)
|
||||
|
||||
|
||||
def upsert_service_account_key(service_account_key: GoogleServiceAccountKey) -> None:
|
||||
get_kv_store().store(
|
||||
KV_GMAIL_SERVICE_ACCOUNT_KEY, service_account_key.json(), encrypt=True
|
||||
)
|
||||
|
||||
|
||||
def delete_gmail_service_account_key() -> None:
|
||||
get_kv_store().delete(KV_GMAIL_SERVICE_ACCOUNT_KEY)
|
||||
|
||||
|
||||
def delete_service_account_key() -> None:
|
||||
get_kv_store().delete(KV_GMAIL_SERVICE_ACCOUNT_KEY)
|
||||
@@ -1,4 +0,0 @@
|
||||
DB_CREDENTIALS_DICT_TOKEN_KEY = "gmail_tokens"
|
||||
GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY = "gmail_service_account_key"
|
||||
DB_CREDENTIALS_DICT_DELEGATED_USER_KEY = "gmail_delegated_user"
|
||||
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]
|
||||
@@ -1,39 +1,47 @@
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterator
|
||||
from concurrent.futures import as_completed
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from functools import partial
|
||||
from typing import Any
|
||||
|
||||
from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore
|
||||
from google.oauth2.service_account import Credentials as ServiceAccountCredentials # type: ignore
|
||||
from googleapiclient.discovery import build # type: ignore
|
||||
from googleapiclient.discovery import Resource # type: ignore
|
||||
|
||||
from danswer.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from danswer.connectors.google_drive.connector_auth import (
|
||||
DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
|
||||
)
|
||||
from danswer.connectors.google_drive.connector_auth import get_google_drive_creds
|
||||
from danswer.connectors.google_drive.constants import MISSING_SCOPES_ERROR_STR
|
||||
from danswer.connectors.google_drive.constants import ONYX_SCOPE_INSTRUCTIONS
|
||||
from danswer.connectors.google_drive.constants import SCOPE_DOC_URL
|
||||
from danswer.connectors.google_drive.constants import SLIM_BATCH_SIZE
|
||||
from danswer.connectors.google_drive.constants import USER_FIELDS
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.connectors.google_drive.doc_conversion import build_slim_document
|
||||
from danswer.connectors.google_drive.doc_conversion import (
|
||||
convert_drive_item_to_document,
|
||||
)
|
||||
from danswer.connectors.google_drive.file_retrieval import crawl_folders_for_files
|
||||
from danswer.connectors.google_drive.file_retrieval import get_files_in_my_drive
|
||||
from danswer.connectors.google_drive.file_retrieval import get_all_files_in_my_drive
|
||||
from danswer.connectors.google_drive.file_retrieval import get_files_in_shared_drive
|
||||
from danswer.connectors.google_drive.google_utils import execute_paginated_retrieval
|
||||
from danswer.connectors.google_drive.models import GoogleDriveFileType
|
||||
from danswer.connectors.google_utils.google_auth import get_google_creds
|
||||
from danswer.connectors.google_utils.google_utils import execute_paginated_retrieval
|
||||
from danswer.connectors.google_utils.resources import get_admin_service
|
||||
from danswer.connectors.google_utils.resources import get_drive_service
|
||||
from danswer.connectors.google_utils.resources import get_google_docs_service
|
||||
from danswer.connectors.google_utils.shared_constants import (
|
||||
DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
|
||||
)
|
||||
from danswer.connectors.google_utils.shared_constants import MISSING_SCOPES_ERROR_STR
|
||||
from danswer.connectors.google_utils.shared_constants import ONYX_SCOPE_INSTRUCTIONS
|
||||
from danswer.connectors.google_utils.shared_constants import SCOPE_DOC_URL
|
||||
from danswer.connectors.google_utils.shared_constants import SLIM_BATCH_SIZE
|
||||
from danswer.connectors.google_utils.shared_constants import USER_FIELDS
|
||||
from danswer.connectors.interfaces import GenerateDocumentsOutput
|
||||
from danswer.connectors.interfaces import GenerateSlimDocumentOutput
|
||||
from danswer.connectors.interfaces import LoadConnector
|
||||
from danswer.connectors.interfaces import PollConnector
|
||||
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from danswer.connectors.interfaces import SlimConnector
|
||||
from danswer.connectors.models import SlimDocument
|
||||
from danswer.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
# TODO: Improve this by using the batch utility: https://googleapis.github.io/google-api-python-client/docs/batch.html
|
||||
# All file retrievals could be batched and made at once
|
||||
|
||||
|
||||
def _extract_str_list_from_comma_str(string: str | None) -> list[str]:
|
||||
@@ -46,6 +54,34 @@ def _extract_ids_from_urls(urls: list[str]) -> list[str]:
|
||||
return [url.split("/")[-1] for url in urls]
|
||||
|
||||
|
||||
def _convert_single_file(
|
||||
creds: Any, primary_admin_email: str, file: dict[str, Any]
|
||||
) -> Any:
|
||||
user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email
|
||||
user_drive_service = get_drive_service(creds, user_email=user_email)
|
||||
docs_service = get_google_docs_service(creds, user_email=user_email)
|
||||
return convert_drive_item_to_document(
|
||||
file=file,
|
||||
drive_service=user_drive_service,
|
||||
docs_service=docs_service,
|
||||
)
|
||||
|
||||
|
||||
def _process_files_batch(
|
||||
files: list[GoogleDriveFileType], convert_func: Callable, batch_size: int
|
||||
) -> GenerateDocumentsOutput:
|
||||
doc_batch = []
|
||||
with ThreadPoolExecutor(max_workers=min(16, len(files))) as executor:
|
||||
for doc in executor.map(convert_func, files):
|
||||
if doc:
|
||||
doc_batch.append(doc)
|
||||
if len(doc_batch) >= batch_size:
|
||||
yield doc_batch
|
||||
doc_batch = []
|
||||
if doc_batch:
|
||||
yield doc_batch
|
||||
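For orientation (added here, not in the diff), _convert_single_file above is meant to be partially applied and handed to _process_files_batch, roughly as follows; files, creds, and primary_admin_email stand in for values produced elsewhere in the connector.

from functools import partial

convert_func = partial(_convert_single_file, creds, primary_admin_email)
for doc_batch in _process_files_batch(files, convert_func, batch_size=INDEX_BATCH_SIZE):
    print(f"converted {len(doc_batch)} files")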
|
||||
|
||||
class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
def __init__(
|
||||
self,
|
||||
@@ -95,156 +131,225 @@ class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
|
||||
self.include_shared_drives = include_shared_drives
|
||||
shared_drive_url_list = _extract_str_list_from_comma_str(shared_drive_urls)
|
||||
self.shared_drive_ids = _extract_ids_from_urls(shared_drive_url_list)
|
||||
self._requested_shared_drive_ids = set(
|
||||
_extract_ids_from_urls(shared_drive_url_list)
|
||||
)
|
||||
|
||||
self.include_my_drives = include_my_drives
|
||||
self.my_drive_emails = _extract_str_list_from_comma_str(my_drive_emails)
|
||||
self._requested_my_drive_emails = set(
|
||||
_extract_str_list_from_comma_str(my_drive_emails)
|
||||
)
|
||||
|
||||
shared_folder_url_list = _extract_str_list_from_comma_str(shared_folder_urls)
|
||||
self.shared_folder_ids = _extract_ids_from_urls(shared_folder_url_list)
|
||||
self._requested_folder_ids = set(_extract_ids_from_urls(shared_folder_url_list))
|
||||
|
||||
self.primary_admin_email: str | None = None
|
||||
self.google_domain: str | None = None
|
||||
self._primary_admin_email: str | None = None
|
||||
|
||||
self.creds: OAuthCredentials | ServiceAccountCredentials | None = None
|
||||
self._creds: OAuthCredentials | ServiceAccountCredentials | None = None
|
||||
|
||||
self._TRAVERSED_PARENT_IDS: set[str] = set()
|
||||
self._retrieved_ids: set[str] = set()
|
||||
|
||||
def _update_traversed_parent_ids(self, folder_id: str) -> None:
|
||||
self._TRAVERSED_PARENT_IDS.add(folder_id)
|
||||
@property
|
||||
def primary_admin_email(self) -> str:
|
||||
if self._primary_admin_email is None:
|
||||
raise RuntimeError(
|
||||
"Primary admin email missing, "
|
||||
"should not call this property "
|
||||
"before calling load_credentials"
|
||||
)
|
||||
return self._primary_admin_email
|
||||
|
||||
@property
|
||||
def google_domain(self) -> str:
|
||||
if self._primary_admin_email is None:
|
||||
raise RuntimeError(
|
||||
"Primary admin email missing, "
|
||||
"should not call this property "
|
||||
"before calling load_credentials"
|
||||
)
|
||||
return self._primary_admin_email.split("@")[-1]
|
||||
|
||||
@property
|
||||
def creds(self) -> OAuthCredentials | ServiceAccountCredentials:
|
||||
if self._creds is None:
|
||||
raise RuntimeError(
|
||||
"Creds missing, "
|
||||
"should not call this property "
|
||||
"before calling load_credentials"
|
||||
)
|
||||
return self._creds
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, str] | None:
|
||||
primary_admin_email = credentials[DB_CREDENTIALS_PRIMARY_ADMIN_KEY]
|
||||
self.google_domain = primary_admin_email.split("@")[1]
|
||||
self.primary_admin_email = primary_admin_email
|
||||
self._primary_admin_email = primary_admin_email
|
||||
|
||||
self.creds, new_creds_dict = get_google_drive_creds(credentials)
|
||||
self._creds, new_creds_dict = get_google_creds(
|
||||
credentials=credentials,
|
||||
source=DocumentSource.GOOGLE_DRIVE,
|
||||
)
|
||||
return new_creds_dict
|
||||
|
||||
def get_google_resource(
|
||||
self,
|
||||
service_name: str = "drive",
|
||||
service_version: str = "v3",
|
||||
user_email: str | None = None,
|
||||
) -> Resource:
|
||||
if isinstance(self.creds, ServiceAccountCredentials):
|
||||
creds = self.creds.with_subject(user_email or self.primary_admin_email)
|
||||
service = build(service_name, service_version, credentials=creds)
|
||||
elif isinstance(self.creds, OAuthCredentials):
|
||||
service = build(service_name, service_version, credentials=self.creds)
|
||||
else:
|
||||
raise PermissionError("No credentials found")
|
||||
def _update_traversed_parent_ids(self, folder_id: str) -> None:
|
||||
self._retrieved_ids.add(folder_id)
|
||||
|
||||
return service
|
||||
|
||||
def _get_all_user_emails(self) -> list[str]:
|
||||
admin_service = self.get_google_resource("admin", "directory_v1")
|
||||
def _get_all_user_emails(self, admins_only: bool) -> list[str]:
|
||||
admin_service = get_admin_service(
|
||||
creds=self.creds,
|
||||
user_email=self.primary_admin_email,
|
||||
)
|
||||
query = "isAdmin=true" if admins_only else "isAdmin=false"
|
||||
emails = []
|
||||
for user in execute_paginated_retrieval(
|
||||
retrieval_function=admin_service.users().list,
|
||||
list_key="users",
|
||||
fields=USER_FIELDS,
|
||||
domain=self.google_domain,
|
||||
query=query,
|
||||
):
|
||||
if email := user.get("primaryEmail"):
|
||||
emails.append(email)
|
||||
return emails
|
||||
|
||||
def _get_all_drive_ids(self) -> set[str]:
|
||||
primary_drive_service = get_drive_service(
|
||||
creds=self.creds,
|
||||
user_email=self.primary_admin_email,
|
||||
)
|
||||
all_drive_ids = set()
|
||||
for drive in execute_paginated_retrieval(
|
||||
retrieval_function=primary_drive_service.drives().list,
|
||||
list_key="drives",
|
||||
useDomainAdminAccess=True,
|
||||
fields="drives(id)",
|
||||
):
|
||||
all_drive_ids.add(drive["id"])
|
||||
return all_drive_ids
|
||||
|
||||
def _initialize_all_class_variables(self) -> None:
|
||||
# Get all user emails
|
||||
# Get admins first because they are more likely to have access to the most files
|
||||
user_emails = [self.primary_admin_email]
|
||||
for admins_only in [True, False]:
|
||||
for email in self._get_all_user_emails(admins_only=admins_only):
|
||||
if email not in user_emails:
|
||||
user_emails.append(email)
|
||||
self._all_org_emails = user_emails
|
||||
|
||||
self._all_drive_ids: set[str] = self._get_all_drive_ids()
|
||||
|
||||
# remove drive ids from the folder ids because they are queried differently
|
||||
self._requested_folder_ids -= self._all_drive_ids
|
||||
|
||||
# Remove drive_ids that are not in the all_drive_ids and check them as folders instead
|
||||
invalid_drive_ids = self._requested_shared_drive_ids - self._all_drive_ids
|
||||
if invalid_drive_ids:
|
||||
logger.warning(
|
||||
f"Some shared drive IDs were not found. IDs: {invalid_drive_ids}"
|
||||
)
|
||||
logger.warning("Checking for folder access instead...")
|
||||
self._requested_folder_ids.update(invalid_drive_ids)
|
||||
|
||||
if not self.include_shared_drives:
|
||||
self._requested_shared_drive_ids = set()
|
||||
elif not self._requested_shared_drive_ids:
|
||||
self._requested_shared_drive_ids = self._all_drive_ids
|
||||
|
||||
def _impersonate_user_for_retrieval(
|
||||
self,
|
||||
user_email: str,
|
||||
is_slim: bool,
|
||||
start: SecondsSinceUnixEpoch | None = None,
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> Iterator[GoogleDriveFileType]:
|
||||
drive_service = get_drive_service(self.creds, user_email)
|
||||
if self.include_my_drives and (
|
||||
not self._requested_my_drive_emails
|
||||
or user_email in self._requested_my_drive_emails
|
||||
):
|
||||
yield from get_all_files_in_my_drive(
|
||||
service=drive_service,
|
||||
update_traversed_ids_func=self._update_traversed_parent_ids,
|
||||
is_slim=is_slim,
|
||||
start=start,
|
||||
end=end,
|
||||
)
|
||||
|
||||
remaining_drive_ids = self._requested_shared_drive_ids - self._retrieved_ids
|
||||
for drive_id in remaining_drive_ids:
|
||||
yield from get_files_in_shared_drive(
|
||||
service=drive_service,
|
||||
drive_id=drive_id,
|
||||
is_slim=is_slim,
|
||||
update_traversed_ids_func=self._update_traversed_parent_ids,
|
||||
start=start,
|
||||
end=end,
|
||||
)
|
||||
|
||||
remaining_folders = self._requested_folder_ids - self._retrieved_ids
|
||||
for folder_id in remaining_folders:
|
||||
yield from crawl_folders_for_files(
|
||||
service=drive_service,
|
||||
parent_id=folder_id,
|
||||
traversed_parent_ids=self._retrieved_ids,
|
||||
update_traversed_ids_func=self._update_traversed_parent_ids,
|
||||
start=start,
|
||||
end=end,
|
||||
)
|
||||
|
||||
def _fetch_drive_items(
|
||||
self,
|
||||
is_slim: bool,
|
||||
start: SecondsSinceUnixEpoch | None = None,
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> Iterator[GoogleDriveFileType]:
|
||||
primary_drive_service = self.get_google_resource()
|
||||
self._initialize_all_class_variables()
|
||||
|
||||
if self.include_shared_drives:
|
||||
shared_drive_urls = self.shared_drive_ids
|
||||
if not shared_drive_urls:
|
||||
# if no parent ids are specified, get all shared drives using the admin account
|
||||
for drive in execute_paginated_retrieval(
|
||||
retrieval_function=primary_drive_service.drives().list,
|
||||
list_key="drives",
|
||||
useDomainAdminAccess=True,
|
||||
fields="drives(id)",
|
||||
):
|
||||
shared_drive_urls.append(drive["id"])
|
||||
# Process users in parallel using ThreadPoolExecutor
|
||||
with ThreadPoolExecutor(max_workers=10) as executor:
|
||||
future_to_email = {
|
||||
executor.submit(
|
||||
self._impersonate_user_for_retrieval, email, is_slim, start, end
|
||||
): email
|
||||
for email in self._all_org_emails
|
||||
}
|
||||
|
||||
# For each shared drive, retrieve all files
|
||||
for shared_drive_id in shared_drive_urls:
|
||||
for file in get_files_in_shared_drive(
|
||||
service=primary_drive_service,
|
||||
drive_id=shared_drive_id,
|
||||
is_slim=is_slim,
|
||||
cache_folders=bool(self.shared_folder_ids),
|
||||
update_traversed_ids_func=self._update_traversed_parent_ids,
|
||||
start=start,
|
||||
end=end,
|
||||
):
|
||||
yield file
|
||||
# Yield results as they complete
|
||||
for future in as_completed(future_to_email):
|
||||
yield from future.result()
|
||||
|
||||
if self.shared_folder_ids:
|
||||
# Crawl all the shared parent ids for files
|
||||
for folder_id in self.shared_folder_ids:
|
||||
yield from crawl_folders_for_files(
|
||||
service=primary_drive_service,
|
||||
parent_id=folder_id,
|
||||
personal_drive=False,
|
||||
traversed_parent_ids=self._TRAVERSED_PARENT_IDS,
|
||||
update_traversed_ids_func=self._update_traversed_parent_ids,
|
||||
start=start,
|
||||
end=end,
|
||||
)
|
||||
|
||||
all_user_emails = []
|
||||
# get all personal docs from each users' personal drive
|
||||
if self.include_my_drives:
|
||||
if isinstance(self.creds, ServiceAccountCredentials):
|
||||
all_user_emails = self.my_drive_emails or []
|
||||
|
||||
# If using service account and no emails specified, fetch all users
|
||||
if not all_user_emails:
|
||||
all_user_emails = self._get_all_user_emails()
|
||||
|
||||
elif self.primary_admin_email:
|
||||
# If using OAuth, only fetch the primary admin email
|
||||
all_user_emails = [self.primary_admin_email]
|
||||
|
||||
for email in all_user_emails:
|
||||
logger.info(f"Fetching personal files for user: {email}")
|
||||
user_drive_service = self.get_google_resource(user_email=email)
|
||||
|
||||
yield from get_files_in_my_drive(
|
||||
service=user_drive_service,
|
||||
email=email,
|
||||
is_slim=is_slim,
|
||||
start=start,
|
||||
end=end,
|
||||
)
|
||||
remaining_folders = self._requested_folder_ids - self._retrieved_ids
|
||||
if remaining_folders:
|
||||
logger.warning(
|
||||
f"Some folders/drives were not retrieved. IDs: {remaining_folders}"
|
||||
)
|
||||
|
||||
def _extract_docs_from_google_drive(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch | None = None,
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> GenerateDocumentsOutput:
|
||||
doc_batch = []
|
||||
for file in self._fetch_drive_items(
|
||||
is_slim=False,
|
||||
start=start,
|
||||
end=end,
|
||||
):
|
||||
user_email = file.get("owners", [{}])[0].get("emailAddress")
|
||||
service = self.get_google_resource(user_email=user_email)
|
||||
if doc := convert_drive_item_to_document(
|
||||
file=file,
|
||||
service=service,
|
||||
):
|
||||
doc_batch.append(doc)
|
||||
if len(doc_batch) >= self.batch_size:
|
||||
yield doc_batch
|
||||
doc_batch = []
|
||||
# Create a larger process pool for file conversion
|
||||
convert_func = partial(
|
||||
_convert_single_file, self.creds, self.primary_admin_email
|
||||
)
|
||||
|
||||
yield doc_batch
|
||||
# Process files in larger batches
|
||||
LARGE_BATCH_SIZE = self.batch_size * 4
|
||||
files_to_process = []
|
||||
# Gather the files into batches to be processed in parallel
|
||||
for file in self._fetch_drive_items(is_slim=False, start=start, end=end):
|
||||
files_to_process.append(file)
|
||||
if len(files_to_process) >= LARGE_BATCH_SIZE:
|
||||
yield from _process_files_batch(
|
||||
files_to_process, convert_func, self.batch_size
|
||||
)
|
||||
files_to_process = []
|
||||
|
||||
# Process any remaining files
|
||||
if files_to_process:
|
||||
yield from _process_files_batch(
|
||||
files_to_process, convert_func, self.batch_size
|
||||
)
|
||||
|
||||
def load_from_state(self) -> GenerateDocumentsOutput:
|
||||
try:
|
||||
@@ -275,18 +380,8 @@ class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
start=start,
|
||||
end=end,
|
||||
):
|
||||
slim_batch.append(
|
||||
SlimDocument(
|
||||
id=file["webViewLink"],
|
||||
perm_sync_data={
|
||||
"doc_id": file.get("id"),
|
||||
"permissions": file.get("permissions", []),
|
||||
"permission_ids": file.get("permissionIds", []),
|
||||
"name": file.get("name"),
|
||||
"owner_email": file.get("owners", [{}])[0].get("emailAddress"),
|
||||
},
|
||||
)
|
||||
)
|
||||
if doc := build_slim_document(file):
|
||||
slim_batch.append(doc)
|
||||
if len(slim_batch) >= SLIM_BATCH_SIZE:
|
||||
yield slim_batch
|
||||
slim_batch = []
|
||||
|
||||
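A minimal consumption sketch for the batched generators above (illustrative only: the connector class comes from this diff, but the constructor arguments, credential loading call, and the downstream index_documents helper are assumptions, not confirmed API):

# Hedged sketch: iterating the document batches yielded by load_from_state().
connector = GoogleDriveConnector(include_shared_drives=True, include_my_drives=True)  # assumed kwargs
connector.load_credentials(credentials_dict)  # hypothetical credential dict, shape shown later in this diff
for doc_batch in connector.load_from_state():
    # each yielded batch holds at most batch_size Document objects
    index_documents(doc_batch)  # hypothetical downstream indexing call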
@@ -1,236 +0,0 @@
import json
from typing import cast
from urllib.parse import parse_qs
from urllib.parse import ParseResult
from urllib.parse import urlparse

from google.auth.transport.requests import Request  # type: ignore
from google.oauth2.credentials import Credentials as OAuthCredentials  # type: ignore
from google.oauth2.service_account import Credentials as ServiceAccountCredentials  # type: ignore
from google_auth_oauthlib.flow import InstalledAppFlow  # type: ignore
from googleapiclient.discovery import build  # type: ignore
from sqlalchemy.orm import Session

from danswer.configs.app_configs import WEB_DOMAIN
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import KV_CRED_KEY
from danswer.configs.constants import KV_GOOGLE_DRIVE_CRED_KEY
from danswer.configs.constants import KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY
from danswer.connectors.google_drive.constants import MISSING_SCOPES_ERROR_STR
from danswer.connectors.google_drive.constants import ONYX_SCOPE_INSTRUCTIONS
from danswer.db.credentials import update_credential_json
from danswer.db.models import User
from danswer.key_value_store.factory import get_kv_store
from danswer.server.documents.models import CredentialBase
from danswer.server.documents.models import GoogleAppCredentials
from danswer.server.documents.models import GoogleServiceAccountKey
from danswer.utils.logger import setup_logger

logger = setup_logger()

GOOGLE_DRIVE_SCOPES = [
    "https://www.googleapis.com/auth/drive.readonly",
    "https://www.googleapis.com/auth/drive.metadata.readonly",
    "https://www.googleapis.com/auth/admin.directory.group.readonly",
    "https://www.googleapis.com/auth/admin.directory.user.readonly",
]
DB_CREDENTIALS_DICT_TOKEN_KEY = "google_drive_tokens"
DB_CREDENTIALS_PRIMARY_ADMIN_KEY = "google_drive_primary_admin"


def _build_frontend_google_drive_redirect() -> str:
    return f"{WEB_DOMAIN}/admin/connectors/google-drive/auth/callback"


def get_google_drive_creds_for_authorized_user(
    token_json_str: str, scopes: list[str]
) -> OAuthCredentials | None:
    creds_json = json.loads(token_json_str)
    creds = OAuthCredentials.from_authorized_user_info(creds_json, scopes)
    if creds.valid:
        return creds

    if creds.expired and creds.refresh_token:
        try:
            creds.refresh(Request())
            if creds.valid:
                logger.notice("Refreshed Google Drive tokens.")
                return creds
        except Exception as e:
            logger.exception(f"Failed to refresh google drive access token due to: {e}")
            return None

    return None


def get_google_drive_creds(
    credentials: dict[str, str], scopes: list[str] = GOOGLE_DRIVE_SCOPES
) -> tuple[ServiceAccountCredentials | OAuthCredentials, dict[str, str] | None]:
    """Checks for two different types of credentials.
    (1) A credential which holds a token acquired via a user going through
    the Google OAuth flow.
    (2) A credential which holds a service account key JSON file, which
    can then be used to impersonate any user in the workspace.
    """
    oauth_creds = None
    service_creds = None
    new_creds_dict = None
    if DB_CREDENTIALS_DICT_TOKEN_KEY in credentials:
        access_token_json_str = cast(str, credentials[DB_CREDENTIALS_DICT_TOKEN_KEY])
        oauth_creds = get_google_drive_creds_for_authorized_user(
            token_json_str=access_token_json_str, scopes=scopes
        )

        # tell caller to update token stored in DB if it has changed
        # (e.g. the token has been refreshed)
        new_creds_json_str = oauth_creds.to_json() if oauth_creds else ""
        if new_creds_json_str != access_token_json_str:
            new_creds_dict = {
                DB_CREDENTIALS_DICT_TOKEN_KEY: new_creds_json_str,
                DB_CREDENTIALS_PRIMARY_ADMIN_KEY: credentials[
                    DB_CREDENTIALS_PRIMARY_ADMIN_KEY
                ],
            }

    elif KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY in credentials:
        service_account_key_json_str = credentials[KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY]
        service_account_key = json.loads(service_account_key_json_str)

        service_creds = ServiceAccountCredentials.from_service_account_info(
            service_account_key, scopes=scopes
        )

        if not service_creds.valid or not service_creds.expired:
            service_creds.refresh(Request())

        if not service_creds.valid:
            raise PermissionError(
                "Unable to access Google Drive - service account credentials are invalid."
            )

    creds: ServiceAccountCredentials | OAuthCredentials | None = (
        oauth_creds or service_creds
    )
    if creds is None:
        raise PermissionError(
            "Unable to access Google Drive - unknown credential structure."
        )

    return creds, new_creds_dict


def verify_csrf(credential_id: int, state: str) -> None:
    csrf = get_kv_store().load(KV_CRED_KEY.format(str(credential_id)))
    if csrf != state:
        raise PermissionError(
            "State from Google Drive Connector callback does not match expected"
        )


def get_auth_url(credential_id: int) -> str:
    creds_str = str(get_kv_store().load(KV_GOOGLE_DRIVE_CRED_KEY))
    credential_json = json.loads(creds_str)
    flow = InstalledAppFlow.from_client_config(
        credential_json,
        scopes=GOOGLE_DRIVE_SCOPES,
        redirect_uri=_build_frontend_google_drive_redirect(),
    )
    auth_url, _ = flow.authorization_url(prompt="consent")

    parsed_url = cast(ParseResult, urlparse(auth_url))
    params = parse_qs(parsed_url.query)

    get_kv_store().store(
        KV_CRED_KEY.format(credential_id), params.get("state", [None])[0], encrypt=True
    )  # type: ignore
    return str(auth_url)


def update_credential_access_tokens(
    auth_code: str,
    credential_id: int,
    user: User,
    db_session: Session,
) -> OAuthCredentials | None:
    app_credentials = get_google_app_cred()
    flow = InstalledAppFlow.from_client_config(
        app_credentials.model_dump(),
        scopes=GOOGLE_DRIVE_SCOPES,
        redirect_uri=_build_frontend_google_drive_redirect(),
    )
    flow.fetch_token(code=auth_code)
    creds = flow.credentials
    token_json_str = creds.to_json()

    # Get user email from Google API so we know who
    # the primary admin is for this connector
    try:
        admin_service = build("drive", "v3", credentials=creds)
        user_info = (
            admin_service.about()
            .get(
                fields="user(emailAddress)",
            )
            .execute()
        )
        email = user_info.get("user", {}).get("emailAddress")
    except Exception as e:
        if MISSING_SCOPES_ERROR_STR in str(e):
            raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e
        raise e

    new_creds_dict = {
        DB_CREDENTIALS_DICT_TOKEN_KEY: token_json_str,
        DB_CREDENTIALS_PRIMARY_ADMIN_KEY: email,
    }

    if not update_credential_json(credential_id, new_creds_dict, user, db_session):
        return None
    return creds


def build_service_account_creds(
    source: DocumentSource,
    primary_admin_email: str | None = None,
) -> CredentialBase:
    service_account_key = get_service_account_key()

    credential_dict = {
        KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY: service_account_key.json(),
    }
    if primary_admin_email:
        credential_dict[DB_CREDENTIALS_PRIMARY_ADMIN_KEY] = primary_admin_email

    return CredentialBase(
        credential_json=credential_dict,
        admin_public=True,
        source=DocumentSource.GOOGLE_DRIVE,
    )


def get_google_app_cred() -> GoogleAppCredentials:
    creds_str = str(get_kv_store().load(KV_GOOGLE_DRIVE_CRED_KEY))
    return GoogleAppCredentials(**json.loads(creds_str))


def upsert_google_app_cred(app_credentials: GoogleAppCredentials) -> None:
    get_kv_store().store(KV_GOOGLE_DRIVE_CRED_KEY, app_credentials.json(), encrypt=True)


def delete_google_app_cred() -> None:
    get_kv_store().delete(KV_GOOGLE_DRIVE_CRED_KEY)


def get_service_account_key() -> GoogleServiceAccountKey:
    creds_str = str(get_kv_store().load(KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY))
    return GoogleServiceAccountKey(**json.loads(creds_str))


def upsert_service_account_key(service_account_key: GoogleServiceAccountKey) -> None:
    get_kv_store().store(
        KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY, service_account_key.json(), encrypt=True
    )


def delete_service_account_key() -> None:
    get_kv_store().delete(KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY)
@@ -2,35 +2,3 @@ UNSUPPORTED_FILE_TYPE_CONTENT = ""  # keep empty for now
DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder"
DRIVE_SHORTCUT_TYPE = "application/vnd.google-apps.shortcut"
DRIVE_FILE_TYPE = "application/vnd.google-apps.file"

FILE_FIELDS = (
    "nextPageToken, files(mimeType, id, name, permissions, modifiedTime, webViewLink, "
    "shortcutDetails, owners(emailAddress))"
)
SLIM_FILE_FIELDS = (
    "nextPageToken, files(mimeType, id, name, permissions(emailAddress, type), "
    "permissionIds, webViewLink, owners(emailAddress))"
)
FOLDER_FIELDS = "nextPageToken, files(id, name, permissions, modifiedTime, webViewLink, shortcutDetails)"
USER_FIELDS = "nextPageToken, users(primaryEmail)"

# these errors don't represent a failure in the connector, but simply files
# that can't / shouldn't be indexed
ERRORS_TO_CONTINUE_ON = [
    "cannotExportFile",
    "exportSizeLimitExceeded",
    "cannotDownloadFile",
]

# Error message substrings
MISSING_SCOPES_ERROR_STR = "client not authorized for any of the scopes requested"

# Documentation and error messages
SCOPE_DOC_URL = "https://docs.danswer.dev/connectors/google_drive/overview"
ONYX_SCOPE_INSTRUCTIONS = (
    "You have upgraded Danswer without updating the Google Drive scopes. "
    f"Please refer to the documentation to learn how to update the scopes: {SCOPE_DOC_URL}"
)

# Batch sizes
SLIM_BATCH_SIZE = 500
@@ -2,19 +2,22 @@ import io
from datetime import datetime
from datetime import timezone

from googleapiclient.discovery import Resource  # type: ignore
from googleapiclient.errors import HttpError  # type: ignore

from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import IGNORE_FOR_QA
from danswer.connectors.google_drive.constants import DRIVE_FOLDER_TYPE
from danswer.connectors.google_drive.constants import DRIVE_SHORTCUT_TYPE
from danswer.connectors.google_drive.constants import ERRORS_TO_CONTINUE_ON
from danswer.connectors.google_drive.constants import UNSUPPORTED_FILE_TYPE_CONTENT
from danswer.connectors.google_drive.models import GDriveMimeType
from danswer.connectors.google_drive.models import GoogleDriveFileType
from danswer.connectors.google_drive.section_extraction import get_document_sections
from danswer.connectors.google_utils.resources import GoogleDocsService
from danswer.connectors.google_utils.resources import GoogleDriveService
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.connectors.models import SlimDocument
from danswer.file_processing.extract_file_text import docx_to_text
from danswer.file_processing.extract_file_text import pptx_to_text
from danswer.file_processing.extract_file_text import read_pdf_file
@@ -25,86 +28,148 @@ from danswer.utils.logger import setup_logger
logger = setup_logger()


def _extract_text(file: dict[str, str], service: Resource) -> str:
# these errors don't represent a failure in the connector, but simply files
# that can't / shouldn't be indexed
ERRORS_TO_CONTINUE_ON = [
    "cannotExportFile",
    "exportSizeLimitExceeded",
    "cannotDownloadFile",
]


def _extract_sections_basic(
    file: dict[str, str], service: GoogleDriveService
) -> list[Section]:
    mime_type = file["mimeType"]
    link = file["webViewLink"]

    if mime_type not in set(item.value for item in GDriveMimeType):
        # Unsupported file types can still have a title, finding this way is still useful
        return UNSUPPORTED_FILE_TYPE_CONTENT
        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]

    if mime_type in [
        GDriveMimeType.DOC.value,
        GDriveMimeType.PPT.value,
        GDriveMimeType.SPREADSHEET.value,
    ]:
        export_mime_type = (
            "text/plain"
            if mime_type != GDriveMimeType.SPREADSHEET.value
            else "text/csv"
        )
        return (
            service.files()
            .export(fileId=file["id"], mimeType=export_mime_type)
            .execute()
            .decode("utf-8")
        )
    elif mime_type in [
        GDriveMimeType.PLAIN_TEXT.value,
        GDriveMimeType.MARKDOWN.value,
    ]:
        return service.files().get_media(fileId=file["id"]).execute().decode("utf-8")
    if mime_type in [
        GDriveMimeType.WORD_DOC.value,
        GDriveMimeType.POWERPOINT.value,
        GDriveMimeType.PDF.value,
    ]:
        response = service.files().get_media(fileId=file["id"]).execute()
        if get_unstructured_api_key():
            return unstructured_to_text(
                file=io.BytesIO(response), file_name=file.get("name", file["id"])
    try:
        if mime_type in [
            GDriveMimeType.DOC.value,
            GDriveMimeType.PPT.value,
            GDriveMimeType.SPREADSHEET.value,
        ]:
            export_mime_type = (
                "text/plain"
                if mime_type != GDriveMimeType.SPREADSHEET.value
                else "text/csv"
            )
            text = (
                service.files()
                .export(fileId=file["id"], mimeType=export_mime_type)
                .execute()
                .decode("utf-8")
            )
            return [Section(link=link, text=text)]
        elif mime_type in [
            GDriveMimeType.PLAIN_TEXT.value,
            GDriveMimeType.MARKDOWN.value,
        ]:
            return [
                Section(
                    link=link,
                    text=service.files()
                    .get_media(fileId=file["id"])
                    .execute()
                    .decode("utf-8"),
                )
            ]
        if mime_type in [
            GDriveMimeType.WORD_DOC.value,
            GDriveMimeType.POWERPOINT.value,
            GDriveMimeType.PDF.value,
        ]:
            response = service.files().get_media(fileId=file["id"]).execute()
            if get_unstructured_api_key():
                return [
                    Section(
                        link=link,
                        text=unstructured_to_text(
                            file=io.BytesIO(response),
                            file_name=file.get("name", file["id"]),
                        ),
                    )
                ]

            if mime_type == GDriveMimeType.WORD_DOC.value:
                return docx_to_text(file=io.BytesIO(response))
            elif mime_type == GDriveMimeType.PDF.value:
                text, _ = read_pdf_file(file=io.BytesIO(response))
                return text
            elif mime_type == GDriveMimeType.POWERPOINT.value:
                return pptx_to_text(file=io.BytesIO(response))
            if mime_type == GDriveMimeType.WORD_DOC.value:
                return [
                    Section(link=link, text=docx_to_text(file=io.BytesIO(response)))
                ]
            elif mime_type == GDriveMimeType.PDF.value:
                text, _ = read_pdf_file(file=io.BytesIO(response))
                return [Section(link=link, text=text)]
            elif mime_type == GDriveMimeType.POWERPOINT.value:
                return [
                    Section(link=link, text=pptx_to_text(file=io.BytesIO(response)))
                ]

        return UNSUPPORTED_FILE_TYPE_CONTENT
        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]

    except Exception:
        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]


def convert_drive_item_to_document(
    file: GoogleDriveFileType, service: Resource
    file: GoogleDriveFileType,
    drive_service: GoogleDriveService,
    docs_service: GoogleDocsService,
) -> Document | None:
    try:
        # Skip files that are shortcuts
        if file.get("mimeType") == DRIVE_SHORTCUT_TYPE:
            logger.info("Ignoring Drive Shortcut Filetype")
            return None
        try:
            text_contents = _extract_text(file, service) or ""
        except HttpError as e:
            reason = e.error_details[0]["reason"] if e.error_details else e.reason
            message = e.error_details[0]["message"] if e.error_details else e.reason
            if e.status_code == 403 and reason in ERRORS_TO_CONTINUE_ON:
                logger.warning(
                    f"Could not export file '{file['name']}' due to '{message}', skipping..."
                )
                return None
        # Skip files that are folders
        if file.get("mimeType") == DRIVE_FOLDER_TYPE:
            logger.info("Ignoring Drive Folder Filetype")
            return None

            raise
        sections: list[Section] = []

        # Special handling for Google Docs to preserve structure, link
        # to headers
        if file.get("mimeType") == GDriveMimeType.DOC.value:
            try:
                sections = get_document_sections(docs_service, file["id"])
            except Exception as e:
                logger.warning(
                    f"Ran into exception '{e}' when pulling sections from Google Doc '{file['name']}'."
                    " Falling back to basic extraction."
                )
        # NOTE: this will run for either (1) the above failed or (2) the file is not a Google Doc
        if not sections:
            try:
                # For all other file types just extract the text
                sections = _extract_sections_basic(file, drive_service)

            except HttpError as e:
                reason = e.error_details[0]["reason"] if e.error_details else e.reason
                message = e.error_details[0]["message"] if e.error_details else e.reason
                if e.status_code == 403 and reason in ERRORS_TO_CONTINUE_ON:
                    logger.warning(
                        f"Could not export file '{file['name']}' due to '{message}', skipping..."
                    )
                    return None

                raise
        if not sections:
            return None

        return Document(
            id=file["webViewLink"],
            sections=[Section(link=file["webViewLink"], text=text_contents)],
            sections=sections,
            source=DocumentSource.GOOGLE_DRIVE,
            semantic_identifier=file["name"],
            doc_updated_at=datetime.fromisoformat(file["modifiedTime"]).astimezone(
                timezone.utc
            ),
            metadata={} if text_contents else {IGNORE_FOR_QA: "True"},
            metadata={}
            if any(section.text for section in sections)
            else {IGNORE_FOR_QA: "True"},
            additional_info=file.get("id"),
        )
    except Exception as e:
@@ -113,3 +178,20 @@ def convert_drive_item_to_document(

        logger.exception("Ran into exception when pulling a file from Google Drive")
        return None


def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
    # Skip files that are folders or shortcuts
    if file.get("mimeType") in [DRIVE_FOLDER_TYPE, DRIVE_SHORTCUT_TYPE]:
        return None

    return SlimDocument(
        id=file["webViewLink"],
        perm_sync_data={
            "doc_id": file.get("id"),
            "permissions": file.get("permissions", []),
            "permission_ids": file.get("permissionIds", []),
            "name": file.get("name"),
            "owner_email": file.get("owners", [{}])[0].get("emailAddress"),
        },
    )
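A hedged sketch of how the new convert_drive_item_to_document signature is wired up (the drive_file dict and the two pre-built services are assumptions; the parameter names come from the diff above):

# Illustrative only: the new signature takes a Drive service (for exports/downloads)
# and a Docs service (for heading-aware section extraction of Google Docs).
doc = convert_drive_item_to_document(
    file=drive_file,              # a GoogleDriveFileType dict from the listing helpers
    drive_service=drive_service,  # assumed to be built for the file owner's email
    docs_service=docs_service,
)
if doc is not None:
    doc_batch.append(doc)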
@@ -1,21 +1,29 @@
from collections.abc import Callable
from collections.abc import Iterator
from datetime import datetime
from typing import Any

from googleapiclient.discovery import Resource  # type: ignore

from danswer.connectors.google_drive.constants import DRIVE_FOLDER_TYPE
from danswer.connectors.google_drive.constants import DRIVE_SHORTCUT_TYPE
from danswer.connectors.google_drive.constants import FILE_FIELDS
from danswer.connectors.google_drive.constants import FOLDER_FIELDS
from danswer.connectors.google_drive.constants import SLIM_FILE_FIELDS
from danswer.connectors.google_drive.google_utils import execute_paginated_retrieval
from danswer.connectors.google_drive.models import GoogleDriveFileType
from danswer.connectors.google_utils.google_utils import execute_paginated_retrieval
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.utils.logger import setup_logger

logger = setup_logger()

FILE_FIELDS = (
    "nextPageToken, files(mimeType, id, name, permissions, modifiedTime, webViewLink, "
    "shortcutDetails, owners(emailAddress))"
)
SLIM_FILE_FIELDS = (
    "nextPageToken, files(mimeType, id, name, permissions(emailAddress, type), "
    "permissionIds, webViewLink, owners(emailAddress))"
)
FOLDER_FIELDS = "nextPageToken, files(id, name, permissions, modifiedTime, webViewLink, shortcutDetails)"


def _generate_time_range_filter(
    start: SecondsSinceUnixEpoch | None = None,
@@ -34,7 +42,6 @@ def _generate_time_range_filter(
def _get_folders_in_parent(
    service: Resource,
    parent_id: str | None = None,
    personal_drive: bool = False,
) -> Iterator[GoogleDriveFileType]:
    # Follow shortcuts to folders
    query = f"(mimeType = '{DRIVE_FOLDER_TYPE}' or mimeType = '{DRIVE_SHORTCUT_TYPE}')"
@@ -46,9 +53,10 @@ def _get_folders_in_parent(
    for file in execute_paginated_retrieval(
        retrieval_function=service.files().list,
        list_key="files",
        corpora="user" if personal_drive else "allDrives",
        supportsAllDrives=not personal_drive,
        includeItemsFromAllDrives=not personal_drive,
        continue_on_404_or_403=True,
        corpora="allDrives",
        supportsAllDrives=True,
        includeItemsFromAllDrives=True,
        fields=FOLDER_FIELDS,
        q=query,
    ):
@@ -58,7 +66,6 @@ def _get_folders_in_parent(
def _get_files_in_parent(
    service: Resource,
    parent_id: str,
    personal_drive: bool,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
    is_slim: bool = False,
@@ -70,9 +77,10 @@ def _get_files_in_parent(
    for file in execute_paginated_retrieval(
        retrieval_function=service.files().list,
        list_key="files",
        corpora="user" if personal_drive else "allDrives",
        supportsAllDrives=not personal_drive,
        includeItemsFromAllDrives=not personal_drive,
        continue_on_404_or_403=True,
        corpora="allDrives",
        supportsAllDrives=True,
        includeItemsFromAllDrives=True,
        fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS,
        q=query,
    ):
@@ -82,7 +90,6 @@ def _get_files_in_parent(
def crawl_folders_for_files(
    service: Resource,
    parent_id: str,
    personal_drive: bool,
    traversed_parent_ids: set[str],
    update_traversed_ids_func: Callable[[str], None],
    start: SecondsSinceUnixEpoch | None = None,
@@ -92,29 +99,30 @@ def crawl_folders_for_files(
    This function starts crawling from any folder. It is slower though.
    """
    if parent_id in traversed_parent_ids:
        print(f"Skipping subfolder since already traversed: {parent_id}")
        logger.info(f"Skipping subfolder since already traversed: {parent_id}")
        return

    update_traversed_ids_func(parent_id)

    yield from _get_files_in_parent(
    found_files = False
    for file in _get_files_in_parent(
        service=service,
        personal_drive=personal_drive,
        start=start,
        end=end,
        parent_id=parent_id,
    )
    ):
        found_files = True
        yield file

    if found_files:
        update_traversed_ids_func(parent_id)

    for subfolder in _get_folders_in_parent(
        service=service,
        parent_id=parent_id,
        personal_drive=personal_drive,
    ):
        logger.info("Fetching all files in subfolder: " + subfolder["name"])
        yield from crawl_folders_for_files(
            service=service,
            parent_id=subfolder["id"],
            personal_drive=personal_drive,
            traversed_parent_ids=traversed_parent_ids,
            update_traversed_ids_func=update_traversed_ids_func,
            start=start,
@@ -126,55 +134,59 @@ def get_files_in_shared_drive(
    service: Resource,
    drive_id: str,
    is_slim: bool = False,
    cache_folders: bool = True,
    update_traversed_ids_func: Callable[[str], None] = lambda _: None,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[GoogleDriveFileType]:
    # If we know we are going to folder crawl later, we can cache the folders here
    if cache_folders:
        # Get all folders being queried and add them to the traversed set
        query = f"mimeType = '{DRIVE_FOLDER_TYPE}'"
        query += " and trashed = false"
        for file in execute_paginated_retrieval(
            retrieval_function=service.files().list,
            list_key="files",
            corpora="drive",
            driveId=drive_id,
            supportsAllDrives=True,
            includeItemsFromAllDrives=True,
            fields="nextPageToken, files(id)",
            q=query,
        ):
            update_traversed_ids_func(file["id"])
    # Get all folders being queried and add them to the traversed set
    query = f"mimeType = '{DRIVE_FOLDER_TYPE}'"
    query += " and trashed = false"
    found_folders = False
    for file in execute_paginated_retrieval(
        retrieval_function=service.files().list,
        list_key="files",
        continue_on_404_or_403=True,
        corpora="drive",
        driveId=drive_id,
        supportsAllDrives=True,
        includeItemsFromAllDrives=True,
        fields="nextPageToken, files(id)",
        q=query,
    ):
        update_traversed_ids_func(file["id"])
        found_folders = True
    if found_folders:
        update_traversed_ids_func(drive_id)

    # Get all files in the shared drive
    query = f"mimeType != '{DRIVE_FOLDER_TYPE}'"
    query += " and trashed = false"
    query += _generate_time_range_filter(start, end)
    for file in execute_paginated_retrieval(
    yield from execute_paginated_retrieval(
        retrieval_function=service.files().list,
        list_key="files",
        continue_on_404_or_403=True,
        corpora="drive",
        driveId=drive_id,
        supportsAllDrives=True,
        includeItemsFromAllDrives=True,
        fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS,
        q=query,
    ):
        yield file
    )


def get_files_in_my_drive(
    service: Resource,
    email: str,
def get_all_files_in_my_drive(
    service: Any,
    update_traversed_ids_func: Callable,
    is_slim: bool = False,
    start: SecondsSinceUnixEpoch | None = None,
    end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[GoogleDriveFileType]:
    query = f"mimeType != '{DRIVE_FOLDER_TYPE}' and '{email}' in owners"
    query += " and trashed = false"
    query += _generate_time_range_filter(start, end)
    # If we know we are going to folder crawl later, we can cache the folders here
    # Get all folders being queried and add them to the traversed set
    query = "trashed = false and 'me' in owners"
    found_folders = False
    for file in execute_paginated_retrieval(
        retrieval_function=service.files().list,
        list_key="files",
@@ -182,7 +194,25 @@ def get_files_in_my_drive(
        fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS,
        q=query,
    ):
        yield file
        update_traversed_ids_func(file["id"])
        found_folders = True
    if found_folders:
        update_traversed_ids_func(get_root_folder_id(service))

    # Then get the files
    query = "trashed = false and 'me' in owners"
    query += _generate_time_range_filter(start, end)
    fields = "files(id, name, mimeType, webViewLink, modifiedTime, createdTime)"
    if not is_slim:
        fields += ", files(permissions, permissionIds, owners)"

    yield from execute_paginated_retrieval(
        retrieval_function=service.files().list,
        list_key="files",
        corpora="user",
        fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS,
        q=query,
    )


# Just in case we need to get the root folder id

@@ -1,35 +0,0 @@
from collections.abc import Callable
from collections.abc import Iterator
from typing import Any

from danswer.connectors.google_drive.models import GoogleDriveFileType
from danswer.utils.retry_wrapper import retry_builder


# Google Drive APIs are quite flakey and may 500 for an
# extended period of time. Trying to combat here by adding a very
# long retry period (~20 minutes of trying every minute)
add_retries = retry_builder(tries=50, max_delay=30)


def execute_paginated_retrieval(
    retrieval_function: Callable,
    list_key: str,
    **kwargs: Any,
) -> Iterator[GoogleDriveFileType]:
    """Execute a paginated retrieval from Google Drive API
    Args:
        retrieval_function: The specific list function to call (e.g., service.files().list)
        **kwargs: Arguments to pass to the list function
    """
    next_page_token = ""
    while next_page_token is not None:
        request_kwargs = kwargs.copy()
        if next_page_token:
            request_kwargs["pageToken"] = next_page_token

        results = add_retries(lambda: retrieval_function(**request_kwargs).execute())()

        next_page_token = results.get("nextPageToken")
        for item in results.get(list_key, []):
            yield item
backend/danswer/connectors/google_drive/section_extraction.py (new file, 105 lines)
@@ -0,0 +1,105 @@
from typing import Any

from pydantic import BaseModel

from danswer.connectors.google_utils.resources import GoogleDocsService
from danswer.connectors.models import Section


class CurrentHeading(BaseModel):
    id: str
    text: str


def _build_gdoc_section_link(doc_id: str, heading_id: str) -> str:
    """Builds a Google Doc link that jumps to a specific heading"""
    # NOTE: doesn't support docs with multiple tabs atm, if we need that ask
    # @Chris
    return (
        f"https://docs.google.com/document/d/{doc_id}/edit?tab=t.0#heading={heading_id}"
    )


def _extract_id_from_heading(paragraph: dict[str, Any]) -> str:
    """Extracts the id from a heading paragraph element"""
    return paragraph["paragraphStyle"]["headingId"]


def _extract_text_from_paragraph(paragraph: dict[str, Any]) -> str:
    """Extracts the text content from a paragraph element"""
    text_elements = []
    for element in paragraph.get("elements", []):
        if "textRun" in element:
            text_elements.append(element["textRun"].get("content", ""))
    return "".join(text_elements)


def get_document_sections(
    docs_service: GoogleDocsService,
    doc_id: str,
) -> list[Section]:
    """Extracts sections from a Google Doc, including their headings and content"""
    # Fetch the document structure
    doc = docs_service.documents().get(documentId=doc_id).execute()

    # Get the content
    content = doc.get("body", {}).get("content", [])

    sections: list[Section] = []
    current_section: list[str] = []
    current_heading: CurrentHeading | None = None

    for element in content:
        if "paragraph" not in element:
            continue

        paragraph = element["paragraph"]

        # Check if this is a heading
        if (
            "paragraphStyle" in paragraph
            and "namedStyleType" in paragraph["paragraphStyle"]
        ):
            style = paragraph["paragraphStyle"]["namedStyleType"]
            is_heading = style.startswith("HEADING_")
            is_title = style.startswith("TITLE")

            if is_heading or is_title:
                # If we were building a previous section, add it to sections list
                if current_heading is not None and current_section:
                    heading_text = current_heading.text
                    section_text = f"{heading_text}\n" + "\n".join(current_section)
                    sections.append(
                        Section(
                            text=section_text.strip(),
                            link=_build_gdoc_section_link(doc_id, current_heading.id),
                        )
                    )
                    current_section = []

                # Start new heading
                heading_id = _extract_id_from_heading(paragraph)
                heading_text = _extract_text_from_paragraph(paragraph)
                current_heading = CurrentHeading(
                    id=heading_id,
                    text=heading_text,
                )
                continue

        # Add content to current section
        if current_heading is not None:
            text = _extract_text_from_paragraph(paragraph)
            if text.strip():
                current_section.append(text)

    # Don't forget to add the last section
    if current_heading is not None and current_section:
        section_text = f"{current_heading.text}\n" + "\n".join(current_section)
        sections.append(
            Section(
                text=section_text.strip(),
                link=_build_gdoc_section_link(doc_id, current_heading.id),
            )
        )

    return sections
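A hedged usage sketch for get_document_sections (the docs-service helper name and the example IDs are assumptions; only get_drive_service and get_gmail_service are visible in this diff, so the exact builder for a Docs service is not confirmed here):

# Illustrative only: build a Docs service for a user, then extract heading-anchored sections.
docs_service = get_docs_service(creds, user_email="admin@example.com")  # hypothetical helper
sections = get_document_sections(docs_service, doc_id="<google-doc-id>")
for section in sections:
    # each Section carries a deep link built by _build_gdoc_section_link
    print(section.link, section.text[:80])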
backend/danswer/connectors/google_utils/__init__.py (new file, 0 lines)
backend/danswer/connectors/google_utils/google_auth.py (new file, 107 lines)
@@ -0,0 +1,107 @@
import json
from typing import cast

from google.auth.transport.requests import Request  # type: ignore
from google.oauth2.credentials import Credentials as OAuthCredentials  # type: ignore
from google.oauth2.service_account import Credentials as ServiceAccountCredentials  # type: ignore

from danswer.configs.constants import DocumentSource
from danswer.connectors.google_utils.shared_constants import (
    DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY,
)
from danswer.connectors.google_utils.shared_constants import (
    DB_CREDENTIALS_DICT_TOKEN_KEY,
)
from danswer.connectors.google_utils.shared_constants import (
    DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
)
from danswer.connectors.google_utils.shared_constants import (
    GOOGLE_SCOPES,
)
from danswer.utils.logger import setup_logger

logger = setup_logger()


def get_google_oauth_creds(
    token_json_str: str, source: DocumentSource
) -> OAuthCredentials | None:
    creds_json = json.loads(token_json_str)
    creds = OAuthCredentials.from_authorized_user_info(
        info=creds_json,
        scopes=GOOGLE_SCOPES[source],
    )
    if creds.valid:
        return creds

    if creds.expired and creds.refresh_token:
        try:
            creds.refresh(Request())
            if creds.valid:
                logger.notice("Refreshed Google Drive tokens.")
                return creds
        except Exception:
            logger.exception("Failed to refresh google drive access token due to:")
            return None

    return None


def get_google_creds(
    credentials: dict[str, str],
    source: DocumentSource,
) -> tuple[ServiceAccountCredentials | OAuthCredentials, dict[str, str] | None]:
    """Checks for two different types of credentials.
    (1) A credential which holds a token acquired via a user going through
    the Google OAuth flow.
    (2) A credential which holds a service account key JSON file, which
    can then be used to impersonate any user in the workspace.
    """
    oauth_creds = None
    service_creds = None
    new_creds_dict = None
    if DB_CREDENTIALS_DICT_TOKEN_KEY in credentials:
        # OAUTH
        access_token_json_str = cast(str, credentials[DB_CREDENTIALS_DICT_TOKEN_KEY])
        oauth_creds = get_google_oauth_creds(
            token_json_str=access_token_json_str, source=source
        )

        # tell caller to update token stored in DB if it has changed
        # (e.g. the token has been refreshed)
        new_creds_json_str = oauth_creds.to_json() if oauth_creds else ""
        if new_creds_json_str != access_token_json_str:
            new_creds_dict = {
                DB_CREDENTIALS_DICT_TOKEN_KEY: new_creds_json_str,
                DB_CREDENTIALS_PRIMARY_ADMIN_KEY: credentials[
                    DB_CREDENTIALS_PRIMARY_ADMIN_KEY
                ],
            }
    elif DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY in credentials:
        # SERVICE ACCOUNT
        service_account_key_json_str = credentials[
            DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY
        ]
        service_account_key = json.loads(service_account_key_json_str)

        service_creds = ServiceAccountCredentials.from_service_account_info(
            service_account_key, scopes=GOOGLE_SCOPES[source]
        )

        if not service_creds.valid or not service_creds.expired:
            service_creds.refresh(Request())

        if not service_creds.valid:
            raise PermissionError(
                f"Unable to access {source} - service account credentials are invalid."
            )

    creds: ServiceAccountCredentials | OAuthCredentials | None = (
        oauth_creds or service_creds
    )
    if creds is None:
        raise PermissionError(
            f"Unable to access {source} - unknown credential structure."
        )

    return creds, new_creds_dict
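A minimal sketch of the two credential dict shapes accepted by get_google_creds (the key names come from shared_constants imported above; the JSON payloads and the example email are placeholders):

# Illustrative only: OAuth-token credentials vs. service-account credentials.
oauth_credentials = {
    DB_CREDENTIALS_DICT_TOKEN_KEY: '{"token": "...", "refresh_token": "..."}',
    DB_CREDENTIALS_PRIMARY_ADMIN_KEY: "admin@example.com",
}
service_account_credentials = {
    DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY: '{"type": "service_account", "...": "..."}',
    DB_CREDENTIALS_PRIMARY_ADMIN_KEY: "admin@example.com",
}
creds, new_creds_dict = get_google_creds(oauth_credentials, DocumentSource.GOOGLE_DRIVE)
# new_creds_dict is non-None only when the OAuth token was refreshed and should be persisted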
backend/danswer/connectors/google_utils/google_kv.py (new file, 237 lines)
@@ -0,0 +1,237 @@
import json
from typing import cast
from urllib.parse import parse_qs
from urllib.parse import ParseResult
from urllib.parse import urlparse

from google.oauth2.credentials import Credentials as OAuthCredentials  # type: ignore
from google_auth_oauthlib.flow import InstalledAppFlow  # type: ignore
from sqlalchemy.orm import Session

from danswer.configs.app_configs import WEB_DOMAIN
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import KV_CRED_KEY
from danswer.configs.constants import KV_GMAIL_CRED_KEY
from danswer.configs.constants import KV_GMAIL_SERVICE_ACCOUNT_KEY
from danswer.configs.constants import KV_GOOGLE_DRIVE_CRED_KEY
from danswer.configs.constants import KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY
from danswer.connectors.google_utils.resources import get_drive_service
from danswer.connectors.google_utils.resources import get_gmail_service
from danswer.connectors.google_utils.shared_constants import (
    DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY,
)
from danswer.connectors.google_utils.shared_constants import (
    DB_CREDENTIALS_DICT_TOKEN_KEY,
)
from danswer.connectors.google_utils.shared_constants import (
    DB_CREDENTIALS_PRIMARY_ADMIN_KEY,
)
from danswer.connectors.google_utils.shared_constants import (
    GOOGLE_SCOPES,
)
from danswer.connectors.google_utils.shared_constants import (
    MISSING_SCOPES_ERROR_STR,
)
from danswer.connectors.google_utils.shared_constants import (
    ONYX_SCOPE_INSTRUCTIONS,
)
from danswer.db.credentials import update_credential_json
from danswer.db.models import User
from danswer.key_value_store.factory import get_kv_store
from danswer.server.documents.models import CredentialBase
from danswer.server.documents.models import GoogleAppCredentials
from danswer.server.documents.models import GoogleServiceAccountKey
from danswer.utils.logger import setup_logger

logger = setup_logger()


def _build_frontend_google_drive_redirect(source: DocumentSource) -> str:
    if source == DocumentSource.GOOGLE_DRIVE:
        return f"{WEB_DOMAIN}/admin/connectors/google-drive/auth/callback"
    elif source == DocumentSource.GMAIL:
        return f"{WEB_DOMAIN}/admin/connectors/gmail/auth/callback"
    else:
        raise ValueError(f"Unsupported source: {source}")


def _get_current_oauth_user(creds: OAuthCredentials, source: DocumentSource) -> str:
    if source == DocumentSource.GOOGLE_DRIVE:
        drive_service = get_drive_service(creds)
        user_info = (
            drive_service.about()
            .get(
                fields="user(emailAddress)",
            )
            .execute()
        )
        email = user_info.get("user", {}).get("emailAddress")
    elif source == DocumentSource.GMAIL:
        gmail_service = get_gmail_service(creds)
        user_info = (
            gmail_service.users()
            .getProfile(
                userId="me",
                fields="emailAddress",
            )
            .execute()
        )
        email = user_info.get("emailAddress")
    else:
        raise ValueError(f"Unsupported source: {source}")
    return email


def verify_csrf(credential_id: int, state: str) -> None:
    csrf = get_kv_store().load(KV_CRED_KEY.format(str(credential_id)))
    if csrf != state:
        raise PermissionError(
            "State from Google Drive Connector callback does not match expected"
        )


def update_credential_access_tokens(
    auth_code: str,
    credential_id: int,
    user: User,
    db_session: Session,
    source: DocumentSource,
) -> OAuthCredentials | None:
    app_credentials = get_google_app_cred(source)
    flow = InstalledAppFlow.from_client_config(
        app_credentials.model_dump(),
        scopes=GOOGLE_SCOPES[source],
        redirect_uri=_build_frontend_google_drive_redirect(source),
    )
    flow.fetch_token(code=auth_code)
    creds = flow.credentials
    token_json_str = creds.to_json()

    # Get user email from Google API so we know who
    # the primary admin is for this connector
    try:
        email = _get_current_oauth_user(creds, source)
    except Exception as e:
        if MISSING_SCOPES_ERROR_STR in str(e):
            raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e
        raise e

    new_creds_dict = {
        DB_CREDENTIALS_DICT_TOKEN_KEY: token_json_str,
        DB_CREDENTIALS_PRIMARY_ADMIN_KEY: email,
    }

    if not update_credential_json(credential_id, new_creds_dict, user, db_session):
        return None
    return creds


def build_service_account_creds(
    source: DocumentSource,
    primary_admin_email: str | None = None,
) -> CredentialBase:
    service_account_key = get_service_account_key(source=source)

    credential_dict = {
        DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY: service_account_key.json(),
    }
    if primary_admin_email:
        credential_dict[DB_CREDENTIALS_PRIMARY_ADMIN_KEY] = primary_admin_email

    return CredentialBase(
        credential_json=credential_dict,
        admin_public=True,
        source=source,
    )


def get_auth_url(credential_id: int, source: DocumentSource) -> str:
    if source == DocumentSource.GOOGLE_DRIVE:
        creds_str = str(get_kv_store().load(KV_GOOGLE_DRIVE_CRED_KEY))
    elif source == DocumentSource.GMAIL:
        creds_str = str(get_kv_store().load(KV_GMAIL_CRED_KEY))
    else:
        raise ValueError(f"Unsupported source: {source}")
    credential_json = json.loads(creds_str)
    flow = InstalledAppFlow.from_client_config(
        credential_json,
        scopes=GOOGLE_SCOPES[source],
        redirect_uri=_build_frontend_google_drive_redirect(source),
    )
    auth_url, _ = flow.authorization_url(prompt="consent")

    parsed_url = cast(ParseResult, urlparse(auth_url))
    params = parse_qs(parsed_url.query)

    get_kv_store().store(
        KV_CRED_KEY.format(credential_id), params.get("state", [None])[0], encrypt=True
    )  # type: ignore
    return str(auth_url)


def get_google_app_cred(source: DocumentSource) -> GoogleAppCredentials:
    if source == DocumentSource.GOOGLE_DRIVE:
        creds_str = str(get_kv_store().load(KV_GOOGLE_DRIVE_CRED_KEY))
    elif source == DocumentSource.GMAIL:
        creds_str = str(get_kv_store().load(KV_GMAIL_CRED_KEY))
    else:
        raise ValueError(f"Unsupported source: {source}")
    return GoogleAppCredentials(**json.loads(creds_str))


def upsert_google_app_cred(
    app_credentials: GoogleAppCredentials, source: DocumentSource
) -> None:
    if source == DocumentSource.GOOGLE_DRIVE:
        get_kv_store().store(
            KV_GOOGLE_DRIVE_CRED_KEY, app_credentials.json(), encrypt=True
        )
    elif source == DocumentSource.GMAIL:
        get_kv_store().store(KV_GMAIL_CRED_KEY, app_credentials.json(), encrypt=True)
    else:
        raise ValueError(f"Unsupported source: {source}")


def delete_google_app_cred(source: DocumentSource) -> None:
    if source == DocumentSource.GOOGLE_DRIVE:
        get_kv_store().delete(KV_GOOGLE_DRIVE_CRED_KEY)
    elif source == DocumentSource.GMAIL:
        get_kv_store().delete(KV_GMAIL_CRED_KEY)
    else:
        raise ValueError(f"Unsupported source: {source}")


def get_service_account_key(source: DocumentSource) -> GoogleServiceAccountKey:
    if source == DocumentSource.GOOGLE_DRIVE:
        creds_str = str(get_kv_store().load(KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY))
    elif source == DocumentSource.GMAIL:
        creds_str = str(get_kv_store().load(KV_GMAIL_SERVICE_ACCOUNT_KEY))
    else:
        raise ValueError(f"Unsupported source: {source}")
    return GoogleServiceAccountKey(**json.loads(creds_str))


def upsert_service_account_key(
    service_account_key: GoogleServiceAccountKey, source: DocumentSource
) -> None:
    if source == DocumentSource.GOOGLE_DRIVE:
        get_kv_store().store(
            KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY,
            service_account_key.json(),
            encrypt=True,
        )
    elif source == DocumentSource.GMAIL:
        get_kv_store().store(
            KV_GMAIL_SERVICE_ACCOUNT_KEY, service_account_key.json(), encrypt=True
        )
    else:
        raise ValueError(f"Unsupported source: {source}")


def delete_service_account_key(source: DocumentSource) -> None:
    if source == DocumentSource.GOOGLE_DRIVE:
        get_kv_store().delete(KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY)
    elif source == DocumentSource.GMAIL:
        get_kv_store().delete(KV_GMAIL_SERVICE_ACCOUNT_KEY)
    else:
        raise ValueError(f"Unsupported source: {source}")
125
backend/danswer/connectors/google_utils/google_utils.py
Normal file
125
backend/danswer/connectors/google_utils/google_utils.py
Normal file
@@ -0,0 +1,125 @@
|
||||
import re
import time
from collections.abc import Callable
from collections.abc import Iterator
from datetime import datetime
from datetime import timezone
from typing import Any

from googleapiclient.errors import HttpError  # type: ignore

from danswer.connectors.google_drive.models import GoogleDriveFileType
from danswer.utils.logger import setup_logger
from danswer.utils.retry_wrapper import retry_builder

logger = setup_logger()


# Google Drive APIs are quite flakey and may 500 for an
# extended period of time. Trying to combat here by adding a very
# long retry period (~20 minutes of trying every minute)
add_retries = retry_builder(tries=50, max_delay=30)


def _execute_with_retry(request: Any) -> Any:
    max_attempts = 10
    attempt = 1

    while attempt < max_attempts:
        # Note for reasons unknown, the Google API will sometimes return a 429
        # and even after waiting the retry period, it will return another 429.
        # It could be due to a few possibilities:
        # 1. Other things are also requesting from the Gmail API with the same key
        # 2. It's a rolling rate limit so the moment we get some amount of requests cleared, we hit it again very quickly
        # 3. The retry-after has a maximum and we've already hit the limit for the day
        # or it's something else...
        try:
            return request.execute()
        except HttpError as error:
            attempt += 1

            if error.resp.status == 429:
                # Attempt to get 'Retry-After' from headers
                retry_after = error.resp.get("Retry-After")
                if retry_after:
                    sleep_time = int(retry_after)
                else:
                    # Extract 'Retry after' timestamp from error message
                    match = re.search(
                        r"Retry after (\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+Z)",
                        str(error),
                    )
                    if match:
                        retry_after_timestamp = match.group(1)
                        retry_after_dt = datetime.strptime(
                            retry_after_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ"
                        ).replace(tzinfo=timezone.utc)
                        current_time = datetime.now(timezone.utc)
                        sleep_time = max(
                            int((retry_after_dt - current_time).total_seconds()),
                            0,
                        )
                    else:
                        logger.error(
                            f"No Retry-After header or timestamp found in error message: {error}"
                        )
                        sleep_time = 60

                sleep_time += 3  # Add a buffer to be safe

                logger.info(
                    f"Rate limit exceeded. Attempt {attempt}/{max_attempts}. Sleeping for {sleep_time} seconds."
                )
                time.sleep(sleep_time)

            else:
                raise

    # If we've exhausted all attempts
    raise Exception(f"Failed to execute request after {max_attempts} attempts")


def execute_paginated_retrieval(
    retrieval_function: Callable,
    list_key: str | None = None,
    continue_on_404_or_403: bool = False,
    **kwargs: Any,
) -> Iterator[GoogleDriveFileType]:
    """Execute a paginated retrieval from Google Drive API
    Args:
        retrieval_function: The specific list function to call (e.g., service.files().list)
        **kwargs: Arguments to pass to the list function
    """
    next_page_token = ""
    while next_page_token is not None:
        request_kwargs = kwargs.copy()
        if next_page_token:
            request_kwargs["pageToken"] = next_page_token

        try:
            results = retrieval_function(**request_kwargs).execute()
        except HttpError as e:
            if e.resp.status >= 500:
                results = add_retries(
                    lambda: retrieval_function(**request_kwargs).execute()
                )()
            elif e.resp.status == 404 or e.resp.status == 403:
                if continue_on_404_or_403:
                    logger.warning(f"Error executing request: {e}")
                    results = {}
                else:
                    raise e
            elif e.resp.status == 429:
                results = _execute_with_retry(
                    lambda: retrieval_function(**request_kwargs).execute()
                )
            else:
                logger.exception("Error executing request:")
                raise e

        next_page_token = results.get("nextPageToken")
        if list_key:
            for item in results.get(list_key, []):
                yield item
        else:
            yield results
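A minimal usage sketch of execute_paginated_retrieval (not part of the diff; the Drive service object and the query parameters are illustrative assumptions):

# Illustrative only: drive_service is assumed to be built elsewhere, e.g. via get_drive_service
from danswer.connectors.google_utils.google_utils import execute_paginated_retrieval

def list_all_file_names(drive_service) -> list[str]:
    names: list[str] = []
    # list_key="files" makes the helper yield each file dict from every page
    for file in execute_paginated_retrieval(
        retrieval_function=drive_service.files().list,
        list_key="files",
        corpora="user",  # illustrative Drive API query parameters
        fields="nextPageToken, files(id, name)",
    ):
        names.append(file["name"])
    return names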
63 backend/danswer/connectors/google_utils/resources.py Normal file
@@ -0,0 +1,63 @@
from google.oauth2.credentials import Credentials as OAuthCredentials  # type: ignore
from google.oauth2.service_account import Credentials as ServiceAccountCredentials  # type: ignore
from googleapiclient.discovery import build  # type: ignore
from googleapiclient.discovery import Resource  # type: ignore


class GoogleDriveService(Resource):
    pass


class GoogleDocsService(Resource):
    pass


class AdminService(Resource):
    pass


class GmailService(Resource):
    pass


def _get_google_service(
    service_name: str,
    service_version: str,
    creds: ServiceAccountCredentials | OAuthCredentials,
    user_email: str | None = None,
) -> GoogleDriveService | GoogleDocsService | AdminService | GmailService:
    if isinstance(creds, ServiceAccountCredentials):
        creds = creds.with_subject(user_email)
        service = build(service_name, service_version, credentials=creds)
    elif isinstance(creds, OAuthCredentials):
        service = build(service_name, service_version, credentials=creds)

    return service


def get_google_docs_service(
    creds: ServiceAccountCredentials | OAuthCredentials,
    user_email: str | None = None,
) -> GoogleDocsService:
    return _get_google_service("docs", "v1", creds, user_email)


def get_drive_service(
    creds: ServiceAccountCredentials | OAuthCredentials,
    user_email: str | None = None,
) -> GoogleDriveService:
    return _get_google_service("drive", "v3", creds, user_email)


def get_admin_service(
    creds: ServiceAccountCredentials | OAuthCredentials,
    user_email: str | None = None,
) -> AdminService:
    return _get_google_service("admin", "directory_v1", creds, user_email)


def get_gmail_service(
    creds: ServiceAccountCredentials | OAuthCredentials,
    user_email: str | None = None,
) -> GmailService:
    return _get_google_service("gmail", "v1", creds, user_email)
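As a rough illustration of how these helpers are meant to be called (the key file path and the email address below are placeholders, not values from the diff):

from google.oauth2.service_account import Credentials as ServiceAccountCredentials

from danswer.connectors.google_utils.resources import get_drive_service

# "service_account.json" is a hypothetical path to a stored service account key
creds = ServiceAccountCredentials.from_service_account_file(
    "service_account.json",
    scopes=["https://www.googleapis.com/auth/drive.readonly"],
)

# For service account creds, _get_google_service impersonates user_email via with_subject
drive_service = get_drive_service(creds, user_email="admin@example.com")
files = drive_service.files().list(pageSize=10).execute()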
40 backend/danswer/connectors/google_utils/shared_constants.py Normal file
@@ -0,0 +1,40 @@
from danswer.configs.constants import DocumentSource

# NOTE: do not need https://www.googleapis.com/auth/documents.readonly
# this is counted under `/auth/drive.readonly`
GOOGLE_SCOPES = {
    DocumentSource.GOOGLE_DRIVE: [
        "https://www.googleapis.com/auth/drive.readonly",
        "https://www.googleapis.com/auth/drive.metadata.readonly",
        "https://www.googleapis.com/auth/admin.directory.group.readonly",
        "https://www.googleapis.com/auth/admin.directory.user.readonly",
    ],
    DocumentSource.GMAIL: [
        "https://www.googleapis.com/auth/gmail.readonly",
        "https://www.googleapis.com/auth/admin.directory.user.readonly",
        "https://www.googleapis.com/auth/admin.directory.group.readonly",
    ],
}

# This is the OAuth token
DB_CREDENTIALS_DICT_TOKEN_KEY = "google_tokens"
# This is the service account key
DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY = "google_service_account_key"
# The email saved for both auth types
DB_CREDENTIALS_PRIMARY_ADMIN_KEY = "google_primary_admin"

USER_FIELDS = "nextPageToken, users(primaryEmail)"

# Error message substrings
MISSING_SCOPES_ERROR_STR = "client not authorized for any of the scopes requested"

# Documentation and error messages
SCOPE_DOC_URL = "https://docs.danswer.dev/connectors/google_drive/overview"
ONYX_SCOPE_INSTRUCTIONS = (
    "You have upgraded Danswer without updating the Google Auth scopes. "
    f"Please refer to the documentation to learn how to update the scopes: {SCOPE_DOC_URL}"
)


# This is the maximum number of threads that can be retrieved at once
SLIM_BATCH_SIZE = 500
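A hypothetical sketch of how the error-message constants above are meant to be used when Google rejects under-scoped credentials (the wrapper function below is not from the diff):

from googleapiclient.errors import HttpError

from danswer.connectors.google_utils.shared_constants import (
    MISSING_SCOPES_ERROR_STR,
    ONYX_SCOPE_INSTRUCTIONS,
)

def run_with_scope_hint(request):
    # Hypothetical helper: surfaces the scope-upgrade instructions when Google
    # rejects the credentials because the granted scopes are missing
    try:
        return request.execute()
    except HttpError as e:
        if MISSING_SCOPES_ERROR_STR in str(e):
            raise PermissionError(ONYX_SCOPE_INSTRUCTIONS) from e
        raise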
@@ -1,3 +1,5 @@
import os

from sqlalchemy.orm import Session

from danswer.db.models import SlackBotConfig
@@ -48,3 +50,16 @@ def validate_channel_names(
        )

    return cleaned_channel_names


# Scaling configurations for multi-tenant Slack bot handling
TENANT_LOCK_EXPIRATION = 1800  # How long a pod can hold exclusive access to a tenant before other pods can acquire it
TENANT_HEARTBEAT_INTERVAL = (
    60  # How often pods send heartbeats to indicate they are still processing a tenant
)
TENANT_HEARTBEAT_EXPIRATION = 180  # How long before a tenant's heartbeat expires, allowing other pods to take over
TENANT_ACQUISITION_INTERVAL = (
    60  # How often pods attempt to acquire unprocessed tenants
)

MAX_TENANTS_PER_POD = int(os.getenv("MAX_TENANTS_PER_POD", 50))
@@ -1,18 +1,34 @@
import asyncio
import os
import signal
import sys
import threading
import time
from threading import Event
from types import FrameType
from typing import Any
from typing import cast
from typing import Dict
from typing import Set

from prometheus_client import Gauge
from prometheus_client import start_http_server
from slack_sdk import WebClient
from slack_sdk.socket_mode.request import SocketModeRequest
from slack_sdk.socket_mode.response import SocketModeResponse

from danswer.configs.constants import DanswerRedisLocks
from danswer.configs.constants import MessageType
from danswer.configs.danswerbot_configs import DANSWER_BOT_REPHRASE_MESSAGE
from danswer.configs.danswerbot_configs import DANSWER_BOT_RESPOND_EVERY_CHANNEL
from danswer.configs.danswerbot_configs import NOTIFY_SLACKBOT_NO_ANSWER
from danswer.connectors.slack.utils import expert_info_from_slack_id
from danswer.danswerbot.slack.config import get_slack_bot_config_for_channel
from danswer.danswerbot.slack.config import MAX_TENANTS_PER_POD
from danswer.danswerbot.slack.config import TENANT_ACQUISITION_INTERVAL
from danswer.danswerbot.slack.config import TENANT_HEARTBEAT_EXPIRATION
from danswer.danswerbot.slack.config import TENANT_HEARTBEAT_INTERVAL
from danswer.danswerbot.slack.config import TENANT_LOCK_EXPIRATION
from danswer.danswerbot.slack.constants import DISLIKE_BLOCK_ACTION_ID
from danswer.danswerbot.slack.constants import FEEDBACK_DOC_BUTTON_BLOCK_ACTION_ID
from danswer.danswerbot.slack.constants import FOLLOWUP_BUTTON_ACTION_ID
@@ -46,6 +62,7 @@ from danswer.danswerbot.slack.utils import remove_danswer_bot_tag
from danswer.danswerbot.slack.utils import rephrase_slack_message
from danswer.danswerbot.slack.utils import respond_in_thread
from danswer.danswerbot.slack.utils import TenantSocketModeClient
from danswer.db.engine import CURRENT_TENANT_ID_CONTEXTVAR
from danswer.db.engine import get_all_tenant_ids
from danswer.db.engine import get_session_with_tenant
from danswer.db.search_settings import get_current_search_settings
@@ -53,17 +70,23 @@ from danswer.key_value_store.interface import KvKeyNotFoundError
from danswer.natural_language_processing.search_nlp_models import EmbeddingModel
from danswer.natural_language_processing.search_nlp_models import warm_up_bi_encoder
from danswer.one_shot_answer.models import ThreadMessage
from danswer.redis.redis_pool import get_redis_client
from danswer.search.retrieval.search_runner import download_nltk_data
from danswer.server.manage.models import SlackBotTokens
from danswer.utils.logger import setup_logger
from danswer.utils.variable_functionality import set_is_ee_based_on_env_variable
from shared_configs.configs import MODEL_SERVER_HOST
from shared_configs.configs import MODEL_SERVER_PORT
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
from shared_configs.configs import SLACK_CHANNEL_ID
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR

logger = setup_logger()

# Prometheus metric for HPA
active_tenants_gauge = Gauge(
    "active_tenants", "Number of active tenants handled by this pod"
)

# In rare cases, some users have been experiencing a massive amount of trivial messages coming through
# to the Slack Bot with trivial messages. Adding this to avoid exploding LLM costs while we track down
# the cause.
@@ -77,10 +100,213 @@ _SLACK_GREETINGS_TO_IGNORE = {
    ":wave:",
}

# this is always (currently) the user id of Slack's official slackbot
# This is always (currently) the user id of Slack's official slackbot
_OFFICIAL_SLACKBOT_USER_ID = "USLACKBOT"


class SlackbotHandler:
    def __init__(self) -> None:
        logger.info("Initializing SlackbotHandler")
        self.tenant_ids: Set[str | None] = set()
        self.socket_clients: Dict[str | None, TenantSocketModeClient] = {}
        self.slack_bot_tokens: Dict[str | None, SlackBotTokens] = {}
        self.running = True
        self.pod_id = self.get_pod_id()
        self._shutdown_event = Event()
        logger.info(f"Pod ID: {self.pod_id}")

        # Set up signal handlers for graceful shutdown
        signal.signal(signal.SIGTERM, self.shutdown)
        signal.signal(signal.SIGINT, self.shutdown)
        logger.info("Signal handlers registered")

        # Start the Prometheus metrics server
        logger.info("Starting Prometheus metrics server")
        start_http_server(8000)
        logger.info("Prometheus metrics server started")

        # Start background threads
        logger.info("Starting background threads")
        self.acquire_thread = threading.Thread(
            target=self.acquire_tenants_loop, daemon=True
        )
        self.heartbeat_thread = threading.Thread(
            target=self.heartbeat_loop, daemon=True
        )

        self.acquire_thread.start()
        self.heartbeat_thread.start()
        logger.info("Background threads started")

    def get_pod_id(self) -> str:
        pod_id = os.environ.get("HOSTNAME", "unknown_pod")
        logger.info(f"Retrieved pod ID: {pod_id}")
        return pod_id

    def acquire_tenants_loop(self) -> None:
        while not self._shutdown_event.is_set():
            try:
                self.acquire_tenants()
                active_tenants_gauge.set(len(self.tenant_ids))
                logger.debug(f"Current active tenants: {len(self.tenant_ids)}")
            except Exception as e:
                logger.exception(f"Error in Slack acquisition: {e}")
            self._shutdown_event.wait(timeout=TENANT_ACQUISITION_INTERVAL)

    def heartbeat_loop(self) -> None:
        while not self._shutdown_event.is_set():
            try:
                self.send_heartbeats()
                logger.debug(f"Sent heartbeats for {len(self.tenant_ids)} tenants")
            except Exception as e:
                logger.exception(f"Error in heartbeat loop: {e}")
            self._shutdown_event.wait(timeout=TENANT_HEARTBEAT_INTERVAL)

    def acquire_tenants(self) -> None:
        tenant_ids = get_all_tenant_ids()
        logger.debug(f"Found {len(tenant_ids)} total tenants in Postgres")

        for tenant_id in tenant_ids:
            if tenant_id in self.tenant_ids:
                logger.debug(f"Tenant {tenant_id} already in self.tenant_ids")
                continue

            if len(self.tenant_ids) >= MAX_TENANTS_PER_POD:
                logger.info(
                    f"Max tenants per pod reached ({MAX_TENANTS_PER_POD}) Not acquiring any more tenants"
                )
                break

            redis_client = get_redis_client(tenant_id=tenant_id)
            pod_id = self.pod_id
            acquired = redis_client.set(
                DanswerRedisLocks.SLACK_BOT_LOCK,
                pod_id,
                nx=True,
                ex=TENANT_LOCK_EXPIRATION,
            )
            if not acquired:
                logger.debug(f"Another pod holds the lock for tenant {tenant_id}")
                continue

            logger.debug(f"Acquired lock for tenant {tenant_id}")
            token = CURRENT_TENANT_ID_CONTEXTVAR.set(
                tenant_id or POSTGRES_DEFAULT_SCHEMA
            )
            try:
                with get_session_with_tenant(tenant_id) as db_session:
                    try:
                        logger.debug(
                            f"Setting tenant ID context variable for tenant {tenant_id}"
                        )
                        slack_bot_tokens = fetch_tokens()
                        logger.debug(f"Fetched Slack bot tokens for tenant {tenant_id}")
                        logger.debug(
                            f"Reset tenant ID context variable for tenant {tenant_id}"
                        )

                        if not slack_bot_tokens:
                            logger.debug(
                                f"No Slack bot token found for tenant {tenant_id}"
                            )
                            if tenant_id in self.socket_clients:
                                asyncio.run(self.socket_clients[tenant_id].close())
                                del self.socket_clients[tenant_id]
                                del self.slack_bot_tokens[tenant_id]
                            continue

                        if (
                            tenant_id not in self.slack_bot_tokens
                            or slack_bot_tokens != self.slack_bot_tokens[tenant_id]
                        ):
                            if tenant_id in self.slack_bot_tokens:
                                logger.info(
                                    f"Slack Bot tokens have changed for tenant {tenant_id} - reconnecting"
                                )
                            else:
                                search_settings = get_current_search_settings(
                                    db_session
                                )
                                embedding_model = EmbeddingModel.from_db_model(
                                    search_settings=search_settings,
                                    server_host=MODEL_SERVER_HOST,
                                    server_port=MODEL_SERVER_PORT,
                                )
                                warm_up_bi_encoder(embedding_model=embedding_model)

                            self.slack_bot_tokens[tenant_id] = slack_bot_tokens

                            if tenant_id in self.socket_clients:
                                asyncio.run(self.socket_clients[tenant_id].close())

                            self.start_socket_client(tenant_id, slack_bot_tokens)

                    except KvKeyNotFoundError:
                        logger.debug(f"Missing Slack Bot tokens for tenant {tenant_id}")
                        if tenant_id in self.socket_clients:
                            asyncio.run(self.socket_clients[tenant_id].close())
                            del self.socket_clients[tenant_id]
                            del self.slack_bot_tokens[tenant_id]
            except Exception as e:
                logger.exception(f"Error handling tenant {tenant_id}: {e}")
            finally:
                CURRENT_TENANT_ID_CONTEXTVAR.reset(token)

    def send_heartbeats(self) -> None:
        current_time = int(time.time())
        logger.debug(f"Sending heartbeats for {len(self.tenant_ids)} tenants")
        for tenant_id in self.tenant_ids:
            redis_client = get_redis_client(tenant_id=tenant_id)
            heartbeat_key = (
                f"{DanswerRedisLocks.SLACK_BOT_HEARTBEAT_PREFIX}:{self.pod_id}"
            )
            redis_client.set(
                heartbeat_key, current_time, ex=TENANT_HEARTBEAT_EXPIRATION
            )

    def start_socket_client(
        self, tenant_id: str | None, slack_bot_tokens: SlackBotTokens
    ) -> None:
        logger.info(f"Starting socket client for tenant {tenant_id}")
        socket_client = _get_socket_client(slack_bot_tokens, tenant_id)

        # Append the event handler
        socket_client.socket_mode_request_listeners.append(process_slack_event)  # type: ignore

        # Establish a WebSocket connection to the Socket Mode servers
        logger.info(f"Connecting socket client for tenant {tenant_id}")
        socket_client.connect()
        self.socket_clients[tenant_id] = socket_client
        self.tenant_ids.add(tenant_id)
        logger.info(f"Started SocketModeClient for tenant {tenant_id}")

    def stop_socket_clients(self) -> None:
        logger.info(f"Stopping {len(self.socket_clients)} socket clients")
        for tenant_id, client in self.socket_clients.items():
            asyncio.run(client.close())
            logger.info(f"Stopped SocketModeClient for tenant {tenant_id}")

    def shutdown(self, signum: int | None, frame: FrameType | None) -> None:
        if not self.running:
            return

        logger.info("Shutting down gracefully")
        self.running = False
        self._shutdown_event.set()

        # Stop all socket clients
        logger.info(f"Stopping {len(self.socket_clients)} socket clients")
        self.stop_socket_clients()

        # Wait for background threads to finish (with timeout)
        logger.info("Waiting for background threads to finish...")
        self.acquire_thread.join(timeout=5)
        self.heartbeat_thread.join(timeout=5)

        logger.info("Shutdown complete")
        sys.exit(0)


def prefilter_requests(req: SocketModeRequest, client: TenantSocketModeClient) -> bool:
    """True to keep going, False to ignore this Slack request"""
    if req.type == "events_api":
@@ -172,7 +398,7 @@ def prefilter_requests(req: SocketModeRequest, client: TenantSocketModeClient) -
            message_subtype = event.get("subtype")
            if message_subtype not in [None, "file_share"]:
                channel_specific_logger.info(
                    f"Ignoring message with subtype '{message_subtype}' since is is a special message type"
                    f"Ignoring message with subtype '{message_subtype}' since it is a special message type"
                )
                return False

@@ -247,7 +473,7 @@ def process_feedback(req: SocketModeRequest, client: TenantSocketModeClient) ->
    )

    query_event_id, _, _ = decompose_action_id(feedback_id)
    logger.notice(f"Successfully handled QA feedback for event: {query_event_id}")
    logger.info(f"Successfully handled QA feedback for event: {query_event_id}")


def build_request_details(
@@ -269,14 +495,14 @@ def build_request_details(
        msg = remove_danswer_bot_tag(msg, client=client.web_client)

        if DANSWER_BOT_REPHRASE_MESSAGE:
            logger.notice(f"Rephrasing Slack message. Original message: {msg}")
            logger.info(f"Rephrasing Slack message. Original message: {msg}")
            try:
                msg = rephrase_slack_message(msg)
                logger.notice(f"Rephrased message: {msg}")
                logger.info(f"Rephrased message: {msg}")
            except Exception as e:
                logger.error(f"Error while trying to rephrase the Slack message: {e}")
        else:
            logger.notice(f"Received Slack message: {msg}")
            logger.info(f"Received Slack message: {msg}")

        if tagged:
            logger.debug("User tagged DanswerBot")
@@ -477,94 +703,21 @@ def _get_socket_client(
    )


def _initialize_socket_client(socket_client: TenantSocketModeClient) -> None:
    socket_client.socket_mode_request_listeners.append(process_slack_event)  # type: ignore

    # Establish a WebSocket connection to the Socket Mode servers
    logger.notice(f"Listening for messages from Slack {socket_client.tenant_id }...")
    socket_client.connect()


# Follow the guide (https://docs.danswer.dev/slack_bot_setup) to set up
# the slack bot in your workspace, and then add the bot to any channels you want to
# try and answer questions for. Running this file will setup Danswer to listen to all
# messages in those channels and attempt to answer them. As of now, it will only respond
# to messages sent directly in the channel - it will not respond to messages sent within a
# thread.
#
# NOTE: we are using Web Sockets so that you can run this from within a firewalled VPC
# without issue.
if __name__ == "__main__":
    slack_bot_tokens: dict[str | None, SlackBotTokens] = {}
    socket_clients: dict[str | None, TenantSocketModeClient] = {}
    # Initialize the tenant handler which will manage tenant connections
    logger.info("Starting SlackbotHandler")
    tenant_handler = SlackbotHandler()

    set_is_ee_based_on_env_variable()

    logger.notice("Verifying query preprocessing (NLTK) data is downloaded")
    logger.info("Verifying query preprocessing (NLTK) data is downloaded")
    download_nltk_data()

    while True:
        try:
            tenant_ids = get_all_tenant_ids()  # Function to retrieve all tenant IDs
    try:
        # Keep the main thread alive
        while tenant_handler.running:
            time.sleep(1)

            for tenant_id in tenant_ids:
                with get_session_with_tenant(tenant_id) as db_session:
                    try:
                        token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id or "public")
                        latest_slack_bot_tokens = fetch_tokens()
                        CURRENT_TENANT_ID_CONTEXTVAR.reset(token)

                        if (
                            tenant_id not in slack_bot_tokens
                            or latest_slack_bot_tokens != slack_bot_tokens[tenant_id]
                        ):
                            if tenant_id in slack_bot_tokens:
                                logger.notice(
                                    f"Slack Bot tokens have changed for tenant {tenant_id} - reconnecting"
                                )
                            else:
                                # Initial setup for this tenant
                                search_settings = get_current_search_settings(
                                    db_session
                                )
                                embedding_model = EmbeddingModel.from_db_model(
                                    search_settings=search_settings,
                                    server_host=MODEL_SERVER_HOST,
                                    server_port=MODEL_SERVER_PORT,
                                )
                                warm_up_bi_encoder(embedding_model=embedding_model)

                            slack_bot_tokens[tenant_id] = latest_slack_bot_tokens

                            # potentially may cause a message to be dropped, but it is complicated
                            # to avoid + (1) if the user is changing tokens, they are likely okay with some
                            # "migration downtime" and (2) if a single message is lost it is okay
                            # as this should be a very rare occurrence
                            if tenant_id in socket_clients:
                                socket_clients[tenant_id].close()

                            socket_client = _get_socket_client(
                                latest_slack_bot_tokens, tenant_id
                            )

                            # Initialize socket client for this tenant. Each tenant has its own
                            # socket client, allowing for multiple concurrent connections (one
                            # per tenant) with the tenant ID wrapped in the socket model client.
                            # Each `connect` stores websocket connection in a separate thread.
                            _initialize_socket_client(socket_client)

                            socket_clients[tenant_id] = socket_client

                    except KvKeyNotFoundError:
                        logger.debug(f"Missing Slack Bot tokens for tenant {tenant_id}")
                        if tenant_id in socket_clients:
                            socket_clients[tenant_id].disconnect()
                            del socket_clients[tenant_id]
                            del slack_bot_tokens[tenant_id]

            # Wait before checking for updates
            Event().wait(timeout=60)

        except Exception:
            logger.exception("An error occurred outside of main event loop")
            time.sleep(60)
    except Exception:
        logger.exception("Fatal error in main thread")
        tenant_handler.shutdown(None, None)
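The per-tenant coordination above boils down to a Redis SET NX EX lock plus a periodic heartbeat. A stripped-down sketch of that pattern, assuming illustrative key names and a plain redis-py client rather than the project's wrappers:

import time
import redis

TENANT_LOCK_EXPIRATION = 1800
TENANT_HEARTBEAT_EXPIRATION = 180

r = redis.Redis()  # illustrative client; the diff uses get_redis_client(tenant_id=...)

def try_acquire(tenant_id: str, pod_id: str) -> bool:
    # NX: only succeeds if no other pod holds the lock; EX: auto-releases if the pod dies
    return bool(
        r.set(f"slack_bot_lock:{tenant_id}", pod_id, nx=True, ex=TENANT_LOCK_EXPIRATION)
    )

def heartbeat(tenant_id: str, pod_id: str) -> None:
    # Refreshed on an interval shorter than the expiration so other pods can infer liveness
    r.set(
        f"slack_bot_heartbeat:{tenant_id}:{pod_id}",
        int(time.time()),
        ex=TENANT_HEARTBEAT_EXPIRATION,
    )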
@@ -5,16 +5,16 @@ from sqlalchemy import select
from sqlalchemy.orm import joinedload
from sqlalchemy.orm import Session

from danswer.auth.api_key import ApiKeyDescriptor
from danswer.auth.api_key import build_displayable_api_key
from danswer.auth.api_key import generate_api_key
from danswer.auth.api_key import hash_api_key
from danswer.configs.constants import DANSWER_API_KEY_DUMMY_EMAIL_DOMAIN
from danswer.configs.constants import DANSWER_API_KEY_PREFIX
from danswer.configs.constants import UNNAMED_KEY_PLACEHOLDER
from danswer.db.models import ApiKey
from danswer.db.models import User
from ee.danswer.auth.api_key import ApiKeyDescriptor
from ee.danswer.auth.api_key import build_displayable_api_key
from ee.danswer.auth.api_key import generate_api_key
from ee.danswer.auth.api_key import hash_api_key
from ee.danswer.server.api_key.models import APIKeyArgs
from danswer.server.api_key.models import APIKeyArgs
from shared_configs.configs import MULTI_TENANT
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
@@ -14,6 +14,7 @@ from sqlalchemy.orm import Session

from danswer.auth.invited_users import get_invited_users
from danswer.auth.schemas import UserRole
from danswer.db.api_key import get_api_key_email_pattern
from danswer.db.engine import get_async_session
from danswer.db.engine import get_async_session_with_tenant
from danswer.db.models import AccessToken
@@ -22,7 +23,6 @@ from danswer.db.models import User
from danswer.utils.variable_functionality import (
    fetch_versioned_implementation_with_fallback,
)
from ee.danswer.db.api_key import get_api_key_email_pattern


def get_default_admin_user_emails() -> list[str]:
@@ -25,8 +25,8 @@ from danswer.db.models import UserGroup__ConnectorCredentialPair
from danswer.db.models import UserRole
from danswer.server.models import StatusResponse
from danswer.utils.logger import setup_logger
from ee.danswer.db.external_perm import delete_user__ext_group_for_cc_pair__no_commit
from ee.danswer.external_permissions.sync_params import check_if_valid_sync_source
from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop


logger = setup_logger()

@@ -351,7 +351,11 @@ def add_credential_to_connector(
        raise HTTPException(status_code=404, detail="Connector does not exist")

    if access_type == AccessType.SYNC:
        if not check_if_valid_sync_source(connector.source):
        if not fetch_ee_implementation_or_noop(
            "danswer.external_permissions.sync_params",
            "check_if_valid_sync_source",
            noop_return_value=True,
        )(connector.source):
            raise HTTPException(
                status_code=400,
                detail=f"Connector of type {connector.source} does not support SYNC access type",
@@ -438,7 +442,10 @@ def remove_credential_from_connector(
    )

    if association is not None:
        delete_user__ext_group_for_cc_pair__no_commit(
        fetch_ee_implementation_or_noop(
            "danswer.db.external_perm",
            "delete_user__ext_group_for_cc_pair__no_commit",
        )(
            db_session=db_session,
            cc_pair_id=association.id,
        )
@@ -10,9 +10,8 @@ from sqlalchemy.sql.expression import or_

from danswer.auth.schemas import UserRole
from danswer.configs.constants import DocumentSource
from danswer.configs.constants import KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY
from danswer.connectors.gmail.constants import (
    GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY,
from danswer.connectors.google_utils.shared_constants import (
    DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY,
)
from danswer.db.models import ConnectorCredentialPair
from danswer.db.models import Credential
@@ -422,25 +421,15 @@ def cleanup_google_drive_credentials(db_session: Session) -> None:
    db_session.commit()


def delete_gmail_service_account_credentials(
    user: User | None, db_session: Session
def delete_service_account_credentials(
    user: User | None, db_session: Session, source: DocumentSource
) -> None:
    credentials = fetch_credentials(db_session=db_session, user=user)
    for credential in credentials:
        if credential.credential_json.get(
            GMAIL_DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY
        if (
            credential.credential_json.get(DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY)
            and credential.source == source
        ):
            db_session.delete(credential)

    db_session.commit()


def delete_google_drive_service_account_credentials(
    user: User | None, db_session: Session
) -> None:
    credentials = fetch_credentials(db_session=db_session, user=user)
    for credential in credentials:
        if credential.credential_json.get(KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY):
            db_session.delete(credential)

    db_session.commit()
@@ -29,6 +29,7 @@ from danswer.configs.app_configs import POSTGRES_API_SERVER_POOL_OVERFLOW
from danswer.configs.app_configs import POSTGRES_API_SERVER_POOL_SIZE
from danswer.configs.app_configs import POSTGRES_DB
from danswer.configs.app_configs import POSTGRES_HOST
from danswer.configs.app_configs import POSTGRES_IDLE_SESSIONS_TIMEOUT
from danswer.configs.app_configs import POSTGRES_PASSWORD
from danswer.configs.app_configs import POSTGRES_POOL_PRE_PING
from danswer.configs.app_configs import POSTGRES_POOL_RECYCLE
@@ -309,8 +310,12 @@ async def get_async_session_with_tenant(
    try:
        # Set the search_path to the tenant's schema
        await session.execute(text(f'SET search_path = "{tenant_id}"'))
    except Exception as e:
        logger.error(f"Error setting search_path: {str(e)}")
        if POSTGRES_IDLE_SESSIONS_TIMEOUT:
            await session.execute(
                f"SET SESSION idle_in_transaction_session_timeout = {POSTGRES_IDLE_SESSIONS_TIMEOUT}"
            )
    except Exception:
        logger.exception("Error setting search_path.")
        # You can choose to re-raise the exception or handle it
        # Here, we'll re-raise to prevent proceeding with an incorrect session
        raise
@@ -318,24 +323,38 @@ async def get_async_session_with_tenant(
        yield session


@contextmanager
def get_session_with_default_tenant() -> Generator[Session, None, None]:
    """
    Get a database session using the current tenant ID from the context variable.
    """
    tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get()
    with get_session_with_tenant(tenant_id) as session:
        yield session


@contextmanager
def get_session_with_tenant(
    tenant_id: str | None = None,
) -> Generator[Session, None, None]:
    """
    Generate a database session bound to a connection with the appropriate tenant schema set.
    This preserves the tenant ID across the session and reverts to the previous tenant ID
    after the session is closed.
    Generate a database session for a specific tenant.

    This function:
    1. Sets the database schema to the specified tenant's schema.
    2. Preserves the tenant ID across the session.
    3. Reverts to the previous tenant ID after the session is closed.
    4. Uses the default schema if no tenant ID is provided.
    """
    engine = get_sqlalchemy_engine()

    # Store the previous tenant ID
    previous_tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get()
    previous_tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get() or POSTGRES_DEFAULT_SCHEMA

    if tenant_id is None:
        tenant_id = previous_tenant_id
    else:
        CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
        tenant_id = POSTGRES_DEFAULT_SCHEMA

    CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)

    event.listen(engine, "checkout", set_search_path_on_checkout)

@@ -352,6 +371,10 @@ def get_session_with_tenant(
        cursor = dbapi_connection.cursor()
        try:
            cursor.execute(f'SET search_path = "{tenant_id}"')
            if POSTGRES_IDLE_SESSIONS_TIMEOUT:
                cursor.execute(
                    f"SET SESSION idle_in_transaction_session_timeout = {POSTGRES_IDLE_SESSIONS_TIMEOUT}"
                )
        finally:
            cursor.close()
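A small sketch of how callers are expected to use the two session helpers above (the tenant id and query below are placeholders, not values from the diff):

from sqlalchemy import text

from danswer.db.engine import get_session_with_default_tenant, get_session_with_tenant

# Explicit tenant: the connection's search_path is set to that tenant's schema
with get_session_with_tenant("tenant_abc") as db_session:  # "tenant_abc" is a placeholder id
    db_session.execute(text("SELECT 1"))

# No argument: falls back to CURRENT_TENANT_ID_CONTEXTVAR (or the default schema)
with get_session_with_default_tenant() as db_session:
    db_session.execute(text("SELECT 1"))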
@@ -219,7 +219,7 @@ def mark_attempt_partially_succeeded(


def mark_attempt_failed(
    index_attempt: IndexAttempt,
    index_attempt_id: int,
    db_session: Session,
    failure_reason: str = "Unknown",
    full_exception_trace: str | None = None,
@@ -227,7 +227,7 @@ def mark_attempt_failed(
    try:
        attempt = db_session.execute(
            select(IndexAttempt)
            .where(IndexAttempt.id == index_attempt.id)
            .where(IndexAttempt.id == index_attempt_id)
            .with_for_update()
        ).scalar_one()
@@ -135,6 +135,9 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
    hidden_assistants: Mapped[list[int]] = mapped_column(
        postgresql.JSONB(), nullable=False, default=[]
    )
    recent_assistants: Mapped[list[dict]] = mapped_column(
        postgresql.JSONB(), nullable=False, default=list, server_default="[]"
    )

    oidc_expiry: Mapped[datetime.datetime] = mapped_column(
        TIMESTAMPAware(timezone=True), nullable=True
@@ -1321,7 +1324,6 @@ class StarterMessage(TypedDict):
    in Postgres"""

    name: str
    description: str
    message: str
@@ -12,7 +12,7 @@ from danswer.configs.model_configs import NORMALIZE_EMBEDDINGS
from danswer.configs.model_configs import OLD_DEFAULT_DOCUMENT_ENCODER_MODEL
from danswer.configs.model_configs import OLD_DEFAULT_MODEL_DOC_EMBEDDING_DIM
from danswer.configs.model_configs import OLD_DEFAULT_MODEL_NORMALIZE_EMBEDDINGS
from danswer.db.engine import get_session_with_tenant
from danswer.db.engine import get_session_with_default_tenant
from danswer.db.llm import fetch_embedding_provider
from danswer.db.models import CloudEmbeddingProvider
from danswer.db.models import IndexAttempt
@@ -152,7 +152,7 @@ def get_all_search_settings(db_session: Session) -> list[SearchSettings]:

def get_multilingual_expansion(db_session: Session | None = None) -> list[str]:
    if db_session is None:
        with get_session_with_tenant() as db_session:
        with get_session_with_default_tenant() as db_session:
            search_settings = get_current_search_settings(db_session)
    else:
        search_settings = get_current_search_settings(db_session)
111 backend/danswer/db/token_limit.py Normal file
@@ -0,0 +1,111 @@
from collections.abc import Sequence

from sqlalchemy import select
from sqlalchemy.orm import Session

from danswer.configs.constants import TokenRateLimitScope
from danswer.db.models import TokenRateLimit
from danswer.db.models import TokenRateLimit__UserGroup
from danswer.server.token_rate_limits.models import TokenRateLimitArgs


def fetch_all_user_token_rate_limits(
    db_session: Session,
    enabled_only: bool = False,
    ordered: bool = True,
) -> Sequence[TokenRateLimit]:
    query = select(TokenRateLimit).where(
        TokenRateLimit.scope == TokenRateLimitScope.USER
    )

    if enabled_only:
        query = query.where(TokenRateLimit.enabled.is_(True))

    if ordered:
        query = query.order_by(TokenRateLimit.created_at.desc())

    return db_session.scalars(query).all()


def fetch_all_global_token_rate_limits(
    db_session: Session,
    enabled_only: bool = False,
    ordered: bool = True,
) -> Sequence[TokenRateLimit]:
    query = select(TokenRateLimit).where(
        TokenRateLimit.scope == TokenRateLimitScope.GLOBAL
    )

    if enabled_only:
        query = query.where(TokenRateLimit.enabled.is_(True))

    if ordered:
        query = query.order_by(TokenRateLimit.created_at.desc())

    token_rate_limits = db_session.scalars(query).all()
    return token_rate_limits


def insert_user_token_rate_limit(
    db_session: Session,
    token_rate_limit_settings: TokenRateLimitArgs,
) -> TokenRateLimit:
    token_limit = TokenRateLimit(
        enabled=token_rate_limit_settings.enabled,
        token_budget=token_rate_limit_settings.token_budget,
        period_hours=token_rate_limit_settings.period_hours,
        scope=TokenRateLimitScope.USER,
    )
    db_session.add(token_limit)
    db_session.commit()

    return token_limit


def insert_global_token_rate_limit(
    db_session: Session,
    token_rate_limit_settings: TokenRateLimitArgs,
) -> TokenRateLimit:
    token_limit = TokenRateLimit(
        enabled=token_rate_limit_settings.enabled,
        token_budget=token_rate_limit_settings.token_budget,
        period_hours=token_rate_limit_settings.period_hours,
        scope=TokenRateLimitScope.GLOBAL,
    )
    db_session.add(token_limit)
    db_session.commit()

    return token_limit


def update_token_rate_limit(
    db_session: Session,
    token_rate_limit_id: int,
    token_rate_limit_settings: TokenRateLimitArgs,
) -> TokenRateLimit:
    token_limit = db_session.get(TokenRateLimit, token_rate_limit_id)
    if token_limit is None:
        raise ValueError(f"TokenRateLimit with id '{token_rate_limit_id}' not found")

    token_limit.enabled = token_rate_limit_settings.enabled
    token_limit.token_budget = token_rate_limit_settings.token_budget
    token_limit.period_hours = token_rate_limit_settings.period_hours
    db_session.commit()

    return token_limit


def delete_token_rate_limit(
    db_session: Session,
    token_rate_limit_id: int,
) -> None:
    token_limit = db_session.get(TokenRateLimit, token_rate_limit_id)
    if token_limit is None:
        raise ValueError(f"TokenRateLimit with id '{token_rate_limit_id}' not found")

    db_session.query(TokenRateLimit__UserGroup).filter(
        TokenRateLimit__UserGroup.rate_limit_id == token_rate_limit_id
    ).delete()

    db_session.delete(token_limit)
    db_session.commit()
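A brief usage sketch of the new token-limit helpers (the budget and period values are illustrative, and db_session is assumed to come from one of the session helpers shown earlier):

from danswer.db.token_limit import (
    fetch_all_user_token_rate_limits,
    insert_user_token_rate_limit,
)
from danswer.server.token_rate_limits.models import TokenRateLimitArgs

# Create a per-user limit of 100k tokens per 24-hour window (illustrative values)
settings = TokenRateLimitArgs(enabled=True, token_budget=100_000, period_hours=24)
limit = insert_user_token_rate_limit(db_session, settings)

# Later, list only the enabled user-scoped limits, newest first
active_limits = fetch_all_user_token_rate_limits(db_session, enabled_only=True)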
@@ -147,7 +147,7 @@ class VespaIndex(DocumentIndex):
        return None

    deploy_url = f"{VESPA_APPLICATION_ENDPOINT}/tenant/default/prepareandactivate"
    logger.info(f"Deploying Vespa application package to {deploy_url}")
    logger.notice(f"Deploying Vespa application package to {deploy_url}")

    vespa_schema_path = os.path.join(
        os.getcwd(), "danswer", "document_index", "vespa", "app_config"
@@ -109,11 +109,10 @@ def translate_danswer_msg_to_langchain(
    files: list[InMemoryChatFile] = []

    # If the message is a `ChatMessage`, it doesn't have the downloaded files
    # attached. Just ignore them for now. Also, OpenAI doesn't allow files to
    # be attached to AI messages, so we must remove them
    if not isinstance(msg, ChatMessage) and msg.message_type != MessageType.ASSISTANT:
    # attached. Just ignore them for now.
    if not isinstance(msg, ChatMessage):
        files = msg.files
    content = build_content_with_imgs(msg.message, files)
    content = build_content_with_imgs(msg.message, files, message_type=msg.message_type)

    if msg.message_type == MessageType.SYSTEM:
        raise ValueError("System messages are not currently part of history")
@@ -188,10 +187,19 @@ def build_content_with_imgs(
    message: str,
    files: list[InMemoryChatFile] | None = None,
    img_urls: list[str] | None = None,
    message_type: MessageType = MessageType.USER,
) -> str | list[str | dict[str, Any]]:  # matching Langchain's BaseMessage content type
    files = files or []
    img_files = [file for file in files if file.file_type == ChatFileType.IMAGE]

    # Only include image files for user messages
    img_files = (
        [file for file in files if file.file_type == ChatFileType.IMAGE]
        if message_type == MessageType.USER
        else []
    )

    img_urls = img_urls or []

    message_main_content = _build_content(message, files)

    if not img_files and not img_urls:
@@ -25,6 +25,7 @@ from danswer.auth.schemas import UserCreate
from danswer.auth.schemas import UserRead
from danswer.auth.schemas import UserUpdate
from danswer.auth.users import auth_backend
from danswer.auth.users import BasicAuthenticationError
from danswer.auth.users import fastapi_users
from danswer.configs.app_configs import APP_API_PREFIX
from danswer.configs.app_configs import APP_HOST
@@ -194,7 +195,12 @@ async def lifespan(app: FastAPI) -> AsyncGenerator:

def log_http_error(_: Request, exc: Exception) -> JSONResponse:
    status_code = getattr(exc, "status_code", 500)
    if status_code >= 400:

    if isinstance(exc, BasicAuthenticationError):
        # For BasicAuthenticationError, just log a brief message without stack trace (almost always spam)
        logger.error(f"Authentication failed: {str(exc)}")

    elif status_code >= 400:
        error_msg = f"{str(exc)}\n"
        error_msg += "".join(traceback.format_tb(exc.__traceback__))
        logger.error(error_msg)
@@ -220,7 +226,6 @@ def get_application() -> FastAPI:
    else:
        logger.debug("Sentry DSN not provided, skipping Sentry initialization")

    # Add the custom exception handler
    application.add_exception_handler(status.HTTP_400_BAD_REQUEST, log_http_error)
    application.add_exception_handler(status.HTTP_401_UNAUTHORIZED, log_http_error)
    application.add_exception_handler(status.HTTP_403_FORBIDDEN, log_http_error)
@@ -277,12 +282,14 @@ def get_application() -> FastAPI:
        prefix="/auth",
        tags=["auth"],
    )

    include_router_with_global_prefix_prepended(
        application,
        fastapi_users.get_register_router(UserRead, UserCreate),
        prefix="/auth",
        tags=["auth"],
    )

    include_router_with_global_prefix_prepended(
        application,
        fastapi_users.get_reset_password_router(),
@@ -35,23 +35,31 @@ class BaseTokenizer(ABC):
class TiktokenTokenizer(BaseTokenizer):
    _instances: dict[str, "TiktokenTokenizer"] = {}

    def __new__(cls, encoding_name: str = "cl100k_base") -> "TiktokenTokenizer":
        if encoding_name not in cls._instances:
            cls._instances[encoding_name] = super(TiktokenTokenizer, cls).__new__(cls)
        return cls._instances[encoding_name]
    def __new__(cls, model_name: str) -> "TiktokenTokenizer":
        if model_name not in cls._instances:
            cls._instances[model_name] = super(TiktokenTokenizer, cls).__new__(cls)
        return cls._instances[model_name]

    def __init__(self, encoding_name: str = "cl100k_base"):
    def __init__(self, model_name: str):
        if not hasattr(self, "encoder"):
            import tiktoken

            self.encoder = tiktoken.get_encoding(encoding_name)
            self.encoder = tiktoken.encoding_for_model(model_name)

    def encode(self, string: str) -> list[int]:
        # this returns no special tokens
        # this ignores special tokens that the model is trained on, see encode_ordinary for details
        return self.encoder.encode_ordinary(string)

    def tokenize(self, string: str) -> list[str]:
        return [self.encoder.decode([token]) for token in self.encode(string)]
        encoded = self.encode(string)
        decoded = [self.encoder.decode([token]) for token in encoded]

        if len(decoded) != len(encoded):
            logger.warning(
                f"OpenAI tokenized length {len(decoded)} does not match encoded length {len(encoded)} for string: {string}"
            )

        return decoded

    def decode(self, tokens: list[int]) -> str:
        return self.encoder.decode(tokens)
@@ -74,22 +82,35 @@ class HuggingFaceTokenizer(BaseTokenizer):
        return self.encoder.decode(tokens)


_TOKENIZER_CACHE: dict[str, BaseTokenizer] = {}
_TOKENIZER_CACHE: dict[tuple[EmbeddingProvider | None, str | None], BaseTokenizer] = {}


def _check_tokenizer_cache(tokenizer_name: str) -> BaseTokenizer:
def _check_tokenizer_cache(
    model_provider: EmbeddingProvider | None, model_name: str | None
) -> BaseTokenizer:
    global _TOKENIZER_CACHE

    if tokenizer_name not in _TOKENIZER_CACHE:
        if tokenizer_name == "openai":
            _TOKENIZER_CACHE[tokenizer_name] = TiktokenTokenizer("cl100k_base")
            return _TOKENIZER_CACHE[tokenizer_name]
    id_tuple = (model_provider, model_name)

    if id_tuple not in _TOKENIZER_CACHE:
        if model_provider in [EmbeddingProvider.OPENAI, EmbeddingProvider.AZURE]:
            if model_name is None:
                raise ValueError(
                    "model_name is required for OPENAI and AZURE embeddings"
                )

            _TOKENIZER_CACHE[id_tuple] = TiktokenTokenizer(model_name)
            return _TOKENIZER_CACHE[id_tuple]

        try:
            logger.debug(f"Initializing HuggingFaceTokenizer for: {tokenizer_name}")
            _TOKENIZER_CACHE[tokenizer_name] = HuggingFaceTokenizer(tokenizer_name)
            if model_name is None:
                model_name = DOCUMENT_ENCODER_MODEL

            logger.debug(f"Initializing HuggingFaceTokenizer for: {model_name}")
            _TOKENIZER_CACHE[id_tuple] = HuggingFaceTokenizer(model_name)
        except Exception as primary_error:
            logger.error(
                f"Error initializing HuggingFaceTokenizer for {tokenizer_name}: {primary_error}"
                f"Error initializing HuggingFaceTokenizer for {model_name}: {primary_error}"
            )
            logger.warning(
                f"Falling back to default embedding model: {DOCUMENT_ENCODER_MODEL}"
@@ -98,7 +119,7 @@ def _check_tokenizer_cache(tokenizer_name: str) -> BaseTokenizer:
            try:
                # Cache this tokenizer name to the default so we don't have to try to load it again
                # and fail again
                _TOKENIZER_CACHE[tokenizer_name] = HuggingFaceTokenizer(
                _TOKENIZER_CACHE[id_tuple] = HuggingFaceTokenizer(
                    DOCUMENT_ENCODER_MODEL
                )
            except Exception as fallback_error:
@@ -106,10 +127,10 @@ def _check_tokenizer_cache(tokenizer_name: str) -> BaseTokenizer:
                    f"Error initializing fallback HuggingFaceTokenizer: {fallback_error}"
                )
                raise ValueError(
                    f"Failed to initialize tokenizer for {tokenizer_name} and fallback model"
                    f"Failed to initialize tokenizer for {model_name} and fallback model"
                ) from fallback_error

    return _TOKENIZER_CACHE[tokenizer_name]
    return _TOKENIZER_CACHE[id_tuple]


_DEFAULT_TOKENIZER: BaseTokenizer = HuggingFaceTokenizer(DOCUMENT_ENCODER_MODEL)
@@ -118,11 +139,16 @@ _DEFAULT_TOKENIZER: BaseTokenizer = HuggingFaceTokenizer(DOCUMENT_ENCODER_MODEL)
def get_tokenizer(
    model_name: str | None, provider_type: EmbeddingProvider | str | None
) -> BaseTokenizer:
    # Currently all of the viable models use the same sentencepiece tokenizer
    # OpenAI uses a different one but currently it's not supported due to quality issues
    # the inconsistent chunking makes using the sentencepiece tokenizer default better for now
    # LLM tokenizers are specified by strings
    global _DEFAULT_TOKENIZER
    if provider_type is not None:
        if isinstance(provider_type, str):
            try:
                provider_type = EmbeddingProvider(provider_type)
            except ValueError:
                logger.debug(
                    f"Invalid provider_type '{provider_type}'. Falling back to default tokenizer."
                )
                return _DEFAULT_TOKENIZER
        return _check_tokenizer_cache(provider_type, model_name)
    return _DEFAULT_TOKENIZER
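A hedged usage sketch of the reworked tokenizer cache; the import paths and the model name below are assumptions for illustration, since the diff does not show where get_tokenizer or EmbeddingProvider are defined:

# Assumed import locations, shown only to make the example self-contained
from danswer.natural_language_processing.utils import get_tokenizer
from shared_configs.enums import EmbeddingProvider

# Provider-backed model: routed to TiktokenTokenizer, cached under (provider, model_name)
openai_tok = get_tokenizer(
    model_name="text-embedding-3-small",  # illustrative model name
    provider_type=EmbeddingProvider.OPENAI,
)

# No provider: falls back to the HuggingFace tokenizer for DOCUMENT_ENCODER_MODEL
default_tok = get_tokenizer(model_name=None, provider_type=None)

print(len(openai_tok.encode("hello world")))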
@@ -65,7 +65,7 @@ from danswer.tools.tool_implementations.search.search_tool import (
from danswer.tools.tool_runner import ToolCallKickoff
from danswer.utils.logger import setup_logger
from danswer.utils.timing import log_generator_function_time
from ee.danswer.server.query_and_chat.utils import create_temporary_persona
from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop

logger = setup_logger()

@@ -125,11 +125,11 @@ def stream_answer_objects(
    )

    temporary_persona: Persona | None = None

    if query_req.persona_config is not None:
        new_persona = create_temporary_persona(
            db_session=db_session, persona_config=query_req.persona_config, user=user
        )
        temporary_persona = new_persona
        temporary_persona = fetch_ee_implementation_or_noop(
            "danswer.server.query_and_chat.utils", "create_temporary_persona", None
        )(db_session=db_session, persona_config=query_req.persona_config, user=user)

    persona = temporary_persona if temporary_persona else chat_session.persona

@@ -253,7 +253,7 @@ def stream_answer_objects(
        return_contexts=query_req.return_contexts,
        skip_gen_ai_answer_generation=query_req.skip_gen_ai_answer_generation,
    )
    # won't be any ImageGenerationDisplay responses since that tool is never passed in
    # won't be any FileChatDisplay responses since that tool is never passed in
    for packet in cast(AnswerObjectIterator, answer.processed_streamed_output):
        # for one-shot flow, don't currently do anything with these
        if isinstance(packet, ToolResponse):
@@ -13,6 +13,10 @@ from danswer.prompts.chat_prompts import ADDITIONAL_INFO
from danswer.prompts.chat_prompts import CITATION_REMINDER
from danswer.prompts.constants import CODE_BLOCK_PAT
from danswer.search.models import InferenceChunk
from danswer.utils.logger import setup_logger


logger = setup_logger()


MOST_BASIC_PROMPT = "You are a helpful AI assistant."
@@ -136,14 +140,23 @@ def find_last_index(lst: list[int], max_prompt_tokens: int) -> int:
    before the list exceeds the maximum"""
    running_sum = 0

    if not lst:
        logger.warning("Empty message history passed to find_last_index")
        return 0

    last_ind = 0
    for i in range(len(lst) - 1, -1, -1):
        running_sum += lst[i] + _PER_MESSAGE_TOKEN_BUFFER
        if running_sum > max_prompt_tokens:
            last_ind = i + 1
            break

    if last_ind >= len(lst):
        logger.error(
            f"Last message alone is too large! max_prompt_tokens: {max_prompt_tokens}, message_token_counts: {lst}"
        )
        raise ValueError("Last message alone is too large!")

    return last_ind
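A quick worked example of the truncation logic above, with illustrative numbers and _PER_MESSAGE_TOKEN_BUFFER treated as 0 for simplicity:

# Illustrative only: message token counts, oldest message first
message_token_counts = [400, 300, 200, 100]
# With max_prompt_tokens=350, summing from the end gives 100, then 300, then 600 > 350,
# so find_last_index returns 2 and only the last two messages (200 + 100 tokens) are kept.
# If even the final message exceeded the budget, last_ind would reach len(lst) and the
# ValueError branch above would fire.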
@@ -3,15 +3,15 @@ from fastapi import Depends
from sqlalchemy.orm import Session

from danswer.auth.users import current_admin_user
from danswer.db.api_key import ApiKeyDescriptor
from danswer.db.api_key import fetch_api_keys
from danswer.db.api_key import insert_api_key
from danswer.db.api_key import regenerate_api_key
from danswer.db.api_key import remove_api_key
from danswer.db.api_key import update_api_key
from danswer.db.engine import get_session
from danswer.db.models import User
from ee.danswer.db.api_key import ApiKeyDescriptor
from ee.danswer.db.api_key import fetch_api_keys
from ee.danswer.db.api_key import insert_api_key
from ee.danswer.db.api_key import regenerate_api_key
from ee.danswer.db.api_key import remove_api_key
from ee.danswer.db.api_key import update_api_key
from ee.danswer.server.api_key.models import APIKeyArgs
from danswer.server.api_key.models import APIKeyArgs


router = APIRouter(prefix="/admin/api-key")
@@ -10,8 +10,7 @@ from danswer.auth.users import current_user
from danswer.auth.users import current_user_with_expired_token
from danswer.configs.app_configs import APP_API_PREFIX
from danswer.server.danswer_api.ingestion import api_key_dep
from ee.danswer.auth.users import current_cloud_superuser
from ee.danswer.server.tenants.access import control_plane_dep
from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop


PUBLIC_ENDPOINT_SPECS = [
@@ -81,6 +80,14 @@ def check_router_auth(
    (1) have auth enabled OR
    (2) are explicitly marked as a public endpoint
    """

    control_plane_dep = fetch_ee_implementation_or_noop(
        "danswer.server.tenants.access", "control_plane_dep"
    )
    current_cloud_superuser = fetch_ee_implementation_or_noop(
        "danswer.auth.users", "current_cloud_superuser"
    )

    for route in application.routes:
        # explicitly marked as public
        if is_route_in_spec_list(route, public_endpoint_specs):
@@ -3,6 +3,7 @@ from fastapi import Depends
from fastapi import HTTPException
from sqlalchemy.orm import Session

from danswer.auth.users import api_key_dep
from danswer.configs.constants import DocumentSource
from danswer.connectors.models import Document
from danswer.connectors.models import IndexAttemptMetadata
@@ -22,7 +23,6 @@ from danswer.server.danswer_api.models import DocMinimalInfo
from danswer.server.danswer_api.models import IngestionDocument
from danswer.server.danswer_api.models import IngestionResult
from danswer.utils.logger import setup_logger
from ee.danswer.auth.users import api_key_dep

logger = setup_logger()
@@ -16,6 +16,9 @@ from danswer.background.celery.tasks.pruning.tasks import (
    try_creating_prune_generator_task,
)
from danswer.background.celery.versioned_apps.primary import app as primary_app
from danswer.background.task_name_builders import (
    name_sync_external_doc_permissions_task,
)
from danswer.db.connector_credential_pair import add_credential_to_connector
from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id
from danswer.db.connector_credential_pair import remove_credential_from_connector
@@ -47,11 +50,7 @@ from danswer.server.documents.models import ConnectorCredentialPairMetadata
from danswer.server.documents.models import PaginatedIndexAttempts
from danswer.server.models import StatusResponse
from danswer.utils.logger import setup_logger
from ee.danswer.background.task_name_builders import (
    name_sync_external_doc_permissions_task,
)
from ee.danswer.db.user_group import validate_user_creation_permissions

from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop

logger = setup_logger()
router = APIRouter(prefix="/manage")
@@ -332,9 +331,6 @@ def sync_cc_pair(
    db_session: Session = Depends(get_session),
) -> StatusResponse[list[int]]:
    # avoiding circular refs
    from ee.danswer.background.celery.apps.primary import (
        sync_external_doc_permissions_task,
    )

    cc_pair = get_connector_credential_pair_from_id(
        cc_pair_id=cc_pair_id,
@@ -360,12 +356,19 @@ def sync_cc_pair(
    )

    logger.info(f"Syncing the {cc_pair.connector.name} connector.")
    sync_external_doc_permissions_task.apply_async(
        kwargs=dict(
            cc_pair_id=cc_pair_id, tenant_id=CURRENT_TENANT_ID_CONTEXTVAR.get()
        ),
    sync_external_doc_permissions_task = fetch_ee_implementation_or_noop(
        "danswer.background.celery.apps.primary",
        "sync_external_doc_permissions_task",
        None,
    )

    if sync_external_doc_permissions_task:
        sync_external_doc_permissions_task.apply_async(
            kwargs=dict(
                cc_pair_id=cc_pair_id, tenant_id=CURRENT_TENANT_ID_CONTEXTVAR.get()
            ),
        )

    return StatusResponse(
        success=True,
        message="Successfully created the sync task.",
@@ -380,7 +383,9 @@ def associate_credential_to_connector(
    user: User | None = Depends(current_curator_or_admin_user),
    db_session: Session = Depends(get_session),
) -> StatusResponse[int]:
    validate_user_creation_permissions(
    fetch_ee_implementation_or_noop(
        "danswer.db.user_group", "validate_user_creation_permissions", None
    )(
        db_session=db_session,
        user=user,
        target_group_ids=metadata.groups,
@@ -22,35 +22,38 @@ from danswer.background.celery.versioned_apps.primary import app as primary_app
|
||||
from danswer.configs.app_configs import ENABLED_CONNECTOR_TYPES
|
||||
from danswer.configs.constants import DocumentSource
|
||||
from danswer.configs.constants import FileOrigin
|
||||
from danswer.connectors.gmail.connector_auth import delete_gmail_service_account_key
|
||||
from danswer.connectors.gmail.connector_auth import delete_google_app_gmail_cred
|
||||
from danswer.connectors.gmail.connector_auth import get_gmail_auth_url
|
||||
from danswer.connectors.gmail.connector_auth import get_gmail_service_account_key
|
||||
from danswer.connectors.gmail.connector_auth import get_google_app_gmail_cred
|
||||
from danswer.connectors.gmail.connector_auth import (
|
||||
update_gmail_credential_access_tokens,
|
||||
from danswer.connectors.google_utils.google_auth import (
|
||||
get_google_oauth_creds,
|
||||
)
|
||||
from danswer.connectors.gmail.connector_auth import (
|
||||
upsert_gmail_service_account_key,
|
||||
from danswer.connectors.google_utils.google_kv import (
|
||||
build_service_account_creds,
|
||||
)
|
||||
from danswer.connectors.gmail.connector_auth import upsert_google_app_gmail_cred
|
||||
from danswer.connectors.google_drive.connector_auth import build_service_account_creds
|
||||
from danswer.connectors.google_drive.connector_auth import DB_CREDENTIALS_DICT_TOKEN_KEY
|
||||
from danswer.connectors.google_drive.connector_auth import delete_google_app_cred
|
||||
from danswer.connectors.google_drive.connector_auth import delete_service_account_key
|
||||
from danswer.connectors.google_drive.connector_auth import get_auth_url
|
||||
from danswer.connectors.google_drive.connector_auth import get_google_app_cred
|
||||
from danswer.connectors.google_drive.connector_auth import (
|
||||
get_google_drive_creds_for_authorized_user,
|
||||
from danswer.connectors.google_utils.google_kv import (
|
||||
delete_google_app_cred,
|
||||
)
|
||||
from danswer.connectors.google_drive.connector_auth import get_service_account_key
|
||||
from danswer.connectors.google_drive.connector_auth import GOOGLE_DRIVE_SCOPES
|
||||
from danswer.connectors.google_drive.connector_auth import (
|
||||
from danswer.connectors.google_utils.google_kv import (
|
||||
delete_service_account_key,
|
||||
)
|
||||
from danswer.connectors.google_utils.google_kv import get_auth_url
|
||||
from danswer.connectors.google_utils.google_kv import (
|
||||
get_google_app_cred,
|
||||
)
|
||||
from danswer.connectors.google_utils.google_kv import (
|
||||
get_service_account_key,
|
||||
)
|
||||
from danswer.connectors.google_utils.google_kv import (
|
||||
update_credential_access_tokens,
|
||||
)
|
||||
from danswer.connectors.google_drive.connector_auth import upsert_google_app_cred
|
||||
from danswer.connectors.google_drive.connector_auth import upsert_service_account_key
|
||||
from danswer.connectors.google_drive.connector_auth import verify_csrf
|
||||
from danswer.connectors.google_utils.google_kv import (
|
||||
upsert_google_app_cred,
|
||||
)
|
||||
from danswer.connectors.google_utils.google_kv import (
|
||||
upsert_service_account_key,
|
||||
)
|
||||
from danswer.connectors.google_utils.google_kv import verify_csrf
|
||||
from danswer.connectors.google_utils.shared_constants import (
|
||||
DB_CREDENTIALS_DICT_TOKEN_KEY,
|
||||
)
|
||||
from danswer.db.connector import create_connector
|
||||
from danswer.db.connector import delete_connector
|
||||
from danswer.db.connector import fetch_connector_by_id
|
||||
@@ -64,8 +67,7 @@ from danswer.db.connector_credential_pair import get_connector_credential_pairs
|
||||
from danswer.db.credentials import cleanup_gmail_credentials
|
||||
from danswer.db.credentials import cleanup_google_drive_credentials
|
||||
from danswer.db.credentials import create_credential
|
||||
from danswer.db.credentials import delete_gmail_service_account_credentials
|
||||
from danswer.db.credentials import delete_google_drive_service_account_credentials
|
||||
from danswer.db.credentials import delete_service_account_credentials
|
||||
from danswer.db.credentials import fetch_credential_by_id
|
||||
from danswer.db.deletion_attempt import check_deletion_attempt_is_allowed
|
||||
from danswer.db.document import get_document_counts_for_cc_pairs
|
||||
@@ -106,7 +108,7 @@ from danswer.server.documents.models import ObjectCreationIdResponse
|
||||
from danswer.server.documents.models import RunConnectorRequest
|
||||
from danswer.server.models import StatusResponse
|
||||
from danswer.utils.logger import setup_logger
|
||||
from ee.danswer.db.user_group import validate_user_creation_permissions
|
||||
from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
@@ -125,7 +127,7 @@ def check_google_app_gmail_credentials_exist(
|
||||
_: User = Depends(current_curator_or_admin_user),
|
||||
) -> dict[str, str]:
|
||||
try:
|
||||
return {"client_id": get_google_app_gmail_cred().web.client_id}
|
||||
return {"client_id": get_google_app_cred(DocumentSource.GMAIL).web.client_id}
|
||||
except KvKeyNotFoundError:
|
||||
raise HTTPException(status_code=404, detail="Google App Credentials not found")
|
||||
|
||||
@@ -135,7 +137,7 @@ def upsert_google_app_gmail_credentials(
|
||||
app_credentials: GoogleAppCredentials, _: User = Depends(current_admin_user)
|
||||
) -> StatusResponse:
|
||||
try:
|
||||
upsert_google_app_gmail_cred(app_credentials)
|
||||
upsert_google_app_cred(app_credentials, DocumentSource.GMAIL)
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
@@ -150,7 +152,7 @@ def delete_google_app_gmail_credentials(
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> StatusResponse:
|
||||
try:
|
||||
delete_google_app_gmail_cred()
|
||||
delete_google_app_cred(DocumentSource.GMAIL)
|
||||
cleanup_gmail_credentials(db_session=db_session)
|
||||
except KvKeyNotFoundError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
@@ -165,7 +167,9 @@ def check_google_app_credentials_exist(
|
||||
_: User = Depends(current_curator_or_admin_user),
|
||||
) -> dict[str, str]:
|
||||
try:
|
||||
return {"client_id": get_google_app_cred().web.client_id}
|
||||
return {
|
||||
"client_id": get_google_app_cred(DocumentSource.GOOGLE_DRIVE).web.client_id
|
||||
}
|
||||
except KvKeyNotFoundError:
|
||||
raise HTTPException(status_code=404, detail="Google App Credentials not found")
|
||||
|
||||
@@ -175,7 +179,7 @@ def upsert_google_app_credentials(
|
||||
app_credentials: GoogleAppCredentials, _: User = Depends(current_admin_user)
|
||||
) -> StatusResponse:
|
||||
try:
|
||||
upsert_google_app_cred(app_credentials)
|
||||
upsert_google_app_cred(app_credentials, DocumentSource.GOOGLE_DRIVE)
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
@@ -190,7 +194,7 @@ def delete_google_app_credentials(
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> StatusResponse:
|
||||
try:
|
||||
delete_google_app_cred()
|
||||
delete_google_app_cred(DocumentSource.GOOGLE_DRIVE)
|
||||
cleanup_google_drive_credentials(db_session=db_session)
|
||||
except KvKeyNotFoundError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
@@ -205,7 +209,11 @@ def check_google_service_gmail_account_key_exist(
|
||||
_: User = Depends(current_curator_or_admin_user),
|
||||
) -> dict[str, str]:
|
||||
try:
|
||||
return {"service_account_email": get_gmail_service_account_key().client_email}
|
||||
return {
|
||||
"service_account_email": get_service_account_key(
|
||||
DocumentSource.GMAIL
|
||||
).client_email
|
||||
}
|
||||
except KvKeyNotFoundError:
|
||||
raise HTTPException(
|
||||
status_code=404, detail="Google Service Account Key not found"
|
||||
@@ -217,7 +225,7 @@ def upsert_google_service_gmail_account_key(
|
||||
service_account_key: GoogleServiceAccountKey, _: User = Depends(current_admin_user)
|
||||
) -> StatusResponse:
|
||||
try:
|
||||
upsert_gmail_service_account_key(service_account_key)
|
||||
upsert_service_account_key(service_account_key, DocumentSource.GMAIL)
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
@@ -232,7 +240,7 @@ def delete_google_service_gmail_account_key(
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> StatusResponse:
|
||||
try:
|
||||
delete_gmail_service_account_key()
|
||||
delete_service_account_key(DocumentSource.GMAIL)
|
||||
cleanup_gmail_credentials(db_session=db_session)
|
||||
except KvKeyNotFoundError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
@@ -247,7 +255,11 @@ def check_google_service_account_key_exist(
|
||||
_: User = Depends(current_curator_or_admin_user),
|
||||
) -> dict[str, str]:
|
||||
try:
|
||||
return {"service_account_email": get_service_account_key().client_email}
|
||||
return {
|
||||
"service_account_email": get_service_account_key(
|
||||
DocumentSource.GOOGLE_DRIVE
|
||||
).client_email
|
||||
}
|
||||
except KvKeyNotFoundError:
|
||||
raise HTTPException(
|
||||
status_code=404, detail="Google Service Account Key not found"
|
||||
@@ -259,7 +271,7 @@ def upsert_google_service_account_key(
|
||||
service_account_key: GoogleServiceAccountKey, _: User = Depends(current_admin_user)
|
||||
) -> StatusResponse:
|
||||
try:
|
||||
upsert_service_account_key(service_account_key)
|
||||
upsert_service_account_key(service_account_key, DocumentSource.GOOGLE_DRIVE)
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
@@ -274,7 +286,7 @@ def delete_google_service_account_key(
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> StatusResponse:
|
||||
try:
|
||||
delete_service_account_key()
|
||||
delete_service_account_key(DocumentSource.GOOGLE_DRIVE)
|
||||
cleanup_google_drive_credentials(db_session=db_session)
|
||||
except KvKeyNotFoundError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
@@ -296,13 +308,13 @@ def upsert_service_account_credential(
|
||||
try:
|
||||
credential_base = build_service_account_creds(
|
||||
DocumentSource.GOOGLE_DRIVE,
|
||||
primary_admin_email=service_account_credential_request.google_drive_primary_admin,
|
||||
primary_admin_email=service_account_credential_request.google_primary_admin,
|
||||
)
|
||||
except KvKeyNotFoundError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
# first delete all existing service account credentials
|
||||
delete_google_drive_service_account_credentials(user, db_session)
|
||||
delete_service_account_credentials(user, db_session, DocumentSource.GOOGLE_DRIVE)
|
||||
# `user=None` since this credential is not a personal credential
|
||||
credential = create_credential(
|
||||
credential_data=credential_base, user=user, db_session=db_session
|
||||
@@ -322,13 +334,13 @@ def upsert_gmail_service_account_credential(
|
||||
try:
|
||||
credential_base = build_service_account_creds(
|
||||
DocumentSource.GMAIL,
|
||||
primary_admin_email=service_account_credential_request.gmail_delegated_user,
|
||||
primary_admin_email=service_account_credential_request.google_primary_admin,
|
||||
)
|
||||
except KvKeyNotFoundError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
# first delete all existing service account credentials
|
||||
delete_gmail_service_account_credentials(user, db_session)
|
||||
delete_service_account_credentials(user, db_session, DocumentSource.GMAIL)
|
||||
# `user=None` since this credential is not a personal credential
|
||||
credential = create_credential(
|
||||
credential_data=credential_base, user=user, db_session=db_session
|
||||
@@ -349,9 +361,9 @@ def check_drive_tokens(
|
||||
):
|
||||
return AuthStatus(authenticated=False)
|
||||
token_json_str = str(db_credentials.credential_json[DB_CREDENTIALS_DICT_TOKEN_KEY])
|
||||
google_drive_creds = get_google_drive_creds_for_authorized_user(
|
||||
google_drive_creds = get_google_oauth_creds(
|
||||
token_json_str=token_json_str,
|
||||
scopes=GOOGLE_DRIVE_SCOPES,
|
||||
source=DocumentSource.GOOGLE_DRIVE,
|
||||
)
|
||||
if google_drive_creds is None:
|
||||
return AuthStatus(authenticated=False)
|
||||
@@ -646,7 +658,10 @@ def create_connector_from_model(
|
||||
) -> ObjectCreationIdResponse:
|
||||
try:
|
||||
_validate_connector_allowed(connector_data.source)
|
||||
validate_user_creation_permissions(
|
||||
|
||||
fetch_ee_implementation_or_noop(
|
||||
"danswer.db.user_group", "validate_user_creation_permissions", None
|
||||
)(
|
||||
db_session=db_session,
|
||||
user=user,
|
||||
target_group_ids=connector_data.groups,
|
||||
@@ -720,7 +735,9 @@ def update_connector_from_model(
|
||||
) -> ConnectorSnapshot | StatusResponse[int]:
|
||||
try:
|
||||
_validate_connector_allowed(connector_data.source)
|
||||
validate_user_creation_permissions(
|
||||
fetch_ee_implementation_or_noop(
|
||||
"danswer.db.user_group", "validate_user_creation_permissions", None
|
||||
)(
|
||||
db_session=db_session,
|
||||
user=user,
|
||||
target_group_ids=connector_data.groups,
|
||||
@@ -881,7 +898,7 @@ def gmail_auth(
|
||||
httponly=True,
|
||||
max_age=600,
|
||||
)
|
||||
return AuthUrl(auth_url=get_gmail_auth_url(int(credential_id)))
|
||||
return AuthUrl(auth_url=get_auth_url(int(credential_id), DocumentSource.GMAIL))
|
||||
|
||||
|
||||
@router.get("/connector/google-drive/authorize/{credential_id}")
|
||||
@@ -895,7 +912,9 @@ def google_drive_auth(
|
||||
httponly=True,
|
||||
max_age=600,
|
||||
)
|
||||
return AuthUrl(auth_url=get_auth_url(int(credential_id)))
|
||||
return AuthUrl(
|
||||
auth_url=get_auth_url(int(credential_id), DocumentSource.GOOGLE_DRIVE)
|
||||
)
|
||||
|
||||
|
||||
@router.get("/connector/gmail/callback")
|
||||
@@ -912,12 +931,10 @@ def gmail_callback(
|
||||
)
|
||||
credential_id = int(credential_id_cookie)
|
||||
verify_csrf(credential_id, callback.state)
|
||||
if (
|
||||
update_gmail_credential_access_tokens(
|
||||
callback.code, credential_id, user, db_session
|
||||
)
|
||||
is None
|
||||
):
|
||||
credentials: Credentials | None = update_credential_access_tokens(
|
||||
callback.code, credential_id, user, db_session, DocumentSource.GMAIL
|
||||
)
|
||||
if credentials is None:
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Unable to fetch Gmail access tokens"
|
||||
)
|
||||
@@ -941,7 +958,7 @@ def google_drive_callback(
|
||||
verify_csrf(credential_id, callback.state)
|
||||
|
||||
credentials: Credentials | None = update_credential_access_tokens(
|
||||
callback.code, credential_id, user, db_session
|
||||
callback.code, credential_id, user, db_session, DocumentSource.GOOGLE_DRIVE
|
||||
)
|
||||
if credentials is None:
|
||||
raise HTTPException(
|
||||
|
||||
@@ -28,7 +28,7 @@ from danswer.server.documents.models import CredentialSwapRequest
|
||||
from danswer.server.documents.models import ObjectCreationIdResponse
|
||||
from danswer.server.models import StatusResponse
|
||||
from danswer.utils.logger import setup_logger
|
||||
from ee.danswer.db.user_group import validate_user_creation_permissions
|
||||
from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
@@ -121,7 +121,9 @@ def create_credential_from_model(
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> ObjectCreationIdResponse:
|
||||
if not _ignore_credential_permissions(credential_info.source):
|
||||
validate_user_creation_permissions(
|
||||
fetch_ee_implementation_or_noop(
|
||||
"danswer.db.user_group", "validate_user_creation_permissions", None
|
||||
)(
|
||||
db_session=db_session,
|
||||
user=user,
|
||||
target_group_ids=credential_info.groups,
|
||||
|
||||
@@ -4,7 +4,6 @@ from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import Field
|
||||
from pydantic import model_validator
|
||||
|
||||
from danswer.configs.app_configs import MASK_CREDENTIAL_PREFIX
|
||||
from danswer.configs.constants import DocumentSource
|
||||
@@ -377,18 +376,7 @@ class GoogleServiceAccountKey(BaseModel):


class GoogleServiceAccountCredentialRequest(BaseModel):
    google_drive_primary_admin: str | None = None  # email of user to impersonate
    gmail_delegated_user: str | None = None  # email of user to impersonate

    @model_validator(mode="after")
    def check_user_delegation(self) -> "GoogleServiceAccountCredentialRequest":
        if (self.google_drive_primary_admin is None) == (
            self.gmail_delegated_user is None
        ):
            raise ValueError(
                "Exactly one of google_drive_primary_admin or gmail_delegated_user must be set"
            )
        return self
    google_primary_admin: str | None = None  # email of user to impersonate


class FileUploadResponse(BaseModel):
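With the validator removed, Drive and Gmail service-account credential requests share the single google_primary_admin field. A hypothetical construction of the simplified model (the email address is illustrative only):

# Hypothetical payload under the simplified model; the email is made up.
service_account_request = GoogleServiceAccountCredentialRequest(
    google_primary_admin="workspace-admin@example.com"  # user to impersonate
)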
@@ -18,7 +18,7 @@ from danswer.server.features.document_set.models import CheckDocSetPublicRespons
|
||||
from danswer.server.features.document_set.models import DocumentSet
|
||||
from danswer.server.features.document_set.models import DocumentSetCreationRequest
|
||||
from danswer.server.features.document_set.models import DocumentSetUpdateRequest
|
||||
from ee.danswer.db.user_group import validate_user_creation_permissions
|
||||
from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop
|
||||
|
||||
|
||||
router = APIRouter(prefix="/manage")
|
||||
@@ -30,7 +30,9 @@ def create_document_set(
|
||||
user: User = Depends(current_curator_or_admin_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> int:
|
||||
validate_user_creation_permissions(
|
||||
fetch_ee_implementation_or_noop(
|
||||
"danswer.db.user_group", "validate_user_creation_permissions", None
|
||||
)(
|
||||
db_session=db_session,
|
||||
user=user,
|
||||
target_group_ids=document_set_creation_request.groups,
|
||||
@@ -53,7 +55,9 @@ def patch_document_set(
|
||||
user: User = Depends(current_curator_or_admin_user),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> None:
|
||||
validate_user_creation_permissions(
|
||||
fetch_ee_implementation_or_noop(
|
||||
"danswer.db.user_group", "validate_user_creation_permissions", None
|
||||
)(
|
||||
db_session=db_session,
|
||||
user=user,
|
||||
target_group_ids=document_set_update_request.groups,
|
||||
|
||||
@@ -42,7 +42,7 @@ class UserPreferences(BaseModel):
|
||||
chosen_assistants: list[int] | None = None
|
||||
hidden_assistants: list[int] = []
|
||||
visible_assistants: list[int] = []
|
||||
|
||||
recent_assistants: list[int] | None = None
|
||||
default_model: str | None = None
|
||||
|
||||
|
||||
|
||||
@@ -11,7 +11,6 @@ from fastapi import Body
|
||||
from fastapi import Depends
|
||||
from fastapi import HTTPException
|
||||
from fastapi import Request
|
||||
from fastapi import status
|
||||
from psycopg2.errors import UniqueViolation
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import Column
|
||||
@@ -27,10 +26,10 @@ from danswer.auth.noauth_user import fetch_no_auth_user
|
||||
from danswer.auth.noauth_user import set_no_auth_user_preferences
|
||||
from danswer.auth.schemas import UserRole
|
||||
from danswer.auth.schemas import UserStatus
|
||||
from danswer.auth.users import BasicAuthenticationError
|
||||
from danswer.auth.users import current_admin_user
|
||||
from danswer.auth.users import current_curator_or_admin_user
|
||||
from danswer.auth.users import current_user
|
||||
from danswer.auth.users import get_tenant_id_for_email
|
||||
from danswer.auth.users import optional_user
|
||||
from danswer.configs.app_configs import AUTH_TYPE
|
||||
from danswer.configs.app_configs import ENABLE_EMAIL_INVITES
|
||||
@@ -38,6 +37,7 @@ from danswer.configs.app_configs import SESSION_EXPIRE_TIME_SECONDS
|
||||
from danswer.configs.app_configs import SUPER_USERS
|
||||
from danswer.configs.app_configs import VALID_EMAIL_DOMAINS
|
||||
from danswer.configs.constants import AuthType
|
||||
from danswer.db.api_key import is_api_key_email_address
|
||||
from danswer.db.auth import get_total_users_count
|
||||
from danswer.db.engine import CURRENT_TENANT_ID_CONTEXTVAR
|
||||
from danswer.db.engine import get_session
|
||||
@@ -61,12 +61,7 @@ from danswer.server.models import InvitedUserSnapshot
|
||||
from danswer.server.models import MinimalUserSnapshot
|
||||
from danswer.server.utils import send_user_email_invite
|
||||
from danswer.utils.logger import setup_logger
|
||||
from ee.danswer.db.api_key import is_api_key_email_address
|
||||
from ee.danswer.db.external_perm import delete_user__ext_group_for_user__no_commit
|
||||
from ee.danswer.db.user_group import remove_curator_status__no_commit
|
||||
from ee.danswer.server.tenants.billing import register_tenant_users
|
||||
from ee.danswer.server.tenants.provisioning import add_users_to_tenant
|
||||
from ee.danswer.server.tenants.provisioning import remove_users_from_tenant
|
||||
from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
|
||||
logger = setup_logger()
|
||||
@@ -105,7 +100,10 @@ def set_user_role(
|
||||
)
|
||||
|
||||
if user_to_update.role == UserRole.CURATOR:
|
||||
remove_curator_status__no_commit(db_session, user_to_update)
|
||||
fetch_ee_implementation_or_noop(
|
||||
"danswer.db.user_group",
|
||||
"remove_curator_status__no_commit",
|
||||
)(db_session, user_to_update)
|
||||
|
||||
user_to_update.role = user_role_update_request.new_role.value
|
||||
|
||||
@@ -197,15 +195,17 @@ def bulk_invite_users(
|
||||
email_info = validate_email(email)
|
||||
normalized_emails.append(email_info.normalized) # type: ignore
|
||||
|
||||
except (EmailUndeliverableError, EmailNotValidError):
|
||||
except (EmailUndeliverableError, EmailNotValidError) as e:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="One or more emails in the list are invalid",
|
||||
detail=f"Invalid email address: {email} - {str(e)}",
|
||||
)
|
||||
|
||||
if MULTI_TENANT:
|
||||
try:
|
||||
add_users_to_tenant(normalized_emails, tenant_id)
|
||||
fetch_ee_implementation_or_noop(
|
||||
"danswer.server.tenants.provisioning", "add_users_to_tenant", None
|
||||
)(normalized_emails, tenant_id)
|
||||
|
||||
except IntegrityError as e:
|
||||
if isinstance(e.orig, UniqueViolation):
|
||||
@@ -226,9 +226,9 @@ def bulk_invite_users(
|
||||
return number_of_invited_users
|
||||
try:
|
||||
logger.info("Registering tenant users")
|
||||
register_tenant_users(
|
||||
CURRENT_TENANT_ID_CONTEXTVAR.get(), get_total_users_count(db_session)
|
||||
)
|
||||
fetch_ee_implementation_or_noop(
|
||||
"danswer.server.tenants.billing", "register_tenant_users", None
|
||||
)(CURRENT_TENANT_ID_CONTEXTVAR.get(), get_total_users_count(db_session))
|
||||
if ENABLE_EMAIL_INVITES:
|
||||
try:
|
||||
for email in all_emails:
|
||||
@@ -243,7 +243,9 @@ def bulk_invite_users(
|
||||
"Reverting changes: removing users from tenant and resetting invited users"
|
||||
)
|
||||
write_invited_users(initial_invited_users) # Reset to original state
|
||||
remove_users_from_tenant(normalized_emails, tenant_id)
|
||||
fetch_ee_implementation_or_noop(
|
||||
"danswer.server.tenants.user_mapping", "remove_users_from_tenant", None
|
||||
)(normalized_emails, tenant_id)
|
||||
raise e
|
||||
|
||||
|
||||
@@ -257,14 +259,16 @@ def remove_invited_user(
|
||||
remaining_users = [user for user in user_emails if user != user_email.user_email]
|
||||
|
||||
tenant_id = CURRENT_TENANT_ID_CONTEXTVAR.get()
|
||||
remove_users_from_tenant([user_email.user_email], tenant_id)
|
||||
fetch_ee_implementation_or_noop(
|
||||
"danswer.server.tenants.user_mapping", "remove_users_from_tenant", None
|
||||
)([user_email.user_email], tenant_id)
|
||||
number_of_invited_users = write_invited_users(remaining_users)
|
||||
|
||||
try:
|
||||
if MULTI_TENANT:
|
||||
register_tenant_users(
|
||||
CURRENT_TENANT_ID_CONTEXTVAR.get(), get_total_users_count(db_session)
|
||||
)
|
||||
fetch_ee_implementation_or_noop(
|
||||
"danswer.server.tenants.billing", "register_tenant_users", None
|
||||
)(CURRENT_TENANT_ID_CONTEXTVAR.get(), get_total_users_count(db_session))
|
||||
except Exception:
|
||||
logger.error(
|
||||
"Request to update number of seats taken in control plane failed. "
|
||||
@@ -331,7 +335,10 @@ async def delete_user(
|
||||
for oauth_account in user_to_delete.oauth_accounts:
|
||||
db_session.delete(oauth_account)
|
||||
|
||||
delete_user__ext_group_for_user__no_commit(
|
||||
fetch_ee_implementation_or_noop(
|
||||
"danswer.db.external_perm",
|
||||
"delete_user__ext_group_for_user__no_commit",
|
||||
)(
|
||||
db_session=db_session,
|
||||
user_id=user_to_delete.id,
|
||||
)
|
||||
@@ -485,20 +492,19 @@ def verify_user_logged_in(
|
||||
store = get_kv_store()
|
||||
return fetch_no_auth_user(store)
|
||||
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_403_FORBIDDEN, detail="User Not Authenticated"
|
||||
)
|
||||
raise BasicAuthenticationError(detail="User Not Authenticated")
|
||||
|
||||
if user.oidc_expiry and user.oidc_expiry < datetime.now(timezone.utc):
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_403_FORBIDDEN,
|
||||
raise BasicAuthenticationError(
|
||||
detail="Access denied. User's OIDC token has expired.",
|
||||
)
|
||||
|
||||
token_created_at = (
|
||||
None if MULTI_TENANT else get_current_token_creation(user, db_session)
|
||||
)
|
||||
organization_name = get_tenant_id_for_email(user.email)
|
||||
organization_name = fetch_ee_implementation_or_noop(
|
||||
"danswer.server.tenants.user_mapping", "get_tenant_id_for_email", None
|
||||
)(user.email)
|
||||
|
||||
user_info = UserInfo.from_model(
|
||||
user,
|
||||
@@ -518,6 +524,59 @@ class ChosenDefaultModelRequest(BaseModel):
    default_model: str | None = None


class RecentAssistantsRequest(BaseModel):
    current_assistant: int


def update_recent_assistants(
    recent_assistants: list[int] | None, current_assistant: int
) -> list[int]:
    if recent_assistants is None:
        recent_assistants = []
    else:
        recent_assistants = [x for x in recent_assistants if x != current_assistant]

    # Add current assistant to start of list
    recent_assistants.insert(0, current_assistant)

    # Keep only the 5 most recent assistants
    recent_assistants = recent_assistants[:5]
    return recent_assistants

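A quick illustration of the most-recently-used behavior of update_recent_assistants (the assistant IDs below are hypothetical):

# Hypothetical assistant IDs: the current assistant moves to the front,
# duplicates are dropped, and only the five most recent entries are kept.
assert update_recent_assistants([3, 7, 9], 7) == [7, 3, 9]
assert update_recent_assistants(None, 4) == [4]
assert update_recent_assistants([1, 2, 3, 4, 5], 6) == [6, 1, 2, 3, 4]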
@router.patch("/user/recent-assistants")
def update_user_recent_assistants(
    request: RecentAssistantsRequest,
    user: User | None = Depends(current_user),
    db_session: Session = Depends(get_session),
) -> None:
    if user is None:
        if AUTH_TYPE == AuthType.DISABLED:
            store = get_kv_store()
            no_auth_user = fetch_no_auth_user(store)
            preferences = no_auth_user.preferences
            recent_assistants = preferences.recent_assistants
            updated_preferences = update_recent_assistants(
                recent_assistants, request.current_assistant
            )
            preferences.recent_assistants = updated_preferences
            set_no_auth_user_preferences(store, preferences)
            return
        else:
            raise RuntimeError("This should never happen")

    recent_assistants = UserInfo.from_model(user).preferences.recent_assistants
    updated_recent_assistants = update_recent_assistants(
        recent_assistants, request.current_assistant
    )
    db_session.execute(
        update(User)
        .where(User.id == user.id)  # type: ignore
        .values(recent_assistants=updated_recent_assistants)
    )
    db_session.commit()


@router.patch("/user/default-model")
def update_user_default_model(
    request: ChosenDefaultModelRequest,
@@ -359,7 +359,7 @@ def handle_new_chat_message(
            yield json.dumps(packet) if isinstance(packet, dict) else packet

        except Exception as e:
            logger.exception(f"Error in chat message streaming: {e}")
            logger.exception("Error in chat message streaming")
            yield json.dumps({"error": str(e)})

        finally:

@@ -279,7 +279,7 @@ def get_answer_with_quote(
        ):
            yield json.dumps(packet) if isinstance(packet, dict) else packet
        except Exception as e:
            logger.exception(f"Error in search answer streaming: {e}")
            logger.exception("Error in search answer streaming")
            yield json.dumps({"error": str(e)})

    return StreamingResponse(stream_generator(), media_type="application/json")

@@ -18,9 +18,9 @@ from danswer.db.models import ChatMessage
|
||||
from danswer.db.models import ChatSession
|
||||
from danswer.db.models import TokenRateLimit
|
||||
from danswer.db.models import User
|
||||
from danswer.db.token_limit import fetch_all_global_token_rate_limits
|
||||
from danswer.utils.logger import setup_logger
|
||||
from danswer.utils.variable_functionality import fetch_versioned_implementation
|
||||
from ee.danswer.db.token_limit import fetch_all_global_token_rate_limits
|
||||
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
|
||||
|
||||
|
||||
|
||||
@@ -5,13 +5,13 @@ from sqlalchemy.orm import Session
|
||||
from danswer.auth.users import current_admin_user
|
||||
from danswer.db.engine import get_session
|
||||
from danswer.db.models import User
|
||||
from danswer.db.token_limit import delete_token_rate_limit
|
||||
from danswer.db.token_limit import fetch_all_global_token_rate_limits
|
||||
from danswer.db.token_limit import insert_global_token_rate_limit
|
||||
from danswer.db.token_limit import update_token_rate_limit
|
||||
from danswer.server.query_and_chat.token_limit import any_rate_limit_exists
|
||||
from danswer.server.token_rate_limits.models import TokenRateLimitArgs
|
||||
from danswer.server.token_rate_limits.models import TokenRateLimitDisplay
|
||||
from ee.danswer.db.token_limit import delete_token_rate_limit
|
||||
from ee.danswer.db.token_limit import fetch_all_global_token_rate_limits
|
||||
from ee.danswer.db.token_limit import insert_global_token_rate_limit
|
||||
from ee.danswer.db.token_limit import update_token_rate_limit
|
||||
|
||||
router = APIRouter(prefix="/admin/token-rate-limits")
|
||||
|
||||
|
||||
@@ -1,22 +1,34 @@
|
||||
import csv
|
||||
import json
|
||||
import uuid
|
||||
from collections.abc import Generator
|
||||
from io import BytesIO
|
||||
from io import StringIO
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
|
||||
import requests
|
||||
from langchain_core.messages import HumanMessage
|
||||
from langchain_core.messages import SystemMessage
|
||||
from pydantic import BaseModel
|
||||
|
||||
from danswer.configs.constants import FileOrigin
|
||||
from danswer.db.engine import get_session_with_default_tenant
|
||||
from danswer.file_store.file_store import get_default_file_store
|
||||
from danswer.file_store.models import ChatFileType
|
||||
from danswer.file_store.models import InMemoryChatFile
|
||||
from danswer.key_value_store.interface import JSON_ro
|
||||
from danswer.llm.answering.models import PreviousMessage
|
||||
from danswer.llm.answering.prompts.build import AnswerPromptBuilder
|
||||
from danswer.llm.interfaces import LLM
|
||||
from danswer.tools.base_tool import BaseTool
|
||||
from danswer.tools.message import ToolCallSummary
|
||||
from danswer.tools.models import CHAT_SESSION_ID_PLACEHOLDER
|
||||
from danswer.tools.models import DynamicSchemaInfo
|
||||
from danswer.tools.models import MESSAGE_ID_PLACEHOLDER
|
||||
from danswer.tools.models import ToolResponse
|
||||
from danswer.tools.tool_implementations.custom.base_tool_types import ToolResultType
|
||||
from danswer.tools.tool_implementations.custom.custom_tool_prompts import (
|
||||
SHOULD_USE_CUSTOM_TOOL_SYSTEM_PROMPT,
|
||||
)
|
||||
@@ -39,6 +51,9 @@ from danswer.tools.tool_implementations.custom.openapi_parsing import REQUEST_BO
|
||||
from danswer.tools.tool_implementations.custom.openapi_parsing import (
|
||||
validate_openapi_schema,
|
||||
)
|
||||
from danswer.tools.tool_implementations.custom.prompt import (
|
||||
build_custom_image_generation_user_prompt,
|
||||
)
|
||||
from danswer.utils.headers import header_list_to_header_dict
|
||||
from danswer.utils.headers import HeaderItemDict
|
||||
from danswer.utils.logger import setup_logger
|
||||
@@ -48,9 +63,14 @@ logger = setup_logger()
CUSTOM_TOOL_RESPONSE_ID = "custom_tool_response"


class CustomToolFileResponse(BaseModel):
    file_ids: List[str]  # References to saved images or CSVs


class CustomToolCallSummary(BaseModel):
    tool_name: str
    tool_result: ToolResultType
    response_type: str  # e.g., 'json', 'image', 'csv', 'graph'
    tool_result: Any  # The response data


class CustomTool(BaseTool):
@@ -91,6 +111,12 @@ class CustomTool(BaseTool):
        self, *args: ToolResponse
    ) -> str | list[str | dict[str, Any]]:
        response = cast(CustomToolCallSummary, args[0].response)

        if response.response_type == "image" or response.response_type == "csv":
            image_response = cast(CustomToolFileResponse, response.tool_result)
            return json.dumps({"file_ids": image_response.file_ids})

        # For JSON or other responses, return as-is
        return json.dumps(response.tool_result)

    """For LLMs which do NOT support explicit tool calling"""
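To make the new summary shape concrete, a minimal sketch of the two forms it can take, based on the models above (tool names and file IDs are made up):

# Illustrative only: a file-backed result references saved file IDs,
# while a plain JSON result carries the response body directly.
csv_summary = CustomToolCallSummary(
    tool_name="export_report",
    response_type="csv",
    tool_result=CustomToolFileResponse(file_ids=["3f2a-example-id"]),
)
json_summary = CustomToolCallSummary(
    tool_name="lookup_ticket",
    response_type="json",
    tool_result={"status": "open"},
)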
@@ -158,6 +184,38 @@ class CustomTool(BaseTool):
|
||||
)
|
||||
return None
|
||||
|
||||
def _save_and_get_file_references(
|
||||
self, file_content: bytes | str, content_type: str
|
||||
) -> List[str]:
|
||||
with get_session_with_default_tenant() as db_session:
|
||||
file_store = get_default_file_store(db_session)
|
||||
|
||||
file_id = str(uuid.uuid4())
|
||||
|
||||
# Handle both binary and text content
|
||||
if isinstance(file_content, str):
|
||||
content = BytesIO(file_content.encode())
|
||||
else:
|
||||
content = BytesIO(file_content)
|
||||
|
||||
file_store.save_file(
|
||||
file_name=file_id,
|
||||
content=content,
|
||||
display_name=file_id,
|
||||
file_origin=FileOrigin.CHAT_UPLOAD,
|
||||
file_type=content_type,
|
||||
file_metadata={
|
||||
"content_type": content_type,
|
||||
},
|
||||
)
|
||||
|
||||
return [file_id]
|
||||
|
||||
def _parse_csv(self, csv_text: str) -> List[Dict[str, Any]]:
|
||||
csv_file = StringIO(csv_text)
|
||||
reader = csv.DictReader(csv_file)
|
||||
return [row for row in reader]
|
||||
|
||||
"""Actual execution of the tool"""
|
||||
|
||||
def run(self, **kwargs: Any) -> Generator[ToolResponse, None, None]:
|
||||
@@ -177,20 +235,103 @@ class CustomTool(BaseTool):
|
||||
|
||||
url = self._method_spec.build_url(self._base_url, path_params, query_params)
|
||||
method = self._method_spec.method
|
||||
# Log request details
|
||||
|
||||
response = requests.request(
|
||||
method, url, json=request_body, headers=self.headers
|
||||
)
|
||||
content_type = response.headers.get("Content-Type", "")
|
||||
|
||||
if "text/csv" in content_type:
|
||||
file_ids = self._save_and_get_file_references(
|
||||
response.content, content_type
|
||||
)
|
||||
tool_result = CustomToolFileResponse(file_ids=file_ids)
|
||||
response_type = "csv"
|
||||
|
||||
elif "image/" in content_type:
|
||||
file_ids = self._save_and_get_file_references(
|
||||
response.content, content_type
|
||||
)
|
||||
tool_result = CustomToolFileResponse(file_ids=file_ids)
|
||||
response_type = "image"
|
||||
|
||||
else:
|
||||
tool_result = response.json()
|
||||
response_type = "json"
|
||||
|
||||
logger.info(
|
||||
f"Returning tool response for {self._name} with type {response_type}"
|
||||
)
|
||||
|
||||
yield ToolResponse(
|
||||
id=CUSTOM_TOOL_RESPONSE_ID,
|
||||
response=CustomToolCallSummary(
|
||||
tool_name=self._name, tool_result=response.json()
|
||||
tool_name=self._name,
|
||||
response_type=response_type,
|
||||
tool_result=tool_result,
|
||||
),
|
||||
)
|
||||
|
||||
def build_next_prompt(
|
||||
self,
|
||||
prompt_builder: AnswerPromptBuilder,
|
||||
tool_call_summary: ToolCallSummary,
|
||||
tool_responses: list[ToolResponse],
|
||||
using_tool_calling_llm: bool,
|
||||
) -> AnswerPromptBuilder:
|
||||
response = cast(CustomToolCallSummary, tool_responses[0].response)
|
||||
|
||||
# Handle non-file responses using parent class behavior
|
||||
if response.response_type not in ["image", "csv"]:
|
||||
return super().build_next_prompt(
|
||||
prompt_builder,
|
||||
tool_call_summary,
|
||||
tool_responses,
|
||||
using_tool_calling_llm,
|
||||
)
|
||||
|
||||
# Handle image and CSV file responses
|
||||
file_type = (
|
||||
ChatFileType.IMAGE
|
||||
if response.response_type == "image"
|
||||
else ChatFileType.CSV
|
||||
)
|
||||
|
||||
# Load files from storage
|
||||
files = []
|
||||
with get_session_with_default_tenant() as db_session:
|
||||
file_store = get_default_file_store(db_session)
|
||||
|
||||
for file_id in response.tool_result.file_ids:
|
||||
try:
|
||||
file_io = file_store.read_file(file_id, mode="b")
|
||||
files.append(
|
||||
InMemoryChatFile(
|
||||
file_id=file_id,
|
||||
filename=file_id,
|
||||
content=file_io.read(),
|
||||
file_type=file_type,
|
||||
)
|
||||
)
|
||||
except Exception:
|
||||
logger.exception(f"Failed to read file {file_id}")
|
||||
|
||||
# Update prompt with file content
|
||||
prompt_builder.update_user_prompt(
|
||||
build_custom_image_generation_user_prompt(
|
||||
query=prompt_builder.get_user_message_content(),
|
||||
files=files,
|
||||
file_type=file_type,
|
||||
)
|
||||
)
|
||||
|
||||
return prompt_builder
|
||||
|
||||
def final_result(self, *args: ToolResponse) -> JSON_ro:
|
||||
return cast(CustomToolCallSummary, args[0].response).tool_result
|
||||
response = cast(CustomToolCallSummary, args[0].response)
|
||||
if isinstance(response.tool_result, CustomToolFileResponse):
|
||||
return response.tool_result.model_dump()
|
||||
return response.tool_result
|
||||
|
||||
|
||||
def build_custom_tools_from_openapi_schema_and_headers(
|
||||
|
||||
backend/danswer/tools/tool_implementations/custom/prompt.py (new file, 25 lines)
@@ -0,0 +1,25 @@
from langchain_core.messages import HumanMessage

from danswer.file_store.models import ChatFileType
from danswer.file_store.models import InMemoryChatFile
from danswer.llm.utils import build_content_with_imgs


CUSTOM_IMG_GENERATION_SUMMARY_PROMPT = """
You have just created the attached {file_type} file in response to the following query: "{query}".

Can you please summarize it in a sentence or two? Do NOT include image urls or bulleted lists.
"""


def build_custom_image_generation_user_prompt(
    query: str, file_type: ChatFileType, files: list[InMemoryChatFile] | None = None
) -> HumanMessage:
    return HumanMessage(
        content=build_content_with_imgs(
            message=CUSTOM_IMG_GENERATION_SUMMARY_PROMPT.format(
                query=query, file_type=file_type.value
            ).strip(),
            files=files,
        )
    )
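A minimal usage sketch of the new helper; the query text is illustrative, and ChatFileType.CSV is one of the file types referenced elsewhere in this diff:

# Illustrative call to build_custom_image_generation_user_prompt.
summary_prompt = build_custom_image_generation_user_prompt(
    query="Plot monthly active users",
    file_type=ChatFileType.CSV,
    files=None,  # attach InMemoryChatFile objects when summarizing real tool output
)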
@@ -119,3 +119,30 @@ def noop_fallback(*args: Any, **kwargs: Any) -> None:
    Returns:
        None
    """


def fetch_ee_implementation_or_noop(
    module: str, attribute: str, noop_return_value: Any = None
) -> Any:
    """
    Fetches an EE implementation if EE is enabled, otherwise returns a no-op function.
    Raises an exception if EE is enabled but the fetch fails.

    Args:
        module (str): The name of the module from which to fetch the attribute.
        attribute (str): The name of the attribute to fetch from the module.

    Returns:
        Any: The fetched EE implementation if successful and EE is enabled, otherwise a no-op function.

    Raises:
        Exception: If EE is enabled but the fetch fails.
    """
    if not global_version.is_ee_version():
        return lambda *args, **kwargs: noop_return_value

    try:
        return fetch_versioned_implementation(module, attribute)
    except Exception as e:
        logger.error(f"Failed to fetch implementation for {module}.{attribute}: {e}")
        raise
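For context, this is the call pattern used by the routers earlier in this diff: the EE symbol is resolved lazily, and on a non-EE build the returned callable simply yields the supplied no-op value. A sketch mirroring those call sites:

# Mirrors the connector/credential router call sites above: on EE builds this
# runs the real permission check, otherwise it is a no-op returning None.
fetch_ee_implementation_or_noop(
    "danswer.db.user_group", "validate_user_creation_permissions", None
)(
    db_session=db_session,
    user=user,
    target_group_ids=connector_data.groups,
)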
@@ -4,16 +4,15 @@ from fastapi import Request
|
||||
from fastapi import status
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.auth.api_key import get_hashed_api_key_from_request
|
||||
from danswer.auth.users import current_admin_user
|
||||
from danswer.configs.app_configs import AUTH_TYPE
|
||||
from danswer.configs.app_configs import SUPER_CLOUD_API_KEY
|
||||
from danswer.configs.app_configs import SUPER_USERS
|
||||
from danswer.configs.constants import AuthType
|
||||
from danswer.db.engine import get_session
|
||||
from danswer.db.api_key import fetch_user_for_api_key
|
||||
from danswer.db.models import User
|
||||
from danswer.utils.logger import setup_logger
|
||||
from ee.danswer.auth.api_key import get_hashed_api_key_from_request
|
||||
from ee.danswer.db.api_key import fetch_user_for_api_key
|
||||
from ee.danswer.db.saml import get_saml_account
|
||||
from ee.danswer.server.seeding import get_seed_config
|
||||
from ee.danswer.utils.secrets import extract_hashed_cookie
|
||||
@@ -48,25 +47,6 @@ async def optional_user_(
|
||||
return user
|
||||
|
||||
|
||||
def api_key_dep(
|
||||
request: Request, db_session: Session = Depends(get_session)
|
||||
) -> User | None:
|
||||
if AUTH_TYPE == AuthType.DISABLED:
|
||||
return None
|
||||
|
||||
hashed_api_key = get_hashed_api_key_from_request(request)
|
||||
if not hashed_api_key:
|
||||
raise HTTPException(status_code=401, detail="Missing API key")
|
||||
|
||||
if hashed_api_key:
|
||||
user = fetch_user_for_api_key(hashed_api_key, db_session)
|
||||
|
||||
if user is None:
|
||||
raise HTTPException(status_code=401, detail="Invalid API key")
|
||||
|
||||
return user
|
||||
|
||||
|
||||
def get_default_admin_user_emails_() -> list[str]:
|
||||
seed_config = get_seed_config()
|
||||
if seed_config and seed_config.admin_user_emails:
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
from danswer.background.celery.apps.primary import celery_app
|
||||
from danswer.background.task_name_builders import (
|
||||
name_sync_external_doc_permissions_task,
|
||||
)
|
||||
from danswer.background.task_utils import build_celery_task_wrapper
|
||||
from danswer.configs.app_configs import JOB_TIMEOUT
|
||||
from danswer.db.chat import delete_chat_sessions_older_than
|
||||
@@ -14,9 +17,6 @@ from ee.danswer.background.celery_utils import (
|
||||
should_perform_external_group_permissions_check,
|
||||
)
|
||||
from ee.danswer.background.task_name_builders import name_chat_ttl_task
|
||||
from ee.danswer.background.task_name_builders import (
|
||||
name_sync_external_doc_permissions_task,
|
||||
)
|
||||
from ee.danswer.background.task_name_builders import (
|
||||
name_sync_external_group_permissions_task,
|
||||
)
|
||||
|
||||
@@ -3,15 +3,15 @@ from datetime import timezone
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from danswer.background.task_name_builders import (
|
||||
name_sync_external_doc_permissions_task,
|
||||
)
|
||||
from danswer.db.enums import AccessType
|
||||
from danswer.db.models import ConnectorCredentialPair
|
||||
from danswer.db.tasks import check_task_is_live_and_not_timed_out
|
||||
from danswer.db.tasks import get_latest_task
|
||||
from danswer.utils.logger import setup_logger
|
||||
from ee.danswer.background.task_name_builders import name_chat_ttl_task
|
||||
from ee.danswer.background.task_name_builders import (
|
||||
name_sync_external_doc_permissions_task,
|
||||
)
|
||||
from ee.danswer.background.task_name_builders import (
|
||||
name_sync_external_group_permissions_task,
|
||||
)
|
||||
|
||||
@@ -2,12 +2,6 @@ def name_chat_ttl_task(retention_limit_days: int, tenant_id: str | None = None)
|
||||
return f"chat_ttl_{retention_limit_days}_days"
|
||||
|
||||
|
||||
def name_sync_external_doc_permissions_task(
|
||||
cc_pair_id: int, tenant_id: str | None = None
|
||||
) -> str:
|
||||
return f"sync_external_doc_permissions_task__{cc_pair_id}"
|
||||
|
||||
|
||||
def name_sync_external_group_permissions_task(
|
||||
cc_pair_id: int, tenant_id: str | None = None
|
||||
) -> str:
|
||||
|
||||
@@ -7,16 +7,6 @@ OPENID_CONFIG_URL = os.environ.get("OPENID_CONFIG_URL", "")
|
||||
SAML_CONF_DIR = os.environ.get("SAML_CONF_DIR") or "/app/ee/danswer/configs/saml_config"
|
||||
|
||||
|
||||
#####
|
||||
# API Key Configs
|
||||
#####
|
||||
# refers to the rounds described here: https://passlib.readthedocs.io/en/stable/lib/passlib.hash.sha256_crypt.html
|
||||
_API_KEY_HASH_ROUNDS_RAW = os.environ.get("API_KEY_HASH_ROUNDS")
|
||||
API_KEY_HASH_ROUNDS = (
|
||||
int(_API_KEY_HASH_ROUNDS_RAW) if _API_KEY_HASH_ROUNDS_RAW else None
|
||||
)
|
||||
|
||||
|
||||
#####
|
||||
# Auto Permission Sync
|
||||
#####
|
||||
@@ -25,3 +15,7 @@ NUM_PERMISSION_WORKERS = int(os.environ.get("NUM_PERMISSION_WORKERS") or 2)
|
||||
|
||||
STRIPE_SECRET_KEY = os.environ.get("STRIPE_SECRET_KEY")
|
||||
STRIPE_PRICE_ID = os.environ.get("STRIPE_PRICE")
|
||||
|
||||
OPENAI_DEFAULT_API_KEY = os.environ.get("OPENAI_DEFAULT_API_KEY")
|
||||
ANTHROPIC_DEFAULT_API_KEY = os.environ.get("ANTHROPIC_DEFAULT_API_KEY")
|
||||
COHERE_DEFAULT_API_KEY = os.environ.get("COHERE_DEFAULT_API_KEY")
|
||||
|
||||
@@ -65,64 +65,6 @@ def _add_user_filters(
|
||||
return stmt.where(where_clause)
|
||||
|
||||
|
||||
def fetch_all_user_token_rate_limits(
|
||||
db_session: Session,
|
||||
enabled_only: bool = False,
|
||||
ordered: bool = True,
|
||||
) -> Sequence[TokenRateLimit]:
|
||||
query = select(TokenRateLimit).where(
|
||||
TokenRateLimit.scope == TokenRateLimitScope.USER
|
||||
)
|
||||
|
||||
if enabled_only:
|
||||
query = query.where(TokenRateLimit.enabled.is_(True))
|
||||
|
||||
if ordered:
|
||||
query = query.order_by(TokenRateLimit.created_at.desc())
|
||||
|
||||
return db_session.scalars(query).all()
|
||||
|
||||
|
||||
def fetch_all_global_token_rate_limits(
|
||||
db_session: Session,
|
||||
enabled_only: bool = False,
|
||||
ordered: bool = True,
|
||||
) -> Sequence[TokenRateLimit]:
|
||||
query = select(TokenRateLimit).where(
|
||||
TokenRateLimit.scope == TokenRateLimitScope.GLOBAL
|
||||
)
|
||||
|
||||
if enabled_only:
|
||||
query = query.where(TokenRateLimit.enabled.is_(True))
|
||||
|
||||
if ordered:
|
||||
query = query.order_by(TokenRateLimit.created_at.desc())
|
||||
|
||||
token_rate_limits = db_session.scalars(query).all()
|
||||
return token_rate_limits
|
||||
|
||||
|
||||
def fetch_user_group_token_rate_limits(
|
||||
db_session: Session,
|
||||
group_id: int,
|
||||
user: User | None = None,
|
||||
enabled_only: bool = False,
|
||||
ordered: bool = True,
|
||||
get_editable: bool = True,
|
||||
) -> Sequence[TokenRateLimit]:
|
||||
stmt = select(TokenRateLimit)
|
||||
stmt = stmt.where(User__UserGroup.user_group_id == group_id)
|
||||
stmt = _add_user_filters(stmt, user, get_editable)
|
||||
|
||||
if enabled_only:
|
||||
stmt = stmt.where(TokenRateLimit.enabled.is_(True))
|
||||
|
||||
if ordered:
|
||||
stmt = stmt.order_by(TokenRateLimit.created_at.desc())
|
||||
|
||||
return db_session.scalars(stmt).all()
|
||||
|
||||
|
||||
def fetch_all_user_group_token_rate_limits_by_group(
|
||||
db_session: Session,
|
||||
) -> Sequence[Row[tuple[TokenRateLimit, str]]]:
|
||||
@@ -138,38 +80,6 @@ def fetch_all_user_group_token_rate_limits_by_group(
|
||||
return db_session.execute(query).all()
|
||||
|
||||
|
||||
def insert_user_token_rate_limit(
|
||||
db_session: Session,
|
||||
token_rate_limit_settings: TokenRateLimitArgs,
|
||||
) -> TokenRateLimit:
|
||||
token_limit = TokenRateLimit(
|
||||
enabled=token_rate_limit_settings.enabled,
|
||||
token_budget=token_rate_limit_settings.token_budget,
|
||||
period_hours=token_rate_limit_settings.period_hours,
|
||||
scope=TokenRateLimitScope.USER,
|
||||
)
|
||||
db_session.add(token_limit)
|
||||
db_session.commit()
|
||||
|
||||
return token_limit
|
||||
|
||||
|
||||
def insert_global_token_rate_limit(
|
||||
db_session: Session,
|
||||
token_rate_limit_settings: TokenRateLimitArgs,
|
||||
) -> TokenRateLimit:
|
||||
token_limit = TokenRateLimit(
|
||||
enabled=token_rate_limit_settings.enabled,
|
||||
token_budget=token_rate_limit_settings.token_budget,
|
||||
period_hours=token_rate_limit_settings.period_hours,
|
||||
scope=TokenRateLimitScope.GLOBAL,
|
||||
)
|
||||
db_session.add(token_limit)
|
||||
db_session.commit()
|
||||
|
||||
return token_limit
|
||||
|
||||
|
||||
def insert_user_group_token_rate_limit(
|
||||
db_session: Session,
|
||||
token_rate_limit_settings: TokenRateLimitArgs,
|
||||
@@ -193,34 +103,22 @@ def insert_user_group_token_rate_limit(
|
||||
return token_limit
|
||||
|
||||
|
||||
def update_token_rate_limit(
|
||||
def fetch_user_group_token_rate_limits(
|
||||
db_session: Session,
|
||||
token_rate_limit_id: int,
|
||||
token_rate_limit_settings: TokenRateLimitArgs,
|
||||
) -> TokenRateLimit:
|
||||
token_limit = db_session.get(TokenRateLimit, token_rate_limit_id)
|
||||
if token_limit is None:
|
||||
raise ValueError(f"TokenRateLimit with id '{token_rate_limit_id}' not found")
|
||||
group_id: int,
|
||||
user: User | None = None,
|
||||
enabled_only: bool = False,
|
||||
ordered: bool = True,
|
||||
get_editable: bool = True,
|
||||
) -> Sequence[TokenRateLimit]:
|
||||
stmt = select(TokenRateLimit)
|
||||
stmt = stmt.where(User__UserGroup.user_group_id == group_id)
|
||||
stmt = _add_user_filters(stmt, user, get_editable)
|
||||
|
||||
token_limit.enabled = token_rate_limit_settings.enabled
|
||||
token_limit.token_budget = token_rate_limit_settings.token_budget
|
||||
token_limit.period_hours = token_rate_limit_settings.period_hours
|
||||
db_session.commit()
|
||||
if enabled_only:
|
||||
stmt = stmt.where(TokenRateLimit.enabled.is_(True))
|
||||
|
||||
return token_limit
|
||||
if ordered:
|
||||
stmt = stmt.order_by(TokenRateLimit.created_at.desc())
|
||||
|
||||
|
||||
def delete_token_rate_limit(
|
||||
db_session: Session,
|
||||
token_rate_limit_id: int,
|
||||
) -> None:
|
||||
token_limit = db_session.get(TokenRateLimit, token_rate_limit_id)
|
||||
if token_limit is None:
|
||||
raise ValueError(f"TokenRateLimit with id '{token_rate_limit_id}' not found")
|
||||
|
||||
db_session.query(TokenRateLimit__UserGroup).filter(
|
||||
TokenRateLimit__UserGroup.rate_limit_id == token_rate_limit_id
|
||||
).delete()
|
||||
|
||||
db_session.delete(token_limit)
|
||||
db_session.commit()
|
||||
return db_session.scalars(stmt).all()
|
||||
|
||||
@@ -66,7 +66,7 @@ def get_all_empty_chat_message_entries(
            return

        yield message_skeletons
        initial_id = message_skeletons[-1].message_id
        initial_id = message_skeletons[-1].chat_session_id


def get_all_usage_reports(db_session: Session) -> list[UsageReportMetadata]:
backend/ee/danswer/external_permissions/gmail/doc_sync.py (new file, 68 lines)
@@ -0,0 +1,68 @@
from datetime import datetime
from datetime import timezone

from sqlalchemy.orm import Session

from danswer.access.models import ExternalAccess
from danswer.connectors.gmail.connector import GmailConnector
from danswer.connectors.interfaces import GenerateSlimDocumentOutput
from danswer.db.models import ConnectorCredentialPair
from danswer.db.users import batch_add_non_web_user_if_not_exists__no_commit
from danswer.utils.logger import setup_logger
from ee.danswer.db.document import upsert_document_external_perms__no_commit

logger = setup_logger()


def _get_slim_doc_generator(
    cc_pair: ConnectorCredentialPair,
    gmail_connector: GmailConnector,
) -> GenerateSlimDocumentOutput:
    current_time = datetime.now(timezone.utc)
    start_time = (
        cc_pair.last_time_perm_sync.replace(tzinfo=timezone.utc).timestamp()
        if cc_pair.last_time_perm_sync
        else 0.0
    )

    return gmail_connector.retrieve_all_slim_documents(
        start=start_time, end=current_time.timestamp()
    )


def gmail_doc_sync(
    db_session: Session,
    cc_pair: ConnectorCredentialPair,
) -> None:
    """
    Adds the external permissions to the documents in postgres
    if the document doesn't already exists in postgres, we create
    it in postgres so that when it gets created later, the permissions are
    already populated
    """
    gmail_connector = GmailConnector(**cc_pair.connector.connector_specific_config)
    gmail_connector.load_credentials(cc_pair.credential.credential_json)

    slim_doc_generator = _get_slim_doc_generator(cc_pair, gmail_connector)

    for slim_doc_batch in slim_doc_generator:
        for slim_doc in slim_doc_batch:
            if slim_doc.perm_sync_data is None:
                logger.warning(f"No permissions found for document {slim_doc.id}")
                continue
            if user_email := slim_doc.perm_sync_data.get("user_email"):
                ext_access = ExternalAccess(
                    external_user_emails=set([user_email]),
                    external_user_group_ids=set(),
                    is_public=False,
                )
                batch_add_non_web_user_if_not_exists__no_commit(
                    db_session=db_session,
                    emails=list(ext_access.external_user_emails),
                )
                upsert_document_external_perms__no_commit(
                    db_session=db_session,
                    doc_id=slim_doc.id,
                    external_access=ext_access,
                    source_type=cc_pair.connector.source,
                )
@@ -6,7 +6,8 @@ from sqlalchemy.orm import Session

from danswer.access.models import ExternalAccess
from danswer.connectors.google_drive.connector import GoogleDriveConnector
from danswer.connectors.google_drive.google_utils import execute_paginated_retrieval
from danswer.connectors.google_utils.google_utils import execute_paginated_retrieval
from danswer.connectors.google_utils.resources import get_drive_service
from danswer.connectors.interfaces import GenerateSlimDocumentOutput
from danswer.connectors.models import SlimDocument
from danswer.db.models import ConnectorCredentialPair
@@ -56,7 +57,10 @@ def _fetch_permissions_for_permission_ids(
        return permissions

    owner_email = permission_info.get("owner_email")
    drive_service = google_drive_connector.get_google_resource(user_email=owner_email)
    drive_service = get_drive_service(
        creds=google_drive_connector.creds,
        user_email=(owner_email or google_drive_connector.primary_admin_email),
    )

    # Otherwise, fetch all permissions and update cache
    fetched_permissions = execute_paginated_retrieval(
@@ -1,7 +1,8 @@
from sqlalchemy.orm import Session

from danswer.connectors.google_drive.connector import GoogleDriveConnector
from danswer.connectors.google_drive.google_utils import execute_paginated_retrieval
from danswer.connectors.google_utils.google_utils import execute_paginated_retrieval
from danswer.connectors.google_utils.resources import get_admin_service
from danswer.db.models import ConnectorCredentialPair
from danswer.db.users import batch_add_non_web_user_if_not_exists__no_commit
from danswer.utils.logger import setup_logger
@@ -19,8 +20,9 @@ def gdrive_group_sync(
        **cc_pair.connector.connector_specific_config
    )
    google_drive_connector.load_credentials(cc_pair.credential.credential_json)

    admin_service = google_drive_connector.get_google_resource("admin", "directory_v1")
    admin_service = get_admin_service(
        google_drive_connector.creds, google_drive_connector.primary_admin_email
    )

    danswer_groups: list[ExternalUserGroup] = []
    for group in execute_paginated_retrieval(
@@ -6,6 +6,7 @@ from danswer.configs.constants import DocumentSource
from danswer.db.models import ConnectorCredentialPair
from ee.danswer.external_permissions.confluence.doc_sync import confluence_doc_sync
from ee.danswer.external_permissions.confluence.group_sync import confluence_group_sync
from ee.danswer.external_permissions.gmail.doc_sync import gmail_doc_sync
from ee.danswer.external_permissions.google_drive.doc_sync import gdrive_doc_sync
from ee.danswer.external_permissions.google_drive.group_sync import gdrive_group_sync
from ee.danswer.external_permissions.slack.doc_sync import slack_doc_sync
@@ -28,6 +29,7 @@ DOC_PERMISSIONS_FUNC_MAP: dict[DocumentSource, SyncFuncType] = {
    DocumentSource.GOOGLE_DRIVE: gdrive_doc_sync,
    DocumentSource.CONFLUENCE: confluence_doc_sync,
    DocumentSource.SLACK: slack_doc_sync,
    DocumentSource.GMAIL: gmail_doc_sync,
}

# These functions update:
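The new GMAIL entry above is what makes Gmail reachable from the permission-sync runner. A small sketch of the dispatch pattern such a map implies (the real runner and its error handling are not part of this diff):

# Hypothetical dispatcher; assumes a db_session and cc_pair are already in scope.
def run_doc_permission_sync(db_session, cc_pair):
    sync_func = DOC_PERMISSIONS_FUNC_MAP.get(cc_pair.connector.source)
    if sync_func is None:
        raise ValueError(f"No doc permission sync function for {cc_pair.connector.source}")
    # Every entry shares the (db_session, cc_pair) signature, e.g. gmail_doc_sync above.
    sync_func(db_session, cc_pair)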
@@ -12,11 +12,11 @@ from danswer.configs.app_configs import WEB_DOMAIN
from danswer.configs.constants import AuthType
from danswer.main import get_application as get_application_base
from danswer.main import include_router_with_global_prefix_prepended
from danswer.server.api_key.api import router as api_key_router
from danswer.utils.logger import setup_logger
from danswer.utils.variable_functionality import global_version
from ee.danswer.configs.app_configs import OPENID_CONFIG_URL
from ee.danswer.server.analytics.api import router as analytics_router
from ee.danswer.server.api_key.api import router as api_key_router
from ee.danswer.server.auth_check import check_ee_router_auth
from ee.danswer.server.enterprise_settings.api import (
    admin_router as enterprise_settings_admin_router,
@@ -8,9 +8,9 @@ from fastapi import HTTPException
from fastapi import Request
from fastapi import Response

from danswer.auth.api_key import extract_tenant_from_api_key_header
from danswer.configs.app_configs import USER_AUTH_SECRET
from danswer.db.engine import is_valid_schema_name
from ee.danswer.auth.api_key import extract_tenant_from_api_key_header
from shared_configs.configs import MULTI_TENANT
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
@@ -12,6 +12,7 @@ from sqlalchemy import func
from sqlalchemy import select
from sqlalchemy.orm import Session

from danswer.db.api_key import is_api_key_email_address
from danswer.db.engine import get_session_with_tenant
from danswer.db.models import ChatMessage
from danswer.db.models import ChatSession
@@ -20,12 +21,11 @@ from danswer.db.models import TokenRateLimit__UserGroup
from danswer.db.models import User
from danswer.db.models import User__UserGroup
from danswer.db.models import UserGroup
from danswer.db.token_limit import fetch_all_user_token_rate_limits
from danswer.server.query_and_chat.token_limit import _get_cutoff_time
from danswer.server.query_and_chat.token_limit import _is_rate_limited
from danswer.server.query_and_chat.token_limit import _user_is_rate_limited_by_global
from danswer.utils.threadpool_concurrency import run_functions_tuples_in_parallel
from ee.danswer.db.api_key import is_api_key_email_address
from ee.danswer.db.token_limit import fetch_all_user_token_rate_limits


def _check_token_rate_limits(user: User | None, tenant_id: str | None) -> None:
@@ -7,7 +7,6 @@ from fastapi import Response
from danswer.auth.users import auth_backend
from danswer.auth.users import current_admin_user
from danswer.auth.users import get_jwt_strategy
from danswer.auth.users import get_tenant_id_for_email
from danswer.auth.users import User
from danswer.configs.app_configs import WEB_DOMAIN
from danswer.db.engine import get_session_with_tenant
@@ -15,7 +14,6 @@ from danswer.db.notification import create_notification
from danswer.db.users import get_user_by_email
from danswer.server.settings.store import load_settings
from danswer.server.settings.store import store_settings
from danswer.setup import setup_danswer
from danswer.utils.logger import setup_logger
from ee.danswer.auth.users import current_cloud_superuser
from ee.danswer.configs.app_configs import STRIPE_SECRET_KEY
@@ -23,67 +21,17 @@ from ee.danswer.server.tenants.access import control_plane_dep
from ee.danswer.server.tenants.billing import fetch_billing_information
from ee.danswer.server.tenants.billing import fetch_tenant_stripe_information
from ee.danswer.server.tenants.models import BillingInformation
from ee.danswer.server.tenants.models import CreateTenantRequest
from ee.danswer.server.tenants.models import ImpersonateRequest
from ee.danswer.server.tenants.models import ProductGatingRequest
from ee.danswer.server.tenants.provisioning import add_users_to_tenant
from ee.danswer.server.tenants.provisioning import ensure_schema_exists
from ee.danswer.server.tenants.provisioning import run_alembic_migrations
from ee.danswer.server.tenants.provisioning import user_owns_a_tenant
from shared_configs.configs import MULTI_TENANT
from ee.danswer.server.tenants.user_mapping import get_tenant_id_for_email
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR


stripe.api_key = STRIPE_SECRET_KEY

logger = setup_logger()
router = APIRouter(prefix="/tenants")


@router.post("/create")
def create_tenant(
    create_tenant_request: CreateTenantRequest, _: None = Depends(control_plane_dep)
) -> dict[str, str]:
    if not MULTI_TENANT:
        raise HTTPException(status_code=403, detail="Multi-tenancy is not enabled")

    tenant_id = create_tenant_request.tenant_id
    email = create_tenant_request.initial_admin_email
    token = None

    if user_owns_a_tenant(email):
        raise HTTPException(
            status_code=409, detail="User already belongs to an organization"
        )

    try:
        if not ensure_schema_exists(tenant_id):
            logger.info(f"Created schema for tenant {tenant_id}")
        else:
            logger.info(f"Schema already exists for tenant {tenant_id}")

        token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
        run_alembic_migrations(tenant_id)

        with get_session_with_tenant(tenant_id) as db_session:
            setup_danswer(db_session, tenant_id)

        add_users_to_tenant([email], tenant_id)

        return {
            "status": "success",
            "message": f"Tenant {tenant_id} created successfully",
        }
    except Exception as e:
        logger.exception(f"Failed to create tenant {tenant_id}: {str(e)}")
        raise HTTPException(
            status_code=500, detail=f"Failed to create tenant: {str(e)}"
        )
    finally:
        if token is not None:
            CURRENT_TENANT_ID_CONTEXTVAR.reset(token)


@router.post("/product-gating")
def gate_product(
    product_gating_request: ProductGatingRequest, _: None = Depends(control_plane_dep)

@@ -33,3 +33,8 @@ class CheckoutSessionCreationResponse(BaseModel):

class ImpersonateRequest(BaseModel):
    email: str


class TenantCreationPayload(BaseModel):
    tenant_id: str
    email: str
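TenantCreationPayload added above is the body that notify_control_plane (later in this diff) posts to the control plane. A short illustration with placeholder values:

# Placeholder values; in the provisioning code below, tenant ids are
# TENANT_ID_PREFIX plus a UUID4.
payload = TenantCreationPayload(tenant_id="tenant_<uuid4>", email="admin@example.com")
payload.model_dump()  # {"tenant_id": "tenant_<uuid4>", "email": "admin@example.com"}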
@@ -1,114 +1,210 @@
import os
from types import SimpleNamespace
import asyncio
import logging
import uuid

from sqlalchemy import text
import aiohttp # Async HTTP client
from fastapi import HTTPException
from sqlalchemy.orm import Session
from sqlalchemy.schema import CreateSchema

from alembic import command
from alembic.config import Config
from danswer.db.engine import build_connection_string
from danswer.auth.users import exceptions
from danswer.configs.app_configs import CONTROL_PLANE_API_BASE_URL
from danswer.db.engine import get_session_with_tenant
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.llm import update_default_provider
from danswer.db.llm import upsert_cloud_embedding_provider
from danswer.db.llm import upsert_llm_provider
from danswer.db.models import UserTenantMapping
from danswer.utils.logger import setup_logger
from danswer.llm.llm_provider_options import ANTHROPIC_MODEL_NAMES
from danswer.llm.llm_provider_options import ANTHROPIC_PROVIDER_NAME
from danswer.llm.llm_provider_options import OPEN_AI_MODEL_NAMES
from danswer.llm.llm_provider_options import OPENAI_PROVIDER_NAME
from danswer.server.manage.embedding.models import CloudEmbeddingProviderCreationRequest
from danswer.server.manage.llm.models import LLMProviderUpsertRequest
from danswer.setup import setup_danswer
from ee.danswer.configs.app_configs import ANTHROPIC_DEFAULT_API_KEY
from ee.danswer.configs.app_configs import COHERE_DEFAULT_API_KEY
from ee.danswer.configs.app_configs import OPENAI_DEFAULT_API_KEY
from ee.danswer.server.tenants.access import generate_data_plane_token
from ee.danswer.server.tenants.models import TenantCreationPayload
from ee.danswer.server.tenants.schema_management import create_schema_if_not_exists
from ee.danswer.server.tenants.schema_management import drop_schema
from ee.danswer.server.tenants.schema_management import run_alembic_migrations
from ee.danswer.server.tenants.user_mapping import add_users_to_tenant
from ee.danswer.server.tenants.user_mapping import get_tenant_id_for_email
from ee.danswer.server.tenants.user_mapping import user_owns_a_tenant
from shared_configs.configs import MULTI_TENANT
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
from shared_configs.configs import TENANT_ID_PREFIX
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
from shared_configs.enums import EmbeddingProvider

logger = setup_logger()
logger = logging.getLogger(__name__)


def run_alembic_migrations(schema_name: str) -> None:
    logger.info(f"Starting Alembic migrations for schema: {schema_name}")
async def get_or_create_tenant_id(email: str) -> str:
    """Get existing tenant ID for an email or create a new tenant if none exists."""
    if not MULTI_TENANT:
        return POSTGRES_DEFAULT_SCHEMA

    try:
        current_dir = os.path.dirname(os.path.abspath(__file__))
        root_dir = os.path.abspath(os.path.join(current_dir, "..", "..", "..", ".."))
        alembic_ini_path = os.path.join(root_dir, "alembic.ini")
        tenant_id = get_tenant_id_for_email(email)
    except exceptions.UserNotExists:
        # If tenant does not exist and in Multi tenant mode, provision a new tenant
        try:
            tenant_id = await create_tenant(email)
        except Exception as e:
            logger.error(f"Tenant provisioning failed: {e}")
            raise HTTPException(status_code=500, detail="Failed to provision tenant.")

        # Configure Alembic
        alembic_cfg = Config(alembic_ini_path)
        alembic_cfg.set_main_option("sqlalchemy.url", build_connection_string())
        alembic_cfg.set_main_option(
            "script_location", os.path.join(root_dir, "alembic")
    if not tenant_id:
        raise HTTPException(
            status_code=401, detail="User does not belong to an organization"
        )

        # Ensure that logging isn't broken
        alembic_cfg.attributes["configure_logger"] = False
    return tenant_id

        # Mimic command-line options by adding 'cmd_opts' to the config
        alembic_cfg.cmd_opts = SimpleNamespace() # type: ignore
        alembic_cfg.cmd_opts.x = [f"schema={schema_name}"] # type: ignore

        # Run migrations programmatically
        command.upgrade(alembic_cfg, "head")
async def create_tenant(email: str) -> str:
    tenant_id = TENANT_ID_PREFIX + str(uuid.uuid4())
    try:
        # Provision tenant on data plane
        await provision_tenant(tenant_id, email)
        # Notify control plane
        await notify_control_plane(tenant_id, email)
    except Exception as e:
        logger.error(f"Tenant provisioning failed: {e}")
        await rollback_tenant_provisioning(tenant_id)
        raise HTTPException(status_code=500, detail="Failed to provision tenant.")
    return tenant_id

        # Run migrations programmatically
        logger.info(
            f"Alembic migrations completed successfully for schema: {schema_name}"

async def provision_tenant(tenant_id: str, email: str) -> None:
    if not MULTI_TENANT:
        raise HTTPException(status_code=403, detail="Multi-tenancy is not enabled")

    if user_owns_a_tenant(email):
        raise HTTPException(
            status_code=409, detail="User already belongs to an organization"
        )

    logger.info(f"Provisioning tenant: {tenant_id}")
    token = None

    try:
        if not create_schema_if_not_exists(tenant_id):
            logger.info(f"Created schema for tenant {tenant_id}")
        else:
            logger.info(f"Schema already exists for tenant {tenant_id}")

        token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)

        # Await the Alembic migrations
        await asyncio.to_thread(run_alembic_migrations, tenant_id)

        with get_session_with_tenant(tenant_id) as db_session:
            setup_danswer(db_session, tenant_id)
            configure_default_api_keys(db_session)

        add_users_to_tenant([email], tenant_id)

    except Exception as e:
        logger.exception(f"Alembic migration failed for schema {schema_name}: {str(e)}")
        raise


def ensure_schema_exists(tenant_id: str) -> bool:
    with Session(get_sqlalchemy_engine()) as db_session:
        with db_session.begin():
            result = db_session.execute(
                text(
                    "SELECT schema_name FROM information_schema.schemata WHERE schema_name = :schema_name"
                ),
                {"schema_name": tenant_id},
            )
            schema_exists = result.scalar() is not None
            if not schema_exists:
                stmt = CreateSchema(tenant_id)
                db_session.execute(stmt)
                return True
            return False


# For now, we're implementing a primitive mapping between users and tenants.
# This function is only used to determine a user's relationship to a tenant upon creation (implying ownership).
def user_owns_a_tenant(email: str) -> bool:
    with get_session_with_tenant(POSTGRES_DEFAULT_SCHEMA) as db_session:
        result = (
            db_session.query(UserTenantMapping)
            .filter(UserTenantMapping.email == email)
            .first()
        logger.exception(f"Failed to create tenant {tenant_id}")
        raise HTTPException(
            status_code=500, detail=f"Failed to create tenant: {str(e)}"
        )
        return result is not None
    finally:
        if token is not None:
            CURRENT_TENANT_ID_CONTEXTVAR.reset(token)


def add_users_to_tenant(emails: list[str], tenant_id: str) -> None:
    with get_session_with_tenant(POSTGRES_DEFAULT_SCHEMA) as db_session:
        try:
            for email in emails:
                db_session.add(UserTenantMapping(email=email, tenant_id=tenant_id))
        except Exception as e:
            logger.exception(f"Failed to add users to tenant {tenant_id}: {str(e)}")
        db_session.commit()
async def notify_control_plane(tenant_id: str, email: str) -> None:
    logger.info("Fetching billing information")
    token = generate_data_plane_token()
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }
    payload = TenantCreationPayload(tenant_id=tenant_id, email=email)


def remove_users_from_tenant(emails: list[str], tenant_id: str) -> None:
    with get_session_with_tenant(POSTGRES_DEFAULT_SCHEMA) as db_session:
        try:
            mappings_to_delete = (
                db_session.query(UserTenantMapping)
                .filter(
                    UserTenantMapping.email.in_(emails),
                    UserTenantMapping.tenant_id == tenant_id,
    async with aiohttp.ClientSession() as session:
        async with session.post(
            f"{CONTROL_PLANE_API_BASE_URL}/tenants/create",
            headers=headers,
            json=payload.model_dump(),
        ) as response:
            if response.status != 200:
                error_text = await response.text()
                logger.error(f"Control plane tenant creation failed: {error_text}")
                raise Exception(
                    f"Failed to create tenant on control plane: {error_text}"
                )
                .all()
            )

            for mapping in mappings_to_delete:
                db_session.delete(mapping)

async def rollback_tenant_provisioning(tenant_id: str) -> None:
    # Logic to rollback tenant provisioning on data plane
    logger.info(f"Rolling back tenant provisioning for tenant_id: {tenant_id}")
    try:
        # Drop the tenant's schema to rollback provisioning
        drop_schema(tenant_id)
        # Remove tenant mapping
        with Session(get_sqlalchemy_engine()) as db_session:
            db_session.query(UserTenantMapping).filter(
                UserTenantMapping.tenant_id == tenant_id
            ).delete()
            db_session.commit()
    except Exception as e:
        logger.error(f"Failed to rollback tenant provisioning: {e}")


def configure_default_api_keys(db_session: Session) -> None:
    if OPENAI_DEFAULT_API_KEY:
        open_provider = LLMProviderUpsertRequest(
            name="OpenAI",
            provider=OPENAI_PROVIDER_NAME,
            api_key=OPENAI_DEFAULT_API_KEY,
            default_model_name="gpt-4",
            fast_default_model_name="gpt-4o-mini",
            model_names=OPEN_AI_MODEL_NAMES,
        )
        try:
            full_provider = upsert_llm_provider(open_provider, db_session)
            update_default_provider(full_provider.id, db_session)
        except Exception as e:
            logger.exception(
                f"Failed to remove users from tenant {tenant_id}: {str(e)}"
            )
            db_session.rollback()
            logger.error(f"Failed to configure OpenAI provider: {e}")
    else:
        logger.error(
            "OPENAI_DEFAULT_API_KEY not set, skipping OpenAI provider configuration"
        )

    if ANTHROPIC_DEFAULT_API_KEY:
        anthropic_provider = LLMProviderUpsertRequest(
            name="Anthropic",
            provider=ANTHROPIC_PROVIDER_NAME,
            api_key=ANTHROPIC_DEFAULT_API_KEY,
            default_model_name="claude-3-5-sonnet-20241022",
            fast_default_model_name="claude-3-5-sonnet-20241022",
            model_names=ANTHROPIC_MODEL_NAMES,
        )
        try:
            full_provider = upsert_llm_provider(anthropic_provider, db_session)
            update_default_provider(full_provider.id, db_session)
        except Exception as e:
            logger.error(f"Failed to configure Anthropic provider: {e}")
    else:
        logger.error(
            "ANTHROPIC_DEFAULT_API_KEY not set, skipping Anthropic provider configuration"
        )

    if COHERE_DEFAULT_API_KEY:
        cloud_embedding_provider = CloudEmbeddingProviderCreationRequest(
            provider_type=EmbeddingProvider.COHERE,
            api_key=COHERE_DEFAULT_API_KEY,
        )
        try:
            upsert_cloud_embedding_provider(db_session, cloud_embedding_provider)
        except Exception as e:
            logger.error(f"Failed to configure Cohere embedding provider: {e}")
    else:
        logger.error(
            "COHERE_DEFAULT_API_KEY not set, skipping Cohere embedding provider configuration"
        )
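Taken together, the reworked provisioning flow above is: resolve the tenant for an email, provision one if none exists (create the schema, run migrations in a thread, seed default providers), notify the control plane, and drop the schema again if any step fails. A minimal, hypothetical driver showing how the entry point is awaited:

import asyncio

# Hypothetical driver; in the application this is reached from the auth flow,
# not from a standalone script.
async def provision_for_new_signup(email: str) -> str:
    tenant_id = await get_or_create_tenant_id(email)
    return tenant_id

if __name__ == "__main__":
    print(asyncio.run(provision_for_new_signup("new.user@example.com")))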
76  backend/ee/danswer/server/tenants/schema_management.py  Normal file
@@ -0,0 +1,76 @@
import logging
import os
from types import SimpleNamespace

from sqlalchemy import text
from sqlalchemy.orm import Session
from sqlalchemy.schema import CreateSchema

from alembic import command
from alembic.config import Config
from danswer.db.engine import build_connection_string
from danswer.db.engine import get_sqlalchemy_engine

logger = logging.getLogger(__name__)


def run_alembic_migrations(schema_name: str) -> None:
    logger.info(f"Starting Alembic migrations for schema: {schema_name}")

    try:
        current_dir = os.path.dirname(os.path.abspath(__file__))
        root_dir = os.path.abspath(os.path.join(current_dir, "..", "..", "..", ".."))
        alembic_ini_path = os.path.join(root_dir, "alembic.ini")

        # Configure Alembic
        alembic_cfg = Config(alembic_ini_path)
        alembic_cfg.set_main_option("sqlalchemy.url", build_connection_string())
        alembic_cfg.set_main_option(
            "script_location", os.path.join(root_dir, "alembic")
        )

        # Ensure that logging isn't broken
        alembic_cfg.attributes["configure_logger"] = False

        # Mimic command-line options by adding 'cmd_opts' to the config
        alembic_cfg.cmd_opts = SimpleNamespace() # type: ignore
        alembic_cfg.cmd_opts.x = [f"schema={schema_name}"] # type: ignore

        # Run migrations programmatically
        command.upgrade(alembic_cfg, "head")

        # Run migrations programmatically
        logger.info(
            f"Alembic migrations completed successfully for schema: {schema_name}"
        )

    except Exception as e:
        logger.exception(f"Alembic migration failed for schema {schema_name}: {str(e)}")
        raise


def create_schema_if_not_exists(tenant_id: str) -> bool:
    with Session(get_sqlalchemy_engine()) as db_session:
        with db_session.begin():
            result = db_session.execute(
                text(
                    "SELECT schema_name FROM information_schema.schemata WHERE schema_name = :schema_name"
                ),
                {"schema_name": tenant_id},
            )
            schema_exists = result.scalar() is not None
            if not schema_exists:
                stmt = CreateSchema(tenant_id)
                db_session.execute(stmt)
                return True
            return False


def drop_schema(tenant_id: str) -> None:
    if not tenant_id.isidentifier():
        raise ValueError("Invalid tenant_id.")
    with get_sqlalchemy_engine().connect() as connection:
        connection.execute(
            text("DROP SCHEMA IF EXISTS %(schema_name)s CASCADE"),
            {"schema_name": tenant_id},
        )
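One note on drop_schema above: Postgres does not accept bound parameters in DDL identifier position, and SQLAlchemy's text() only substitutes :name-style placeholders, so the %(schema_name)s placeholder likely reaches the server unresolved. A hedged sketch of an alternative that reuses the existing isidentifier() guard and inlines the validated name instead:

from sqlalchemy import text

def drop_schema_validated(tenant_id: str) -> None:
    # Same guard as the committed version: refuse anything that is not a bare identifier.
    if not tenant_id.isidentifier():
        raise ValueError("Invalid tenant_id.")
    # begin() commits the DDL; the identifier is quoted and inlined rather than bound.
    with get_sqlalchemy_engine().begin() as connection:
        connection.execute(text(f'DROP SCHEMA IF EXISTS "{tenant_id}" CASCADE'))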
70  backend/ee/danswer/server/tenants/user_mapping.py  Normal file
@@ -0,0 +1,70 @@
import logging

from fastapi_users import exceptions
from sqlalchemy import select
from sqlalchemy.orm import Session

from danswer.db.engine import get_session_with_tenant
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.models import UserTenantMapping
from shared_configs.configs import MULTI_TENANT
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA

logger = logging.getLogger(__name__)


def get_tenant_id_for_email(email: str) -> str:
    if not MULTI_TENANT:
        return POSTGRES_DEFAULT_SCHEMA
    # Implement logic to get tenant_id from the mapping table
    with Session(get_sqlalchemy_engine()) as db_session:
        result = db_session.execute(
            select(UserTenantMapping.tenant_id).where(UserTenantMapping.email == email)
        )
        tenant_id = result.scalar_one_or_none()
        if tenant_id is None:
            raise exceptions.UserNotExists()
        return tenant_id


def user_owns_a_tenant(email: str) -> bool:
    with get_session_with_tenant(POSTGRES_DEFAULT_SCHEMA) as db_session:
        result = (
            db_session.query(UserTenantMapping)
            .filter(UserTenantMapping.email == email)
            .first()
        )
        return result is not None


def add_users_to_tenant(emails: list[str], tenant_id: str) -> None:
    with get_session_with_tenant(POSTGRES_DEFAULT_SCHEMA) as db_session:
        try:
            for email in emails:
                db_session.add(UserTenantMapping(email=email, tenant_id=tenant_id))
        except Exception:
            logger.exception(f"Failed to add users to tenant {tenant_id}")
        db_session.commit()


def remove_users_from_tenant(emails: list[str], tenant_id: str) -> None:
    with get_session_with_tenant(POSTGRES_DEFAULT_SCHEMA) as db_session:
        try:
            mappings_to_delete = (
                db_session.query(UserTenantMapping)
                .filter(
                    UserTenantMapping.email.in_(emails),
                    UserTenantMapping.tenant_id == tenant_id,
                )
                .all()
            )

            for mapping in mappings_to_delete:
                db_session.delete(mapping)

            db_session.commit()
        except Exception as e:
            logger.exception(
                f"Failed to remove users from tenant {tenant_id}: {str(e)}"
            )
            db_session.rollback()
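A short, hypothetical round trip over these helpers (placeholder values; with MULTI_TENANT disabled, get_tenant_id_for_email short-circuits to the default schema instead):

# Assumes MULTI_TENANT is enabled and the public-schema mapping table exists.
add_users_to_tenant(["alice@example.com"], "tenant_demo")
assert user_owns_a_tenant("alice@example.com")
assert get_tenant_id_for_email("alice@example.com") == "tenant_demo"
remove_users_from_tenant(["alice@example.com"], "tenant_demo")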
@@ -8,14 +8,14 @@ from danswer.auth.users import current_admin_user
from danswer.auth.users import current_curator_or_admin_user
from danswer.db.engine import get_session
from danswer.db.models import User
from danswer.db.token_limit import fetch_all_user_token_rate_limits
from danswer.db.token_limit import insert_user_token_rate_limit
from danswer.server.query_and_chat.token_limit import any_rate_limit_exists
from danswer.server.token_rate_limits.models import TokenRateLimitArgs
from danswer.server.token_rate_limits.models import TokenRateLimitDisplay
from ee.danswer.db.token_limit import fetch_all_user_group_token_rate_limits_by_group
from ee.danswer.db.token_limit import fetch_all_user_token_rate_limits
from ee.danswer.db.token_limit import fetch_user_group_token_rate_limits
from ee.danswer.db.token_limit import insert_user_group_token_rate_limit
from ee.danswer.db.token_limit import insert_user_token_rate_limit

router = APIRouter(prefix="/admin/token-rate-limits")
Some files were not shown because too many files have changed in this diff.