Compare commits

..

4 Commits

Author        SHA1         Message                          Date
pablodanswer  8b9e1a07d5   typing                           2024-11-11 09:26:46 -08:00
pablodanswer  b6301ffcb9   spacing                          2024-11-11 09:05:01 -08:00
pablodanswer  490ce0db18   cleaner approach                 2024-11-11 09:03:49 -08:00
pablodanswer  b2ca13eaae   treat async values differently   2024-11-11 08:59:16 -08:00
389 changed files with 16379 additions and 20475 deletions


@@ -65,7 +65,6 @@ jobs:
NEXT_PUBLIC_POSTHOG_KEY=${{ secrets.POSTHOG_KEY }}
NEXT_PUBLIC_POSTHOG_HOST=${{ secrets.POSTHOG_HOST }}
NEXT_PUBLIC_SENTRY_DSN=${{ secrets.SENTRY_DSN }}
NEXT_PUBLIC_GTM_ENABLED=true
# needed due to weird interactions with the builds for different platforms
no-cache: true
labels: ${{ steps.meta.outputs.labels }}


@@ -13,10 +13,7 @@ on:
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
CONFLUENCE_TEST_SPACE_URL: ${{ secrets.CONFLUENCE_TEST_SPACE_URL }}
CONFLUENCE_USER_NAME: ${{ secrets.CONFLUENCE_USER_NAME }}
CONFLUENCE_ACCESS_TOKEN: ${{ secrets.CONFLUENCE_ACCESS_TOKEN }}
jobs:
integration-tests:
# See https://runs-on.com/runners/linux/
@@ -198,13 +195,9 @@ jobs:
-e API_SERVER_HOST=api_server \
-e OPENAI_API_KEY=${OPENAI_API_KEY} \
-e SLACK_BOT_TOKEN=${SLACK_BOT_TOKEN} \
-e CONFLUENCE_TEST_SPACE_URL=${CONFLUENCE_TEST_SPACE_URL} \
-e CONFLUENCE_USER_NAME=${CONFLUENCE_USER_NAME} \
-e CONFLUENCE_ACCESS_TOKEN=${CONFLUENCE_ACCESS_TOKEN} \
-e TEST_WEB_HOSTNAME=test-runner \
danswer/danswer-integration:test \
/app/tests/integration/tests \
/app/tests/integration/connector_job_tests
/app/tests/integration/tests
continue-on-error: true
id: run_tests


@@ -1,225 +0,0 @@
name: Run Chromatic Tests
concurrency:
group: Run-Chromatic-Tests-${{ github.workflow }}-${{ github.head_ref || github.event.workflow_run.head_branch || github.run_id }}
cancel-in-progress: true
on: push
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
jobs:
playwright-tests:
name: Playwright Tests
# See https://runs-on.com/runners/linux/
runs-on: [runs-on,runner=8cpu-linux-x64,ram=16,"run-id=${{ github.run_id }}"]
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'
cache-dependency-path: |
backend/requirements/default.txt
backend/requirements/dev.txt
backend/requirements/model_server.txt
- run: |
python -m pip install --upgrade pip
pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt
- name: Setup node
uses: actions/setup-node@v4
with:
node-version: 22
- name: Install node dependencies
working-directory: ./web
run: npm ci
- name: Install playwright browsers
working-directory: ./web
run: npx playwright install --with-deps
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_TOKEN }}
# tag every docker image with "test" so that we can spin up the correct set
# of images during testing
# we use the runs-on cache for docker builds
# in conjunction with runs-on runners, it has better speed and unlimited caching
# https://runs-on.com/caching/s3-cache-for-github-actions/
# https://runs-on.com/caching/docker/
# https://github.com/moby/buildkit#s3-cache-experimental
# images are built and run locally for testing purposes. Not pushed.
- name: Build Web Docker image
uses: ./.github/actions/custom-build-and-push
with:
context: ./web
file: ./web/Dockerfile
platforms: linux/amd64
tags: danswer/danswer-web-server:test
push: false
load: true
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/web-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/web-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
- name: Build Backend Docker image
uses: ./.github/actions/custom-build-and-push
with:
context: ./backend
file: ./backend/Dockerfile
platforms: linux/amd64
tags: danswer/danswer-backend:test
push: false
load: true
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/backend/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
- name: Build Model Server Docker image
uses: ./.github/actions/custom-build-and-push
with:
context: ./backend
file: ./backend/Dockerfile.model_server
platforms: linux/amd64
tags: danswer/danswer-model-server:test
push: false
load: true
cache-from: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }}
cache-to: type=s3,prefix=cache/${{ github.repository }}/integration-tests/model-server/,region=${{ env.RUNS_ON_AWS_REGION }},bucket=${{ env.RUNS_ON_S3_BUCKET_CACHE }},mode=max
- name: Start Docker containers
run: |
cd deployment/docker_compose
ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=true \
AUTH_TYPE=basic \
REQUIRE_EMAIL_VERIFICATION=false \
DISABLE_TELEMETRY=true \
IMAGE_TAG=test \
docker compose -f docker-compose.dev.yml -p danswer-stack up -d
id: start_docker
- name: Wait for service to be ready
run: |
echo "Starting wait-for-service script..."
docker logs -f danswer-stack-api_server-1 &
start_time=$(date +%s)
timeout=300 # 5 minutes in seconds
while true; do
current_time=$(date +%s)
elapsed_time=$((current_time - start_time))
if [ $elapsed_time -ge $timeout ]; then
echo "Timeout reached. Service did not become ready in 5 minutes."
exit 1
fi
# Use curl with error handling to ignore specific exit code 56
response=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/health || echo "curl_error")
if [ "$response" = "200" ]; then
echo "Service is ready!"
break
elif [ "$response" = "curl_error" ]; then
echo "Curl encountered an error, possibly exit code 56. Continuing to retry..."
else
echo "Service not ready yet (HTTP status $response). Retrying in 5 seconds..."
fi
sleep 5
done
echo "Finished waiting for service."
- name: Run pytest playwright test init
working-directory: ./backend
env:
PYTEST_IGNORE_SKIP: true
run: pytest -s tests/integration/tests/playwright/test_playwright.py
- name: Run Playwright tests
working-directory: ./web
run: npx playwright test
- uses: actions/upload-artifact@v4
if: always()
with:
# Chromatic automatically defaults to the test-results directory.
# Replace with the path to your custom directory and adjust the CHROMATIC_ARCHIVE_LOCATION environment variable accordingly.
name: test-results
path: ./web/test-results
retention-days: 30
# save before stopping the containers so the logs can be captured
- name: Save Docker logs
if: success() || failure()
run: |
cd deployment/docker_compose
docker compose -f docker-compose.dev.yml -p danswer-stack logs > docker-compose.log
mv docker-compose.log ${{ github.workspace }}/docker-compose.log
- name: Upload logs
if: success() || failure()
uses: actions/upload-artifact@v4
with:
name: docker-logs
path: ${{ github.workspace }}/docker-compose.log
- name: Stop Docker containers
run: |
cd deployment/docker_compose
docker compose -f docker-compose.dev.yml -p danswer-stack down -v
chromatic-tests:
name: Chromatic Tests
needs: playwright-tests
runs-on: [runs-on,runner=8cpu-linux-x64,ram=16,"run-id=${{ github.run_id }}"]
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Setup node
uses: actions/setup-node@v4
with:
node-version: 22
- name: Install node dependencies
working-directory: ./web
run: npm ci
- name: Download Playwright test results
uses: actions/download-artifact@v4
with:
name: test-results
path: ./web/test-results
- name: Run Chromatic
uses: chromaui/action@latest
with:
playwright: true
projectToken: ${{ secrets.CHROMATIC_PROJECT_TOKEN }}
workingDir: ./web
env:
CHROMATIC_ARCHIVE_LOCATION: ./test-results


@@ -23,6 +23,21 @@ jobs:
with:
version: v3.14.4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
cache: 'pip'
cache-dependency-path: |
backend/requirements/default.txt
backend/requirements/dev.txt
backend/requirements/model_server.txt
- run: |
python -m pip install --upgrade pip
pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt
- name: Set up chart-testing
uses: helm/chart-testing-action@v2.6.1
@@ -37,22 +52,6 @@ jobs:
echo "changed=true" >> "$GITHUB_OUTPUT"
fi
# rkuo: I don't think we need python?
# - name: Set up Python
# uses: actions/setup-python@v5
# with:
# python-version: '3.11'
# cache: 'pip'
# cache-dependency-path: |
# backend/requirements/default.txt
# backend/requirements/dev.txt
# backend/requirements/model_server.txt
# - run: |
# python -m pip install --upgrade pip
# pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
# pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
# pip install --retries 5 --timeout 30 -r backend/requirements/model_server.txt
# lint all charts if any changes were detected
- name: Run chart-testing (lint)
if: steps.list-changed.outputs.changed == 'true'


@@ -20,7 +20,6 @@ env:
JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
# Google
GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR }}
GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1 }}
GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR }}
GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_GMAIL_SERVICE_ACCOUNT_JSON_STR }}
GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR: ${{ secrets.GOOGLE_GMAIL_OAUTH_CREDENTIALS_JSON_STR }}

.gitignore

@@ -7,4 +7,3 @@
.vscode/
*.sw?
/backend/tests/regression/answer_quality/search_test_config.yaml
/web/test-results/


@@ -203,7 +203,7 @@
"--loglevel=INFO",
"--hostname=light@%n",
"-Q",
"vespa_metadata_sync,connector_deletion,doc_permissions_upsert",
"vespa_metadata_sync,connector_deletion",
],
"presentation": {
"group": "2",
@@ -232,7 +232,7 @@
"--loglevel=INFO",
"--hostname=heavy@%n",
"-Q",
"connector_pruning,connector_doc_permissions_sync,connector_external_group_sync",
"connector_pruning",
],
"presentation": {
"group": "2",


@@ -12,7 +12,7 @@
<a href="https://docs.danswer.dev/" target="_blank">
<img src="https://img.shields.io/badge/docs-view-blue" alt="Documentation">
</a>
<a href="https://join.slack.com/t/danswer/shared_invite/zt-2twesxdr6-5iQitKZQpgq~hYIZ~dv3KA" target="_blank">
<a href="https://join.slack.com/t/danswer/shared_invite/zt-2lcmqw703-071hBuZBfNEOGUsLa5PXvQ" target="_blank">
<img src="https://img.shields.io/badge/slack-join-blue.svg?logo=slack" alt="Slack">
</a>
<a href="https://discord.gg/TDJ59cGV2X" target="_blank">
@@ -135,7 +135,7 @@ Looking to contribute? Please check out the [Contribution Guide](CONTRIBUTING.md
## ✨Contributors
<a href="https://github.com/danswer-ai/danswer/graphs/contributors">
<a href="https://github.com/aryn-ai/sycamore/graphs/contributors">
<img alt="contributors" src="https://contrib.rocks/image?repo=danswer-ai/danswer"/>
</a>


@@ -1,59 +0,0 @@
"""display custom llm models
Revision ID: 177de57c21c9
Revises: 4ee1287bd26a
Create Date: 2024-11-21 11:49:04.488677
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
from sqlalchemy import and_
revision = "177de57c21c9"
down_revision = "4ee1287bd26a"
branch_labels = None
depends_on = None
def upgrade() -> None:
conn = op.get_bind()
llm_provider = sa.table(
"llm_provider",
sa.column("id", sa.Integer),
sa.column("provider", sa.String),
sa.column("model_names", postgresql.ARRAY(sa.String)),
sa.column("display_model_names", postgresql.ARRAY(sa.String)),
)
excluded_providers = ["openai", "bedrock", "anthropic", "azure"]
providers_to_update = sa.select(
llm_provider.c.id,
llm_provider.c.model_names,
llm_provider.c.display_model_names,
).where(
and_(
~llm_provider.c.provider.in_(excluded_providers),
llm_provider.c.model_names.isnot(None),
)
)
results = conn.execute(providers_to_update).fetchall()
for provider_id, model_names, display_model_names in results:
if display_model_names is None:
display_model_names = []
combined_model_names = list(set(display_model_names + model_names))
update_stmt = (
llm_provider.update()
.where(llm_provider.c.id == provider_id)
.values(display_model_names=combined_model_names)
)
conn.execute(update_stmt)
def downgrade() -> None:
pass


@@ -1,68 +0,0 @@
"""default chosen assistants to none
Revision ID: 26b931506ecb
Revises: 2daa494a0851
Create Date: 2024-11-12 13:23:29.858995
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "26b931506ecb"
down_revision = "2daa494a0851"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.add_column(
"user", sa.Column("chosen_assistants_new", postgresql.JSONB(), nullable=True)
)
op.execute(
"""
UPDATE "user"
SET chosen_assistants_new =
CASE
WHEN chosen_assistants = '[-2, -1, 0]' THEN NULL
ELSE chosen_assistants
END
"""
)
op.drop_column("user", "chosen_assistants")
op.alter_column(
"user", "chosen_assistants_new", new_column_name="chosen_assistants"
)
def downgrade() -> None:
op.add_column(
"user",
sa.Column(
"chosen_assistants_old",
postgresql.JSONB(),
nullable=False,
server_default="[-2, -1, 0]",
),
)
op.execute(
"""
UPDATE "user"
SET chosen_assistants_old =
CASE
WHEN chosen_assistants IS NULL THEN '[-2, -1, 0]'::jsonb
ELSE chosen_assistants
END
"""
)
op.drop_column("user", "chosen_assistants")
op.alter_column(
"user", "chosen_assistants_old", new_column_name="chosen_assistants"
)


@@ -1,30 +0,0 @@
"""add-group-sync-time
Revision ID: 2daa494a0851
Revises: c0fd6e4da83a
Create Date: 2024-11-11 10:57:22.991157
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "2daa494a0851"
down_revision = "c0fd6e4da83a"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.add_column(
"connector_credential_pair",
sa.Column(
"last_time_external_group_sync",
sa.DateTime(timezone=True),
nullable=True,
),
)
def downgrade() -> None:
op.drop_column("connector_credential_pair", "last_time_external_group_sync")


@@ -1,45 +0,0 @@
"""add persona categories
Revision ID: 47e5bef3a1d7
Revises: dfbe9e93d3c7
Create Date: 2024-11-05 18:55:02.221064
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "47e5bef3a1d7"
down_revision = "dfbe9e93d3c7"
branch_labels = None
depends_on = None
def upgrade() -> None:
# Create the persona_category table
op.create_table(
"persona_category",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("name", sa.String(), nullable=False),
sa.Column("description", sa.String(), nullable=True),
sa.PrimaryKeyConstraint("id"),
sa.UniqueConstraint("name"),
)
# Add category_id to persona table
op.add_column("persona", sa.Column("category_id", sa.Integer(), nullable=True))
op.create_foreign_key(
"fk_persona_category",
"persona",
"persona_category",
["category_id"],
["id"],
ondelete="SET NULL",
)
def downgrade() -> None:
op.drop_constraint("fk_persona_category", "persona", type_="foreignkey")
op.drop_column("persona", "category_id")
op.drop_table("persona_category")


@@ -1,280 +0,0 @@
"""add_multiple_slack_bot_support
Revision ID: 4ee1287bd26a
Revises: 47e5bef3a1d7
Create Date: 2024-11-06 13:15:53.302644
"""
import logging
from typing import cast
from alembic import op
import sqlalchemy as sa
from sqlalchemy.orm import Session
from danswer.key_value_store.factory import get_kv_store
from danswer.db.models import SlackBot
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = "4ee1287bd26a"
down_revision = "47e5bef3a1d7"
branch_labels: None = None
depends_on: None = None
# Configure logging
logger = logging.getLogger("alembic.runtime.migration")
logger.setLevel(logging.INFO)
def upgrade() -> None:
logger.info(f"{revision}: create_table: slack_bot")
# Create new slack_bot table
op.create_table(
"slack_bot",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("name", sa.String(), nullable=False),
sa.Column("enabled", sa.Boolean(), nullable=False, server_default="true"),
sa.Column("bot_token", sa.LargeBinary(), nullable=False),
sa.Column("app_token", sa.LargeBinary(), nullable=False),
sa.PrimaryKeyConstraint("id"),
sa.UniqueConstraint("bot_token"),
sa.UniqueConstraint("app_token"),
)
# # Create new slack_channel_config table
op.create_table(
"slack_channel_config",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("slack_bot_id", sa.Integer(), nullable=True),
sa.Column("persona_id", sa.Integer(), nullable=True),
sa.Column("channel_config", postgresql.JSONB(), nullable=False),
sa.Column("response_type", sa.String(), nullable=False),
sa.Column(
"enable_auto_filters", sa.Boolean(), nullable=False, server_default="false"
),
sa.ForeignKeyConstraint(
["slack_bot_id"],
["slack_bot.id"],
),
sa.ForeignKeyConstraint(
["persona_id"],
["persona.id"],
),
sa.PrimaryKeyConstraint("id"),
)
# Handle existing Slack bot tokens first
logger.info(f"{revision}: Checking for existing Slack bot.")
bot_token = None
app_token = None
first_row_id = None
try:
tokens = cast(dict, get_kv_store().load("slack_bot_tokens_config_key"))
except Exception:
logger.warning("No existing Slack bot tokens found.")
tokens = {}
bot_token = tokens.get("bot_token")
app_token = tokens.get("app_token")
if bot_token and app_token:
logger.info(f"{revision}: Found bot and app tokens.")
session = Session(bind=op.get_bind())
new_slack_bot = SlackBot(
name="Slack Bot (Migrated)",
enabled=True,
bot_token=bot_token,
app_token=app_token,
)
session.add(new_slack_bot)
session.commit()
first_row_id = new_slack_bot.id
# Create a default bot if none exists
# This is in case there are no slack tokens but there are channels configured
op.execute(
sa.text(
"""
INSERT INTO slack_bot (name, enabled, bot_token, app_token)
SELECT 'Default Bot', true, '', ''
WHERE NOT EXISTS (SELECT 1 FROM slack_bot)
RETURNING id;
"""
)
)
# Get the bot ID to use (either from existing migration or newly created)
bot_id_query = sa.text(
"""
SELECT COALESCE(
:first_row_id,
(SELECT id FROM slack_bot ORDER BY id ASC LIMIT 1)
) as bot_id;
"""
)
result = op.get_bind().execute(bot_id_query, {"first_row_id": first_row_id})
bot_id = result.scalar()
# CTE (Common Table Expression) that transforms the old slack_bot_config table data
# This splits up the channel_names into their own rows
channel_names_cte = """
WITH channel_names AS (
SELECT
sbc.id as config_id,
sbc.persona_id,
sbc.response_type,
sbc.enable_auto_filters,
jsonb_array_elements_text(sbc.channel_config->'channel_names') as channel_name,
sbc.channel_config->>'respond_tag_only' as respond_tag_only,
sbc.channel_config->>'respond_to_bots' as respond_to_bots,
sbc.channel_config->'respond_member_group_list' as respond_member_group_list,
sbc.channel_config->'answer_filters' as answer_filters,
sbc.channel_config->'follow_up_tags' as follow_up_tags
FROM slack_bot_config sbc
)
"""
# Insert the channel names into the new slack_channel_config table
insert_statement = """
INSERT INTO slack_channel_config (
slack_bot_id,
persona_id,
channel_config,
response_type,
enable_auto_filters
)
SELECT
:bot_id,
channel_name.persona_id,
jsonb_build_object(
'channel_name', channel_name.channel_name,
'respond_tag_only',
COALESCE((channel_name.respond_tag_only)::boolean, false),
'respond_to_bots',
COALESCE((channel_name.respond_to_bots)::boolean, false),
'respond_member_group_list',
COALESCE(channel_name.respond_member_group_list, '[]'::jsonb),
'answer_filters',
COALESCE(channel_name.answer_filters, '[]'::jsonb),
'follow_up_tags',
COALESCE(channel_name.follow_up_tags, '[]'::jsonb)
),
channel_name.response_type,
channel_name.enable_auto_filters
FROM channel_names channel_name;
"""
op.execute(sa.text(channel_names_cte + insert_statement).bindparams(bot_id=bot_id))
# Clean up old tokens if they existed
try:
if bot_token and app_token:
logger.info(f"{revision}: Removing old bot and app tokens.")
get_kv_store().delete("slack_bot_tokens_config_key")
except Exception:
logger.warning("tried to delete tokens in dynamic config but failed")
# Rename the table
op.rename_table(
"slack_bot_config__standard_answer_category",
"slack_channel_config__standard_answer_category",
)
# Rename the column
op.alter_column(
"slack_channel_config__standard_answer_category",
"slack_bot_config_id",
new_column_name="slack_channel_config_id",
)
# Drop the table with CASCADE to handle dependent objects
op.execute("DROP TABLE slack_bot_config CASCADE")
logger.info(f"{revision}: Migration complete.")
def downgrade() -> None:
# Recreate the old slack_bot_config table
op.create_table(
"slack_bot_config",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("persona_id", sa.Integer(), nullable=True),
sa.Column("channel_config", postgresql.JSONB(), nullable=False),
sa.Column("response_type", sa.String(), nullable=False),
sa.Column("enable_auto_filters", sa.Boolean(), nullable=False),
sa.ForeignKeyConstraint(
["persona_id"],
["persona.id"],
),
sa.PrimaryKeyConstraint("id"),
)
# Migrate data back to the old format
# Group by persona_id to combine channel names back into arrays
op.execute(
sa.text(
"""
INSERT INTO slack_bot_config (
persona_id,
channel_config,
response_type,
enable_auto_filters
)
SELECT DISTINCT ON (persona_id)
persona_id,
jsonb_build_object(
'channel_names', (
SELECT jsonb_agg(c.channel_config->>'channel_name')
FROM slack_channel_config c
WHERE c.persona_id = scc.persona_id
),
'respond_tag_only', (channel_config->>'respond_tag_only')::boolean,
'respond_to_bots', (channel_config->>'respond_to_bots')::boolean,
'respond_member_group_list', channel_config->'respond_member_group_list',
'answer_filters', channel_config->'answer_filters',
'follow_up_tags', channel_config->'follow_up_tags'
),
response_type,
enable_auto_filters
FROM slack_channel_config scc
WHERE persona_id IS NOT NULL;
"""
)
)
# Rename the table back
op.rename_table(
"slack_channel_config__standard_answer_category",
"slack_bot_config__standard_answer_category",
)
# Rename the column back
op.alter_column(
"slack_bot_config__standard_answer_category",
"slack_channel_config_id",
new_column_name="slack_bot_config_id",
)
# Try to save the first bot's tokens back to KV store
try:
first_bot = (
op.get_bind()
.execute(
sa.text(
"SELECT bot_token, app_token FROM slack_bot ORDER BY id LIMIT 1"
)
)
.first()
)
if first_bot and first_bot.bot_token and first_bot.app_token:
tokens = {
"bot_token": first_bot.bot_token,
"app_token": first_bot.app_token,
}
get_kv_store().store("slack_bot_tokens_config_key", tokens)
except Exception:
logger.warning("Failed to save tokens back to KV store")
# Drop the new tables in reverse order
op.drop_table("slack_channel_config")
op.drop_table("slack_bot")


@@ -7,7 +7,6 @@ Create Date: 2024-10-26 13:06:06.937969
"""
from alembic import op
from sqlalchemy.orm import Session
from sqlalchemy import text
# Import your models and constants
from danswer.db.models import (
@@ -16,6 +15,7 @@ from danswer.db.models import (
Credential,
IndexAttempt,
)
from danswer.configs.constants import DocumentSource
# revision identifiers, used by Alembic.
@@ -30,11 +30,13 @@ def upgrade() -> None:
bind = op.get_bind()
session = Session(bind=bind)
# Get connectors using raw SQL
result = bind.execute(
text("SELECT id FROM connector WHERE source = 'requesttracker'")
connectors_to_delete = (
session.query(Connector)
.filter(Connector.source == DocumentSource.REQUESTTRACKER)
.all()
)
connector_ids = [row[0] for row in result]
connector_ids = [connector.id for connector in connectors_to_delete]
if connector_ids:
cc_pairs_to_delete = (


@@ -1,30 +0,0 @@
"""add creator to cc pair
Revision ID: 9cf5c00f72fe
Revises: 26b931506ecb
Create Date: 2024-11-12 15:16:42.682902
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "9cf5c00f72fe"
down_revision = "26b931506ecb"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.add_column(
"connector_credential_pair",
sa.Column(
"creator_id",
sa.UUID(as_uuid=True),
nullable=True,
),
)
def downgrade() -> None:
op.drop_column("connector_credential_pair", "creator_id")


@@ -1,29 +0,0 @@
"""add auto scroll to user model
Revision ID: a8c2065484e6
Revises: 177de57c21c9
Create Date: 2024-11-22 17:34:09.690295
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "a8c2065484e6"
down_revision = "177de57c21c9"
branch_labels = None
depends_on = None
def upgrade() -> None:
# Add the auto_scroll column with a default value of True
op.add_column(
"user",
sa.Column("auto_scroll", sa.Boolean(), nullable=True, server_default=None),
)
def downgrade() -> None:
# Remove the auto_scroll column
op.drop_column("user", "auto_scroll")


@@ -1,42 +0,0 @@
"""extended_role_for_non_web
Revision ID: dfbe9e93d3c7
Revises: 9cf5c00f72fe
Create Date: 2024-11-16 07:54:18.727906
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "dfbe9e93d3c7"
down_revision = "9cf5c00f72fe"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.execute(
"""
UPDATE "user"
SET role = 'EXT_PERM_USER'
WHERE has_web_login = false
"""
)
op.drop_column("user", "has_web_login")
def downgrade() -> None:
op.add_column(
"user",
sa.Column("has_web_login", sa.Boolean(), nullable=False, server_default="true"),
)
op.execute(
"""
UPDATE "user"
SET has_web_login = false,
role = 'BASIC'
WHERE role IN ('SLACK_USER', 'EXT_PERM_USER')
"""
)


@@ -1,551 +0,0 @@
Branch,Commit Hash,Author,Date,Subject
DAN-108,548c081fd6515c2e8b912d145c135e292db4613e,pablodanswer,2024-11-20,k
DAN-108,0d4abfdc85fdb62c347d0f649744f1b7c12e8011,pablodanswer,2024-11-20,folder clarity
a,36eee45a03c3227a9b070e18a043e16fe5179cb9,pablodanswer,2024-11-21,llm provider causing re render in effect
account_for_json,b37d0b91e6a6596af91e1fa32786591b76e05a67,pablodanswer,2024-11-14,fix single quote block in llm answer
account_for_json,4e0c048acba88f4c83d7c83af52bb0932234ddad,pablodanswer,2024-11-14,nit
account_for_json,a0371a6750476fccc3b9892a7c58d72182c92507,pablodanswer,2024-11-14,minor logic update
account_for_json,4f1c4baa80f7b747633bb3d528aed6de5b11f639,pablodanswer,2024-11-14,minor cosmetic update
account_for_json,b6ef7e713a4eca3d65aa411604e8f67ad5efdd87,pablodanswer,2024-11-14,k
account_for_json,66df9b6f7dae8bce61e35615d715ddefc6406614,pablodanswer,2024-11-14,improved fallback logic
account_for_json,0473888ccdb5219cc39f275652bfeb72a420b5d9,pablodanswer,2024-11-13,silence warning
accurate_user_counting,06f3a4590c05665b04851b30860aa431ad4b7217,pablodanswer,2024-11-02,ensure we remove users in time
accurate_user_counting,6e75ba007302ce9adc4469b86695aee4b4b5c513,pablodanswer,2024-11-02,validate
accurate_user_counting,11f3729ebb9f67b8e568c01a9ce1d098560033cf,pablodanswer,2024-11-02,update register
add_csv_display,e7b044cf38cd3e25fdbe17ea8fcac3e8c17d9570,pablodanswer,2024-11-03,nit
add_csv_display,93ec944a01ec87d87a4bf2b85c1164b7625a1259,pablodanswer,2024-11-02,update requirements
add_csv_display,00f8e431ff81d7980c8d2c166bdad5f899752379,pablodanswer,2024-11-02,create portal for modal
add_csv_display,a019a812bef27a20bd2e94d558974c55ded63035,pablodanswer,2024-11-02,restructure
add_csv_display,eabc519f062b5e0fec3b2c29e89f109606e747bc,pablodanswer,2024-11-01,add downloading
add_csv_display,4dbd74cacb350ebbf5ce0554239f999503a14d8f,pablodanswer,2024-11-01,add CSV display
add_tool_formats,e7361dcb17a1d205627e46c87861f5be4dc06a03,pablodanswer,2024-11-03,add multiple formats to tools
add_tool_formats,00f8e431ff81d7980c8d2c166bdad5f899752379,pablodanswer,2024-11-02,create portal for modal
add_tool_formats,a019a812bef27a20bd2e94d558974c55ded63035,pablodanswer,2024-11-02,restructure
add_tool_formats,eabc519f062b5e0fec3b2c29e89f109606e747bc,pablodanswer,2024-11-01,add downloading
add_tool_formats,4dbd74cacb350ebbf5ce0554239f999503a14d8f,pablodanswer,2024-11-01,add CSV display
admin_wonkiness,8a7f032acb35fca9260f1f15e48a6114279a1dc0,pablodanswer,2024-11-20,valid props
api_keys_are_not_users,39c3e3f84b56f2b1d661f723fe9650503d8602ad,pablodanswer,2024-11-01,typing
api_keys_are_not_users,cab9c925cc09b636e026f36057795a775d6a8289,pablodanswer,2024-11-01,don't count api keys as users
assistant_categories,425da2250c6cade36e9dfe4aa9eaca9f60ad7c1f,pablodanswer,2024-11-18,alembic (once again)
assistant_categories,c079165c60d58d781bb399220f0041a57dd27cde,pablodanswer,2024-11-18,alembic
assistant_categories,dc5f9e5aa2fbf1a502474bc56cbe9a5eaa34ed91,pablodanswer,2024-11-11,nit
assistant_categories,7ed84cf536aa5be737f4eff25e244def9987cfb3,pablodanswer,2024-11-11,typing
assistant_categories,30a58ad86d96f841103f9bf5ef92355ba7550e72,pablodanswer,2024-11-11,finalize
assistant_categories,4c5d0a45fd07dffa42717c78f4b20025ca7c67ad,pablodanswer,2024-11-11,update typing
assistant_categories,ed7c62b450dd1b42a8e399c8abcaac8ccb006b1d,pablodanswer,2024-11-11,minor update to tests
assistant_categories,501c6afdd0a8e4c67ee8ae864392549a19f68b85,pablodanswer,2024-11-11,post rebase update
assistant_categories,8cd7e50b26d8ac5d5311c1ffc4517c35c2a9a6b6,pablodanswer,2024-11-08,add tests
assistant_categories,ca0eb6f03344cf833b2aba45c5fbe4d01a112c6f,pablodanswer,2024-11-07,nit
assistant_categories,2041484a515ebaedaf05dc0e19e3cb5095b34018,pablodanswer,2024-11-07,update assistant category display
assistant_categories,a124d4e2229bcb9a9f1caf269c444357e4749700,pablodanswer,2024-11-07,finalize
assistant_categories,59fa1d07f10b7f44010207d54547b947ca789fe1,pablodanswer,2024-11-05,functionality finalized
assistant_categories,0a226b47e55dc6767dde8f478729616d1b4870f1,pablodanswer,2024-11-05,add assistant categories v1
assistant_clarity,71c60c52dd37ccebd2d4f8862676d5f21a64acf1,pablodanswer,2024-11-12,minor update
assistant_clarity,72f05a13485dab5a8ddd0d0e5ac7d4e98aed01a2,pablodanswer,2024-11-12,delete code
assistant_clarity,0c22f8ab20c32043c9e1f5f991989a07ecbd6387,pablodanswer,2024-11-12,delete code!
assistant_clarity,e376032f14621d645fda23f058b5712c33224e82,pablodanswer,2024-11-12,update paradigm
assistant_clarity,3f2738006951ffcf58ea59473da3070e8023a9d0,pablodanswer,2024-11-12,alembic fix
assistant_clarity,233f186fecb9eba7eefd6aa493ce70b299f68ac6,pablodanswer,2024-11-12,slight rejigger
assistant_clarity,0582306d9be29f7c3daff7b7d5a2c1ef1517e033,pablodanswer,2024-11-12,k
assistant_clarity,4f699b2591fe190abf1d68fefb3f2841c0f7f68e,pablodanswer,2024-11-12,add minor clarity
assistant_clarity,bc6d47a6c5702d102cc04c16e56426a1561fe3e5,pablodanswer,2024-11-12,minor clean up
assistant_clarity,09ec137a5f6fb230a0c39a67b19e9f772d3441ca,pablodanswer,2024-11-12,update organization
auth_categories,f51d87833e591bdcb9a650aa762060387a96a292,pablodanswer,2024-11-07,nit
auth_categories,01f93bab2f698bb0dc84bddb705de40a9a18e660,pablodanswer,2024-11-07,update assistant category display
auth_categories,b162e9f4c4c9ff4b9cd718f548cc20ab0e60be0f,pablodanswer,2024-11-07,finalize
auth_categories,c7097dffbd73e1b2d9b34ad67bbd8aa6e072c3b5,pablodanswer,2024-11-05,functionality finalized
auth_categories,653bbffb3cda5cbc41f61917e5634e22d70d5e26,pablodanswer,2024-11-05,add assistant categories v1
auto_prompts,06bc8f1f92e33af2c6bb1750936407ad8e29d3c0,pablodanswer,2024-10-28,base functionality
auto_prompts,8093ceeb45088c813fbb117302738b3d225c2f8b,pablodanswer,2024-10-28,formatting
auto_prompts,3d0ace1e450ac6d7271ddedc2ec122a2647be7df,pablodanswer,2024-10-28,minor nits
auto_prompts,553aba79dc41b928c163a83481b202ad56805aae,pablodanswer,2024-10-28,update based on feedback
auto_prompts,da038b317a0b5185ccc32297b01fcaa97ffbb429,pablodanswer,2024-09-21,remove logs
auto_prompts,6769dc373faf7576c2d0ac212735b88eae755293,pablodanswer,2024-09-21,minor udpate to ui
auto_prompts,b35e05315c4c506da87524fe788a9cf5aacb7375,pablodanswer,2024-09-20,use display name + minor updates to models
auto_prompts,7cfd3d2d442255616ec5c477dc4b3eb0b2cad1ed,pablodanswer,2024-09-20,cleaner cards
auto_prompts,b2aa1c864b20274386a1bbe699a3ef7e094bd858,pablodanswer,2024-09-20,slightly cleaner animation
auto_prompts,d2f8177b8f1b9be8eebce520204018e6be59b03c,pablodanswer,2024-09-20,cleaner initial chat screen
back_to_danswer,262a405195e1b1b07c96e1ae4a39df76b690ed69,pablodanswer,2024-11-06,update redirect
beat_robustification,63959454df29709c149b71f82672c8752c646cfa,pablodanswer,2024-11-03,Remove locks (#3017)
beat_robustification,96027f1d732f26b407afd2b52641615a96d5402b,pablodanswer,2024-11-02,ensure versioned apps capture
beat_robustification,80ea6a36610775a0e57ec236f9a2bdaf419a51e5,pablodanswer,2024-11-01,typing
beat_robustification,527c409f81a7d31c8ff6ebd2be465418476eba74,pablodanswer,2024-11-01,update
beat_robustification,19ab457d926a05a0d61ada33684918a5d427e619,pablodanswer,2024-11-01,address comments
beat_robustification,f5b38cd9362b4c7b84357a6fcf2bbeb4c1e7c8a8,pablodanswer,2024-10-30,nit
beat_robustification,63d1cc56acdeba0430d5da9f8b752cd470df865f,pablodanswer,2024-10-30,reorg
beat_robustification,4436bec97019893c256ee1750e28e3061edfd771,pablodanswer,2024-10-30,validate
beat_robustification,90b7198d53ec8b383051925de16a2818653c4fe3,pablodanswer,2024-10-30,add validated + reformatted dynamic beat acquisition
better_image_assistant_prompt,e9abbcdefdf21eef2000fc61342e4129bfd1498f,pablodanswer,2024-11-03,nit
better_image_assistant_prompt,89f51078690bed44b2809aa5229f39b4d543d88e,pablodanswer,2024-11-02,k
better_image_assistant_prompt,6972874aac31dcccd4ff739484b6a5b563e62405,pablodanswer,2024-11-02,slight upgrade to prompts
bg_processing_improvements,48d24860e6f5401a265951b8e49e900ed6e40f63,pablodanswer,2024-11-03,improvements
branding_update,12bbf2ad972a1f8887e5f5eb427b88261ef5097c,pablodanswer,2024-10-28,add additional configuration options
bugfix/async,8b9e1a07d55b3f090d168768a74d09d60ba19649,pablodanswer,2024-11-11,typing
bugfix/async,b6301ffcb9bb35f6d73c28ffd502bfb01f49272a,pablodanswer,2024-11-11,spacing
bugfix/async,490ce0db18df25625446a4abe163790b96431645,pablodanswer,2024-11-11,cleaner approach
bugfix/async,b2ca13eaae905af768519a62a38d3d84c239cba8,pablodanswer,2024-11-11,treat async values differently
bugfix/curator_interface,a7312f62366cff5243e4b85c5c47e33e5da29f5c,pablodanswer,2024-11-21,remove values
bugfix/curator_interface,85e08df5219f0e2e793beb65a1ce4dc36f2481d4,pablodanswer,2024-11-21,update user role
bugfix/curator_interface,937a07d705a8620f47336c1c6c125ae6b025a950,pablodanswer,2024-11-21,update
bugfix/curator_interface,1130d456aaa6ea38aeeacd234ab82504e3c5fc68,pablodanswer,2024-11-21,update
bugfix/curator_interface,cf4cda235ce02bfdea1f1cd17ad4f6a2e0f7f9f7,pablodanswer,2024-11-21,update config
bugfix/curator_interface,5a07f727c0563061398f50ed253f1efc2f83c176,pablodanswer,2024-11-21,mystery solved
bugfix/index_attempt_logging_2,209514815547074a31b3121bf47e7b1e350e817d,Richard Kuo (Danswer),2024-11-21,Move unfenced check to check_for_indexing. implement a double check pattern for all indexing error checks
bugfix/indexing_redux,0c068c47c2cb729a0450910f0f6b6d04b340b131,Richard Kuo (Danswer),2024-11-17,Merge branch 'main' of https://github.com/danswer-ai/danswer into bugfix/indexing_redux
bugfix/indexing_redux,1dfde97a5a52a8c4c3996d14348e9fffe6073743,Richard Kuo (Danswer),2024-11-14,refactor unknown index attempts and redis lock
bugfix/indexing_redux,5d95976bf1bc13caaa21655777e8e84efb682cd2,Richard Kuo (Danswer),2024-11-14,raise indexing lock timeout
bugfix/pagination,1a009c6b6a3d52302e5bbdec20c75ce15a678f5c,pablodanswer,2024-11-07,minor update
bugfix/pagination,e8cd2630e2bee96496b30f637a169df863e11495,pablodanswer,2024-11-06,minor update
bugfix/pagination,d835de1f5219248f164221464b257b5a44c6ed8f,pablodanswer,2024-11-06,fixed query history
bugfix/pagination,c6d35a8ad6be86c28ba8d3645d171d22390cc9fa,pablodanswer,2024-11-06,update side
bugfix/pagination,a5641e5a5e001dc3a4740bfcdd53c9fafb64c20a,pablodanswer,2024-11-06,fix pagination
bugfix/pruning,c27308c812f536a5e7410a73b0940f63330fb3fb,pablodanswer,2024-10-30,clarity
calendar_clarity,7edb205a6837d0328062ecbb9a9318dd6e27f9d5,pablodanswer,2024-11-22,minor calendar cleanup
callout_clarity,a8787b7be8e66d06edeaa997390ca118d1abaaac,pablodanswer,2024-11-04,k
callout_clarity,585e6b7b2fec35e17f91d55354c48631cb773ca7,pablodanswer,2024-11-04,k
callout_clarity,bdbfb62946b644ddf011a2e03a1a9b2158899f36,pablodanswer,2024-11-04,ensure props aligned
cascade_search,9c975d829d0b67d245da18e905781c22578f413f,pablodanswer,2024-10-30,minor foreign key update
clean-jira-pr,1eec84a6693add96e571eca96cf181bd32ab42f4,hagen-danswer,2024-11-20,cleanup
clean-jira-pr,658951f66dfe2cb97e20f590f71f46bcb8b1f1ef,hagen-danswer,2024-11-20,more cleanup of Jira connector
clean-jira-pr,da153ef5179592cfa11f9ce271c187739e242432,hagen-danswer,2024-11-20,fixed testing
clean-jira-pr,82118e0837d486e8d66fb7eb26d523c4fa79f8a2,hagen-danswer,2024-11-20,Added Slim connector for Jira
cloud_auth,bcce7733aa5bb2f3af2842d8e9938af6c5597c9c,pablodanswer,2024-11-11,typing
cloud_auth,eeeb84c66bf1d5aefd16ad20f9727a61b2ddc5f3,pablodanswer,2024-11-11,minor modification to be best practice
cloud_auth,a7b13762264b67ac720db21552c3a6c0f42e7c9d,pablodanswer,2024-11-11,k
cloud_auth,1c020d11c4d4257732a7fca17eecbde979e42804,pablodanswer,2024-11-11,minor clarity
cloud_auth,cb6fad26b8ec9f77a7bc82a94da8e6748bbc20f0,pablodanswer,2024-11-11,cloud auth referral source
cohere,444ad36c0801810fadfcc4a0c1f355004f59e317,pablodanswer,2024-11-13,config
cohere,227faf87c690ef9b30fbe79b1582ad36a4ec95b2,pablodanswer,2024-11-11,update config
cohere,1bf33a6b7ae5fc84a779c3c6d9d8c514523b5af9,pablodanswer,2024-11-11,ensure we properly expose name(space) for slackbot
cohere,15bd1d0ca6461ba7a9a1d2f468aea5f981e8750e,pablodanswer,2024-11-11,update configs
cohere,ce48d189aa6f9f83a6a62b353ea04bd16659d0e2,pablodanswer,2024-11-11,update
cohere,43b82e50cfdf9a1a260bde312a7e7e4f2929425b,pablodanswer,2024-11-11,update
cohere,1d06787e1d5734c25e703ba4f4b2d7df6c8bac01,pablodanswer,2024-11-11,minor improvement
cohere,8386d30f9230565136d2133b7c5cbcb623980761,pablodanswer,2024-11-11,finalize
cohere,374e51221881fcd722876efa9f53080342f3dcbd,pablodanswer,2024-11-10,add cohere default
cohere_default,8f67dc310fa1177430b8a47cfa685b4de4af105c,pablodanswer,2024-11-11,update
cohere_default,ad7d18968075a932a4539ac37d5432fa99fe99f4,pablodanswer,2024-11-11,minor improvement
cohere_default,72730a5ba3cef93523bfba9ee63994e5a1c0d63f,pablodanswer,2024-11-11,finalize
cohere_default,df8bd6daf46c1fce951efb50aaeff5e7cbc4b74a,pablodanswer,2024-11-10,add cohere default
cohere_default,6b78ab0a99bb5727df35c1dfc23c5e39008211ae,pablodanswer,2024-11-11,Cleaner EE fallback for no op (#3106)
cohere_default,e97bf1d4e28bcbf32080c3a339d0e2ac3d6d0253,Chris Weaver,2024-11-11,New assistants api (#3097)
cohere_default,293dbfb8eb7b3ac4d2878b7a72068b829b9e3469,rkuo-danswer,2024-11-09,re-enable helm (#3053)
cohere_default,f4a61202a7b6de8a011d67896b16e14f94eb981a,pablodanswer,2024-11-09,Silence auth logs (#3098)
cohere_default,53f9d94ceb7a6a8da2a0c2d94fee6971adb29bbf,pablodanswer,2024-11-11,revert
cohere_default,5058d898b8532881c517e14c22ca5c32784288fe,pablodanswer,2024-11-11,update some configs
cohere_default,bc7de4ec1b9832059426ed74f2755c9548852459,pablodanswer,2024-11-11,moderate slackbot switch
cohere_default,3ad98078f5205c2df5a3ea96cc165b982256a975,pablodanswer,2024-11-10,finalized keda
cohere_default,0fb12b42f10bae3d8633717f763fa42271349442,pablodanswer,2024-11-10,minor update
cohere_default,158329a3cc659d666328dac36bac7c5ffa87e084,pablodanswer,2024-11-10,finalize slackbot improvements
cohere_default,7f1a50823baf0f5bbab89587e7df6f03fe552e27,pablodanswer,2024-11-10,fix typing
cohere_default,0e76bcef454e0c09cb83ce91834730fdd084d930,pablodanswer,2024-11-10,add improved cloud configuration
csv_limits,45be7156c52d3b32799d67139998de7892c3490e,pablodanswer,2024-11-11,minor enforcement of CSV length for internal processing
custom_llm_display_fix,01efa818bcc82eef92457cbe4acd6c3c2fab60f0,pablodanswer,2024-11-21,Revert "clean horizontal scrollbar"
custom_llm_display_fix,dec279a9602825243ed7df4b7a5592ccd267bddd,pablodanswer,2024-11-21,update migration
custom_llm_display_fix,4b03c0e6e24b36725f4501edb81f46dc2812ff4f,pablodanswer,2024-11-21,k
custom_llm_display_fix,17eb0d3086b6249c806f51a0a45c78c927249bcd,pablodanswer,2024-11-21,ensure proper migration
custom_llm_display_fix,0f638229f56966e480d3479de5f9a3108750afc8,pablodanswer,2024-11-20,provider fix
custom_llm_display_fix,fa592a1b7a69897110a928a222b19eaef3b7267a,pablodanswer,2024-11-21,clean horizontal scrollbar
danswer_authorization_header,856c2debd98187b28e341940dafeb97eed81cad9,pablodanswer,2024-10-29,add danswer api key header
default_keys,4907d2271950fb2f45c56c21e6d641b616c02ad7,pablodanswer,2024-11-03,naming
default_keys,8766502f6dd125a43ef6cc9e9a20cec1c8f3ae8a,pablodanswer,2024-11-03,add cohere as well
default_keys,589e141bc9d2ed30c467257596f346c4824934a7,pablodanswer,2024-11-03,add default api keys for cloud users
default_prompts,d1926d47b5b65aeb01c103d7c44fa5bb63e4fb1c,pablodanswer,2024-11-06,update default live assistant logic
default_prompts,f457bdb49128b010da04612f598ef0e0810dcf7c,pablodanswer,2024-11-06,update starter message
default_prompts,00adc2d0e0cd23d7c9664b68f4caa7859bdb4eeb,Yuhong Sun,2024-11-06,touchup
default_prompts,f56b139d8dbcc44248080719fa9f3c81afdf1e81,pablodanswer,2024-11-06,nit
default_prompts,09cd3c6c2792b94e7db220a921095f0af8054e0c,pablodanswer,2024-11-06,minor update to refresh
default_prompts,32a688b6277b918afd7497f483ef457b85dc9d05,pablodanswer,2024-11-06,udpate refresh logic
default_prompts,719fb914f5094f3a35095cbb8e0c75aa4f0d0c45,pablodanswer,2024-11-06,update ux + spacing
default_prompts,7c5df1cf69e8c890cc02e27b2ba2edeac9c3c22a,pablodanswer,2024-11-05,fallback to all assistants
default_prompts,8a900b732dd67215718e07273cc62c881b6786e4,pablodanswer,2024-11-03,formating nits
default_prompts,eab00d7247cf0853b6a83888ae581c63c8c59981,pablodanswer,2024-11-03,nit
default_prompts,9460009ed306a135110bc88cc6b75f3779df96d0,pablodanswer,2024-11-03,update typing
default_prompts,4f1aa7f1ff04debb39b6ea8ea79de3d01254f4a5,pablodanswer,2024-11-03,validate
default_prompts,c97b8938920b4406477f252b01a1e561b3b24f31,pablodanswer,2024-11-03,k
default_prompts,074334e20d2208f52bbf00bda76e3e79494977c2,pablodanswer,2024-11-03,update user preferences
default_prompts,85b50855c0778fb34fc32441e7c3791b905485fa,pablodanswer,2024-11-03,update persona defaults
default_schema_slack,87931b759feb1431ce96090bd390e3e28cb30208,pablodanswer,2024-11-08,adjust default postgres schema for slack listener
detailed_filters,bde4b4029af5334699e226afbd77ba0753a04797,pablodanswer,2024-11-18,update date range filter
detailed_filters,d77629fc318db896c5b9f53c45c33dfad5038e6b,pablodanswer,2024-11-05,clarity updates
detailed_filters,0038c32213681db3dab29dee2f21324743fc6d94,pablodanswer,2024-11-05,add new complicated filters
double_auth,a7173eb689100c9abd1b68aeab890a992da32cbc,pablodanswer,2024-10-27,ports
double_auth,45170a28fc8417b6f0de7ac97c643a36e4c03284,pablodanswer,2024-10-27,fix nagging double auth issue
dropdown,c29beaf403a7722e1ee638cc50c8551931f8c5d9,pablodanswer,2024-11-13,combobox
dropdown,46f84d15f8af635123557056542829a14d5fca60,pablodanswer,2024-11-13,content scroll differences
dropdown,e8c93199f24cac94b73e8ac923b43b3159af74c9,pablodanswer,2024-11-13,minor dropdown fix
fallback_context,3734e683e1719d9f6abe9e80e475a4c2c275cdaf,pablodanswer,2024-11-07,ensure proper attribution
fallback_context,886e8c7b6e30328c1d95277f22dde48af2cb1a99,pablodanswer,2024-11-07,update comments
fallback_context,4916d66df0ec3d348caafe6c40c5e16fb28381b1,pablodanswer,2024-11-07,clearer
fallback_context,6ae512fc4e909a52e90c548f9674b60d536bdc54,pablodanswer,2024-11-06,update typing
fallback_context,159c8ee22df75036d3db59c292fa13632982b427,pablodanswer,2024-11-06,add sentinel value
feat/cert_clarity,35307d4f384039ef0df8f979e34912ab1cd4e201,pablodanswer,2024-10-30,first pass
feat/cert_clarity,e6b9ebc198973a84dc9412302e6b98a24b0a2ce3,pablodanswer,2024-10-29,ensure functionality
feat/cert_mount,a32e34b5571d60a4b8b8a1d62328b9a77fb0ad27,pablodanswer,2024-10-30,simplify
feat/cert_mount,2dc7b08a9cb73164479c03dfd4b4fed162029399,pablodanswer,2024-10-30,first pass
feat/cert_mount,e6b9ebc198973a84dc9412302e6b98a24b0a2ce3,pablodanswer,2024-10-29,ensure functionality
feat/certificate,152e8c422bb9c6bf7b08221dcfe44a60d7a2de22,pablodanswer,2024-11-01,nit
feat/certificate,45498a5f51a8efa9955c18fe5cb53b2d0f41ebd3,pablodanswer,2024-10-31,k
feat/certificate,9ecf237435cd8a5b0ac60ebaca8d26840ab0abed,pablodanswer,2024-10-31,minor clean up
feat/certificate,fed2c5666cb54d3edcfe14319e3f7d7befbed78e,pablodanswer,2024-10-30,remove now unneeded COPY command
feat/certificate,56b3f2fa999db64aec3fd069b1de2bc77d00a6b6,pablodanswer,2024-10-30,simplify
feat/certificate,7d03f3aa8cb8a4ada9af8551db62364eb8e2c217,pablodanswer,2024-10-30,first pass
feat/silence_unauth_logs,d2ba35ca45ca77701075813fd64858b04c4e9eb2,pablodanswer,2024-11-09,k
feat/silence_unauth_logs,923176ef6e1e1941f8dc461d1d7b1d76f88c4e1b,pablodanswer,2024-11-09,remove unnecessary line
feat/silence_unauth_logs,888ce3e0ced3a63c57f7ec2221059d0012e772c2,pablodanswer,2024-11-09,silence auth logs
feat/tenant_posthog,35ed1d2108dd1a28cf63ba45f776d8a25b91b5d7,pablodanswer,2024-10-27,nit
feat/tenant_posthog,d1a9e0f6c4618aa4a7e5029dbbeb6179a40ff5c7,pablodanswer,2024-10-27,distinguish tenants in posthog
fix-answer-with-specified-doc-ids,5fbcc70518bd5d1be00d6595f3fc690f81c52f21,pablodanswer,2024-11-01,minor logging updates for clarity
fix-answer-with-specified-doc-ids,7db0de9505c3510a4db76e98a47d5b079056dc93,pablodanswer,2024-10-31,minor typo
fix-answer-with-specified-doc-ids,18b4a8a26331bc013b49e486e2bf82c5ce4bfe73,pablodanswer,2024-10-31,fix stop generating
fix-answer-with-specified-doc-ids,98660be16459038b438d12616bd6f00dde418b95,Weves,2024-10-31,Fix UT
fix-answer-with-specified-doc-ids,3620266bddfbf1fca309ff2fe97f72bda7462979,Weves,2024-10-31,Remove unused exception
fix-answer-with-specified-doc-ids,2132a430cc64abd869632c0f55a35bdc42b30be9,Weves,2024-10-31,Fix image generation slowness
fix-answer-with-specified-doc-ids,24e34019ce25314c5e749d38dd0895a1c3d5141e,Weves,2024-10-31,More testing
fix-answer-with-specified-doc-ids,3cd4ed5052277428dc06343f53e0e6486af26208,Weves,2024-10-31,Testing
fix-answer-with-specified-doc-ids,200bb96853d6d96a99093f6e915fe9721ab5c6b3,Weves,2024-10-31,Add quote support
fix-answer-with-specified-doc-ids,5a0c6d003607dfb9a7445a6a87df9a6062b73bc6,Weves,2024-10-02,Fix
fix-openai-tokenizer,566e4cfd0f39db0a1fbc7c7fae040bcf98482f62,pablodanswer,2024-11-08,minor updates
fix-openai-tokenizer,3b09f3e53e7a8f948cd36255fd53423d7b5827d0,pablodanswer,2024-11-07,minor organizational update
fix-openai-tokenizer,75d5e6b8b6e81c77063fd79b4cfe532366da723a,pablodanswer,2024-11-07,minor update to ensure consistency
fix-openai-tokenizer,362bb3557246e86de131c223acdf2adf17fb14e4,pablodanswer,2024-11-06,nit
fix-openai-tokenizer,6d100d81d284dc98143bb8c94c16c25d64c56633,pablodanswer,2024-11-06,clean up test embeddings
fix-openai-tokenizer,c5be5dc4c9710b684d0954a5224a75c090befe94,Yuhong Sun,2024-11-05,k
fix_missing_json,1f6cc578c425f8bbe3b320f65f191f09c8fcfa0b,pablodanswer,2024-11-20,k
fix_missing_json,d95b7d6695ba087f0b9da9bdf245f7c34e503499,pablodanswer,2024-11-20,k
fix_missing_json,b75d4af102739a2b9e3ec2dff301f4affd08b3e5,pablodanswer,2024-11-20,remove logs
fix_missing_json,559d9ed6d4fd27de8941a104c9c83322a75abea6,pablodanswer,2024-11-20,k
fix_missing_json,9c900d658979341ce0d8c3c2eb87e7cfafd8ccf9,pablodanswer,2024-11-20,initial steps
formatting_niceties,e2b47fa84c828e1c9f6ab0dd510e2eb83faeb877,pablodanswer,2024-11-20,update styling
formatting_niceties,e4916209d6c9f4ed5765d7ae20f77903ffd93e9b,pablodanswer,2024-11-20,search bar formatting
graceful_failure,03245a4366adeb1668a337b37d070d09922f5531,pablodanswer,2024-10-28,fail gracefully on provider fetch
gtm,acff050f6b2bec0368571e0936f9342b7bcd3919,pablodanswer,2024-11-20,update github workflow
gtm,b96260442d02c9298ed110ba97f5e9eff1ed9100,pablodanswer,2024-11-20,add gtm for cloud build
gtm_v2,4f96ddf9e69923ef1209c5586c73eb40b0418aaa,pablodanswer,2024-11-21,quick fix
horizontal_scrollbar,fa82e8c74cac273563badadec0c04176575ffbbb,pablodanswer,2024-11-21,account for additional edge case
horizontal_scrollbar,fa592a1b7a69897110a928a222b19eaef3b7267a,pablodanswer,2024-11-21,clean horizontal scrollbar
improved_cert,3b19c075ad6e8930d785943b24e46b2c08555c3a,pablodanswer,2024-11-07,minor improvements
improved_cloud,379d569c61801f0c093b7474f888392aa2cb1249,pablodanswer,2024-11-11,include reset engine!
improved_cloud,53f9d94ceb7a6a8da2a0c2d94fee6971adb29bbf,pablodanswer,2024-11-11,revert
improved_cloud,5058d898b8532881c517e14c22ca5c32784288fe,pablodanswer,2024-11-11,update some configs
improved_cloud,bc7de4ec1b9832059426ed74f2755c9548852459,pablodanswer,2024-11-11,moderate slackbot switch
improved_cloud,3ad98078f5205c2df5a3ea96cc165b982256a975,pablodanswer,2024-11-10,finalized keda
improved_cloud,0fb12b42f10bae3d8633717f763fa42271349442,pablodanswer,2024-11-10,minor update
improved_cloud,158329a3cc659d666328dac36bac7c5ffa87e084,pablodanswer,2024-11-10,finalize slackbot improvements
improved_cloud,7f1a50823baf0f5bbab89587e7df6f03fe552e27,pablodanswer,2024-11-10,fix typing
improved_cloud,0e76bcef454e0c09cb83ce91834730fdd084d930,pablodanswer,2024-11-10,add improved cloud configuration
indent,95ded1611c7d2199438b863c54f327eba632a5b0,pablodanswer,2024-10-27,add indent to scan_iter
indexing_improvements,ff8e5612c9cd67a642314632658f5a55814f7c5e,pablodanswer,2024-11-05,minor
individual_deployments,fe83d549a356d802ee1e693c8739db7563ed5ddc,pablodanswer,2024-11-02,add k8s configs
individual_deployments,0e42bb64579328d18ff01049a7aaa2a0b49be142,pablodanswer,2024-10-31,remove unecessary locks
individual_deployments,41ec9b23309a3bbfe598018832fbf5d3fe91c5e1,pablodanswer,2024-10-31,minor
individual_deployments,9e4e848b98f35056dcf3df6f0815651e9fe56eba,pablodanswer,2024-10-30,initial removal of locks!
individual_deployments,1407652e3b5825fae7a90a0d5818ef67ec44f50d,pablodanswer,2024-10-30,nit
individual_deployments,2758ff7efd4dd47e891ef77c05985d6407e4cbd7,pablodanswer,2024-10-30,reorg
individual_deployments,0718d5740b714a0222eb2520c6c2f0e70c095aa1,pablodanswer,2024-10-30,validate
individual_deployments,922f3487fbd7585ce6a7251ff0644cbeca921133,pablodanswer,2024-10-30,add validated + reformatted dynamic beat acquisition
json_account,f4b3f8356a5911cb4a0610773b824bc6e6eb8c73,pablodanswer,2024-11-14,fix single quote block in llm answer
k8s_jobs,7124ce0b9a56f0b5dc45a733fe95cd581f9894a4,pablodanswer,2024-11-02,improve workers
k8s_jobs,10ab08420479ab056d807cbf0942c67a1dd6e7c7,pablodanswer,2024-11-02,improved timeouts + worker configs
k8s_jobs,9bc478fa1b7f1418fadfbd067383d67b417472aa,pablodanswer,2024-11-02,k
k8s_jobs,930e392d69ecd1058a73c0dfb0e2e021232921fc,pablodanswer,2024-11-02,update config
k8s_jobs,6d14ceeadf958cd1e7600b667b69ce0f3bf86830,pablodanswer,2024-11-02,k
k8s_jobs,efdf95eb232870f83677b2b424ffaa117463649a,pablodanswer,2024-11-02,add k8s configs
k8s_jobs,f687d3987cd9514f9fe587e563729ce27b8ff224,pablodanswer,2024-11-02,k
k8s_jobs,af4c9361a926867a992239daa283900300d7247e,pablodanswer,2024-11-02,nit
k8s_jobs,f74366bbd8699f9987ed8229e3368a5d7be71a53,pablodanswer,2024-11-01,update
k8s_jobs,734fcdca98aa5eeaa99d9936fa8db716eda93ad7,pablodanswer,2024-10-31,remove unecessary locks
k8s_jobs,dbc44315ad3cbf79509bd14a4025c2ecc4a6f86e,pablodanswer,2024-10-31,minor
k8s_jobs,d80049262406a0c30e9ad0fc647bddb23cbfbad9,pablodanswer,2024-10-30,initial removal of locks!
k8s_jobs,5646675ae094f39f3e7ead937cbcfd3fb7c7f24f,pablodanswer,2024-10-30,add validated + reformatted dynamic beat acquisition
k8s_jobs,01bdcad4f038c5d4c642ca14680593988c28bf96,pablodanswer,2024-11-02,ensure versioned apps capture
k8s_jobs,0994ac396612855ecac9afbce6ef9b8bd7e54742,pablodanswer,2024-11-01,typing
k8s_jobs,8ff8a88d5b6ad2d02a653f959c39cfeeda9ef54c,pablodanswer,2024-11-01,update
k8s_jobs,e11aee38ba5946a1453693fdc3bbd20d703d9e10,pablodanswer,2024-11-01,address comments
k8s_jobs,53c6d16c3cdc7ffb3eebd3e7b73474025ef6cafc,pablodanswer,2024-10-30,nit
k8s_jobs,a85b2a9745587c4e783e040496dee1ac83e492c9,pablodanswer,2024-10-30,reorg
k8s_jobs,4ace16c905b47b97990de0ab0ef3c029870f9be0,pablodanswer,2024-10-30,validate
k8s_jobs,89293ecc730387a864be6efc01230fedffdc7b82,pablodanswer,2024-10-30,add validated + reformatted dynamic beat acquisition
lenient_counting,4836a74e1e2789051b6d1454b7f2bd22daced61a,pablodanswer,2024-11-13,nit
lenient_counting,f7514011ef4cf62d80ab9afe170320b2e4135da2,pablodanswer,2024-11-13,lenient counting
max_height_scroll,c354912c704b0aa31737bfd41d4bd8f0c7d85769,pablodanswer,2024-11-20,ensure everythigng has a default max height in selectorformfield
migrate_tenant_upgrades_to_data_plane,572298aa8920d51320db5fff518f66fee6e42117,pablodanswer,2024-11-05,nit
migrate_tenant_upgrades_to_data_plane,40b55197ac8336e6ef081074ea65fc4b0cbeb27c,pablodanswer,2024-11-05,minor config update
migrate_tenant_upgrades_to_data_plane,4b9d868ecb78dedd3816ae7bc28e8f856881c6f4,pablodanswer,2024-11-04,minor pydantic update
migrate_tenant_upgrades_to_data_plane,1295c3a38e827024d89ba56fe3c846fcbe204bc0,pablodanswer,2024-11-04,ensure proper conditional
migrate_tenant_upgrades_to_data_plane,f2ac56d80213125f1f5d465b21a6a2e4b47566a2,pablodanswer,2024-11-04,improve import logic
migrate_tenant_upgrades_to_data_plane,fcdb3891bf196ef7e1f10e9d7a0a77512c752710,pablodanswer,2024-11-04,update provisioning
migrate_tenant_upgrades_to_data_plane,9a5d60c9a3df0891a769615e540af8332c0b416c,pablodanswer,2024-11-04,simplify
migrate_tenant_upgrades_to_data_plane,b512f35521bcb8c8ee9e748dae493028093f05bb,pablodanswer,2024-11-04,k
migrate_tenant_upgrades_to_data_plane,b872b7e778f7e0bd92e6eac9317e74e3157c12e1,pablodanswer,2024-11-04,minor clean up
migrate_tenant_upgrades_to_data_plane,b7847d16686419fe024d361cfaf2212a4decc397,pablodanswer,2024-11-04,minor cleanup
migrate_tenant_upgrades_to_data_plane,2f03ddb1bedada32576cb52bfa2cf36074fbb9fe,pablodanswer,2024-11-04,functional but scrappy
migrate_tenant_upgrades_to_data_plane,dc001a3b7b48df659bc64c2486ceded5eea3ed0f,pablodanswer,2024-11-04,add provisioning on data plane
minor,c7d58616b5943768e2e581751f4ede7a4f3292da,pablodanswer,2024-11-22,k
minor,351ee543a0773ecb6acf99f3888dd648091d7f85,pablodanswer,2024-11-22,k
minor_fixes,ea58c3259505aaa53c66343243667959ca79ecb8,pablodanswer,2024-11-05,minor changes
minor_fixes,cbf577cf4623c8352664058d21b1a80ae7ab4299,pablodanswer,2024-11-05,nit
minor_fixes,20d2301a7e594ad803c0486d63d056653c5b8c83,pablodanswer,2024-11-05,minor config update
minor_fixes,fdf9601375464f3e7f49d4472dbc3eeacd1eab8f,pablodanswer,2024-11-05,form
minor_fixes,7421328695641e943c7083639483fa36e4e9cfdb,pablodanswer,2024-11-04,minor pydantic update
minor_fixes,d600d63876e7100894c47a7dc9120b689a55521f,pablodanswer,2024-11-04,ensure proper conditional
minor_fixes,e7cae46867207789088df6611dbafc78650c8ace,pablodanswer,2024-11-04,improve import logic
minor_fixes,b0894320f99fea9cb13a94a5fbb5a1e9523ef460,pablodanswer,2024-11-04,update provisioning
minor_fixes,e623b494568d0bcc74937628984b6cc574aed9a6,pablodanswer,2024-11-04,simplify
minor_fixes,99d91bd658e812996bcc03d0be29e57277b8fb67,pablodanswer,2024-11-04,k
minor_fixes,77c180be0f8e91b9f997b90f631e18d41ba8fde2,pablodanswer,2024-11-04,minor clean up
minor_fixes,baaed72297ef248dc5dc422f0e5adcdff7599416,pablodanswer,2024-11-04,minor cleanup
minor_fixes,ab7fa7f6d0c3f1a59d97b5450262cb4ef6f8481d,pablodanswer,2024-11-04,functional but scrappy
minor_fixes,acf3ede8b4baf044391176aacd3bba6f80bb4b3f,pablodanswer,2024-11-04,add provisioning on data plane
minor_nits,bfcd418ecd9523376c605263565a9714ceeb3a18,pablodanswer,2024-11-09,k
minor_nits,5dfcb94964f977bb603865858e1e6aa6582454fd,pablodanswer,2024-11-09,update colors
minor_nits,a287cd94cd8090fefee7c1d20cc494b894bf39c1,pablodanswer,2024-11-09,nit
minor_nits,2d9586b059cfb1cb8e1f6c0fccc696af6ba8873d,pablodanswer,2024-11-08,nit
minor_nits,5dcc3692a7748ed20d49adef5f7672d45f600a4a,pablodanswer,2024-11-08,moderate component fixes
minor_slack_fixes,425a678a5350ad5716c3efd6a60c78f6a9c2738e,pablodanswer,2024-11-20,reset time
minor_slack_fixes,14adbcb497365f9e93c21aeb0476cffc72cab643,pablodanswer,2024-11-20,update slack redirect + token missing check
misc_color_cleanup,83c8f04e5a183a289f76b809d9aabdd4ea0e664b,pablodanswer,2024-11-03,formatting
misc_color_cleanup,334ff6fb5ab2e450e1e0709be16870b1ed07dae3,pablodanswer,2024-11-03,ensure tool call renders
misc_color_cleanup,94262264e768cdc28ffe4fc31b2947c0cf3774a3,pablodanswer,2024-11-03,ensure tailwind config evaluates properly + update textarea -> input
misc_color_cleanup,40cb9e9cdb4561eac777ede08ace88219d12ad96,pablodanswer,2024-11-02,additional minor nits
misc_color_cleanup,2e81962a74567c0c510d911a22aee385c56b3207,pablodanswer,2024-11-02,nit
misc_color_cleanup,76ca7eb3f2cf2408fee330f540987e6238cd632e,pablodanswer,2024-11-01,nit
misc_color_cleanup,7269b7a4aa986dbba654be4b375bea1d9334fe01,pablodanswer,2024-11-01,additional nits
misc_color_cleanup,4726a10fd7503882554d1dfaf1541657ffb45a04,pablodanswer,2024-11-01,misc color clean up
mobile_scroll,eca41cc514446a2c0b2c756add3164462fb2c49d,pablodanswer,2024-11-11,improved mobile scroll
modals,8093ceeb45088c813fbb117302738b3d225c2f8b,pablodanswer,2024-10-28,formatting
modals,3d0ace1e450ac6d7271ddedc2ec122a2647be7df,pablodanswer,2024-10-28,minor nits
modals,553aba79dc41b928c163a83481b202ad56805aae,pablodanswer,2024-10-28,update based on feedback
modals,da038b317a0b5185ccc32297b01fcaa97ffbb429,pablodanswer,2024-09-21,remove logs
modals,6769dc373faf7576c2d0ac212735b88eae755293,pablodanswer,2024-09-21,minor udpate to ui
modals,b35e05315c4c506da87524fe788a9cf5aacb7375,pablodanswer,2024-09-20,use display name + minor updates to models
modals,7cfd3d2d442255616ec5c477dc4b3eb0b2cad1ed,pablodanswer,2024-09-20,cleaner cards
modals,b2aa1c864b20274386a1bbe699a3ef7e094bd858,pablodanswer,2024-09-20,slightly cleaner animation
modals,d2f8177b8f1b9be8eebce520204018e6be59b03c,pablodanswer,2024-09-20,cleaner initial chat screen
more_theming,1744d29bd6f6740fb20bbbf8b5651cd60edbf127,pablodanswer,2024-11-21,k
more_theming,fa592a1b7a69897110a928a222b19eaef3b7267a,pablodanswer,2024-11-21,clean horizontal scrollbar
multi_api_key,67e347a47fd2e4aa9efe7b17c7b177166c893d10,pablodanswer,2024-10-31,clean
multi_api_key,3fb6e9bef96da888fa366a16f102358eb8e990e0,pablodanswer,2024-10-31,nit
multi_api_key,c4514fe68f58a03da0c3c3efae78ad23e2eb88c9,pablodanswer,2024-10-30,organization
multi_api_key,5b19209129542b885e123a51ce3da93b741d49d2,pablodanswer,2024-10-30,basic multi tenant api key
new_seq_tool_calling,59e9a33b30ece8d41340787d9d9a82e9a07a8f24,pablodanswer,2024-11-18,k
new_seq_tool_calling,6e60437c565a185475c715efbbef6caca1cfc2fb,pablodanswer,2024-11-17,quick nits
new_seq_tool_calling,9cde51f1a2ca1df2f753c9b6d7910b8f9623d8a4,pablodanswer,2024-11-07,scalable but not formalized
new_seq_tool_calling,8b8952f117e4d05bb484bc5dec1c12d4fbbafcca,pablodanswer,2024-11-07,k
new_seq_tool_calling,dc01eea610817ab821ded6e5ce584f81fe1ba065,pablodanswer,2024-11-07,add logs
new_seq_tool_calling,c89d8318c093c860037a839494876eff649f5d26,pablodanswer,2024-11-07,add image prompt citations
new_seq_tool_calling,3f2d6557dcb5964dbb9ed88ade743f74a4285411,pablodanswer,2024-11-07,functioning albeit janky
new_seq_tool_calling,b3818877afc406f9500e7bef1f2b7e233faf76fa,pablodanswer,2024-11-07,initial functioning update
new_theming_updates,102c264fd06232bbc4c7a23615add5cf7c0618be,pablodanswer,2024-11-21,minor updates
new_theming_updates,1744d29bd6f6740fb20bbbf8b5651cd60edbf127,pablodanswer,2024-11-21,k
new_theming_updates,fa592a1b7a69897110a928a222b19eaef3b7267a,pablodanswer,2024-11-21,clean horizontal scrollbar
nit,c68602f456c66279e760bd25067cfdfe03841f8a,pablodanswer,2024-11-10,specifically apply flex none to in progress!
nit_mx,c5147db1ae5387e8fd5672779689485142fb1b1d,pablodanswer,2024-11-20,formatting
nit_mx,3a6a74569544ee7d74c6b62a5a56730331838095,pablodanswer,2024-11-20,ensure margin properly applied
nit_redis,85843632c5fe61a425d425feef6480c639471af7,pablodanswer,2024-10-28,add srem and sadd to tenant wrapper
no_locks!,f687d3987cd9514f9fe587e563729ce27b8ff224,pablodanswer,2024-11-02,k
no_locks!,af4c9361a926867a992239daa283900300d7247e,pablodanswer,2024-11-02,nit
no_locks!,f74366bbd8699f9987ed8229e3368a5d7be71a53,pablodanswer,2024-11-01,update
no_locks!,734fcdca98aa5eeaa99d9936fa8db716eda93ad7,pablodanswer,2024-10-31,remove unecessary locks
no_locks!,dbc44315ad3cbf79509bd14a4025c2ecc4a6f86e,pablodanswer,2024-10-31,minor
no_locks!,d80049262406a0c30e9ad0fc647bddb23cbfbad9,pablodanswer,2024-10-30,initial removal of locks!
no_locks!,5646675ae094f39f3e7ead937cbcfd3fb7c7f24f,pablodanswer,2024-10-30,add validated + reformatted dynamic beat acquisition
no_locks!,01bdcad4f038c5d4c642ca14680593988c28bf96,pablodanswer,2024-11-02,ensure versioned apps capture
no_locks!,0994ac396612855ecac9afbce6ef9b8bd7e54742,pablodanswer,2024-11-01,typing
no_locks!,8ff8a88d5b6ad2d02a653f959c39cfeeda9ef54c,pablodanswer,2024-11-01,update
no_locks!,e11aee38ba5946a1453693fdc3bbd20d703d9e10,pablodanswer,2024-11-01,address comments
no_locks!,53c6d16c3cdc7ffb3eebd3e7b73474025ef6cafc,pablodanswer,2024-10-30,nit
no_locks!,a85b2a9745587c4e783e040496dee1ac83e492c9,pablodanswer,2024-10-30,reorg
no_locks!,4ace16c905b47b97990de0ab0ef3c029870f9be0,pablodanswer,2024-10-30,validate
no_locks!,89293ecc730387a864be6efc01230fedffdc7b82,pablodanswer,2024-10-30,add validated + reformatted dynamic beat acquisition
pinned,233713cde3516c05b857f878ff452c7714a91c48,pablodanswer,2024-11-20,hide animations
pinned,c0b17b4c51376d99685976430b9c4153c35e2ffa,Yuhong Sun,2024-11-20,k
pinned,15f30b00507e337ec9ee85624fc0cc574eb7b952,Yuhong Sun,2024-11-20,k
pinned,39d9df9b1b58dd2621bd575fa6c7ec720864d3bb,pablodanswer,2024-11-18,k
point_to_proper_docker_repository,9893301f113691111669bc2ab05a7c3abf19ae32,pablodanswer,2024-11-09,raise exits
point_to_proper_docker_repository,2344327112c01db8b2226dea0e02b2a8aa9ca875,pablodanswer,2024-11-09,ensure .github changes are passed
point_to_proper_docker_repository,caa2966ebc607fb8d2899ee78573ed2454983efb,pablodanswer,2024-11-09,robustify cloud deployment + include initial KEDA configuration
prev_doc,44f82fa928b79e7f51b41a0ee67cc93067880be3,pablodanswer,2024-11-22,k
prev_doc,2c7c9fbc130b8f0c717fa9fa4e5d2f6073f92be5,pablodanswer,2024-11-22,revert to previous doc select logic
prompting,4d8edad71ace767917a612dc628e266bd267d7d5,pablodanswer,2024-11-17,k
prompting,b1265619a27a849f2fbb9ba85b440a8b1b698d7d,pablodanswer,2024-11-16,add proper category delineation
prompting,dfe2c305866ad414143ce479b0601f8a61e615ea,pablodanswer,2024-11-05,post rebase cleanup
prompting,236c19230f5165e24ef557db53d863953faa714a,pablodanswer,2024-11-05,add auto-generated starter messages
proper_tenant_reset,4376bf773a81278ab92846673f193207be96052a,pablodanswer,2024-10-31,minor formatting
proper_tenant_reset,95f660db67b1327208fde82ae043511f2187452f,pablodanswer,2024-10-31,clear comment
proper_tenant_reset,1cdb5af9a1519ef8d63c94bf39256b00d4a8bdd2,pablodanswer,2024-10-31,add proper tenant reset
proper_token_default,4e0c048acba88f4c83d7c83af52bb0932234ddad,pablodanswer,2024-11-14,nit
proper_token_default,a0371a6750476fccc3b9892a7c58d72182c92507,pablodanswer,2024-11-14,minor logic update
proper_token_default,4f1c4baa80f7b747633bb3d528aed6de5b11f639,pablodanswer,2024-11-14,minor cosmetic update
proper_token_default,b6ef7e713a4eca3d65aa411604e8f67ad5efdd87,pablodanswer,2024-11-14,k
proper_token_default,66df9b6f7dae8bce61e35615d715ddefc6406614,pablodanswer,2024-11-14,improved fallback logic
proper_token_default,0473888ccdb5219cc39f275652bfeb72a420b5d9,pablodanswer,2024-11-13,silence warning
regenerate_clarity,3e232c39193b1c67bda9d732c1c2ee77ee14c721,pablodanswer,2024-10-29,minor udpate
regenerate_clarity,49e2da1c5c4fa34a8568ba0b3f08e79cd17cec93,pablodanswer,2024-10-29,add regeneration clarity
remove_ee,132802b295b805292f427039617a00e04dca2ae9,pablodanswer,2024-11-09,k
remove_ee,23883441f87ac3cd4e2ee717d2b033c3e7da9398,pablodanswer,2024-11-09,ensure callable
remove_ee,f43ed0b6b9391e66e210c5d90acf7a2409c3300b,pablodanswer,2024-11-09,finalize
remove_ee,fa42e5fa470e340e9b17fed5a3bd0e7976c6255e,pablodanswer,2024-11-08,finalize
remove_ee,625b5c52a044027b3d469286910a3cdd1c6bee02,pablodanswer,2024-11-08,update
remove_ee,239200dfc46f6cf18d7e689341b56a8baecdc0f6,pablodanswer,2024-11-08,update
remove_ee,5b70a8fa6f65d8513670c3bbbfd6cec13c76d530,pablodanswer,2024-11-08,general cleanup
remove_ee,14dfd6d29e178af9cfeb79ae20b7a846c5958966,pablodanswer,2024-11-08,move token rate limit to non-ee
remove_ee,dc4fdbb312881585fbc860b7aaff5adb9af4d8c5,pablodanswer,2024-11-08,finalize previous migration
remove_ee,cfd3d90493fad0af75569c98b6cfc9effa37b471,pablodanswer,2024-11-08,move api key to non-ee
remove_empty_directory,81e1ac918364467e3009eae376930199e3e2943f,pablodanswer,2024-10-28,remove empty directory
remove_endpoint,14f57d6475d835da6dfacc4ebd254e25618b3100,pablodanswer,2024-10-31,remove endpoint
rerender,1392f2454061914ac8c5f6302318a24064034a5b,pablodanswer,2024-11-21,k
rerender,617e6d905363cc91ca154bba0f6f2a11888b35e6,pablodanswer,2024-11-21,unused
rerender,da36e208cd53ae25a2c89a4cf0c598333898387a,pablodanswer,2024-11-21,clean
rerender,36eee45a03c3227a9b070e18a043e16fe5179cb9,pablodanswer,2024-11-21,llm provider causing re render in effect
reset_all,bde1510923d69ca0eb57340da6b59f9035e3de0a,pablodanswer,2024-11-04,ensure we reset all
search_chat_rework,931461bc8404fc51f15f0b75ae77e3a772a05989,pablodanswer,2024-11-21,v1
sequential_messages,5fbcc70518bd5d1be00d6595f3fc690f81c52f21,pablodanswer,2024-11-01,minor logging updates for clarity
sequential_messages,7db0de9505c3510a4db76e98a47d5b079056dc93,pablodanswer,2024-10-31,minor typo
sequential_messages,18b4a8a26331bc013b49e486e2bf82c5ce4bfe73,pablodanswer,2024-10-31,fix stop generating
sequential_messages,98660be16459038b438d12616bd6f00dde418b95,Weves,2024-10-31,Fix UT
sequential_messages,3620266bddfbf1fca309ff2fe97f72bda7462979,Weves,2024-10-31,Remove unused exception
sequential_messages,2132a430cc64abd869632c0f55a35bdc42b30be9,Weves,2024-10-31,Fix image generation slowness
sequential_messages,24e34019ce25314c5e749d38dd0895a1c3d5141e,Weves,2024-10-31,More testing
sequential_messages,3cd4ed5052277428dc06343f53e0e6486af26208,Weves,2024-10-31,Testing
sequential_messages,200bb96853d6d96a99093f6e915fe9721ab5c6b3,Weves,2024-10-31,Add quote support
sequential_messages,5a0c6d003607dfb9a7445a6a87df9a6062b73bc6,Weves,2024-10-02,Fix
shadcn,fe9be6669538db406a0c67959dcf4c91e8d4858b,pablodanswer,2024-10-28,button + input updates
shadcn,7cccb775c1f1385bc50131f7d548519d95ac64cd,pablodanswer,2024-10-28,initialization
sheet_update,98aa32055203d32a6d25eb1266deab6c58a176fb,pablodanswer,2024-11-21,update configuration
sheet_update,026134805a1418f32b61973f55571756ba102c09,pablodanswer,2024-11-21,finalized
sheet_update,36c1fc23d087f41db06e2680233a1ade7e65e594,pablodanswer,2024-11-21,k
sheet_update,3a4804b4b7d54fd3db576b698b5187d8dc0aa5ca,pablodanswer,2024-11-20,add multiple sheet stuff
sheet_update,5e326bcd08d019103f78da1c8a4a45ba4e401353,pablodanswer,2024-11-20,update sheet
sheet_update,d7f2a3e112c00bda2813933d673fb18080d6de6d,pablodanswer,2024-11-20,k
sheet_update,3eaf2a883a5fb52169af2ba2e0571189fb3712eb,pablodanswer,2024-11-20,quick pass
show_logs,189d62b72e0a2183ac3b25ea62eaea1b4db4366b,pablodanswer,2024-11-08,k
show_logs,89cb3b503cf219d90338110cec34d288892c27ed,pablodanswer,2024-11-08,minor updates
show_logs,cdda24f9ea4bc54f6a6c49d7848b63b2b5dacc9e,pablodanswer,2024-11-08,remove log
show_logs,6dc4ca344c927b5e9c02b28662252a4067a2f7da,pablodanswer,2024-11-08,k
show_logs,f91bac1cd90da5070247e70682e38adbe2722ce2,pablodanswer,2024-11-08,improved logging
show_logs,5e25488d0af1e1939a366fe12ab42949daaa77f1,pablodanswer,2024-11-08,add additional logs
silence_log,7400652fe70f86da3c8aab2a41f26103e395d739,pablodanswer,2024-11-20,silence small error
single_tool_call,0230920240fa46e06e1cc66fb67fa42f5caf81b3,pablodanswer,2024-11-01,finalize migration
single_tool_call,e7859e8bb4ea8409657cf0a7464724a5192e953e,pablodanswer,2024-11-01,single tool call per message
single_tool_call,fd3937179f14968b4103c634a83430f7ae9303bc,pablodanswer,2024-11-01,minor logging updates for clarity
single_tool_call,7a5a8f68a6e663d2b91badd47847193c92b523d0,pablodanswer,2024-10-31,minor typo
single_tool_call,122cd2082e4ddd4a56992f5f8c36b9853057581a,pablodanswer,2024-10-31,fix stop generating
single_tool_call,7384874e54a8ebc136b41efbe0842a327262b738,Weves,2024-10-31,Fix UT
single_tool_call,2b06789d5133029d99763037ded18766e8d04d74,Weves,2024-10-31,Remove unused exception
single_tool_call,4bdfd117370ac126e1bdc6e32f0192d59c51dd57,Weves,2024-10-31,Fix image generation slowness
single_tool_call,6d4ccc354514ff328473a1c35974521c465aa2f5,Weves,2024-10-31,More testing
single_tool_call,ef0ad8f8fce4eebc38cc9291047b84e5162572f3,Weves,2024-10-31,Testing
single_tool_call,99b076412aa3501cbff75d7521c4cedb8f793c34,Weves,2024-10-31,Add quote support
single_tool_call,499272ef25961ddb0861ee2a6ff6d978ea1e7772,Weves,2024-10-02,Fix
slack_scaling,dd958cff6b0999190c5116e0354497207231d5d6,pablodanswer,2024-10-30,minor foreign key update
super_user,0cc09c8b4d9ba0dca350a799ddc265fca38f4b90,pablodanswer,2024-11-02,nits
super_user,ec8ae2b5f4491e3de0701ba31ae3124d8f549e66,pablodanswer,2024-11-02,add super user
swap_buttons_cards,e6ce503bbbbed4d70734d11ebccc0db4994f69e0,pablodanswer,2024-11-01,nits
swap_buttons_cards,680a160b2560594c3c99d4f1e8cffc3bfea66064,pablodanswer,2024-11-01,update colors
swap_buttons_cards,748c99d655739c1bb7da0a25e2829c0d706ff810,pablodanswer,2024-10-31,clean build
swap_buttons_cards,a222b9d3e7819e9a7e525b6994248caa167c8ac1,pablodanswer,2024-10-30,list item + configuration updates
swap_buttons_cards,df38bde21a0f457fb6be4c1b66fae196ae32ec20,pablodanswer,2024-10-30,nits
swap_buttons_cards,ddb22e659d1fb4cd8f30ec952e68db683f5a746e,pablodanswer,2024-10-29,fully swapped
swap_buttons_cards,d91e54759a022acf478467b0906ee1a2867aa2ca,pablodanswer,2024-10-29,remove tremor
swap_buttons_cards,f6117b0f16581bac8fbd181e13a5dbc061c5debb,pablodanswer,2024-10-29,begin date picker + badge transfer
swap_buttons_cards,a8a73590bb24a59371c985931ac5dde96674f5b0,pablodanswer,2024-10-29,fix compiling
swap_buttons_cards,5f4f0c0ebb3f12e9de996661eb722561a048311b,pablodanswer,2024-10-29,migrate cards
swap_buttons_cards,8b8173bef0f05997c04ef9899d557d0f0a205767,pablodanswer,2024-10-29,minor updates
swap_buttons_cards,92b7fe45b1bd1ea39252cd8a4ac6a323a548f518,pablodanswer,2024-10-28,migrate badges
swap_buttons_cards,74091415c43c39080bd07c1ef9fc683ecc9742e2,pablodanswer,2024-10-28,migrate dividers + buttons
swap_buttons_cards,80f9af73d0adcb06c8228b868632bdecc362d616,pablodanswer,2024-10-28,button + input updates
swap_buttons_cards,efbeb2716536ea6b08fac40c1e074698a534ea11,pablodanswer,2024-10-28,initialization
switch-to-turbopack,09f5fea799633152f59fb9a54451d922eb4914e0,pablodanswer,2024-11-02,slight modification
switch-to-turbopack,f7ac9ae034605ac59a9c97650ebd6956d5628ed6,Weves,2024-11-02,Fix prettier
switch-to-turbopack,e42f4c98c487f671887de0c43680a659a9132753,Weves,2024-11-01,Style
switch-to-turbopack,f800017b21c2618ae51f16ef4f5d9b5e930f01fc,Weves,2024-11-01,Style
switch-to-turbopack,7f5744974644d6cbbcf41815e27f9017de76d738,Weves,2024-11-01,Fix charts
switch-to-turbopack,2b6514e75489842c8de0aae99d705e22daee9461,Weves,2024-11-01,Upgrade react
switch-to-turbopack,85d5857dbcbbf353a883abf7681c85a48dc4f724,Weves,2024-11-01,Remove override
switch-to-turbopack,7760230bf771cb6d3b0fca46b6e0bb35677ad5ee,Weves,2024-11-01,Update nextjs version
switch-to-turbopack,a3be5be8c6c2bf653de9df48e6a3dfc01144f849,Weves,2024-11-01,Remove unintended change
switch-to-turbopack,4d3fdba81ee2ccace76380b0b7318a5a5ed0ab79,Chris Weaver,2024-10-26,Upgrade to NextJS 15 + use turbopacK
temp/include_file61,20d29eb51cca799b9cc04552dd083bf202c760bc,pablodanswer,2024-11-03,temporary update
tenant_task_logger,02251aab75bad74647ba526654950b131748eb45,pablodanswer,2024-11-21,update
tenant_task_logger,805575ef183348ce55a7d8749db477422d0b30de,pablodanswer,2024-11-09,don't prevent seeding
tenant_task_logger,7146d02d553c568d99e7efd97a3b185f783a219a,pablodanswer,2024-11-06,update app base
tenant_task_logger,6c360ccc483de4ce42fc88724a55f793398a1445,pablodanswer,2024-11-05,remove logs from beat
tenant_task_logger,8773f215688e6775ebdf65bb5edda0f1e6080787,pablodanswer,2024-11-05,append
tenant_task_logger,d715c8be8a0465551e4d5670a43bf52d1d4635de,pablodanswer,2024-11-05,remove tenant id logs
tenant_task_logger,fa592a1b7a69897110a928a222b19eaef3b7267a,pablodanswer,2024-11-21,clean horizontal scrollbar
text_view,5d1a664fdc8c712aa644452b061e76b3302f714a,pablodanswer,2024-11-20,nit
text_view,b13a1d1d851b924f7b8f402894526d92712b09fa,pablodanswer,2024-11-18,k
text_view,77ab27f982af152818dcb9b4390da80113f17e72,pablodanswer,2024-11-15,update
text_view,61135ed7db5168d5517b8f11aed05e14b1aba471,pablodanswer,2024-11-14,basic log
text_view,7c13ca547fc42988ef9ca10bd4a354a0fd4473cc,pablodanswer,2024-11-14,minor testing update
text_view,46f9f0dc947da29271b16e893152402421cc1c85,pablodanswer,2024-11-14,update tests
text_view,756b56d2cd63b7792de532d05a03bbaac2c80960,pablodanswer,2024-11-13,wip tests
text_view,180c176136b46424021d4f0ca84052afae4946dd,pablodanswer,2024-11-13,minor docker file update
text_view,fa8a92875bc8c3637c7aa0eac937bc3a0818e66a,pablodanswer,2024-11-13,remove left over string
text_view,c6907ebebe9391140e272ebe0e89b6b6d207f8f5,pablodanswer,2024-11-13,finalize
text_view,709b87d56d0e770c1ee6240cfbd4bc76743eb521,pablodanswer,2024-11-13,finalized
text_view,b8df6e22d2d15a099aea2bc3b2e7d4c67b446ae8,pablodanswer,2024-11-13,k
text_view,ba977e3f5dae439f4ec6b62edc717ada5f49e1f5,pablodanswer,2024-11-12,minor typing update
text_view,ed5ed616efd0dceee374b2de5bec69adb4553a62,pablodanswer,2024-11-12,typing
text_view,ff4f3bb211485274250eed299247631cc2f1d9a3,pablodanswer,2024-11-12,update text view
text_view,e38fd6f7c76f3133fc407d99428a7286328843b6,pablodanswer,2024-11-12,update text view
text_view,c76602b7be9968643726f2a8818d27d290d400dd,pablodanswer,2024-11-12,k
text_view,62abe2511b8975ce050c4712a095372bf1d1ddc7,pablodanswer,2024-11-11,initial display
theming,e1eff26216e42897db4e49a02cb7bb13e9425422,pablodanswer,2024-11-18,nit
theming,4b1d428f71fd8993c516f35d8c4fa502c40baaae,pablodanswer,2024-11-18,add additional theming options
theming_updated,f95813e381acf7590e094f774c0811f375cde670,pablodanswer,2024-11-21,update neutral
theming_updated,804887fd311a783306f160591bc273866388a9f0,pablodanswer,2024-11-21,update
theming_updates,c6556857cceacce98b8a90f9a42c4ddfac3b7884,pablodanswer,2024-10-30,update our tailwind config
theming_updates,592394caeae4414bd87108ef9f8de65b77226e37,pablodanswer,2024-10-30,enforce colors
theming_updates,8f2b0eb72d55347091339c9ba39e2c12f238a776,pablodanswer,2024-10-30,remove pr
theming_updates,f92f8e7a73c238fc44ccca746d6fb597c5ad5cb8,pablodanswer,2024-10-30,nit
theming_updates,5c6fc34d6316e033b5e258b9a469fa1bd8ea3167,pablodanswer,2024-10-30,add comments
theming_updates,3472fb27371f59b454a4b27a699e2160b801ab46,pablodanswer,2024-10-30,ensure tailwind theme updated
theming_updates,8210c8930b005cfe6248618373a708b150e412f2,pablodanswer,2024-10-29,naming
theming_updates,e6b9ebc198973a84dc9412302e6b98a24b0a2ce3,pablodanswer,2024-10-29,ensure functionality
tool_call_per_message,bd0259c05ff9364a99670582ff1cd804fc1b12b7,pablodanswer,2024-11-03,validated
tool_call_per_message,381aadd24e897e28215964404048c84d7aeaa1df,pablodanswer,2024-11-03,remove print
tool_call_per_message,90c711322dc19a6c4092a60beb5905ded89079d6,pablodanswer,2024-11-01,k
tool_call_per_message,20a36e5f46755a55c022dd422c4d31e9abc24d46,pablodanswer,2024-11-01,validate simplify
tool_call_per_message,9b3a008ef42d31227290f0ddfbc5b37daa82f360,pablodanswer,2024-11-01,minor image generation fix
tool_call_per_message,a958903bd74c78457ef487debfb6084cd8ab6b2b,pablodanswer,2024-11-01,finalize migration
tool_call_per_message,4ea0aceca97734ddca8d1f60da930668e0561694,pablodanswer,2024-11-01,single tool call per message
tool_csv_image,8015e84531263cda72d7ca281ed0f790c0d0bb3f,pablodanswer,2024-11-03,add multiple formats to tools
tool_search,04be3fcbf7e128136f38760845f5d39197c94a5e,pablodanswer,2024-11-15,k
tool_search,601d497ed7acd05709384098a3132e1240d32932,pablodanswer,2024-11-15,add tests
tool_search,4de18b2e23222fc2c628982db8659d17c136adfa,pablodanswer,2024-11-07,update
tool_search,30e6e9b6dc8bebcc98fcf430fbd77af62faffd1a,pablodanswer,2024-11-07,somewhat cleaner
tool_search,ac64d4aa71cca26898a0eeb8d849a15a60945e69,pablodanswer,2024-11-06,remove logs
tool_search,1fd949ccfc6984904020ee50a845b119acd1f0be,pablodanswer,2024-11-06,finish functionality
tool_search,1253eb27f62c81780def9e37e5498b42321d6f49,pablodanswer,2024-11-06,k
tool_search,7dafd72d8c37ab505b35596fb3630c738b58688b,pablodanswer,2024-11-06,first pass
tooltips,5fe453e18565a9c2f3b8f20520fb7868b5e08675,pablodanswer,2024-11-04,nit: fix delay duration
tooltips,4bb9c461ef4c81543690f51c29c6c39949d3e882,pablodanswer,2024-11-04,clean up tooltips
typo,4f2f4e6534605287678fa046524a3ffd705e8ab4,pablodanswer,2024-11-18,(minor) typo
uf_theming,fe49e35ca476c494d0a9f36eb6cfea3e99ed0427,pablodanswer,2024-11-22,ensure added
uf_theming,804887fd311a783306f160591bc273866388a9f0,pablodanswer,2024-11-21,update
undo_temporary_fix,59fcdbaf5a096cc1bcd4599a1c0d7a256ca744f0,pablodanswer,2024-11-03,nit
undo_temporary_fix,c3118f91b9958e736704277b5d3f98a10e3943c2,pablodanswer,2024-11-03,Revert temporary modifications
update-confluence-behaviour,cc769b8bb9b47da9c955e70174bd498fb0b3231a,hagen-danswer,2024-11-15,has issue with boolean form
update-confluence-behaviour,e44646dd799c7f95db1df9616e83241344ef0035,hagen-danswer,2024-11-15,fixed mnore treljsertjoslijt
update-confluence-behaviour,b623630934171868c815b62e30be055fc6f06ec8,hagen-danswer,2024-11-15,whoops!
update-confluence-behaviour,790db4f8ea6bcb02df170d2892c57ccb50aaa119,hagen-danswer,2024-11-15,so good
update-confluence-behaviour,ccd6b8f38113b70ba3acf3beda199fa8ee6e3bab,hagen-danswer,2024-11-15,added key
update-confluence-behaviour,4beffa4be3ed029fe23c95ce08c5d18c9314e54e,hagen-danswer,2024-11-15,details!
update-confluence-behaviour,dacb1870dc98c986e1105fc797603957a2de4b5a,hagen-danswer,2024-11-15,copy change
update-confluence-behaviour,008d6cac8e86429884bd38bbe21a23dac96be123,hagen-danswer,2024-11-15,frontend cleanup
update-confluence-behaviour,f3310fbc73c45773dc19c2ef8da9f2fe4336b559,hagen-danswer,2024-11-15,fixed service account tests
update-confluence-behaviour,c7819a2c5735f812e150718a3620e4bf90ca6a1e,hagen-danswer,2024-11-15,fixed oauth admin tests
update-confluence-behaviour,f3fa6f1442910969f24ec4193b8cea3744f5847d,hagen-danswer,2024-11-15,reworked drive+confluence frontend and implied backend changes
user_defaults,fff98ddc15d8a94b44ffbaf2225545bc2c4c01b6,pablodanswer,2024-11-12,minor clarity
heads/v0.13.0-cloud.beta.0,102c264fd06232bbc4c7a23615add5cf7c0618be,pablodanswer,2024-11-21,minor updates
heads/v0.13.0-cloud.beta.0,1744d29bd6f6740fb20bbbf8b5651cd60edbf127,pablodanswer,2024-11-21,k
heads/v0.13.0-cloud.beta.0,fa592a1b7a69897110a928a222b19eaef3b7267a,pablodanswer,2024-11-21,clean horizontal scrollbar
validate,afc8075cc3076261c8b98a4fe30822641fb9d2cf,pablodanswer,2024-11-22,add filters to chat
validate,71123f54a753f243015f7f6bac62c3b8d1e6d05b,pablodanswer,2024-11-22,several steps
validate,6061adb114ef20c4bf6567c9450ae51a2938c927,pablodanswer,2024-11-22,remove chat / search toggle
validate,35300f65699862f982016284567ef12974ae05c2,pablodanswer,2024-11-22,update
validate,fe49e35ca476c494d0a9f36eb6cfea3e99ed0427,pablodanswer,2024-11-22,ensure added
validate,804887fd311a783306f160591bc273866388a9f0,pablodanswer,2024-11-21,update
vespa_improvements,7c27de6fdcc6172bc1ff4e9522711210f2113e86,pablodanswer,2024-11-14,minor configuration updates

View File

@@ -16,41 +16,6 @@ class ExternalAccess:
is_public: bool
@dataclass(frozen=True)
class DocExternalAccess:
external_access: ExternalAccess
# The document ID
doc_id: str
def to_dict(self) -> dict:
return {
"external_access": {
"external_user_emails": list(self.external_access.external_user_emails),
"external_user_group_ids": list(
self.external_access.external_user_group_ids
),
"is_public": self.external_access.is_public,
},
"doc_id": self.doc_id,
}
@classmethod
def from_dict(cls, data: dict) -> "DocExternalAccess":
external_access = ExternalAccess(
external_user_emails=set(
data["external_access"].get("external_user_emails", [])
),
external_user_group_ids=set(
data["external_access"].get("external_user_group_ids", [])
),
is_public=data["external_access"]["is_public"],
)
return cls(
external_access=external_access,
doc_id=data["doc_id"],
)
@dataclass(frozen=True)
class DocumentAccess(ExternalAccess):
# User emails for Danswer users, None indicates admin
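The hunk above drops the dict round-trip helpers from DocExternalAccess. For reference, here is a minimal, self-contained sketch of that serialization pattern, which keeps the payload JSON-friendly for handing to a Celery task; the standalone module layout and the assert-based round-trip check are illustrative assumptions, not the repo's actual code.

from dataclasses import dataclass


@dataclass(frozen=True)
class ExternalAccess:
    external_user_emails: set[str]
    external_user_group_ids: set[str]
    is_public: bool


@dataclass(frozen=True)
class DocExternalAccess:
    external_access: ExternalAccess
    doc_id: str

    def to_dict(self) -> dict:
        # Sets are converted to lists so the payload is JSON-serializable.
        return {
            "external_access": {
                "external_user_emails": list(self.external_access.external_user_emails),
                "external_user_group_ids": list(self.external_access.external_user_group_ids),
                "is_public": self.external_access.is_public,
            },
            "doc_id": self.doc_id,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "DocExternalAccess":
        ea = data["external_access"]
        return cls(
            external_access=ExternalAccess(
                external_user_emails=set(ea.get("external_user_emails", [])),
                external_user_group_ids=set(ea.get("external_user_group_ids", [])),
                is_public=ea["is_public"],
            ),
            doc_id=data["doc_id"],
        )


# Round trip: DocExternalAccess -> dict -> DocExternalAccess
original = DocExternalAccess(
    external_access=ExternalAccess({"a@x.com"}, {"group-1"}, False),
    doc_id="doc-123",
)
assert DocExternalAccess.from_dict(original.to_dict()) == original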

View File

@@ -2,8 +2,8 @@ from typing import cast
from danswer.configs.constants import KV_USER_STORE_KEY
from danswer.key_value_store.factory import get_kv_store
from danswer.key_value_store.interface import JSON_ro
from danswer.key_value_store.interface import KvKeyNotFoundError
from danswer.utils.special_types import JSON_ro
def get_invited_users() -> list[str]:

View File

@@ -17,21 +17,16 @@ def set_no_auth_user_preferences(
def load_no_auth_user_preferences(store: KeyValueStore) -> UserPreferences:
print("LOADING NO AUTH USER PREFERENCES")
try:
preferences_data = cast(
Mapping[str, Any], store.load(KV_NO_AUTH_USER_PREFERENCES_KEY)
)
print("PREFERENCES DATA", preferences_data)
return UserPreferences(**preferences_data)
except KvKeyNotFoundError:
return UserPreferences(
chosen_assistants=None, default_model=None, auto_scroll=True
)
return UserPreferences(chosen_assistants=None, default_model=None)
def fetch_no_auth_user(store: KeyValueStore) -> UserInfo:
print("FETCHING NO AUTH USER")
return UserInfo(
id="__no_auth_user__",
email="anonymous@danswer.ai",
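This hunk changes what load_no_auth_user_preferences returns when the preferences key is missing from the key-value store (one side of the diff includes auto_scroll=True in the fallback). A minimal sketch of that load-with-default pattern, with KvKeyNotFoundError, UserPreferences, and an in-memory store re-declared here purely for illustration:

from dataclasses import dataclass


class KvKeyNotFoundError(KeyError):
    pass


@dataclass
class UserPreferences:
    chosen_assistants: list[str] | None = None
    default_model: str | None = None
    auto_scroll: bool = True


class InMemoryKvStore:
    def __init__(self) -> None:
        self._data: dict[str, dict] = {}

    def load(self, key: str) -> dict:
        try:
            return self._data[key]
        except KeyError as e:
            raise KvKeyNotFoundError(key) from e


def load_no_auth_user_preferences(store: InMemoryKvStore) -> UserPreferences:
    try:
        return UserPreferences(**store.load("no_auth_user_preferences"))
    except KvKeyNotFoundError:
        # Key has never been written: fall back to defaults (auto_scroll included).
        return UserPreferences(chosen_assistants=None, default_model=None, auto_scroll=True)


prefs = load_no_auth_user_preferences(InMemoryKvStore())
assert prefs.auto_scroll is True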

View File

@@ -13,24 +13,12 @@ class UserRole(str, Enum):
groups they are curators of
- Global Curator can perform admin actions
for all groups they are a member of
- Limited can access a limited set of basic api endpoints
- Slack are users that have used danswer via slack but dont have a web login
- External permissioned users that have been picked up during the external permissions sync process but don't have a web login
"""
LIMITED = "limited"
BASIC = "basic"
ADMIN = "admin"
CURATOR = "curator"
GLOBAL_CURATOR = "global_curator"
SLACK_USER = "slack_user"
EXT_PERM_USER = "ext_perm_user"
def is_web_login(self) -> bool:
return self not in [
UserRole.SLACK_USER,
UserRole.EXT_PERM_USER,
]
class UserStatus(str, Enum):
@@ -45,8 +33,10 @@ class UserRead(schemas.BaseUser[uuid.UUID]):
class UserCreate(schemas.BaseUserCreate):
role: UserRole = UserRole.BASIC
has_web_login: bool | None = True
tenant_id: str | None = None
class UserUpdate(schemas.BaseUserUpdate):
role: UserRole
has_web_login: bool | None = True
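These hunks trade between an explicit has_web_login flag on the user schemas and a role-derived check (UserRole.is_web_login). For reference, a small standalone sketch of the role-based variant, with the enum values copied from the diff above:

from enum import Enum


class UserRole(str, Enum):
    LIMITED = "limited"
    BASIC = "basic"
    ADMIN = "admin"
    CURATOR = "curator"
    GLOBAL_CURATOR = "global_curator"
    SLACK_USER = "slack_user"
    EXT_PERM_USER = "ext_perm_user"

    def is_web_login(self) -> bool:
        # Slack-only and externally-permissioned users never signed up via the web UI.
        return self not in (UserRole.SLACK_USER, UserRole.EXT_PERM_USER)


assert UserRole.ADMIN.is_web_login()
assert not UserRole.SLACK_USER.is_web_login()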

View File

@@ -49,6 +49,7 @@ from httpx_oauth.oauth2 import BaseOAuth2
from httpx_oauth.oauth2 import OAuth2Token
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.orm import attributes
from sqlalchemy.orm import Session
from danswer.auth.api_key import get_hashed_api_key_from_request
@@ -221,25 +222,18 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
reset_password_token_secret = USER_AUTH_SECRET
verification_token_secret = USER_AUTH_SECRET
user_db: SQLAlchemyUserDatabase[User, uuid.UUID]
async def create(
self,
user_create: schemas.UC | UserCreate,
safe: bool = False,
request: Optional[Request] = None,
) -> User:
referral_source = None
if request is not None:
referral_source = request.cookies.get("referral_source", None)
tenant_id = await fetch_ee_implementation_or_noop(
"danswer.server.tenants.provisioning",
"get_or_create_tenant_id",
async_return_default_schema,
)(
email=user_create.email,
referral_source=referral_source,
)
async with get_async_session_with_tenant(tenant_id) as db_session:
@@ -248,9 +242,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
verify_email_is_invited(user_create.email)
verify_email_domain(user_create.email)
if MULTI_TENANT:
tenant_user_db = SQLAlchemyUserAdminDB[User, uuid.UUID](
db_session, User, OAuthAccount
)
tenant_user_db = SQLAlchemyUserAdminDB(db_session, User, OAuthAccount)
self.user_db = tenant_user_db
self.database = tenant_user_db
@@ -269,9 +261,14 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
except exceptions.UserAlreadyExists:
user = await self.get_by_email(user_create.email)
# Handle case where user has used product outside of web and is now creating an account through web
if not user.role.is_web_login() and user_create.role.is_web_login():
if (
not user.has_web_login
and hasattr(user_create, "has_web_login")
and user_create.has_web_login
):
user_update = UserUpdate(
password=user_create.password,
has_web_login=True,
role=user_create.role,
is_verified=user_create.is_verified,
)
@@ -285,7 +282,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
return user
async def oauth_callback(
self,
self: "BaseUserManager[models.UOAP, models.ID]",
oauth_name: str,
access_token: str,
account_id: str,
@@ -296,18 +293,13 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
*,
associate_by_email: bool = False,
is_verified_by_default: bool = False,
) -> User:
referral_source = None
if request:
referral_source = getattr(request.state, "referral_source", None)
) -> models.UOAP:
tenant_id = await fetch_ee_implementation_or_noop(
"danswer.server.tenants.provisioning",
"get_or_create_tenant_id",
async_return_default_schema,
)(
email=account_email,
referral_source=referral_source,
)
if not tenant_id:
@@ -322,11 +314,9 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
verify_email_domain(account_email)
if MULTI_TENANT:
tenant_user_db = SQLAlchemyUserAdminDB[User, uuid.UUID](
db_session, User, OAuthAccount
)
tenant_user_db = SQLAlchemyUserAdminDB(db_session, User, OAuthAccount)
self.user_db = tenant_user_db
self.database = tenant_user_db
self.database = tenant_user_db # type: ignore
oauth_account_dict = {
"oauth_name": oauth_name,
@@ -378,11 +368,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
and existing_oauth_account.oauth_name == oauth_name
):
user = await self.user_db.update_oauth_account(
user,
# NOTE: OAuthAccount DOES implement the OAuthAccountProtocol
# but the type checker doesn't know that :(
existing_oauth_account, # type: ignore
oauth_account_dict,
user, existing_oauth_account, oauth_account_dict
)
# NOTE: Most IdPs have very short expiry times, and we don't want to force the user to
@@ -395,15 +381,16 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
)
# Handle case where user has used product outside of web and is now creating an account through web
if not user.role.is_web_login():
if not user.has_web_login: # type: ignore
await self.user_db.update(
user,
{
"is_verified": is_verified_by_default,
"role": UserRole.BASIC,
"has_web_login": True,
},
)
user.is_verified = is_verified_by_default
user.has_web_login = True # type: ignore
# this is needed if an organization goes from `TRACK_EXTERNAL_IDP_EXPIRY=true` to `false`
# otherwise, the oidc expiry will always be old, and the user will never be able to login
@@ -478,7 +465,9 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
self.password_helper.hash(credentials.password)
return None
if not user.role.is_web_login():
has_web_login = attributes.get_attribute(user, "has_web_login")
if not has_web_login:
raise BasicAuthenticationError(
detail="NO_WEB_LOGIN_AND_HAS_NO_PASSWORD",
)
@@ -663,24 +652,10 @@ async def current_user_with_expired_token(
return await double_check_user(user, include_expired=True)
async def current_limited_user(
user: User | None = Depends(optional_user),
) -> User | None:
return await double_check_user(user)
async def current_user(
user: User | None = Depends(optional_user),
) -> User | None:
user = await double_check_user(user)
if not user:
return None
if user.role == UserRole.LIMITED:
raise BasicAuthenticationError(
detail="Access denied. User role is LIMITED. BASIC or higher permissions are required.",
)
return user
return await double_check_user(user)
async def current_curator_or_admin_user(
@@ -736,6 +711,8 @@ def generate_state_token(
# refer to https://github.com/fastapi-users/fastapi-users/blob/42ddc241b965475390e2bce887b084152ae1a2cd/fastapi_users/fastapi_users.py#L91
def create_danswer_oauth_router(
oauth_client: BaseOAuth2,
backend: AuthenticationBackend,
@@ -785,22 +762,15 @@ def get_oauth_router(
response_model=OAuth2AuthorizeResponse,
)
async def authorize(
request: Request,
scopes: List[str] = Query(None),
request: Request, scopes: List[str] = Query(None)
) -> OAuth2AuthorizeResponse:
referral_source = request.cookies.get("referral_source", None)
if redirect_url is not None:
authorize_redirect_url = redirect_url
else:
authorize_redirect_url = str(request.url_for(callback_route_name))
next_url = request.query_params.get("next", "/")
state_data: Dict[str, str] = {
"next_url": next_url,
"referral_source": referral_source or "default_referral",
}
state_data: Dict[str, str] = {"next_url": next_url}
state = generate_state_token(state_data, state_secret)
authorization_url = await oauth_client.get_authorization_url(
authorize_redirect_url,
@@ -859,11 +829,8 @@ def get_oauth_router(
raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST)
next_url = state_data.get("next_url", "/")
referral_source = state_data.get("referral_source", None)
request.state.referral_source = referral_source
# Proceed to authenticate or create the user
# Authenticate user
try:
user = await user_manager.oauth_callback(
oauth_client.name,
@@ -905,6 +872,7 @@ def get_oauth_router(
redirect_response.status_code = response.status_code
if hasattr(response, "media_type"):
redirect_response.media_type = response.media_type
return redirect_response
return router
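One side of this users.py diff threads an optional referral_source cookie from the /authorize request into the OAuth state payload and back out in the callback. A minimal sketch of that packing/unpacking; the signing step is replaced with plain base64 here for illustration, on the assumption that the real generate_state_token signs the payload:

import base64
import json


def pack_state(next_url: str, referral_source: str | None) -> str:
    # Stand-in for generate_state_token(): the real code is assumed to sign this payload.
    state_data = {
        "next_url": next_url,
        "referral_source": referral_source or "default_referral",
    }
    return base64.urlsafe_b64encode(json.dumps(state_data).encode()).decode()


def unpack_state(state: str) -> dict:
    # Stand-in for decoding the state token in the OAuth callback.
    return json.loads(base64.urlsafe_b64decode(state.encode()).decode())


state = pack_state("/chat", referral_source="newsletter")
data = unpack_state(state)
assert data["next_url"] == "/chat"
assert data["referral_source"] == "newsletter"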

View File

@@ -24,8 +24,6 @@ from danswer.document_index.vespa_constants import VESPA_CONFIG_SERVER_URL
from danswer.redis.redis_connector import RedisConnector
from danswer.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
from danswer.redis.redis_connector_delete import RedisConnectorDelete
from danswer.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync
from danswer.redis.redis_connector_ext_group_sync import RedisConnectorExternalGroupSync
from danswer.redis.redis_connector_prune import RedisConnectorPrune
from danswer.redis.redis_document_set import RedisDocumentSet
from danswer.redis.redis_pool import get_redis_client
@@ -138,22 +136,6 @@ def on_task_postrun(
RedisConnectorPrune.remove_from_taskset(int(cc_pair_id), task_id, r)
return
if task_id.startswith(RedisConnectorPermissionSync.SUBTASK_PREFIX):
cc_pair_id = RedisConnector.get_id_from_task_id(task_id)
if cc_pair_id is not None:
RedisConnectorPermissionSync.remove_from_taskset(
int(cc_pair_id), task_id, r
)
return
if task_id.startswith(RedisConnectorExternalGroupSync.SUBTASK_PREFIX):
cc_pair_id = RedisConnector.get_id_from_task_id(task_id)
if cc_pair_id is not None:
RedisConnectorExternalGroupSync.remove_from_taskset(
int(cc_pair_id), task_id, r
)
return
def on_celeryd_init(sender: Any = None, conf: Any = None, **kwargs: Any) -> None:
"""The first signal sent on celery worker startup"""

View File

@@ -12,7 +12,6 @@ from danswer.db.engine import get_all_tenant_ids
from danswer.db.engine import SqlEngine
from danswer.utils.logger import setup_logger
from danswer.utils.variable_functionality import fetch_versioned_implementation
from shared_configs.configs import IGNORED_SYNCING_TENANT_LIST
from shared_configs.configs import MULTI_TENANT
logger = setup_logger(__name__)
@@ -73,15 +72,6 @@ class DynamicTenantScheduler(PersistentScheduler):
logger.info(f"Found {len(existing_tenants)} existing tenants in schedule")
for tenant_id in tenant_ids:
if (
IGNORED_SYNCING_TENANT_LIST
and tenant_id in IGNORED_SYNCING_TENANT_LIST
):
logger.info(
f"Skipping tenant {tenant_id} as it is in the ignored syncing list"
)
continue
if tenant_id not in existing_tenants:
logger.info(f"Processing new tenant: {tenant_id}")

View File

@@ -91,7 +91,5 @@ def on_setup_logging(
celery_app.autodiscover_tasks(
[
"danswer.background.celery.tasks.pruning",
"danswer.background.celery.tasks.doc_permission_syncing",
"danswer.background.celery.tasks.external_group_syncing",
]
)

View File

@@ -6,7 +6,6 @@ from celery import signals
from celery import Task
from celery.signals import celeryd_init
from celery.signals import worker_init
from celery.signals import worker_process_init
from celery.signals import worker_ready
from celery.signals import worker_shutdown
@@ -60,7 +59,7 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")
SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME)
SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=sender.concurrency)
SqlEngine.init_engine(pool_size=8, max_overflow=0)
# Startup checks are not needed in multi-tenant case
if MULTI_TENANT:
@@ -82,11 +81,6 @@ def on_worker_shutdown(sender: Any, **kwargs: Any) -> None:
app_base.on_worker_shutdown(sender, **kwargs)
@worker_process_init.connect
def init_worker(**kwargs: Any) -> None:
SqlEngine.reset_engine()
@signals.setup_logging.connect
def on_setup_logging(
loglevel: Any, logfile: Any, format: Any, colorize: Any, **kwargs: Any

View File

@@ -92,6 +92,5 @@ celery_app.autodiscover_tasks(
"danswer.background.celery.tasks.shared",
"danswer.background.celery.tasks.vespa",
"danswer.background.celery.tasks.connector_deletion",
"danswer.background.celery.tasks.doc_permission_syncing",
]
)

View File

@@ -1,6 +1,5 @@
import multiprocessing
from typing import Any
from typing import cast
from celery import bootsteps # type: ignore
from celery import Celery
@@ -15,18 +14,12 @@ from celery.signals import worker_shutdown
import danswer.background.celery.apps.app_base as app_base
from danswer.background.celery.apps.app_base import task_logger
from danswer.background.celery.celery_utils import celery_is_worker_primary
from danswer.background.celery.tasks.vespa.tasks import get_unfenced_index_attempt_ids
from danswer.configs.constants import CELERY_PRIMARY_WORKER_LOCK_TIMEOUT
from danswer.configs.constants import DanswerRedisLocks
from danswer.configs.constants import POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME
from danswer.db.engine import get_session_with_default_tenant
from danswer.db.engine import SqlEngine
from danswer.db.index_attempt import get_index_attempt
from danswer.db.index_attempt import mark_attempt_failed
from danswer.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
from danswer.redis.redis_connector_delete import RedisConnectorDelete
from danswer.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync
from danswer.redis.redis_connector_ext_group_sync import RedisConnectorExternalGroupSync
from danswer.redis.redis_connector_index import RedisConnectorIndex
from danswer.redis.redis_connector_prune import RedisConnectorPrune
from danswer.redis.redis_connector_stop import RedisConnectorStop
@@ -96,15 +89,6 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
# by the primary worker. This is unnecessary in the multi tenant scenario
r = get_redis_client(tenant_id=None)
# Log the role and slave count - being connected to a slave or slave count > 0 could be problematic
info: dict[str, Any] = cast(dict, r.info("replication"))
role: str = cast(str, info.get("role"))
connected_slaves: int = info.get("connected_slaves", 0)
logger.info(
f"Redis INFO REPLICATION: role={role} connected_slaves={connected_slaves}"
)
# For the moment, we're assuming that we are the only primary worker
# that should be running.
# TODO: maybe check for or clean up another zombie primary worker if we detect it
@@ -150,27 +134,6 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
RedisConnectorStop.reset_all(r)
RedisConnectorPermissionSync.reset_all(r)
RedisConnectorExternalGroupSync.reset_all(r)
# mark orphaned index attempts as failed
with get_session_with_default_tenant() as db_session:
unfenced_attempt_ids = get_unfenced_index_attempt_ids(db_session, r)
for attempt_id in unfenced_attempt_ids:
attempt = get_index_attempt(db_session, attempt_id)
if not attempt:
continue
failure_reason = (
f"Orphaned index attempt found on startup: "
f"index_attempt={attempt.id} "
f"cc_pair={attempt.connector_credential_pair_id} "
f"search_settings={attempt.search_settings_id}"
)
logger.warning(failure_reason)
mark_attempt_failed(attempt.id, db_session, failure_reason)
@worker_ready.connect
def on_worker_ready(sender: Any, **kwargs: Any) -> None:
@@ -270,8 +233,6 @@ celery_app.autodiscover_tasks(
"danswer.background.celery.tasks.connector_deletion",
"danswer.background.celery.tasks.indexing",
"danswer.background.celery.tasks.periodic",
"danswer.background.celery.tasks.doc_permission_syncing",
"danswer.background.celery.tasks.external_group_syncing",
"danswer.background.celery.tasks.pruning",
"danswer.background.celery.tasks.shared",
"danswer.background.celery.tasks.vespa",

View File

@@ -0,0 +1,96 @@
from datetime import timedelta
from typing import Any
from celery.beat import PersistentScheduler # type: ignore
from celery.utils.log import get_task_logger
from danswer.db.engine import get_all_tenant_ids
from danswer.utils.variable_functionality import fetch_versioned_implementation
logger = get_task_logger(__name__)
class DynamicTenantScheduler(PersistentScheduler):
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self._reload_interval = timedelta(minutes=1)
self._last_reload = self.app.now() - self._reload_interval
def setup_schedule(self) -> None:
super().setup_schedule()
def tick(self) -> float:
retval = super().tick()
now = self.app.now()
if (
self._last_reload is None
or (now - self._last_reload) > self._reload_interval
):
logger.info("Reloading schedule to check for new tenants...")
self._update_tenant_tasks()
self._last_reload = now
return retval
def _update_tenant_tasks(self) -> None:
logger.info("Checking for tenant task updates...")
try:
tenant_ids = get_all_tenant_ids()
tasks_to_schedule = fetch_versioned_implementation(
"danswer.background.celery.tasks.beat_schedule", "get_tasks_to_schedule"
)
new_beat_schedule: dict[str, dict[str, Any]] = {}
current_schedule = getattr(self, "_store", {"entries": {}}).get(
"entries", {}
)
existing_tenants = set()
for task_name in current_schedule.keys():
if "-" in task_name:
existing_tenants.add(task_name.split("-")[-1])
for tenant_id in tenant_ids:
if tenant_id not in existing_tenants:
logger.info(f"Found new tenant: {tenant_id}")
for task in tasks_to_schedule():
task_name = f"{task['name']}-{tenant_id}"
new_task = {
"task": task["task"],
"schedule": task["schedule"],
"kwargs": {"tenant_id": tenant_id},
}
if options := task.get("options"):
new_task["options"] = options
new_beat_schedule[task_name] = new_task
if self._should_update_schedule(current_schedule, new_beat_schedule):
logger.info(
"Updating schedule",
extra={
"new_tasks": len(new_beat_schedule),
"current_tasks": len(current_schedule),
},
)
if not hasattr(self, "_store"):
self._store: dict[str, dict] = {"entries": {}}
self.update_from_dict(new_beat_schedule)
logger.info(f"New schedule: {new_beat_schedule}")
logger.info("Tenant tasks updated successfully")
else:
logger.debug("No schedule updates needed")
except (AttributeError, KeyError):
logger.exception("Failed to process task configuration")
except Exception:
logger.exception("Unexpected error updating tenant tasks")
def _should_update_schedule(
self, current_schedule: dict, new_schedule: dict
) -> bool:
"""Compare schedules to determine if an update is needed."""
current_tasks = set(current_schedule.keys())
new_tasks = set(new_schedule.keys())
return current_tasks != new_tasks
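The new DynamicTenantScheduler above rebuilds the beat schedule roughly once a minute by suffixing each templated task name with a tenant id. A short sketch of that naming scheme, plus how a custom scheduler class is typically passed to celery beat (the module path in the comment is assumed for illustration):

from datetime import timedelta

tasks_to_schedule = [
    {"name": "check-for-vespa-sync", "task": "check_for_vespa_sync_task",
     "schedule": timedelta(seconds=20)},
]


def per_tenant_entries(tenant_ids: list[str]) -> dict[str, dict]:
    # Mirrors _update_tenant_tasks: one beat entry per (template, tenant) pair.
    entries = {}
    for tenant_id in tenant_ids:
        for task in tasks_to_schedule:
            entries[f"{task['name']}-{tenant_id}"] = {
                "task": task["task"],
                "schedule": task["schedule"],
                "kwargs": {"tenant_id": tenant_id},
            }
    return entries


print(sorted(per_tenant_entries(["tenant_a", "tenant_b"]).keys()))
# ['check-for-vespa-sync-tenant_a', 'check-for-vespa-sync-tenant_b']

# Celery beat is then pointed at the custom scheduler class, e.g.:
#   celery -A <app> beat --scheduler <module_path>:DynamicTenantScheduler
# (module path left as a placeholder; the repo's actual path is not shown here)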

View File

@@ -4,6 +4,7 @@ from typing import Any
from sqlalchemy.orm import Session
from danswer.background.indexing.run_indexing import RunIndexingCallbackInterface
from danswer.configs.app_configs import MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
rate_limit_builder,
@@ -16,7 +17,6 @@ from danswer.connectors.models import Document
from danswer.db.connector_credential_pair import get_connector_credential_pair
from danswer.db.enums import TaskStatus
from danswer.db.models import TaskQueueState
from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from danswer.redis.redis_connector import RedisConnector
from danswer.server.documents.models import DeletionAttemptSnapshot
from danswer.utils.logger import setup_logger
@@ -78,10 +78,10 @@ def document_batch_to_ids(
def extract_ids_from_runnable_connector(
runnable_connector: BaseConnector,
callback: IndexingHeartbeatInterface | None = None,
callback: RunIndexingCallbackInterface | None = None,
) -> set[str]:
"""
If the SlimConnector hasnt been implemented for the given connector, just pull
If the PruneConnector hasnt been implemented for the given connector, just pull
all docs using the load_from_state and grab out the IDs.
Optionally, a callback can be passed to handle the length of each document batch.
@@ -111,15 +111,10 @@ def extract_ids_from_runnable_connector(
for doc_batch in doc_batch_generator:
if callback:
if callback.should_stop():
raise RuntimeError(
"extract_ids_from_runnable_connector: Stop signal detected"
)
raise RuntimeError("Stop signal received")
callback.progress(len(doc_batch))
all_connector_doc_ids.update(doc_batch_processing_func(doc_batch))
if callback:
callback.progress("extract_ids_from_runnable_connector", len(doc_batch))
return all_connector_doc_ids
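The hunk above swaps the callback interface used while extracting document IDs for pruning; both variants expose should_stop() and progress(). A toy callback against the simpler signature shown in the diff, assumed sufficient for illustration:

class CountingCallback:
    """Toy callback: tallies document counts and never requests a stop."""

    def __init__(self) -> None:
        self.total = 0

    def should_stop(self) -> bool:
        return False

    def progress(self, amount: int) -> None:
        self.total += amount


callback = CountingCallback()
for batch in ([{"id": "a"}, {"id": "b"}], [{"id": "c"}]):
    if callback.should_stop():
        raise RuntimeError("Stop signal received")
    callback.progress(len(batch))
assert callback.total == 3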

View File

@@ -8,7 +8,7 @@ tasks_to_schedule = [
{
"name": "check-for-vespa-sync",
"task": "check_for_vespa_sync_task",
"schedule": timedelta(seconds=20),
"schedule": timedelta(seconds=5),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
{
@@ -20,13 +20,13 @@ tasks_to_schedule = [
{
"name": "check-for-indexing",
"task": "check_for_indexing",
"schedule": timedelta(seconds=15),
"schedule": timedelta(seconds=10),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
{
"name": "check-for-prune",
"task": "check_for_pruning",
"schedule": timedelta(seconds=15),
"schedule": timedelta(seconds=10),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
{
@@ -41,18 +41,6 @@ tasks_to_schedule = [
"schedule": timedelta(seconds=5),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
{
"name": "check-for-doc-permissions-sync",
"task": "check_for_doc_permissions_sync",
"schedule": timedelta(seconds=30),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
{
"name": "check-for-external-group-sync",
"task": "check_for_external_group_sync",
"schedule": timedelta(seconds=20),
"options": {"priority": DanswerCeleryPriority.HIGH},
},
]

View File

@@ -1,12 +1,12 @@
from datetime import datetime
from datetime import timezone
import redis
from celery import Celery
from celery import shared_task
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from redis import Redis
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.background.celery.apps.app_base import task_logger
@@ -19,7 +19,7 @@ from danswer.db.engine import get_session_with_tenant
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.search_settings import get_all_search_settings
from danswer.redis.redis_connector import RedisConnector
from danswer.redis.redis_connector_delete import RedisConnectorDeletePayload
from danswer.redis.redis_connector_delete import RedisConnectorDeletionFenceData
from danswer.redis.redis_pool import get_redis_client
@@ -87,7 +87,7 @@ def try_generate_document_cc_pair_cleanup_tasks(
cc_pair_id: int,
db_session: Session,
r: Redis,
lock_beat: RedisLock,
lock_beat: redis.lock.Lock,
tenant_id: str | None,
) -> int | None:
"""Returns an int if syncing is needed. The int represents the number of sync tasks generated.
@@ -118,7 +118,7 @@ def try_generate_document_cc_pair_cleanup_tasks(
return None
# set a basic fence to start
fence_payload = RedisConnectorDeletePayload(
fence_payload = RedisConnectorDeletionFenceData(
num_tasks=None,
submitted=datetime.now(timezone.utc),
)
@@ -143,12 +143,6 @@ def try_generate_document_cc_pair_cleanup_tasks(
f"cc_pair={cc_pair_id}"
)
if redis_connector.permissions.fenced:
raise TaskDependencyError(
f"Connector deletion - Delayed (permissions in progress): "
f"cc_pair={cc_pair_id}"
)
# add tasks to celery and build up the task set to monitor in redis
redis_connector.delete.taskset_clear()

View File

@@ -1,321 +0,0 @@
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from uuid import uuid4
from celery import Celery
from celery import shared_task
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from redis import Redis
from danswer.access.models import DocExternalAccess
from danswer.background.celery.apps.app_base import task_logger
from danswer.configs.app_configs import JOB_TIMEOUT
from danswer.configs.constants import CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
from danswer.configs.constants import DANSWER_REDIS_FUNCTION_LOCK_PREFIX
from danswer.configs.constants import DanswerCeleryPriority
from danswer.configs.constants import DanswerCeleryQueues
from danswer.configs.constants import DanswerRedisLocks
from danswer.configs.constants import DocumentSource
from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id
from danswer.db.engine import get_session_with_tenant
from danswer.db.enums import AccessType
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.models import ConnectorCredentialPair
from danswer.db.users import batch_add_ext_perm_user_if_not_exists
from danswer.redis.redis_connector import RedisConnector
from danswer.redis.redis_connector_doc_perm_sync import (
RedisConnectorPermissionSyncData,
)
from danswer.redis.redis_pool import get_redis_client
from danswer.utils.logger import doc_permission_sync_ctx
from danswer.utils.logger import setup_logger
from ee.danswer.db.connector_credential_pair import get_all_auto_sync_cc_pairs
from ee.danswer.db.document import upsert_document_external_perms
from ee.danswer.external_permissions.sync_params import DOC_PERMISSION_SYNC_PERIODS
from ee.danswer.external_permissions.sync_params import DOC_PERMISSIONS_FUNC_MAP
logger = setup_logger()
DOCUMENT_PERMISSIONS_UPDATE_MAX_RETRIES = 3
# 5 seconds more than RetryDocumentIndex STOP_AFTER+MAX_WAIT
LIGHT_SOFT_TIME_LIMIT = 105
LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15
def _is_external_doc_permissions_sync_due(cc_pair: ConnectorCredentialPair) -> bool:
"""Returns boolean indicating if external doc permissions sync is due."""
if cc_pair.access_type != AccessType.SYNC:
return False
# skip doc permissions sync if not active
if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE:
return False
if cc_pair.status == ConnectorCredentialPairStatus.DELETING:
return False
# If the last sync is None, it has never been run so we run the sync
last_perm_sync = cc_pair.last_time_perm_sync
if last_perm_sync is None:
return True
source_sync_period = DOC_PERMISSION_SYNC_PERIODS.get(cc_pair.connector.source)
# If RESTRICTED_FETCH_PERIOD[source] is None, we always run the sync.
if not source_sync_period:
return True
# If the last sync is greater than the full fetch period, we run the sync
next_sync = last_perm_sync + timedelta(seconds=source_sync_period)
if datetime.now(timezone.utc) >= next_sync:
return True
return False
@shared_task(
name="check_for_doc_permissions_sync",
soft_time_limit=JOB_TIMEOUT,
bind=True,
)
def check_for_doc_permissions_sync(self: Task, *, tenant_id: str | None) -> None:
r = get_redis_client(tenant_id=tenant_id)
lock_beat = r.lock(
DanswerRedisLocks.CHECK_CONNECTOR_DOC_PERMISSIONS_SYNC_BEAT_LOCK,
timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
)
try:
# these tasks should never overlap
if not lock_beat.acquire(blocking=False):
return
# get all cc pairs that need to be synced
cc_pair_ids_to_sync: list[int] = []
with get_session_with_tenant(tenant_id) as db_session:
cc_pairs = get_all_auto_sync_cc_pairs(db_session)
for cc_pair in cc_pairs:
if _is_external_doc_permissions_sync_due(cc_pair):
cc_pair_ids_to_sync.append(cc_pair.id)
for cc_pair_id in cc_pair_ids_to_sync:
tasks_created = try_creating_permissions_sync_task(
self.app, cc_pair_id, r, tenant_id
)
if not tasks_created:
continue
task_logger.info(f"Doc permissions sync queued: cc_pair={cc_pair_id}")
except SoftTimeLimitExceeded:
task_logger.info(
"Soft time limit exceeded, task is being terminated gracefully."
)
except Exception:
task_logger.exception(f"Unexpected exception: tenant={tenant_id}")
finally:
if lock_beat.owned():
lock_beat.release()
def try_creating_permissions_sync_task(
app: Celery,
cc_pair_id: int,
r: Redis,
tenant_id: str | None,
) -> int | None:
"""Returns an int if syncing is needed. The int represents the number of sync tasks generated.
Returns None if no syncing is required."""
redis_connector = RedisConnector(tenant_id, cc_pair_id)
LOCK_TIMEOUT = 30
lock = r.lock(
DANSWER_REDIS_FUNCTION_LOCK_PREFIX + "try_generate_permissions_sync_tasks",
timeout=LOCK_TIMEOUT,
)
acquired = lock.acquire(blocking_timeout=LOCK_TIMEOUT / 2)
if not acquired:
return None
try:
if redis_connector.permissions.fenced:
return None
if redis_connector.delete.fenced:
return None
if redis_connector.prune.fenced:
return None
redis_connector.permissions.generator_clear()
redis_connector.permissions.taskset_clear()
custom_task_id = f"{redis_connector.permissions.generator_task_key}_{uuid4()}"
app.send_task(
"connector_permission_sync_generator_task",
kwargs=dict(
cc_pair_id=cc_pair_id,
tenant_id=tenant_id,
),
queue=DanswerCeleryQueues.CONNECTOR_DOC_PERMISSIONS_SYNC,
task_id=custom_task_id,
priority=DanswerCeleryPriority.HIGH,
)
# set a basic fence to start
payload = RedisConnectorPermissionSyncData(
started=None,
)
redis_connector.permissions.set_fence(payload)
except Exception:
task_logger.exception(f"Unexpected exception: cc_pair={cc_pair_id}")
return None
finally:
if lock.owned():
lock.release()
return 1
@shared_task(
name="connector_permission_sync_generator_task",
acks_late=False,
soft_time_limit=JOB_TIMEOUT,
track_started=True,
trail=False,
bind=True,
)
def connector_permission_sync_generator_task(
self: Task,
cc_pair_id: int,
tenant_id: str | None,
) -> None:
"""
Permission sync task that handles document permission syncing for a given connector credential pair
This task assumes that the task has already been properly fenced
"""
doc_permission_sync_ctx_dict = doc_permission_sync_ctx.get()
doc_permission_sync_ctx_dict["cc_pair_id"] = cc_pair_id
doc_permission_sync_ctx_dict["request_id"] = self.request.id
doc_permission_sync_ctx.set(doc_permission_sync_ctx_dict)
redis_connector = RedisConnector(tenant_id, cc_pair_id)
r = get_redis_client(tenant_id=tenant_id)
lock = r.lock(
DanswerRedisLocks.CONNECTOR_DOC_PERMISSIONS_SYNC_LOCK_PREFIX
+ f"_{redis_connector.id}",
timeout=CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT,
)
acquired = lock.acquire(blocking=False)
if not acquired:
task_logger.warning(
f"Permission sync task already running, exiting...: cc_pair={cc_pair_id}"
)
return None
try:
with get_session_with_tenant(tenant_id) as db_session:
cc_pair = get_connector_credential_pair_from_id(cc_pair_id, db_session)
if cc_pair is None:
raise ValueError(
f"No connector credential pair found for id: {cc_pair_id}"
)
source_type = cc_pair.connector.source
doc_sync_func = DOC_PERMISSIONS_FUNC_MAP.get(source_type)
if doc_sync_func is None:
raise ValueError(f"No doc sync func found for {source_type}")
logger.info(f"Syncing docs for {source_type}")
payload = RedisConnectorPermissionSyncData(
started=datetime.now(timezone.utc),
)
redis_connector.permissions.set_fence(payload)
document_external_accesses: list[DocExternalAccess] = doc_sync_func(cc_pair)
task_logger.info(
f"RedisConnector.permissions.generate_tasks starting. cc_pair={cc_pair_id}"
)
tasks_generated = redis_connector.permissions.generate_tasks(
self.app, lock, document_external_accesses, source_type
)
if tasks_generated is None:
return None
task_logger.info(
f"RedisConnector.permissions.generate_tasks finished. "
f"cc_pair={cc_pair_id} tasks_generated={tasks_generated}"
)
redis_connector.permissions.generator_complete = tasks_generated
except Exception as e:
task_logger.exception(f"Failed to run permission sync: cc_pair={cc_pair_id}")
redis_connector.permissions.generator_clear()
redis_connector.permissions.taskset_clear()
redis_connector.permissions.set_fence(None)
raise e
finally:
if lock.owned():
lock.release()
@shared_task(
name="update_external_document_permissions_task",
soft_time_limit=LIGHT_SOFT_TIME_LIMIT,
time_limit=LIGHT_TIME_LIMIT,
max_retries=DOCUMENT_PERMISSIONS_UPDATE_MAX_RETRIES,
bind=True,
)
def update_external_document_permissions_task(
self: Task,
tenant_id: str | None,
serialized_doc_external_access: dict,
source_string: str,
) -> bool:
document_external_access = DocExternalAccess.from_dict(
serialized_doc_external_access
)
doc_id = document_external_access.doc_id
external_access = document_external_access.external_access
try:
with get_session_with_tenant(tenant_id) as db_session:
# Then we build the update requests to update vespa
batch_add_ext_perm_user_if_not_exists(
db_session=db_session,
emails=list(external_access.external_user_emails),
)
upsert_document_external_perms(
db_session=db_session,
doc_id=doc_id,
external_access=external_access,
source_type=DocumentSource(source_string),
)
logger.debug(
f"Successfully synced postgres document permissions for {doc_id}"
)
return True
except Exception:
logger.exception("Error Syncing Document Permissions")
return False

View File

@@ -1,265 +0,0 @@
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from uuid import uuid4
from celery import Celery
from celery import shared_task
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from redis import Redis
from danswer.background.celery.apps.app_base import task_logger
from danswer.configs.app_configs import JOB_TIMEOUT
from danswer.configs.constants import CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
from danswer.configs.constants import DANSWER_REDIS_FUNCTION_LOCK_PREFIX
from danswer.configs.constants import DanswerCeleryPriority
from danswer.configs.constants import DanswerCeleryQueues
from danswer.configs.constants import DanswerRedisLocks
from danswer.db.connector import mark_cc_pair_as_external_group_synced
from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id
from danswer.db.engine import get_session_with_tenant
from danswer.db.enums import AccessType
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.models import ConnectorCredentialPair
from danswer.redis.redis_connector import RedisConnector
from danswer.redis.redis_pool import get_redis_client
from danswer.utils.logger import setup_logger
from ee.danswer.db.connector_credential_pair import get_all_auto_sync_cc_pairs
from ee.danswer.db.external_perm import ExternalUserGroup
from ee.danswer.db.external_perm import replace_user__ext_group_for_cc_pair
from ee.danswer.external_permissions.sync_params import EXTERNAL_GROUP_SYNC_PERIODS
from ee.danswer.external_permissions.sync_params import GROUP_PERMISSIONS_FUNC_MAP
logger = setup_logger()
EXTERNAL_GROUPS_UPDATE_MAX_RETRIES = 3
# 5 seconds more than RetryDocumentIndex STOP_AFTER+MAX_WAIT
LIGHT_SOFT_TIME_LIMIT = 105
LIGHT_TIME_LIMIT = LIGHT_SOFT_TIME_LIMIT + 15
def _is_external_group_sync_due(cc_pair: ConnectorCredentialPair) -> bool:
"""Returns boolean indicating if external group sync is due."""
if cc_pair.access_type != AccessType.SYNC:
return False
# skip external group sync if not active
if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE:
return False
if cc_pair.status == ConnectorCredentialPairStatus.DELETING:
return False
# If there is no group sync function for the connector, we don't run the sync
# This is fine because not all sources have a concept of groups
if not GROUP_PERMISSIONS_FUNC_MAP.get(cc_pair.connector.source):
return False
# If the last sync is None, it has never been run so we run the sync
last_ext_group_sync = cc_pair.last_time_external_group_sync
if last_ext_group_sync is None:
return True
source_sync_period = EXTERNAL_GROUP_SYNC_PERIODS.get(cc_pair.connector.source)
# If there is no sync period configured for this source, we always run the sync.
if not source_sync_period:
return True
# If the time since the last sync exceeds the sync period, we run the sync
next_sync = last_ext_group_sync + timedelta(seconds=source_sync_period)
if datetime.now(timezone.utc) >= next_sync:
return True
return False
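The due-date check above is just last-sync-plus-period compared against now; a small, self-contained illustration with made-up numbers:

from datetime import datetime, timedelta, timezone

# Illustrative only: a source whose last external group sync ran 2 hours ago
# with a 1-hour configured period is due for another sync this beat.
last_ext_group_sync = datetime.now(timezone.utc) - timedelta(hours=2)
source_sync_period = 3600  # seconds; hypothetical per-source period
next_sync = last_ext_group_sync + timedelta(seconds=source_sync_period)
assert datetime.now(timezone.utc) >= next_sync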
@shared_task(
name="check_for_external_group_sync",
soft_time_limit=JOB_TIMEOUT,
bind=True,
)
def check_for_external_group_sync(self: Task, *, tenant_id: str | None) -> None:
r = get_redis_client(tenant_id=tenant_id)
lock_beat = r.lock(
DanswerRedisLocks.CHECK_CONNECTOR_EXTERNAL_GROUP_SYNC_BEAT_LOCK,
timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
)
try:
# these tasks should never overlap
if not lock_beat.acquire(blocking=False):
return
cc_pair_ids_to_sync: list[int] = []
with get_session_with_tenant(tenant_id) as db_session:
cc_pairs = get_all_auto_sync_cc_pairs(db_session)
for cc_pair in cc_pairs:
if _is_external_group_sync_due(cc_pair):
cc_pair_ids_to_sync.append(cc_pair.id)
for cc_pair_id in cc_pair_ids_to_sync:
tasks_created = try_creating_permissions_sync_task(
self.app, cc_pair_id, r, tenant_id
)
if not tasks_created:
continue
task_logger.info(f"External group sync queued: cc_pair={cc_pair_id}")
except SoftTimeLimitExceeded:
task_logger.info(
"Soft time limit exceeded, task is being terminated gracefully."
)
except Exception:
task_logger.exception(f"Unexpected exception: tenant={tenant_id}")
finally:
if lock_beat.owned():
lock_beat.release()
def try_creating_permissions_sync_task(
app: Celery,
cc_pair_id: int,
r: Redis,
tenant_id: str | None,
) -> int | None:
"""Returns an int if syncing is needed. The int represents the number of sync tasks generated.
Returns None if no syncing is required."""
redis_connector = RedisConnector(tenant_id, cc_pair_id)
LOCK_TIMEOUT = 30
lock = r.lock(
DANSWER_REDIS_FUNCTION_LOCK_PREFIX + "try_generate_external_group_sync_tasks",
timeout=LOCK_TIMEOUT,
)
acquired = lock.acquire(blocking_timeout=LOCK_TIMEOUT / 2)
if not acquired:
return None
try:
# Don't kick off a new sync if the previous one is still running
if redis_connector.external_group_sync.fenced:
return None
redis_connector.external_group_sync.generator_clear()
redis_connector.external_group_sync.taskset_clear()
custom_task_id = f"{redis_connector.external_group_sync.taskset_key}_{uuid4()}"
_ = app.send_task(
"connector_external_group_sync_generator_task",
kwargs=dict(
cc_pair_id=cc_pair_id,
tenant_id=tenant_id,
),
queue=DanswerCeleryQueues.CONNECTOR_EXTERNAL_GROUP_SYNC,
task_id=custom_task_id,
priority=DanswerCeleryPriority.HIGH,
)
# set a basic fence to start
redis_connector.external_group_sync.set_fence(True)
except Exception:
task_logger.exception(
f"Unexpected exception while trying to create external group sync task: cc_pair={cc_pair_id}"
)
return None
finally:
if lock.owned():
lock.release()
return 1
@shared_task(
name="connector_external_group_sync_generator_task",
acks_late=False,
soft_time_limit=JOB_TIMEOUT,
track_started=True,
trail=False,
bind=True,
)
def connector_external_group_sync_generator_task(
self: Task,
cc_pair_id: int,
tenant_id: str | None,
) -> None:
"""
Permission sync task that handles document permission syncing for a given connector credential pair
This task assumes that the task has already been properly fenced
"""
redis_connector = RedisConnector(tenant_id, cc_pair_id)
r = get_redis_client(tenant_id=tenant_id)
lock = r.lock(
DanswerRedisLocks.CONNECTOR_EXTERNAL_GROUP_SYNC_LOCK_PREFIX
+ f"_{redis_connector.id}",
timeout=CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT,
)
try:
acquired = lock.acquire(blocking=False)
if not acquired:
task_logger.warning(
f"External group sync task already running, exiting...: cc_pair={cc_pair_id}"
)
return None
with get_session_with_tenant(tenant_id) as db_session:
cc_pair = get_connector_credential_pair_from_id(cc_pair_id, db_session)
if cc_pair is None:
raise ValueError(
f"No connector credential pair found for id: {cc_pair_id}"
)
source_type = cc_pair.connector.source
ext_group_sync_func = GROUP_PERMISSIONS_FUNC_MAP.get(source_type)
if ext_group_sync_func is None:
raise ValueError(f"No external group sync func found for {source_type}")
logger.info(f"Syncing docs for {source_type}")
external_user_groups: list[ExternalUserGroup] = ext_group_sync_func(cc_pair)
logger.info(
f"Syncing {len(external_user_groups)} external user groups for {source_type}"
)
replace_user__ext_group_for_cc_pair(
db_session=db_session,
cc_pair_id=cc_pair.id,
group_defs=external_user_groups,
source=cc_pair.connector.source,
)
logger.info(
f"Synced {len(external_user_groups)} external user groups for {source_type}"
)
mark_cc_pair_as_external_group_synced(db_session, cc_pair.id)
except Exception as e:
task_logger.exception(
f"Failed to run external group sync: cc_pair={cc_pair_id}"
)
redis_connector.external_group_sync.generator_clear()
redis_connector.external_group_sync.taskset_clear()
raise e
finally:
# we always want to clear the fence after the task is done or failed so it doesn't get stuck
redis_connector.external_group_sync.set_fence(False)
if lock.owned():
lock.release()

View File

@@ -3,19 +3,19 @@ from datetime import timezone
from http import HTTPStatus
from time import sleep
import redis
import sentry_sdk
from celery import Celery
from celery import shared_task
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from redis import Redis
from redis.exceptions import LockError
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.background.celery.apps.app_base import task_logger
from danswer.background.indexing.job_client import SimpleJobClient
from danswer.background.indexing.run_indexing import run_indexing_entrypoint
from danswer.background.indexing.run_indexing import RunIndexingCallbackInterface
from danswer.configs.app_configs import DISABLE_INDEX_UPDATE_ON_SWAP
from danswer.configs.constants import CELERY_INDEXING_LOCK_TIMEOUT
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
@@ -41,11 +41,10 @@ from danswer.db.models import SearchSettings
from danswer.db.search_settings import get_current_search_settings
from danswer.db.search_settings import get_secondary_search_settings
from danswer.db.swap_index import check_index_swap
from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from danswer.natural_language_processing.search_nlp_models import EmbeddingModel
from danswer.natural_language_processing.search_nlp_models import warm_up_bi_encoder
from danswer.redis.redis_connector import RedisConnector
from danswer.redis.redis_connector_index import RedisConnectorIndexPayload
from danswer.redis.redis_connector_index import RedisConnectorIndexingFenceData
from danswer.redis.redis_pool import get_redis_client
from danswer.utils.logger import setup_logger
from danswer.utils.variable_functionality import global_version
@@ -57,46 +56,27 @@ from shared_configs.configs import SENTRY_DSN
logger = setup_logger()
class IndexingCallback(IndexingHeartbeatInterface):
class RunIndexingCallback(RunIndexingCallbackInterface):
def __init__(
self,
stop_key: str,
generator_progress_key: str,
redis_lock: RedisLock,
redis_lock: redis.lock.Lock,
redis_client: Redis,
):
super().__init__()
self.redis_lock: RedisLock = redis_lock
self.redis_lock: redis.lock.Lock = redis_lock
self.stop_key: str = stop_key
self.generator_progress_key: str = generator_progress_key
self.redis_client = redis_client
self.started: datetime = datetime.now(timezone.utc)
self.redis_lock.reacquire()
self.last_tag: str = ""
self.last_lock_reacquire: datetime = datetime.now(timezone.utc)
def should_stop(self) -> bool:
if self.redis_client.exists(self.stop_key):
return True
return False
def progress(self, tag: str, amount: int) -> None:
try:
self.redis_lock.reacquire()
self.last_tag = tag
self.last_lock_reacquire = datetime.now(timezone.utc)
except LockError:
logger.exception(
f"IndexingCallback - lock.reacquire exceptioned. "
f"lock_timeout={self.redis_lock.timeout} "
f"start={self.started} "
f"last_tag={self.last_tag} "
f"last_reacquired={self.last_lock_reacquire} "
f"now={datetime.now(timezone.utc)}"
)
raise
def progress(self, amount: int) -> None:
self.redis_lock.reacquire()
self.redis_client.incrby(self.generator_progress_key, amount)
@@ -195,8 +175,7 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
)
if attempt_id:
task_logger.info(
f"Connector indexing queued: "
f"index_attempt={attempt_id} "
f"Indexing queued: index_attempt={attempt_id} "
f"cc_pair={cc_pair.id} "
f"search_settings={search_settings_instance.id} "
)
@@ -346,7 +325,7 @@ def try_creating_indexing_task(
redis_connector_index.generator_clear()
# set a basic fence to start
payload = RedisConnectorIndexPayload(
payload = RedisConnectorIndexingFenceData(
index_attempt_id=None,
started=None,
submitted=datetime.now(timezone.utc),
@@ -387,8 +366,9 @@ def try_creating_indexing_task(
payload.index_attempt_id = index_attempt_id
payload.celery_task_id = result.id
redis_connector_index.set_fence(payload)
except Exception:
redis_connector_index.set_fence(None)
redis_connector_index.set_fence(payload)
task_logger.exception(
f"Unexpected exception: "
f"tenant={tenant_id} "
@@ -519,8 +499,7 @@ def connector_indexing_task(
logger.debug("Sentry DSN not provided, skipping Sentry initialization")
logger.info(
f"Indexing spawned task starting: "
f"attempt={index_attempt_id} "
f"Indexing spawned task starting: attempt={index_attempt_id} "
f"tenant={tenant_id} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id}"
@@ -622,7 +601,7 @@ def connector_indexing_task(
)
# define a callback class
callback = IndexingCallback(
callback = RunIndexingCallback(
redis_connector.stop.fence_key,
redis_connector_index.generator_progress_key,
lock,

View File

@@ -12,7 +12,7 @@ from sqlalchemy.orm import Session
from danswer.background.celery.apps.app_base import task_logger
from danswer.background.celery.celery_utils import extract_ids_from_runnable_connector
from danswer.background.celery.tasks.indexing.tasks import IndexingCallback
from danswer.background.celery.tasks.indexing.tasks import RunIndexingCallback
from danswer.configs.app_configs import ALLOW_SIMULTANEOUS_PRUNING
from danswer.configs.app_configs import JOB_TIMEOUT
from danswer.configs.constants import CELERY_PRUNING_LOCK_TIMEOUT
@@ -38,42 +38,6 @@ from danswer.utils.logger import setup_logger
logger = setup_logger()
def _is_pruning_due(cc_pair: ConnectorCredentialPair) -> bool:
"""Returns boolean indicating if pruning is due.
Next pruning time is calculated as a delta from the last successful prune, or the
last successful indexing if pruning has never succeeded.
TODO(rkuo): consider whether we should allow pruning to be immediately rescheduled
if pruning fails (which is what it does now). A backoff could be reasonable.
"""
# skip pruning if no prune frequency is set
# pruning can still be forced via the API which will run a pruning task directly
if not cc_pair.connector.prune_freq:
return False
# skip pruning if not active
if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE:
return False
# skip pruning if the next scheduled prune time hasn't been reached yet
last_pruned = cc_pair.last_pruned
if not last_pruned:
if not cc_pair.last_successful_index_time:
# if we've never indexed, we can't prune
return False
# if never pruned, use the last time the connector indexed successfully
last_pruned = cc_pair.last_successful_index_time
next_prune = last_pruned + timedelta(seconds=cc_pair.connector.prune_freq)
if datetime.now(timezone.utc) < next_prune:
return False
return True
@shared_task(
name="check_for_pruning",
soft_time_limit=JOB_TIMEOUT,
@@ -105,7 +69,7 @@ def check_for_pruning(self: Task, *, tenant_id: str | None) -> None:
if not cc_pair:
continue
if not _is_pruning_due(cc_pair):
if not is_pruning_due(cc_pair, db_session, r):
continue
tasks_created = try_creating_prune_generator_task(
@@ -126,6 +90,47 @@ def check_for_pruning(self: Task, *, tenant_id: str | None) -> None:
lock_beat.release()
def is_pruning_due(
cc_pair: ConnectorCredentialPair,
db_session: Session,
r: Redis,
) -> bool:
"""Returns an int if pruning is triggered.
The int represents the number of prune tasks generated (in this case, only one
because the task is a long running generator task.)
Returns None if no pruning is triggered (due to not being needed or
other reasons such as simultaneous pruning restrictions.
Checks for scheduling related conditions, then delegates the rest of the checks to
try_creating_prune_generator_task.
"""
# skip pruning if no prune frequency is set
# pruning can still be forced via the API which will run a pruning task directly
if not cc_pair.connector.prune_freq:
return False
# skip pruning if not active
if cc_pair.status != ConnectorCredentialPairStatus.ACTIVE:
return False
# skip pruning if the next scheduled prune time hasn't been reached yet
last_pruned = cc_pair.last_pruned
if not last_pruned:
if not cc_pair.last_successful_index_time:
# if we've never indexed, we can't prune
return False
# if never pruned, use the last time the connector indexed successfully
last_pruned = cc_pair.last_successful_index_time
next_prune = last_pruned + timedelta(seconds=cc_pair.connector.prune_freq)
if datetime.now(timezone.utc) < next_prune:
return False
return True
def try_creating_prune_generator_task(
celery_app: Celery,
cc_pair: ConnectorCredentialPair,
@@ -161,16 +166,10 @@ def try_creating_prune_generator_task(
return None
try:
# skip pruning if already pruning
if redis_connector.prune.fenced:
if redis_connector.prune.fenced: # skip pruning if already pruning
return None
# skip pruning if the cc_pair is deleting
if redis_connector.delete.fenced:
return None
# skip pruning if doc permissions sync is running
if redis_connector.permissions.fenced:
if redis_connector.delete.fenced: # skip pruning if the cc_pair is deleting
return None
db_session.refresh(cc_pair)
@@ -232,8 +231,6 @@ def connector_pruning_generator_task(
pruning_ctx_dict["request_id"] = self.request.id
pruning_ctx.set(pruning_ctx_dict)
task_logger.info(f"Pruning generator starting: cc_pair={cc_pair_id}")
redis_connector = RedisConnector(tenant_id, cc_pair_id)
r = get_redis_client(tenant_id=tenant_id)
@@ -264,11 +261,6 @@ def connector_pruning_generator_task(
)
return
task_logger.info(
f"Pruning generator running connector: "
f"cc_pair={cc_pair_id} "
f"connector_source={cc_pair.connector.source}"
)
runnable_connector = instantiate_connector(
db_session,
cc_pair.connector.source,
@@ -277,13 +269,12 @@ def connector_pruning_generator_task(
cc_pair.credential,
)
callback = IndexingCallback(
callback = RunIndexingCallback(
redis_connector.stop.fence_key,
redis_connector.prune.generator_progress_key,
lock,
r,
)
# the set of all doc ids currently in the source
all_connector_doc_ids: set[str] = extract_ids_from_runnable_connector(
runnable_connector, callback
@@ -305,8 +296,8 @@ def connector_pruning_generator_task(
task_logger.info(
f"Pruning set collected: "
f"cc_pair={cc_pair_id} "
f"connector_source={cc_pair.connector.source} "
f"docs_to_remove={len(doc_ids_to_remove)}"
f"docs_to_remove={len(doc_ids_to_remove)} "
f"doc_source={cc_pair.connector.source}"
)
task_logger.info(
@@ -329,10 +320,10 @@ def connector_pruning_generator_task(
f"Failed to run pruning: cc_pair={cc_pair_id} connector={connector_id}"
)
redis_connector.prune.reset()
redis_connector.prune.generator_clear()
redis_connector.prune.taskset_clear()
redis_connector.prune.set_fence(False)
raise e
finally:
if lock.owned():
lock.release()
task_logger.info(f"Pruning generator finished: cc_pair={cc_pair_id}")

View File

@@ -59,7 +59,7 @@ def document_by_cc_pair_cleanup_task(
connector / credential pair from the access list
(6) delete all relevant entries from postgres
"""
task_logger.debug(f"Task start: tenant={tenant_id} doc={document_id}")
task_logger.info(f"tenant={tenant_id} doc={document_id}")
try:
with get_session_with_tenant(tenant_id) as db_session:
@@ -141,9 +141,7 @@ def document_by_cc_pair_cleanup_task(
return False
except Exception as ex:
if isinstance(ex, RetryError):
task_logger.warning(
f"Tenacity retry failed: num_attempts={ex.last_attempt.attempt_number}"
)
task_logger.info(f"Retry failed: {ex.last_attempt.attempt_number}")
# only set the inner exception if it is of type Exception
e_temp = ex.last_attempt.exception()
@@ -173,21 +171,11 @@ def document_by_cc_pair_cleanup_task(
else:
# This is the last attempt! mark the document as dirty in the db so that it
# eventually gets fixed out of band via stale document reconciliation
task_logger.warning(
f"Max celery task retries reached. Marking doc as dirty for reconciliation: "
task_logger.info(
f"Max retries reached. Marking doc as dirty for reconciliation: "
f"tenant={tenant_id} doc={document_id}"
)
with get_session_with_tenant(tenant_id) as db_session:
# delete the cc pair relationship now and let reconciliation clean it up
# in vespa
delete_document_by_connector_credential_pair__no_commit(
db_session=db_session,
document_id=document_id,
connector_credential_pair_identifier=ConnectorCredentialPairIdentifier(
connector_id=connector_id,
credential_id=credential_id,
),
)
with get_session_with_tenant(tenant_id):
mark_document_as_modified(document_id, db_session)
return False

View File

@@ -13,7 +13,6 @@ from celery.exceptions import SoftTimeLimitExceeded
from celery.result import AsyncResult
from celery.states import READY_STATES
from redis import Redis
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from tenacity import RetryError
@@ -28,7 +27,6 @@ from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
from danswer.configs.constants import DanswerCeleryQueues
from danswer.configs.constants import DanswerRedisLocks
from danswer.db.connector import fetch_connector_by_id
from danswer.db.connector import mark_cc_pair_as_permissions_synced
from danswer.db.connector import mark_ccpair_as_pruned
from danswer.db.connector_credential_pair import add_deletion_failure_message
from danswer.db.connector_credential_pair import (
@@ -60,10 +58,6 @@ from danswer.document_index.interfaces import VespaDocumentFields
from danswer.redis.redis_connector import RedisConnector
from danswer.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
from danswer.redis.redis_connector_delete import RedisConnectorDelete
from danswer.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync
from danswer.redis.redis_connector_doc_perm_sync import (
RedisConnectorPermissionSyncData,
)
from danswer.redis.redis_connector_index import RedisConnectorIndex
from danswer.redis.redis_connector_prune import RedisConnectorPrune
from danswer.redis.redis_document_set import RedisDocumentSet
@@ -168,7 +162,7 @@ def try_generate_stale_document_sync_tasks(
celery_app: Celery,
db_session: Session,
r: Redis,
lock_beat: RedisLock,
lock_beat: redis.lock.Lock,
tenant_id: str | None,
) -> int | None:
# the fence is up, do nothing
@@ -186,12 +180,7 @@ def try_generate_stale_document_sync_tasks(
f"Stale documents found (at least {stale_doc_count}). Generating sync tasks by cc pair."
)
task_logger.info(
"RedisConnector.generate_tasks starting by cc_pair. "
"Documents spanning multiple cc_pairs will only be synced once."
)
docs_to_skip: set[str] = set()
task_logger.info("RedisConnector.generate_tasks starting by cc_pair.")
# rkuo: we could technically sync all stale docs in one big pass.
# but I feel it's more understandable to group the docs by cc_pair
@@ -199,21 +188,22 @@ def try_generate_stale_document_sync_tasks(
cc_pairs = get_connector_credential_pairs(db_session)
for cc_pair in cc_pairs:
rc = RedisConnectorCredentialPair(tenant_id, cc_pair.id)
rc.set_skip_docs(docs_to_skip)
result = rc.generate_tasks(celery_app, db_session, r, lock_beat, tenant_id)
tasks_generated = rc.generate_tasks(
celery_app, db_session, r, lock_beat, tenant_id
)
if result is None:
if tasks_generated is None:
continue
if result[1] == 0:
if tasks_generated == 0:
continue
task_logger.info(
f"RedisConnector.generate_tasks finished for single cc_pair. "
f"cc_pair={cc_pair.id} tasks_generated={result[0]} tasks_possible={result[1]}"
f"cc_pair_id={cc_pair.id} tasks_generated={tasks_generated}"
)
total_tasks_generated += result[0]
total_tasks_generated += tasks_generated
task_logger.info(
f"RedisConnector.generate_tasks finished for all cc_pairs. total_tasks_generated={total_tasks_generated}"
@@ -228,7 +218,7 @@ def try_generate_document_set_sync_tasks(
document_set_id: int,
db_session: Session,
r: Redis,
lock_beat: RedisLock,
lock_beat: redis.lock.Lock,
tenant_id: str | None,
) -> int | None:
lock_beat.reacquire()
@@ -256,11 +246,12 @@ def try_generate_document_set_sync_tasks(
)
# Add all documents that need to be updated into the queue
result = rds.generate_tasks(celery_app, db_session, r, lock_beat, tenant_id)
if result is None:
tasks_generated = rds.generate_tasks(
celery_app, db_session, r, lock_beat, tenant_id
)
if tasks_generated is None:
return None
tasks_generated = result[0]
# Currently we are allowing the sync to proceed with 0 tasks.
# It's possible for sets/groups to be generated initially with no entries
# and they still need to be marked as up to date.
@@ -269,7 +260,7 @@ def try_generate_document_set_sync_tasks(
task_logger.info(
f"RedisDocumentSet.generate_tasks finished. "
f"document_set={document_set.id} tasks_generated={tasks_generated}"
f"document_set_id={document_set.id} tasks_generated={tasks_generated}"
)
# set this only after all tasks have been added
@@ -282,7 +273,7 @@ def try_generate_user_group_sync_tasks(
usergroup_id: int,
db_session: Session,
r: Redis,
lock_beat: RedisLock,
lock_beat: redis.lock.Lock,
tenant_id: str | None,
) -> int | None:
lock_beat.reacquire()
@@ -311,11 +302,12 @@ def try_generate_user_group_sync_tasks(
task_logger.info(
f"RedisUserGroup.generate_tasks starting. usergroup_id={usergroup.id}"
)
result = rug.generate_tasks(celery_app, db_session, r, lock_beat, tenant_id)
if result is None:
tasks_generated = rug.generate_tasks(
celery_app, db_session, r, lock_beat, tenant_id
)
if tasks_generated is None:
return None
tasks_generated = result[0]
# Currently we are allowing the sync to proceed with 0 tasks.
# It's possible for sets/groups to be generated initially with no entries
# and they still need to be marked as up to date.
@@ -324,7 +316,7 @@ def try_generate_user_group_sync_tasks(
task_logger.info(
f"RedisUserGroup.generate_tasks finished. "
f"usergroup={usergroup.id} tasks_generated={tasks_generated}"
f"usergroup_id={usergroup.id} tasks_generated={tasks_generated}"
)
# set this only after all tasks have been added
@@ -444,22 +436,11 @@ def monitor_connector_deletion_taskset(
db_session, cc_pair.connector_id, cc_pair.credential_id
)
if len(doc_ids) > 0:
# NOTE(rkuo): if this happens, documents somehow got added while
# deletion was in progress. Likely a bug gating off pruning and indexing
# work before deletion starts.
# if this happens, documents somehow got added while deletion was in progress. Likely a bug
# gating off pruning and indexing work before deletion starts
task_logger.warning(
"Connector deletion - documents still found after taskset completion. "
"Clearing the current deletion attempt and allowing deletion to restart: "
f"cc_pair={cc_pair_id} "
f"docs_deleted={fence_data.num_tasks} "
f"docs_remaining={len(doc_ids)}"
)
# We don't want to gloss over why we get into this state, but resetting
# our attempt and letting the deletion restart is a good way to recover
redis_connector.delete.reset()
raise RuntimeError(
"Connector deletion - documents still found after taskset completion"
f"Connector deletion - documents still found after taskset completion: "
f"cc_pair={cc_pair_id} num={len(doc_ids)}"
)
# clean up the rest of the related Postgres entities
@@ -523,7 +504,8 @@ def monitor_connector_deletion_taskset(
f"docs_deleted={fence_data.num_tasks}"
)
redis_connector.delete.reset()
redis_connector.delete.taskset_clear()
redis_connector.delete.set_fence(None)
def monitor_ccpair_pruning_taskset(
@@ -564,47 +546,6 @@ def monitor_ccpair_pruning_taskset(
redis_connector.prune.set_fence(False)
def monitor_ccpair_permissions_taskset(
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
) -> None:
fence_key = key_bytes.decode("utf-8")
cc_pair_id_str = RedisConnector.get_id_from_fence_key(fence_key)
if cc_pair_id_str is None:
task_logger.warning(
f"monitor_ccpair_permissions_taskset: could not parse cc_pair_id from {fence_key}"
)
return
cc_pair_id = int(cc_pair_id_str)
redis_connector = RedisConnector(tenant_id, cc_pair_id)
if not redis_connector.permissions.fenced:
return
initial = redis_connector.permissions.generator_complete
if initial is None:
return
remaining = redis_connector.permissions.get_remaining()
task_logger.info(
f"Permissions sync progress: cc_pair={cc_pair_id} remaining={remaining} initial={initial}"
)
if remaining > 0:
return
payload: RedisConnectorPermissionSyncData | None = (
redis_connector.permissions.payload
)
start_time: datetime | None = payload.started if payload else None
mark_cc_pair_as_permissions_synced(db_session, int(cc_pair_id), start_time)
task_logger.info(f"Successfully synced permissions for cc_pair={cc_pair_id}")
redis_connector.permissions.taskset_clear()
redis_connector.permissions.generator_clear()
redis_connector.permissions.set_fence(None)
def monitor_ccpair_indexing_taskset(
tenant_id: str | None, key_bytes: bytes, r: Redis, db_session: Session
) -> None:
@@ -639,8 +580,8 @@ def monitor_ccpair_indexing_taskset(
progress = redis_connector_index.get_progress()
if progress is not None:
task_logger.info(
f"Connector indexing progress: cc_pair={cc_pair_id} "
f"search_settings={search_settings_id} "
f"Connector indexing progress: cc_pair_id={cc_pair_id} "
f"search_settings_id={search_settings_id} "
f"progress={progress} "
f"elapsed_submitted={elapsed_submitted.total_seconds():.2f}"
)
@@ -655,41 +596,33 @@ def monitor_ccpair_indexing_taskset(
result_state = result.state
status_int = redis_connector_index.get_completion()
if status_int is None: # completion signal not set ... check for errors
# If we get here, and then the task both sets the completion signal and finishes,
# we will incorrectly abort the task. We must check result state, then check
# get_completion again to avoid the race condition.
if status_int is None:
if result_state in READY_STATES:
if redis_connector_index.get_completion() is None:
# IF the task state is READY, THEN generator_complete should be set
# if it isn't, then the worker crashed
msg = (
f"Connector indexing aborted or exceptioned: "
f"attempt={payload.index_attempt_id} "
f"celery_task={payload.celery_task_id} "
f"result_state={result_state} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id} "
f"elapsed_submitted={elapsed_submitted.total_seconds():.2f}"
# IF the task state is READY, THEN generator_complete should be set
# if it isn't, then the worker crashed
task_logger.info(
f"Connector indexing aborted: "
f"cc_pair_id={cc_pair_id} "
f"search_settings_id={search_settings_id} "
f"elapsed_submitted={elapsed_submitted.total_seconds():.2f}"
)
index_attempt = get_index_attempt(db_session, payload.index_attempt_id)
if index_attempt:
mark_attempt_failed(
index_attempt_id=payload.index_attempt_id,
db_session=db_session,
failure_reason="Connector indexing aborted or exceptioned.",
)
task_logger.warning(msg)
index_attempt = get_index_attempt(db_session, payload.index_attempt_id)
if index_attempt:
mark_attempt_failed(
index_attempt_id=payload.index_attempt_id,
db_session=db_session,
failure_reason=msg,
)
redis_connector_index.reset()
redis_connector_index.reset()
return
status_enum = HTTPStatus(status_int)
task_logger.info(
f"Connector indexing finished: cc_pair={cc_pair_id} "
f"search_settings={search_settings_id} "
f"Connector indexing finished: cc_pair_id={cc_pair_id} "
f"search_settings_id={search_settings_id} "
f"status={status_enum.name} "
f"elapsed_submitted={elapsed_submitted.total_seconds():.2f}"
)
@@ -697,37 +630,6 @@ def monitor_ccpair_indexing_taskset(
redis_connector_index.reset()
def get_unfenced_index_attempt_ids(db_session: Session, r: redis.Redis) -> list[int]:
"""Gets a list of unfenced index attempts. Should not be possible, so we'd typically
want to clean them up.
Unfenced = attempt not in terminal state and fence does not exist.
"""
unfenced_attempts: list[int] = []
# do some cleanup before clearing fences
# check the db for any outstanding index attempts
attempts: list[IndexAttempt] = []
attempts.extend(
get_all_index_attempts_by_status(IndexingStatus.NOT_STARTED, db_session)
)
attempts.extend(
get_all_index_attempts_by_status(IndexingStatus.IN_PROGRESS, db_session)
)
for attempt in attempts:
# if attempts exist in the db but we don't detect them in redis, mark them as failed
fence_key = RedisConnectorIndex.fence_key_with_ids(
attempt.connector_credential_pair_id, attempt.search_settings_id
)
if r.exists(fence_key):
continue
unfenced_attempts.append(attempt.id)
return unfenced_attempts
@shared_task(name="monitor_vespa_sync", soft_time_limit=300, bind=True)
def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool:
"""This is a celery beat task that monitors and finalizes metadata sync tasksets.
@@ -741,7 +643,7 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool:
"""
r = get_redis_client(tenant_id=tenant_id)
lock_beat: RedisLock = r.lock(
lock_beat: redis.lock.Lock = r.lock(
DanswerRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK,
timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
)
@@ -766,37 +668,40 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool:
n_pruning = celery_get_queue_length(
DanswerCeleryQueues.CONNECTOR_PRUNING, r_celery
)
n_permissions_sync = celery_get_queue_length(
DanswerCeleryQueues.CONNECTOR_DOC_PERMISSIONS_SYNC, r_celery
)
task_logger.info(
f"Queue lengths: celery={n_celery} "
f"indexing={n_indexing} "
f"sync={n_sync} "
f"deletion={n_deletion} "
f"pruning={n_pruning} "
f"permissions_sync={n_permissions_sync} "
f"pruning={n_pruning}"
)
# Fail any index attempts in the DB that don't have fences
# do some cleanup before clearing fences
# check the db for any outstanding index attempts
with get_session_with_tenant(tenant_id) as db_session:
unfenced_attempt_ids = get_unfenced_index_attempt_ids(db_session, r)
for attempt_id in unfenced_attempt_ids:
attempt = get_index_attempt(db_session, attempt_id)
if not attempt:
continue
attempts: list[IndexAttempt] = []
attempts.extend(
get_all_index_attempts_by_status(IndexingStatus.NOT_STARTED, db_session)
)
attempts.extend(
get_all_index_attempts_by_status(IndexingStatus.IN_PROGRESS, db_session)
)
failure_reason = (
f"Unfenced index attempt found in DB: "
f"index_attempt={attempt.id} "
f"cc_pair={attempt.connector_credential_pair_id} "
f"search_settings={attempt.search_settings_id}"
)
task_logger.warning(failure_reason)
mark_attempt_failed(
attempt.id, db_session, failure_reason=failure_reason
for a in attempts:
# if attempts exist in the db but we don't detect them in redis, mark them as failed
fence_key = RedisConnectorIndex.fence_key_with_ids(
a.connector_credential_pair_id, a.search_settings_id
)
if not r.exists(fence_key):
failure_reason = (
f"Unknown index attempt. Might be left over from a process restart: "
f"index_attempt={a.id} "
f"cc_pair={a.connector_credential_pair_id} "
f"search_settings={a.search_settings_id}"
)
task_logger.warning(failure_reason)
mark_attempt_failed(a.id, db_session, failure_reason=failure_reason)
lock_beat.reacquire()
if r.exists(RedisConnectorCredentialPair.get_fence_key()):
@@ -836,12 +741,6 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool:
with get_session_with_tenant(tenant_id) as db_session:
monitor_ccpair_indexing_taskset(tenant_id, key_bytes, r, db_session)
lock_beat.reacquire()
for key_bytes in r.scan_iter(RedisConnectorPermissionSync.FENCE_PREFIX + "*"):
lock_beat.reacquire()
with get_session_with_tenant(tenant_id) as db_session:
monitor_ccpair_permissions_taskset(tenant_id, key_bytes, r, db_session)
# uncomment for debugging if needed
# r_celery = celery_app.broker_connection().channel().client
# length = celery_get_queue_length(DanswerCeleryQueues.VESPA_METADATA_SYNC, r_celery)
@@ -912,9 +811,7 @@ def vespa_metadata_sync_task(
)
except Exception as ex:
if isinstance(ex, RetryError):
task_logger.warning(
f"Tenacity retry failed: num_attempts={ex.last_attempt.attempt_number}"
)
task_logger.warning(f"Retry failed: {ex.last_attempt.attempt_number}")
# only set the inner exception if it is of type Exception
e_temp = ex.last_attempt.exception()

View File

@@ -29,26 +29,18 @@ JobStatusType = (
def _initializer(
func: Callable, args: list | tuple, kwargs: dict[str, Any] | None = None
) -> Any:
"""Initialize the child process with a fresh SQLAlchemy Engine.
"""Ensure the parent proc's database connections are not touched
in the new connection pool
Based on SQLAlchemy's recommendations to handle multiprocessing:
Based on the recommended approach in the SQLAlchemy docs found:
https://docs.sqlalchemy.org/en/20/core/pooling.html#using-connection-pools-with-multiprocessing-or-os-fork
"""
if kwargs is None:
kwargs = {}
logger.info("Initializing spawned worker child process.")
# Reset the engine in the child process
SqlEngine.reset_engine()
# Optionally set a custom app name for database logging purposes
SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_CHILD_APP_NAME)
# Initialize a new engine with desired parameters
SqlEngine.init_engine(pool_size=4, max_overflow=12, pool_recycle=60)
# Proceed with executing the target function
return func(*args, **kwargs)
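The initializer above exists so that forked worker children never reuse the parent's pooled database connections, per the SQLAlchemy multiprocessing guidance it links. A hedged sketch of how such an initializer might be wired into a child process (pool size and wrapper name are assumptions, not part of this diff):

from concurrent.futures import ProcessPoolExecutor


def run_in_fresh_process(func, *args, **kwargs):
    # Each submitted job runs _initializer in the child process, which resets the
    # SQLAlchemy engine before executing the target function.
    with ProcessPoolExecutor(max_workers=1) as pool:
        return pool.submit(_initializer, func, args, kwargs).result()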

View File

@@ -1,5 +1,7 @@
import time
import traceback
from abc import ABC
from abc import abstractmethod
from datetime import datetime
from datetime import timedelta
from datetime import timezone
@@ -29,10 +31,10 @@ from danswer.db.models import IndexingStatus
from danswer.db.models import IndexModelStatus
from danswer.document_index.factory import get_default_document_index
from danswer.indexing.embedder import DefaultIndexingEmbedder
from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from danswer.indexing.indexing_heartbeat import IndexingHeartbeat
from danswer.indexing.indexing_pipeline import build_indexing_pipeline
from danswer.utils.logger import IndexAttemptSingleton
from danswer.utils.logger import setup_logger
from danswer.utils.logger import TaskAttemptSingleton
from danswer.utils.variable_functionality import global_version
logger = setup_logger()
@@ -40,6 +42,19 @@ logger = setup_logger()
INDEXING_TRACER_NUM_PRINT_ENTRIES = 5
class RunIndexingCallbackInterface(ABC):
"""Defines a callback interface to be passed to
to run_indexing_entrypoint."""
@abstractmethod
def should_stop(self) -> bool:
"""Signal to stop the looping function in flight."""
@abstractmethod
def progress(self, amount: int) -> None:
"""Send progress updates to the caller."""
def _get_connector_runner(
db_session: Session,
attempt: IndexAttempt,
@@ -91,7 +106,7 @@ def _run_indexing(
db_session: Session,
index_attempt: IndexAttempt,
tenant_id: str | None,
callback: IndexingHeartbeatInterface | None = None,
callback: RunIndexingCallbackInterface | None = None,
) -> None:
"""
1. Get documents which are either new or updated from specified application
@@ -123,7 +138,13 @@ def _run_indexing(
embedding_model = DefaultIndexingEmbedder.from_db_search_settings(
search_settings=search_settings,
callback=callback,
heartbeat=IndexingHeartbeat(
index_attempt_id=index_attempt.id,
db_session=db_session,
# let the world know we're still making progress after
# every 10 batches
freq=10,
),
)
indexing_pipeline = build_indexing_pipeline(
@@ -136,7 +157,6 @@ def _run_indexing(
),
db_session=db_session,
tenant_id=tenant_id,
callback=callback,
)
db_cc_pair = index_attempt.connector_credential_pair
@@ -208,9 +228,7 @@ def _run_indexing(
# contents still need to be initially pulled.
if callback:
if callback.should_stop():
raise RuntimeError(
"_run_indexing: Connector stop signal detected"
)
raise RuntimeError("Connector stop signal detected")
# TODO: should we move this into the above callback instead?
db_session.refresh(db_cc_pair)
@@ -271,7 +289,7 @@ def _run_indexing(
db_session.commit()
if callback:
callback.progress("_run_indexing", len(doc_batch))
callback.progress(len(doc_batch))
# This new value is updated every batch, so UI can refresh per batch update
update_docs_indexed(
@@ -401,7 +419,7 @@ def run_indexing_entrypoint(
tenant_id: str | None,
connector_credential_pair_id: int,
is_ee: bool = False,
callback: IndexingHeartbeatInterface | None = None,
callback: RunIndexingCallbackInterface | None = None,
) -> None:
try:
if is_ee:
@@ -409,19 +427,17 @@ def run_indexing_entrypoint(
# set the indexing attempt ID so that all log messages from this process
# will have it added as a prefix
TaskAttemptSingleton.set_cc_and_index_id(
IndexAttemptSingleton.set_cc_and_index_id(
index_attempt_id, connector_credential_pair_id
)
with get_session_with_tenant(tenant_id) as db_session:
attempt = transition_attempt_to_in_progress(index_attempt_id, db_session)
tenant_str = ""
if tenant_id is not None:
tenant_str = f" for tenant {tenant_id}"
logger.info(
f"Indexing starting{tenant_str}: "
f"connector='{attempt.connector_credential_pair.connector.name}' "
f"Indexing starting for tenant {tenant_id}: "
if tenant_id is not None
else ""
+ f"connector='{attempt.connector_credential_pair.connector.name}' "
f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' "
f"credentials='{attempt.connector_credential_pair.connector_id}'"
)
@@ -429,8 +445,10 @@ def run_indexing_entrypoint(
_run_indexing(db_session, attempt, tenant_id, callback)
logger.info(
f"Indexing finished{tenant_str}: "
f"connector='{attempt.connector_credential_pair.connector.name}' "
f"Indexing finished for tenant {tenant_id}: "
if tenant_id is not None
else ""
+ f"connector='{attempt.connector_credential_pair.connector.name}' "
f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' "
f"credentials='{attempt.connector_credential_pair.connector_id}'"
)

View File

@@ -0,0 +1,4 @@
def name_sync_external_doc_permissions_task(
cc_pair_id: int, tenant_id: str | None = None
) -> str:
return f"sync_external_doc_permissions_task__{cc_pair_id}"

View File

@@ -14,6 +14,15 @@ from danswer.db.tasks import mark_task_start
from danswer.db.tasks import register_task
def name_cc_prune_task(
connector_id: int | None = None, credential_id: int | None = None
) -> str:
task_name = f"prune_connector_credential_pair_{connector_id}_{credential_id}"
if not connector_id or not credential_id:
task_name = "prune_connector_credential_pair"
return task_name
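For illustration, the helper above yields per-pair names when both ids are present and a generic name otherwise:

assert name_cc_prune_task(1, 2) == "prune_connector_credential_pair_1_2"
assert name_cc_prune_task() == "prune_connector_credential_pair"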
T = TypeVar("T", bound=Callable)

View File

@@ -5,7 +5,7 @@ personas:
# this is for DanswerBot to use when tagged in a non-configured channel
# Careful setting specific IDs, this won't autoincrement the next ID value for postgres
- id: 0
name: "Search"
name: "Knowledge"
description: >
Assistant with access to documents from your Connected Sources.
# Default Prompt objects attached to the persona, see prompts.yaml

View File

@@ -112,7 +112,6 @@ from danswer.tools.tool_implementations.search.search_tool import (
)
from danswer.tools.tool_runner import ToolCallFinalResult
from danswer.utils.logger import setup_logger
from danswer.utils.long_term_log import LongTermLogger
from danswer.utils.timing import log_generator_function_time
logger = setup_logger()
@@ -317,11 +316,6 @@ def stream_chat_message_objects(
retrieval_options = new_msg_req.retrieval_options
alternate_assistant_id = new_msg_req.alternate_assistant_id
# permanent "log" store, used primarily for debugging
long_term_logger = LongTermLogger(
metadata={"user_id": str(user_id), "chat_session_id": str(chat_session_id)}
)
# use alternate persona if alternative assistant id is passed in
if alternate_assistant_id is not None:
persona = get_persona_by_id(
@@ -347,7 +341,6 @@ def stream_chat_message_objects(
persona=persona,
llm_override=new_msg_req.llm_override or chat_session.llm_override,
additional_headers=litellm_additional_headers,
long_term_logger=long_term_logger,
)
except GenAIDisabledException:
raise RuntimeError("LLM is disabled. Can't use chat flow without LLM.")
@@ -605,7 +598,6 @@ def stream_chat_message_objects(
additional_headers=custom_tool_additional_headers,
),
)
tools: list[Tool] = []
for tool_list in tool_dict.values():
tools.extend(tool_list)

View File

@@ -503,7 +503,3 @@ _API_KEY_HASH_ROUNDS_RAW = os.environ.get("API_KEY_HASH_ROUNDS")
API_KEY_HASH_ROUNDS = (
int(_API_KEY_HASH_ROUNDS_RAW) if _API_KEY_HASH_ROUNDS_RAW else None
)
POD_NAME = os.environ.get("POD_NAME")
POD_NAMESPACE = os.environ.get("POD_NAMESPACE")

View File

@@ -60,6 +60,7 @@ KV_GMAIL_CRED_KEY = "gmail_app_credential"
KV_GMAIL_SERVICE_ACCOUNT_KEY = "gmail_service_account_key"
KV_GOOGLE_DRIVE_CRED_KEY = "google_drive_app_credential"
KV_GOOGLE_DRIVE_SERVICE_ACCOUNT_KEY = "google_drive_service_account_key"
KV_SLACK_BOT_TOKENS_CONFIG_KEY = "slack_bot_tokens_config_key"
KV_GEN_AI_KEY_CHECK_TIME = "genai_api_key_last_check_time"
KV_SETTINGS_KEY = "danswer_settings"
KV_CUSTOMER_UUID_KEY = "customer_uuid"
@@ -73,16 +74,12 @@ CELERY_PRIMARY_WORKER_LOCK_TIMEOUT = 120
# needs to be long enough to cover the maximum time it takes to download an object
# if we can get callbacks as object bytes download, we could lower this a lot.
CELERY_INDEXING_LOCK_TIMEOUT = 3 * 60 * 60  # 3 hours
CELERY_INDEXING_LOCK_TIMEOUT = 60 * 60 # 60 min
# needs to be long enough to cover the maximum time it takes to download an object
# if we can get callbacks as object bytes download, we could lower this a lot.
CELERY_PRUNING_LOCK_TIMEOUT = 300 # 5 min
CELERY_PERMISSIONS_SYNC_LOCK_TIMEOUT = 300 # 5 min
CELERY_EXTERNAL_GROUP_SYNC_LOCK_TIMEOUT = 300 # 5 min
DANSWER_REDIS_FUNCTION_LOCK_PREFIX = "da_function_lock:"
@@ -212,17 +209,9 @@ class PostgresAdvisoryLocks(Enum):
class DanswerCeleryQueues:
# Light queue
VESPA_METADATA_SYNC = "vespa_metadata_sync"
DOC_PERMISSIONS_UPSERT = "doc_permissions_upsert"
CONNECTOR_DELETION = "connector_deletion"
# Heavy queue
CONNECTOR_PRUNING = "connector_pruning"
CONNECTOR_DOC_PERMISSIONS_SYNC = "connector_doc_permissions_sync"
CONNECTOR_EXTERNAL_GROUP_SYNC = "connector_external_group_sync"
# Indexing queue
CONNECTOR_INDEXING = "connector_indexing"
@@ -232,18 +221,8 @@ class DanswerRedisLocks:
CHECK_CONNECTOR_DELETION_BEAT_LOCK = "da_lock:check_connector_deletion_beat"
CHECK_PRUNE_BEAT_LOCK = "da_lock:check_prune_beat"
CHECK_INDEXING_BEAT_LOCK = "da_lock:check_indexing_beat"
CHECK_CONNECTOR_DOC_PERMISSIONS_SYNC_BEAT_LOCK = (
"da_lock:check_connector_doc_permissions_sync_beat"
)
CHECK_CONNECTOR_EXTERNAL_GROUP_SYNC_BEAT_LOCK = (
"da_lock:check_connector_external_group_sync_beat"
)
MONITOR_VESPA_SYNC_BEAT_LOCK = "da_lock:monitor_vespa_sync_beat"
CONNECTOR_DOC_PERMISSIONS_SYNC_LOCK_PREFIX = (
"da_lock:connector_doc_permissions_sync"
)
CONNECTOR_EXTERNAL_GROUP_SYNC_LOCK_PREFIX = "da_lock:connector_external_group_sync"
PRUNING_LOCK_PREFIX = "da_lock:pruning"
INDEXING_METADATA_PREFIX = "da_metadata:indexing"

View File

@@ -119,14 +119,3 @@ if _LITELLM_PASS_THROUGH_HEADERS_RAW:
logger.error(
"Failed to parse LITELLM_PASS_THROUGH_HEADERS, must be a valid JSON object"
)
# if specified, will merge the specified JSON with the existing body of the
# request before sending it to the LLM
LITELLM_EXTRA_BODY: dict | None = None
_LITELLM_EXTRA_BODY_RAW = os.environ.get("LITELLM_EXTRA_BODY")
if _LITELLM_EXTRA_BODY_RAW:
try:
LITELLM_EXTRA_BODY = json.loads(_LITELLM_EXTRA_BODY_RAW)
except Exception:
pass

View File

@@ -5,9 +5,9 @@ from io import BytesIO
from typing import Any
from typing import Optional
import boto3 # type: ignore
from botocore.client import Config # type: ignore
from mypy_boto3_s3 import S3Client # type: ignore
import boto3
from botocore.client import Config
from mypy_boto3_s3 import S3Client
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import BlobType

View File

@@ -7,9 +7,9 @@ from danswer.configs.app_configs import CONFLUENCE_CONNECTOR_LABELS_TO_SKIP
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from danswer.configs.app_configs import INDEX_BATCH_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.confluence.onyx_confluence import build_confluence_client
from danswer.connectors.confluence.onyx_confluence import OnyxConfluence
from danswer.connectors.confluence.utils import attachment_to_content
from danswer.connectors.confluence.utils import build_confluence_client
from danswer.connectors.confluence.utils import build_confluence_document_id
from danswer.connectors.confluence.utils import datetime_from_string
from danswer.connectors.confluence.utils import extract_text_from_confluence_html
@@ -70,7 +70,7 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
) -> None:
self.batch_size = batch_size
self.continue_on_failure = continue_on_failure
self._confluence_client: OnyxConfluence | None = None
self.confluence_client: OnyxConfluence | None = None
self.is_cloud = is_cloud
# Remove trailing slash from wiki_base if present
@@ -81,15 +81,15 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
if cql_query:
# if a cql_query is provided, we will use it to fetch the pages
cql_page_query = cql_query
elif space:
# if no cql_query is provided, we will use the space to fetch the pages
cql_page_query += f" and space='{quote(space)}'"
elif page_id:
# if a cql_query is not provided, we will use the page_id to fetch the page
if index_recursively:
cql_page_query += f" and ancestor='{page_id}'"
else:
# if neither a space nor a cql_query is provided, we will use the page_id to fetch the page
cql_page_query += f" and id='{page_id}'"
elif space:
# if no cql_query or page_id is provided, we will use the space to fetch the pages
cql_page_query += f" and space='{quote(space)}'"
self.cql_page_query = cql_page_query
self.cql_time_filter = ""
@@ -97,44 +97,39 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
self.cql_label_filter = ""
if labels_to_skip:
labels_to_skip = list(set(labels_to_skip))
comma_separated_labels = ",".join(
f"'{quote(label)}'" for label in labels_to_skip
)
comma_separated_labels = ",".join(f"'{label}'" for label in labels_to_skip)
self.cql_label_filter = f" and label not in ({comma_separated_labels})"
@property
def confluence_client(self) -> OnyxConfluence:
if self._confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
return self._confluence_client
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
# see https://github.com/atlassian-api/atlassian-python-api/blob/master/atlassian/rest_client.py
# for a list of other hidden constructor args
self._confluence_client = build_confluence_client(
credentials=credentials,
self.confluence_client = build_confluence_client(
credentials_json=credentials,
is_cloud=self.is_cloud,
wiki_base=self.wiki_base,
)
return None
def _get_comment_string_for_page_id(self, page_id: str) -> str:
if self.confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
comment_string = ""
comment_cql = f"type=comment and container='{page_id}'"
comment_cql += self.cql_label_filter
expand = ",".join(_COMMENT_EXPANSION_FIELDS)
for comment in self.confluence_client.paginated_cql_retrieval(
for comments in self.confluence_client.paginated_cql_page_retrieval(
cql=comment_cql,
expand=expand,
):
comment_string += "\nComment:\n"
comment_string += extract_text_from_confluence_html(
confluence_client=self.confluence_client,
confluence_object=comment,
fetched_titles=set(),
)
for comment in comments:
comment_string += "\nComment:\n"
comment_string += extract_text_from_confluence_html(
confluence_client=self.confluence_client,
confluence_object=comment,
)
return comment_string
@@ -146,28 +141,28 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
If its a page, it extracts the text, adds the comments for the document text.
If its an attachment, it just downloads the attachment and converts that into a document.
"""
if self.confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
# The url and the id are the same
object_url = build_confluence_document_id(
self.wiki_base, confluence_object["_links"]["webui"], self.is_cloud
self.wiki_base, confluence_object["_links"]["webui"]
)
object_text = None
# Extract text from page
if confluence_object["type"] == "page":
object_text = extract_text_from_confluence_html(
confluence_client=self.confluence_client,
confluence_object=confluence_object,
fetched_titles={confluence_object.get("title", "")},
self.confluence_client, confluence_object
)
# Add comments to text
object_text += self._get_comment_string_for_page_id(confluence_object["id"])
elif confluence_object["type"] == "attachment":
object_text = attachment_to_content(
confluence_client=self.confluence_client, attachment=confluence_object
self.confluence_client, confluence_object
)
if object_text is None:
# This only happens for attachments that are not parseable
return None
# Get space name
@@ -198,39 +193,44 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
)
def _fetch_document_batches(self) -> GenerateDocumentsOutput:
if self.confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
doc_batch: list[Document] = []
confluence_page_ids: list[str] = []
page_query = self.cql_page_query + self.cql_label_filter + self.cql_time_filter
# Fetch pages as Documents
for page in self.confluence_client.paginated_cql_retrieval(
for page_batch in self.confluence_client.paginated_cql_page_retrieval(
cql=page_query,
expand=",".join(_PAGE_EXPANSION_FIELDS),
limit=self.batch_size,
):
confluence_page_ids.append(page["id"])
doc = self._convert_object_to_document(page)
if doc is not None:
doc_batch.append(doc)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
for page in page_batch:
confluence_page_ids.append(page["id"])
doc = self._convert_object_to_document(page)
if doc is not None:
doc_batch.append(doc)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
# Fetch attachments as Documents
for confluence_page_id in confluence_page_ids:
attachment_cql = f"type=attachment and container='{confluence_page_id}'"
attachment_cql += self.cql_label_filter
# TODO: maybe should add time filter as well?
for attachment in self.confluence_client.paginated_cql_retrieval(
for attachments in self.confluence_client.paginated_cql_page_retrieval(
cql=attachment_cql,
expand=",".join(_ATTACHMENT_EXPANSION_FIELDS),
):
doc = self._convert_object_to_document(attachment)
if doc is not None:
doc_batch.append(doc)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
for attachment in attachments:
doc = self._convert_object_to_document(attachment)
if doc is not None:
doc_batch.append(doc)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
if doc_batch:
yield doc_batch
@@ -255,47 +255,48 @@ class ConfluenceConnector(LoadConnector, PollConnector, SlimConnector):
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> GenerateSlimDocumentOutput:
if self.confluence_client is None:
raise ConnectorMissingCredentialError("Confluence")
doc_metadata_list: list[SlimDocument] = []
restrictions_expand = ",".join(_RESTRICTIONS_EXPANSION_FIELDS)
page_query = self.cql_page_query + self.cql_label_filter
for page in self.confluence_client.cql_paginate_all_expansions(
for pages in self.confluence_client.cql_paginate_all_expansions(
cql=page_query,
expand=restrictions_expand,
):
# If the page has restrictions, add them to the perm_sync_data
# These will be used by doc_sync.py to sync permissions
perm_sync_data = {
"restrictions": page.get("restrictions", {}),
"space_key": page.get("space", {}).get("key"),
}
for page in pages:
# If the page has restrictions, add them to the perm_sync_data
# These will be used by doc_sync.py to sync permissions
perm_sync_data = {
"restrictions": page.get("restrictions", {}),
"space_key": page.get("space", {}).get("key"),
}
doc_metadata_list.append(
SlimDocument(
id=build_confluence_document_id(
self.wiki_base,
page["_links"]["webui"],
self.is_cloud,
),
perm_sync_data=perm_sync_data,
)
)
attachment_cql = f"type=attachment and container='{page['id']}'"
attachment_cql += self.cql_label_filter
for attachment in self.confluence_client.cql_paginate_all_expansions(
cql=attachment_cql,
expand=restrictions_expand,
):
doc_metadata_list.append(
SlimDocument(
id=build_confluence_document_id(
self.wiki_base,
attachment["_links"]["webui"],
self.is_cloud,
self.wiki_base, page["_links"]["webui"]
),
perm_sync_data=perm_sync_data,
)
)
yield doc_metadata_list
doc_metadata_list = []
attachment_cql = f"type=attachment and container='{page['id']}'"
attachment_cql += self.cql_label_filter
for attachments in self.confluence_client.cql_paginate_all_expansions(
cql=attachment_cql,
expand=restrictions_expand,
):
for attachment in attachments:
doc_metadata_list.append(
SlimDocument(
id=build_confluence_document_id(
self.wiki_base, attachment["_links"]["webui"]
),
perm_sync_data=perm_sync_data,
)
)
yield doc_metadata_list
doc_metadata_list = []

View File

@@ -20,10 +20,6 @@ F = TypeVar("F", bound=Callable[..., Any])
RATE_LIMIT_MESSAGE_LOWERCASE = "Rate limit exceeded".lower()
# https://jira.atlassian.com/browse/CONFCLOUD-76433
_PROBLEMATIC_EXPANSIONS = "body.storage.value"
_REPLACEMENT_EXPANSIONS = "body.view.value"
class ConfluenceRateLimitError(Exception):
pass
@@ -84,7 +80,7 @@ def handle_confluence_rate_limit(confluence_call: F) -> F:
def wrapped_call(*args: list[Any], **kwargs: Any) -> Any:
MAX_RETRIES = 5
TIMEOUT = 600
TIMEOUT = 3600
timeout_at = time.monotonic() + TIMEOUT
for attempt in range(MAX_RETRIES):
@@ -99,10 +95,6 @@ def handle_confluence_rate_limit(confluence_call: F) -> F:
return confluence_call(*args, **kwargs)
except HTTPError as e:
delay_until = _handle_http_error(e, attempt)
logger.warning(
f"HTTPError in confluence call. "
f"Retrying in {delay_until} seconds..."
)
while time.monotonic() < delay_until:
# in the future, check a signal here to exit
time.sleep(1)
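The rate-limit wrapper retries a bounded number of times, tracks an overall deadline with time.monotonic, and sleeps in one-second steps so a shutdown signal could eventually interrupt the wait. A standalone sketch of that retry-with-deadline shape, under the assumption that the delay callback returns a number of seconds to wait; call_with_retries, compute_delay_seconds, and the default values are placeholders rather than the project's helper.

import time
from collections.abc import Callable
from typing import Any

def call_with_retries(
    call: Callable[[], Any],
    compute_delay_seconds: Callable[[Exception, int], float],
    max_retries: int = 5,
    timeout: float = 3600.0,
) -> Any:
    timeout_at = time.monotonic() + timeout
    for attempt in range(max_retries):
        if time.monotonic() > timeout_at:
            raise TimeoutError("gave up waiting for the call to succeed")
        try:
            return call()
        except Exception as exc:
            delay_until = time.monotonic() + compute_delay_seconds(exc, attempt)
            # sleep in short increments so a shutdown check could interrupt the wait
            while time.monotonic() < delay_until:
                time.sleep(1)
    raise RuntimeError(f"call failed after {max_retries} retries")

# flaky() fails twice, then succeeds; the backoff here is a flat one second per attempt
attempts = {"n": 0}
def flaky() -> str:
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise ValueError("transient error")
    return "ok"

assert call_with_retries(flaky, lambda exc, attempt: 1.0) == "ok"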
@@ -149,7 +141,7 @@ class OnyxConfluence(Confluence):
def _paginate_url(
self, url_suffix: str, limit: int | None = None
) -> Iterator[dict[str, Any]]:
) -> Iterator[list[dict[str, Any]]]:
"""
This will paginate through the top level query.
"""
@@ -161,43 +153,46 @@ class OnyxConfluence(Confluence):
while url_suffix:
try:
logger.debug(f"Making confluence call to {url_suffix}")
next_response = self.get(url_suffix)
except Exception as e:
logger.warning(f"Error in confluence call to {url_suffix}")
# If the problematic expansion is in the url, replace it
# with the replacement expansion and try again
# If that fails, raise the error
if _PROBLEMATIC_EXPANSIONS not in url_suffix:
logger.exception(f"Error in confluence call to {url_suffix}")
raise e
logger.warning(
f"Replacing {_PROBLEMATIC_EXPANSIONS} with {_REPLACEMENT_EXPANSIONS}"
" and trying again."
)
url_suffix = url_suffix.replace(
_PROBLEMATIC_EXPANSIONS,
_REPLACEMENT_EXPANSIONS,
)
continue
# yield the results individually
yield from next_response.get("results", [])
logger.exception("Error in danswer_cql: \n")
raise e
yield next_response.get("results", [])
url_suffix = next_response.get("_links", {}).get("next")
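_paginate_url now yields whole result pages (lists of dicts) and follows the _links.next cursor returned by the server until it is absent. A self-contained sketch of that cursor-following loop; fake_get and the hard-coded pages stand in for self.get and real Confluence responses.

from collections.abc import Iterator
from typing import Any

# In-memory stand-in for the REST client: three pages linked by a "next" cursor.
_FAKE_PAGES = {
    "page/1": {"results": [1, 2], "_links": {"next": "page/2"}},
    "page/2": {"results": [3, 4], "_links": {"next": "page/3"}},
    "page/3": {"results": [5], "_links": {}},
}

def fake_get(url_suffix: str) -> dict[str, Any]:
    return _FAKE_PAGES[url_suffix]

def paginate(url_suffix: str) -> Iterator[list[Any]]:
    while url_suffix:
        response = fake_get(url_suffix)
        yield response.get("results", [])  # yield the whole page as a list
        url_suffix = response.get("_links", {}).get("next")  # follow the cursor

assert list(paginate("page/1")) == [[1, 2], [3, 4], [5]]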
def paginated_cql_retrieval(
def paginated_groups_retrieval(
self,
limit: int | None = None,
) -> Iterator[list[dict[str, Any]]]:
return self._paginate_url("rest/api/group", limit)
def paginated_group_members_retrieval(
self,
group_name: str,
limit: int | None = None,
) -> Iterator[list[dict[str, Any]]]:
group_name = quote(group_name)
return self._paginate_url(f"rest/api/group/{group_name}/member", limit)
def paginated_cql_user_retrieval(
self,
cql: str,
expand: str | None = None,
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
"""
The content/search endpoint can be used to fetch pages, attachments, and comments.
"""
) -> Iterator[list[dict[str, Any]]]:
expand_string = f"&expand={expand}" if expand else ""
yield from self._paginate_url(
return self._paginate_url(
f"rest/api/search/user?cql={cql}{expand_string}", limit
)
def paginated_cql_page_retrieval(
self,
cql: str,
expand: str | None = None,
limit: int | None = None,
) -> Iterator[list[dict[str, Any]]]:
expand_string = f"&expand={expand}" if expand else ""
return self._paginate_url(
f"rest/api/content/search?cql={cql}{expand_string}", limit
)
@@ -206,7 +201,7 @@ class OnyxConfluence(Confluence):
cql: str,
expand: str | None = None,
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
) -> Iterator[list[dict[str, Any]]]:
"""
This function will paginate through the top level query first, then
paginate through all of the expansions.
@@ -226,110 +221,6 @@ class OnyxConfluence(Confluence):
for item in data:
_traverse_and_update(item)
for confluence_object in self.paginated_cql_retrieval(cql, expand, limit):
_traverse_and_update(confluence_object)
yield confluence_object
def paginated_cql_user_retrieval(
self,
expand: str | None = None,
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
"""
The search/user endpoint can be used to fetch users.
It's a separate endpoint from the content/search endpoint, used only for users.
Otherwise it's very similar to the content/search endpoint.
"""
cql = "type=user"
url = "rest/api/search/user" if self.cloud else "rest/api/search"
expand_string = f"&expand={expand}" if expand else ""
url += f"?cql={cql}{expand_string}"
yield from self._paginate_url(url, limit)
def paginated_groups_by_user_retrieval(
self,
user: dict[str, Any],
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
"""
This is not an SQL-like query.
It's a Confluence-specific endpoint that can be used to fetch groups.
"""
user_field = "accountId" if self.cloud else "key"
user_value = user["accountId"] if self.cloud else user["userKey"]
# Server uses userKey (but calls it key during the API call), Cloud uses accountId
user_query = f"{user_field}={quote(user_value)}"
url = f"rest/api/user/memberof?{user_query}"
yield from self._paginate_url(url, limit)
def paginated_groups_retrieval(
self,
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
"""
This is not an SQL-like query.
It's a Confluence-specific endpoint that can be used to fetch groups.
"""
yield from self._paginate_url("rest/api/group", limit)
def paginated_group_members_retrieval(
self,
group_name: str,
limit: int | None = None,
) -> Iterator[dict[str, Any]]:
"""
This is not an SQL-like query.
It's a Confluence-specific endpoint that can be used to fetch the members of a group.
THIS DOESN'T WORK FOR SERVER because it breaks when there is a slash in the group name.
E.g. neither "test/group" nor "test%2Fgroup" works for confluence.
"""
group_name = quote(group_name)
yield from self._paginate_url(f"rest/api/group/{group_name}/member", limit)
def _validate_connector_configuration(
credentials: dict[str, Any],
is_cloud: bool,
wiki_base: str,
) -> None:
# test connection with direct client, no retries
confluence_client_without_retries = Confluence(
api_version="cloud" if is_cloud else "latest",
url=wiki_base.rstrip("/"),
username=credentials["confluence_username"] if is_cloud else None,
password=credentials["confluence_access_token"] if is_cloud else None,
token=credentials["confluence_access_token"] if not is_cloud else None,
)
spaces = confluence_client_without_retries.get_all_spaces(limit=1)
if not spaces:
raise RuntimeError(
f"No spaces found at {wiki_base}! "
"Check your credentials and wiki_base and make sure "
"is_cloud is set correctly."
)
def build_confluence_client(
credentials: dict[str, Any],
is_cloud: bool,
wiki_base: str,
) -> OnyxConfluence:
_validate_connector_configuration(
credentials=credentials,
is_cloud=is_cloud,
wiki_base=wiki_base,
)
return OnyxConfluence(
api_version="cloud" if is_cloud else "latest",
# Remove trailing slash from wiki_base if present
url=wiki_base.rstrip("/"),
# passing in username causes issues for Confluence data center
username=credentials["confluence_username"] if is_cloud else None,
password=credentials["confluence_access_token"] if is_cloud else None,
token=credentials["confluence_access_token"] if not is_cloud else None,
backoff_and_retry=True,
max_backoff_retries=10,
max_backoff_seconds=60,
)
for results in self.paginated_cql_page_retrieval(cql, expand, limit):
_traverse_and_update(results)
yield results

View File

@@ -2,7 +2,6 @@ import io
from datetime import datetime
from datetime import timezone
from typing import Any
from urllib.parse import quote
import bs4
@@ -72,9 +71,7 @@ def _get_user(confluence_client: OnyxConfluence, user_id: str) -> str:
def extract_text_from_confluence_html(
confluence_client: OnyxConfluence,
confluence_object: dict[str, Any],
fetched_titles: set[str],
confluence_client: OnyxConfluence, confluence_object: dict[str, Any]
) -> str:
"""Parse a Confluence html page and replace the 'user Id' by the real
User Display Name
@@ -82,7 +79,7 @@ def extract_text_from_confluence_html(
Args:
confluence_object (dict): The confluence object as a dict
confluence_client (Confluence): Confluence client
fetched_titles (set[str]): The titles of the pages that have already been fetched
Returns:
str: loaded and formatted Confluence page
"""
@@ -103,73 +100,6 @@ def extract_text_from_confluence_html(
continue
# Include @ sign for tagging, more clear for LLM
user.replaceWith("@" + _get_user(confluence_client, user_id))
for html_page_reference in soup.findAll("ac:structured-macro"):
# Here, we only want to process page within page macros
if html_page_reference.attrs.get("ac:name") != "include":
continue
page_data = html_page_reference.find("ri:page")
if not page_data:
logger.warning(
f"Skipping retrieval of {html_page_reference} because because page data is missing"
)
continue
page_title = page_data.attrs.get("ri:content-title")
if not page_title:
# only fetch pages that have a title
logger.warning(
f"Skipping retrieval of {html_page_reference} because it has no title"
)
continue
if page_title in fetched_titles:
# prevent recursive fetching of pages
logger.debug(f"Skipping {page_title} because it has already been fetched")
continue
fetched_titles.add(page_title)
# Wrap this in a try-except because there are some pages that might not exist
try:
page_query = f"type=page and title='{quote(page_title)}'"
page_contents: dict[str, Any] | None = None
# Confluence enforces title uniqueness, so we should only get one result here
for page in confluence_client.paginated_cql_retrieval(
cql=page_query,
expand="body.storage.value",
limit=1,
):
page_contents = page
break
except Exception as e:
logger.warning(
f"Error getting page contents for object {confluence_object}: {e}"
)
continue
if not page_contents:
continue
text_from_page = extract_text_from_confluence_html(
confluence_client=confluence_client,
confluence_object=page_contents,
fetched_titles=fetched_titles,
)
html_page_reference.replaceWith(text_from_page)
for html_link_body in soup.findAll("ac:link-body"):
# This extracts the text from inline links in the page so they can be
# represented in the document text as plain text
try:
text_from_link = html_link_body.text
html_link_body.replaceWith(f"(LINK TEXT: {text_from_link})")
except Exception as e:
logger.warning(f"Error processing ac:link-body: {e}")
return format_document_soup(soup)
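The removed branch resolves include macros by fetching the referenced page and re-running the HTML extraction on it, with the fetched_titles set stopping the recursion once a title has already been expanded. A stripped-down sketch of that visited-set guard; resolve_includes, fetch_body, and the includes mapping are illustrative names, not the connector's API.

from collections.abc import Callable

def resolve_includes(
    title: str,
    includes: dict[str, list[str]],
    fetch_body: Callable[[str], str],
    fetched_titles: set[str] | None = None,
) -> str:
    """Concatenate a page body with the bodies of the pages it includes,
    skipping any title that was already fetched (prevents infinite recursion)."""
    fetched_titles = fetched_titles if fetched_titles is not None else set()
    if title in fetched_titles:
        return ""  # already expanded somewhere up the call chain
    fetched_titles.add(title)
    text = fetch_body(title)
    for included in includes.get(title, []):
        text += "\n" + resolve_includes(included, includes, fetch_body, fetched_titles)
    return text

# Two pages that include each other still terminate because of the visited set.
includes = {"A": ["B"], "B": ["A"]}
print(resolve_includes("A", includes, lambda t: f"body of {t}"))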
@@ -223,9 +153,7 @@ def attachment_to_content(
return extracted_text
def build_confluence_document_id(
base_url: str, content_url: str, is_cloud: bool
) -> str:
def build_confluence_document_id(base_url: str, content_url: str) -> str:
"""For confluence, the document id is the page url for a page based document
or the attachment download url for an attachment based document
@@ -236,8 +164,6 @@ def build_confluence_document_id(
Returns:
str: The document id
"""
if is_cloud and not base_url.endswith("/wiki"):
base_url += "/wiki"
return f"{base_url}{content_url}"
@@ -269,3 +195,20 @@ def datetime_from_string(datetime_string: str) -> datetime:
datetime_object = datetime_object.astimezone(timezone.utc)
return datetime_object
def build_confluence_client(
credentials_json: dict[str, Any], is_cloud: bool, wiki_base: str
) -> OnyxConfluence:
return OnyxConfluence(
api_version="cloud" if is_cloud else "latest",
# Remove trailing slash from wiki_base if present
url=wiki_base.rstrip("/"),
# passing in username causes issues for Confluence data center
username=credentials_json["confluence_username"] if is_cloud else None,
password=credentials_json["confluence_access_token"] if is_cloud else None,
token=credentials_json["confluence_access_token"] if not is_cloud else None,
backoff_and_retry=True,
max_backoff_retries=60,
max_backoff_seconds=60,
)

View File

@@ -1,8 +1,8 @@
import os
from collections.abc import Iterable
from datetime import datetime
from datetime import timezone
from typing import Any
from urllib.parse import urlparse
from jira import JIRA
from jira.resources import Issue
@@ -12,93 +12,129 @@ from danswer.configs.app_configs import JIRA_CONNECTOR_LABELS_TO_SKIP
from danswer.configs.app_configs import JIRA_CONNECTOR_MAX_TICKET_SIZE
from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from danswer.connectors.danswer_jira.utils import best_effort_basic_expert_info
from danswer.connectors.danswer_jira.utils import best_effort_get_field_from_issue
from danswer.connectors.danswer_jira.utils import build_jira_client
from danswer.connectors.danswer_jira.utils import build_jira_url
from danswer.connectors.danswer_jira.utils import extract_jira_project
from danswer.connectors.danswer_jira.utils import extract_text_from_adf
from danswer.connectors.danswer_jira.utils import get_comment_strs
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import GenerateSlimDocumentOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.interfaces import SecondsSinceUnixEpoch
from danswer.connectors.interfaces import SlimConnector
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import ConnectorMissingCredentialError
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.connectors.models import SlimDocument
from danswer.utils.logger import setup_logger
logger = setup_logger()
PROJECT_URL_PAT = "projects"
JIRA_API_VERSION = os.environ.get("JIRA_API_VERSION") or "2"
_JIRA_SLIM_PAGE_SIZE = 500
_JIRA_FULL_PAGE_SIZE = 50
def _paginate_jql_search(
jira_client: JIRA,
jql: str,
max_results: int,
fields: str | None = None,
) -> Iterable[Issue]:
start = 0
while True:
logger.debug(
f"Fetching Jira issues with JQL: {jql}, "
f"starting at {start}, max results: {max_results}"
)
issues = jira_client.search_issues(
jql_str=jql,
startAt=start,
maxResults=max_results,
fields=fields,
)
def extract_jira_project(url: str) -> tuple[str, str]:
parsed_url = urlparse(url)
jira_base = parsed_url.scheme + "://" + parsed_url.netloc
for issue in issues:
if isinstance(issue, Issue):
yield issue
else:
raise Exception(f"Found Jira object not of type Issue: {issue}")
# Split the path by '/' and find the position of 'projects' to get the project name
split_path = parsed_url.path.split("/")
if PROJECT_URL_PAT in split_path:
project_pos = split_path.index(PROJECT_URL_PAT)
if len(split_path) > project_pos + 1:
jira_project = split_path[project_pos + 1]
else:
raise ValueError("No project name found in the URL")
else:
raise ValueError("'projects' not found in the URL")
if len(issues) < max_results:
break
return jira_base, jira_project
start += max_results
def extract_text_from_adf(adf: dict | None) -> str:
"""Extracts plain text from Atlassian Document Format:
https://developer.atlassian.com/cloud/jira/platform/apis/document/structure/
WARNING: This function is incomplete and will e.g. skip lists!
"""
texts = []
if adf is not None and "content" in adf:
for block in adf["content"]:
if "content" in block:
for item in block["content"]:
if item["type"] == "text":
texts.append(item["text"])
return " ".join(texts)
def best_effort_get_field_from_issue(jira_issue: Issue, field: str) -> Any:
if hasattr(jira_issue.fields, field):
return getattr(jira_issue.fields, field)
try:
return jira_issue.raw["fields"][field]
except Exception:
return None
def _get_comment_strs(
jira: Issue, comment_email_blacklist: tuple[str, ...] = ()
) -> list[str]:
comment_strs = []
for comment in jira.fields.comment.comments:
try:
body_text = (
comment.body
if JIRA_API_VERSION == "2"
else extract_text_from_adf(comment.raw["body"])
)
if (
hasattr(comment, "author")
and hasattr(comment.author, "emailAddress")
and comment.author.emailAddress in comment_email_blacklist
):
continue # Skip adding comment if author's email is in blacklist
comment_strs.append(body_text)
except Exception as e:
logger.error(f"Failed to process comment due to an error: {e}")
continue
return comment_strs
def fetch_jira_issues_batch(
jira_client: JIRA,
jql: str,
batch_size: int,
start_index: int,
jira_client: JIRA,
batch_size: int = INDEX_BATCH_SIZE,
comment_email_blacklist: tuple[str, ...] = (),
labels_to_skip: set[str] | None = None,
) -> Iterable[Document]:
for issue in _paginate_jql_search(
jira_client=jira_client,
jql=jql,
max_results=batch_size,
):
if labels_to_skip:
if any(label in issue.fields.labels for label in labels_to_skip):
logger.info(
f"Skipping {issue.key} because it has a label to skip. Found "
f"labels: {issue.fields.labels}. Labels to skip: {labels_to_skip}."
)
continue
) -> tuple[list[Document], int]:
doc_batch = []
batch = jira_client.search_issues(
jql,
startAt=start_index,
maxResults=batch_size,
)
for jira in batch:
if type(jira) != Issue:
logger.warning(f"Found Jira object not of type Issue {jira}")
continue
if labels_to_skip and any(
label in jira.fields.labels for label in labels_to_skip
):
logger.info(
f"Skipping {jira.key} because it has a label to skip. Found "
f"labels: {jira.fields.labels}. Labels to skip: {labels_to_skip}."
)
continue
description = (
issue.fields.description
jira.fields.description
if JIRA_API_VERSION == "2"
else extract_text_from_adf(issue.raw["fields"]["description"])
)
comments = get_comment_strs(
issue=issue,
comment_email_blacklist=comment_email_blacklist,
else extract_text_from_adf(jira.raw["fields"]["description"])
)
comments = _get_comment_strs(jira, comment_email_blacklist)
ticket_content = f"{description}\n" + "\n".join(
[f"Comment: {comment}" for comment in comments if comment]
)
@@ -106,53 +142,66 @@ def fetch_jira_issues_batch(
# Check ticket size
if len(ticket_content.encode("utf-8")) > JIRA_CONNECTOR_MAX_TICKET_SIZE:
logger.info(
f"Skipping {issue.key} because it exceeds the maximum size of "
f"Skipping {jira.key} because it exceeds the maximum size of "
f"{JIRA_CONNECTOR_MAX_TICKET_SIZE} bytes."
)
continue
page_url = f"{jira_client.client_info()}/browse/{issue.key}"
page_url = f"{jira_client.client_info()}/browse/{jira.key}"
people = set()
try:
creator = best_effort_get_field_from_issue(issue, "creator")
if basic_expert_info := best_effort_basic_expert_info(creator):
people.add(basic_expert_info)
people.add(
BasicExpertInfo(
display_name=jira.fields.creator.displayName,
email=jira.fields.creator.emailAddress,
)
)
except Exception:
# Author should exist but if not, doesn't matter
pass
try:
assignee = best_effort_get_field_from_issue(issue, "assignee")
if basic_expert_info := best_effort_basic_expert_info(assignee):
people.add(basic_expert_info)
people.add(
BasicExpertInfo(
display_name=jira.fields.assignee.displayName, # type: ignore
email=jira.fields.assignee.emailAddress, # type: ignore
)
)
except Exception:
# Author should exist but if not, doesn't matter
pass
metadata_dict = {}
if priority := best_effort_get_field_from_issue(issue, "priority"):
priority = best_effort_get_field_from_issue(jira, "priority")
if priority:
metadata_dict["priority"] = priority.name
if status := best_effort_get_field_from_issue(issue, "status"):
status = best_effort_get_field_from_issue(jira, "status")
if status:
metadata_dict["status"] = status.name
if resolution := best_effort_get_field_from_issue(issue, "resolution"):
resolution = best_effort_get_field_from_issue(jira, "resolution")
if resolution:
metadata_dict["resolution"] = resolution.name
if labels := best_effort_get_field_from_issue(issue, "labels"):
labels = best_effort_get_field_from_issue(jira, "labels")
if labels:
metadata_dict["label"] = labels
yield Document(
id=page_url,
sections=[Section(link=page_url, text=ticket_content)],
source=DocumentSource.JIRA,
semantic_identifier=issue.fields.summary,
doc_updated_at=time_str_to_utc(issue.fields.updated),
primary_owners=list(people) or None,
# TODO add secondary_owners (commenters) if needed
metadata=metadata_dict,
doc_batch.append(
Document(
id=page_url,
sections=[Section(link=page_url, text=ticket_content)],
source=DocumentSource.JIRA,
semantic_identifier=jira.fields.summary,
doc_updated_at=time_str_to_utc(jira.fields.updated),
primary_owners=list(people) or None,
# TODO add secondary_owners (commenters) if needed
metadata=metadata_dict,
)
)
return doc_batch, len(batch)
class JiraConnector(LoadConnector, PollConnector, SlimConnector):
class JiraConnector(LoadConnector, PollConnector):
def __init__(
self,
jira_project_url: str,
@@ -164,8 +213,8 @@ class JiraConnector(LoadConnector, PollConnector, SlimConnector):
labels_to_skip: list[str] = JIRA_CONNECTOR_LABELS_TO_SKIP,
) -> None:
self.batch_size = batch_size
self.jira_base, self._jira_project = extract_jira_project(jira_project_url)
self._jira_client: JIRA | None = None
self.jira_base, self.jira_project = extract_jira_project(jira_project_url)
self.jira_client: JIRA | None = None
self._comment_email_blacklist = comment_email_blacklist or []
self.labels_to_skip = set(labels_to_skip)
@@ -174,45 +223,54 @@ class JiraConnector(LoadConnector, PollConnector, SlimConnector):
def comment_email_blacklist(self) -> tuple:
return tuple(email.strip() for email in self._comment_email_blacklist)
@property
def jira_client(self) -> JIRA:
if self._jira_client is None:
raise ConnectorMissingCredentialError("Jira")
return self._jira_client
@property
def quoted_jira_project(self) -> str:
# Quote the project name to handle reserved words
return f'"{self._jira_project}"'
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
self._jira_client = build_jira_client(
credentials=credentials,
jira_base=self.jira_base,
)
api_token = credentials["jira_api_token"]
# if user provide an email we assume it's cloud
if "jira_user_email" in credentials:
email = credentials["jira_user_email"]
self.jira_client = JIRA(
basic_auth=(email, api_token),
server=self.jira_base,
options={"rest_api_version": JIRA_API_VERSION},
)
else:
self.jira_client = JIRA(
token_auth=api_token,
server=self.jira_base,
options={"rest_api_version": JIRA_API_VERSION},
)
return None
def load_from_state(self) -> GenerateDocumentsOutput:
jql = f"project = {self.quoted_jira_project}"
if self.jira_client is None:
raise ConnectorMissingCredentialError("Jira")
document_batch = []
for doc in fetch_jira_issues_batch(
jira_client=self.jira_client,
jql=jql,
batch_size=_JIRA_FULL_PAGE_SIZE,
comment_email_blacklist=self.comment_email_blacklist,
labels_to_skip=self.labels_to_skip,
):
document_batch.append(doc)
if len(document_batch) >= self.batch_size:
yield document_batch
document_batch = []
# Quote the project name to handle reserved words
quoted_project = f'"{self.jira_project}"'
start_ind = 0
while True:
doc_batch, fetched_batch_size = fetch_jira_issues_batch(
jql=f"project = {quoted_project}",
start_index=start_ind,
jira_client=self.jira_client,
batch_size=self.batch_size,
comment_email_blacklist=self.comment_email_blacklist,
labels_to_skip=self.labels_to_skip,
)
yield document_batch
if doc_batch:
yield doc_batch
start_ind += fetched_batch_size
if fetched_batch_size < self.batch_size:
break
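The replaced load_from_state pages through results by offset: it asks for batch_size issues starting at start_ind, advances by however many came back, and stops once a page comes back short. A self-contained sketch of that offset-pagination loop; fetch_batch and the in-memory list stand in for jira_client.search_issues.

from collections.abc import Iterator

def fetch_batch(items: list[str], start_index: int, batch_size: int) -> list[str]:
    """Stand-in for a search call taking startAt/maxResults-style arguments."""
    return items[start_index : start_index + batch_size]

def paginate_all(items: list[str], batch_size: int) -> Iterator[list[str]]:
    start = 0
    while True:
        batch = fetch_batch(items, start, batch_size)
        if batch:
            yield batch
        start += len(batch)
        if len(batch) < batch_size:  # a short page means we reached the end
            break

assert list(paginate_all(list("abcde"), batch_size=2)) == [["a", "b"], ["c", "d"], ["e"]]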
def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
if self.jira_client is None:
raise ConnectorMissingCredentialError("Jira")
start_date_str = datetime.fromtimestamp(start, tz=timezone.utc).strftime(
"%Y-%m-%d %H:%M"
)
@@ -220,54 +278,31 @@ class JiraConnector(LoadConnector, PollConnector, SlimConnector):
"%Y-%m-%d %H:%M"
)
# Quote the project name to handle reserved words
quoted_project = f'"{self.jira_project}"'
jql = (
f"project = {self.quoted_jira_project} AND "
f"project = {quoted_project} AND "
f"updated >= '{start_date_str}' AND "
f"updated <= '{end_date_str}'"
)
document_batch = []
for doc in fetch_jira_issues_batch(
jira_client=self.jira_client,
jql=jql,
batch_size=_JIRA_FULL_PAGE_SIZE,
comment_email_blacklist=self.comment_email_blacklist,
labels_to_skip=self.labels_to_skip,
):
document_batch.append(doc)
if len(document_batch) >= self.batch_size:
yield document_batch
document_batch = []
yield document_batch
def retrieve_all_slim_documents(
self,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> GenerateSlimDocumentOutput:
jql = f"project = {self.quoted_jira_project}"
slim_doc_batch = []
for issue in _paginate_jql_search(
jira_client=self.jira_client,
jql=jql,
max_results=_JIRA_SLIM_PAGE_SIZE,
fields="key",
):
issue_key = best_effort_get_field_from_issue(issue, "key")
id = build_jira_url(self.jira_client, issue_key)
slim_doc_batch.append(
SlimDocument(
id=id,
perm_sync_data=None,
)
start_ind = 0
while True:
doc_batch, fetched_batch_size = fetch_jira_issues_batch(
jql=jql,
start_index=start_ind,
jira_client=self.jira_client,
batch_size=self.batch_size,
comment_email_blacklist=self.comment_email_blacklist,
labels_to_skip=self.labels_to_skip,
)
if len(slim_doc_batch) >= _JIRA_SLIM_PAGE_SIZE:
yield slim_doc_batch
slim_doc_batch = []
yield slim_doc_batch
if doc_batch:
yield doc_batch
start_ind += fetched_batch_size
if fetched_batch_size < self.batch_size:
break
if __name__ == "__main__":

View File

@@ -1,136 +1,17 @@
"""Module with custom fields processing functions"""
import os
from typing import Any
from typing import List
from urllib.parse import urlparse
from jira import JIRA
from jira.resources import CustomFieldOption
from jira.resources import Issue
from jira.resources import User
from danswer.connectors.models import BasicExpertInfo
from danswer.utils.logger import setup_logger
logger = setup_logger()
PROJECT_URL_PAT = "projects"
JIRA_API_VERSION = os.environ.get("JIRA_API_VERSION") or "2"
def best_effort_basic_expert_info(obj: Any) -> BasicExpertInfo | None:
display_name = None
email = None
if hasattr(obj, "display_name"):
display_name = obj.display_name
else:
display_name = obj.get("displayName")
if hasattr(obj, "emailAddress"):
email = obj.emailAddress
else:
email = obj.get("emailAddress")
if not email and not display_name:
return None
return BasicExpertInfo(display_name=display_name, email=email)
def best_effort_get_field_from_issue(jira_issue: Issue, field: str) -> Any:
if hasattr(jira_issue.fields, field):
return getattr(jira_issue.fields, field)
try:
return jira_issue.raw["fields"][field]
except Exception:
return None
def extract_text_from_adf(adf: dict | None) -> str:
"""Extracts plain text from Atlassian Document Format:
https://developer.atlassian.com/cloud/jira/platform/apis/document/structure/
WARNING: This function is incomplete and will e.g. skip lists!
"""
texts = []
if adf is not None and "content" in adf:
for block in adf["content"]:
if "content" in block:
for item in block["content"]:
if item["type"] == "text":
texts.append(item["text"])
return " ".join(texts)
def build_jira_url(jira_client: JIRA, issue_key: str) -> str:
return f"{jira_client.client_info()}/browse/{issue_key}"
def build_jira_client(credentials: dict[str, Any], jira_base: str) -> JIRA:
api_token = credentials["jira_api_token"]
# if user provide an email we assume it's cloud
if "jira_user_email" in credentials:
email = credentials["jira_user_email"]
return JIRA(
basic_auth=(email, api_token),
server=jira_base,
options={"rest_api_version": JIRA_API_VERSION},
)
else:
return JIRA(
token_auth=api_token,
server=jira_base,
options={"rest_api_version": JIRA_API_VERSION},
)
def extract_jira_project(url: str) -> tuple[str, str]:
parsed_url = urlparse(url)
jira_base = parsed_url.scheme + "://" + parsed_url.netloc
# Split the path by '/' and find the position of 'projects' to get the project name
split_path = parsed_url.path.split("/")
if PROJECT_URL_PAT in split_path:
project_pos = split_path.index(PROJECT_URL_PAT)
if len(split_path) > project_pos + 1:
jira_project = split_path[project_pos + 1]
else:
raise ValueError("No project name found in the URL")
else:
raise ValueError("'projects' not found in the URL")
return jira_base, jira_project
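extract_jira_project splits the configured project URL into the Jira base URL and the project key by locating the "projects" path segment. A worked example against a hypothetical project URL (the URL is illustrative only):

from urllib.parse import urlparse

def extract_jira_project(url: str) -> tuple[str, str]:
    parsed = urlparse(url)
    jira_base = parsed.scheme + "://" + parsed.netloc
    split_path = parsed.path.split("/")
    if "projects" not in split_path:
        raise ValueError("'projects' not found in the URL")
    project_pos = split_path.index("projects")
    if len(split_path) <= project_pos + 1:
        raise ValueError("No project name found in the URL")
    return jira_base, split_path[project_pos + 1]

assert extract_jira_project(
    "https://example.atlassian.net/jira/software/projects/ENG/boards/1"
) == ("https://example.atlassian.net", "ENG")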
def get_comment_strs(
issue: Issue, comment_email_blacklist: tuple[str, ...] = ()
) -> list[str]:
comment_strs = []
for comment in issue.fields.comment.comments:
try:
body_text = (
comment.body
if JIRA_API_VERSION == "2"
else extract_text_from_adf(comment.raw["body"])
)
if (
hasattr(comment, "author")
and hasattr(comment.author, "emailAddress")
and comment.author.emailAddress in comment_email_blacklist
):
continue # Skip adding comment if author's email is in blacklist
comment_strs.append(body_text)
except Exception as e:
logger.error(f"Failed to process comment due to an error: {e}")
continue
return comment_strs
class CustomFieldExtractor:
@staticmethod
def _process_custom_field_value(value: Any) -> str:

View File

@@ -305,7 +305,6 @@ class GmailConnector(LoadConnector, PollConnector, SlimConnector):
query = _build_time_range_query(time_range_start, time_range_end)
doc_batch = []
for user_email in self._get_all_user_emails():
logger.info(f"Fetching slim threads for user: {user_email}")
gmail_service = get_gmail_service(self.creds, user_email)
for thread in execute_paginated_retrieval(
retrieval_function=gmail_service.users().threads().list,

View File

@@ -15,7 +15,6 @@ from danswer.connectors.google_drive.doc_conversion import (
convert_drive_item_to_document,
)
from danswer.connectors.google_drive.file_retrieval import crawl_folders_for_files
from danswer.connectors.google_drive.file_retrieval import get_all_files_for_oauth
from danswer.connectors.google_drive.file_retrieval import get_all_files_in_my_drive
from danswer.connectors.google_drive.file_retrieval import get_files_in_shared_drive
from danswer.connectors.google_drive.models import GoogleDriveFileType
@@ -83,31 +82,12 @@ def _process_files_batch(
yield doc_batch
def _clean_requested_drive_ids(
requested_drive_ids: set[str],
requested_folder_ids: set[str],
all_drive_ids_available: set[str],
) -> tuple[set[str], set[str]]:
invalid_requested_drive_ids = requested_drive_ids - all_drive_ids_available
filtered_folder_ids = requested_folder_ids - all_drive_ids_available
if invalid_requested_drive_ids:
logger.warning(
f"Some shared drive IDs were not found. IDs: {invalid_requested_drive_ids}"
)
logger.warning("Checking for folder access instead...")
filtered_folder_ids.update(invalid_requested_drive_ids)
valid_requested_drive_ids = requested_drive_ids - invalid_requested_drive_ids
return valid_requested_drive_ids, filtered_folder_ids
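_clean_requested_drive_ids is plain set arithmetic: requested drive ids that turn out not to be real shared drives get retried as folder ids, and only the known drives stay in the drive set. A compact restatement with a worked example; the id strings are made up for illustration.

def clean_requested_drive_ids(
    requested_drive_ids: set[str],
    requested_folder_ids: set[str],
    all_drive_ids_available: set[str],
) -> tuple[set[str], set[str]]:
    # Unknown "drives" are reinterpreted as folders and checked for folder access later.
    invalid = requested_drive_ids - all_drive_ids_available
    folders = (requested_folder_ids - all_drive_ids_available) | invalid
    return requested_drive_ids - invalid, folders

valid, folders = clean_requested_drive_ids(
    requested_drive_ids={"drive-a", "maybe-folder"},
    requested_folder_ids={"folder-x"},
    all_drive_ids_available={"drive-a", "drive-b"},
)
assert valid == {"drive-a"} and folders == {"folder-x", "maybe-folder"}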
class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector):
def __init__(
self,
include_shared_drives: bool = False,
include_my_drives: bool = False,
include_files_shared_with_me: bool = False,
include_shared_drives: bool = True,
shared_drive_urls: str | None = None,
include_my_drives: bool = True,
my_drive_emails: str | None = None,
shared_folder_urls: str | None = None,
batch_size: int = INDEX_BATCH_SIZE,
@@ -140,36 +120,22 @@ class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector):
if (
not include_shared_drives
and not include_my_drives
and not include_files_shared_with_me
and not shared_folder_urls
and not my_drive_emails
and not shared_drive_urls
):
raise ValueError(
"Nothing to index. Please specify at least one of the following: "
"include_shared_drives, include_my_drives, include_files_shared_with_me, "
"shared_folder_urls, or my_drive_emails"
"At least one of include_shared_drives, include_my_drives,"
" or shared_folder_urls must be true"
)
self.batch_size = batch_size
specific_requests_made = False
if bool(shared_drive_urls) or bool(my_drive_emails) or bool(shared_folder_urls):
specific_requests_made = True
self.include_files_shared_with_me = (
False if specific_requests_made else include_files_shared_with_me
)
self.include_my_drives = False if specific_requests_made else include_my_drives
self.include_shared_drives = (
False if specific_requests_made else include_shared_drives
)
self.include_shared_drives = include_shared_drives
shared_drive_url_list = _extract_str_list_from_comma_str(shared_drive_urls)
self._requested_shared_drive_ids = set(
_extract_ids_from_urls(shared_drive_url_list)
)
self.include_my_drives = include_my_drives
self._requested_my_drive_emails = set(
_extract_str_list_from_comma_str(my_drive_emails)
)
@@ -226,72 +192,80 @@ class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector):
def _update_traversed_parent_ids(self, folder_id: str) -> None:
self._retrieved_ids.add(folder_id)
def _get_all_user_emails(self) -> list[str]:
# Start with primary admin email
user_emails = [self.primary_admin_email]
# Only fetch additional users if using service account
if isinstance(self.creds, OAuthCredentials):
return user_emails
def _get_all_user_emails(self, admins_only: bool) -> list[str]:
admin_service = get_admin_service(
creds=self.creds,
user_email=self.primary_admin_email,
)
# Get admins first since they're more likely to have access to most files
for is_admin in [True, False]:
query = "isAdmin=true" if is_admin else "isAdmin=false"
for user in execute_paginated_retrieval(
retrieval_function=admin_service.users().list,
list_key="users",
fields=USER_FIELDS,
domain=self.google_domain,
query=query,
):
if email := user.get("primaryEmail"):
if email not in user_emails:
user_emails.append(email)
return user_emails
query = "isAdmin=true" if admins_only else "isAdmin=false"
emails = []
for user in execute_paginated_retrieval(
retrieval_function=admin_service.users().list,
list_key="users",
fields=USER_FIELDS,
domain=self.google_domain,
query=query,
):
if email := user.get("primaryEmail"):
emails.append(email)
return emails
def _get_all_drive_ids(self) -> set[str]:
primary_drive_service = get_drive_service(
creds=self.creds,
user_email=self.primary_admin_email,
)
is_service_account = isinstance(self.creds, ServiceAccountCredentials)
all_drive_ids = set()
for drive in execute_paginated_retrieval(
retrieval_function=primary_drive_service.drives().list,
list_key="drives",
useDomainAdminAccess=is_service_account,
useDomainAdminAccess=True,
fields="drives(id)",
):
all_drive_ids.add(drive["id"])
if not all_drive_ids:
logger.warning(
"No drives found even though we are indexing shared drives was requested."
)
return all_drive_ids
def _initialize_all_class_variables(self) -> None:
# Get all user emails
# Get admins first because they are more likely to have access to the most files
user_emails = [self.primary_admin_email]
for admins_only in [True, False]:
for email in self._get_all_user_emails(admins_only=admins_only):
if email not in user_emails:
user_emails.append(email)
self._all_org_emails = user_emails
self._all_drive_ids: set[str] = self._get_all_drive_ids()
# remove drive ids from the folder ids because they are queried differently
self._requested_folder_ids -= self._all_drive_ids
# Remove drive_ids that are not in the all_drive_ids and check them as folders instead
invalid_drive_ids = self._requested_shared_drive_ids - self._all_drive_ids
if invalid_drive_ids:
logger.warning(
f"Some shared drive IDs were not found. IDs: {invalid_drive_ids}"
)
logger.warning("Checking for folder access instead...")
self._requested_folder_ids.update(invalid_drive_ids)
if not self.include_shared_drives:
self._requested_shared_drive_ids = set()
elif not self._requested_shared_drive_ids:
self._requested_shared_drive_ids = self._all_drive_ids
def _impersonate_user_for_retrieval(
self,
user_email: str,
is_slim: bool,
filtered_drive_ids: set[str],
filtered_folder_ids: set[str],
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[GoogleDriveFileType]:
drive_service = get_drive_service(self.creds, user_email)
# if we are including my drives, try to get the current user's my
# drive if any of the following are true:
# - include_my_drives is true
# - the current user's email is in the requested emails
if self.include_my_drives or user_email in self._requested_my_drive_emails:
if self.include_my_drives and (
not self._requested_my_drive_emails
or user_email in self._requested_my_drive_emails
):
yield from get_all_files_in_my_drive(
service=drive_service,
update_traversed_ids_func=self._update_traversed_parent_ids,
@@ -300,7 +274,7 @@ class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector):
end=end,
)
remaining_drive_ids = filtered_drive_ids - self._retrieved_ids
remaining_drive_ids = self._requested_shared_drive_ids - self._retrieved_ids
for drive_id in remaining_drive_ids:
yield from get_files_in_shared_drive(
service=drive_service,
@@ -311,7 +285,7 @@ class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector):
end=end,
)
remaining_folders = filtered_folder_ids - self._retrieved_ids
remaining_folders = self._requested_folder_ids - self._retrieved_ids
for folder_id in remaining_folders:
yield from crawl_folders_for_files(
service=drive_service,
@@ -322,141 +296,32 @@ class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector):
end=end,
)
def _manage_service_account_retrieval(
self,
is_slim: bool,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[GoogleDriveFileType]:
all_org_emails: list[str] = self._get_all_user_emails()
all_drive_ids: set[str] = self._get_all_drive_ids()
drive_ids_to_retrieve: set[str] = set()
folder_ids_to_retrieve: set[str] = set()
if self._requested_shared_drive_ids or self._requested_folder_ids:
drive_ids_to_retrieve, folder_ids_to_retrieve = _clean_requested_drive_ids(
requested_drive_ids=self._requested_shared_drive_ids,
requested_folder_ids=self._requested_folder_ids,
all_drive_ids_available=all_drive_ids,
)
elif self.include_shared_drives:
drive_ids_to_retrieve = all_drive_ids
# Process users in parallel using ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=10) as executor:
future_to_email = {
executor.submit(
self._impersonate_user_for_retrieval,
email,
is_slim,
drive_ids_to_retrieve,
folder_ids_to_retrieve,
start,
end,
): email
for email in all_org_emails
}
# Yield results as they complete
for future in as_completed(future_to_email):
yield from future.result()
remaining_folders = (
drive_ids_to_retrieve | folder_ids_to_retrieve
) - self._retrieved_ids
if remaining_folders:
logger.warning(
f"Some folders/drives were not retrieved. IDs: {remaining_folders}"
)
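The service-account path fans retrieval out across users with a ThreadPoolExecutor and yields results in completion order via as_completed. A minimal sketch of that fan-out/fan-in shape; fetch_files_for_user is a stub that returns a finished list so the work genuinely happens inside the worker thread, which is an assumption for the sketch rather than a claim about the connector's generators.

from collections.abc import Iterator
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_files_for_user(email: str) -> list[str]:
    """Stand-in for the per-user Drive retrieval."""
    return [f"{email}/file-{i}" for i in range(3)]

def fetch_all_files(emails: list[str]) -> Iterator[str]:
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_email = {executor.submit(fetch_files_for_user, e): e for e in emails}
        # Yield results as workers finish, not in submission order.
        for future in as_completed(future_to_email):
            yield from future.result()

print(sorted(fetch_all_files(["a@example.com", "b@example.com"])))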
def _manage_oauth_retrieval(
self,
is_slim: bool,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[GoogleDriveFileType]:
drive_service = get_drive_service(self.creds, self.primary_admin_email)
if self.include_files_shared_with_me or self.include_my_drives:
yield from get_all_files_for_oauth(
service=drive_service,
include_files_shared_with_me=self.include_files_shared_with_me,
include_my_drives=self.include_my_drives,
include_shared_drives=self.include_shared_drives,
is_slim=is_slim,
start=start,
end=end,
)
all_requested = (
self.include_files_shared_with_me
and self.include_my_drives
and self.include_shared_drives
)
if all_requested:
# If all 3 are true, we already yielded from get_all_files_for_oauth
return
all_drive_ids = self._get_all_drive_ids()
drive_ids_to_retrieve: set[str] = set()
folder_ids_to_retrieve: set[str] = set()
if self._requested_shared_drive_ids or self._requested_folder_ids:
drive_ids_to_retrieve, folder_ids_to_retrieve = _clean_requested_drive_ids(
requested_drive_ids=self._requested_shared_drive_ids,
requested_folder_ids=self._requested_folder_ids,
all_drive_ids_available=all_drive_ids,
)
elif self.include_shared_drives:
drive_ids_to_retrieve = all_drive_ids
for drive_id in drive_ids_to_retrieve:
yield from get_files_in_shared_drive(
service=drive_service,
drive_id=drive_id,
is_slim=is_slim,
update_traversed_ids_func=self._update_traversed_parent_ids,
start=start,
end=end,
)
# Even if no folders were requested, we still check if any drives were requested
# that could be folders.
remaining_folders = folder_ids_to_retrieve - self._retrieved_ids
for folder_id in remaining_folders:
yield from crawl_folders_for_files(
service=drive_service,
parent_id=folder_id,
traversed_parent_ids=self._retrieved_ids,
update_traversed_ids_func=self._update_traversed_parent_ids,
start=start,
end=end,
)
remaining_folders = (
drive_ids_to_retrieve | folder_ids_to_retrieve
) - self._retrieved_ids
if remaining_folders:
logger.warning(
f"Some folders/drives were not retrieved. IDs: {remaining_folders}"
)
def _fetch_drive_items(
self,
is_slim: bool,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[GoogleDriveFileType]:
retrieval_method = (
self._manage_service_account_retrieval
if isinstance(self.creds, ServiceAccountCredentials)
else self._manage_oauth_retrieval
)
return retrieval_method(
is_slim=is_slim,
start=start,
end=end,
)
self._initialize_all_class_variables()
# Process users in parallel using ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=10) as executor:
future_to_email = {
executor.submit(
self._impersonate_user_for_retrieval, email, is_slim, start, end
): email
for email in self._all_org_emails
}
# Yield results as they complete
for future in as_completed(future_to_email):
yield from future.result()
remaining_folders = self._requested_folder_ids - self._retrieved_ids
if remaining_folders:
logger.warning(
f"Some folders/drives were not retrieved. IDs: {remaining_folders}"
)
def _extract_docs_from_google_drive(
self,

View File

@@ -2,7 +2,6 @@ import io
from datetime import datetime
from datetime import timezone
from googleapiclient.discovery import build # type: ignore
from googleapiclient.errors import HttpError # type: ignore
from danswer.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
@@ -49,67 +48,6 @@ def _extract_sections_basic(
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
try:
if mime_type == GDriveMimeType.SPREADSHEET.value:
try:
sheets_service = build(
"sheets", "v4", credentials=service._http.credentials
)
spreadsheet = (
sheets_service.spreadsheets()
.get(spreadsheetId=file["id"])
.execute()
)
sections = []
for sheet in spreadsheet["sheets"]:
sheet_name = sheet["properties"]["title"]
sheet_id = sheet["properties"]["sheetId"]
# Get sheet dimensions
grid_properties = sheet["properties"].get("gridProperties", {})
row_count = grid_properties.get("rowCount", 1000)
column_count = grid_properties.get("columnCount", 26)
# Convert column count to letter (e.g., 26 -> Z, 27 -> AA)
end_column = ""
while column_count:
column_count, remainder = divmod(column_count - 1, 26)
end_column = chr(65 + remainder) + end_column
range_name = f"'{sheet_name}'!A1:{end_column}{row_count}"
try:
result = (
sheets_service.spreadsheets()
.values()
.get(spreadsheetId=file["id"], range=range_name)
.execute()
)
values = result.get("values", [])
if values:
text = f"Sheet: {sheet_name}\n"
for row in values:
text += "\t".join(str(cell) for cell in row) + "\n"
sections.append(
Section(
link=f"{link}#gid={sheet_id}",
text=text,
)
)
except HttpError as e:
logger.warning(
f"Error fetching data for sheet '{sheet_name}': {e}"
)
continue
return sections
except Exception as e:
logger.warning(
f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'."
" Falling back to basic extraction."
)
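The removed spreadsheet branch converts a sheet's column count into its A1-notation end column with a repeated divmod by 26 (26 -> Z, 27 -> AA). That conversion in isolation, with a few spot checks:

def column_count_to_letter(column_count: int) -> str:
    """Convert a 1-based column count to its spreadsheet letter (1 -> A, 26 -> Z, 27 -> AA)."""
    end_column = ""
    while column_count:
        column_count, remainder = divmod(column_count - 1, 26)
        end_column = chr(65 + remainder) + end_column
    return end_column

assert column_count_to_letter(1) == "A"
assert column_count_to_letter(26) == "Z"
assert column_count_to_letter(27) == "AA"
assert column_count_to_letter(52) == "AZ"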
if mime_type in [
GDriveMimeType.DOC.value,
GDriveMimeType.PPT.value,
@@ -127,7 +65,6 @@ def _extract_sections_basic(
.decode("utf-8")
)
return [Section(link=link, text=text)]
elif mime_type in [
GDriveMimeType.PLAIN_TEXT.value,
GDriveMimeType.MARKDOWN.value,

View File

@@ -140,8 +140,8 @@ def get_files_in_shared_drive(
) -> Iterator[GoogleDriveFileType]:
# If we know we are going to folder crawl later, we can cache the folders here
# Get all folders being queried and add them to the traversed set
folder_query = f"mimeType = '{DRIVE_FOLDER_TYPE}'"
folder_query += " and trashed = false"
query = f"mimeType = '{DRIVE_FOLDER_TYPE}'"
query += " and trashed = false"
found_folders = False
for file in execute_paginated_retrieval(
retrieval_function=service.files().list,
@@ -152,7 +152,7 @@ def get_files_in_shared_drive(
supportsAllDrives=True,
includeItemsFromAllDrives=True,
fields="nextPageToken, files(id)",
q=folder_query,
q=query,
):
update_traversed_ids_func(file["id"])
found_folders = True
@@ -160,9 +160,9 @@ def get_files_in_shared_drive(
update_traversed_ids_func(drive_id)
# Get all files in the shared drive
file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'"
file_query += " and trashed = false"
file_query += _generate_time_range_filter(start, end)
query = f"mimeType != '{DRIVE_FOLDER_TYPE}'"
query += " and trashed = false"
query += _generate_time_range_filter(start, end)
yield from execute_paginated_retrieval(
retrieval_function=service.files().list,
list_key="files",
@@ -172,7 +172,7 @@ def get_files_in_shared_drive(
supportsAllDrives=True,
includeItemsFromAllDrives=True,
fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS,
q=file_query,
q=query,
)
@@ -185,16 +185,14 @@ def get_all_files_in_my_drive(
) -> Iterator[GoogleDriveFileType]:
# If we know we are going to folder crawl later, we can cache the folders here
# Get all folders being queried and add them to the traversed set
folder_query = f"mimeType = '{DRIVE_FOLDER_TYPE}'"
folder_query += " and trashed = false"
folder_query += " and 'me' in owners"
query = "trashed = false and 'me' in owners"
found_folders = False
for file in execute_paginated_retrieval(
retrieval_function=service.files().list,
list_key="files",
corpora="user",
fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS,
q=folder_query,
q=query,
):
update_traversed_ids_func(file["id"])
found_folders = True
@@ -202,52 +200,18 @@ def get_all_files_in_my_drive(
update_traversed_ids_func(get_root_folder_id(service))
# Then get the files
file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'"
file_query += " and trashed = false"
file_query += " and 'me' in owners"
file_query += _generate_time_range_filter(start, end)
query = "trashed = false and 'me' in owners"
query += _generate_time_range_filter(start, end)
fields = "files(id, name, mimeType, webViewLink, modifiedTime, createdTime)"
if not is_slim:
fields += ", files(permissions, permissionIds, owners)"
yield from execute_paginated_retrieval(
retrieval_function=service.files().list,
list_key="files",
corpora="user",
fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS,
q=file_query,
)
def get_all_files_for_oauth(
service: Any,
include_files_shared_with_me: bool,
include_my_drives: bool,
# One of the above 2 should be true
include_shared_drives: bool,
is_slim: bool = False,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> Iterator[GoogleDriveFileType]:
should_get_all = (
include_shared_drives and include_my_drives and include_files_shared_with_me
)
corpora = "allDrives" if should_get_all else "user"
file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'"
file_query += " and trashed = false"
file_query += _generate_time_range_filter(start, end)
if not should_get_all:
if include_files_shared_with_me and not include_my_drives:
file_query += " and not 'me' in owners"
if not include_files_shared_with_me and include_my_drives:
file_query += " and 'me' in owners"
yield from execute_paginated_retrieval(
retrieval_function=service.files().list,
list_key="files",
corpora=corpora,
includeItemsFromAllDrives=should_get_all,
supportsAllDrives=should_get_all,
fields=SLIM_FILE_FIELDS if is_slim else FILE_FIELDS,
q=file_query,
q=query,
)
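The file queries above are assembled by string concatenation: a mimeType clause, trashed = false, an ownership clause, and a modified-time range derived from the epoch-second start/end bounds. A hedged sketch of that assembly; generate_time_range_filter and the RFC 3339 formatting are assumptions about what the real _generate_time_range_filter produces, not its actual body.

from datetime import datetime, timezone

DRIVE_FOLDER_TYPE = "application/vnd.google-apps.folder"

def generate_time_range_filter(start: float | None, end: float | None) -> str:
    """Build ' and modifiedTime ...' clauses from optional epoch-second bounds (sketch only)."""
    clause = ""
    if start is not None:
        clause += f" and modifiedTime >= '{datetime.fromtimestamp(start, tz=timezone.utc).isoformat()}'"
    if end is not None:
        clause += f" and modifiedTime <= '{datetime.fromtimestamp(end, tz=timezone.utc).isoformat()}'"
    return clause

file_query = f"mimeType != '{DRIVE_FOLDER_TYPE}'"
file_query += " and trashed = false"
file_query += " and 'me' in owners"
file_query += generate_time_range_filter(start=1700000000.0, end=None)
print(file_query)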

View File

@@ -105,7 +105,7 @@ def execute_paginated_retrieval(
)()
elif e.resp.status == 404 or e.resp.status == 403:
if continue_on_404_or_403:
logger.debug(f"Error executing request: {e}")
logger.warning(f"Error executing request: {e}")
results = {}
else:
raise e

View File

@@ -2,8 +2,8 @@ import os
from sqlalchemy.orm import Session
from danswer.db.models import SlackChannelConfig
from danswer.db.slack_channel_config import fetch_slack_channel_configs
from danswer.db.models import SlackBotConfig
from danswer.db.slack_bot_config import fetch_slack_bot_configs
VALID_SLACK_FILTERS = [
@@ -13,59 +13,53 @@ VALID_SLACK_FILTERS = [
]
def get_slack_channel_config_for_bot_and_channel(
db_session: Session,
slack_bot_id: int,
channel_name: str | None,
) -> SlackChannelConfig | None:
def get_slack_bot_config_for_channel(
channel_name: str | None, db_session: Session
) -> SlackBotConfig | None:
if not channel_name:
return None
slack_bot_configs = fetch_slack_channel_configs(
db_session=db_session, slack_bot_id=slack_bot_id
)
slack_bot_configs = fetch_slack_bot_configs(db_session=db_session)
for config in slack_bot_configs:
if channel_name in config.channel_config["channel_name"]:
if channel_name in config.channel_config["channel_names"]:
return config
return None
def validate_channel_name(
def validate_channel_names(
channel_names: list[str],
current_slack_bot_config_id: int | None,
db_session: Session,
current_slack_bot_id: int,
channel_name: str,
current_slack_channel_config_id: int | None,
) -> str:
"""Make sure that this channel_name does not exist in other Slack channel configs.
Returns a cleaned up channel name (e.g. '#' removed if present)"""
slack_bot_configs = fetch_slack_channel_configs(
db_session=db_session,
slack_bot_id=current_slack_bot_id,
)
cleaned_channel_name = channel_name.lstrip("#").lower()
for slack_channel_config in slack_bot_configs:
if slack_channel_config.id == current_slack_channel_config_id:
) -> list[str]:
"""Make sure that these channel_names don't exist in other slack bot configs.
Returns a list of cleaned up channel names (e.g. '#' removed if present)"""
slack_bot_configs = fetch_slack_bot_configs(db_session=db_session)
cleaned_channel_names = [
channel_name.lstrip("#").lower() for channel_name in channel_names
]
for slack_bot_config in slack_bot_configs:
if slack_bot_config.id == current_slack_bot_config_id:
continue
if cleaned_channel_name == slack_channel_config.channel_config["channel_name"]:
raise ValueError(
f"Channel name '{channel_name}' already exists in "
"another Slack channel config with in Slack Bot with name: "
f"{slack_channel_config.slack_bot.name}"
)
for channel_name in cleaned_channel_names:
if channel_name in slack_bot_config.channel_config["channel_names"]:
raise ValueError(
f"Channel name '{channel_name}' already exists in "
"another slack bot config"
)
return cleaned_channel_name
return cleaned_channel_names
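The validation above lower-cases each name, strips a leading '#', and rejects any name already used by another config. A stripped-down sketch of that cleaning and uniqueness rule, with the database lookup replaced by an in-memory set of existing names:

def validate_channel_name(existing_channel_names: set[str], channel_name: str) -> str:
    """Strip a leading '#', lower-case the name, and refuse duplicates."""
    cleaned = channel_name.lstrip("#").lower()
    if cleaned in existing_channel_names:
        raise ValueError(f"Channel name '{channel_name}' already exists in another config")
    return cleaned

assert validate_channel_name({"general"}, "#Support") == "support"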
# Scaling configurations for multi-tenant Slack channel handling
# Scaling configurations for multi-tenant Slack bot handling
TENANT_LOCK_EXPIRATION = 1800 # How long a pod can hold exclusive access to a tenant before other pods can acquire it
TENANT_HEARTBEAT_INTERVAL = (
15 # How often pods send heartbeats to indicate they are still processing a tenant
60 # How often pods send heartbeats to indicate they are still processing a tenant
)
TENANT_HEARTBEAT_EXPIRATION = (
30 # How long before a tenant's heartbeat expires, allowing other pods to take over
TENANT_HEARTBEAT_EXPIRATION = 180 # How long before a tenant's heartbeat expires, allowing other pods to take over
TENANT_ACQUISITION_INTERVAL = (
60 # How often pods attempt to acquire unprocessed tenants
)
TENANT_ACQUISITION_INTERVAL = 60  # How often pods attempt to acquire unprocessed tenants and check for new tokens
MAX_TENANTS_PER_POD = int(os.getenv("MAX_TENANTS_PER_POD", 50))

View File

@@ -13,7 +13,7 @@ from danswer.connectors.slack.utils import expert_info_from_slack_id
from danswer.connectors.slack.utils import make_slack_api_rate_limited
from danswer.danswerbot.slack.blocks import build_follow_up_resolved_blocks
from danswer.danswerbot.slack.blocks import get_document_feedback_blocks
from danswer.danswerbot.slack.config import get_slack_channel_config_for_bot_and_channel
from danswer.danswerbot.slack.config import get_slack_bot_config_for_channel
from danswer.danswerbot.slack.constants import DISLIKE_BLOCK_ACTION_ID
from danswer.danswerbot.slack.constants import FeedbackVisibility
from danswer.danswerbot.slack.constants import LIKE_BLOCK_ACTION_ID
@@ -117,10 +117,8 @@ def handle_generate_answer_button(
)
with get_session_with_tenant(client.tenant_id) as db_session:
slack_channel_config = get_slack_channel_config_for_bot_and_channel(
db_session=db_session,
slack_bot_id=client.slack_bot_id,
channel_name=channel_name,
slack_bot_config = get_slack_bot_config_for_channel(
channel_name=channel_name, db_session=db_session
)
handle_regular_answer(
@@ -135,7 +133,7 @@ def handle_generate_answer_button(
is_bot_msg=False,
is_bot_dm=False,
),
slack_channel_config=slack_channel_config,
slack_bot_config=slack_bot_config,
receiver_ids=None,
client=client.web_client,
tenant_id=client.tenant_id,
@@ -258,13 +256,11 @@ def handle_followup_button(
channel_name, is_dm = get_channel_name_from_id(
client=client.web_client, channel_id=channel_id
)
slack_channel_config = get_slack_channel_config_for_bot_and_channel(
db_session=db_session,
slack_bot_id=client.slack_bot_id,
channel_name=channel_name,
slack_bot_config = get_slack_bot_config_for_channel(
channel_name=channel_name, db_session=db_session
)
if slack_channel_config:
tag_names = slack_channel_config.channel_config.get("follow_up_tags")
if slack_bot_config:
tag_names = slack_bot_config.channel_config.get("follow_up_tags")
remaining = None
if tag_names:
tag_ids, remaining = fetch_user_ids_from_emails(

View File

@@ -19,8 +19,8 @@ from danswer.danswerbot.slack.utils import respond_in_thread
from danswer.danswerbot.slack.utils import slack_usage_report
from danswer.danswerbot.slack.utils import update_emote_react
from danswer.db.engine import get_session_with_tenant
from danswer.db.models import SlackChannelConfig
from danswer.db.users import add_slack_user_if_not_exists
from danswer.db.models import SlackBotConfig
from danswer.db.users import add_non_web_user_if_not_exists
from danswer.utils.logger import setup_logger
from shared_configs.configs import SLACK_CHANNEL_ID
@@ -106,7 +106,7 @@ def remove_scheduled_feedback_reminder(
def handle_message(
message_info: SlackMessageInfo,
slack_channel_config: SlackChannelConfig | None,
slack_bot_config: SlackBotConfig | None,
client: WebClient,
feedback_reminder_id: str | None,
tenant_id: str | None,
@@ -140,7 +140,7 @@ def handle_message(
)
document_set_names: list[str] | None = None
persona = slack_channel_config.persona if slack_channel_config else None
persona = slack_bot_config.persona if slack_bot_config else None
prompt = None
if persona:
document_set_names = [
@@ -152,8 +152,8 @@ def handle_message(
respond_member_group_list = None
channel_conf = None
if slack_channel_config and slack_channel_config.channel_config:
channel_conf = slack_channel_config.channel_config
if slack_bot_config and slack_bot_config.channel_config:
channel_conf = slack_bot_config.channel_config
if not bypass_filters and "answer_filters" in channel_conf:
if (
"questionmark_prefilter" in channel_conf["answer_filters"]
@@ -213,13 +213,13 @@ def handle_message(
with get_session_with_tenant(tenant_id) as db_session:
if message_info.email:
add_slack_user_if_not_exists(db_session, message_info.email)
add_non_web_user_if_not_exists(db_session, message_info.email)
# first check if we need to respond with a standard answer
used_standard_answer = handle_standard_answers(
message_info=message_info,
receiver_ids=send_to,
slack_channel_config=slack_channel_config,
slack_bot_config=slack_bot_config,
prompt=prompt,
logger=logger,
client=client,
@@ -231,7 +231,7 @@ def handle_message(
# if no standard answer applies, try a regular answer
issue_with_regular_answer = handle_regular_answer(
message_info=message_info,
slack_channel_config=slack_channel_config,
slack_bot_config=slack_bot_config,
receiver_ids=send_to,
client=client,
channel=channel,

View File

@@ -34,8 +34,8 @@ from danswer.danswerbot.slack.utils import SlackRateLimiter
from danswer.danswerbot.slack.utils import update_emote_react
from danswer.db.engine import get_session_with_tenant
from danswer.db.models import Persona
from danswer.db.models import SlackBotConfig
from danswer.db.models import SlackBotResponseType
from danswer.db.models import SlackChannelConfig
from danswer.db.persona import fetch_persona_by_id
from danswer.db.search_settings import get_current_search_settings
from danswer.db.users import get_user_by_email
@@ -81,7 +81,7 @@ def rate_limits(
def handle_regular_answer(
message_info: SlackMessageInfo,
slack_channel_config: SlackChannelConfig | None,
slack_bot_config: SlackBotConfig | None,
receiver_ids: list[str] | None,
client: WebClient,
channel: str,
@@ -96,7 +96,7 @@ def handle_regular_answer(
disable_cot: bool = DANSWER_BOT_DISABLE_COT,
reflexion: bool = ENABLE_DANSWERBOT_REFLEXION,
) -> bool:
channel_conf = slack_channel_config.channel_config if slack_channel_config else None
channel_conf = slack_bot_config.channel_config if slack_bot_config else None
messages = message_info.thread_messages
message_ts_to_respond_to = message_info.msg_to_respond
@@ -108,7 +108,7 @@ def handle_regular_answer(
user = get_user_by_email(message_info.email, db_session)
document_set_names: list[str] | None = None
persona = slack_channel_config.persona if slack_channel_config else None
persona = slack_bot_config.persona if slack_bot_config else None
prompt = None
if persona:
document_set_names = [
@@ -120,9 +120,9 @@ def handle_regular_answer(
bypass_acl = False
if (
slack_channel_config
and slack_channel_config.persona
and slack_channel_config.persona.document_sets
slack_bot_config
and slack_bot_config.persona
and slack_bot_config.persona.document_sets
):
# For Slack channels, use the full document set, admin will be warned when configuring it
# with non-public document sets
@@ -131,8 +131,8 @@ def handle_regular_answer(
# figure out if we want to use citations or quotes
use_citations = (
not DANSWER_BOT_USE_QUOTES
if slack_channel_config is None
else slack_channel_config.response_type == SlackBotResponseType.CITATIONS
if slack_bot_config is None
else slack_bot_config.response_type == SlackBotResponseType.CITATIONS
)
if not message_ts_to_respond_to and not is_bot_msg:
@@ -234,8 +234,8 @@ def handle_regular_answer(
# persona.llm_filter_extraction if persona is not None else True
# )
auto_detect_filters = (
slack_channel_config.enable_auto_filters
if slack_channel_config is not None
slack_bot_config.enable_auto_filters
if slack_bot_config is not None
else False
)
retrieval_details = RetrievalDetails(
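
handle_regular_answer derives two behaviours from the possibly-missing config: whether to answer with citations or quotes, and whether to auto-detect filters. A small sketch of that resolution logic under assumed defaults; the enum and USE_QUOTES_DEFAULT stand in for SlackBotResponseType and DANSWER_BOT_USE_QUOTES:

from enum import Enum

class ResponseType(str, Enum):
    QUOTES = "quotes"
    CITATIONS = "citations"

# Stand-in for the DANSWER_BOT_USE_QUOTES flag referenced in the hunk above.
USE_QUOTES_DEFAULT = False

def resolve_answer_options(
    config_response_type: ResponseType | None,
    config_auto_filters: bool | None,
) -> tuple[bool, bool]:
    """Return (use_citations, auto_detect_filters) given an optional per-channel config."""
    use_citations = (
        not USE_QUOTES_DEFAULT
        if config_response_type is None
        else config_response_type == ResponseType.CITATIONS
    )
    # Auto filters default to off when there is no config row at all.
    auto_detect_filters = bool(config_auto_filters) if config_auto_filters is not None else False
    return use_citations, auto_detect_filters

print(resolve_answer_options(None, None))                 # (True, False)
print(resolve_answer_options(ResponseType.QUOTES, True))  # (False, True)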

View File

@@ -3,7 +3,7 @@ from sqlalchemy.orm import Session
from danswer.danswerbot.slack.models import SlackMessageInfo
from danswer.db.models import Prompt
from danswer.db.models import SlackChannelConfig
from danswer.db.models import SlackBotConfig
from danswer.utils.logger import DanswerLoggingAdapter
from danswer.utils.logger import setup_logger
from danswer.utils.variable_functionality import fetch_versioned_implementation
@@ -14,7 +14,7 @@ logger = setup_logger()
def handle_standard_answers(
message_info: SlackMessageInfo,
receiver_ids: list[str] | None,
slack_channel_config: SlackChannelConfig | None,
slack_bot_config: SlackBotConfig | None,
prompt: Prompt | None,
logger: DanswerLoggingAdapter,
client: WebClient,
@@ -29,7 +29,7 @@ def handle_standard_answers(
return versioned_handle_standard_answers(
message_info=message_info,
receiver_ids=receiver_ids,
slack_channel_config=slack_channel_config,
slack_bot_config=slack_bot_config,
prompt=prompt,
logger=logger,
client=client,
@@ -40,7 +40,7 @@ def handle_standard_answers(
def _handle_standard_answers(
message_info: SlackMessageInfo,
receiver_ids: list[str] | None,
slack_channel_config: SlackChannelConfig | None,
slack_bot_config: SlackBotConfig | None,
prompt: Prompt | None,
logger: DanswerLoggingAdapter,
client: WebClient,
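
handle_standard_answers dispatches to a versioned implementation fetched by module path, falling back to the open-source _handle_standard_answers. A rough sketch of that dispatch pattern; the registry below is purely illustrative and is not the project's variable_functionality API:

from typing import Callable

# Toy registry mapping "module.attribute" to an enterprise override, if one is installed.
_EE_OVERRIDES: dict[str, Callable] = {}

def fetch_versioned_implementation(module: str, attribute: str, default: Callable) -> Callable:
    # Prefer an enterprise-edition override when present, otherwise use the default handler.
    return _EE_OVERRIDES.get(f"{module}.{attribute}", default)

def _handle_standard_answers(message: str) -> bool:
    print(f"no standard answer for: {message!r}")
    return False

def handle_standard_answers(message: str) -> bool:
    impl = fetch_versioned_implementation(
        "danswer.danswerbot.slack.handlers.handle_standard_answers",
        "_handle_standard_answers",
        default=_handle_standard_answers,
    )
    return impl(message)

handle_standard_answers("how do I reset my password?")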

View File

@@ -4,7 +4,6 @@ import signal
import sys
import threading
import time
from collections.abc import Callable
from threading import Event
from types import FrameType
from typing import Any
@@ -17,17 +16,14 @@ from prometheus_client import start_http_server
from slack_sdk import WebClient
from slack_sdk.socket_mode.request import SocketModeRequest
from slack_sdk.socket_mode.response import SocketModeResponse
from sqlalchemy.orm import Session
from danswer.configs.app_configs import POD_NAME
from danswer.configs.app_configs import POD_NAMESPACE
from danswer.configs.constants import DanswerRedisLocks
from danswer.configs.constants import MessageType
from danswer.configs.danswerbot_configs import DANSWER_BOT_REPHRASE_MESSAGE
from danswer.configs.danswerbot_configs import DANSWER_BOT_RESPOND_EVERY_CHANNEL
from danswer.configs.danswerbot_configs import NOTIFY_SLACKBOT_NO_ANSWER
from danswer.connectors.slack.utils import expert_info_from_slack_id
from danswer.danswerbot.slack.config import get_slack_channel_config_for_bot_and_channel
from danswer.danswerbot.slack.config import get_slack_bot_config_for_channel
from danswer.danswerbot.slack.config import MAX_TENANTS_PER_POD
from danswer.danswerbot.slack.config import TENANT_ACQUISITION_INTERVAL
from danswer.danswerbot.slack.config import TENANT_HEARTBEAT_EXPIRATION
@@ -56,20 +52,20 @@ from danswer.danswerbot.slack.handlers.handle_message import (
)
from danswer.danswerbot.slack.handlers.handle_message import schedule_feedback_reminder
from danswer.danswerbot.slack.models import SlackMessageInfo
from danswer.danswerbot.slack.tokens import fetch_tokens
from danswer.danswerbot.slack.utils import check_message_limit
from danswer.danswerbot.slack.utils import decompose_action_id
from danswer.danswerbot.slack.utils import get_channel_name_from_id
from danswer.danswerbot.slack.utils import get_danswer_bot_slack_bot_id
from danswer.danswerbot.slack.utils import get_danswer_bot_app_id
from danswer.danswerbot.slack.utils import read_slack_thread
from danswer.danswerbot.slack.utils import remove_danswer_bot_tag
from danswer.danswerbot.slack.utils import rephrase_slack_message
from danswer.danswerbot.slack.utils import respond_in_thread
from danswer.danswerbot.slack.utils import TenantSocketModeClient
from danswer.db.engine import CURRENT_TENANT_ID_CONTEXTVAR
from danswer.db.engine import get_all_tenant_ids
from danswer.db.engine import get_session_with_tenant
from danswer.db.models import SlackBot
from danswer.db.search_settings import get_current_search_settings
from danswer.db.slack_bot import fetch_slack_bots
from danswer.key_value_store.interface import KvKeyNotFoundError
from danswer.natural_language_processing.search_nlp_models import EmbeddingModel
from danswer.natural_language_processing.search_nlp_models import warm_up_bi_encoder
@@ -79,21 +75,16 @@ from danswer.search.retrieval.search_runner import download_nltk_data
from danswer.server.manage.models import SlackBotTokens
from danswer.utils.logger import setup_logger
from danswer.utils.variable_functionality import set_is_ee_based_on_env_variable
from shared_configs.configs import DISALLOWED_SLACK_BOT_TENANT_LIST
from shared_configs.configs import MODEL_SERVER_HOST
from shared_configs.configs import MODEL_SERVER_PORT
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
from shared_configs.configs import SLACK_CHANNEL_ID
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
logger = setup_logger()
# Prometheus metric for HPA
active_tenants_gauge = Gauge(
"active_tenants",
"Number of active tenants handled by this pod",
["namespace", "pod"],
"active_tenants", "Number of active tenants handled by this pod"
)
# In rare cases, some users have been experiencing a massive amount of trivial messages coming through
@@ -117,10 +108,8 @@ class SlackbotHandler:
def __init__(self) -> None:
logger.info("Initializing SlackbotHandler")
self.tenant_ids: Set[str | None] = set()
# The keys for these dictionaries are tuples of (tenant_id, slack_bot_id)
self.socket_clients: Dict[tuple[str | None, int], TenantSocketModeClient] = {}
self.slack_bot_tokens: Dict[tuple[str | None, int], SlackBotTokens] = {}
self.socket_clients: Dict[str | None, TenantSocketModeClient] = {}
self.slack_bot_tokens: Dict[str | None, SlackBotTokens] = {}
self.running = True
self.pod_id = self.get_pod_id()
self._shutdown_event = Event()
@@ -158,9 +147,7 @@ class SlackbotHandler:
while not self._shutdown_event.is_set():
try:
self.acquire_tenants()
active_tenants_gauge.labels(namespace=POD_NAMESPACE, pod=POD_NAME).set(
len(self.tenant_ids)
)
active_tenants_gauge.set(len(self.tenant_ids))
logger.debug(f"Current active tenants: {len(self.tenant_ids)}")
except Exception as e:
logger.exception(f"Error in Slack acquisition: {e}")
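
The two sides of the gauge hunks differ in whether active_tenants carries namespace/pod labels. A minimal prometheus_client sketch of the labelled variant; the port and label values are illustrative:

from prometheus_client import Gauge, start_http_server

# Labelled variant: one time series per (namespace, pod), useful when several pods share a scrape target.
active_tenants_gauge = Gauge(
    "active_tenants",
    "Number of active tenants handled by this pod",
    ["namespace", "pod"],
)

def report(tenant_ids: set[str], namespace: str, pod: str) -> None:
    active_tenants_gauge.labels(namespace=namespace, pod=pod).set(len(tenant_ids))

if __name__ == "__main__":
    start_http_server(9090)  # metrics exposed at :9090/metrics
    report({"tenant_a", "tenant_b"}, namespace="default", pod="slackbot-0")
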
@@ -175,63 +162,11 @@ class SlackbotHandler:
logger.exception(f"Error in heartbeat loop: {e}")
self._shutdown_event.wait(timeout=TENANT_HEARTBEAT_INTERVAL)
def _manage_clients_per_tenant(
self, db_session: Session, tenant_id: str | None, bot: SlackBot
) -> None:
slack_bot_tokens = SlackBotTokens(
bot_token=bot.bot_token,
app_token=bot.app_token,
)
tenant_bot_pair = (tenant_id, bot.id)
# If the tokens are not set, we need to close the socket client and delete the tokens
# for the tenant and app
if not slack_bot_tokens:
logger.debug(
f"No Slack bot token found for tenant {tenant_id}, bot {bot.id}"
)
if tenant_bot_pair in self.socket_clients:
asyncio.run(self.socket_clients[tenant_bot_pair].close())
del self.socket_clients[tenant_bot_pair]
del self.slack_bot_tokens[tenant_bot_pair]
return
tokens_exist = tenant_bot_pair in self.slack_bot_tokens
tokens_changed = (
tokens_exist and slack_bot_tokens != self.slack_bot_tokens[tenant_bot_pair]
)
if not tokens_exist or tokens_changed:
if tokens_exist:
logger.info(
f"Slack Bot tokens have changed for tenant {tenant_id}, bot {bot.id} - reconnecting"
)
else:
search_settings = get_current_search_settings(db_session)
embedding_model = EmbeddingModel.from_db_model(
search_settings=search_settings,
server_host=MODEL_SERVER_HOST,
server_port=MODEL_SERVER_PORT,
)
warm_up_bi_encoder(embedding_model=embedding_model)
self.slack_bot_tokens[tenant_bot_pair] = slack_bot_tokens
if tenant_bot_pair in self.socket_clients:
asyncio.run(self.socket_clients[tenant_bot_pair].close())
self.start_socket_client(bot.id, tenant_id, slack_bot_tokens)
def acquire_tenants(self) -> None:
tenant_ids = get_all_tenant_ids()
logger.debug(f"Found {len(tenant_ids)} total tenants in Postgres")
for tenant_id in tenant_ids:
if (
DISALLOWED_SLACK_BOT_TENANT_LIST is not None
and tenant_id in DISALLOWED_SLACK_BOT_TENANT_LIST
):
logger.debug(f"Tenant {tenant_id} is in the disallowed list, skipping")
continue
if tenant_id in self.tenant_ids:
logger.debug(f"Tenant {tenant_id} already in self.tenant_ids")
continue
@@ -255,30 +190,63 @@ class SlackbotHandler:
continue
logger.debug(f"Acquired lock for tenant {tenant_id}")
self.tenant_ids.add(tenant_id)
for tenant_id in self.tenant_ids:
token = CURRENT_TENANT_ID_CONTEXTVAR.set(
tenant_id or POSTGRES_DEFAULT_SCHEMA
)
try:
with get_session_with_tenant(tenant_id) as db_session:
try:
bots = fetch_slack_bots(db_session=db_session)
for bot in bots:
self._manage_clients_per_tenant(
db_session=db_session,
tenant_id=tenant_id,
bot=bot,
logger.debug(
f"Setting tenant ID context variable for tenant {tenant_id}"
)
slack_bot_tokens = fetch_tokens()
logger.debug(f"Fetched Slack bot tokens for tenant {tenant_id}")
logger.debug(
f"Reset tenant ID context variable for tenant {tenant_id}"
)
if not slack_bot_tokens:
logger.debug(
f"No Slack bot token found for tenant {tenant_id}"
)
if tenant_id in self.socket_clients:
asyncio.run(self.socket_clients[tenant_id].close())
del self.socket_clients[tenant_id]
del self.slack_bot_tokens[tenant_id]
continue
if (
tenant_id not in self.slack_bot_tokens
or slack_bot_tokens != self.slack_bot_tokens[tenant_id]
):
if tenant_id in self.slack_bot_tokens:
logger.info(
f"Slack Bot tokens have changed for tenant {tenant_id} - reconnecting"
)
else:
search_settings = get_current_search_settings(
db_session
)
embedding_model = EmbeddingModel.from_db_model(
search_settings=search_settings,
server_host=MODEL_SERVER_HOST,
server_port=MODEL_SERVER_PORT,
)
warm_up_bi_encoder(embedding_model=embedding_model)
self.slack_bot_tokens[tenant_id] = slack_bot_tokens
if tenant_id in self.socket_clients:
asyncio.run(self.socket_clients[tenant_id].close())
self.start_socket_client(tenant_id, slack_bot_tokens)
except KvKeyNotFoundError:
logger.debug(f"Missing Slack Bot tokens for tenant {tenant_id}")
if (tenant_id, bot.id) in self.socket_clients:
asyncio.run(self.socket_clients[tenant_id, bot.id].close())
del self.socket_clients[tenant_id, bot.id]
del self.slack_bot_tokens[tenant_id, bot.id]
if tenant_id in self.socket_clients:
asyncio.run(self.socket_clients[tenant_id].close())
del self.socket_clients[tenant_id]
del self.slack_bot_tokens[tenant_id]
except Exception as e:
logger.exception(f"Error handling tenant {tenant_id}: {e}")
finally:
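
The acquire_tenants hunks switch between keying socket clients by tenant alone and by a (tenant_id, bot_id) pair, reconnecting whenever the stored tokens change. A simplified sketch of that keyed registry with token-change detection; DummyClient replaces the real TenantSocketModeClient so the snippet runs offline:

from dataclasses import dataclass

@dataclass(frozen=True)
class Tokens:
    bot_token: str
    app_token: str

class DummyClient:
    def __init__(self, key: tuple) -> None:
        self.key = key

    def connect(self) -> None:
        print(f"connected {self.key}")

    def close(self) -> None:
        print(f"closed {self.key}")

class ClientRegistry:
    """One socket client per (tenant_id, bot_id); reconnect when the stored tokens change."""

    def __init__(self) -> None:
        self.clients: dict[tuple[str | None, int], DummyClient] = {}
        self.tokens: dict[tuple[str | None, int], Tokens] = {}

    def manage(self, tenant_id: str | None, bot_id: int, tokens: Tokens) -> None:
        key = (tenant_id, bot_id)
        if key in self.tokens and self.tokens[key] == tokens:
            return  # nothing changed, keep the existing connection
        if key in self.clients:
            self.clients[key].close()  # tokens changed, drop the stale connection
        self.tokens[key] = tokens
        client = DummyClient(key)
        client.connect()
        self.clients[key] = client

registry = ClientRegistry()
registry.manage("tenant_a", 1, Tokens("xoxb-1", "xapp-1"))
registry.manage("tenant_a", 1, Tokens("xoxb-1", "xapp-1"))  # no-op
registry.manage("tenant_a", 1, Tokens("xoxb-2", "xapp-1"))  # closes and reconnects
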
@@ -297,37 +265,26 @@ class SlackbotHandler:
)
def start_socket_client(
self, slack_bot_id: int, tenant_id: str | None, slack_bot_tokens: SlackBotTokens
self, tenant_id: str | None, slack_bot_tokens: SlackBotTokens
) -> None:
logger.info(
f"Starting socket client for tenant: {tenant_id}, app: {slack_bot_id}"
)
socket_client: TenantSocketModeClient = _get_socket_client(
slack_bot_tokens, tenant_id, slack_bot_id
)
logger.info(f"Starting socket client for tenant {tenant_id}")
socket_client = _get_socket_client(slack_bot_tokens, tenant_id)
# Append the event handler
process_slack_event = create_process_slack_event()
socket_client.socket_mode_request_listeners.append(process_slack_event) # type: ignore
# Establish a WebSocket connection to the Socket Mode servers
logger.info(
f"Connecting socket client for tenant: {tenant_id}, app: {slack_bot_id}"
)
logger.info(f"Connecting socket client for tenant {tenant_id}")
socket_client.connect()
self.socket_clients[tenant_id, slack_bot_id] = socket_client
self.socket_clients[tenant_id] = socket_client
self.tenant_ids.add(tenant_id)
logger.info(
f"Started SocketModeClient for tenant: {tenant_id}, app: {slack_bot_id}"
)
logger.info(f"Started SocketModeClient for tenant {tenant_id}")
def stop_socket_clients(self) -> None:
logger.info(f"Stopping {len(self.socket_clients)} socket clients")
for (tenant_id, slack_bot_id), client in self.socket_clients.items():
for tenant_id, client in self.socket_clients.items():
asyncio.run(client.close())
logger.info(
f"Stopped SocketModeClient for tenant: {tenant_id}, app: {slack_bot_id}"
)
logger.info(f"Stopped SocketModeClient for tenant {tenant_id}")
def shutdown(self, signum: int | None, frame: FrameType | None) -> None:
if not self.running:
@@ -341,16 +298,6 @@ class SlackbotHandler:
logger.info(f"Stopping {len(self.socket_clients)} socket clients")
self.stop_socket_clients()
# Release locks for all tenants
logger.info(f"Releasing locks for {len(self.tenant_ids)} tenants")
for tenant_id in self.tenant_ids:
try:
redis_client = get_redis_client(tenant_id=tenant_id)
redis_client.delete(DanswerRedisLocks.SLACK_BOT_LOCK)
logger.info(f"Released lock for tenant {tenant_id}")
except Exception as e:
logger.error(f"Error releasing lock for tenant {tenant_id}: {e}")
# Wait for background threads to finish (with timeout)
logger.info("Waiting for background threads to finish...")
self.acquire_thread.join(timeout=5)
@@ -411,7 +358,7 @@ def prefilter_requests(req: SocketModeRequest, client: TenantSocketModeClient) -
)
return False
bot_tag_id = get_danswer_bot_slack_bot_id(client.web_client)
bot_tag_id = get_danswer_bot_app_id(client.web_client)
if event_type == "message":
is_dm = event.get("channel_type") == "im"
is_tagged = bot_tag_id and bot_tag_id in msg
@@ -434,15 +381,13 @@ def prefilter_requests(req: SocketModeRequest, client: TenantSocketModeClient) -
)
with get_session_with_tenant(client.tenant_id) as db_session:
slack_channel_config = get_slack_channel_config_for_bot_and_channel(
db_session=db_session,
slack_bot_id=client.slack_bot_id,
channel_name=channel_name,
slack_bot_config = get_slack_bot_config_for_channel(
channel_name=channel_name, db_session=db_session
)
# If DanswerBot is not specifically tagged and the channel is not set to respond to bots, ignore the message
if (not bot_tag_id or bot_tag_id not in msg) and (
not slack_channel_config
or not slack_channel_config.channel_config.get("respond_to_bots")
not slack_bot_config
or not slack_bot_config.channel_config.get("respond_to_bots")
):
channel_specific_logger.info("Ignoring message from bot")
return False
@@ -647,16 +592,14 @@ def process_message(
token = CURRENT_TENANT_ID_CONTEXTVAR.set(client.tenant_id)
try:
with get_session_with_tenant(client.tenant_id) as db_session:
slack_channel_config = get_slack_channel_config_for_bot_and_channel(
db_session=db_session,
slack_bot_id=client.slack_bot_id,
channel_name=channel_name,
slack_bot_config = get_slack_bot_config_for_channel(
channel_name=channel_name, db_session=db_session
)
# Be careful about this default, don't want to accidentally spam every channel
# Users should be able to DM slack bot in their private channels though
if (
slack_channel_config is None
slack_bot_config is None
and not respond_every_channel
# Can't have configs for DMs so don't toss them out
and not is_dm
@@ -667,10 +610,9 @@ def process_message(
return
follow_up = bool(
slack_channel_config
and slack_channel_config.channel_config
and slack_channel_config.channel_config.get("follow_up_tags")
is not None
slack_bot_config
and slack_bot_config.channel_config
and slack_bot_config.channel_config.get("follow_up_tags") is not None
)
feedback_reminder_id = schedule_feedback_reminder(
details=details, client=client.web_client, include_followup=follow_up
@@ -678,7 +620,7 @@ def process_message(
failed = handle_message(
message_info=details,
slack_channel_config=slack_channel_config,
slack_bot_config=slack_bot_config,
client=client.web_client,
feedback_reminder_id=feedback_reminder_id,
tenant_id=client.tenant_id,
@@ -730,32 +672,26 @@ def view_routing(req: SocketModeRequest, client: TenantSocketModeClient) -> None
return process_feedback(req, client)
def create_process_slack_event() -> (
Callable[[TenantSocketModeClient, SocketModeRequest], None]
):
def process_slack_event(
client: TenantSocketModeClient, req: SocketModeRequest
) -> None:
# Always respond right away, if Slack doesn't receive these frequently enough
# it will assume the Bot is DEAD!!! :(
acknowledge_message(req, client)
def process_slack_event(client: TenantSocketModeClient, req: SocketModeRequest) -> None:
# Always respond right away, if Slack doesn't receive these frequently enough
# it will assume the Bot is DEAD!!! :(
acknowledge_message(req, client)
try:
if req.type == "interactive":
if req.payload.get("type") == "block_actions":
return action_routing(req, client)
elif req.payload.get("type") == "view_submission":
return view_routing(req, client)
elif req.type == "events_api" or req.type == "slash_commands":
return process_message(req, client)
except Exception:
logger.exception("Failed to process slack event")
return process_slack_event
try:
if req.type == "interactive":
if req.payload.get("type") == "block_actions":
return action_routing(req, client)
elif req.payload.get("type") == "view_submission":
return view_routing(req, client)
elif req.type == "events_api" or req.type == "slash_commands":
return process_message(req, client)
except Exception as e:
logger.exception(f"Failed to process slack event. Error: {e}")
logger.error(f"Slack request payload: {req.payload}")
def _get_socket_client(
slack_bot_tokens: SlackBotTokens, tenant_id: str | None, slack_bot_id: int
slack_bot_tokens: SlackBotTokens, tenant_id: str | None
) -> TenantSocketModeClient:
# For more info on how to set this up, checkout the docs:
# https://docs.danswer.dev/slack_bot_setup
@@ -764,7 +700,6 @@ def _get_socket_client(
app_token=slack_bot_tokens.app_token,
web_client=WebClient(token=slack_bot_tokens.bot_token),
tenant_id=tenant_id,
slack_bot_id=slack_bot_id,
)

View File

@@ -0,0 +1,28 @@
import os
from typing import cast
from danswer.configs.constants import KV_SLACK_BOT_TOKENS_CONFIG_KEY
from danswer.key_value_store.factory import get_kv_store
from danswer.server.manage.models import SlackBotTokens
def fetch_tokens() -> SlackBotTokens:
# first check env variables
app_token = os.environ.get("DANSWER_BOT_SLACK_APP_TOKEN")
bot_token = os.environ.get("DANSWER_BOT_SLACK_BOT_TOKEN")
if app_token and bot_token:
return SlackBotTokens(app_token=app_token, bot_token=bot_token)
dynamic_config_store = get_kv_store()
return SlackBotTokens(
**cast(dict, dynamic_config_store.load(key=KV_SLACK_BOT_TOKENS_CONFIG_KEY))
)
def save_tokens(
tokens: SlackBotTokens,
) -> None:
dynamic_config_store = get_kv_store()
dynamic_config_store.store(
key=KV_SLACK_BOT_TOKENS_CONFIG_KEY, val=dict(tokens), encrypt=True
)
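
fetch_tokens prefers environment variables and only falls back to the key-value store when both are absent. A compact sketch of that env-first lookup; a plain dict stands in for the encrypted KV store, and the env var names come from the hunk above:

import os
from dataclasses import dataclass

@dataclass
class SlackTokens:
    app_token: str
    bot_token: str

# Stand-in for the encrypted key-value store used in the diff.
_KV_STORE: dict[str, dict] = {"slack_bot_tokens": {"app_token": "xapp-kv", "bot_token": "xoxb-kv"}}

def fetch_tokens() -> SlackTokens:
    app_token = os.environ.get("DANSWER_BOT_SLACK_APP_TOKEN")
    bot_token = os.environ.get("DANSWER_BOT_SLACK_BOT_TOKEN")
    if app_token and bot_token:
        # Environment wins: handy for local runs and CI.
        return SlackTokens(app_token=app_token, bot_token=bot_token)
    return SlackTokens(**_KV_STORE["slack_bot_tokens"])

print(fetch_tokens())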

View File

@@ -30,6 +30,7 @@ from danswer.configs.danswerbot_configs import (
from danswer.connectors.slack.utils import make_slack_api_rate_limited
from danswer.connectors.slack.utils import SlackTextCleaner
from danswer.danswerbot.slack.constants import FeedbackVisibility
from danswer.danswerbot.slack.tokens import fetch_tokens
from danswer.db.engine import get_session_with_tenant
from danswer.db.users import get_user_by_email
from danswer.llm.exceptions import GenAIDisabledException
@@ -46,16 +47,16 @@ from danswer.utils.text_processing import replace_whitespaces_w_space
logger = setup_logger()
_DANSWER_BOT_SLACK_BOT_ID: str | None = None
_DANSWER_BOT_APP_ID: str | None = None
_DANSWER_BOT_MESSAGE_COUNT: int = 0
_DANSWER_BOT_COUNT_START_TIME: float = time.time()
def get_danswer_bot_slack_bot_id(web_client: WebClient) -> Any:
global _DANSWER_BOT_SLACK_BOT_ID
if _DANSWER_BOT_SLACK_BOT_ID is None:
_DANSWER_BOT_SLACK_BOT_ID = web_client.auth_test().get("user_id")
return _DANSWER_BOT_SLACK_BOT_ID
def get_danswer_bot_app_id(web_client: WebClient) -> Any:
global _DANSWER_BOT_APP_ID
if _DANSWER_BOT_APP_ID is None:
_DANSWER_BOT_APP_ID = web_client.auth_test().get("user_id")
return _DANSWER_BOT_APP_ID
def check_message_limit() -> bool:
@@ -136,10 +137,15 @@ def update_emote_react(
def remove_danswer_bot_tag(message_str: str, client: WebClient) -> str:
bot_tag_id = get_danswer_bot_slack_bot_id(web_client=client)
bot_tag_id = get_danswer_bot_app_id(web_client=client)
return re.sub(rf"<@{bot_tag_id}>\s", "", message_str)
def get_web_client() -> WebClient:
slack_tokens = fetch_tokens()
return WebClient(token=slack_tokens.bot_token)
@retry(
tries=DANSWER_BOT_NUM_RETRIES,
delay=0.25,
@@ -431,9 +437,9 @@ def read_slack_thread(
)
message_type = MessageType.USER
else:
self_slack_bot_id = get_danswer_bot_slack_bot_id(client)
self_app_id = get_danswer_bot_app_id(client)
if reply.get("user") == self_slack_bot_id:
if reply.get("user") == self_app_id:
# DanswerBot response
message_type = MessageType.ASSISTANT
user_sem_id = "Assistant"
@@ -576,9 +582,6 @@ def get_feedback_visibility() -> FeedbackVisibility:
class TenantSocketModeClient(SocketModeClient):
def __init__(
self, tenant_id: str | None, slack_bot_id: int, *args: Any, **kwargs: Any
):
def __init__(self, tenant_id: str | None, *args: Any, **kwargs: Any):
super().__init__(*args, **kwargs)
self.tenant_id = tenant_id
self.slack_bot_id = slack_bot_id
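
get_danswer_bot_app_id / get_danswer_bot_slack_bot_id cache the bot's own user id from a single auth_test call in a module-level global. A stripped-down sketch of that memoization; FakeWebClient replaces a real slack_sdk WebClient so the snippet runs without a token:

from typing import Any

class FakeWebClient:
    """Stands in for slack_sdk.WebClient; auth_test() normally hits the Slack API."""

    def __init__(self) -> None:
        self.calls = 0

    def auth_test(self) -> dict[str, Any]:
        self.calls += 1
        return {"user_id": "U0BOT"}

_BOT_USER_ID: str | None = None

def get_bot_user_id(web_client: FakeWebClient) -> str:
    global _BOT_USER_ID
    if _BOT_USER_ID is None:  # only call the API once per process
        _BOT_USER_ID = web_client.auth_test().get("user_id")
    return _BOT_USER_ID

client = FakeWebClient()
print(get_bot_user_id(client), get_bot_user_id(client), client.calls)  # U0BOT U0BOT 1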

View File

@@ -4,7 +4,6 @@ from typing import Any
from typing import Dict
from fastapi import Depends
from fastapi_users.models import ID
from fastapi_users.models import UP
from fastapi_users_db_sqlalchemy import SQLAlchemyUserDatabase
from fastapi_users_db_sqlalchemy.access_token import SQLAlchemyAccessTokenDatabase
@@ -44,10 +43,7 @@ def get_total_users_count(db_session: Session) -> int:
"""
user_count = (
db_session.query(User)
.filter(
~User.email.endswith(get_api_key_email_pattern()), # type: ignore
User.role != UserRole.EXT_PERM_USER,
)
.filter(~User.email.endswith(get_api_key_email_pattern())) # type: ignore
.count()
)
invited_users = len(get_invited_users())
@@ -65,7 +61,7 @@ async def get_user_count() -> int:
# Need to override this because FastAPI Users doesn't give flexibility for backend field creation logic in OAuth flow
class SQLAlchemyUserAdminDB(SQLAlchemyUserDatabase[UP, ID]):
class SQLAlchemyUserAdminDB(SQLAlchemyUserDatabase):
async def create(
self,
create_dict: Dict[str, Any],

View File

@@ -282,32 +282,3 @@ def mark_ccpair_as_pruned(cc_pair_id: int, db_session: Session) -> None:
cc_pair.last_pruned = datetime.now(timezone.utc)
db_session.commit()
def mark_cc_pair_as_permissions_synced(
db_session: Session, cc_pair_id: int, start_time: datetime | None
) -> None:
stmt = select(ConnectorCredentialPair).where(
ConnectorCredentialPair.id == cc_pair_id
)
cc_pair = db_session.scalar(stmt)
if cc_pair is None:
raise ValueError(f"No cc_pair with ID: {cc_pair_id}")
cc_pair.last_time_perm_sync = start_time
db_session.commit()
def mark_cc_pair_as_external_group_synced(db_session: Session, cc_pair_id: int) -> None:
stmt = select(ConnectorCredentialPair).where(
ConnectorCredentialPair.id == cc_pair_id
)
cc_pair = db_session.scalar(stmt)
if cc_pair is None:
raise ValueError(f"No cc_pair with ID: {cc_pair_id}")
# The sync time can be marked after it ran because all group syncs
# are run in full, not polling for changes.
# If this changes, we need to update this function.
cc_pair.last_time_external_group_sync = datetime.now(timezone.utc)
db_session.commit()

View File

@@ -76,10 +76,8 @@ def _add_user_filters(
.where(~UG__CCpair.user_group_id.in_(user_groups))
.correlate(ConnectorCredentialPair)
)
where_clause |= ConnectorCredentialPair.creator_id == user.id
else:
where_clause |= ConnectorCredentialPair.access_type == AccessType.PUBLIC
where_clause |= ConnectorCredentialPair.access_type == AccessType.SYNC
return stmt.where(where_clause)
@@ -389,7 +387,6 @@ def add_credential_to_connector(
)
association = ConnectorCredentialPair(
creator_id=user.id if user else None,
connector_id=connector_id,
credential_id=credential_id,
name=cc_pair_name,

View File

@@ -19,7 +19,6 @@ from sqlalchemy.orm import Session
from sqlalchemy.sql.expression import null
from danswer.configs.constants import DEFAULT_BOOST
from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id
from danswer.db.enums import AccessType
from danswer.db.enums import ConnectorCredentialPairStatus
from danswer.db.feedback import delete_document_feedback_for_documents__no_commit
@@ -47,21 +46,13 @@ def count_documents_by_needs_sync(session: Session) -> int:
"""Get the count of all documents where:
1. last_modified is newer than last_synced
2. last_synced is null (meaning we've never synced)
AND the document has a relationship with a connector/credential pair
TODO: The documents without a relationship with a connector/credential pair
should be cleaned up somehow eventually.
This function executes the query and returns the count of
documents matching the criteria."""
count = (
session.query(func.count(DbDocument.id.distinct()))
session.query(func.count())
.select_from(DbDocument)
.join(
DocumentByConnectorCredentialPair,
DbDocument.id == DocumentByConnectorCredentialPair.id,
)
.filter(
or_(
DbDocument.last_modified > DbDocument.last_synced,
@@ -100,22 +91,6 @@ def construct_document_select_for_connector_credential_pair_by_needs_sync(
return stmt
def get_all_documents_needing_vespa_sync_for_cc_pair(
db_session: Session, cc_pair_id: int
) -> list[DbDocument]:
cc_pair = get_connector_credential_pair_from_id(
cc_pair_id=cc_pair_id, db_session=db_session
)
if not cc_pair:
raise ValueError(f"No CC pair found with ID: {cc_pair_id}")
stmt = construct_document_select_for_connector_credential_pair_by_needs_sync(
cc_pair.connector_id, cc_pair.credential_id
)
return list(db_session.scalars(stmt).all())
def construct_document_select_for_connector_credential_pair(
connector_id: int, credential_id: int | None = None
) -> Select:
@@ -129,21 +104,6 @@ def construct_document_select_for_connector_credential_pair(
return stmt
def get_documents_for_cc_pair(
db_session: Session,
cc_pair_id: int,
) -> list[DbDocument]:
cc_pair = get_connector_credential_pair_from_id(
cc_pair_id=cc_pair_id, db_session=db_session
)
if not cc_pair:
raise ValueError(f"No CC pair found with ID: {cc_pair_id}")
stmt = construct_document_select_for_connector_credential_pair(
connector_id=cc_pair.connector_id, credential_id=cc_pair.credential_id
)
return list(db_session.scalars(stmt).all())
def get_document_ids_for_connector_credential_pair(
db_session: Session, connector_id: int, credential_id: int, limit: int | None = None
) -> list[str]:
@@ -209,7 +169,6 @@ def get_document_connector_counts(
def get_document_counts_for_cc_pairs(
db_session: Session, cc_pair_identifiers: list[ConnectorCredentialPairIdentifier]
) -> Sequence[tuple[int, int, int]]:
"""Returns a sequence of tuples of (connector_id, credential_id, document count)"""
stmt = (
select(
DocumentByConnectorCredentialPair.connector_id,
@@ -347,8 +306,6 @@ def upsert_documents(
]
)
# This does not update the permissions of the document if
# the document already exists.
on_conflict_stmt = insert_stmt.on_conflict_do_update(
index_elements=["id"], # Conflict target
set_={
@@ -366,23 +323,23 @@ def upsert_documents(
def upsert_document_by_connector_credential_pair(
db_session: Session, connector_id: int, credential_id: int, document_ids: list[str]
db_session: Session, document_metadata_batch: list[DocumentMetadata]
) -> None:
"""NOTE: this function is Postgres specific. Not all DBs support the ON CONFLICT clause."""
if not document_ids:
logger.info("`document_ids` is empty. Skipping.")
if not document_metadata_batch:
logger.info("`document_metadata_batch` is empty. Skipping.")
return
insert_stmt = insert(DocumentByConnectorCredentialPair).values(
[
model_to_dict(
DocumentByConnectorCredentialPair(
id=doc_id,
connector_id=connector_id,
credential_id=credential_id,
id=document_metadata.document_id,
connector_id=document_metadata.connector_id,
credential_id=document_metadata.credential_id,
)
)
for doc_id in document_ids
for document_metadata in document_metadata_batch
]
)
# for now, there are no columns to update. If more metadata is added, then this
@@ -443,6 +400,17 @@ def mark_document_as_synced(document_id: str, db_session: Session) -> None:
db_session.commit()
def upsert_documents_complete(
db_session: Session,
document_metadata_batch: list[DocumentMetadata],
) -> None:
upsert_documents(db_session, document_metadata_batch)
upsert_document_by_connector_credential_pair(db_session, document_metadata_batch)
logger.info(
f"Upserted {len(document_metadata_batch)} document store entries into DB"
)
def delete_document_by_connector_credential_pair__no_commit(
db_session: Session,
document_id: str,
@@ -495,6 +463,7 @@ def delete_documents_complete__no_commit(
db_session: Session, document_ids: list[str]
) -> None:
"""This completely deletes the documents from the db, including all foreign key relationships"""
logger.info(f"Deleting {len(document_ids)} documents from the DB")
delete_documents_by_connector_credential_pair__no_commit(db_session, document_ids)
delete_document_feedback_for_documents__no_commit(
document_ids=document_ids, db_session=db_session
@@ -551,7 +520,7 @@ def prepare_to_modify_documents(
db_session.commit() # ensure that we're not in a transaction
lock_acquired = False
for i in range(_NUM_LOCK_ATTEMPTS):
for _ in range(_NUM_LOCK_ATTEMPTS):
try:
with db_session.begin() as transaction:
lock_acquired = acquire_document_locks(
@@ -562,7 +531,7 @@ def prepare_to_modify_documents(
break
except OperationalError as e:
logger.warning(
f"Failed to acquire locks for documents on attempt {i}, retrying. Error: {e}"
f"Failed to acquire locks for documents, retrying. Error: {e}"
)
time.sleep(retry_delay)
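
The upsert helpers above rely on the ON CONFLICT clause, which the docstring flags as Postgres-specific. The same upsert idea can be illustrated stand-alone with sqlite3, which also supports ON CONFLICT ... DO UPDATE; the table schema here is invented for the example:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE document (id TEXT PRIMARY KEY, boost INTEGER)")

def upsert_documents(rows: list[tuple[str, int]]) -> None:
    if not rows:
        print("`rows` is empty. Skipping.")
        return
    # Insert new rows; on a conflicting id, update the existing row instead of failing.
    conn.executemany(
        "INSERT INTO document (id, boost) VALUES (?, ?) "
        "ON CONFLICT(id) DO UPDATE SET boost = excluded.boost",
        rows,
    )
    conn.commit()

upsert_documents([("doc-1", 1), ("doc-2", 1)])
upsert_documents([("doc-1", 5)])  # updates doc-1 rather than raising
print(conn.execute("SELECT id, boost FROM document ORDER BY id").fetchall())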

View File

@@ -189,13 +189,6 @@ class SqlEngine:
return ""
return cls._app_name
@classmethod
def reset_engine(cls) -> None:
with cls._lock:
if cls._engine:
cls._engine.dispose()
cls._engine = None
def get_all_tenant_ids() -> list[str] | list[None]:
if not MULTI_TENANT:
@@ -319,9 +312,7 @@ async def get_async_session_with_tenant(
await session.execute(text(f'SET search_path = "{tenant_id}"'))
if POSTGRES_IDLE_SESSIONS_TIMEOUT:
await session.execute(
text(
f"SET SESSION idle_in_transaction_session_timeout = {POSTGRES_IDLE_SESSIONS_TIMEOUT}"
)
f"SET SESSION idle_in_transaction_session_timeout = {POSTGRES_IDLE_SESSIONS_TIMEOUT}"
)
except Exception:
logger.exception("Error setting search_path.")
@@ -382,9 +373,7 @@ def get_session_with_tenant(
cursor.execute(f'SET search_path = "{tenant_id}"')
if POSTGRES_IDLE_SESSIONS_TIMEOUT:
cursor.execute(
text(
f"SET SESSION idle_in_transaction_session_timeout = {POSTGRES_IDLE_SESSIONS_TIMEOUT}"
)
f"SET SESSION idle_in_transaction_session_timeout = {POSTGRES_IDLE_SESSIONS_TIMEOUT}"
)
finally:
cursor.close()
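
The first engine.py hunk toggles whether the SET SESSION statement passed to session.execute is wrapped in SQLAlchemy's text() construct. Under SQLAlchemy 2.x, Session.execute no longer accepts bare SQL strings, so the wrapper matters; a tiny sketch against an in-memory SQLite engine:

from sqlalchemy import create_engine, text
from sqlalchemy.orm import Session

engine = create_engine("sqlite://")

with Session(engine) as session:
    # Raw SQL must be declared with text() in SQLAlchemy 2.x; a bare string is rejected.
    result = session.execute(text("SELECT 1 + 1"))
    print(result.scalar())  # 2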

View File

@@ -53,7 +53,7 @@ from danswer.db.enums import IndexingStatus
from danswer.db.enums import IndexModelStatus
from danswer.db.enums import TaskStatus
from danswer.db.pydantic_type import PydanticType
from danswer.utils.special_types import JSON_ro
from danswer.key_value_store.interface import JSON_ro
from danswer.file_store.models import FileDescriptor
from danswer.llm.override_models import LLMOverride
from danswer.llm.override_models import PromptOverride
@@ -126,9 +126,8 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
# if specified, controls the assistants that are shown to the user + their order
# if not specified, all assistants are shown
auto_scroll: Mapped[bool] = mapped_column(Boolean, default=True)
chosen_assistants: Mapped[list[int] | None] = mapped_column(
postgresql.JSONB(), nullable=True, default=None
chosen_assistants: Mapped[list[int]] = mapped_column(
postgresql.JSONB(), nullable=False, default=[-2, -1, 0]
)
visible_assistants: Mapped[list[int]] = mapped_column(
postgresql.JSONB(), nullable=False, default=[]
@@ -172,11 +171,8 @@ class User(SQLAlchemyBaseUserTableUUID, Base):
notifications: Mapped[list["Notification"]] = relationship(
"Notification", back_populates="user"
)
cc_pairs: Mapped[list["ConnectorCredentialPair"]] = relationship(
"ConnectorCredentialPair",
back_populates="creator",
primaryjoin="User.id == foreign(ConnectorCredentialPair.creator_id)",
)
# Whether the user has logged in via web. False if user has only used Danswer through Slack bot
has_web_login: Mapped[bool] = mapped_column(Boolean, default=True)
class InputPrompt(Base):
@@ -351,11 +347,11 @@ class StandardAnswer__StandardAnswerCategory(Base):
)
class SlackChannelConfig__StandardAnswerCategory(Base):
__tablename__ = "slack_channel_config__standard_answer_category"
class SlackBotConfig__StandardAnswerCategory(Base):
__tablename__ = "slack_bot_config__standard_answer_category"
slack_channel_config_id: Mapped[int] = mapped_column(
ForeignKey("slack_channel_config.id"), primary_key=True
slack_bot_config_id: Mapped[int] = mapped_column(
ForeignKey("slack_bot_config.id"), primary_key=True
)
standard_answer_category_id: Mapped[int] = mapped_column(
ForeignKey("standard_answer_category.id"), primary_key=True
@@ -424,9 +420,6 @@ class ConnectorCredentialPair(Base):
last_time_perm_sync: Mapped[datetime.datetime | None] = mapped_column(
DateTime(timezone=True), nullable=True
)
last_time_external_group_sync: Mapped[datetime.datetime | None] = mapped_column(
DateTime(timezone=True), nullable=True
)
# Time finished, not used for calculating backend jobs which uses time started (created)
last_successful_index_time: Mapped[datetime.datetime | None] = mapped_column(
DateTime(timezone=True), default=None
@@ -459,14 +452,6 @@ class ConnectorCredentialPair(Base):
"IndexAttempt", back_populates="connector_credential_pair"
)
# the user id of the user that created this cc pair
creator_id: Mapped[UUID | None] = mapped_column(nullable=True)
creator: Mapped["User"] = relationship(
"User",
back_populates="cc_pairs",
primaryjoin="foreign(ConnectorCredentialPair.creator_id) == remote(User.id)",
)
class Document(Base):
__tablename__ = "document"
@@ -1182,7 +1167,7 @@ class LLMProvider(Base):
default_model_name: Mapped[str] = mapped_column(String)
fast_default_model_name: Mapped[str | None] = mapped_column(String, nullable=True)
# Models to actually display to users
# Models to actually disp;aly to users
# If nulled out, we assume in the application logic we should present all
display_model_names: Mapped[list[str] | None] = mapped_column(
postgresql.ARRAY(String), nullable=True
@@ -1364,9 +1349,6 @@ class Persona(Base):
recency_bias: Mapped[RecencyBiasSetting] = mapped_column(
Enum(RecencyBiasSetting, native_enum=False)
)
category_id: Mapped[int | None] = mapped_column(
ForeignKey("persona_category.id"), nullable=True
)
# Allows the Persona to specify a different LLM version than is controlled
# globablly via env variables. For flexibility, validity is not currently enforced
# NOTE: only is applied on the actual response generation - is not used for things like
@@ -1438,9 +1420,6 @@ class Persona(Base):
secondary="persona__user_group",
viewonly=True,
)
category: Mapped["PersonaCategory"] = relationship(
"PersonaCategory", back_populates="personas"
)
# Default personas loaded via yaml cannot have the same name
__table_args__ = (
@@ -1453,17 +1432,6 @@ class Persona(Base):
)
class PersonaCategory(Base):
__tablename__ = "persona_category"
id: Mapped[int] = mapped_column(primary_key=True)
name: Mapped[str] = mapped_column(String, unique=True)
description: Mapped[str | None] = mapped_column(String, nullable=True)
personas: Mapped[list["Persona"]] = relationship(
"Persona", back_populates="category"
)
AllowedAnswerFilters = (
Literal["well_answered_postfilter"] | Literal["questionmark_prefilter"]
)
@@ -1473,7 +1441,7 @@ class ChannelConfig(TypedDict):
"""NOTE: is a `TypedDict` so it can be used as a type hint for a JSONB column
in Postgres"""
channel_name: str
channel_names: list[str]
respond_tag_only: NotRequired[bool] # defaults to False
respond_to_bots: NotRequired[bool] # defaults to False
respond_member_group_list: NotRequired[list[str]]
@@ -1488,11 +1456,10 @@ class SlackBotResponseType(str, PyEnum):
CITATIONS = "citations"
class SlackChannelConfig(Base):
__tablename__ = "slack_channel_config"
class SlackBotConfig(Base):
__tablename__ = "slack_bot_config"
id: Mapped[int] = mapped_column(primary_key=True)
slack_bot_id: Mapped[int] = mapped_column(ForeignKey("slack_bot.id"), nullable=True)
persona_id: Mapped[int | None] = mapped_column(
ForeignKey("persona.id"), nullable=True
)
@@ -1509,30 +1476,10 @@ class SlackChannelConfig(Base):
)
persona: Mapped[Persona | None] = relationship("Persona")
slack_bot: Mapped["SlackBot"] = relationship(
"SlackBot",
back_populates="slack_channel_configs",
)
standard_answer_categories: Mapped[list["StandardAnswerCategory"]] = relationship(
"StandardAnswerCategory",
secondary=SlackChannelConfig__StandardAnswerCategory.__table__,
back_populates="slack_channel_configs",
)
class SlackBot(Base):
__tablename__ = "slack_bot"
id: Mapped[int] = mapped_column(primary_key=True)
name: Mapped[str] = mapped_column(String)
enabled: Mapped[bool] = mapped_column(Boolean, default=True)
bot_token: Mapped[str] = mapped_column(EncryptedString(), unique=True)
app_token: Mapped[str] = mapped_column(EncryptedString(), unique=True)
slack_channel_configs: Mapped[list[SlackChannelConfig]] = relationship(
"SlackChannelConfig",
back_populates="slack_bot",
secondary=SlackBotConfig__StandardAnswerCategory.__table__,
back_populates="slack_bot_configs",
)
@@ -1771,9 +1718,9 @@ class StandardAnswerCategory(Base):
secondary=StandardAnswer__StandardAnswerCategory.__table__,
back_populates="categories",
)
slack_channel_configs: Mapped[list["SlackChannelConfig"]] = relationship(
"SlackChannelConfig",
secondary=SlackChannelConfig__StandardAnswerCategory.__table__,
slack_bot_configs: Mapped[list["SlackBotConfig"]] = relationship(
"SlackBotConfig",
secondary=SlackBotConfig__StandardAnswerCategory.__table__,
back_populates="standard_answer_categories",
)
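
ChannelConfig is declared as a TypedDict so it can type the JSONB column, with NotRequired marking keys that may be absent. A short sketch of that pattern using a couple of the keys from the hunk (typing.NotRequired needs Python 3.11+):

from typing import NotRequired, TypedDict

class ChannelConfig(TypedDict):
    channel_name: str
    respond_tag_only: NotRequired[bool]  # treated as False when absent
    respond_member_group_list: NotRequired[list[str]]

def respond_only_when_tagged(config: ChannelConfig) -> bool:
    # TypedDicts are plain dicts at runtime, so absent optional keys fall back via .get().
    return config.get("respond_tag_only", False)

cfg: ChannelConfig = {"channel_name": "support"}
print(respond_only_when_tagged(cfg))  # False
print(respond_only_when_tagged({"channel_name": "support", "respond_tag_only": True}))  # True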

View File

@@ -26,7 +26,6 @@ from danswer.db.models import DocumentSet
from danswer.db.models import Persona
from danswer.db.models import Persona__User
from danswer.db.models import Persona__UserGroup
from danswer.db.models import PersonaCategory
from danswer.db.models import Prompt
from danswer.db.models import StarterMessage
from danswer.db.models import Tool
@@ -418,7 +417,6 @@ def upsert_persona(
search_start_date: datetime | None = None,
builtin_persona: bool = False,
is_default_persona: bool = False,
category_id: int | None = None,
chunks_above: int = CONTEXT_CHUNKS_ABOVE,
chunks_below: int = CONTEXT_CHUNKS_BELOW,
) -> Persona:
@@ -489,7 +487,7 @@ def upsert_persona(
persona.is_visible = is_visible
persona.search_start_date = search_start_date
persona.is_default_persona = is_default_persona
persona.category_id = category_id
# Do not delete any associations manually added unless
# a new updated list is provided
if document_sets is not None:
@@ -530,7 +528,6 @@ def upsert_persona(
is_visible=is_visible,
search_start_date=search_start_date,
is_default_persona=is_default_persona,
category_id=category_id,
)
db_session.add(persona)
@@ -746,40 +743,5 @@ def delete_persona_by_name(
)
db_session.execute(stmt)
db_session.commit()
def get_assistant_categories(db_session: Session) -> list[PersonaCategory]:
return db_session.query(PersonaCategory).all()
def create_assistant_category(
db_session: Session, name: str, description: str
) -> PersonaCategory:
category = PersonaCategory(name=name, description=description)
db_session.add(category)
db_session.commit()
return category
def update_persona_category(
category_id: int,
category_description: str,
category_name: str,
db_session: Session,
) -> None:
persona_category = (
db_session.query(PersonaCategory)
.filter(PersonaCategory.id == category_id)
.one_or_none()
)
if persona_category is None:
raise ValueError(f"Persona category with ID {category_id} does not exist")
persona_category.description = category_description
persona_category.name = category_name
db_session.commit()
def delete_persona_category(category_id: int, db_session: Session) -> None:
db_session.query(PersonaCategory).filter(PersonaCategory.id == category_id).delete()
db_session.commit()

View File

@@ -1,76 +0,0 @@
from collections.abc import Sequence
from sqlalchemy import select
from sqlalchemy.orm import Session
from danswer.db.models import SlackBot
def insert_slack_bot(
db_session: Session,
name: str,
enabled: bool,
bot_token: str,
app_token: str,
) -> SlackBot:
slack_bot = SlackBot(
name=name,
enabled=enabled,
bot_token=bot_token,
app_token=app_token,
)
db_session.add(slack_bot)
db_session.commit()
return slack_bot
def update_slack_bot(
db_session: Session,
slack_bot_id: int,
name: str,
enabled: bool,
bot_token: str,
app_token: str,
) -> SlackBot:
slack_bot = db_session.scalar(select(SlackBot).where(SlackBot.id == slack_bot_id))
if slack_bot is None:
raise ValueError(f"Unable to find Slack Bot with ID {slack_bot_id}")
# update the app
slack_bot.name = name
slack_bot.enabled = enabled
slack_bot.bot_token = bot_token
slack_bot.app_token = app_token
db_session.commit()
return slack_bot
def fetch_slack_bot(
db_session: Session,
slack_bot_id: int,
) -> SlackBot:
slack_bot = db_session.scalar(select(SlackBot).where(SlackBot.id == slack_bot_id))
if slack_bot is None:
raise ValueError(f"Unable to find Slack Bot with ID {slack_bot_id}")
return slack_bot
def remove_slack_bot(
db_session: Session,
slack_bot_id: int,
) -> None:
slack_bot = fetch_slack_bot(
db_session=db_session,
slack_bot_id=slack_bot_id,
)
db_session.delete(slack_bot)
db_session.commit()
def fetch_slack_bots(db_session: Session) -> Sequence[SlackBot]:
return db_session.scalars(select(SlackBot)).all()
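
The persona and slack_bot helpers above share one SQLAlchemy 2.0 idiom: db_session.scalar(select(Model).where(...)), raising when nothing matches. A self-contained sketch of that fetch-or-raise pattern against an in-memory SQLite database; the Bot model is a throwaway, not the project's SlackBot:

from sqlalchemy import String, create_engine, select
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column

class Base(DeclarativeBase):
    pass

class Bot(Base):
    __tablename__ = "bot"
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

def fetch_bot(db_session: Session, bot_id: int) -> Bot:
    bot = db_session.scalar(select(Bot).where(Bot.id == bot_id))
    if bot is None:
        raise ValueError(f"Unable to find bot with ID {bot_id}")
    return bot

with Session(engine) as session:
    session.add(Bot(id=1, name="support-bot"))
    session.commit()
    print(fetch_bot(session, 1).name)  # support-bot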

View File

@@ -9,8 +9,8 @@ from danswer.db.constants import SLACK_BOT_PERSONA_PREFIX
from danswer.db.models import ChannelConfig
from danswer.db.models import Persona
from danswer.db.models import Persona__DocumentSet
from danswer.db.models import SlackBotConfig
from danswer.db.models import SlackBotResponseType
from danswer.db.models import SlackChannelConfig
from danswer.db.models import User
from danswer.db.persona import get_default_prompt
from danswer.db.persona import mark_persona_as_deleted
@@ -22,8 +22,8 @@ from danswer.utils.variable_functionality import (
)
def _build_persona_name(channel_name: str) -> str:
return f"{SLACK_BOT_PERSONA_PREFIX}{channel_name}"
def _build_persona_name(channel_names: list[str]) -> str:
return f"{SLACK_BOT_PERSONA_PREFIX}{'-'.join(channel_names)}"
def _cleanup_relationships(db_session: Session, persona_id: int) -> None:
@@ -38,9 +38,9 @@ def _cleanup_relationships(db_session: Session, persona_id: int) -> None:
db_session.delete(rel)
def create_slack_channel_persona(
def create_slack_bot_persona(
db_session: Session,
channel_name: str,
channel_names: list[str],
document_set_ids: list[int],
existing_persona_id: int | None = None,
num_chunks: float = MAX_CHUNKS_FED_TO_CHAT,
@@ -48,11 +48,11 @@ def create_slack_channel_persona(
) -> Persona:
"""NOTE: does not commit changes"""
# create/update persona associated with the Slack channel
persona_name = _build_persona_name(channel_name)
# create/update persona associated with the slack bot
persona_name = _build_persona_name(channel_names)
default_prompt = get_default_prompt(db_session)
persona = upsert_persona(
user=None, # Slack channel Personas are not attached to users
user=None, # Slack Bot Personas are not attached to users
persona_id=existing_persona_id,
name=persona_name,
description="",
@@ -78,15 +78,14 @@ def _no_ee_standard_answer_categories(*args: Any, **kwargs: Any) -> list:
return []
def insert_slack_channel_config(
db_session: Session,
slack_bot_id: int,
def insert_slack_bot_config(
persona_id: int | None,
channel_config: ChannelConfig,
response_type: SlackBotResponseType,
standard_answer_category_ids: list[int],
enable_auto_filters: bool,
) -> SlackChannelConfig:
db_session: Session,
) -> SlackBotConfig:
versioned_fetch_standard_answer_categories_by_ids = (
fetch_versioned_implementation_with_fallback(
"danswer.db.standard_answer",
@@ -111,37 +110,34 @@ def insert_slack_channel_config(
f"Some or all categories with ids {standard_answer_category_ids} do not exist"
)
slack_channel_config = SlackChannelConfig(
slack_bot_id=slack_bot_id,
slack_bot_config = SlackBotConfig(
persona_id=persona_id,
channel_config=channel_config,
response_type=response_type,
standard_answer_categories=existing_standard_answer_categories,
enable_auto_filters=enable_auto_filters,
)
db_session.add(slack_channel_config)
db_session.add(slack_bot_config)
db_session.commit()
return slack_channel_config
return slack_bot_config
def update_slack_channel_config(
db_session: Session,
slack_channel_config_id: int,
def update_slack_bot_config(
slack_bot_config_id: int,
persona_id: int | None,
channel_config: ChannelConfig,
response_type: SlackBotResponseType,
standard_answer_category_ids: list[int],
enable_auto_filters: bool,
) -> SlackChannelConfig:
slack_channel_config = db_session.scalar(
select(SlackChannelConfig).where(
SlackChannelConfig.id == slack_channel_config_id
)
db_session: Session,
) -> SlackBotConfig:
slack_bot_config = db_session.scalar(
select(SlackBotConfig).where(SlackBotConfig.id == slack_bot_config_id)
)
if slack_channel_config is None:
if slack_bot_config is None:
raise ValueError(
f"Unable to find Slack channel config with ID {slack_channel_config_id}"
f"Unable to find slack bot config with ID {slack_bot_config_id}"
)
versioned_fetch_standard_answer_categories_by_ids = (
@@ -163,25 +159,25 @@ def update_slack_channel_config(
)
# get the existing persona id before updating the object
existing_persona_id = slack_channel_config.persona_id
existing_persona_id = slack_bot_config.persona_id
# update the config
# NOTE: need to do this before cleaning up the old persona or else we
# will encounter `violates foreign key constraint` errors
slack_channel_config.persona_id = persona_id
slack_channel_config.channel_config = channel_config
slack_channel_config.response_type = response_type
slack_channel_config.standard_answer_categories = list(
slack_bot_config.persona_id = persona_id
slack_bot_config.channel_config = channel_config
slack_bot_config.response_type = response_type
slack_bot_config.standard_answer_categories = list(
existing_standard_answer_categories
)
slack_channel_config.enable_auto_filters = enable_auto_filters
slack_bot_config.enable_auto_filters = enable_auto_filters
# if the persona has changed, then clean up the old persona
if persona_id != existing_persona_id and existing_persona_id:
existing_persona = db_session.scalar(
select(Persona).where(Persona.id == existing_persona_id)
)
# if the existing persona was one created just for use with this Slack channel,
# if the existing persona was one created just for use with this Slack Bot,
# then clean it up
if existing_persona and existing_persona.name.startswith(
SLACK_BOT_PERSONA_PREFIX
@@ -192,30 +188,28 @@ def update_slack_channel_config(
db_session.commit()
return slack_channel_config
return slack_bot_config
def remove_slack_channel_config(
db_session: Session,
slack_channel_config_id: int,
def remove_slack_bot_config(
slack_bot_config_id: int,
user: User | None,
db_session: Session,
) -> None:
slack_channel_config = db_session.scalar(
select(SlackChannelConfig).where(
SlackChannelConfig.id == slack_channel_config_id
)
slack_bot_config = db_session.scalar(
select(SlackBotConfig).where(SlackBotConfig.id == slack_bot_config_id)
)
if slack_channel_config is None:
if slack_bot_config is None:
raise ValueError(
f"Unable to find Slack channel config with ID {slack_channel_config_id}"
f"Unable to find slack bot config with ID {slack_bot_config_id}"
)
existing_persona_id = slack_channel_config.persona_id
existing_persona_id = slack_bot_config.persona_id
if existing_persona_id:
existing_persona = db_session.scalar(
select(Persona).where(Persona.id == existing_persona_id)
)
# if the existing persona was one created just for use with this Slack channel,
# if the existing persona was one created just for use with this Slack Bot,
# then clean it up
if existing_persona and existing_persona.name.startswith(
SLACK_BOT_PERSONA_PREFIX
@@ -227,28 +221,17 @@ def remove_slack_channel_config(
persona_id=existing_persona_id, user=user, db_session=db_session
)
db_session.delete(slack_channel_config)
db_session.delete(slack_bot_config)
db_session.commit()
def fetch_slack_channel_configs(
db_session: Session, slack_bot_id: int | None = None
) -> Sequence[SlackChannelConfig]:
if not slack_bot_id:
return db_session.scalars(select(SlackChannelConfig)).all()
return db_session.scalars(
select(SlackChannelConfig).where(
SlackChannelConfig.slack_bot_id == slack_bot_id
)
).all()
def fetch_slack_channel_config(
db_session: Session, slack_channel_config_id: int
) -> SlackChannelConfig | None:
def fetch_slack_bot_config(
db_session: Session, slack_bot_config_id: int
) -> SlackBotConfig | None:
return db_session.scalar(
select(SlackChannelConfig).where(
SlackChannelConfig.id == slack_channel_config_id
)
select(SlackBotConfig).where(SlackBotConfig.id == slack_bot_config_id)
)
def fetch_slack_bot_configs(db_session: Session) -> Sequence[SlackBotConfig]:
return db_session.scalars(select(SlackBotConfig)).all()
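
These config helpers derive an auto-created persona's name from a fixed prefix plus the channel name(s), and later use the same prefix to decide whether a persona is safe to clean up. A tiny sketch of that naming convention; the prefix value below is assumed, not the project's actual constant:

SLACK_BOT_PERSONA_PREFIX = "__slack_bot_persona__"  # assumed value for illustration

def build_persona_name(channel_names: list[str]) -> str:
    return f"{SLACK_BOT_PERSONA_PREFIX}{'-'.join(channel_names)}"

def was_auto_created(persona_name: str) -> bool:
    # Only personas created for a Slack config carry the prefix, so they are safe to delete
    # when the config is removed or repointed at a different persona.
    return persona_name.startswith(SLACK_BOT_PERSONA_PREFIX)

name = build_persona_name(["support", "oncall"])
print(name, was_auto_created(name), was_auto_created("Customer FAQ"))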

View File

@@ -1,7 +1,6 @@
from collections.abc import Sequence
from uuid import UUID
from fastapi import HTTPException
from fastapi_users.password import PasswordHelper
from sqlalchemy import func
from sqlalchemy import select
@@ -11,94 +10,15 @@ from danswer.auth.schemas import UserRole
from danswer.db.models import User
def validate_user_role_update(requested_role: UserRole, current_role: UserRole) -> None:
"""
Validate that a user role update is valid.
Assumed only admins can hit this endpoint.
raise if:
- requested role is a curator
- requested role is a slack user
- requested role is an external permissioned user
- requested role is a limited user
- current role is a slack user
- current role is an external permissioned user
- current role is a limited user
"""
if current_role == UserRole.SLACK_USER:
raise HTTPException(
status_code=400,
detail="To change a Slack User's role, they must first login to Danswer via the web app.",
)
if current_role == UserRole.EXT_PERM_USER:
# This shouldn't happen, but just in case
raise HTTPException(
status_code=400,
detail="To change an External Permissioned User's role, they must first login to Danswer via the web app.",
)
if current_role == UserRole.LIMITED:
raise HTTPException(
status_code=400,
detail="To change a Limited User's role, they must first login to Danswer via the web app.",
)
if requested_role == UserRole.CURATOR:
# This shouldn't happen, but just in case
raise HTTPException(
status_code=400,
detail="Curator role must be set via the User Group Menu",
)
if requested_role == UserRole.LIMITED:
# This shouldn't happen, but just in case
raise HTTPException(
status_code=400,
detail=(
"A user cannot be set to a Limited User role. "
"This role is automatically assigned to users through certain endpoints in the API."
),
)
if requested_role == UserRole.SLACK_USER:
# This shouldn't happen, but just in case
raise HTTPException(
status_code=400,
detail=(
"A user cannot be set to a Slack User role. "
"This role is automatically assigned to users who only use Danswer via Slack."
),
)
if requested_role == UserRole.EXT_PERM_USER:
# This shouldn't happen, but just in case
raise HTTPException(
status_code=400,
detail=(
"A user cannot be set to an External Permissioned User role. "
"This role is automatically assigned to users who have been "
"pulled in to the system via an external permissions system."
),
)
def list_users(
db_session: Session, email_filter_string: str = "", include_external: bool = False
db_session: Session, email_filter_string: str = "", user: User | None = None
) -> Sequence[User]:
"""List all users. No pagination as of now, as the # of users
is assumed to be relatively small (<< 1 million)"""
stmt = select(User)
where_clause = []
if not include_external:
where_clause.append(User.role != UserRole.EXT_PERM_USER)
if email_filter_string:
where_clause.append(User.email.ilike(f"%{email_filter_string}%")) # type: ignore
stmt = stmt.where(*where_clause)
stmt = stmt.where(User.email.ilike(f"%{email_filter_string}%")) # type: ignore
return db_session.scalars(stmt).unique().all()
@@ -125,58 +45,55 @@ def get_user_by_email(email: str, db_session: Session) -> User | None:
def fetch_user_by_id(db_session: Session, user_id: UUID) -> User | None:
return db_session.query(User).filter(User.id == user_id).first() # type: ignore
user = db_session.query(User).filter(User.id == user_id).first() # type: ignore
return user
def _generate_non_web_slack_user(email: str) -> User:
def _generate_non_web_user(email: str) -> User:
fastapi_users_pw_helper = PasswordHelper()
password = fastapi_users_pw_helper.generate()
hashed_pass = fastapi_users_pw_helper.hash(password)
return User(
email=email,
hashed_password=hashed_pass,
role=UserRole.SLACK_USER,
has_web_login=False,
role=UserRole.BASIC,
)
def add_slack_user_if_not_exists(db_session: Session, email: str) -> User:
email = email.lower()
def add_non_web_user_if_not_exists(db_session: Session, email: str) -> User:
user = get_user_by_email(email, db_session)
if user is not None:
# If the user is an external permissioned user, we update it to a slack user
if user.role == UserRole.EXT_PERM_USER:
user.role = UserRole.SLACK_USER
db_session.commit()
return user
user = _generate_non_web_slack_user(email=email)
user = _generate_non_web_user(email=email)
db_session.add(user)
db_session.commit()
return user
def _generate_non_web_permissioned_user(email: str) -> User:
fastapi_users_pw_helper = PasswordHelper()
password = fastapi_users_pw_helper.generate()
hashed_pass = fastapi_users_pw_helper.hash(password)
return User(
email=email,
hashed_password=hashed_pass,
role=UserRole.EXT_PERM_USER,
)
def add_non_web_user_if_not_exists__no_commit(db_session: Session, email: str) -> User:
user = get_user_by_email(email, db_session)
if user is not None:
return user
user = _generate_non_web_user(email=email)
db_session.add(user)
db_session.flush() # generate id
return user
def batch_add_ext_perm_user_if_not_exists(
def batch_add_non_web_user_if_not_exists__no_commit(
db_session: Session, emails: list[str]
) -> list[User]:
emails = [email.lower() for email in emails]
found_users, missing_user_emails = get_users_by_emails(db_session, emails)
new_users: list[User] = []
for email in missing_user_emails:
new_users.append(_generate_non_web_permissioned_user(email=email))
new_users.append(_generate_non_web_user(email=email))
db_session.add_all(new_users)
db_session.commit()
db_session.flush() # generate ids
return found_users + new_users
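
add_slack_user_if_not_exists / add_non_web_user_if_not_exists create a placeholder account with a random, never-used password the first time an email is seen, and are otherwise no-ops. A dependency-free sketch of that idempotent lookup-or-create flow; the in-memory dict and the role string are simplifications of the users table and UserRole enum:

import secrets
from dataclasses import dataclass

@dataclass
class User:
    email: str
    hashed_password: str
    role: str

_USERS: dict[str, User] = {}  # stand-in for the users table

def add_slack_user_if_not_exists(email: str) -> User:
    email = email.lower()
    existing = _USERS.get(email)
    if existing is not None:
        return existing
    # The account can never be logged into directly: the password is random and discarded.
    user = User(email=email, hashed_password=secrets.token_hex(32), role="SLACK_USER")
    _USERS[email] = user
    return user

u1 = add_slack_user_if_not_exists("Alice@Example.com")
u2 = add_slack_user_if_not_exists("alice@example.com")
print(u1 is u2)  # True: the second call is a no-op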

View File

@@ -15,7 +15,7 @@ schema DANSWER_CHUNK_NAME {
# Must have an additional field for whether to skip title embeddings
# This information cannot be extracted from either the title field nor title embedding
field skip_title type bool {
indexing: attribute
indexing: attribute
}
# May not always match the `semantic_identifier` e.g. for Slack docs the
# `semantic_identifier` will be the channel name, but the `title` will be empty
@@ -36,7 +36,7 @@ schema DANSWER_CHUNK_NAME {
}
# Title embedding (x1)
field title_embedding type tensor<float>(x[VARIABLE_DIM]) {
indexing: attribute | index
indexing: attribute
attribute {
distance-metric: angular
}
@@ -44,7 +44,7 @@ schema DANSWER_CHUNK_NAME {
# Content embeddings (chunk + optional mini chunks embeddings)
# "t" and "x" are arbitrary names, not special keywords
field embeddings type tensor<float>(t{},x[VARIABLE_DIM]) {
indexing: attribute | index
indexing: attribute
attribute {
distance-metric: angular
}

View File

@@ -2,7 +2,6 @@ import concurrent.futures
import json
from datetime import datetime
from datetime import timezone
from http import HTTPStatus
import httpx
from retry import retry
@@ -195,14 +194,6 @@ def _index_vespa_chunk(
logger.exception(
f"Failed to index document: '{document.id}'. Got response: '{res.text}'"
)
if isinstance(e, httpx.HTTPStatusError):
if e.response.status_code == HTTPStatus.INSUFFICIENT_STORAGE:
logger.error(
"NOTE: HTTP Status 507 Insufficient Storage usually means "
"you need to allocate more memory or disk space to the "
"Vespa/index container."
)
raise e
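
One side of the _index_vespa_chunk hunk special-cases HTTP 507 to hint that the index container is out of disk or memory. A small offline sketch of inspecting an httpx.HTTPStatusError for that status; the request and response are constructed by hand so no Vespa instance is needed:

from http import HTTPStatus

import httpx

def explain_index_error(e: Exception) -> None:
    # Surface a hint for 507, which usually means the index node is out of disk or memory.
    if isinstance(e, httpx.HTTPStatusError):
        if e.response.status_code == HTTPStatus.INSUFFICIENT_STORAGE:
            print("HTTP 507: allocate more memory or disk to the index container.")
            return
    print(f"Indexing failed: {e}")

request = httpx.Request("POST", "http://vespa:8080/document/v1/doc")
response = httpx.Response(507, request=request)
explain_index_error(httpx.HTTPStatusError("storage full", request=request, response=response))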

View File

@@ -10,7 +10,7 @@ from danswer.connectors.cross_connector_utils.miscellaneous_utils import (
get_metadata_keys_to_ignore,
)
from danswer.connectors.models import Document
from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from danswer.indexing.indexing_heartbeat import Heartbeat
from danswer.indexing.models import DocAwareChunk
from danswer.natural_language_processing.utils import BaseTokenizer
from danswer.utils.logger import setup_logger
@@ -125,7 +125,7 @@ class Chunker:
chunk_token_limit: int = DOC_EMBEDDING_CONTEXT_SIZE,
chunk_overlap: int = CHUNK_OVERLAP,
mini_chunk_size: int = MINI_CHUNK_SIZE,
callback: IndexingHeartbeatInterface | None = None,
heartbeat: Heartbeat | None = None,
) -> None:
from llama_index.text_splitter import SentenceSplitter
@@ -134,7 +134,7 @@ class Chunker:
self.enable_multipass = enable_multipass
self.enable_large_chunks = enable_large_chunks
self.tokenizer = tokenizer
self.callback = callback
self.heartbeat = heartbeat
self.blurb_splitter = SentenceSplitter(
tokenizer=tokenizer.tokenize,
@@ -356,14 +356,9 @@ class Chunker:
def chunk(self, documents: list[Document]) -> list[DocAwareChunk]:
final_chunks: list[DocAwareChunk] = []
for document in documents:
if self.callback:
if self.callback.should_stop():
raise RuntimeError("Chunker.chunk: Stop signal detected")
final_chunks.extend(self._handle_single_document(document))
chunks = self._handle_single_document(document)
final_chunks.extend(chunks)
if self.callback:
self.callback.progress("Chunker.chunk", len(chunks))
if self.heartbeat:
self.heartbeat.heartbeat()
return final_chunks

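Both sides of this hunk give the chunking loop a progress hook (a `callback` with `should_stop`/`progress` on one side, a `heartbeat` on the other). A standalone sketch of that shape follows; the class names and the trivial sentence splitting are illustrative, not the danswer implementations.

from abc import ABC, abstractmethod

class ProgressCallback(ABC):
    """Minimal stand-in for the progress/heartbeat interface consulted by the chunker."""

    @abstractmethod
    def should_stop(self) -> bool: ...

    @abstractmethod
    def progress(self, tag: str, amount: int) -> None: ...

class CountingCallback(ProgressCallback):
    def __init__(self) -> None:
        self.count = 0

    def should_stop(self) -> bool:
        return False  # a real implementation might check a stop fence in Redis

    def progress(self, tag: str, amount: int) -> None:
        self.count += amount

def chunk_documents(documents: list[str], callback: ProgressCallback | None = None) -> list[str]:
    chunks: list[str] = []
    for doc in documents:
        if callback and callback.should_stop():
            raise RuntimeError("chunk_documents: stop signal detected")
        new_chunks = doc.split(". ")  # trivial "chunking" for the sketch
        chunks.extend(new_chunks)
        if callback:
            callback.progress("chunk_documents", len(new_chunks))
    return chunks

if __name__ == "__main__":
    cb = CountingCallback()
    chunk_documents(["First sentence. Second sentence.", "Another doc."], cb)
    print(cb.count)  # 3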
View File

@@ -2,7 +2,7 @@ from abc import ABC
from abc import abstractmethod
from danswer.db.models import SearchSettings
from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from danswer.indexing.indexing_heartbeat import Heartbeat
from danswer.indexing.models import ChunkEmbedding
from danswer.indexing.models import DocAwareChunk
from danswer.indexing.models import IndexChunk
@@ -34,7 +34,7 @@ class IndexingEmbedder(ABC):
api_url: str | None,
api_version: str | None,
deployment_name: str | None,
callback: IndexingHeartbeatInterface | None,
heartbeat: Heartbeat | None,
):
self.model_name = model_name
self.normalize = normalize
@@ -60,7 +60,7 @@ class IndexingEmbedder(ABC):
server_host=INDEXING_MODEL_SERVER_HOST,
server_port=INDEXING_MODEL_SERVER_PORT,
retrim_content=True,
callback=callback,
heartbeat=heartbeat,
)
@abstractmethod
@@ -83,7 +83,7 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
api_url: str | None = None,
api_version: str | None = None,
deployment_name: str | None = None,
callback: IndexingHeartbeatInterface | None = None,
heartbeat: Heartbeat | None = None,
):
super().__init__(
model_name,
@@ -95,7 +95,7 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
api_url,
api_version,
deployment_name,
callback,
heartbeat,
)
@log_function_time()
@@ -201,9 +201,7 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
@classmethod
def from_db_search_settings(
cls,
search_settings: SearchSettings,
callback: IndexingHeartbeatInterface | None = None,
cls, search_settings: SearchSettings, heartbeat: Heartbeat | None = None
) -> "DefaultIndexingEmbedder":
return cls(
model_name=search_settings.model_name,
@@ -215,5 +213,5 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
api_url=search_settings.api_url,
api_version=search_settings.api_version,
deployment_name=search_settings.deployment_name,
callback=callback,
heartbeat=heartbeat,
)

View File

@@ -1,15 +1,41 @@
from abc import ABC
from abc import abstractmethod
import abc
from typing import Any
from sqlalchemy import func
from sqlalchemy.orm import Session
from danswer.db.index_attempt import get_index_attempt
from danswer.utils.logger import setup_logger
logger = setup_logger()
class IndexingHeartbeatInterface(ABC):
"""Defines a callback interface to be passed to
to run_indexing_entrypoint."""
class Heartbeat(abc.ABC):
"""Useful for any long-running work that goes through a bunch of items
and needs to occasionally give updates on progress.
e.g. chunking, embedding, updating vespa, etc."""
@abstractmethod
def should_stop(self) -> bool:
"""Signal to stop the looping function in flight."""
@abc.abstractmethod
def heartbeat(self, metadata: Any = None) -> None:
raise NotImplementedError
@abstractmethod
def progress(self, tag: str, amount: int) -> None:
"""Send progress updates to the caller."""
class IndexingHeartbeat(Heartbeat):
def __init__(self, index_attempt_id: int, db_session: Session, freq: int):
self.cnt = 0
self.index_attempt_id = index_attempt_id
self.db_session = db_session
self.freq = freq
def heartbeat(self, metadata: Any = None) -> None:
self.cnt += 1
if self.cnt % self.freq == 0:
index_attempt = get_index_attempt(
db_session=self.db_session, index_attempt_id=self.index_attempt_id
)
if index_attempt:
index_attempt.time_updated = func.now()
self.db_session.commit()
else:
logger.error("Index attempt not found, this should not happen!")

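The `IndexingHeartbeat` above only touches the index attempt's `time_updated` every `freq` calls, so a long loop does not hammer the database. Here is a rough sketch of that throttling idea, with an in-memory record standing in for the SQLAlchemy session and `IndexAttempt` row:

import time
from dataclasses import dataclass, field

@dataclass
class FakeIndexAttempt:
    """Stand-in for the DB row whose time_updated marks the attempt as still alive."""
    time_updated: float = field(default_factory=time.time)

class ThrottledHeartbeat:
    def __init__(self, attempt: FakeIndexAttempt, freq: int) -> None:
        self.attempt = attempt
        self.freq = freq
        self.cnt = 0

    def heartbeat(self) -> None:
        self.cnt += 1
        if self.cnt % self.freq == 0:
            # the real version re-reads the row and commits through the db_session
            self.attempt.time_updated = time.time()

if __name__ == "__main__":
    attempt = FakeIndexAttempt()
    hb = ThrottledHeartbeat(attempt, freq=10)
    for _ in range(25):
        hb.heartbeat()
    # time_updated was refreshed on the 10th and 20th calls only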
View File

@@ -1,9 +1,7 @@
import traceback
from functools import partial
from http import HTTPStatus
from typing import Protocol
import httpx
from pydantic import BaseModel
from pydantic import ConfigDict
from sqlalchemy.orm import Session
@@ -22,8 +20,7 @@ from danswer.db.document import get_documents_by_ids
from danswer.db.document import prepare_to_modify_documents
from danswer.db.document import update_docs_last_modified__no_commit
from danswer.db.document import update_docs_updated_at__no_commit
from danswer.db.document import upsert_document_by_connector_credential_pair
from danswer.db.document import upsert_documents
from danswer.db.document import upsert_documents_complete
from danswer.db.document_set import fetch_document_sets_for_documents
from danswer.db.index_attempt import create_index_attempt_error
from danswer.db.models import Document as DBDocument
@@ -34,7 +31,7 @@ from danswer.document_index.interfaces import DocumentIndex
from danswer.document_index.interfaces import DocumentMetadata
from danswer.indexing.chunker import Chunker
from danswer.indexing.embedder import IndexingEmbedder
from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from danswer.indexing.indexing_heartbeat import IndexingHeartbeat
from danswer.indexing.models import DocAwareChunk
from danswer.indexing.models import DocMetadataAwareIndexChunk
from danswer.utils.logger import setup_logger
@@ -59,13 +56,13 @@ class IndexingPipelineProtocol(Protocol):
...
def _upsert_documents_in_db(
def upsert_documents_in_db(
documents: list[Document],
index_attempt_metadata: IndexAttemptMetadata,
db_session: Session,
) -> None:
# Metadata here refers to basic document info, not metadata about the actual content
document_metadata_list: list[DocumentMetadata] = []
doc_m_batch: list[DocumentMetadata] = []
for doc in documents:
first_link = next(
(section.link for section in doc.sections if section.link), ""
@@ -80,9 +77,12 @@ def _upsert_documents_in_db(
secondary_owners=get_experts_stores_representations(doc.secondary_owners),
from_ingestion_api=doc.from_ingestion_api,
)
document_metadata_list.append(db_doc_metadata)
doc_m_batch.append(db_doc_metadata)
upsert_documents(db_session, document_metadata_list)
upsert_documents_complete(
db_session=db_session,
document_metadata_batch=doc_m_batch,
)
# Insert document content metadata
for doc in documents:
@@ -95,25 +95,21 @@ def _upsert_documents_in_db(
document_id=doc.id,
db_session=db_session,
)
continue
create_or_add_document_tag(
tag_key=k,
tag_value=v,
source=doc.source,
document_id=doc.id,
db_session=db_session,
)
else:
create_or_add_document_tag(
tag_key=k,
tag_value=v,
source=doc.source,
document_id=doc.id,
db_session=db_session,
)
def get_doc_ids_to_update(
documents: list[Document], db_docs: list[DBDocument]
) -> list[Document]:
"""Figures out which documents actually need to be updated. If a document is already present
and the `updated_at` hasn't changed, we shouldn't need to do anything with it.
NB: Still need to associate the document in the DB if multiple connectors are
indexing the same doc."""
and the `updated_at` hasn't changed, we shouldn't need to do anything with it."""
id_update_time_map = {
doc.id: doc.doc_updated_at for doc in db_docs if doc.doc_updated_at
}
@@ -156,14 +152,6 @@ def index_doc_batch_with_handler(
tenant_id=tenant_id,
)
except Exception as e:
if isinstance(e, httpx.HTTPStatusError):
if e.response.status_code == HTTPStatus.INSUFFICIENT_STORAGE:
logger.error(
"NOTE: HTTP Status 507 Insufficient Storage indicates "
"you need to allocate more memory or disk space to the "
"Vespa/index container."
)
if INDEXING_EXCEPTION_LIMIT == 0:
raise
@@ -207,9 +195,9 @@ def index_doc_batch_prepare(
db_session: Session,
ignore_time_skip: bool = False,
) -> DocumentBatchPrepareContext | None:
"""Sets up the documents in the relational DB (source of truth) for permissions, metadata, etc.
"""This sets up the documents in the relational DB (source of truth) for permissions, metadata, etc.
This precedes indexing it into the actual document index."""
documents: list[Document] = []
documents = []
for document in document_batch:
empty_contents = not any(section.text.strip() for section in document.sections)
if (
@@ -224,65 +212,50 @@ def index_doc_batch_prepare(
logger.warning(
f"Skipping document with ID {document.id} as it has neither title nor content."
)
continue
if document.title is not None and not document.title.strip() and empty_contents:
elif (
document.title is not None and not document.title.strip() and empty_contents
):
# The title is explicitly empty ("" and not None) and the document is empty
# so when building the chunk text representation, it will be empty and unusable
logger.warning(
f"Skipping document with ID {document.id} as the chunks will be empty."
)
continue
else:
documents.append(document)
documents.append(document)
# Create a trimmed list of docs that don't have a newer updated at
# Shortcuts the time-consuming flow on connector index retries
document_ids: list[str] = [document.id for document in documents]
document_ids = [document.id for document in documents]
db_docs: list[DBDocument] = get_documents_by_ids(
db_session=db_session,
document_ids=document_ids,
)
# Skip indexing docs that don't have a newer updated at
# Shortcuts the time-consuming flow on connector index retries
updatable_docs = (
get_doc_ids_to_update(documents=documents, db_docs=db_docs)
if not ignore_time_skip
else documents
)
# for all updatable docs, upsert into the DB
# Does not include doc_updated_at which is also used to indicate a successful update
if updatable_docs:
_upsert_documents_in_db(
documents=updatable_docs,
index_attempt_metadata=index_attempt_metadata,
db_session=db_session,
)
logger.info(
f"Upserted {len(updatable_docs)} changed docs out of "
f"{len(documents)} total docs into the DB"
)
# for all docs, upsert the document to cc pair relationship
upsert_document_by_connector_credential_pair(
db_session,
index_attempt_metadata.connector_id,
index_attempt_metadata.credential_id,
document_ids,
)
# No docs to process because the batch is empty or every doc was already indexed
# No docs to update either because the batch is empty or every doc was already indexed
if not updatable_docs:
return None
# Create records in the source of truth about these documents,
# does not include doc_updated_at which is also used to indicate a successful update
upsert_documents_in_db(
documents=documents,
index_attempt_metadata=index_attempt_metadata,
db_session=db_session,
)
id_to_db_doc_map = {doc.id: doc for doc in db_docs}
return DocumentBatchPrepareContext(
updatable_docs=updatable_docs, id_to_db_doc_map=id_to_db_doc_map
)
@log_function_time(debug_only=True)
@log_function_time()
def index_doc_batch(
*,
chunker: Chunker,
@@ -296,10 +269,7 @@ def index_doc_batch(
) -> tuple[int, int]:
"""Takes different pieces of the indexing pipeline and applies it to a batch of documents
Note that the documents should already be batched at this point so that it does not inflate the
memory requirements
Returns a tuple where the first element is the number of new docs and the
second element is the number of chunks."""
memory requirements"""
no_access = DocumentAccess.build(
user_emails=[],
@@ -342,9 +312,9 @@ def index_doc_batch(
# we're concerned about race conditions where multiple simultaneous indexings might result
# in one set of metadata overwriting another one in vespa.
# we still write data here for the immediate and most likely correct sync, but
# we still write data here for immediate and most likely correct sync, but
# to resolve this, an update of the last modified field at the end of this loop
# always triggers a final metadata sync via the celery queue
# always triggers a final metadata sync
access_aware_chunks = [
DocMetadataAwareIndexChunk.from_index_chunk(
index_chunk=chunk,
@@ -381,8 +351,7 @@ def index_doc_batch(
ids_to_new_updated_at = {}
for doc in successful_docs:
last_modified_ids.append(doc.id)
# doc_updated_at is the source's idea (on the other end of the connector)
# of when the doc was last modified
# doc_updated_at is the connector source's idea of when the doc was last modified
if doc.doc_updated_at is None:
continue
ids_to_new_updated_at[doc.id] = doc.doc_updated_at
@@ -397,13 +366,10 @@ def index_doc_batch(
db_session.commit()
result = (
len([r for r in insertion_records if r.already_existed is False]),
len(access_aware_chunks),
return len([r for r in insertion_records if r.already_existed is False]), len(
access_aware_chunks
)
return result
def build_indexing_pipeline(
*,
@@ -414,7 +380,6 @@ def build_indexing_pipeline(
ignore_time_skip: bool = False,
attempt_id: int | None = None,
tenant_id: str | None = None,
callback: IndexingHeartbeatInterface | None = None,
) -> IndexingPipelineProtocol:
"""Builds a pipeline which takes in a list (batch) of docs and indexes them."""
search_settings = get_current_search_settings(db_session)
@@ -441,8 +406,13 @@ def build_indexing_pipeline(
tokenizer=embedder.embedding_model.tokenizer,
enable_multipass=multipass,
enable_large_chunks=enable_large_chunks,
# after every doc, update status in case there are a bunch of really long docs
callback=callback,
# after every doc, update status in case there are a bunch of
# really long docs
heartbeat=IndexingHeartbeat(
index_attempt_id=attempt_id, db_session=db_session, freq=1
)
if attempt_id
else None,
)
return partial(

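The `get_doc_ids_to_update` docstring above describes the shortcut: keep documents that are new to the DB or whose source-side `updated_at` is newer than the stored copy, and skip the rest. A small self-contained sketch of that comparison, using plain dataclasses instead of the connector and DB models:

from dataclasses import dataclass
from datetime import datetime

@dataclass
class IncomingDoc:
    id: str
    doc_updated_at: datetime | None

@dataclass
class DbDoc:
    id: str
    doc_updated_at: datetime | None

def docs_to_update(documents: list[IncomingDoc], db_docs: list[DbDoc]) -> list[IncomingDoc]:
    """Keep docs that are new, have no timestamp, or are newer than the stored copy."""
    stored = {d.id: d.doc_updated_at for d in db_docs if d.doc_updated_at}
    updatable = []
    for doc in documents:
        known = stored.get(doc.id)
        if doc.doc_updated_at is None or known is None or doc.doc_updated_at > known:
            updatable.append(doc)
    return updatable

if __name__ == "__main__":
    docs = [IncomingDoc("a", datetime(2024, 11, 11)), IncomingDoc("b", datetime(2024, 1, 1))]
    db = [DbDoc("a", datetime(2024, 11, 1)), DbDoc("b", datetime(2024, 6, 1))]
    print([d.id for d in docs_to_update(docs, db)])  # ['a']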
View File

@@ -1,6 +1,12 @@
import abc
from collections.abc import Mapping
from collections.abc import Sequence
from typing import TypeAlias
from danswer.utils.special_types import JSON_ro
JSON_ro: TypeAlias = (
Mapping[str, "JSON_ro"] | Sequence["JSON_ro"] | str | int | float | bool | None
)
class KvKeyNotFoundError(Exception):

View File

@@ -11,11 +11,11 @@ from sqlalchemy.orm import Session
from danswer.db.engine import get_sqlalchemy_engine
from danswer.db.engine import is_valid_schema_name
from danswer.db.models import KVStore
from danswer.key_value_store.interface import JSON_ro
from danswer.key_value_store.interface import KeyValueStore
from danswer.key_value_store.interface import KvKeyNotFoundError
from danswer.redis.redis_pool import get_redis_client
from danswer.utils.logger import setup_logger
from danswer.utils.special_types import JSON_ro
from shared_configs.configs import MULTI_TENANT
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR

View File

@@ -263,7 +263,6 @@ class Answer:
message_history=self.message_history,
llm_config=self.llm.config,
single_message_history=self.single_message_history,
raw_user_text=self.question,
)
prompt_builder.update_system_prompt(
default_build_system_message(self.prompt_config)

View File

@@ -59,7 +59,6 @@ class AnswerPromptBuilder:
message_history: list[PreviousMessage],
llm_config: LLMConfig,
single_message_history: str | None = None,
raw_user_text: str | None = None,
) -> None:
self.max_tokens = compute_max_llm_input_tokens(llm_config)
@@ -89,12 +88,6 @@ class AnswerPromptBuilder:
self.new_messages_and_token_cnts: list[tuple[BaseMessage, int]] = []
self.raw_user_message = (
HumanMessage(content=raw_user_text)
if raw_user_text is not None
else user_message
)
def update_system_prompt(self, system_message: SystemMessage | None) -> None:
if not system_message:
self.system_message_and_token_cnt = None

View File

@@ -231,16 +231,16 @@ class QuotesProcessor:
model_previous = self.model_output
self.model_output += token
if not self.found_answer_start:
m = answer_pattern.search(self.model_output)
if m:
self.found_answer_start = True
# Prevent heavy cases of hallucinations
if self.is_json_prompt and len(self.model_output) > 400:
self.found_answer_end = True
if self.is_json_prompt and len(self.model_output) > 70:
logger.warning("LLM did not produce json as prompted")
logger.debug("Model output thus far:", self.model_output)
self.found_answer_end = True
return
remaining = self.model_output[m.end() :]

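The QuotesProcessor change above adjusts how many streamed characters are tolerated before concluding the model ignored the JSON prompt. As a hedged illustration of the general pattern (accumulate tokens, look for an answer-start marker with a regex, give up after a length cap), here is a sketch with an illustrative pattern and cap rather than the project's exact values:

import re

ANSWER_PATTERN = re.compile(r'"answer"\s*:\s*"')  # illustrative marker for a JSON answer field

class StreamingAnswerDetector:
    def __init__(self, max_prefix_chars: int = 400) -> None:
        self.buffer = ""
        self.found_answer_start = False
        self.gave_up = False
        self.max_prefix_chars = max_prefix_chars

    def process_token(self, token: str) -> str | None:
        """Return newly available answer text, or None while still searching."""
        if self.gave_up:
            return None
        self.buffer += token
        if not self.found_answer_start:
            m = ANSWER_PATTERN.search(self.buffer)
            if m:
                self.found_answer_start = True
                return self.buffer[m.end():]
            if len(self.buffer) > self.max_prefix_chars:
                # the model never produced the expected JSON shape; stop scanning
                self.gave_up = True
            return None
        return token

if __name__ == "__main__":
    detector = StreamingAnswerDetector()
    for tok in ['{"', 'answer": "Hel', "lo"]:
        piece = detector.process_token(tok)
        if piece:
            print(piece, end="")  # prints: Hello
    print()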
View File

@@ -1,8 +1,6 @@
import json
import os
import traceback
from collections.abc import Iterator
from collections.abc import Sequence
from typing import Any
from typing import cast
@@ -23,18 +21,15 @@ from langchain_core.messages import SystemMessage
from langchain_core.messages import SystemMessageChunk
from langchain_core.messages.tool import ToolCallChunk
from langchain_core.messages.tool import ToolMessage
from langchain_core.prompt_values import PromptValue
from danswer.configs.app_configs import LOG_ALL_MODEL_INTERACTIONS
from danswer.configs.app_configs import LOG_DANSWER_MODEL_INTERACTIONS
from danswer.configs.model_configs import DISABLE_LITELLM_STREAMING
from danswer.configs.model_configs import GEN_AI_TEMPERATURE
from danswer.configs.model_configs import LITELLM_EXTRA_BODY
from danswer.llm.interfaces import LLM
from danswer.llm.interfaces import LLMConfig
from danswer.llm.interfaces import ToolChoiceOptions
from danswer.server.utils import mask_string
from danswer.utils.logger import setup_logger
from danswer.utils.long_term_log import LongTermLogger
logger = setup_logger()
@@ -44,7 +39,7 @@ logger = setup_logger()
litellm.drop_params = True
litellm.telemetry = False
_LLM_PROMPT_LONG_TERM_LOG_CATEGORY = "llm_prompt"
litellm.set_verbose = LOG_ALL_MODEL_INTERACTIONS
def _base_msg_to_role(msg: BaseMessage) -> str:
@@ -200,23 +195,6 @@ def _convert_delta_to_message_chunk(
raise ValueError(f"Unknown role: {role}")
def _prompt_to_dict(
prompt: LanguageModelInput,
) -> Sequence[str | list[str] | dict[str, Any] | tuple[str, str]]:
# NOTE: this must go first, since it is also a Sequence
if isinstance(prompt, str):
return [_convert_message_to_dict(HumanMessage(content=prompt))]
if isinstance(prompt, (list, Sequence)):
return [
_convert_message_to_dict(msg) if isinstance(msg, BaseMessage) else msg
for msg in prompt
]
if isinstance(prompt, PromptValue):
return [_convert_message_to_dict(message) for message in prompt.to_messages()]
class DefaultMultiLLM(LLM):
"""Uses Litellm library to allow easy configuration to use a multitude of LLMs
See https://python.langchain.com/docs/integrations/chat/litellm"""
@@ -235,8 +213,6 @@ class DefaultMultiLLM(LLM):
temperature: float = GEN_AI_TEMPERATURE,
custom_config: dict[str, str] | None = None,
extra_headers: dict[str, str] | None = None,
extra_body: dict | None = LITELLM_EXTRA_BODY,
long_term_logger: LongTermLogger | None = None,
):
self._timeout = timeout
self._model_provider = model_provider
@@ -247,7 +223,6 @@ class DefaultMultiLLM(LLM):
self._api_base = api_base
self._api_version = api_version
self._custom_llm_provider = custom_llm_provider
self._long_term_logger = long_term_logger
# This can be used to store the maximum output tokens for this model.
# self._max_output_tokens = (
@@ -271,60 +246,12 @@ class DefaultMultiLLM(LLM):
model_kwargs: dict[str, Any] = {}
if extra_headers:
model_kwargs.update({"extra_headers": extra_headers})
if extra_body:
model_kwargs.update({"extra_body": extra_body})
self._model_kwargs = model_kwargs
def log_model_configs(self) -> None:
logger.debug(f"Config: {self.config}")
def _safe_model_config(self) -> dict:
dump = self.config.model_dump()
dump["api_key"] = mask_string(dump.get("api_key", ""))
return dump
def _record_call(self, prompt: LanguageModelInput) -> None:
if self._long_term_logger:
self._long_term_logger.record(
{"prompt": _prompt_to_dict(prompt), "model": self._safe_model_config()},
category=_LLM_PROMPT_LONG_TERM_LOG_CATEGORY,
)
def _record_result(
self, prompt: LanguageModelInput, model_output: BaseMessage
) -> None:
if self._long_term_logger:
self._long_term_logger.record(
{
"prompt": _prompt_to_dict(prompt),
"content": model_output.content,
"tool_calls": (
model_output.tool_calls
if hasattr(model_output, "tool_calls")
else []
),
"model": self._safe_model_config(),
},
category=_LLM_PROMPT_LONG_TERM_LOG_CATEGORY,
)
def _record_error(self, prompt: LanguageModelInput, error: Exception) -> None:
if self._long_term_logger:
self._long_term_logger.record(
{
"prompt": _prompt_to_dict(prompt),
"error": str(error),
"traceback": "".join(
traceback.format_exception(
type(error), error, error.__traceback__
)
),
"model": self._safe_model_config(),
},
category=_LLM_PROMPT_LONG_TERM_LOG_CATEGORY,
)
# def _calculate_max_output_tokens(self, prompt: LanguageModelInput) -> int:
# # NOTE: This method can be used for calculating the maximum tokens for the stream,
# # but it isn't used in practice due to the computational cost of counting tokens
@@ -357,10 +284,14 @@ class DefaultMultiLLM(LLM):
stream: bool,
structured_response_format: dict | None = None,
) -> litellm.ModelResponse | litellm.CustomStreamWrapper:
# litellm doesn't accept LangChain BaseMessage objects, so we need to convert them
# to a dict representation
processed_prompt = _prompt_to_dict(prompt)
self._record_call(processed_prompt)
if isinstance(prompt, list):
prompt = [
_convert_message_to_dict(msg) if isinstance(msg, BaseMessage) else msg
for msg in prompt
]
elif isinstance(prompt, str):
prompt = [_convert_message_to_dict(HumanMessage(content=prompt))]
try:
return litellm.completion(
@@ -373,7 +304,7 @@ class DefaultMultiLLM(LLM):
api_version=self._api_version or None,
custom_llm_provider=self._custom_llm_provider or None,
# actual input
messages=processed_prompt,
messages=prompt,
tools=tools,
tool_choice=tool_choice if tools else None,
# streaming choice
@@ -393,7 +324,6 @@ class DefaultMultiLLM(LLM):
**self._model_kwargs,
)
except Exception as e:
self._record_error(processed_prompt, e)
# for break pointing
raise e
@@ -427,10 +357,7 @@ class DefaultMultiLLM(LLM):
)
choice = response.choices[0]
if hasattr(choice, "message"):
output = _convert_litellm_message_to_langchain_message(choice.message)
if output:
self._record_result(prompt, output)
return output
return _convert_litellm_message_to_langchain_message(choice.message)
else:
raise ValueError("Unexpected response choice type")
@@ -479,9 +406,6 @@ class DefaultMultiLLM(LLM):
"The AI model failed partway through generation, please try again."
)
if output:
self._record_result(prompt, output)
if LOG_DANSWER_MODEL_INTERACTIONS and output:
content = output.content or ""
if isinstance(output, AIMessage):

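A recurring detail in this file is that litellm expects plain message dicts while the surrounding code passes LangChain `BaseMessage` objects or bare strings. The sketch below shows that normalization step in isolation; the toy `Message` type stands in for LangChain messages so the example stays dependency-free.

from dataclasses import dataclass

@dataclass
class Message:
    """Toy stand-in for a LangChain BaseMessage."""
    role: str
    content: str

PromptInput = str | list[Message | dict]

def prompt_to_dicts(prompt: PromptInput) -> list[dict]:
    """Normalize a prompt into the list-of-dicts shape chat-completion APIs expect."""
    if isinstance(prompt, str):
        return [{"role": "user", "content": prompt}]
    converted: list[dict] = []
    for msg in prompt:
        if isinstance(msg, Message):
            converted.append({"role": msg.role, "content": msg.content})
        else:
            converted.append(msg)  # already a dict, pass through untouched
    return converted

if __name__ == "__main__":
    print(prompt_to_dicts("hello"))
    print(prompt_to_dicts([Message("system", "be brief"), {"role": "user", "content": "hi"}]))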
View File

@@ -10,7 +10,6 @@ from danswer.llm.exceptions import GenAIDisabledException
from danswer.llm.interfaces import LLM
from danswer.llm.override_models import LLMOverride
from danswer.utils.headers import build_llm_extra_headers
from danswer.utils.long_term_log import LongTermLogger
def get_main_llm_from_tuple(
@@ -23,7 +22,6 @@ def get_llms_for_persona(
persona: Persona,
llm_override: LLMOverride | None = None,
additional_headers: dict[str, str] | None = None,
long_term_logger: LongTermLogger | None = None,
) -> tuple[LLM, LLM]:
model_provider_override = llm_override.model_provider if llm_override else None
model_version_override = llm_override.model_version if llm_override else None
@@ -34,7 +32,6 @@ def get_llms_for_persona(
return get_default_llms(
temperature=temperature_override or GEN_AI_TEMPERATURE,
additional_headers=additional_headers,
long_term_logger=long_term_logger,
)
with get_session_context_manager() as db_session:
@@ -60,7 +57,6 @@ def get_llms_for_persona(
api_version=llm_provider.api_version,
custom_config=llm_provider.custom_config,
additional_headers=additional_headers,
long_term_logger=long_term_logger,
)
return _create_llm(model), _create_llm(fast_model)
@@ -70,7 +66,6 @@ def get_default_llms(
timeout: int = QA_TIMEOUT,
temperature: float = GEN_AI_TEMPERATURE,
additional_headers: dict[str, str] | None = None,
long_term_logger: LongTermLogger | None = None,
) -> tuple[LLM, LLM]:
if DISABLE_GENERATIVE_AI:
raise GenAIDisabledException()
@@ -102,7 +97,6 @@ def get_default_llms(
timeout=timeout,
temperature=temperature,
additional_headers=additional_headers,
long_term_logger=long_term_logger,
)
return _create_llm(model_name), _create_llm(fast_model_name)
@@ -119,7 +113,6 @@ def get_llm(
temperature: float = GEN_AI_TEMPERATURE,
timeout: int = QA_TIMEOUT,
additional_headers: dict[str, str] | None = None,
long_term_logger: LongTermLogger | None = None,
) -> LLM:
return DefaultMultiLLM(
model_provider=provider,
@@ -132,5 +125,4 @@ def get_llm(
temperature=temperature,
custom_config=custom_config,
extra_headers=build_llm_extra_headers(additional_headers),
long_term_logger=long_term_logger,
)

View File

@@ -64,9 +64,6 @@ from danswer.server.features.prompt.api import basic_router as prompt_router
from danswer.server.features.tool.api import admin_router as admin_tool_router
from danswer.server.features.tool.api import router as tool_router
from danswer.server.gpts.api import router as gpts_router
from danswer.server.long_term_logs.long_term_logs_api import (
router as long_term_logs_router,
)
from danswer.server.manage.administrative import router as admin_router
from danswer.server.manage.embedding.api import admin_router as embedding_admin_router
from danswer.server.manage.embedding.api import basic_router as embedding_router
@@ -279,7 +276,6 @@ def get_application() -> FastAPI:
include_router_with_global_prefix_prepended(
application, get_full_openai_assistants_api_router()
)
include_router_with_global_prefix_prepended(application, long_term_logs_router)
if AUTH_TYPE == AuthType.DISABLED:
# Server logs this during auth setup verification step
@@ -319,7 +315,7 @@ def get_application() -> FastAPI:
tags=["users"],
)
if AUTH_TYPE == AuthType.GOOGLE_OAUTH:
if AUTH_TYPE == AuthType.GOOGLE_OAUTH or AUTH_TYPE == AuthType.CLOUD:
oauth_client = GoogleOAuth2(OAUTH_CLIENT_ID, OAUTH_CLIENT_SECRET)
include_router_with_global_prefix_prepended(
application,

View File

@@ -16,7 +16,7 @@ from danswer.configs.model_configs import (
)
from danswer.configs.model_configs import DOC_EMBEDDING_CONTEXT_SIZE
from danswer.db.models import SearchSettings
from danswer.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from danswer.indexing.indexing_heartbeat import Heartbeat
from danswer.natural_language_processing.utils import get_tokenizer
from danswer.natural_language_processing.utils import tokenizer_trim_content
from danswer.utils.logger import setup_logger
@@ -99,7 +99,7 @@ class EmbeddingModel:
api_url: str | None,
provider_type: EmbeddingProvider | None,
retrim_content: bool = False,
callback: IndexingHeartbeatInterface | None = None,
heartbeat: Heartbeat | None = None,
api_version: str | None = None,
deployment_name: str | None = None,
) -> None:
@@ -116,7 +116,7 @@ class EmbeddingModel:
self.tokenizer = get_tokenizer(
model_name=model_name, provider_type=provider_type
)
self.callback = callback
self.heartbeat = heartbeat
model_server_url = build_model_server_url(server_host, server_port)
self.embed_server_endpoint = f"{model_server_url}/encoder/bi-encoder-embed"
@@ -160,10 +160,6 @@ class EmbeddingModel:
embeddings: list[Embedding] = []
for idx, text_batch in enumerate(text_batches, start=1):
if self.callback:
if self.callback.should_stop():
raise RuntimeError("_batch_encode_texts detected stop signal")
logger.debug(f"Encoding batch {idx} of {len(text_batches)}")
embed_request = EmbedRequest(
model_name=self.model_name,
@@ -183,8 +179,8 @@ class EmbeddingModel:
response = self._make_model_server_request(embed_request)
embeddings.extend(response.embeddings)
if self.callback:
self.callback.progress("_batch_encode_texts", 1)
if self.heartbeat:
self.heartbeat.heartbeat()
return embeddings
def encode(

View File

@@ -89,70 +89,67 @@ def _check_tokenizer_cache(
model_provider: EmbeddingProvider | None, model_name: str | None
) -> BaseTokenizer:
global _TOKENIZER_CACHE
id_tuple = (model_provider, model_name)
if id_tuple not in _TOKENIZER_CACHE:
tokenizer = None
if model_provider in [EmbeddingProvider.OPENAI, EmbeddingProvider.AZURE]:
if model_name is None:
raise ValueError(
"model_name is required for OPENAI and AZURE embeddings"
)
if model_name:
tokenizer = _try_initialize_tokenizer(model_name, model_provider)
_TOKENIZER_CACHE[id_tuple] = TiktokenTokenizer(model_name)
return _TOKENIZER_CACHE[id_tuple]
if not tokenizer:
logger.info(
try:
if model_name is None:
model_name = DOCUMENT_ENCODER_MODEL
logger.debug(f"Initializing HuggingFaceTokenizer for: {model_name}")
_TOKENIZER_CACHE[id_tuple] = HuggingFaceTokenizer(model_name)
except Exception as primary_error:
logger.error(
f"Error initializing HuggingFaceTokenizer for {model_name}: {primary_error}"
)
logger.warning(
f"Falling back to default embedding model: {DOCUMENT_ENCODER_MODEL}"
)
tokenizer = HuggingFaceTokenizer(DOCUMENT_ENCODER_MODEL)
_TOKENIZER_CACHE[id_tuple] = tokenizer
try:
# Cache this tokenizer name to the default so we don't have to try to load it again
# and fail again
_TOKENIZER_CACHE[id_tuple] = HuggingFaceTokenizer(
DOCUMENT_ENCODER_MODEL
)
except Exception as fallback_error:
logger.error(
f"Error initializing fallback HuggingFaceTokenizer: {fallback_error}"
)
raise ValueError(
f"Failed to initialize tokenizer for {model_name} and fallback model"
) from fallback_error
return _TOKENIZER_CACHE[id_tuple]
def _try_initialize_tokenizer(
model_name: str, model_provider: EmbeddingProvider | None
) -> BaseTokenizer | None:
tokenizer: BaseTokenizer | None = None
if model_provider is not None:
# Try using TiktokenTokenizer first if model_provider exists
try:
tokenizer = TiktokenTokenizer(model_name)
logger.info(f"Initialized TiktokenTokenizer for: {model_name}")
return tokenizer
except Exception as tiktoken_error:
logger.debug(
f"TiktokenTokenizer not available for model {model_name}: {tiktoken_error}"
)
else:
# If no provider specified, try HuggingFaceTokenizer
try:
tokenizer = HuggingFaceTokenizer(model_name)
logger.info(f"Initialized HuggingFaceTokenizer for: {model_name}")
return tokenizer
except Exception as hf_error:
logger.warning(
f"Error initializing HuggingFaceTokenizer for {model_name}: {hf_error}"
)
# If both initializations fail, return None
return None
_DEFAULT_TOKENIZER: BaseTokenizer = HuggingFaceTokenizer(DOCUMENT_ENCODER_MODEL)
def get_tokenizer(
model_name: str | None, provider_type: EmbeddingProvider | str | None
) -> BaseTokenizer:
if isinstance(provider_type, str):
try:
provider_type = EmbeddingProvider(provider_type)
except ValueError:
logger.debug(
f"Invalid provider_type '{provider_type}'. Falling back to default tokenizer."
)
return _DEFAULT_TOKENIZER
return _check_tokenizer_cache(provider_type, model_name)
if provider_type is not None:
if isinstance(provider_type, str):
try:
provider_type = EmbeddingProvider(provider_type)
except ValueError:
logger.debug(
f"Invalid provider_type '{provider_type}'. Falling back to default tokenizer."
)
return _DEFAULT_TOKENIZER
return _check_tokenizer_cache(provider_type, model_name)
return _DEFAULT_TOKENIZER
def tokenizer_trim_content(

View File
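Both versions of this file follow the same shape: try a provider-specific tokenizer first, fall back to a default model on failure, and cache the result per (provider, model) pair. A standalone sketch of that try/fallback/cache flow, with a trivial whitespace "tokenizer" standing in for Tiktoken and HuggingFace:

import logging

logger = logging.getLogger(__name__)

DEFAULT_MODEL = "default-encoder"

class WhitespaceTokenizer:
    """Trivial stand-in for the Tiktoken/HuggingFace tokenizer wrappers."""

    def __init__(self, model_name: str) -> None:
        if model_name == "broken-model":
            raise ValueError(f"no tokenizer available for {model_name}")
        self.model_name = model_name

    def tokenize(self, text: str) -> list[str]:
        return text.split()

_TOKENIZER_CACHE: dict[tuple[str | None, str | None], WhitespaceTokenizer] = {}

def get_tokenizer(provider: str | None, model_name: str | None) -> WhitespaceTokenizer:
    key = (provider, model_name)
    if key not in _TOKENIZER_CACHE:
        try:
            _TOKENIZER_CACHE[key] = WhitespaceTokenizer(model_name or DEFAULT_MODEL)
        except Exception as e:
            # fall back to the default model and cache it under the same key,
            # so the broken model is not retried on every call
            logger.warning("Falling back to %s after error: %s", DEFAULT_MODEL, e)
            _TOKENIZER_CACHE[key] = WhitespaceTokenizer(DEFAULT_MODEL)
    return _TOKENIZER_CACHE[key]

if __name__ == "__main__":
    print(get_tokenizer("openai", "broken-model").model_name)  # default-encoder
    print(get_tokenizer("openai", "broken-model") is get_tokenizer("openai", "broken-model"))  # True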

@@ -64,7 +64,6 @@ from danswer.tools.tool_implementations.search.search_tool import (
)
from danswer.tools.tool_runner import ToolCallKickoff
from danswer.utils.logger import setup_logger
from danswer.utils.long_term_log import LongTermLogger
from danswer.utils.timing import log_generator_function_time
from danswer.utils.variable_functionality import fetch_ee_implementation_or_noop
@@ -125,11 +124,6 @@ def stream_answer_objects(
danswerbot_flow=danswerbot_flow,
)
# permanent "log" store, used primarily for debugging
long_term_logger = LongTermLogger(
metadata={"user_id": str(user_id), "chat_session_id": str(chat_session.id)}
)
temporary_persona: Persona | None = None
if query_req.persona_config is not None:
@@ -140,9 +134,7 @@ def stream_answer_objects(
persona = temporary_persona if temporary_persona else chat_session.persona
try:
llm, fast_llm = get_llms_for_persona(
persona=persona, long_term_logger=long_term_logger
)
llm, fast_llm = get_llms_for_persona(persona=persona)
except ValueError as e:
logger.error(
f"Failed to initialize LLMs for persona '{persona.name}': {str(e)}"
@@ -245,9 +237,7 @@ def stream_answer_objects(
question=query_msg.message,
answer_style_config=answer_config,
prompt_config=PromptConfig.from_model(prompt),
llm=get_main_llm_from_tuple(
get_llms_for_persona(persona=persona, long_term_logger=long_term_logger)
),
llm=get_main_llm_from_tuple(get_llms_for_persona(persona=persona)),
single_message_history=history_str,
tools=[search_tool] if search_tool else [],
force_use_tool=(

View File

@@ -1,8 +1,6 @@
import redis
from danswer.redis.redis_connector_delete import RedisConnectorDelete
from danswer.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSync
from danswer.redis.redis_connector_ext_group_sync import RedisConnectorExternalGroupSync
from danswer.redis.redis_connector_index import RedisConnectorIndex
from danswer.redis.redis_connector_prune import RedisConnectorPrune
from danswer.redis.redis_connector_stop import RedisConnectorStop
@@ -21,10 +19,6 @@ class RedisConnector:
self.stop = RedisConnectorStop(tenant_id, id, self.redis)
self.prune = RedisConnectorPrune(tenant_id, id, self.redis)
self.delete = RedisConnectorDelete(tenant_id, id, self.redis)
self.permissions = RedisConnectorPermissionSync(tenant_id, id, self.redis)
self.external_group_sync = RedisConnectorExternalGroupSync(
tenant_id, id, self.redis
)
def new_index(self, search_settings_id: int) -> RedisConnectorIndex:
return RedisConnectorIndex(

View File

@@ -1,10 +1,9 @@
import time
from typing import cast
from uuid import uuid4
import redis
from celery import Celery
from redis import Redis
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
@@ -14,7 +13,6 @@ from danswer.db.connector_credential_pair import get_connector_credential_pair_f
from danswer.db.document import (
construct_document_select_for_connector_credential_pair_by_needs_sync,
)
from danswer.db.models import Document
from danswer.redis.redis_object_helper import RedisObjectHelper
@@ -32,9 +30,6 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
def __init__(self, tenant_id: str | None, id: int) -> None:
super().__init__(tenant_id, str(id))
# documents that should be skipped
self.skip_docs: set[str] = set()
@classmethod
def get_fence_key(cls) -> str:
return RedisConnectorCredentialPair.FENCE_PREFIX
@@ -50,19 +45,14 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
# example: connector_taskset
return f"{self.TASKSET_PREFIX}"
def set_skip_docs(self, skip_docs: set[str]) -> None:
# documents that should be skipped. Note that this class updates
# the list on the fly
self.skip_docs = skip_docs
def generate_tasks(
self,
celery_app: Celery,
db_session: Session,
redis_client: Redis,
lock: RedisLock,
lock: redis.lock.Lock,
tenant_id: str | None,
) -> tuple[int, int] | None:
) -> int | None:
last_lock_time = time.monotonic()
async_results = []
@@ -73,11 +63,7 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
stmt = construct_document_select_for_connector_credential_pair_by_needs_sync(
cc_pair.connector_id, cc_pair.credential_id
)
num_docs = 0
for doc in db_session.scalars(stmt).yield_per(1):
doc = cast(Document, doc)
current_time = time.monotonic()
if current_time - last_lock_time >= (
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
@@ -85,12 +71,6 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
lock.reacquire()
last_lock_time = current_time
num_docs += 1
# check if we should skip the document (typically because it's already syncing)
if doc.id in self.skip_docs:
continue
# celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
# the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
# we prefix the task id so it's easier to keep track of who created the task
@@ -113,6 +93,5 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
)
async_results.append(result)
self.skip_docs.add(doc.id)
return len(async_results), num_docs
return len(async_results)

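Two patterns in the method above are easy to miss: the Redis lock is reacquired every quarter of its timeout so a long scan does not lose it, and (on one side of the diff) documents already queued for sync are skipped via a `skip_docs` set. Here is a hedged sketch of both with a dummy lock object; the timeout constant and names are placeholders.

import time

LOCK_TIMEOUT = 60  # placeholder for CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT

class DummyLock:
    """Stand-in for redis.lock.Lock; the real one extends its TTL on reacquire()."""

    def reacquire(self) -> None:
        pass

def generate_tasks(doc_ids: list[str], skip_docs: set[str], lock: DummyLock) -> int:
    last_lock_time = time.monotonic()
    created = 0
    for doc_id in doc_ids:
        now = time.monotonic()
        if now - last_lock_time >= LOCK_TIMEOUT / 4:
            lock.reacquire()  # keep the beat lock alive during long scans
            last_lock_time = now
        if doc_id in skip_docs:
            continue  # already queued for sync elsewhere
        created += 1  # the real code sends one celery task per document here
        skip_docs.add(doc_id)
    return created

if __name__ == "__main__":
    print(generate_tasks(["a", "b", "a"], set(), DummyLock()))  # 2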
View File

@@ -6,7 +6,6 @@ from uuid import uuid4
import redis
from celery import Celery
from pydantic import BaseModel
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
@@ -14,10 +13,9 @@ from danswer.configs.constants import DanswerCeleryPriority
from danswer.configs.constants import DanswerCeleryQueues
from danswer.db.connector_credential_pair import get_connector_credential_pair_from_id
from danswer.db.document import construct_document_select_for_connector_credential_pair
from danswer.db.models import Document as DbDocument
class RedisConnectorDeletePayload(BaseModel):
class RedisConnectorDeletionFenceData(BaseModel):
num_tasks: int | None
submitted: datetime
@@ -54,18 +52,20 @@ class RedisConnectorDelete:
return False
@property
def payload(self) -> RedisConnectorDeletePayload | None:
def payload(self) -> RedisConnectorDeletionFenceData | None:
# read related data and evaluate/print task progress
fence_bytes = cast(bytes, self.redis.get(self.fence_key))
if fence_bytes is None:
return None
fence_str = fence_bytes.decode("utf-8")
payload = RedisConnectorDeletePayload.model_validate_json(cast(str, fence_str))
payload = RedisConnectorDeletionFenceData.model_validate_json(
cast(str, fence_str)
)
return payload
def set_fence(self, payload: RedisConnectorDeletePayload | None) -> None:
def set_fence(self, payload: RedisConnectorDeletionFenceData | None) -> None:
if not payload:
self.redis.delete(self.fence_key)
return
@@ -83,7 +83,7 @@ class RedisConnectorDelete:
self,
celery_app: Celery,
db_session: Session,
lock: RedisLock,
lock: redis.lock.Lock,
) -> int | None:
"""Returns None if the cc_pair doesn't exist.
Otherwise, returns an int with the number of generated tasks."""
@@ -97,8 +97,7 @@ class RedisConnectorDelete:
stmt = construct_document_select_for_connector_credential_pair(
cc_pair.connector_id, cc_pair.credential_id
)
for doc_temp in db_session.scalars(stmt).yield_per(1):
doc: DbDocument = doc_temp
for doc in db_session.scalars(stmt).yield_per(1):
current_time = time.monotonic()
if current_time - last_lock_time >= (
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
@@ -130,10 +129,6 @@ class RedisConnectorDelete:
return len(async_results)
def reset(self) -> None:
self.redis.delete(self.taskset_key)
self.redis.delete(self.fence_key)
@staticmethod
def remove_from_taskset(id: int, task_id: str, r: redis.Redis) -> None:
taskset_key = f"{RedisConnectorDelete.TASKSET_PREFIX}_{id}"

View File

@@ -1,188 +0,0 @@
import time
from datetime import datetime
from typing import cast
from uuid import uuid4
import redis
from celery import Celery
from pydantic import BaseModel
from redis.lock import Lock as RedisLock
from danswer.access.models import DocExternalAccess
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
from danswer.configs.constants import DanswerCeleryPriority
from danswer.configs.constants import DanswerCeleryQueues
class RedisConnectorPermissionSyncData(BaseModel):
started: datetime | None
class RedisConnectorPermissionSync:
"""Manages interactions with redis for doc permission sync tasks. Should only be accessed
through RedisConnector."""
PREFIX = "connectordocpermissionsync"
FENCE_PREFIX = f"{PREFIX}_fence"
# phase 1 - generator task and progress signals
GENERATORTASK_PREFIX = f"{PREFIX}+generator" # connectorpermissions+generator
GENERATOR_PROGRESS_PREFIX = (
PREFIX + "_generator_progress"
) # connectorpermissions_generator_progress
GENERATOR_COMPLETE_PREFIX = (
PREFIX + "_generator_complete"
) # connectorpermissions_generator_complete
TASKSET_PREFIX = f"{PREFIX}_taskset" # connectorpermissions_taskset
SUBTASK_PREFIX = f"{PREFIX}+sub" # connectorpermissions+sub
def __init__(self, tenant_id: str | None, id: int, redis: redis.Redis) -> None:
self.tenant_id: str | None = tenant_id
self.id = id
self.redis = redis
self.fence_key: str = f"{self.FENCE_PREFIX}_{id}"
self.generator_task_key = f"{self.GENERATORTASK_PREFIX}_{id}"
self.generator_progress_key = f"{self.GENERATOR_PROGRESS_PREFIX}_{id}"
self.generator_complete_key = f"{self.GENERATOR_COMPLETE_PREFIX}_{id}"
self.taskset_key = f"{self.TASKSET_PREFIX}_{id}"
self.subtask_prefix: str = f"{self.SUBTASK_PREFIX}_{id}"
def taskset_clear(self) -> None:
self.redis.delete(self.taskset_key)
def generator_clear(self) -> None:
self.redis.delete(self.generator_progress_key)
self.redis.delete(self.generator_complete_key)
def get_remaining(self) -> int:
remaining = cast(int, self.redis.scard(self.taskset_key))
return remaining
def get_active_task_count(self) -> int:
"""Count of active permission sync tasks"""
count = 0
for _ in self.redis.scan_iter(RedisConnectorPermissionSync.FENCE_PREFIX + "*"):
count += 1
return count
@property
def fenced(self) -> bool:
if self.redis.exists(self.fence_key):
return True
return False
@property
def payload(self) -> RedisConnectorPermissionSyncData | None:
# read related data and evaluate/print task progress
fence_bytes = cast(bytes, self.redis.get(self.fence_key))
if fence_bytes is None:
return None
fence_str = fence_bytes.decode("utf-8")
payload = RedisConnectorPermissionSyncData.model_validate_json(
cast(str, fence_str)
)
return payload
def set_fence(
self,
payload: RedisConnectorPermissionSyncData | None,
) -> None:
if not payload:
self.redis.delete(self.fence_key)
return
self.redis.set(self.fence_key, payload.model_dump_json())
@property
def generator_complete(self) -> int | None:
"""the fence payload is an int representing the starting number of
permission sync tasks to be processed ... just after the generator completes."""
fence_bytes = self.redis.get(self.generator_complete_key)
if fence_bytes is None:
return None
if fence_bytes == b"None":
return None
fence_int = int(cast(bytes, fence_bytes).decode())
return fence_int
@generator_complete.setter
def generator_complete(self, payload: int | None) -> None:
"""Set the payload to an int to set the fence, otherwise if None it will
be deleted"""
if payload is None:
self.redis.delete(self.generator_complete_key)
return
self.redis.set(self.generator_complete_key, payload)
def generate_tasks(
self,
celery_app: Celery,
lock: RedisLock | None,
new_permissions: list[DocExternalAccess],
source_string: str,
) -> int | None:
last_lock_time = time.monotonic()
async_results = []
# Create a task for each document permission sync
for doc_perm in new_permissions:
current_time = time.monotonic()
if lock and current_time - last_lock_time >= (
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
):
lock.reacquire()
last_lock_time = current_time
# Add task for document permissions sync
custom_task_id = f"{self.subtask_prefix}_{uuid4()}"
self.redis.sadd(self.taskset_key, custom_task_id)
result = celery_app.send_task(
"update_external_document_permissions_task",
kwargs=dict(
tenant_id=self.tenant_id,
serialized_doc_external_access=doc_perm.to_dict(),
source_string=source_string,
),
queue=DanswerCeleryQueues.DOC_PERMISSIONS_UPSERT,
task_id=custom_task_id,
priority=DanswerCeleryPriority.MEDIUM,
)
async_results.append(result)
return len(async_results)
@staticmethod
def remove_from_taskset(id: int, task_id: str, r: redis.Redis) -> None:
taskset_key = f"{RedisConnectorPermissionSync.TASKSET_PREFIX}_{id}"
r.srem(taskset_key, task_id)
return
@staticmethod
def reset_all(r: redis.Redis) -> None:
"""Deletes all redis values for all connectors"""
for key in r.scan_iter(RedisConnectorPermissionSync.TASKSET_PREFIX + "*"):
r.delete(key)
for key in r.scan_iter(
RedisConnectorPermissionSync.GENERATOR_COMPLETE_PREFIX + "*"
):
r.delete(key)
for key in r.scan_iter(
RedisConnectorPermissionSync.GENERATOR_PROGRESS_PREFIX + "*"
):
r.delete(key)
for key in r.scan_iter(RedisConnectorPermissionSync.FENCE_PREFIX + "*"):
r.delete(key)

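The "fence" used throughout these Redis helpers is a key whose presence marks a sync as in flight and whose value is a small JSON payload. Below is a minimal sketch of that set/read round-trip using pydantic, with a plain in-memory stand-in for the Redis client; the key name is illustrative.

from datetime import datetime, timezone

from pydantic import BaseModel

class FencePayload(BaseModel):
    started: datetime | None

class FakeRedis:
    """Tiny stand-in for redis.Redis supporting just get/set/delete."""

    def __init__(self) -> None:
        self.store: dict[str, bytes] = {}

    def set(self, key: str, value: str) -> None:
        self.store[key] = value.encode("utf-8")

    def get(self, key: str) -> bytes | None:
        return self.store.get(key)

    def delete(self, key: str) -> None:
        self.store.pop(key, None)

def set_fence(r: FakeRedis, key: str, payload: FencePayload | None) -> None:
    if payload is None:
        r.delete(key)  # clearing the fence marks the sync as finished
        return
    r.set(key, payload.model_dump_json())

def read_fence(r: FakeRedis, key: str) -> FencePayload | None:
    raw = r.get(key)
    if raw is None:
        return None
    return FencePayload.model_validate_json(raw.decode("utf-8"))

if __name__ == "__main__":
    r = FakeRedis()
    set_fence(r, "docpermissionsync_fence_1", FencePayload(started=datetime.now(timezone.utc)))
    print(read_fence(r, "docpermissionsync_fence_1"))
    set_fence(r, "docpermissionsync_fence_1", None)
    print(read_fence(r, "docpermissionsync_fence_1"))  # None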
View File

@@ -1,134 +0,0 @@
from typing import cast
import redis
from celery import Celery
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
class RedisConnectorExternalGroupSync:
"""Manages interactions with redis for external group syncing tasks. Should only be accessed
through RedisConnector."""
PREFIX = "connectorexternalgroupsync"
FENCE_PREFIX = f"{PREFIX}_fence"
# phase 1 - generator task and progress signals
GENERATORTASK_PREFIX = f"{PREFIX}+generator" # connectorexternalgroupsync+generator
GENERATOR_PROGRESS_PREFIX = (
PREFIX + "_generator_progress"
) # connectorexternalgroupsync_generator_progress
GENERATOR_COMPLETE_PREFIX = (
PREFIX + "_generator_complete"
) # connectorexternalgroupsync_generator_complete
TASKSET_PREFIX = f"{PREFIX}_taskset" # connectorexternalgroupsync_taskset
SUBTASK_PREFIX = f"{PREFIX}+sub" # connectorexternalgroupsync+sub
def __init__(self, tenant_id: str | None, id: int, redis: redis.Redis) -> None:
self.tenant_id: str | None = tenant_id
self.id = id
self.redis = redis
self.fence_key: str = f"{self.FENCE_PREFIX}_{id}"
self.generator_task_key = f"{self.GENERATORTASK_PREFIX}_{id}"
self.generator_progress_key = f"{self.GENERATOR_PROGRESS_PREFIX}_{id}"
self.generator_complete_key = f"{self.GENERATOR_COMPLETE_PREFIX}_{id}"
self.taskset_key = f"{self.TASKSET_PREFIX}_{id}"
self.subtask_prefix: str = f"{self.SUBTASK_PREFIX}_{id}"
def taskset_clear(self) -> None:
self.redis.delete(self.taskset_key)
def generator_clear(self) -> None:
self.redis.delete(self.generator_progress_key)
self.redis.delete(self.generator_complete_key)
def get_remaining(self) -> int:
# todo: move into fence
remaining = cast(int, self.redis.scard(self.taskset_key))
return remaining
def get_active_task_count(self) -> int:
"""Count of active external group syncing tasks"""
count = 0
for _ in self.redis.scan_iter(
RedisConnectorExternalGroupSync.FENCE_PREFIX + "*"
):
count += 1
return count
@property
def fenced(self) -> bool:
if self.redis.exists(self.fence_key):
return True
return False
def set_fence(self, value: bool) -> None:
if not value:
self.redis.delete(self.fence_key)
return
self.redis.set(self.fence_key, 0)
@property
def generator_complete(self) -> int | None:
"""the fence payload is an int representing the starting number of
external group syncing tasks to be processed ... just after the generator completes.
"""
fence_bytes = self.redis.get(self.generator_complete_key)
if fence_bytes is None:
return None
if fence_bytes == b"None":
return None
fence_int = int(cast(bytes, fence_bytes).decode())
return fence_int
@generator_complete.setter
def generator_complete(self, payload: int | None) -> None:
"""Set the payload to an int to set the fence, otherwise if None it will
be deleted"""
if payload is None:
self.redis.delete(self.generator_complete_key)
return
self.redis.set(self.generator_complete_key, payload)
def generate_tasks(
self,
celery_app: Celery,
db_session: Session,
lock: RedisLock | None,
) -> int | None:
pass
@staticmethod
def remove_from_taskset(id: int, task_id: str, r: redis.Redis) -> None:
taskset_key = f"{RedisConnectorExternalGroupSync.TASKSET_PREFIX}_{id}"
r.srem(taskset_key, task_id)
return
@staticmethod
def reset_all(r: redis.Redis) -> None:
"""Deletes all redis values for all connectors"""
for key in r.scan_iter(RedisConnectorExternalGroupSync.TASKSET_PREFIX + "*"):
r.delete(key)
for key in r.scan_iter(
RedisConnectorExternalGroupSync.GENERATOR_COMPLETE_PREFIX + "*"
):
r.delete(key)
for key in r.scan_iter(
RedisConnectorExternalGroupSync.GENERATOR_PROGRESS_PREFIX + "*"
):
r.delete(key)
for key in r.scan_iter(RedisConnectorExternalGroupSync.FENCE_PREFIX + "*"):
r.delete(key)

View File

@@ -6,7 +6,7 @@ import redis
from pydantic import BaseModel
class RedisConnectorIndexPayload(BaseModel):
class RedisConnectorIndexingFenceData(BaseModel):
index_attempt_id: int | None
started: datetime | None
submitted: datetime
@@ -71,20 +71,22 @@ class RedisConnectorIndex:
return False
@property
def payload(self) -> RedisConnectorIndexPayload | None:
def payload(self) -> RedisConnectorIndexingFenceData | None:
# read related data and evaluate/print task progress
fence_bytes = cast(bytes, self.redis.get(self.fence_key))
if fence_bytes is None:
return None
fence_str = fence_bytes.decode("utf-8")
payload = RedisConnectorIndexPayload.model_validate_json(cast(str, fence_str))
payload = RedisConnectorIndexingFenceData.model_validate_json(
cast(str, fence_str)
)
return payload
def set_fence(
self,
payload: RedisConnectorIndexPayload | None,
payload: RedisConnectorIndexingFenceData | None,
) -> None:
if not payload:
self.redis.delete(self.fence_key)

Some files were not shown because too many files have changed in this diff.