Compare commits

..

7 Commits

Author SHA1 Message Date
pablonyx
b8bc300ab8 update 2025-04-02 10:19:27 -07:00
rkuo-danswer
8a8526dbbb harden join function (#4424)
* harden join function

* remove log spam

* use time.monotonic

* add pid logging

* client only celery app

---------

Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
2025-04-02 01:04:00 -07:00
Weves
be20586ba1 Add retries for confluence calls 2025-04-01 23:00:37 -07:00
Weves
a314462d1e Fix migrations 2025-04-01 21:48:32 -07:00
rkuo-danswer
155f53c3d7 Revert "Add user invitation test (#4161)" (#4422)
This reverts commit 806de92feb.

Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
2025-04-01 19:55:04 -07:00
pablonyx
7c027df186 Fix cc pair doc deletion (#4420) 2025-04-01 18:44:15 -07:00
pablonyx
0a5db96026 update (#4415) 2025-04-02 00:42:42 +00:00
20 changed files with 154 additions and 119 deletions

View File

@@ -0,0 +1,50 @@
"""add prompt length limit
Revision ID: f71470ba9274
Revises: 6a804aeb4830
Create Date: 2025-04-01 15:07:14.977435
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "f71470ba9274"
down_revision = "6a804aeb4830"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.alter_column(
"prompt",
"system_prompt",
existing_type=sa.TEXT(),
type_=sa.String(length=8000),
existing_nullable=False,
)
op.alter_column(
"prompt",
"task_prompt",
existing_type=sa.TEXT(),
type_=sa.String(length=8000),
existing_nullable=False,
)
def downgrade() -> None:
op.alter_column(
"prompt",
"system_prompt",
existing_type=sa.String(length=8000),
type_=sa.TEXT(),
existing_nullable=False,
)
op.alter_column(
"prompt",
"task_prompt",
existing_type=sa.String(length=8000),
type_=sa.TEXT(),
existing_nullable=False,
)

View File

@@ -1,7 +1,7 @@
"""updated constraints for ccpairs
Revision ID: f7505c5b0284
Revises: 6a804aeb4830
Revises: f71470ba9274
Create Date: 2025-04-01 17:50:42.504818
"""
@@ -10,7 +10,7 @@ from alembic import op
# revision identifiers, used by Alembic.
revision = "f7505c5b0284"
down_revision = "6a804aeb4830"
down_revision = "f71470ba9274"
branch_labels = None
depends_on = None

View File

@@ -1,5 +1,6 @@
import logging
import multiprocessing
import os
import time
from typing import Any
from typing import cast
@@ -305,7 +306,7 @@ def wait_for_db(sender: Any, **kwargs: Any) -> None:
def on_secondary_worker_init(sender: Any, **kwargs: Any) -> None:
logger.info("Running as a secondary celery worker.")
logger.info(f"Running as a secondary celery worker: pid={os.getpid()}")
# Set up variables for waiting on primary worker
WAIT_INTERVAL = 5

View File

@@ -0,0 +1,7 @@
from celery import Celery
import onyx.background.celery.apps.app_base as app_base
celery_app = Celery(__name__)
celery_app.config_from_object("onyx.background.celery.configs.client")
celery_app.Task = app_base.TenantAwareTask # type: ignore [misc]

View File

@@ -1,4 +1,5 @@
import logging
import os
from typing import Any
from typing import cast
@@ -95,7 +96,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
app_base.wait_for_db(sender, **kwargs)
app_base.wait_for_vespa_or_shutdown(sender, **kwargs)
logger.info("Running as the primary celery worker.")
logger.info(f"Running as the primary celery worker: pid={os.getpid()}")
# Less startup checks in multi-tenant case
if MULTI_TENANT:

View File

@@ -0,0 +1,16 @@
import onyx.background.celery.configs.base as shared_config
broker_url = shared_config.broker_url
broker_connection_retry_on_startup = shared_config.broker_connection_retry_on_startup
broker_pool_limit = shared_config.broker_pool_limit
broker_transport_options = shared_config.broker_transport_options
redis_socket_keepalive = shared_config.redis_socket_keepalive
redis_retry_on_timeout = shared_config.redis_retry_on_timeout
redis_backend_health_check_interval = shared_config.redis_backend_health_check_interval
result_backend = shared_config.result_backend
result_expires = shared_config.result_expires # 86400 seconds is the default
task_default_priority = shared_config.task_default_priority
task_acks_late = shared_config.task_acks_late

View File

@@ -0,0 +1,20 @@
"""Factory stub for running celery worker / celery beat.
This code is different from the primary/beat stubs because there is no EE version to
fetch. Port over the code in those files if we add an EE version of this worker.
This is an app stub purely for sending tasks as a client.
"""
from celery import Celery
from onyx.utils.variable_functionality import set_is_ee_based_on_env_variable
set_is_ee_based_on_env_variable()
def get_app() -> Celery:
from onyx.background.celery.apps.client import celery_app
return celery_app
app = get_app()

View File

@@ -13,6 +13,7 @@ from typing import TYPE_CHECKING
from typing import TypeVar
from urllib.parse import parse_qs
from urllib.parse import quote
from urllib.parse import urljoin
from urllib.parse import urlparse
import requests
@@ -342,9 +343,14 @@ def build_confluence_document_id(
Returns:
str: The document id
"""
if is_cloud and not base_url.endswith("/wiki"):
base_url += "/wiki"
return f"{base_url}{content_url}"
# NOTE: urljoin is tricky and will drop the last segment of the base if it doesn't
# end with "/" because it believes that makes it a file.
final_url = base_url.rstrip("/") + "/"
if is_cloud and not final_url.endswith("/wiki/"):
final_url = urljoin(final_url, "wiki") + "/"
final_url = urljoin(final_url, content_url.lstrip("/"))
return final_url
def datetime_from_string(datetime_string: str) -> datetime:
@@ -454,6 +460,19 @@ def _handle_http_error(e: requests.HTTPError, attempt: int) -> int:
logger.warning("HTTPError with `None` as response or as headers")
raise e
# Confluence Server returns 403 when rate limited
if e.response.status_code == 403:
FORBIDDEN_MAX_RETRY_ATTEMPTS = 7
FORBIDDEN_RETRY_DELAY = 10
if attempt < FORBIDDEN_MAX_RETRY_ATTEMPTS:
logger.warning(
"403 error. This sometimes happens when we hit "
f"Confluence rate limits. Retrying in {FORBIDDEN_RETRY_DELAY} seconds..."
)
return FORBIDDEN_RETRY_DELAY
raise e
if (
e.response.status_code != 429
and RATE_LIMIT_MESSAGE_LOWERCASE not in e.response.text.lower()

View File

@@ -1658,8 +1658,8 @@ class Prompt(Base):
)
name: Mapped[str] = mapped_column(String)
description: Mapped[str] = mapped_column(String)
system_prompt: Mapped[str] = mapped_column(Text)
task_prompt: Mapped[str] = mapped_column(Text)
system_prompt: Mapped[str] = mapped_column(String(length=8000))
task_prompt: Mapped[str] = mapped_column(String(length=8000))
include_citations: Mapped[bool] = mapped_column(Boolean, default=True)
datetime_aware: Mapped[bool] = mapped_column(Boolean, default=True)
# Default prompts are configured via backend during deployment

View File

@@ -602,7 +602,7 @@ def get_max_input_tokens(
)
if input_toks <= 0:
raise RuntimeError("No tokens for input for the LLM given settings")
return GEN_AI_MODEL_FALLBACK_MAX_TOKENS
return input_toks

View File

@@ -21,7 +21,7 @@ from onyx.background.celery.tasks.external_group_syncing.tasks import (
from onyx.background.celery.tasks.pruning.tasks import (
try_creating_prune_generator_task,
)
from onyx.background.celery.versioned_apps.primary import app as primary_app
from onyx.background.celery.versioned_apps.client import app as client_app
from onyx.background.indexing.models import IndexAttemptErrorPydantic
from onyx.configs.constants import OnyxCeleryPriority
from onyx.configs.constants import OnyxCeleryTask
@@ -219,7 +219,7 @@ def update_cc_pair_status(
continue
# Revoke the task to prevent it from running
primary_app.control.revoke(index_payload.celery_task_id)
client_app.control.revoke(index_payload.celery_task_id)
# If it is running, then signaling for termination will get the
# watchdog thread to kill the spawned task
@@ -238,7 +238,7 @@ def update_cc_pair_status(
db_session.commit()
# this speeds up the start of indexing by firing the check immediately
primary_app.send_task(
client_app.send_task(
OnyxCeleryTask.CHECK_FOR_INDEXING,
kwargs=dict(tenant_id=tenant_id),
priority=OnyxCeleryPriority.HIGH,
@@ -376,7 +376,7 @@ def prune_cc_pair(
f"{cc_pair.connector.name} connector."
)
payload_id = try_creating_prune_generator_task(
primary_app, cc_pair, db_session, r, tenant_id
client_app, cc_pair, db_session, r, tenant_id
)
if not payload_id:
raise HTTPException(
@@ -450,7 +450,7 @@ def sync_cc_pair(
f"{cc_pair.connector.name} connector."
)
payload_id = try_creating_permissions_sync_task(
primary_app, cc_pair_id, r, tenant_id
client_app, cc_pair_id, r, tenant_id
)
if not payload_id:
raise HTTPException(
@@ -524,7 +524,7 @@ def sync_cc_pair_groups(
f"{cc_pair.connector.name} connector."
)
payload_id = try_creating_external_group_sync_task(
primary_app, cc_pair_id, r, tenant_id
client_app, cc_pair_id, r, tenant_id
)
if not payload_id:
raise HTTPException(
@@ -634,7 +634,7 @@ def associate_credential_to_connector(
)
# trigger indexing immediately
primary_app.send_task(
client_app.send_task(
OnyxCeleryTask.CHECK_FOR_INDEXING,
priority=OnyxCeleryPriority.HIGH,
kwargs={"tenant_id": tenant_id},

View File

@@ -20,7 +20,7 @@ from onyx.auth.users import current_admin_user
from onyx.auth.users import current_chat_accessible_user
from onyx.auth.users import current_curator_or_admin_user
from onyx.auth.users import current_user
from onyx.background.celery.versioned_apps.primary import app as primary_app
from onyx.background.celery.versioned_apps.client import app as client_app
from onyx.configs.app_configs import ENABLED_CONNECTOR_TYPES
from onyx.configs.app_configs import MOCK_CONNECTOR_FILE_PATH
from onyx.configs.constants import DocumentSource
@@ -928,7 +928,7 @@ def create_connector_with_mock_credential(
)
# trigger indexing immediately
primary_app.send_task(
client_app.send_task(
OnyxCeleryTask.CHECK_FOR_INDEXING,
priority=OnyxCeleryPriority.HIGH,
kwargs={"tenant_id": tenant_id},
@@ -1314,7 +1314,7 @@ def trigger_indexing_for_cc_pair(
# run the beat task to pick up the triggers immediately
priority = OnyxCeleryPriority.HIGHEST if is_user_file else OnyxCeleryPriority.HIGH
logger.info(f"Sending indexing check task with priority {priority}")
primary_app.send_task(
client_app.send_task(
OnyxCeleryTask.CHECK_FOR_INDEXING,
priority=priority,
kwargs={"tenant_id": tenant_id},

View File

@@ -6,7 +6,7 @@ from sqlalchemy.orm import Session
from onyx.auth.users import current_curator_or_admin_user
from onyx.auth.users import current_user
from onyx.background.celery.versioned_apps.primary import app as primary_app
from onyx.background.celery.versioned_apps.client import app as client_app
from onyx.configs.constants import OnyxCeleryPriority
from onyx.configs.constants import OnyxCeleryTask
from onyx.db.document_set import check_document_sets_are_public
@@ -52,7 +52,7 @@ def create_document_set(
except Exception as e:
raise HTTPException(status_code=400, detail=str(e))
primary_app.send_task(
client_app.send_task(
OnyxCeleryTask.CHECK_FOR_VESPA_SYNC_TASK,
kwargs={"tenant_id": tenant_id},
priority=OnyxCeleryPriority.HIGH,
@@ -85,7 +85,7 @@ def patch_document_set(
except Exception as e:
raise HTTPException(status_code=400, detail=str(e))
primary_app.send_task(
client_app.send_task(
OnyxCeleryTask.CHECK_FOR_VESPA_SYNC_TASK,
kwargs={"tenant_id": tenant_id},
priority=OnyxCeleryPriority.HIGH,
@@ -108,7 +108,7 @@ def delete_document_set(
except Exception as e:
raise HTTPException(status_code=400, detail=str(e))
primary_app.send_task(
client_app.send_task(
OnyxCeleryTask.CHECK_FOR_VESPA_SYNC_TASK,
kwargs={"tenant_id": tenant_id},
priority=OnyxCeleryPriority.HIGH,

View File

@@ -10,7 +10,7 @@ from sqlalchemy.orm import Session
from onyx.auth.users import current_admin_user
from onyx.auth.users import current_curator_or_admin_user
from onyx.background.celery.versioned_apps.primary import app as primary_app
from onyx.background.celery.versioned_apps.client import app as client_app
from onyx.configs.app_configs import GENERATIVE_MODEL_ACCESS_CHECK_FREQ
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import KV_GEN_AI_KEY_CHECK_TIME
@@ -192,7 +192,7 @@ def create_deletion_attempt_for_connector_id(
db_session.commit()
# run the beat task to pick up this deletion from the db immediately
primary_app.send_task(
client_app.send_task(
OnyxCeleryTask.CHECK_FOR_CONNECTOR_DELETION,
priority=OnyxCeleryPriority.HIGH,
kwargs={"tenant_id": tenant_id},

View File

@@ -165,17 +165,18 @@ class DocumentManager:
doc["fields"]["document_id"]: doc["fields"] for doc in retrieved_docs_dict
}
# NOTE(rkuo): too much log spam
# Left this here for debugging purposes.
import json
# import json
print("DEBUGGING DOCUMENTS")
print(retrieved_docs)
for doc in retrieved_docs.values():
printable_doc = doc.copy()
print(printable_doc.keys())
printable_doc.pop("embeddings")
printable_doc.pop("title_embedding")
print(json.dumps(printable_doc, indent=2))
# print("DEBUGGING DOCUMENTS")
# print(retrieved_docs)
# for doc in retrieved_docs.values():
# printable_doc = doc.copy()
# print(printable_doc.keys())
# printable_doc.pop("embeddings")
# printable_doc.pop("title_embedding")
# print(json.dumps(printable_doc, indent=2))
for document in cc_pair.documents:
retrieved_doc = retrieved_docs.get(document.id)

View File

@@ -1,3 +1,4 @@
import time
from datetime import datetime
from datetime import timedelta
from urllib.parse import urlencode
@@ -191,7 +192,7 @@ class IndexAttemptManager:
user_performing_action: DATestUser | None = None,
) -> None:
"""Wait for an IndexAttempt to complete"""
start = datetime.now()
start = time.monotonic()
while True:
index_attempt = IndexAttemptManager.get_index_attempt_by_id(
index_attempt_id=index_attempt_id,
@@ -203,7 +204,7 @@ class IndexAttemptManager:
print(f"IndexAttempt {index_attempt_id} completed")
return
elapsed = (datetime.now() - start).total_seconds()
elapsed = time.monotonic() - start
if elapsed > timeout:
raise TimeoutError(
f"IndexAttempt {index_attempt_id} did not complete within {timeout} seconds"

View File

@@ -313,29 +313,3 @@ class UserManager:
)
response.raise_for_status()
return UserInfo(**response.json())
@staticmethod
def invite_users(
user_performing_action: DATestUser,
emails: list[str],
) -> int:
response = requests.put(
url=f"{API_SERVER_URL}/manage/admin/users",
json={"emails": emails},
headers=user_performing_action.headers,
)
response.raise_for_status()
return response.json()
@staticmethod
def remove_invited_user(
user_performing_action: DATestUser,
user_email: str,
) -> int:
response = requests.patch(
url=f"{API_SERVER_URL}/manage/admin/remove-invited-user",
json={"user_email": user_email},
headers=user_performing_action.headers,
)
response.raise_for_status()
return response.json()

View File

@@ -22,7 +22,6 @@ from onyx.document_index.document_index_utils import get_multipass_config
from onyx.document_index.vespa.index import DOCUMENT_ID_ENDPOINT
from onyx.document_index.vespa.index import VespaIndex
from onyx.indexing.models import IndexingSetting
from onyx.redis.redis_pool import get_redis_client
from onyx.setup import setup_postgres
from onyx.setup import setup_vespa
from onyx.utils.logger import setup_logger
@@ -238,12 +237,6 @@ def reset_vespa() -> None:
time.sleep(5)
def reset_redis() -> None:
"""Reset the Redis database."""
redis_client = get_redis_client()
redis_client.flushall()
def reset_postgres_multitenant() -> None:
"""Reset the Postgres database for all tenants in a multitenant setup."""
@@ -348,8 +341,6 @@ def reset_all() -> None:
reset_postgres()
logger.info("Resetting Vespa...")
reset_vespa()
logger.info("Resetting Redis...")
reset_redis()
def reset_all_multitenant() -> None:

View File

@@ -1,38 +0,0 @@
import pytest
from requests import HTTPError
from onyx.auth.schemas import UserRole
from tests.integration.common_utils.managers.user import UserManager
from tests.integration.common_utils.test_models import DATestUser
def test_inviting_users_flow(reset: None) -> None:
"""
Test that verifies the functionality around inviting users:
1. Creating an admin user
2. Admin inviting a new user
3. Invited user successfully signing in
4. Non-invited user attempting to sign in (should result in an error)
"""
# 1) Create an admin user (the first user created is automatically admin)
admin_user: DATestUser = UserManager.create(name="admin_user")
assert admin_user is not None
assert UserManager.is_role(admin_user, UserRole.ADMIN)
# 2) Admin invites a new user
invited_email = "invited_user@test.com"
invite_response = UserManager.invite_users(admin_user, [invited_email])
assert invite_response == 1
# 3) The invited user successfully registers/logs in
invited_user: DATestUser = UserManager.create(
name="invited_user", email=invited_email
)
assert invited_user is not None
assert invited_user.email == invited_email
assert UserManager.is_role(invited_user, UserRole.BASIC)
# 4) A non-invited user attempts to sign in/register (should fail)
with pytest.raises(HTTPError):
UserManager.create(name="uninvited_user", email="uninvited_user@test.com")

View File

@@ -242,15 +242,7 @@ export function AssistantEditor({
enabledToolsMap[tool.id] = personaCurrentToolIds.includes(tool.id);
});
const {
selectedFiles,
selectedFolders,
addSelectedFile,
removeSelectedFile,
addSelectedFolder,
removeSelectedFolder,
clearSelectedItems,
} = useDocumentsContext();
const { selectedFiles, selectedFolders } = useDocumentsContext();
const [showVisibilityWarning, setShowVisibilityWarning] = useState(false);