Compare commits

...

10 Commits

Author SHA1 Message Date
rkuo-danswer
9456fef307 Merge pull request #3161 from danswer-ai/hotfix/v0.13-indexing-redux
enhanced logging for indexing and increased indexing timeouts
2024-11-18 19:16:39 -08:00
Richard Kuo (Danswer)
cc3c0800f0 no idea how those files got into the merge 2024-11-18 18:38:29 -08:00
Richard Kuo (Danswer)
e860f15b64 hotfix merge 2024-11-18 18:14:21 -08:00
rkuo-danswer
574ef470a4 Merge pull request #3149 from danswer-ai/hotfix/v0.13-overlapping-connectors
merge overlapping connector hotfix
2024-11-16 22:34:02 -08:00
Richard Kuo
9e391495c2 fix unused stuff for hotfix 2024-11-16 21:11:39 -08:00
Richard Kuo
e26d5430fa merge overlapping connector hotfix 2024-11-16 20:59:00 -08:00
rkuo-danswer
cce0ec2f22 Merge pull request #3141 from danswer-ai/hotfix/v0.13-indexing-concurrency
Merge hotfix/v0.13-indexing-concurrency into release/v0.13
2024-11-15 12:51:41 -08:00
rkuo-danswer
a4f09a62a5 Merge pull request #3142 from danswer-ai/hotfix/v0.13-session-text
Merge hotfix/v0.13-session-text into release/v0.13
2024-11-15 12:51:23 -08:00
rkuo-danswer
fd2428d97f Merge pull request #3131 from danswer-ai/bugfix/session_text
use text()
2024-11-15 20:23:18 +00:00
rkuo-danswer
cfc46812c8 scale indexing sql pool based on concurrency (#3130) 2024-11-15 20:21:43 +00:00
18 changed files with 340 additions and 157 deletions

View File

@@ -59,7 +59,7 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
logger.info(f"Multiprocessing start method: {multiprocessing.get_start_method()}")
SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME)
SqlEngine.init_engine(pool_size=8, max_overflow=0)
SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=sender.concurrency)
# Startup checks are not needed in multi-tenant case
if MULTI_TENANT:
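This hunk replaces the hard-coded pool of 8 connections with one sized to the worker's concurrency, so every indexing subprocess can hold a database connection without queuing on the pool. A minimal standalone sketch of the idea, assuming the handler hangs off Celery's worker_init signal and using a placeholder DSN (the real project routes this through SqlEngine.init_engine):

from typing import Any

from celery import Celery
from celery.signals import worker_init
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine

celery_app = Celery("indexing")
_engine: Engine | None = None


@worker_init.connect
def on_worker_init(sender: Any, **kwargs: Any) -> None:
    # sender is the Worker instance, so sender.concurrency reflects the
    # -c/--concurrency value the worker was started with; sizing the pool to it
    # lets every child hold a connection without waiting on the pool
    global _engine
    _engine = create_engine(
        "postgresql+psycopg2://user:pass@localhost:5432/danswer",  # placeholder DSN
        pool_size=sender.concurrency,
        max_overflow=sender.concurrency,  # allow bursts up to 2x concurrency
    )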

View File

@@ -14,10 +14,14 @@ from celery.signals import worker_shutdown
import danswer.background.celery.apps.app_base as app_base
from danswer.background.celery.apps.app_base import task_logger
from danswer.background.celery.celery_utils import celery_is_worker_primary
from danswer.background.celery.tasks.vespa.tasks import get_unfenced_index_attempt_ids
from danswer.configs.constants import CELERY_PRIMARY_WORKER_LOCK_TIMEOUT
from danswer.configs.constants import DanswerRedisLocks
from danswer.configs.constants import POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME
from danswer.db.engine import get_session_with_default_tenant
from danswer.db.engine import SqlEngine
from danswer.db.index_attempt import get_index_attempt
from danswer.db.index_attempt import mark_attempt_failed
from danswer.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
from danswer.redis.redis_connector_delete import RedisConnectorDelete
from danswer.redis.redis_connector_index import RedisConnectorIndex
@@ -134,6 +138,23 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
RedisConnectorStop.reset_all(r)
# mark orphaned index attempts as failed
with get_session_with_default_tenant() as db_session:
unfenced_attempt_ids = get_unfenced_index_attempt_ids(db_session, r)
for attempt_id in unfenced_attempt_ids:
attempt = get_index_attempt(db_session, attempt_id)
if not attempt:
continue
failure_reason = (
f"Orphaned index attempt found on startup: "
f"index_attempt={attempt.id} "
f"cc_pair={attempt.connector_credential_pair_id} "
f"search_settings={attempt.search_settings_id}"
)
logger.warning(failure_reason)
mark_attempt_failed(attempt.id, db_session, failure_reason)
@worker_ready.connect
def on_worker_ready(sender: Any, **kwargs: Any) -> None:

View File

@@ -1,12 +1,12 @@
from datetime import datetime
from datetime import timezone
import redis
from celery import Celery
from celery import shared_task
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from redis import Redis
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.background.celery.apps.app_base import task_logger
@@ -87,7 +87,7 @@ def try_generate_document_cc_pair_cleanup_tasks(
cc_pair_id: int,
db_session: Session,
r: Redis,
lock_beat: redis.lock.Lock,
lock_beat: RedisLock,
tenant_id: str | None,
) -> int | None:
"""Returns an int if syncing is needed. The int represents the number of sync tasks generated.

View File

@@ -3,13 +3,14 @@ from datetime import timezone
from http import HTTPStatus
from time import sleep
import redis
import sentry_sdk
from celery import Celery
from celery import shared_task
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from redis import Redis
from redis.exceptions import LockError
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.background.celery.apps.app_base import task_logger
@@ -44,7 +45,7 @@ from danswer.db.swap_index import check_index_swap
from danswer.natural_language_processing.search_nlp_models import EmbeddingModel
from danswer.natural_language_processing.search_nlp_models import warm_up_bi_encoder
from danswer.redis.redis_connector import RedisConnector
from danswer.redis.redis_connector_index import RedisConnectorIndexingFenceData
from danswer.redis.redis_connector_index import RedisConnectorIndexPayload
from danswer.redis.redis_pool import get_redis_client
from danswer.utils.logger import setup_logger
from danswer.utils.variable_functionality import global_version
@@ -61,14 +62,18 @@ class RunIndexingCallback(RunIndexingCallbackInterface):
self,
stop_key: str,
generator_progress_key: str,
redis_lock: redis.lock.Lock,
redis_lock: RedisLock,
redis_client: Redis,
):
super().__init__()
self.redis_lock: redis.lock.Lock = redis_lock
self.redis_lock: RedisLock = redis_lock
self.stop_key: str = stop_key
self.generator_progress_key: str = generator_progress_key
self.redis_client = redis_client
self.started: datetime = datetime.now(timezone.utc)
self.redis_lock.reacquire()
self.last_lock_reacquire: datetime = datetime.now(timezone.utc)
def should_stop(self) -> bool:
if self.redis_client.exists(self.stop_key):
@@ -76,7 +81,19 @@ class RunIndexingCallback(RunIndexingCallbackInterface):
return False
def progress(self, amount: int) -> None:
self.redis_lock.reacquire()
try:
self.redis_lock.reacquire()
self.last_lock_reacquire = datetime.now(timezone.utc)
except LockError:
logger.exception(
f"RunIndexingCallback - lock.reacquire exceptioned. "
f"lock_timeout={self.redis_lock.timeout} "
f"start={self.started} "
f"last_reacquired={self.last_lock_reacquire} "
f"now={datetime.now(timezone.utc)}"
)
raise
self.redis_client.incrby(self.generator_progress_key, amount)
@@ -325,7 +342,7 @@ def try_creating_indexing_task(
redis_connector_index.generator_clear()
# set a basic fence to start
payload = RedisConnectorIndexingFenceData(
payload = RedisConnectorIndexPayload(
index_attempt_id=None,
started=None,
submitted=datetime.now(timezone.utc),
@@ -368,7 +385,7 @@ def try_creating_indexing_task(
redis_connector_index.set_fence(payload)
except Exception:
redis_connector_index.set_fence(payload)
redis_connector_index.set_fence(None)
task_logger.exception(
f"Unexpected exception: "
f"tenant={tenant_id} "

View File

@@ -13,6 +13,7 @@ from celery.exceptions import SoftTimeLimitExceeded
from celery.result import AsyncResult
from celery.states import READY_STATES
from redis import Redis
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from tenacity import RetryError
@@ -162,7 +163,7 @@ def try_generate_stale_document_sync_tasks(
celery_app: Celery,
db_session: Session,
r: Redis,
lock_beat: redis.lock.Lock,
lock_beat: RedisLock,
tenant_id: str | None,
) -> int | None:
# the fence is up, do nothing
@@ -180,7 +181,12 @@ def try_generate_stale_document_sync_tasks(
f"Stale documents found (at least {stale_doc_count}). Generating sync tasks by cc pair."
)
task_logger.info("RedisConnector.generate_tasks starting by cc_pair.")
task_logger.info(
"RedisConnector.generate_tasks starting by cc_pair. "
"Documents spanning multiple cc_pairs will only be synced once."
)
docs_to_skip: set[str] = set()
# rkuo: we could technically sync all stale docs in one big pass.
# but I feel it's more understandable to group the docs by cc_pair
@@ -188,22 +194,21 @@ def try_generate_stale_document_sync_tasks(
cc_pairs = get_connector_credential_pairs(db_session)
for cc_pair in cc_pairs:
rc = RedisConnectorCredentialPair(tenant_id, cc_pair.id)
tasks_generated = rc.generate_tasks(
celery_app, db_session, r, lock_beat, tenant_id
)
rc.set_skip_docs(docs_to_skip)
result = rc.generate_tasks(celery_app, db_session, r, lock_beat, tenant_id)
if tasks_generated is None:
if result is None:
continue
if tasks_generated == 0:
if result[1] == 0:
continue
task_logger.info(
f"RedisConnector.generate_tasks finished for single cc_pair. "
f"cc_pair_id={cc_pair.id} tasks_generated={tasks_generated}"
f"cc_pair={cc_pair.id} tasks_generated={result[0]} tasks_possible={result[1]}"
)
total_tasks_generated += tasks_generated
total_tasks_generated += result[0]
task_logger.info(
f"RedisConnector.generate_tasks finished for all cc_pairs. total_tasks_generated={total_tasks_generated}"
@@ -218,7 +223,7 @@ def try_generate_document_set_sync_tasks(
document_set_id: int,
db_session: Session,
r: Redis,
lock_beat: redis.lock.Lock,
lock_beat: RedisLock,
tenant_id: str | None,
) -> int | None:
lock_beat.reacquire()
@@ -246,12 +251,11 @@ def try_generate_document_set_sync_tasks(
)
# Add all documents that need to be updated into the queue
tasks_generated = rds.generate_tasks(
celery_app, db_session, r, lock_beat, tenant_id
)
if tasks_generated is None:
result = rds.generate_tasks(celery_app, db_session, r, lock_beat, tenant_id)
if result is None:
return None
tasks_generated = result[0]
# Currently we are allowing the sync to proceed with 0 tasks.
# It's possible for sets/groups to be generated initially with no entries
# and they still need to be marked as up to date.
@@ -260,7 +264,7 @@ def try_generate_document_set_sync_tasks(
task_logger.info(
f"RedisDocumentSet.generate_tasks finished. "
f"document_set_id={document_set.id} tasks_generated={tasks_generated}"
f"document_set={document_set.id} tasks_generated={tasks_generated}"
)
# set this only after all tasks have been added
@@ -273,7 +277,7 @@ def try_generate_user_group_sync_tasks(
usergroup_id: int,
db_session: Session,
r: Redis,
lock_beat: redis.lock.Lock,
lock_beat: RedisLock,
tenant_id: str | None,
) -> int | None:
lock_beat.reacquire()
@@ -302,12 +306,11 @@ def try_generate_user_group_sync_tasks(
task_logger.info(
f"RedisUserGroup.generate_tasks starting. usergroup_id={usergroup.id}"
)
tasks_generated = rug.generate_tasks(
celery_app, db_session, r, lock_beat, tenant_id
)
if tasks_generated is None:
result = rug.generate_tasks(celery_app, db_session, r, lock_beat, tenant_id)
if result is None:
return None
tasks_generated = result[0]
# Currently we are allowing the sync to proceed with 0 tasks.
# It's possible for sets/groups to be generated initially with no entries
# and they still need to be marked as up to date.
@@ -316,7 +319,7 @@ def try_generate_user_group_sync_tasks(
task_logger.info(
f"RedisUserGroup.generate_tasks finished. "
f"usergroup_id={usergroup.id} tasks_generated={tasks_generated}"
f"usergroup={usergroup.id} tasks_generated={tasks_generated}"
)
# set this only after all tasks have been added
@@ -580,8 +583,8 @@ def monitor_ccpair_indexing_taskset(
progress = redis_connector_index.get_progress()
if progress is not None:
task_logger.info(
f"Connector indexing progress: cc_pair_id={cc_pair_id} "
f"search_settings_id={search_settings_id} "
f"Connector indexing progress: cc_pair={cc_pair_id} "
f"search_settings={search_settings_id} "
f"progress={progress} "
f"elapsed_submitted={elapsed_submitted.total_seconds():.2f}"
)
@@ -602,8 +605,8 @@ def monitor_ccpair_indexing_taskset(
# if it isn't, then the worker crashed
task_logger.info(
f"Connector indexing aborted: "
f"cc_pair_id={cc_pair_id} "
f"search_settings_id={search_settings_id} "
f"cc_pair={cc_pair_id} "
f"search_settings={search_settings_id} "
f"elapsed_submitted={elapsed_submitted.total_seconds():.2f}"
)
@@ -621,8 +624,8 @@ def monitor_ccpair_indexing_taskset(
status_enum = HTTPStatus(status_int)
task_logger.info(
f"Connector indexing finished: cc_pair_id={cc_pair_id} "
f"search_settings_id={search_settings_id} "
f"Connector indexing finished: cc_pair={cc_pair_id} "
f"search_settings={search_settings_id} "
f"status={status_enum.name} "
f"elapsed_submitted={elapsed_submitted.total_seconds():.2f}"
)
@@ -630,6 +633,37 @@ def monitor_ccpair_indexing_taskset(
redis_connector_index.reset()
def get_unfenced_index_attempt_ids(db_session: Session, r: redis.Redis) -> list[int]:
"""Gets a list of unfenced index attempts. Should not be possible, so we'd typically
want to clean them up.
Unfenced = attempt not in terminal state and fence does not exist.
"""
unfenced_attempts: list[int] = []
# do some cleanup before clearing fences
# check the db for any outstanding index attempts
attempts: list[IndexAttempt] = []
attempts.extend(
get_all_index_attempts_by_status(IndexingStatus.NOT_STARTED, db_session)
)
attempts.extend(
get_all_index_attempts_by_status(IndexingStatus.IN_PROGRESS, db_session)
)
for attempt in attempts:
# if attempts exist in the db but we don't detect them in redis, mark them as failed
fence_key = RedisConnectorIndex.fence_key_with_ids(
attempt.connector_credential_pair_id, attempt.search_settings_id
)
if r.exists(fence_key):
continue
unfenced_attempts.append(attempt.id)
return unfenced_attempts
@shared_task(name="monitor_vespa_sync", soft_time_limit=300, bind=True)
def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool:
"""This is a celery beat task that monitors and finalizes metadata sync tasksets.
@@ -643,7 +677,7 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool:
"""
r = get_redis_client(tenant_id=tenant_id)
lock_beat: redis.lock.Lock = r.lock(
lock_beat: RedisLock = r.lock(
DanswerRedisLocks.MONITOR_VESPA_SYNC_BEAT_LOCK,
timeout=CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT,
)
@@ -677,31 +711,24 @@ def monitor_vespa_sync(self: Task, tenant_id: str | None) -> bool:
f"pruning={n_pruning}"
)
# do some cleanup before clearing fences
# check the db for any outstanding index attempts
# Fail any index attempts in the DB that don't have fences
with get_session_with_tenant(tenant_id) as db_session:
attempts: list[IndexAttempt] = []
attempts.extend(
get_all_index_attempts_by_status(IndexingStatus.NOT_STARTED, db_session)
)
attempts.extend(
get_all_index_attempts_by_status(IndexingStatus.IN_PROGRESS, db_session)
)
unfenced_attempt_ids = get_unfenced_index_attempt_ids(db_session, r)
for attempt_id in unfenced_attempt_ids:
attempt = get_index_attempt(db_session, attempt_id)
if not attempt:
continue
for a in attempts:
# if attempts exist in the db but we don't detect them in redis, mark them as failed
fence_key = RedisConnectorIndex.fence_key_with_ids(
a.connector_credential_pair_id, a.search_settings_id
failure_reason = (
f"Unfenced index attempt found in DB: "
f"index_attempt={attempt.id} "
f"cc_pair={attempt.connector_credential_pair_id} "
f"search_settings={attempt.search_settings_id}"
)
task_logger.warning(failure_reason)
mark_attempt_failed(
attempt.id, db_session, failure_reason=failure_reason
)
if not r.exists(fence_key):
failure_reason = (
f"Unknown index attempt. Might be left over from a process restart: "
f"index_attempt={a.id} "
f"cc_pair={a.connector_credential_pair_id} "
f"search_settings={a.search_settings_id}"
)
task_logger.warning(failure_reason)
mark_attempt_failed(a.id, db_session, failure_reason=failure_reason)
lock_beat.reacquire()
if r.exists(RedisConnectorCredentialPair.get_fence_key()):

View File

@@ -433,11 +433,13 @@ def run_indexing_entrypoint(
with get_session_with_tenant(tenant_id) as db_session:
attempt = transition_attempt_to_in_progress(index_attempt_id, db_session)
tenant_str = ""
if tenant_id is not None:
tenant_str = f" for tenant {tenant_id}"
logger.info(
f"Indexing starting for tenant {tenant_id}: "
if tenant_id is not None
else ""
+ f"connector='{attempt.connector_credential_pair.connector.name}' "
f"Indexing starting{tenant_str}: "
f"connector='{attempt.connector_credential_pair.connector.name}' "
f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' "
f"credentials='{attempt.connector_credential_pair.connector_id}'"
)
@@ -445,10 +447,8 @@ def run_indexing_entrypoint(
_run_indexing(db_session, attempt, tenant_id, callback)
logger.info(
f"Indexing finished for tenant {tenant_id}: "
if tenant_id is not None
else ""
+ f"connector='{attempt.connector_credential_pair.connector.name}' "
f"Indexing finished{tenant_str}: "
f"connector='{attempt.connector_credential_pair.connector.name}' "
f"config='{attempt.connector_credential_pair.connector.connector_specific_config}' "
f"credentials='{attempt.connector_credential_pair.connector_id}'"
)
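The logging fix here replaces f-strings glued together with an inline conditional. The original form is a precedence trap: the conditional expression binds more loosely than the + concatenation, so the connector details ended up only in the else branch and were dropped whenever a tenant id was set. A small illustration with hypothetical values:

tenant_id = "tenant_a"

# Broken: parsed as  A if cond else ("" + "connector='web'"), so the connector
# part only lives in the else branch and is dropped whenever tenant_id is set.
broken = (
    f"Indexing starting for tenant {tenant_id}: "
    if tenant_id is not None
    else "" + "connector='web'"
)

# Fixed: build the optional prefix first, then concatenate unconditionally.
tenant_str = f" for tenant {tenant_id}" if tenant_id is not None else ""
fixed = f"Indexing starting{tenant_str}: " + "connector='web'"

print(broken)  # Indexing starting for tenant tenant_a:
print(fixed)   # Indexing starting for tenant tenant_a: connector='web'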

View File

@@ -74,7 +74,7 @@ CELERY_PRIMARY_WORKER_LOCK_TIMEOUT = 120
# needs to be long enough to cover the maximum time it takes to download an object
# if we can get callbacks as object bytes download, we could lower this a lot.
CELERY_INDEXING_LOCK_TIMEOUT = 60 * 60 # 60 min
CELERY_INDEXING_LOCK_TIMEOUT = 3 * 60 * 60 # 3 hours
# needs to be long enough to cover the maximum time it takes to download an object
# if we can get callbacks as object bytes download, we could lower this a lot.

View File

@@ -169,6 +169,7 @@ def get_document_connector_counts(
def get_document_counts_for_cc_pairs(
db_session: Session, cc_pair_identifiers: list[ConnectorCredentialPairIdentifier]
) -> Sequence[tuple[int, int, int]]:
"""Returns a sequence of tuples of (connector_id, credential_id, document count)"""
stmt = (
select(
DocumentByConnectorCredentialPair.connector_id,
@@ -323,23 +324,23 @@ def upsert_documents(
def upsert_document_by_connector_credential_pair(
db_session: Session, document_metadata_batch: list[DocumentMetadata]
db_session: Session, connector_id: int, credential_id: int, document_ids: list[str]
) -> None:
"""NOTE: this function is Postgres specific. Not all DBs support the ON CONFLICT clause."""
if not document_metadata_batch:
logger.info("`document_metadata_batch` is empty. Skipping.")
if not document_ids:
logger.info("`document_ids` is empty. Skipping.")
return
insert_stmt = insert(DocumentByConnectorCredentialPair).values(
[
model_to_dict(
DocumentByConnectorCredentialPair(
id=document_metadata.document_id,
connector_id=document_metadata.connector_id,
credential_id=document_metadata.credential_id,
id=doc_id,
connector_id=connector_id,
credential_id=credential_id,
)
)
for document_metadata in document_metadata_batch
for doc_id in document_ids
]
)
# for now, there are no columns to update. If more metadata is added, then this
@@ -400,17 +401,6 @@ def mark_document_as_synced(document_id: str, db_session: Session) -> None:
db_session.commit()
def upsert_documents_complete(
db_session: Session,
document_metadata_batch: list[DocumentMetadata],
) -> None:
upsert_documents(db_session, document_metadata_batch)
upsert_document_by_connector_credential_pair(db_session, document_metadata_batch)
logger.info(
f"Upserted {len(document_metadata_batch)} document store entries into DB"
)
def delete_document_by_connector_credential_pair__no_commit(
db_session: Session,
document_id: str,
@@ -520,7 +510,7 @@ def prepare_to_modify_documents(
db_session.commit() # ensure that we're not in a transaction
lock_acquired = False
for _ in range(_NUM_LOCK_ATTEMPTS):
for i in range(_NUM_LOCK_ATTEMPTS):
try:
with db_session.begin() as transaction:
lock_acquired = acquire_document_locks(
@@ -531,7 +521,7 @@ def prepare_to_modify_documents(
break
except OperationalError as e:
logger.warning(
f"Failed to acquire locks for documents, retrying. Error: {e}"
f"Failed to acquire locks for documents on attempt {i}, retrying. Error: {e}"
)
time.sleep(retry_delay)
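upsert_document_by_connector_credential_pair relies on Postgres's ON CONFLICT clause, as the note in the hunk says. A self-contained sketch of the same pattern using SQLAlchemy's postgresql insert construct, with a hypothetical, trimmed-down association table:

from sqlalchemy import Column, Integer, String
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class DocumentByConnectorCredentialPair(Base):
    # hypothetical, trimmed-down version of the association table in the diff
    __tablename__ = "document_by_connector_credential_pair"
    id = Column(String, primary_key=True)
    connector_id = Column(Integer, primary_key=True)
    credential_id = Column(Integer, primary_key=True)


def upsert_pairs(
    db_session: Session, connector_id: int, credential_id: int, document_ids: list[str]
) -> None:
    if not document_ids:
        return
    stmt = insert(DocumentByConnectorCredentialPair).values(
        [
            {"id": doc_id, "connector_id": connector_id, "credential_id": credential_id}
            for doc_id in document_ids
        ]
    )
    # Postgres-specific: rows whose primary key already exists are skipped
    # instead of raising IntegrityError
    stmt = stmt.on_conflict_do_nothing(
        index_elements=["id", "connector_id", "credential_id"]
    )
    db_session.execute(stmt)
    db_session.commit()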

View File

@@ -312,7 +312,9 @@ async def get_async_session_with_tenant(
await session.execute(text(f'SET search_path = "{tenant_id}"'))
if POSTGRES_IDLE_SESSIONS_TIMEOUT:
await session.execute(
f"SET SESSION idle_in_transaction_session_timeout = {POSTGRES_IDLE_SESSIONS_TIMEOUT}"
text(
f"SET SESSION idle_in_transaction_session_timeout = {POSTGRES_IDLE_SESSIONS_TIMEOUT}"
)
)
except Exception:
logger.exception("Error setting search_path.")
@@ -373,7 +375,9 @@ def get_session_with_tenant(
cursor.execute(f'SET search_path = "{tenant_id}"')
if POSTGRES_IDLE_SESSIONS_TIMEOUT:
cursor.execute(
f"SET SESSION idle_in_transaction_session_timeout = {POSTGRES_IDLE_SESSIONS_TIMEOUT}"
text(
f"SET SESSION idle_in_transaction_session_timeout = {POSTGRES_IDLE_SESSIONS_TIMEOUT}"
)
)
finally:
cursor.close()
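Both engine.py hunks wrap the raw SET statement in text(), matching the "use text()" commit above; under SQLAlchemy 2.0, Session.execute() rejects bare SQL strings. A short sketch of the session-level form, assuming a placeholder DSN:

from sqlalchemy import create_engine, text
from sqlalchemy.orm import Session

engine = create_engine("postgresql+psycopg2://user:pass@localhost:5432/danswer")  # placeholder

with Session(engine) as session:
    timeout_ms = 60_000
    # session.execute("SET SESSION idle_in_transaction_session_timeout = 60000")
    # fails under SQLAlchemy 2.0; textual SQL has to be declared explicitly:
    session.execute(
        text(f"SET SESSION idle_in_transaction_session_timeout = {timeout_ms}")
    )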

View File

@@ -20,7 +20,8 @@ from danswer.db.document import get_documents_by_ids
from danswer.db.document import prepare_to_modify_documents
from danswer.db.document import update_docs_last_modified__no_commit
from danswer.db.document import update_docs_updated_at__no_commit
from danswer.db.document import upsert_documents_complete
from danswer.db.document import upsert_document_by_connector_credential_pair
from danswer.db.document import upsert_documents
from danswer.db.document_set import fetch_document_sets_for_documents
from danswer.db.index_attempt import create_index_attempt_error
from danswer.db.models import Document as DBDocument
@@ -56,13 +57,13 @@ class IndexingPipelineProtocol(Protocol):
...
def upsert_documents_in_db(
def _upsert_documents_in_db(
documents: list[Document],
index_attempt_metadata: IndexAttemptMetadata,
db_session: Session,
) -> None:
# Metadata here refers to basic document info, not metadata about the actual content
doc_m_batch: list[DocumentMetadata] = []
document_metadata_list: list[DocumentMetadata] = []
for doc in documents:
first_link = next(
(section.link for section in doc.sections if section.link), ""
@@ -77,12 +78,9 @@ def upsert_documents_in_db(
secondary_owners=get_experts_stores_representations(doc.secondary_owners),
from_ingestion_api=doc.from_ingestion_api,
)
doc_m_batch.append(db_doc_metadata)
document_metadata_list.append(db_doc_metadata)
upsert_documents_complete(
db_session=db_session,
document_metadata_batch=doc_m_batch,
)
upsert_documents(db_session, document_metadata_list)
# Insert document content metadata
for doc in documents:
@@ -95,21 +93,25 @@ def upsert_documents_in_db(
document_id=doc.id,
db_session=db_session,
)
else:
create_or_add_document_tag(
tag_key=k,
tag_value=v,
source=doc.source,
document_id=doc.id,
db_session=db_session,
)
continue
create_or_add_document_tag(
tag_key=k,
tag_value=v,
source=doc.source,
document_id=doc.id,
db_session=db_session,
)
def get_doc_ids_to_update(
documents: list[Document], db_docs: list[DBDocument]
) -> list[Document]:
"""Figures out which documents actually need to be updated. If a document is already present
and the `updated_at` hasn't changed, we shouldn't need to do anything with it."""
and the `updated_at` hasn't changed, we shouldn't need to do anything with it.
NB: Still need to associate the document in the DB if multiple connectors are
indexing the same doc."""
id_update_time_map = {
doc.id: doc.doc_updated_at for doc in db_docs if doc.doc_updated_at
}
@@ -195,9 +197,9 @@ def index_doc_batch_prepare(
db_session: Session,
ignore_time_skip: bool = False,
) -> DocumentBatchPrepareContext | None:
"""This sets up the documents in the relational DB (source of truth) for permissions, metadata, etc.
"""Sets up the documents in the relational DB (source of truth) for permissions, metadata, etc.
This precedes indexing it into the actual document index."""
documents = []
documents: list[Document] = []
for document in document_batch:
empty_contents = not any(section.text.strip() for section in document.sections)
if (
@@ -212,43 +214,58 @@ def index_doc_batch_prepare(
logger.warning(
f"Skipping document with ID {document.id} as it has neither title nor content."
)
elif (
document.title is not None and not document.title.strip() and empty_contents
):
continue
if document.title is not None and not document.title.strip() and empty_contents:
# The title is explicitly empty ("" and not None) and the document is empty
# so when building the chunk text representation, it will be empty and unusable
logger.warning(
f"Skipping document with ID {document.id} as the chunks will be empty."
)
else:
documents.append(document)
continue
document_ids = [document.id for document in documents]
documents.append(document)
# Create a trimmed list of docs that don't have a newer updated at
# Shortcuts the time-consuming flow on connector index retries
document_ids: list[str] = [document.id for document in documents]
db_docs: list[DBDocument] = get_documents_by_ids(
db_session=db_session,
document_ids=document_ids,
)
# Skip indexing docs that don't have a newer updated at
# Shortcuts the time-consuming flow on connector index retries
updatable_docs = (
get_doc_ids_to_update(documents=documents, db_docs=db_docs)
if not ignore_time_skip
else documents
)
# No docs to update either because the batch is empty or every doc was already indexed
# for all updatable docs, upsert into the DB
# Does not include doc_updated_at which is also used to indicate a successful update
if updatable_docs:
_upsert_documents_in_db(
documents=updatable_docs,
index_attempt_metadata=index_attempt_metadata,
db_session=db_session,
)
logger.info(
f"Upserted {len(updatable_docs)} changed docs out of "
f"{len(documents)} total docs into the DB"
)
# for all docs, upsert the document to cc pair relationship
upsert_document_by_connector_credential_pair(
db_session,
index_attempt_metadata.connector_id,
index_attempt_metadata.credential_id,
document_ids,
)
# No docs to process because the batch is empty or every doc was already indexed
if not updatable_docs:
return None
# Create records in the source of truth about these documents,
# does not include doc_updated_at which is also used to indicate a successful update
upsert_documents_in_db(
documents=documents,
index_attempt_metadata=index_attempt_metadata,
db_session=db_session,
)
id_to_db_doc_map = {doc.id: doc for doc in db_docs}
return DocumentBatchPrepareContext(
updatable_docs=updatable_docs, id_to_db_doc_map=id_to_db_doc_map
@@ -269,7 +286,10 @@ def index_doc_batch(
) -> tuple[int, int]:
"""Takes different pieces of the indexing pipeline and applies it to a batch of documents
Note that the documents should already be batched at this point so that it does not inflate the
memory requirements"""
memory requirements
Returns a tuple where the first element is the number of new docs and the
second element is the number of chunks."""
no_access = DocumentAccess.build(
user_emails=[],
@@ -312,9 +332,9 @@ def index_doc_batch(
# we're concerned about race conditions where multiple simultaneous indexings might result
# in one set of metadata overwriting another one in vespa.
# we still write data here for immediate and most likely correct sync, but
# we still write data here for the immediate and most likely correct sync, but
# to resolve this, an update of the last modified field at the end of this loop
# always triggers a final metadata sync
# always triggers a final metadata sync via the celery queue
access_aware_chunks = [
DocMetadataAwareIndexChunk.from_index_chunk(
index_chunk=chunk,
@@ -351,7 +371,8 @@ def index_doc_batch(
ids_to_new_updated_at = {}
for doc in successful_docs:
last_modified_ids.append(doc.id)
# doc_updated_at is the connector source's idea of when the doc was last modified
# doc_updated_at is the source's idea (on the other end of the connector)
# of when the doc was last modified
if doc.doc_updated_at is None:
continue
ids_to_new_updated_at[doc.id] = doc.doc_updated_at
@@ -366,10 +387,13 @@ def index_doc_batch(
db_session.commit()
return len([r for r in insertion_records if r.already_existed is False]), len(
access_aware_chunks
result = (
len([r for r in insertion_records if r.already_existed is False]),
len(access_aware_chunks),
)
return result
def build_indexing_pipeline(
*,
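The reworked index_doc_batch_prepare trims the batch down to docs whose source-side updated_at moved forward, while still re-linking every doc to the connector/credential pair so overlapping connectors all claim it. A rough sketch of that filter, using simple dataclass stand-ins rather than the project's models:

from dataclasses import dataclass
from datetime import datetime


@dataclass
class ConnectorDoc:  # stand-in for the pipeline's Document model
    id: str
    doc_updated_at: datetime | None


@dataclass
class DBDoc:  # stand-in for the stored row
    id: str
    doc_updated_at: datetime | None


def docs_needing_update(docs: list[ConnectorDoc], db_docs: list[DBDoc]) -> list[ConnectorDoc]:
    # keep docs that are unknown to the DB or whose source timestamp moved forward;
    # docs filtered out here are still re-linked to the cc pair by the caller
    last_seen = {d.id: d.doc_updated_at for d in db_docs if d.doc_updated_at}
    return [
        doc
        for doc in docs
        if doc.id not in last_seen
        or doc.doc_updated_at is None
        or doc.doc_updated_at > last_seen[doc.id]
    ]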

View File

@@ -1,9 +1,10 @@
import time
from typing import cast
from uuid import uuid4
import redis
from celery import Celery
from redis import Redis
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
@@ -13,6 +14,7 @@ from danswer.db.connector_credential_pair import get_connector_credential_pair_f
from danswer.db.document import (
construct_document_select_for_connector_credential_pair_by_needs_sync,
)
from danswer.db.models import Document
from danswer.redis.redis_object_helper import RedisObjectHelper
@@ -30,6 +32,9 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
def __init__(self, tenant_id: str | None, id: int) -> None:
super().__init__(tenant_id, str(id))
# documents that should be skipped
self.skip_docs: set[str] = set()
@classmethod
def get_fence_key(cls) -> str:
return RedisConnectorCredentialPair.FENCE_PREFIX
@@ -45,14 +50,19 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
# example: connector_taskset
return f"{self.TASKSET_PREFIX}"
def set_skip_docs(self, skip_docs: set[str]) -> None:
# documents that should be skipped. Note that this class updates
# the list on the fly
self.skip_docs = skip_docs
def generate_tasks(
self,
celery_app: Celery,
db_session: Session,
redis_client: Redis,
lock: redis.lock.Lock,
lock: RedisLock,
tenant_id: str | None,
) -> int | None:
) -> tuple[int, int] | None:
last_lock_time = time.monotonic()
async_results = []
@@ -63,7 +73,10 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
stmt = construct_document_select_for_connector_credential_pair_by_needs_sync(
cc_pair.connector_id, cc_pair.credential_id
)
num_docs = 0
for doc in db_session.scalars(stmt).yield_per(1):
doc = cast(Document, doc)
current_time = time.monotonic()
if current_time - last_lock_time >= (
CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
@@ -71,6 +84,12 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
lock.reacquire()
last_lock_time = current_time
num_docs += 1
# check if we should skip the document (typically because it's already syncing)
if doc.id in self.skip_docs:
continue
# celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
# the key for the result is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
# we prefix the task id so it's easier to keep track of who created the task
@@ -93,5 +112,6 @@ class RedisConnectorCredentialPair(RedisObjectHelper):
)
async_results.append(result)
self.skip_docs.add(doc.id)
return len(async_results)
return len(async_results), num_docs
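generate_tasks now returns a (tasks generated, candidate docs) tuple and consults a skip set shared across cc pairs, which is how the overlapping-connector hotfix avoids queuing the same document twice in one sweep. A compact sketch of the dedup idea with hypothetical document ids and a commented-out placeholder for the real task enqueue:

def generate_sync_tasks(
    doc_ids_for_cc_pair: list[str], skip_docs: set[str]
) -> tuple[int, int]:
    # returns (tasks_generated, candidate_docs) for one cc pair; skip_docs is
    # shared across all cc pairs in a single sweep, so the first cc pair that
    # sees a document queues the sync and later cc pairs skip it
    tasks_generated = 0
    num_candidates = 0
    for doc_id in doc_ids_for_cc_pair:
        num_candidates += 1
        if doc_id in skip_docs:
            continue  # already queued by an earlier cc pair this pass
        # enqueue_vespa_sync(doc_id)  # placeholder for the real celery send_task
        tasks_generated += 1
        skip_docs.add(doc_id)
    return tasks_generated, num_candidates


shared_skip: set[str] = set()
print(generate_sync_tasks(["doc1", "doc2"], shared_skip))  # (2, 2)
print(generate_sync_tasks(["doc2", "doc3"], shared_skip))  # (1, 2) - doc2 skipped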

View File

@@ -6,6 +6,7 @@ from uuid import uuid4
import redis
from celery import Celery
from pydantic import BaseModel
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
@@ -83,7 +84,7 @@ class RedisConnectorDelete:
self,
celery_app: Celery,
db_session: Session,
lock: redis.lock.Lock,
lock: RedisLock,
) -> int | None:
"""Returns None if the cc_pair doesn't exist.
Otherwise, returns an int with the number of generated tasks."""

View File

@@ -6,7 +6,7 @@ import redis
from pydantic import BaseModel
class RedisConnectorIndexingFenceData(BaseModel):
class RedisConnectorIndexPayload(BaseModel):
index_attempt_id: int | None
started: datetime | None
submitted: datetime
@@ -71,22 +71,20 @@ class RedisConnectorIndex:
return False
@property
def payload(self) -> RedisConnectorIndexingFenceData | None:
def payload(self) -> RedisConnectorIndexPayload | None:
# read related data and evaluate/print task progress
fence_bytes = cast(bytes, self.redis.get(self.fence_key))
if fence_bytes is None:
return None
fence_str = fence_bytes.decode("utf-8")
payload = RedisConnectorIndexingFenceData.model_validate_json(
cast(str, fence_str)
)
payload = RedisConnectorIndexPayload.model_validate_json(cast(str, fence_str))
return payload
def set_fence(
self,
payload: RedisConnectorIndexingFenceData | None,
payload: RedisConnectorIndexPayload | None,
) -> None:
if not payload:
self.redis.delete(self.fence_key)
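RedisConnectorIndex keeps its fence as a JSON-serialized Pydantic model, now renamed RedisConnectorIndexPayload. A small sketch of the round trip, assuming a local Redis, a hypothetical fence key name, and only the payload fields visible in this hunk:

from datetime import datetime, timezone

from pydantic import BaseModel
from redis import Redis


class RedisConnectorIndexPayload(BaseModel):
    index_attempt_id: int | None
    started: datetime | None
    submitted: datetime


r = Redis(host="localhost", port=6379)
fence_key = "connectorindexing_fence_example"  # hypothetical key name

# set the fence: serialize the payload to JSON and store it under the key
payload = RedisConnectorIndexPayload(
    index_attempt_id=None, started=None, submitted=datetime.now(timezone.utc)
)
r.set(fence_key, payload.model_dump_json())

# read it back: a missing key means "no fence"; otherwise parse the JSON
fence_bytes = r.get(fence_key)
if fence_bytes is not None:
    restored = RedisConnectorIndexPayload.model_validate_json(fence_bytes)
    print(restored.submitted)

# clearing the fence (the set_fence(None) path) just deletes the key
r.delete(fence_key)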

View File

@@ -4,6 +4,7 @@ from uuid import uuid4
import redis
from celery import Celery
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
@@ -105,7 +106,7 @@ class RedisConnectorPrune:
documents_to_prune: set[str],
celery_app: Celery,
db_session: Session,
lock: redis.lock.Lock | None,
lock: RedisLock | None,
) -> int | None:
last_lock_time = time.monotonic()

View File

@@ -5,6 +5,7 @@ from uuid import uuid4
import redis
from celery import Celery
from redis import Redis
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
@@ -50,9 +51,9 @@ class RedisDocumentSet(RedisObjectHelper):
celery_app: Celery,
db_session: Session,
redis_client: Redis,
lock: redis.lock.Lock,
lock: RedisLock,
tenant_id: str | None,
) -> int | None:
) -> tuple[int, int] | None:
last_lock_time = time.monotonic()
async_results = []
@@ -84,7 +85,7 @@ class RedisDocumentSet(RedisObjectHelper):
async_results.append(result)
return len(async_results)
return len(async_results), len(async_results)
def reset(self) -> None:
self.redis.delete(self.taskset_key)

View File

@@ -1,9 +1,9 @@
from abc import ABC
from abc import abstractmethod
import redis
from celery import Celery
from redis import Redis
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.redis.redis_pool import get_redis_client
@@ -85,7 +85,13 @@ class RedisObjectHelper(ABC):
celery_app: Celery,
db_session: Session,
redis_client: Redis,
lock: redis.lock.Lock,
lock: RedisLock,
tenant_id: str | None,
) -> int | None:
pass
) -> tuple[int, int] | None:
"""First element should be the number of actual tasks generated, second should
be the number of docs that were candidates to be synced for the cc pair.
The need for this is when we are syncing stale docs referenced by multiple
connectors. In a single pass across multiple cc pairs, we only want a task
to be created for a particular document id the first time we see it.
The rest can be skipped."""

View File

@@ -5,6 +5,7 @@ from uuid import uuid4
import redis
from celery import Celery
from redis import Redis
from redis.lock import Lock as RedisLock
from sqlalchemy.orm import Session
from danswer.configs.constants import CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT
@@ -51,15 +52,15 @@ class RedisUserGroup(RedisObjectHelper):
celery_app: Celery,
db_session: Session,
redis_client: Redis,
lock: redis.lock.Lock,
lock: RedisLock,
tenant_id: str | None,
) -> int | None:
) -> tuple[int, int] | None:
last_lock_time = time.monotonic()
async_results = []
if not global_version.is_ee_version():
return 0
return 0, 0
try:
construct_document_select_by_usergroup = fetch_versioned_implementation(
@@ -67,7 +68,7 @@ class RedisUserGroup(RedisObjectHelper):
"construct_document_select_by_usergroup",
)
except ModuleNotFoundError:
return 0
return 0, 0
stmt = construct_document_select_by_usergroup(int(self._id))
for doc in db_session.scalars(stmt).yield_per(1):
@@ -97,7 +98,7 @@ class RedisUserGroup(RedisObjectHelper):
async_results.append(result)
return len(async_results)
return len(async_results), len(async_results)
def reset(self) -> None:
self.redis.delete(self.taskset_key)

View File

@@ -29,6 +29,78 @@ from tests.integration.common_utils.test_models import DATestUserGroup
from tests.integration.common_utils.vespa import vespa_fixture
# def test_connector_creation(reset: None) -> None:
# # Creating an admin user (first user created is automatically an admin)
# admin_user: DATestUser = UserManager.create(name="admin_user")
# # create connectors
# cc_pair_1 = CCPairManager.create_from_scratch(
# source=DocumentSource.INGESTION_API,
# user_performing_action=admin_user,
# )
# cc_pair_info = CCPairManager.get_single(
# cc_pair_1.id, user_performing_action=admin_user
# )
# assert cc_pair_info
# assert cc_pair_info.creator
# assert str(cc_pair_info.creator) == admin_user.id
# assert cc_pair_info.creator_email == admin_user.email
# TODO(rkuo): will enable this once i have credentials on github
# def test_overlapping_connector_creation(reset: None) -> None:
# # Creating an admin user (first user created is automatically an admin)
# admin_user: DATestUser = UserManager.create(name="admin_user")
# config = {
# "wiki_base": os.environ["CONFLUENCE_TEST_SPACE_URL"],
# "space": os.environ["CONFLUENCE_TEST_SPACE"],
# "is_cloud": True,
# "page_id": "",
# }
# credential = {
# "confluence_username": os.environ["CONFLUENCE_USER_NAME"],
# "confluence_access_token": os.environ["CONFLUENCE_ACCESS_TOKEN"],
# }
# # store the time before we create the connector so that we know after
# # when the indexing should have started
# now = datetime.now(timezone.utc)
# # create connector
# cc_pair_1 = CCPairManager.create_from_scratch(
# source=DocumentSource.CONFLUENCE,
# connector_specific_config=config,
# credential_json=credential,
# user_performing_action=admin_user,
# )
# CCPairManager.wait_for_indexing(
# cc_pair_1, now, timeout=60, user_performing_action=admin_user
# )
# cc_pair_2 = CCPairManager.create_from_scratch(
# source=DocumentSource.CONFLUENCE,
# connector_specific_config=config,
# credential_json=credential,
# user_performing_action=admin_user,
# )
# CCPairManager.wait_for_indexing(
# cc_pair_2, now, timeout=60, user_performing_action=admin_user
# )
# info_1 = CCPairManager.get_single(cc_pair_1.id)
# assert info_1
# info_2 = CCPairManager.get_single(cc_pair_2.id)
# assert info_2
# assert info_1.num_docs_indexed == info_2.num_docs_indexed
def test_connector_deletion(reset: None, vespa_client: vespa_fixture) -> None:
# Creating an admin user (first user created is automatically an admin)
admin_user: DATestUser = UserManager.create(name="admin_user")