.

Update
2026-03-26 01:52:45 +00:00 · 2026-03-26 10:33:56 +11:00 · 2026-03-26 10:26:58 +11:00 · 2026-03-26 09:59:26 +11:00 · 2026-03-26 09:57:24 +11:00
2 changed files with 25 additions and 16 deletions
--- a/backend/onyx/indexing/indexing_pipeline.py
+++ b/backend/onyx/indexing/indexing_pipeline.py
@@ -1,5 +1,6 @@
 from collections import defaultdict
 from collections.abc import Callable
+from collections.abc import Iterable
 from typing import cast
 from typing import Protocol

@@ -48,6 +49,7 @@ from onyx.indexing.chunker import Chunker
 from onyx.indexing.embedder import embed_chunks_with_failure_handling
 from onyx.indexing.embedder import IndexingEmbedder
 from onyx.indexing.models import DocAwareChunk
+from onyx.indexing.models import DocMetadataAwareIndexChunk
 from onyx.indexing.models import IndexingBatchAdapter
 from onyx.indexing.models import UpdatableChunkData
 from onyx.indexing.vector_db_insertion import write_chunks_to_vector_db_with_backoff
@@ -772,6 +774,10 @@ def index_doc_batch(

        primary_doc_idx_insertion_records: list[DocumentInsertionRecord] | None = None
        primary_doc_idx_vector_db_write_failures: list[ConnectorFailure] | None = None
+
+        def chunk_iterable_creator() -> Iterable[DocMetadataAwareIndexChunk]:
+            return metadata_aware_chunks
+
        for document_index in document_indices:
            # A document will not be spread across different batches, so all the
            # documents with chunks in this set, are fully represented by the chunks
@@ -781,7 +787,7 @@ def index_doc_batch(
                vector_db_write_failures,
            ) = write_chunks_to_vector_db_with_backoff(
                document_index=document_index,
-                chunks=metadata_aware_chunks,
+                make_chunks=chunk_iterable_creator,
                index_batch_params=IndexBatchParams(
                    doc_id_to_previous_chunk_cnt=enricher.doc_id_to_previous_chunk_cnt,
                    doc_id_to_new_chunk_cnt=enricher.doc_id_to_new_chunk_cnt,
@@ -813,7 +819,7 @@ def index_doc_batch(
                    f"Updatable IDs: {updatable_ids}, "
                    f"Returned IDs: {all_returned_doc_ids}. "
                    "This should never happen."
-                    f"This occured for document index {document_index.__class__.__name__}"
+                    f"This occurred for document index {document_index.__class__.__name__}"
                )
            # We treat the first document index we got as the primary one used
            # for reporting the state of indexing.
--- a/backend/onyx/indexing/vector_db_insertion.py
+++ b/backend/onyx/indexing/vector_db_insertion.py
@@ -1,6 +1,9 @@
 import time
-from collections import defaultdict
+from collections.abc import Callable
+from collections.abc import Iterable
 from http import HTTPStatus
+from itertools import chain
+from itertools import groupby

 import httpx

@@ -28,22 +31,22 @@ def _log_insufficient_storage_error(e: Exception) -> None:

 def write_chunks_to_vector_db_with_backoff(
    document_index: DocumentIndex,
-    chunks: list[DocMetadataAwareIndexChunk],
+    make_chunks: Callable[[], Iterable[DocMetadataAwareIndexChunk]],
    index_batch_params: IndexBatchParams,
 ) -> tuple[list[DocumentInsertionRecord], list[ConnectorFailure]]:
    """Tries to insert all chunks in one large batch. If that batch fails for any reason,
    goes document by document to isolate the failure(s).

    IMPORTANT: must pass in whole documents at a time not individual chunks, since the
-    vector DB interface assumes that all chunks for a single document are present.
+    vector DB interface assumes that all chunks for a single document are present. The
+    chunks of each document must also be contiguous within the iterable.
    """
-
    # first try to write the chunks to the vector db
    try:
        return (
            list(
                document_index.index(
-                    chunks=chunks,
+                    chunks=make_chunks(),
                    index_batch_params=index_batch_params,
                )
            ),
@@ -60,14 +63,16 @@ def write_chunks_to_vector_db_with_backoff(
        # wait a couple seconds just to give the vector db a chance to recover
        time.sleep(2)

-    # try writing each doc one by one
-    chunks_for_docs: dict[str, list[DocMetadataAwareIndexChunk]] = defaultdict(list)
-    for chunk in chunks:
-        chunks_for_docs[chunk.source_document.id].append(chunk)
-
    insertion_records: list[DocumentInsertionRecord] = []
    failures: list[ConnectorFailure] = []
-    for doc_id, chunks_for_doc in chunks_for_docs.items():
+
+    def key(chunk: DocMetadataAwareIndexChunk) -> str:
+        return chunk.source_document.id
+
+    for doc_id, chunks_for_doc in groupby(make_chunks(), key=key):
+        first_chunk = next(chunks_for_doc)
+        chunks_for_doc = chain([first_chunk], chunks_for_doc)
+
        try:
            insertion_records.extend(
                document_index.index(
@@ -87,9 +92,7 @@ def write_chunks_to_vector_db_with_backoff(
                ConnectorFailure(
                    failed_document=DocumentFailure(
                        document_id=doc_id,
-                        document_link=(
-                            chunks_for_doc[0].get_link() if chunks_for_doc else None
-                        ),
+                        document_link=first_chunk.get_link(),
                    ),
                    failure_message=str(e),
                    exception=e,
Author	SHA1	Message	Date
Dane Urban	6985661dcd	.	2026-03-26 10:33:56 +11:00
Dane Urban	3e2a10ce9d	.	2026-03-26 10:26:58 +11:00
Dane Urban	389eb6c281	Update	2026-03-26 09:59:26 +11:00
Dane Urban	ff88d1886b	Update	2026-03-26 09:57:24 +11:00