.

Update
2026-03-27 02:22:41 +00:00 · 2026-03-26 10:33:56 +11:00 · 2026-03-26 10:26:58 +11:00 · 2026-03-26 09:59:26 +11:00 · 2026-03-26 09:57:24 +11:00 · 2026-03-24 21:40:43 +11:00
12 changed files with 656 additions and 194 deletions
--- a/backend/onyx/configs/app_configs.py
+++ b/backend/onyx/configs/app_configs.py
@@ -787,6 +787,10 @@ MINI_CHUNK_SIZE = 150
 # This is the number of regular chunks per large chunk
 LARGE_CHUNK_RATIO = 4

+# The maximum number of chunks that may be held at once for a single document processing batch.
+# Its purpose is to put an upper bound on memory usage.
+MAX_CHUNKS_PER_DOC_BATCH = int(os.environ.get("MAX_CHUNKS_PER_DOC_BATCH") or 1000)
+
 # Include the document level metadata in each chunk. If the metadata is too long, then it is thrown out
 # We don't want the metadata to overwhelm the actual contents of the chunk
 SKIP_METADATA_IN_CHUNK = os.environ.get("SKIP_METADATA_IN_CHUNK", "").lower() == "true"
--- a/backend/onyx/document_index/opensearch/opensearch_document_index.py
+++ b/backend/onyx/document_index/opensearch/opensearch_document_index.py
@@ -6,6 +6,7 @@ import httpx
 from opensearchpy import NotFoundError

 from onyx.access.models import DocumentAccess
+from onyx.configs.app_configs import MAX_CHUNKS_PER_DOC_BATCH
 from onyx.configs.app_configs import VERIFY_CREATE_OPENSEARCH_INDEX_ON_INIT_MT
 from onyx.configs.chat_configs import NUM_RETURNED_HITS
 from onyx.configs.chat_configs import TITLE_CONTENT_RATIO
@@ -737,6 +738,9 @@ class OpenSearchDocumentIndex(DocumentIndex):
                    _flush_chunks(current_chunks)
                current_doc_id = doc_id
                current_chunks = [chunk]
+            elif len(current_chunks) >= MAX_CHUNKS_PER_DOC_BATCH:
+                _flush_chunks(current_chunks)
+                current_chunks = [chunk]
            else:
                current_chunks.append(chunk)

--- a/backend/onyx/document_index/vespa/vespa_document_index.py
+++ b/backend/onyx/document_index/vespa/vespa_document_index.py
@@ -10,6 +10,7 @@ import httpx
 from pydantic import BaseModel
 from retry import retry

+from onyx.configs.app_configs import MAX_CHUNKS_PER_DOC_BATCH
 from onyx.configs.app_configs import RECENCY_BIAS_MULTIPLIER
 from onyx.configs.app_configs import RERANK_COUNT
 from onyx.configs.chat_configs import DOC_TIME_DECAY
@@ -427,7 +428,9 @@ class VespaDocumentIndex(DocumentIndex):
                new_document_id_to_original_document_id,
                all_cleaned_doc_ids,
            )
-            for chunk_batch in batch_generator(cleaned_chunks, BATCH_SIZE):
+            for chunk_batch in batch_generator(
+                cleaned_chunks, min(BATCH_SIZE, MAX_CHUNKS_PER_DOC_BATCH)
+            ):
                batch_index_vespa_chunks(
                    chunks=chunk_batch,
                    index_name=self._index_name,
--- a/backend/onyx/indexing/adapters/document_indexing_adapter.py
+++ b/backend/onyx/indexing/adapters/document_indexing_adapter.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import contextlib
 from collections.abc import Generator

@@ -19,7 +21,8 @@ from onyx.db.document import update_docs_updated_at__no_commit
 from onyx.db.document_set import fetch_document_sets_for_documents
 from onyx.indexing.indexing_pipeline import DocumentBatchPrepareContext
 from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
-from onyx.indexing.models import BuildMetadataAwareChunksResult
+from onyx.indexing.models import ChunkEnrichmentContext
+from onyx.indexing.models import DocAwareChunk
 from onyx.indexing.models import DocMetadataAwareIndexChunk
 from onyx.indexing.models import IndexChunk
 from onyx.indexing.models import UpdatableChunkData
@@ -85,14 +88,21 @@ class DocumentIndexingBatchAdapter:
        ) as transaction:
            yield transaction

-    def build_metadata_aware_chunks(
+    def prepare_enrichment(
        self,
-        chunks_with_embeddings: list[IndexChunk],
-        chunk_content_scores: list[float],
-        tenant_id: str,
        context: DocumentBatchPrepareContext,
-    ) -> BuildMetadataAwareChunksResult:
-        """Enrich chunks with access, document sets, boosts, token counts, and hierarchy."""
+        tenant_id: str,
+        chunks: list[DocAwareChunk],
+    ) -> DocumentChunkEnricher:
+        """Do all DB lookups once and return a per-chunk enricher."""
+        updatable_ids = [doc.id for doc in context.updatable_docs]
+
+        doc_id_to_new_chunk_cnt: dict[str, int] = {
+            doc_id: 0 for doc_id in updatable_ids
+        }
+        for chunk in chunks:
+            if chunk.source_document.id in doc_id_to_new_chunk_cnt:
+                doc_id_to_new_chunk_cnt[chunk.source_document.id] += 1

        no_access = DocumentAccess.build(
            user_emails=[],
@@ -102,67 +112,30 @@ class DocumentIndexingBatchAdapter:
            is_public=False,
        )

-        updatable_ids = [doc.id for doc in context.updatable_docs]
-
-        doc_id_to_access_info = get_access_for_documents(
-            document_ids=updatable_ids, db_session=self.db_session
-        )
-        doc_id_to_document_set = {
-            document_id: document_sets
-            for document_id, document_sets in fetch_document_sets_for_documents(
+        return DocumentChunkEnricher(
+            doc_id_to_access_info=get_access_for_documents(
                document_ids=updatable_ids, db_session=self.db_session
-            )
-        }
-
-        doc_id_to_previous_chunk_cnt: dict[str, int] = {
-            document_id: chunk_count
-            for document_id, chunk_count in fetch_chunk_counts_for_documents(
-                document_ids=updatable_ids,
-                db_session=self.db_session,
-            )
-        }
-
-        doc_id_to_new_chunk_cnt: dict[str, int] = {
-            doc_id: 0 for doc_id in updatable_ids
-        }
-        for chunk in chunks_with_embeddings:
-            if chunk.source_document.id in doc_id_to_new_chunk_cnt:
-                doc_id_to_new_chunk_cnt[chunk.source_document.id] += 1
-
-        # Get ancestor hierarchy node IDs for each document
-        doc_id_to_ancestor_ids = self._get_ancestor_ids_for_documents(
-            context.updatable_docs, tenant_id
-        )
-
-        access_aware_chunks = [
-            DocMetadataAwareIndexChunk.from_index_chunk(
-                index_chunk=chunk,
-                access=doc_id_to_access_info.get(chunk.source_document.id, no_access),
-                document_sets=set(
-                    doc_id_to_document_set.get(chunk.source_document.id, [])
-                ),
-                user_project=[],
-                personas=[],
-                boost=(
-                    context.id_to_boost_map[chunk.source_document.id]
-                    if chunk.source_document.id in context.id_to_boost_map
-                    else DEFAULT_BOOST
-                ),
-                tenant_id=tenant_id,
-                aggregated_chunk_boost_factor=chunk_content_scores[chunk_num],
-                ancestor_hierarchy_node_ids=doc_id_to_ancestor_ids[
-                    chunk.source_document.id
-                ],
-            )
-            for chunk_num, chunk in enumerate(chunks_with_embeddings)
-        ]
-
-        return BuildMetadataAwareChunksResult(
-            chunks=access_aware_chunks,
-            doc_id_to_previous_chunk_cnt=doc_id_to_previous_chunk_cnt,
-            doc_id_to_new_chunk_cnt=doc_id_to_new_chunk_cnt,
-            user_file_id_to_raw_text={},
-            user_file_id_to_token_count={},
+            ),
+            doc_id_to_document_set={
+                document_id: document_sets
+                for document_id, document_sets in fetch_document_sets_for_documents(
+                    document_ids=updatable_ids, db_session=self.db_session
+                )
+            },
+            doc_id_to_ancestor_ids=self._get_ancestor_ids_for_documents(
+                context.updatable_docs, tenant_id
+            ),
+            id_to_boost_map=context.id_to_boost_map,
+            doc_id_to_previous_chunk_cnt={
+                document_id: chunk_count
+                for document_id, chunk_count in fetch_chunk_counts_for_documents(
+                    document_ids=updatable_ids,
+                    db_session=self.db_session,
+                )
+            },
+            doc_id_to_new_chunk_cnt=dict(doc_id_to_new_chunk_cnt),
+            no_access=no_access,
+            tenant_id=tenant_id,
        )

    def _get_ancestor_ids_for_documents(
@@ -203,7 +176,7 @@ class DocumentIndexingBatchAdapter:
        context: DocumentBatchPrepareContext,
        updatable_chunk_data: list[UpdatableChunkData],
        filtered_documents: list[Document],
-        result: BuildMetadataAwareChunksResult,
+        enrichment: ChunkEnrichmentContext,
    ) -> None:
        """Finalize DB updates, store plaintext, and mark docs as indexed."""
        updatable_ids = [doc.id for doc in context.updatable_docs]
@@ -227,7 +200,7 @@ class DocumentIndexingBatchAdapter:

        update_docs_chunk_count__no_commit(
            document_ids=updatable_ids,
-            doc_id_to_chunk_count=result.doc_id_to_new_chunk_cnt,
+            doc_id_to_chunk_count=enrichment.doc_id_to_new_chunk_cnt,
            db_session=self.db_session,
        )

@@ -249,3 +222,52 @@ class DocumentIndexingBatchAdapter:
        )

        self.db_session.commit()
+
+
+class DocumentChunkEnricher:
+    """Pre-computed metadata for per-chunk enrichment of connector documents.
+
+    Instances are built by ``DocumentIndexingBatchAdapter.prepare_enrichment``,
+    which gathers every lookup up front; ``enrich_chunk`` then only reads from
+    the stored dicts. All lookup dicts are keyed by document ID.
+
+    Public attributes:
+        doc_id_to_previous_chunk_cnt: per-document chunk counts as supplied at
+            construction (``prepare_enrichment`` fills this from
+            ``fetch_chunk_counts_for_documents``).
+        doc_id_to_new_chunk_cnt: per-document chunk counts for the batch being
+            indexed, as supplied at construction.
+    """
+
+    def __init__(
+        self,
+        doc_id_to_access_info: dict[str, DocumentAccess],
+        doc_id_to_document_set: dict[str, list[str]],
+        doc_id_to_ancestor_ids: dict[str, list[int]],
+        id_to_boost_map: dict[str, int],
+        doc_id_to_previous_chunk_cnt: dict[str, int],
+        doc_id_to_new_chunk_cnt: dict[str, int],
+        no_access: DocumentAccess,
+        tenant_id: str,
+    ) -> None:
+        """Store the pre-computed lookups; no work is done here.
+
+        Args:
+            doc_id_to_access_info: Access info per document ID.
+            doc_id_to_document_set: Document set names per document ID.
+            doc_id_to_ancestor_ids: Ancestor hierarchy node IDs per document ID.
+            id_to_boost_map: Boost value per document ID.
+            doc_id_to_previous_chunk_cnt: Previous chunk count per document ID.
+            doc_id_to_new_chunk_cnt: New chunk count per document ID.
+            no_access: Fallback access for documents missing from
+                ``doc_id_to_access_info``.
+            tenant_id: Tenant ID stamped onto every enriched chunk.
+        """
+        self._doc_id_to_access_info = doc_id_to_access_info
+        self._doc_id_to_document_set = doc_id_to_document_set
+        self._doc_id_to_ancestor_ids = doc_id_to_ancestor_ids
+        self._id_to_boost_map = id_to_boost_map
+        self._no_access = no_access
+        self._tenant_id = tenant_id
+        # No leading underscore: these two are read from outside the class
+        # (by the indexing pipeline and by the adapter's post_index).
+        self.doc_id_to_previous_chunk_cnt = doc_id_to_previous_chunk_cnt
+        self.doc_id_to_new_chunk_cnt = doc_id_to_new_chunk_cnt
+
+    def enrich_chunk(
+        self, chunk: IndexChunk, score: float
+    ) -> DocMetadataAwareIndexChunk:
+        """Attach document-level metadata to a single chunk.
+
+        Args:
+            chunk: The chunk to enrich; every lookup uses
+                ``chunk.source_document.id``.
+            score: Passed through as ``aggregated_chunk_boost_factor``.
+
+        Returns:
+            The result of ``DocMetadataAwareIndexChunk.from_index_chunk``.
+
+        Raises:
+            KeyError: If the chunk's document ID is missing from the ancestor
+                ID mapping; unlike the other lookups, that one has no fallback.
+        """
+        return DocMetadataAwareIndexChunk.from_index_chunk(
+            index_chunk=chunk,
+            # Documents without an access entry fall back to the no-access default.
+            access=self._doc_id_to_access_info.get(
+                chunk.source_document.id, self._no_access
+            ),
+            # Documents without a document-set entry get an empty set.
+            document_sets=set(
+                self._doc_id_to_document_set.get(chunk.source_document.id, [])
+            ),
+            # This enricher always passes empty project and persona lists.
+            user_project=[],
+            personas=[],
+            # Use the per-document boost when one is known, else DEFAULT_BOOST.
+            boost=(
+                self._id_to_boost_map[chunk.source_document.id]
+                if chunk.source_document.id in self._id_to_boost_map
+                else DEFAULT_BOOST
+            ),
+            tenant_id=self._tenant_id,
+            aggregated_chunk_boost_factor=score,
+            # Direct indexing: a missing document ID raises KeyError here.
+            ancestor_hierarchy_node_ids=self._doc_id_to_ancestor_ids[
+                chunk.source_document.id
+            ],
+        )
--- a/backend/onyx/indexing/adapters/user_file_indexing_adapter.py
+++ b/backend/onyx/indexing/adapters/user_file_indexing_adapter.py
@@ -1,6 +1,9 @@
+from __future__ import annotations
+
 import contextlib
 import datetime
 import time
+from collections import defaultdict
 from collections.abc import Generator
 from uuid import UUID

@@ -24,7 +27,8 @@ from onyx.db.user_file import fetch_persona_ids_for_user_files
 from onyx.db.user_file import fetch_user_project_ids_for_user_files
 from onyx.file_store.utils import store_user_file_plaintext
 from onyx.indexing.indexing_pipeline import DocumentBatchPrepareContext
-from onyx.indexing.models import BuildMetadataAwareChunksResult
+from onyx.indexing.models import ChunkEnrichmentContext
+from onyx.indexing.models import DocAwareChunk
 from onyx.indexing.models import DocMetadataAwareIndexChunk
 from onyx.indexing.models import IndexChunk
 from onyx.indexing.models import UpdatableChunkData
@@ -101,13 +105,20 @@ class UserFileIndexingAdapter:
                f"Failed to acquire locks after {_NUM_LOCK_ATTEMPTS} attempts for user files: {[doc.id for doc in documents]}"
            )

-    def build_metadata_aware_chunks(
+    def prepare_enrichment(
        self,
-        chunks_with_embeddings: list[IndexChunk],
-        chunk_content_scores: list[float],
-        tenant_id: str,
        context: DocumentBatchPrepareContext,
-    ) -> BuildMetadataAwareChunksResult:
+        tenant_id: str,
+        chunks: list[DocAwareChunk],
+    ) -> UserFileChunkEnricher:
+        """Do all DB lookups and pre-compute file metadata from chunks."""
+        updatable_ids = [doc.id for doc in context.updatable_docs]
+
+        doc_id_to_new_chunk_cnt: dict[str, int] = defaultdict(int)
+        content_by_file: dict[str, list[str]] = defaultdict(list)
+        for chunk in chunks:
+            doc_id_to_new_chunk_cnt[chunk.source_document.id] += 1
+            content_by_file[chunk.source_document.id].append(chunk.content)

        no_access = DocumentAccess.build(
            user_emails=[],
@@ -117,7 +128,6 @@ class UserFileIndexingAdapter:
            is_public=False,
        )

-        updatable_ids = [doc.id for doc in context.updatable_docs]
        user_file_id_to_project_ids = fetch_user_project_ids_for_user_files(
            user_file_ids=updatable_ids,
            db_session=self.db_session,
@@ -138,17 +148,6 @@ class UserFileIndexingAdapter:
            )
        }

-        user_file_id_to_new_chunk_cnt: dict[str, int] = {
-            user_file_id: len(
-                [
-                    chunk
-                    for chunk in chunks_with_embeddings
-                    if chunk.source_document.id == user_file_id
-                ]
-            )
-            for user_file_id in updatable_ids
-        }
-
        # Initialize tokenizer used for token count calculation
        try:
            llm = get_default_llm()
@@ -163,15 +162,9 @@ class UserFileIndexingAdapter:
        user_file_id_to_raw_text: dict[str, str] = {}
        user_file_id_to_token_count: dict[str, int | None] = {}
        for user_file_id in updatable_ids:
-            user_file_chunks = [
-                chunk
-                for chunk in chunks_with_embeddings
-                if chunk.source_document.id == user_file_id
-            ]
-            if user_file_chunks:
-                combined_content = " ".join(
-                    [chunk.content for chunk in user_file_chunks]
-                )
+            contents = content_by_file.get(user_file_id)
+            if contents:
+                combined_content = " ".join(contents)
                user_file_id_to_raw_text[str(user_file_id)] = combined_content
                token_count = (
                    len(llm_tokenizer.encode(combined_content)) if llm_tokenizer else 0
@@ -181,28 +174,16 @@ class UserFileIndexingAdapter:
                user_file_id_to_raw_text[str(user_file_id)] = ""
                user_file_id_to_token_count[str(user_file_id)] = None

-        access_aware_chunks = [
-            DocMetadataAwareIndexChunk.from_index_chunk(
-                index_chunk=chunk,
-                access=user_file_id_to_access.get(chunk.source_document.id, no_access),
-                document_sets=set(),
-                user_project=user_file_id_to_project_ids.get(
-                    chunk.source_document.id, []
-                ),
-                personas=user_file_id_to_persona_ids.get(chunk.source_document.id, []),
-                boost=DEFAULT_BOOST,
-                tenant_id=tenant_id,
-                aggregated_chunk_boost_factor=chunk_content_scores[chunk_num],
-            )
-            for chunk_num, chunk in enumerate(chunks_with_embeddings)
-        ]
-
-        return BuildMetadataAwareChunksResult(
-            chunks=access_aware_chunks,
+        return UserFileChunkEnricher(
+            user_file_id_to_access=user_file_id_to_access,
+            user_file_id_to_project_ids=user_file_id_to_project_ids,
+            user_file_id_to_persona_ids=user_file_id_to_persona_ids,
            doc_id_to_previous_chunk_cnt=user_file_id_to_previous_chunk_cnt,
-            doc_id_to_new_chunk_cnt=user_file_id_to_new_chunk_cnt,
+            doc_id_to_new_chunk_cnt=dict(doc_id_to_new_chunk_cnt),
            user_file_id_to_raw_text=user_file_id_to_raw_text,
            user_file_id_to_token_count=user_file_id_to_token_count,
+            no_access=no_access,
+            tenant_id=tenant_id,
        )

    def _notify_assistant_owners_if_files_ready(
@@ -246,8 +227,9 @@ class UserFileIndexingAdapter:
        context: DocumentBatchPrepareContext,
        updatable_chunk_data: list[UpdatableChunkData],  # noqa: ARG002
        filtered_documents: list[Document],  # noqa: ARG002
-        result: BuildMetadataAwareChunksResult,
+        enrichment: ChunkEnrichmentContext,
    ) -> None:
+        assert isinstance(enrichment, UserFileChunkEnricher)
        user_file_ids = [doc.id for doc in context.updatable_docs]

        user_files = (
@@ -263,8 +245,10 @@ class UserFileIndexingAdapter:
            user_file.last_project_sync_at = datetime.datetime.now(
                datetime.timezone.utc
            )
-            user_file.chunk_count = result.doc_id_to_new_chunk_cnt[str(user_file.id)]
-            user_file.token_count = result.user_file_id_to_token_count[
+            user_file.chunk_count = enrichment.doc_id_to_new_chunk_cnt.get(
+                str(user_file.id), 0
+            )
+            user_file.token_count = enrichment.user_file_id_to_token_count[
                str(user_file.id)
            ]

@@ -276,8 +260,54 @@ class UserFileIndexingAdapter:
        # Store the plaintext in the file store for faster retrieval
        # NOTE: this creates its own session to avoid committing the overall
        # transaction.
-        for user_file_id, raw_text in result.user_file_id_to_raw_text.items():
+        for user_file_id, raw_text in enrichment.user_file_id_to_raw_text.items():
            store_user_file_plaintext(
                user_file_id=UUID(user_file_id),
                plaintext_content=raw_text,
            )
+
+
+class UserFileChunkEnricher:
+    """Pre-computed metadata for per-chunk enrichment of user-uploaded files.
+
+    Instances are built by ``UserFileIndexingAdapter.prepare_enrichment``;
+    ``enrich_chunk`` then only reads from the stored dicts, which are keyed by
+    user file ID (the chunk's ``source_document.id``).
+
+    Public attributes:
+        doc_id_to_previous_chunk_cnt: previous chunk count per user file, as
+            supplied at construction.
+        doc_id_to_new_chunk_cnt: chunk count per user file for the batch being
+            indexed. ``prepare_enrichment`` derives it from the chunks it is
+            given, so a file with no chunks has no entry.
+        user_file_id_to_raw_text: raw text per user file, as supplied at
+            construction.
+        user_file_id_to_token_count: token count per user file; ``None`` is a
+            permitted value.
+    """
+
+    def __init__(
+        self,
+        user_file_id_to_access: dict[str, DocumentAccess],
+        user_file_id_to_project_ids: dict[str, list[int]],
+        user_file_id_to_persona_ids: dict[str, list[int]],
+        doc_id_to_previous_chunk_cnt: dict[str, int],
+        doc_id_to_new_chunk_cnt: dict[str, int],
+        user_file_id_to_raw_text: dict[str, str],
+        user_file_id_to_token_count: dict[str, int | None],
+        no_access: DocumentAccess,
+        tenant_id: str,
+    ) -> None:
+        """Store the pre-computed lookups; no work is done here.
+
+        Args:
+            user_file_id_to_access: Access info per user file ID.
+            user_file_id_to_project_ids: Project IDs per user file ID.
+            user_file_id_to_persona_ids: Persona IDs per user file ID.
+            doc_id_to_previous_chunk_cnt: Previous chunk count per user file ID.
+            doc_id_to_new_chunk_cnt: New chunk count per user file ID.
+            user_file_id_to_raw_text: Raw text per user file ID.
+            user_file_id_to_token_count: Token count (or ``None``) per user
+                file ID.
+            no_access: Fallback access for files missing from
+                ``user_file_id_to_access``.
+            tenant_id: Tenant ID stamped onto every enriched chunk.
+        """
+        self._user_file_id_to_access = user_file_id_to_access
+        self._user_file_id_to_project_ids = user_file_id_to_project_ids
+        self._user_file_id_to_persona_ids = user_file_id_to_persona_ids
+        self._no_access = no_access
+        self._tenant_id = tenant_id
+        # No leading underscore: these are read from outside the class
+        # (by the indexing pipeline and by the adapter's post_index).
+        self.doc_id_to_previous_chunk_cnt = doc_id_to_previous_chunk_cnt
+        self.doc_id_to_new_chunk_cnt = doc_id_to_new_chunk_cnt
+        self.user_file_id_to_raw_text = user_file_id_to_raw_text
+        self.user_file_id_to_token_count = user_file_id_to_token_count
+
+    def enrich_chunk(
+        self, chunk: IndexChunk, score: float
+    ) -> DocMetadataAwareIndexChunk:
+        """Attach user-file metadata to a single chunk.
+
+        Args:
+            chunk: The chunk to enrich; every lookup uses
+                ``chunk.source_document.id``.
+            score: Passed through as ``aggregated_chunk_boost_factor``.
+
+        Returns:
+            The result of ``DocMetadataAwareIndexChunk.from_index_chunk``.
+            Every lookup has a fallback, so an unknown file ID does not raise
+            here.
+        """
+        return DocMetadataAwareIndexChunk.from_index_chunk(
+            index_chunk=chunk,
+            # Files without an access entry fall back to the no-access default.
+            access=self._user_file_id_to_access.get(
+                chunk.source_document.id, self._no_access
+            ),
+            # This enricher always passes an empty document-set collection.
+            document_sets=set(),
+            # Files with no project / persona entry get empty lists.
+            user_project=self._user_file_id_to_project_ids.get(
+                chunk.source_document.id, []
+            ),
+            personas=self._user_file_id_to_persona_ids.get(
+                chunk.source_document.id, []
+            ),
+            # No per-file boost lookup: every chunk gets DEFAULT_BOOST.
+            boost=DEFAULT_BOOST,
+            tenant_id=self._tenant_id,
+            aggregated_chunk_boost_factor=score,
+        )
--- a/backend/onyx/indexing/indexing_pipeline.py
+++ b/backend/onyx/indexing/indexing_pipeline.py
@@ -1,5 +1,7 @@
 from collections import defaultdict
 from collections.abc import Callable
+from collections.abc import Iterable
+from typing import cast
 from typing import Protocol

 from pydantic import BaseModel
@@ -47,6 +49,7 @@ from onyx.indexing.chunker import Chunker
 from onyx.indexing.embedder import embed_chunks_with_failure_handling
 from onyx.indexing.embedder import IndexingEmbedder
 from onyx.indexing.models import DocAwareChunk
+from onyx.indexing.models import DocMetadataAwareIndexChunk
 from onyx.indexing.models import IndexingBatchAdapter
 from onyx.indexing.models import UpdatableChunkData
 from onyx.indexing.vector_db_insertion import write_chunks_to_vector_db_with_backoff
@@ -91,6 +94,15 @@ class IndexingPipelineResult(BaseModel):

    failures: list[ConnectorFailure]

+    @classmethod
+    def empty(cls, total_docs: int) -> "IndexingPipelineResult":
+        """Build a result reporting that nothing was indexed.
+
+        Args:
+            total_docs: Value reported as ``total_docs``.
+
+        Returns:
+            A result with ``new_docs`` and ``total_chunks`` set to 0 and an
+            empty ``failures`` list.
+        """
+        return cls(
+            new_docs=0,
+            total_docs=total_docs,
+            total_chunks=0,
+            failures=[],
+        )
+

 class IndexingPipelineProtocol(Protocol):
    def __call__(
@@ -672,12 +684,7 @@ def index_doc_batch(
    filtered_documents = filter_fnc(document_batch)
    context = adapter.prepare(filtered_documents, ignore_time_skip)
    if not context:
-        return IndexingPipelineResult(
-            new_docs=0,
-            total_docs=len(filtered_documents),
-            total_chunks=0,
-            failures=[],
-        )
+        return IndexingPipelineResult.empty(len(filtered_documents))

    # Convert documents to IndexingDocument objects with processed section
    # logger.debug("Processing image sections")
@@ -748,19 +755,29 @@ def index_doc_batch(
        # we still write data here for the immediate and most likely correct sync, but
        # to resolve this, an update of the last modified field at the end of this loop
        # always triggers a final metadata sync via the celery queue
-        result = adapter.build_metadata_aware_chunks(
-            chunks_with_embeddings=chunks_with_embeddings,
-            chunk_content_scores=chunk_content_scores,
-            tenant_id=tenant_id,
+        enricher = adapter.prepare_enrichment(
            context=context,
+            tenant_id=tenant_id,
+            chunks=cast(list[DocAwareChunk], chunks_with_embeddings),
        )

-        short_descriptor_list = [chunk.to_short_descriptor() for chunk in result.chunks]
+        metadata_aware_chunks = [
+            enricher.enrich_chunk(chunk, score)
+            for chunk, score in zip(chunks_with_embeddings, chunk_content_scores)
+        ]
+
+        short_descriptor_list = [
+            chunk.to_short_descriptor() for chunk in metadata_aware_chunks
+        ]
        short_descriptor_log = str(short_descriptor_list)[:1024]
        logger.debug(f"Indexing the following chunks: {short_descriptor_log}")

        primary_doc_idx_insertion_records: list[DocumentInsertionRecord] | None = None
        primary_doc_idx_vector_db_write_failures: list[ConnectorFailure] | None = None
+
+        def chunk_iterable_creator() -> Iterable[DocMetadataAwareIndexChunk]:
+            return metadata_aware_chunks
+
        for document_index in document_indices:
            # A document will not be spread across different batches, so all the
            # documents with chunks in this set, are fully represented by the chunks
@@ -770,10 +787,10 @@ def index_doc_batch(
                vector_db_write_failures,
            ) = write_chunks_to_vector_db_with_backoff(
                document_index=document_index,
-                chunks=result.chunks,
+                make_chunks=chunk_iterable_creator,
                index_batch_params=IndexBatchParams(
-                    doc_id_to_previous_chunk_cnt=result.doc_id_to_previous_chunk_cnt,
-                    doc_id_to_new_chunk_cnt=result.doc_id_to_new_chunk_cnt,
+                    doc_id_to_previous_chunk_cnt=enricher.doc_id_to_previous_chunk_cnt,
+                    doc_id_to_new_chunk_cnt=enricher.doc_id_to_new_chunk_cnt,
                    tenant_id=tenant_id,
                    large_chunks_enabled=chunker.enable_large_chunks,
                ),
@@ -802,7 +819,7 @@ def index_doc_batch(
                    f"Updatable IDs: {updatable_ids}, "
                    f"Returned IDs: {all_returned_doc_ids}. "
                    "This should never happen."
-                    f"This occured for document index {document_index.__class__.__name__}"
+                    f"This occurred for document index {document_index.__class__.__name__}"
                )
            # We treat the first document index we got as the primary one used
            # for reporting the state of indexing.
@@ -815,7 +832,7 @@ def index_doc_batch(
            context=context,
            updatable_chunk_data=updatable_chunk_data,
            filtered_documents=filtered_documents,
-            result=result,
+            enrichment=enricher,
        )

    assert primary_doc_idx_insertion_records is not None
--- a/backend/onyx/indexing/models.py
+++ b/backend/onyx/indexing/models.py
@@ -235,12 +235,16 @@ class UpdatableChunkData(BaseModel):
    boost_score: float


-class BuildMetadataAwareChunksResult(BaseModel):
-    chunks: list[DocMetadataAwareIndexChunk]
+class ChunkEnrichmentContext(Protocol):
+    """Returned by prepare_enrichment. Holds pre-computed metadata lookups
+    and provides per-chunk enrichment."""
+
    doc_id_to_previous_chunk_cnt: dict[str, int]
    doc_id_to_new_chunk_cnt: dict[str, int]
-    user_file_id_to_raw_text: dict[str, str]
-    user_file_id_to_token_count: dict[str, int | None]
+
+    def enrich_chunk(
+        self, chunk: IndexChunk, score: float
+    ) -> DocMetadataAwareIndexChunk: ...


 class IndexingBatchAdapter(Protocol):
@@ -254,18 +258,24 @@ class IndexingBatchAdapter(Protocol):
    ) -> Generator[TransactionalContext, None, None]:
        """Provide a transaction/row-lock context for critical updates."""

-    def build_metadata_aware_chunks(
+    def prepare_enrichment(
        self,
-        chunks_with_embeddings: list[IndexChunk],
-        chunk_content_scores: list[float],
-        tenant_id: str,
        context: "DocumentBatchPrepareContext",
-    ) -> BuildMetadataAwareChunksResult: ...
+        tenant_id: str,
+        chunks: list[DocAwareChunk],
+    ) -> ChunkEnrichmentContext:
+        """Prepare per-chunk enrichment data (access, document sets, boost, etc.).
+
+        Precondition: ``chunks`` have already been through the embedding step
+        (i.e. they are ``IndexChunk`` instances with populated embeddings,
+        passed here as the base ``DocAwareChunk`` type).
+        """
+        ...

    def post_index(
        self,
        context: "DocumentBatchPrepareContext",
        updatable_chunk_data: list[UpdatableChunkData],
        filtered_documents: list[Document],
-        result: BuildMetadataAwareChunksResult,
+        enrichment: ChunkEnrichmentContext,
    ) -> None: ...
--- a/backend/onyx/indexing/vector_db_insertion.py
+++ b/backend/onyx/indexing/vector_db_insertion.py
@@ -1,6 +1,9 @@
 import time
-from collections import defaultdict
+from collections.abc import Callable
+from collections.abc import Iterable
 from http import HTTPStatus
+from itertools import chain
+from itertools import groupby

 import httpx

@@ -28,22 +31,22 @@ def _log_insufficient_storage_error(e: Exception) -> None:

 def write_chunks_to_vector_db_with_backoff(
    document_index: DocumentIndex,
-    chunks: list[DocMetadataAwareIndexChunk],
+    make_chunks: Callable[[], Iterable[DocMetadataAwareIndexChunk]],
    index_batch_params: IndexBatchParams,
 ) -> tuple[list[DocumentInsertionRecord], list[ConnectorFailure]]:
    """Tries to insert all chunks in one large batch. If that batch fails for any reason,
    goes document by document to isolate the failure(s).

    IMPORTANT: must pass in whole documents at a time not individual chunks, since the
-    vector DB interface assumes that all chunks for a single document are present.
+    vector DB interface assumes that all chunks for a single document are present. The
+    chunks of each document must also be contiguous, as the per-doc retry uses groupby.
    """
-
    # first try to write the chunks to the vector db
    try:
        return (
            list(
                document_index.index(
-                    chunks=chunks,
+                    chunks=make_chunks(),
                    index_batch_params=index_batch_params,
                )
            ),
@@ -60,14 +63,16 @@ def write_chunks_to_vector_db_with_backoff(
        # wait a couple seconds just to give the vector db a chance to recover
        time.sleep(2)

-    # try writing each doc one by one
-    chunks_for_docs: dict[str, list[DocMetadataAwareIndexChunk]] = defaultdict(list)
-    for chunk in chunks:
-        chunks_for_docs[chunk.source_document.id].append(chunk)
-
    insertion_records: list[DocumentInsertionRecord] = []
    failures: list[ConnectorFailure] = []
-    for doc_id, chunks_for_doc in chunks_for_docs.items():
+
+    def key(chunk: DocMetadataAwareIndexChunk) -> str:
+        return chunk.source_document.id
+
+    for doc_id, chunks_for_doc in groupby(make_chunks(), key=key):
+        first_chunk = next(chunks_for_doc)
+        chunks_for_doc = chain([first_chunk], chunks_for_doc)
+
        try:
            insertion_records.extend(
                document_index.index(
@@ -87,9 +92,7 @@ def write_chunks_to_vector_db_with_backoff(
                ConnectorFailure(
                    failed_document=DocumentFailure(
                        document_id=doc_id,
-                        document_link=(
-                            chunks_for_doc[0].get_link() if chunks_for_doc else None
-                        ),
+                        document_link=first_chunk.get_link(),
                    ),
                    failure_message=str(e),
                    exception=e,
--- a/backend/tests/external_dependency_unit/celery/test_user_file_indexing_adapter.py
+++ b/backend/tests/external_dependency_unit/celery/test_user_file_indexing_adapter.py
@@ -153,15 +153,13 @@ class TestAdapterWritesBothMetadataFields:
        doc = chunk.source_document
        context = DocumentBatchPrepareContext(updatable_docs=[doc], id_to_boost_map={})

-        result = adapter.build_metadata_aware_chunks(
-            chunks_with_embeddings=[chunk],
-            chunk_content_scores=[1.0],
-            tenant_id=TEST_TENANT_ID,
+        enricher = adapter.prepare_enrichment(
            context=context,
+            tenant_id=TEST_TENANT_ID,
+            chunks=[chunk],
        )
+        aware_chunk = enricher.enrich_chunk(chunk, 1.0)

-        assert len(result.chunks) == 1
-        aware_chunk = result.chunks[0]
        assert persona.id in aware_chunk.personas
        assert aware_chunk.user_project == []

@@ -190,15 +188,13 @@ class TestAdapterWritesBothMetadataFields:
            updatable_docs=[chunk.source_document], id_to_boost_map={}
        )

-        result = adapter.build_metadata_aware_chunks(
-            chunks_with_embeddings=[chunk],
-            chunk_content_scores=[1.0],
-            tenant_id=TEST_TENANT_ID,
+        enricher = adapter.prepare_enrichment(
            context=context,
+            tenant_id=TEST_TENANT_ID,
+            chunks=[chunk],
        )
+        aware_chunk = enricher.enrich_chunk(chunk, 1.0)

-        assert len(result.chunks) == 1
-        aware_chunk = result.chunks[0]
        assert project.id in aware_chunk.user_project
        assert aware_chunk.personas == []

@@ -229,14 +225,13 @@ class TestAdapterWritesBothMetadataFields:
            updatable_docs=[chunk.source_document], id_to_boost_map={}
        )

-        result = adapter.build_metadata_aware_chunks(
-            chunks_with_embeddings=[chunk],
-            chunk_content_scores=[1.0],
-            tenant_id=TEST_TENANT_ID,
+        enricher = adapter.prepare_enrichment(
            context=context,
+            tenant_id=TEST_TENANT_ID,
+            chunks=[chunk],
        )
+        aware_chunk = enricher.enrich_chunk(chunk, 1.0)

-        aware_chunk = result.chunks[0]
        assert persona.id in aware_chunk.personas
        assert project.id in aware_chunk.user_project

@@ -261,14 +256,13 @@ class TestAdapterWritesBothMetadataFields:
            updatable_docs=[chunk.source_document], id_to_boost_map={}
        )

-        result = adapter.build_metadata_aware_chunks(
-            chunks_with_embeddings=[chunk],
-            chunk_content_scores=[1.0],
-            tenant_id=TEST_TENANT_ID,
+        enricher = adapter.prepare_enrichment(
            context=context,
+            tenant_id=TEST_TENANT_ID,
+            chunks=[chunk],
        )
+        aware_chunk = enricher.enrich_chunk(chunk, 1.0)

-        aware_chunk = result.chunks[0]
        assert aware_chunk.personas == []
        assert aware_chunk.user_project == []

@@ -300,12 +294,11 @@ class TestAdapterWritesBothMetadataFields:
            updatable_docs=[chunk.source_document], id_to_boost_map={}
        )

-        result = adapter.build_metadata_aware_chunks(
-            chunks_with_embeddings=[chunk],
-            chunk_content_scores=[1.0],
-            tenant_id=TEST_TENANT_ID,
+        enricher = adapter.prepare_enrichment(
            context=context,
+            tenant_id=TEST_TENANT_ID,
+            chunks=[chunk],
        )
+        aware_chunk = enricher.enrich_chunk(chunk, 1.0)

-        aware_chunk = result.chunks[0]
        assert set(aware_chunk.personas) == {persona_a.id, persona_b.id}
--- a/backend/tests/unit/onyx/document_index/opensearch/test_opensearch_batch_flush.py
+++ b/backend/tests/unit/onyx/document_index/opensearch/test_opensearch_batch_flush.py
@@ -0,0 +1,226 @@
+from unittest.mock import MagicMock
+from unittest.mock import patch
+
+from onyx.access.models import DocumentAccess
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.models import Document
+from onyx.connectors.models import TextSection
+from onyx.document_index.interfaces_new import IndexingMetadata
+from onyx.document_index.interfaces_new import TenantState
+from onyx.document_index.opensearch.opensearch_document_index import (
+    OpenSearchDocumentIndex,
+)
+from onyx.indexing.models import DocMetadataAwareIndexChunk
+
+
+def _make_chunk(
+    doc_id: str,
+    chunk_id: int,
+) -> DocMetadataAwareIndexChunk:
+    """Creates a minimal DocMetadataAwareIndexChunk for testing."""
+    doc = Document(
+        id=doc_id,
+        sections=[TextSection(text="test", link="http://test.com")],
+        source=DocumentSource.FILE,
+        semantic_identifier="test_doc",
+        metadata={},
+    )
+    access = DocumentAccess.build(
+        user_emails=[],
+        user_groups=[],
+        external_user_emails=[],
+        external_user_group_ids=[],
+        is_public=True,
+    )
+    return DocMetadataAwareIndexChunk(
+        chunk_id=chunk_id,
+        blurb="test",
+        content="test content",
+        source_links={0: "http://test.com"},
+        image_file_id=None,
+        section_continuation=False,
+        source_document=doc,
+        title_prefix="",
+        metadata_suffix_semantic="",
+        metadata_suffix_keyword="",
+        mini_chunk_texts=None,
+        large_chunk_id=None,
+        doc_summary="",
+        chunk_context="",
+        contextual_rag_reserved_tokens=0,
+        embeddings={"full_embedding": [0.1] * 10, "mini_chunk_embeddings": []},
+        title_embedding=[0.1] * 10,
+        tenant_id="test_tenant",
+        access=access,
+        document_sets=set(),
+        user_project=[],
+        personas=[],
+        boost=0,
+        aggregated_chunk_boost_factor=1.0,
+        ancestor_hierarchy_node_ids=[],
+    )
+
+
+def _make_index() -> OpenSearchDocumentIndex:
+    """Creates an OpenSearchDocumentIndex with a mocked client."""
+    mock_client = MagicMock()
+    mock_client.bulk_index_documents = MagicMock()
+
+    tenant_state = TenantState(tenant_id="test_tenant", multitenant=False)
+
+    index = OpenSearchDocumentIndex.__new__(OpenSearchDocumentIndex)
+    index._index_name = "test_index"
+    index._client = mock_client
+    index._tenant_state = tenant_state
+
+    return index
+
+
+def _make_metadata(doc_id: str, chunk_count: int) -> IndexingMetadata:
+    return IndexingMetadata(
+        doc_id_to_chunk_cnt_diff={
+            doc_id: IndexingMetadata.ChunkCounts(
+                old_chunk_cnt=0,
+                new_chunk_cnt=chunk_count,
+            ),
+        },
+    )
+
+
+@patch(
+    "onyx.document_index.opensearch.opensearch_document_index.MAX_CHUNKS_PER_DOC_BATCH",
+    100,
+)
+def test_single_doc_under_batch_limit_flushes_once() -> None:
+    """A document with fewer chunks than MAX_CHUNKS_PER_DOC_BATCH should flush once."""
+    index = _make_index()
+    doc_id = "doc_1"
+    num_chunks = 50
+    chunks = [_make_chunk(doc_id, i) for i in range(num_chunks)]
+    metadata = _make_metadata(doc_id, num_chunks)
+
+    with patch.object(index, "delete", return_value=0):
+        index.index(chunks, metadata)
+
+    assert index._client.bulk_index_documents.call_count == 1
+    batch_arg = index._client.bulk_index_documents.call_args_list[0]
+    assert len(batch_arg.kwargs["documents"]) == num_chunks
+
+
+@patch(
+    "onyx.document_index.opensearch.opensearch_document_index.MAX_CHUNKS_PER_DOC_BATCH",
+    100,
+)
+def test_single_doc_over_batch_limit_flushes_multiple_times() -> None:
+    """A document with more chunks than MAX_CHUNKS_PER_DOC_BATCH should flush multiple times."""
+    index = _make_index()
+    doc_id = "doc_1"
+    num_chunks = 250
+    chunks = [_make_chunk(doc_id, i) for i in range(num_chunks)]
+    metadata = _make_metadata(doc_id, num_chunks)
+
+    with patch.object(index, "delete", return_value=0):
+        index.index(chunks, metadata)
+
+    # 250 chunks / 100 per batch = 3 flushes (100 + 100 + 50)
+    assert index._client.bulk_index_documents.call_count == 3
+    batch_sizes = [
+        len(call.kwargs["documents"])
+        for call in index._client.bulk_index_documents.call_args_list
+    ]
+    assert batch_sizes == [100, 100, 50]
+
+
+@patch(
+    "onyx.document_index.opensearch.opensearch_document_index.MAX_CHUNKS_PER_DOC_BATCH",
+    100,
+)
+def test_single_doc_exactly_at_batch_limit() -> None:
+    """A document with exactly MAX_CHUNKS_PER_DOC_BATCH chunks should flush once
+    (the flush happens on the next chunk, not at the boundary)."""
+    index = _make_index()
+    doc_id = "doc_1"
+    num_chunks = 100
+    chunks = [_make_chunk(doc_id, i) for i in range(num_chunks)]
+    metadata = _make_metadata(doc_id, num_chunks)
+
+    with patch.object(index, "delete", return_value=0):
+        index.index(chunks, metadata)
+
+    # The batch-size check only fires when another chunk for the same document
+    # arrives while current_chunks already holds MAX_CHUNKS_PER_DOC_BATCH items,
+    # i.e. it would first trigger on the 101st chunk.
+    # With exactly 100 chunks, the 100th chunk sees len(current_chunks) == 99
+    # and is simply appended, bringing the length to 100.
+    # No 101st chunk arrives, so the final flush handles all 100 in one call.
+    assert index._client.bulk_index_documents.call_count == 1
+
+
+@patch(
+    "onyx.document_index.opensearch.opensearch_document_index.MAX_CHUNKS_PER_DOC_BATCH",
+    100,
+)
+def test_single_doc_one_over_batch_limit() -> None:
+    """101 chunks for one doc: first 100 flushed when the 101st arrives, then
+    the 101st is flushed at the end."""
+    index = _make_index()
+    doc_id = "doc_1"
+    num_chunks = 101
+    chunks = [_make_chunk(doc_id, i) for i in range(num_chunks)]
+    metadata = _make_metadata(doc_id, num_chunks)
+
+    with patch.object(index, "delete", return_value=0):
+        index.index(chunks, metadata)
+
+    assert index._client.bulk_index_documents.call_count == 2
+    batch_sizes = [
+        len(call.kwargs["documents"])
+        for call in index._client.bulk_index_documents.call_args_list
+    ]
+    assert batch_sizes == [100, 1]
+
+
+@patch(
+    "onyx.document_index.opensearch.opensearch_document_index.MAX_CHUNKS_PER_DOC_BATCH",
+    100,
+)
+def test_multiple_docs_each_under_limit_flush_per_doc() -> None:
+    """Multiple documents each under the batch limit should flush once per document."""
+    index = _make_index()
+    chunks = []
+    for doc_idx in range(3):
+        doc_id = f"doc_{doc_idx}"
+        for chunk_idx in range(50):
+            chunks.append(_make_chunk(doc_id, chunk_idx))
+
+    metadata = IndexingMetadata(
+        doc_id_to_chunk_cnt_diff={
+            f"doc_{i}": IndexingMetadata.ChunkCounts(old_chunk_cnt=0, new_chunk_cnt=50)
+            for i in range(3)
+        },
+    )
+
+    with patch.object(index, "delete", return_value=0):
+        index.index(chunks, metadata)
+
+    # 3 documents = 3 flushes (2 at document boundaries + 1 final flush)
+    assert index._client.bulk_index_documents.call_count == 3
+
+
+@patch(
+    "onyx.document_index.opensearch.opensearch_document_index.MAX_CHUNKS_PER_DOC_BATCH",
+    100,
+)
+def test_delete_called_once_per_document() -> None:
+    """Even with multiple flushes for a single document, delete should only be
+    called once per document."""
+    index = _make_index()
+    doc_id = "doc_1"
+    num_chunks = 250
+    chunks = [_make_chunk(doc_id, i) for i in range(num_chunks)]
+    metadata = _make_metadata(doc_id, num_chunks)
+
+    with patch.object(index, "delete", return_value=0) as mock_delete:
+        index.index(chunks, metadata)
+
+    mock_delete.assert_called_once_with(doc_id, None)
--- a/backend/tests/unit/onyx/document_index/vespa/test_vespa_batch_flush.py
+++ b/backend/tests/unit/onyx/document_index/vespa/test_vespa_batch_flush.py
@@ -0,0 +1,152 @@
+"""Unit tests for VespaDocumentIndex.index().
+
+These tests mock all external I/O (the Vespa indexing, deletion, and chunk
+ID lookup helpers plus the HTTP client) and verify that index() streams
+chunks in order, in batches of at most BATCH_SIZE.
+"""
+
+from unittest.mock import MagicMock
+from unittest.mock import patch
+
+from onyx.access.models import DocumentAccess
+from onyx.configs.constants import DocumentSource
+from onyx.connectors.models import Document
+from onyx.connectors.models import TextSection
+from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo
+from onyx.document_index.interfaces_new import IndexingMetadata
+from onyx.document_index.interfaces_new import TenantState
+from onyx.document_index.vespa.vespa_document_index import VespaDocumentIndex
+from onyx.indexing.models import ChunkEmbedding
+from onyx.indexing.models import DocMetadataAwareIndexChunk
+from onyx.indexing.models import IndexChunk
+
+
+def _make_chunk(
+    doc_id: str,
+    chunk_id: int = 0,
+    content: str = "test content",
+) -> DocMetadataAwareIndexChunk:
+    doc = Document(
+        id=doc_id,
+        semantic_identifier="test_doc",
+        sections=[TextSection(text=content, link=None)],
+        source=DocumentSource.NOT_APPLICABLE,
+        metadata={},
+    )
+    index_chunk = IndexChunk(
+        chunk_id=chunk_id,
+        blurb=content[:50],
+        content=content,
+        source_links=None,
+        image_file_id=None,
+        section_continuation=False,
+        source_document=doc,
+        title_prefix="",
+        metadata_suffix_semantic="",
+        metadata_suffix_keyword="",
+        contextual_rag_reserved_tokens=0,
+        doc_summary="",
+        chunk_context="",
+        mini_chunk_texts=None,
+        large_chunk_id=None,
+        embeddings=ChunkEmbedding(
+            full_embedding=[0.1] * 10,
+            mini_chunk_embeddings=[],
+        ),
+        title_embedding=None,
+    )
+    access = DocumentAccess.build(
+        user_emails=[],
+        user_groups=[],
+        external_user_emails=[],
+        external_user_group_ids=[],
+        is_public=True,
+    )
+    return DocMetadataAwareIndexChunk.from_index_chunk(
+        index_chunk=index_chunk,
+        access=access,
+        document_sets=set(),
+        user_project=[],
+        personas=[],
+        boost=0,
+        aggregated_chunk_boost_factor=1.0,
+        tenant_id="test_tenant",
+    )
+
+
+def _make_indexing_metadata(
+    doc_ids: list[str],
+    old_counts: list[int],
+    new_counts: list[int],
+) -> IndexingMetadata:
+    return IndexingMetadata(
+        doc_id_to_chunk_cnt_diff={
+            doc_id: IndexingMetadata.ChunkCounts(
+                old_chunk_cnt=old,
+                new_chunk_cnt=new,
+            )
+            for doc_id, old, new in zip(doc_ids, old_counts, new_counts)
+        }
+    )
+
+
+def _stub_enrich(
+    doc_id: str,
+    old_chunk_cnt: int,
+) -> EnrichedDocumentIndexingInfo:
+    """Build an EnrichedDocumentIndexingInfo that says 'no chunks to delete'
+    when old_chunk_cnt == 0, or 'has existing chunks' otherwise."""
+    return EnrichedDocumentIndexingInfo(
+        doc_id=doc_id,
+        chunk_start_index=0,
+        old_version=False,
+        chunk_end_index=old_chunk_cnt,
+    )
+
+
+@patch("onyx.document_index.vespa.vespa_document_index.batch_index_vespa_chunks")
+@patch("onyx.document_index.vespa.vespa_document_index.delete_vespa_chunks")
+@patch(
+    "onyx.document_index.vespa.vespa_document_index.get_document_chunk_ids",
+    return_value=[],
+)
+@patch("onyx.document_index.vespa.vespa_document_index._enrich_basic_chunk_info")
+@patch(
+    "onyx.document_index.vespa.vespa_document_index.BATCH_SIZE",
+    3,
+)
+def test_index_respects_batch_size(
+    mock_enrich: MagicMock,
+    mock_get_chunk_ids: MagicMock,  # noqa: ARG001
+    mock_delete: MagicMock,  # noqa: ARG001
+    mock_batch_index: MagicMock,
+) -> None:
+    """When chunks exceed BATCH_SIZE, batch_index_vespa_chunks is called
+    multiple times with correctly sized batches."""
+    mock_enrich.return_value = _stub_enrich("doc1", old_chunk_cnt=0)
+
+    index = VespaDocumentIndex(
+        index_name="test_index",
+        tenant_state=TenantState(tenant_id="test_tenant", multitenant=False),
+        large_chunks_enabled=False,
+        httpx_client=MagicMock(),
+    )
+
+    chunks = [_make_chunk("doc1", chunk_id=i) for i in range(7)]
+    metadata = _make_indexing_metadata(["doc1"], old_counts=[0], new_counts=[7])
+
+    results = index.index(chunks=chunks, indexing_metadata=metadata)
+
+    assert len(results) == 1
+
+    # With BATCH_SIZE=3 and 7 chunks: batches of 3, 3, 1
+    assert mock_batch_index.call_count == 3
+    batch_sizes = [len(c.kwargs["chunks"]) for c in mock_batch_index.call_args_list]
+    assert batch_sizes == [3, 3, 1]
+
+    # Verify all chunks are accounted for and in order
+    all_indexed = [
+        chunk for c in mock_batch_index.call_args_list for chunk in c.kwargs["chunks"]
+    ]
+    assert len(all_indexed) == 7
+    assert [c.chunk_id for c in all_indexed] == list(range(7))
--- a/backend/tests/unit/onyx/indexing/test_personas_in_chunks.py
+++ b/backend/tests/unit/onyx/indexing/test_personas_in_chunks.py
@@ -116,7 +116,7 @@ def _run_adapter_build(
    project_ids_map: dict[str, list[int]],
    persona_ids_map: dict[str, list[int]],
 ) -> list[DocMetadataAwareIndexChunk]:
-    """Helper that runs UserFileIndexingAdapter.build_metadata_aware_chunks
+    """Helper that runs UserFileIndexingAdapter.prepare_enrichment + enrich_chunk
    with all external dependencies mocked."""
    from onyx.indexing.adapters.user_file_indexing_adapter import (
        UserFileIndexingAdapter,
@@ -155,14 +155,12 @@ def _run_adapter_build(
            side_effect=Exception("no LLM in tests"),
        ),
    ):
-        result = adapter.build_metadata_aware_chunks(
-            chunks_with_embeddings=[chunk],
-            chunk_content_scores=[1.0],
-            tenant_id="test_tenant",
+        enricher = adapter.prepare_enrichment(
            context=context,
+            tenant_id="test_tenant",
+            chunks=[chunk],
        )
-
-    return result.chunks
+        return [enricher.enrich_chunk(chunk, 1.0)]


 def test_build_metadata_aware_chunks_includes_persona_ids() -> None:
Author	SHA1	Message	Date
Dane Urban	6985661dcd	.	2026-03-26 10:33:56 +11:00
Dane Urban	3e2a10ce9d	.	2026-03-26 10:26:58 +11:00
Dane Urban	389eb6c281	Update	2026-03-26 09:59:26 +11:00
Dane Urban	ff88d1886b	Update	2026-03-26 09:57:24 +11:00
Dane Urban	18dac2ba71	.	2026-03-24 21:40:43 +11:00
Dane Urban	96cd5bb751	.	2026-03-24 21:31:21 +11:00
Dane Urban	30a7c40c55	.	2026-03-24 21:24:44 +11:00
Dane Urban	641fb61c45	.	2026-03-24 21:23:36 +11:00
Dane Urban	6f8d9cfdd7	.	2026-03-24 21:23:36 +11:00
Dane Urban	2784e42cfe	.	2026-03-24 21:23:36 +11:00
Dane Urban	4f5fc65428	.	2026-03-24 21:23:36 +11:00
Dane Urban	8fcdd3a3fb	.	2026-03-24 21:23:36 +11:00
Dane Urban	3b7c53aeb1	Adapter refactor	2026-03-24 21:23:36 +11:00
Dane Urban	ea58e82aed	Support streaming via document adapter	2026-03-24 21:23:36 +11:00
Dane Urban	bd35585785	Add extra tests	2026-03-24 21:23:34 +11:00
Dane Urban	cf9bd7e511	.	2026-03-24 21:21:01 +11:00
Dane Urban	b5dd17a371	.	2026-03-24 21:19:38 +11:00
Dane Urban	d62d0c1864	.	2026-03-24 21:14:52 +11:00
Dane Urban	2c92742c62	.	2026-03-24 21:06:48 +11:00
Dane Urban	1e1402e4f1	.	2026-03-24 21:06:48 +11:00
Dane Urban	440818a082	Max chunks	2026-03-24 21:06:48 +11:00
Dane Urban	bd9f40d1c1	.	2026-03-24 21:06:46 +11:00
Dane Urban	c85e090c13	.	2026-03-24 21:06:23 +11:00