.

file saving
.
2026-04-20 08:56:43 +00:00 · 2026-04-17 13:50:33 -07:00 · 2026-04-17 11:53:55 -07:00 · 2026-04-17 09:58:55 -07:00 · 2026-04-16 18:38:46 -07:00 · 2026-04-16 15:50:29 -07:00
14 changed files with 556 additions and 0 deletions
--- a/backend/alembic/versions/91d150c361f6_add_file_id_to_documents.py
+++ b/backend/alembic/versions/91d150c361f6_add_file_id_to_documents.py
@@ -0,0 +1,27 @@
+"""Add file_id to documents
+
+Revision ID: 91d150c361f6
+Revises: d129f37b3d87
+Create Date: 2026-04-16 15:43:30.314823
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = "91d150c361f6"
+down_revision = "d129f37b3d87"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.add_column(
+        "document",
+        sa.Column("file_id", sa.String(), nullable=True),
+    )
+
+
+def downgrade() -> None:
+    op.drop_column("document", "file_id")
--- a/backend/onyx/background/indexing/run_docfetching.py
+++ b/backend/onyx/background/indexing/run_docfetching.py
@@ -58,6 +58,8 @@ from onyx.db.indexing_coordination import IndexingCoordination
 from onyx.db.models import IndexAttempt
 from onyx.file_store.document_batch_storage import DocumentBatchStorage
 from onyx.file_store.document_batch_storage import get_document_batch_storage
+from onyx.file_store.staging import build_raw_file_callback
+from onyx.file_store.staging import RawFileCallback
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
 from onyx.redis.redis_hierarchy import cache_hierarchy_nodes_batch
@@ -90,6 +92,7 @@ def _get_connector_runner(
    end_time: datetime,
    include_permissions: bool,
    leave_connector_active: bool = LEAVE_CONNECTOR_ACTIVE_ON_INITIALIZATION_FAILURE,
+    raw_file_callback: RawFileCallback | None = None,
 ) -> ConnectorRunner:
    """
    NOTE: `start_time` and `end_time` are only used for poll connectors
@@ -108,6 +111,7 @@ def _get_connector_runner(
            input_type=task,
            connector_specific_config=attempt.connector_credential_pair.connector.connector_specific_config,
            credential=attempt.connector_credential_pair.credential,
+            raw_file_callback=raw_file_callback,
        )

        # validate the connector settings
@@ -275,6 +279,12 @@ def run_docfetching_entrypoint(
        f"credentials='{credential_id}'"
    )

+    raw_file_callback = build_raw_file_callback(
+        index_attempt_id=index_attempt_id,
+        cc_pair_id=connector_credential_pair_id,
+        tenant_id=tenant_id,
+    )
+
    connector_document_extraction(
        app,
        index_attempt_id,
@@ -282,6 +292,7 @@ def run_docfetching_entrypoint(
        attempt.search_settings_id,
        tenant_id,
        callback,
+        raw_file_callback=raw_file_callback,
    )

    logger.info(
@@ -301,6 +312,7 @@ def connector_document_extraction(
    search_settings_id: int,
    tenant_id: str,
    callback: IndexingHeartbeatInterface | None = None,
+    raw_file_callback: RawFileCallback | None = None,
 ) -> None:
    """Extract documents from connector and queue them for indexing pipeline processing.

@@ -451,6 +463,7 @@ def connector_document_extraction(
            start_time=window_start,
            end_time=window_end,
            include_permissions=should_fetch_permissions_during_indexing,
+            raw_file_callback=raw_file_callback,
        )

        # don't use a checkpoint if we're explicitly indexing from
--- a/backend/onyx/configs/constants.py
+++ b/backend/onyx/configs/constants.py
@@ -372,6 +372,7 @@ class FileOrigin(str, Enum):
    CONNECTOR_METADATA = "connector_metadata"
    GENERATED_REPORT = "generated_report"
    INDEXING_CHECKPOINT = "indexing_checkpoint"
+    INDEXING_STAGING = "indexing_staging"
    PLAINTEXT_CACHE = "plaintext_cache"
    OTHER = "other"
    QUERY_HISTORY_CSV = "query_history_csv"
--- a/backend/onyx/connectors/factory.py
+++ b/backend/onyx/connectors/factory.py
@@ -22,6 +22,7 @@ from onyx.db.credentials import backend_update_credential_json
 from onyx.db.credentials import fetch_credential_by_id
 from onyx.db.enums import AccessType
 from onyx.db.models import Credential
+from onyx.file_store.staging import RawFileCallback
 from shared_configs.contextvars import get_current_tenant_id


@@ -107,6 +108,7 @@ def instantiate_connector(
    input_type: InputType,
    connector_specific_config: dict[str, Any],
    credential: Credential,
+    raw_file_callback: RawFileCallback | None = None,
 ) -> BaseConnector:
    connector_class = identify_connector_class(source, input_type)

@@ -130,6 +132,9 @@ def instantiate_connector(

    connector.set_allow_images(get_image_extraction_and_analysis_enabled())

+    if raw_file_callback is not None:
+        connector.set_raw_file_callback(raw_file_callback)
+
    return connector


--- a/backend/onyx/connectors/interfaces.py
+++ b/backend/onyx/connectors/interfaces.py
@@ -15,6 +15,7 @@ from onyx.connectors.models import ConnectorFailure
 from onyx.connectors.models import Document
 from onyx.connectors.models import HierarchyNode
 from onyx.connectors.models import SlimDocument
+from onyx.file_store.staging import RawFileCallback
 from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
 from onyx.utils.variable_functionality import fetch_ee_implementation_or_noop

@@ -42,6 +43,9 @@ class NormalizationResult(BaseModel):
 class BaseConnector(abc.ABC, Generic[CT]):
    REDIS_KEY_PREFIX = "da_connector_data:"

+    # Optional raw-file persistence hook to save original file
+    raw_file_callback: RawFileCallback | None = None
+
    @abc.abstractmethod
    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        raise NotImplementedError
@@ -88,6 +92,15 @@ class BaseConnector(abc.ABC, Generic[CT]):
        """Implement if the underlying connector wants to skip/allow image downloading
        based on the application level image analysis setting."""

+    def set_raw_file_callback(self, callback: RawFileCallback) -> None:
+        """Inject the per-attempt raw-file persistence callback.
+
+        Wired up by the docfetching entrypoint via `instantiate_connector`.
+        Connectors that don't care about persisting raw bytes can ignore this
+        — `raw_file_callback` simply stays `None`.
+        """
+        self.raw_file_callback = callback
+
    @classmethod
    def normalize_url(cls, url: str) -> "NormalizationResult":  # noqa: ARG003
        """Normalize a URL to match the canonical Document.id format used during ingestion.
--- a/backend/onyx/connectors/models.py
+++ b/backend/onyx/connectors/models.py
@@ -231,6 +231,8 @@ class DocumentBase(BaseModel):
    # Set during docfetching after hierarchy nodes are cached
    parent_hierarchy_node_id: int | None = None

+    file_id: str | None = None
+
    def get_title_for_document_index(
        self,
    ) -> str | None:
@@ -370,6 +372,7 @@ class Document(DocumentBase):
            secondary_owners=base.secondary_owners,
            title=base.title,
            from_ingestion_api=base.from_ingestion_api,
+            file_id=base.file_id,
        )

    def __sizeof__(self) -> int:
--- a/backend/onyx/db/document.py
+++ b/backend/onyx/db/document.py
@@ -696,6 +696,7 @@ def upsert_documents(
                        else {}
                    ),
                    doc_metadata=doc.doc_metadata,
+                    file_id=doc.file_id,
                )
            )
            for doc in seen_documents.values()
@@ -712,6 +713,7 @@ def upsert_documents(
        "secondary_owners": insert_stmt.excluded.secondary_owners,
        "doc_metadata": insert_stmt.excluded.doc_metadata,
        "parent_hierarchy_node_id": insert_stmt.excluded.parent_hierarchy_node_id,
+        "file_id": insert_stmt.excluded.file_id,
    }
    if includes_permissions:
        # Use COALESCE to preserve existing permissions when new values are NULL.
--- a/backend/onyx/db/file_record.py
+++ b/backend/onyx/db/file_record.py
@@ -62,6 +62,21 @@ def delete_filerecord_by_file_id(
    db_session.query(FileRecord).filter_by(file_id=file_id).delete()


+def update_filerecord_origin(
+    file_id: str,
+    from_origin: FileOrigin,
+    to_origin: FileOrigin,
+    db_session: Session,
+) -> None:
+    """Change a file_record's `file_origin`, filtered on the current origin
+    so the update is idempotent. Caller owns the commit.
+    """
+    db_session.query(FileRecord).filter(
+        FileRecord.file_id == file_id,
+        FileRecord.file_origin == from_origin,
+    ).update({FileRecord.file_origin: to_origin})
+
+
 def upsert_filerecord(
    file_id: str,
    display_name: str,
--- a/backend/onyx/db/models.py
+++ b/backend/onyx/db/models.py
@@ -952,6 +952,7 @@ class Document(Base):
    semantic_id: Mapped[str] = mapped_column(NullFilteredString)
    # First Section's link
    link: Mapped[str | None] = mapped_column(NullFilteredString, nullable=True)
+    file_id: Mapped[str | None] = mapped_column(String, nullable=True)

    # The updated time is also used as a measure of the last successful state of the doc
    # pulled from the source (to help skip reindexing already updated docs in case of
--- a/backend/onyx/document_index/interfaces.py
+++ b/backend/onyx/document_index/interfaces.py
@@ -98,6 +98,9 @@ class DocumentMetadata:
    # The resolved database ID of the parent hierarchy node (folder/container)
    parent_hierarchy_node_id: int | None = None

+    # Opt-in pointer to the persisted raw file for this document (file_store id).
+    file_id: str | None = None
+

@dataclass
 class VespaDocumentFields:
--- a/backend/onyx/file_store/staging.py
+++ b/backend/onyx/file_store/staging.py
@@ -0,0 +1,82 @@
+from collections.abc import Callable
+from typing import Any
+from typing import IO
+
+from sqlalchemy.orm import Session
+
+from onyx.configs.constants import FileOrigin
+from onyx.db.file_record import update_filerecord_origin
+from onyx.file_store.file_store import get_default_file_store
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+# (content, content_type) -> file_id
+RawFileCallback = Callable[[IO[bytes], str], str]
+
+
+def stage_raw_file(
+    content: IO,
+    content_type: str,
+    *,
+    metadata: dict[str, Any],
+) -> str:
+    """Persist raw bytes to the file store with FileOrigin.INDEXING_STAGING.
+
+    `metadata` is attached to the file_record so that downstream promotion
+    (in docprocessing) and orphan reaping (TTL janitor) can locate the file
+    by its originating context.
+    """
+    file_store = get_default_file_store()
+    file_id = file_store.save_file(
+        content=content,
+        display_name=None,
+        file_origin=FileOrigin.INDEXING_STAGING,
+        file_type=content_type,
+        file_metadata=metadata,
+    )
+    return file_id
+
+
+def build_raw_file_callback(
+    *,
+    index_attempt_id: int,
+    cc_pair_id: int,
+    tenant_id: str,
+) -> RawFileCallback:
+    """Build a per-attempt callback that connectors can invoke to opt in to
+    raw-file persistence. The closure binds the attempt-level context as the
+    staging metadata so the connector only needs to pass per-call info
+    (bytes, content_type) and gets back a file_id to attach to its Document.
+    """
+    metadata: dict[str, Any] = {
+        "index_attempt_id": index_attempt_id,
+        "cc_pair_id": cc_pair_id,
+        "tenant_id": tenant_id,
+    }
+
+    def _callback(content: IO[bytes], content_type: str) -> str:
+        return stage_raw_file(
+            content=content,
+            content_type=content_type,
+            metadata=metadata,
+        )
+
+    return _callback
+
+
+def promote_staged_file(db_session: Session, file_id: str) -> None:
+    """Mark a previously-staged file as `FileOrigin.CONNECTOR`.
+
+    Idempotent — the underlying update filters on the STAGING origin so
+    repeated calls no-op once the file has already been promoted or removed.
+    Caller owns the commit so promotion stays transactional with whatever
+    document-level bookkeeping the caller is doing.
+    """
+    update_filerecord_origin(
+        file_id=file_id,
+        from_origin=FileOrigin.INDEXING_STAGING,
+        to_origin=FileOrigin.CONNECTOR,
+        db_session=db_session,
+    )
--- a/backend/onyx/indexing/indexing_pipeline.py
+++ b/backend/onyx/indexing/indexing_pipeline.py
@@ -49,6 +49,7 @@ from onyx.document_index.interfaces import DocumentMetadata
 from onyx.document_index.interfaces import IndexBatchParams
 from onyx.file_processing.image_summarization import summarize_image_with_error_handling
 from onyx.file_store.file_store import get_default_file_store
+from onyx.file_store.staging import promote_staged_file
 from onyx.hooks.executor import execute_hook
 from onyx.hooks.executor import HookSkipped
 from onyx.hooks.executor import HookSoftFailed
@@ -154,6 +155,7 @@ def _upsert_documents_in_db(
            doc_metadata=doc.doc_metadata,
            # parent_hierarchy_node_id is resolved in docfetching using Redis cache
            parent_hierarchy_node_id=doc.parent_hierarchy_node_id,
+            file_id=doc.file_id,
        )
        document_metadata_list.append(db_doc_metadata)

@@ -364,6 +366,39 @@ def index_doc_batch_with_handler(
    return index_pipeline_result


+def _apply_file_id_transitions(
+    documents: list[Document],
+    previous_file_ids: dict[str, str],
+    db_session: Session,
+) -> None:
+    """Finalize file_id lifecycle for the batch.
+
+    `document.file_id` is already written by `upsert_documents`. For each doc
+    whose file_id changed, promote the new staged file to `CONNECTOR` (so the
+    TTL janitor leaves it alone) and delete the replaced one. The delete is
+    best-effort; if it fails the janitor will reap the orphan.
+    """
+    file_store = get_default_file_store()
+    for doc in documents:
+        new_file_id = doc.file_id
+        old_file_id = previous_file_ids.get(doc.id)
+
+        if new_file_id == old_file_id:
+            continue
+
+        if new_file_id is not None:
+            promote_staged_file(db_session=db_session, file_id=new_file_id)
+
+        if old_file_id is not None:
+            try:
+                file_store.delete_file(old_file_id, error_on_missing=False)
+            except Exception:
+                logger.exception(
+                    f"Failed to delete replaced file_id={old_file_id}; "
+                    "will be reaped by janitor."
+                )
+
+
 def index_doc_batch_prepare(
    documents: list[Document],
    index_attempt_metadata: IndexAttemptMetadata,
@@ -382,6 +417,11 @@ def index_doc_batch_prepare(
        document_ids=document_ids,
    )

+    # Capture previous file_ids BEFORE any writes so we know what to reap.
+    previous_file_ids: dict[str, str] = {
+        db_doc.id: db_doc.file_id for db_doc in db_docs if db_doc.file_id is not None
+    }
+
    updatable_docs = (
        get_doc_ids_to_update(documents=documents, db_docs=db_docs)
        if not ignore_time_skip
@@ -404,6 +444,11 @@ def index_doc_batch_prepare(
            index_attempt_metadata=index_attempt_metadata,
            db_session=db_session,
        )
+        _apply_file_id_transitions(
+            documents=updatable_docs,
+            previous_file_ids=previous_file_ids,
+            db_session=db_session,
+        )

    logger.info(
        f"Upserted {len(updatable_docs)} changed docs out of {len(documents)} total docs into the DB"
--- a/backend/tests/external_dependency_unit/indexing/init.py
+++ b/backend/tests/external_dependency_unit/indexing/init.py
--- a/backend/tests/external_dependency_unit/indexing/test_index_doc_batch_prepare.py
+++ b/backend/tests/external_dependency_unit/indexing/test_index_doc_batch_prepare.py
@@ -0,0 +1,346 @@
+"""External dependency unit tests for `index_doc_batch_prepare`.
+
+Validates the file_id lifecycle that runs alongside the document upsert:
+
+    * `document.file_id` is written on insert AND on conflict (upsert path)
+    * Newly-staged files get promoted from INDEXING_STAGING -> CONNECTOR
+    * Replaced files are deleted from both `file_record` and S3
+    * No-op when the file_id is unchanged
+
+Uses real PostgreSQL + real S3/MinIO via the file store.
+"""
+
+from collections.abc import Generator
+from io import BytesIO
+from uuid import uuid4
+
+import pytest
+from sqlalchemy.orm import Session
+
+from onyx.configs.constants import DocumentSource
+from onyx.configs.constants import FileOrigin
+from onyx.connectors.models import Document
+from onyx.connectors.models import IndexAttemptMetadata
+from onyx.connectors.models import InputType
+from onyx.connectors.models import TextSection
+from onyx.db.enums import AccessType
+from onyx.db.enums import ConnectorCredentialPairStatus
+from onyx.db.file_record import get_filerecord_by_file_id_optional
+from onyx.db.models import Connector
+from onyx.db.models import ConnectorCredentialPair
+from onyx.db.models import Credential
+from onyx.db.models import Document as DBDocument
+from onyx.db.models import FileRecord
+from onyx.file_store.file_store import get_default_file_store
+from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_doc(doc_id: str, file_id: str | None = None) -> Document:
+    """Minimal Document for indexing-pipeline tests. MOCK_CONNECTOR avoids
+    triggering the hierarchy-node linking branch (NOTION/CONFLUENCE only)."""
+    return Document(
+        id=doc_id,
+        source=DocumentSource.MOCK_CONNECTOR,
+        semantic_identifier=f"semantic-{doc_id}",
+        sections=[TextSection(text="content", link=None)],
+        metadata={},
+        file_id=file_id,
+    )
+
+
+def _stage_file(content: bytes = b"raw bytes") -> str:
+    """Write bytes to the file store as INDEXING_STAGING and return the file_id.
+
+    Mirrors what the connector raw_file_callback would do during fetch.
+    """
+    return get_default_file_store().save_file(
+        content=BytesIO(content),
+        display_name=None,
+        file_origin=FileOrigin.INDEXING_STAGING,
+        file_type="application/octet-stream",
+        file_metadata={"test": True},
+    )
+
+
+def _get_doc_row(db_session: Session, doc_id: str) -> DBDocument | None:
+    """Reload the document row fresh from DB so we see post-upsert state."""
+    db_session.expire_all()
+    return db_session.query(DBDocument).filter(DBDocument.id == doc_id).one_or_none()
+
+
+def _get_filerecord(db_session: Session, file_id: str) -> FileRecord | None:
+    db_session.expire_all()
+    return get_filerecord_by_file_id_optional(file_id=file_id, db_session=db_session)
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def cc_pair(
+    db_session: Session,
+    tenant_context: None,  # noqa: ARG001
+    initialize_file_store: None,  # noqa: ARG001
+) -> Generator[ConnectorCredentialPair, None, None]:
+    """Create a connector + credential + cc_pair backing the index attempt."""
+    connector = Connector(
+        name=f"test-connector-{uuid4().hex[:8]}",
+        source=DocumentSource.MOCK_CONNECTOR,
+        input_type=InputType.LOAD_STATE,
+        connector_specific_config={},
+        refresh_freq=None,
+        prune_freq=None,
+        indexing_start=None,
+    )
+    db_session.add(connector)
+    db_session.flush()
+
+    credential = Credential(
+        source=DocumentSource.MOCK_CONNECTOR,
+        credential_json={},
+    )
+    db_session.add(credential)
+    db_session.flush()
+
+    pair = ConnectorCredentialPair(
+        connector_id=connector.id,
+        credential_id=credential.id,
+        name=f"test-cc-pair-{uuid4().hex[:8]}",
+        status=ConnectorCredentialPairStatus.ACTIVE,
+        access_type=AccessType.PUBLIC,
+        auto_sync_options=None,
+    )
+    db_session.add(pair)
+    db_session.commit()
+    db_session.refresh(pair)
+
+    yield pair
+
+
+@pytest.fixture
+def attempt_metadata(cc_pair: ConnectorCredentialPair) -> IndexAttemptMetadata:
+    return IndexAttemptMetadata(
+        connector_id=cc_pair.connector_id,
+        credential_id=cc_pair.credential_id,
+        attempt_id=None,
+        request_id="test-request",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+class TestNewDocuments:
+    """First-time inserts — no previous file_id to reconcile against."""
+
+    def test_new_doc_without_file_id(
+        self,
+        db_session: Session,
+        attempt_metadata: IndexAttemptMetadata,
+    ) -> None:
+        doc = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=None)
+
+        index_doc_batch_prepare(
+            documents=[doc],
+            index_attempt_metadata=attempt_metadata,
+            db_session=db_session,
+            ignore_time_skip=True,
+        )
+        db_session.commit()
+
+        row = _get_doc_row(db_session, doc.id)
+        assert row is not None
+        assert row.file_id is None
+
+    def test_new_doc_with_staged_file_id_promotes_to_connector(
+        self,
+        db_session: Session,
+        attempt_metadata: IndexAttemptMetadata,
+    ) -> None:
+        file_id = _stage_file()
+        doc = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=file_id)
+
+        index_doc_batch_prepare(
+            documents=[doc],
+            index_attempt_metadata=attempt_metadata,
+            db_session=db_session,
+            ignore_time_skip=True,
+        )
+        db_session.commit()
+
+        row = _get_doc_row(db_session, doc.id)
+        assert row is not None and row.file_id == file_id
+
+        record = _get_filerecord(db_session, file_id)
+        assert record is not None
+        assert record.file_origin == FileOrigin.CONNECTOR
+
+
+class TestExistingDocuments:
+    """Re-index path — a `document` row already exists with some file_id."""
+
+    def test_unchanged_file_id_is_noop(
+        self,
+        db_session: Session,
+        attempt_metadata: IndexAttemptMetadata,
+    ) -> None:
+        file_id = _stage_file()
+        doc = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=file_id)
+
+        # First pass: inserts the row + promotes the file.
+        index_doc_batch_prepare(
+            documents=[doc],
+            index_attempt_metadata=attempt_metadata,
+            db_session=db_session,
+            ignore_time_skip=True,
+        )
+        db_session.commit()
+
+        # Second pass with the same file_id — should not delete or re-promote.
+        index_doc_batch_prepare(
+            documents=[doc],
+            index_attempt_metadata=attempt_metadata,
+            db_session=db_session,
+            ignore_time_skip=True,
+        )
+        db_session.commit()
+
+        record = _get_filerecord(db_session, file_id)
+        assert record is not None
+        assert record.file_origin == FileOrigin.CONNECTOR
+
+        row = _get_doc_row(db_session, doc.id)
+        assert row is not None and row.file_id == file_id
+
+    def test_swapping_file_id_promotes_new_and_deletes_old(
+        self,
+        db_session: Session,
+        attempt_metadata: IndexAttemptMetadata,
+    ) -> None:
+        old_file_id = _stage_file(content=b"old bytes")
+        doc = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=old_file_id)
+
+        index_doc_batch_prepare(
+            documents=[doc],
+            index_attempt_metadata=attempt_metadata,
+            db_session=db_session,
+            ignore_time_skip=True,
+        )
+        db_session.commit()
+
+        # Re-fetch produces a new staged file_id for the same doc.
+        new_file_id = _stage_file(content=b"new bytes")
+        doc_v2 = _make_doc(doc.id, file_id=new_file_id)
+
+        index_doc_batch_prepare(
+            documents=[doc_v2],
+            index_attempt_metadata=attempt_metadata,
+            db_session=db_session,
+            ignore_time_skip=True,
+        )
+        db_session.commit()
+
+        row = _get_doc_row(db_session, doc.id)
+        assert row is not None and row.file_id == new_file_id
+
+        new_record = _get_filerecord(db_session, new_file_id)
+        assert new_record is not None
+        assert new_record.file_origin == FileOrigin.CONNECTOR
+
+        # Old file_record + S3 object are gone.
+        assert _get_filerecord(db_session, old_file_id) is None
+
+    def test_clearing_file_id_deletes_old_and_nulls_column(
+        self,
+        db_session: Session,
+        attempt_metadata: IndexAttemptMetadata,
+    ) -> None:
+        old_file_id = _stage_file()
+        doc = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=old_file_id)
+
+        index_doc_batch_prepare(
+            documents=[doc],
+            index_attempt_metadata=attempt_metadata,
+            db_session=db_session,
+            ignore_time_skip=True,
+        )
+        db_session.commit()
+
+        # Connector opts out on next run — yields the doc without a file_id.
+        doc_v2 = _make_doc(doc.id, file_id=None)
+
+        index_doc_batch_prepare(
+            documents=[doc_v2],
+            index_attempt_metadata=attempt_metadata,
+            db_session=db_session,
+            ignore_time_skip=True,
+        )
+        db_session.commit()
+
+        row = _get_doc_row(db_session, doc.id)
+        assert row is not None and row.file_id is None
+        assert _get_filerecord(db_session, old_file_id) is None
+
+
+class TestBatchHandling:
+    """Mixed batches — multiple docs at different lifecycle states in one call."""
+
+    def test_mixed_batch_each_doc_handled_independently(
+        self,
+        db_session: Session,
+        attempt_metadata: IndexAttemptMetadata,
+    ) -> None:
+        # Pre-seed an existing doc with a file_id we'll swap.
+        existing_old_id = _stage_file(content=b"existing-old")
+        existing_doc = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=existing_old_id)
+        index_doc_batch_prepare(
+            documents=[existing_doc],
+            index_attempt_metadata=attempt_metadata,
+            db_session=db_session,
+            ignore_time_skip=True,
+        )
+        db_session.commit()
+
+        # Now: swap the existing one, add a brand-new doc with file_id, and a
+        # brand-new doc without file_id.
+        swap_new_id = _stage_file(content=b"existing-new")
+        new_with_file_id = _stage_file(content=b"new-with-file")
+        existing_v2 = _make_doc(existing_doc.id, file_id=swap_new_id)
+        new_with = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=new_with_file_id)
+        new_without = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=None)
+
+        index_doc_batch_prepare(
+            documents=[existing_v2, new_with, new_without],
+            index_attempt_metadata=attempt_metadata,
+            db_session=db_session,
+            ignore_time_skip=True,
+        )
+        db_session.commit()
+
+        # Existing doc was swapped: old file gone, new file promoted.
+        existing_row = _get_doc_row(db_session, existing_doc.id)
+        assert existing_row is not None and existing_row.file_id == swap_new_id
+        assert _get_filerecord(db_session, existing_old_id) is None
+        swap_record = _get_filerecord(db_session, swap_new_id)
+        assert swap_record is not None
+        assert swap_record.file_origin == FileOrigin.CONNECTOR
+
+        # New doc with file_id: row exists, file promoted.
+        new_with_row = _get_doc_row(db_session, new_with.id)
+        assert new_with_row is not None and new_with_row.file_id == new_with_file_id
+        new_with_record = _get_filerecord(db_session, new_with_file_id)
+        assert new_with_record is not None
+        assert new_with_record.file_origin == FileOrigin.CONNECTOR
+
+        # New doc without file_id: row exists, no file_record involvement.
+        new_without_row = _get_doc_row(db_session, new_without.id)
+        assert new_without_row is not None and new_without_row.file_id is None
Author	SHA1	Message	Date
Dane Urban	4f6826ca0c	.	2026-04-17 13:50:33 -07:00
Dane Urban	895c329c00	file saving	2026-04-17 11:53:55 -07:00
Dane Urban	5303a2bd00	.	2026-04-17 09:58:55 -07:00
Dane Urban	ada1d4f9bc	New callback	2026-04-16 18:38:46 -07:00
Dane Urban	0708832290	Add file id col to Document db model	2026-04-16 15:50:29 -07:00