Compare commits

...

3 Commits

Author SHA1 Message Date
Dane Urban
5303a2bd00 . 2026-04-17 09:58:55 -07:00
Dane Urban
ada1d4f9bc New callback 2026-04-16 18:38:46 -07:00
Dane Urban
0708832290 Add file id col to Document db model 2026-04-16 15:50:29 -07:00
7 changed files with 123 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
"""Add file_id to documents
Revision ID: 91d150c361f6
Revises: d129f37b3d87
Create Date: 2026-04-16 15:43:30.314823
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "91d150c361f6"
down_revision = "d129f37b3d87"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Add the nullable ``file_id`` column to the ``document`` table."""
    file_id_column = sa.Column("file_id", sa.String(), nullable=True)
    op.add_column("document", file_id_column)
def downgrade() -> None:
    """Drop the ``file_id`` column added by this revision."""
    op.drop_column("document", "file_id")

View File

@@ -58,6 +58,8 @@ from onyx.db.indexing_coordination import IndexingCoordination
from onyx.db.models import IndexAttempt
from onyx.file_store.document_batch_storage import DocumentBatchStorage
from onyx.file_store.document_batch_storage import get_document_batch_storage
from onyx.file_store.staging import build_raw_file_callback
from onyx.file_store.staging import RawFileCallback
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
from onyx.redis.redis_hierarchy import cache_hierarchy_nodes_batch
@@ -90,6 +92,7 @@ def _get_connector_runner(
end_time: datetime,
include_permissions: bool,
leave_connector_active: bool = LEAVE_CONNECTOR_ACTIVE_ON_INITIALIZATION_FAILURE,
raw_file_callback: RawFileCallback | None = None,
) -> ConnectorRunner:
"""
NOTE: `start_time` and `end_time` are only used for poll connectors
@@ -108,6 +111,7 @@ def _get_connector_runner(
input_type=task,
connector_specific_config=attempt.connector_credential_pair.connector.connector_specific_config,
credential=attempt.connector_credential_pair.credential,
raw_file_callback=raw_file_callback,
)
# validate the connector settings
@@ -275,6 +279,12 @@ def run_docfetching_entrypoint(
f"credentials='{credential_id}'"
)
raw_file_callback = build_raw_file_callback(
index_attempt_id=index_attempt_id,
cc_pair_id=connector_credential_pair_id,
tenant_id=tenant_id,
)
connector_document_extraction(
app,
index_attempt_id,
@@ -282,6 +292,7 @@ def run_docfetching_entrypoint(
attempt.search_settings_id,
tenant_id,
callback,
raw_file_callback=raw_file_callback,
)
logger.info(
@@ -301,6 +312,7 @@ def connector_document_extraction(
search_settings_id: int,
tenant_id: str,
callback: IndexingHeartbeatInterface | None = None,
raw_file_callback: RawFileCallback | None = None,
) -> None:
"""Extract documents from connector and queue them for indexing pipeline processing.
@@ -451,6 +463,7 @@ def connector_document_extraction(
start_time=window_start,
end_time=window_end,
include_permissions=should_fetch_permissions_during_indexing,
raw_file_callback=raw_file_callback,
)
# don't use a checkpoint if we're explicitly indexing from

View File

@@ -372,6 +372,7 @@ class FileOrigin(str, Enum):
CONNECTOR_METADATA = "connector_metadata"
GENERATED_REPORT = "generated_report"
INDEXING_CHECKPOINT = "indexing_checkpoint"
INDEXING_STAGING = "indexing_staging"
PLAINTEXT_CACHE = "plaintext_cache"
OTHER = "other"
QUERY_HISTORY_CSV = "query_history_csv"

View File

@@ -22,6 +22,7 @@ from onyx.db.credentials import backend_update_credential_json
from onyx.db.credentials import fetch_credential_by_id
from onyx.db.enums import AccessType
from onyx.db.models import Credential
from onyx.file_store.staging import RawFileCallback
from shared_configs.contextvars import get_current_tenant_id
@@ -107,6 +108,7 @@ def instantiate_connector(
input_type: InputType,
connector_specific_config: dict[str, Any],
credential: Credential,
raw_file_callback: RawFileCallback | None = None,
) -> BaseConnector:
connector_class = identify_connector_class(source, input_type)
@@ -130,6 +132,9 @@ def instantiate_connector(
connector.set_allow_images(get_image_extraction_and_analysis_enabled())
if raw_file_callback is not None:
connector.set_raw_file_callback(raw_file_callback)
return connector

View File

@@ -15,6 +15,7 @@ from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import HierarchyNode
from onyx.connectors.models import SlimDocument
from onyx.file_store.staging import RawFileCallback
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.variable_functionality import fetch_ee_implementation_or_noop
@@ -42,6 +43,9 @@ class NormalizationResult(BaseModel):
class BaseConnector(abc.ABC, Generic[CT]):
REDIS_KEY_PREFIX = "da_connector_data:"
# Optional raw-file persistence hook to save original file
raw_file_callback: RawFileCallback | None = None
@abc.abstractmethod
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
raise NotImplementedError
@@ -88,6 +92,15 @@ class BaseConnector(abc.ABC, Generic[CT]):
"""Implement if the underlying connector wants to skip/allow image downloading
based on the application level image analysis setting."""
def set_raw_file_callback(self, callback: RawFileCallback) -> None:
    """Inject the per-attempt raw-file persistence callback.

    Wired up by the docfetching entrypoint via `instantiate_connector`.
    Connectors that don't care about persisting raw bytes can ignore this
    — `raw_file_callback` simply stays `None`.
    """
    # Stored on the instance; connectors invoke it as
    # (content, content_type) -> file_id when they want raw bytes persisted.
    self.raw_file_callback = callback
@classmethod
def normalize_url(cls, url: str) -> "NormalizationResult": # noqa: ARG003
"""Normalize a URL to match the canonical Document.id format used during ingestion.

View File

@@ -952,6 +952,7 @@ class Document(Base):
semantic_id: Mapped[str] = mapped_column(NullFilteredString)
# First Section's link
link: Mapped[str | None] = mapped_column(NullFilteredString, nullable=True)
file_id: Mapped[str | None] = mapped_column(String, nullable=True)
# The updated time is also used as a measure of the last successful state of the doc
# pulled from the source (to help skip reindexing already updated docs in case of

View File

@@ -0,0 +1,63 @@
from collections.abc import Callable
from typing import Any
from typing import IO
from onyx.configs.constants import FileOrigin
from onyx.file_store.file_store import get_default_file_store
from onyx.utils.logger import setup_logger
logger = setup_logger()
# Signature connectors invoke to persist a raw file:
# (content, content_type) -> file_id
RawFileCallback = Callable[[IO[bytes], str], str]


def stage_raw_file(
    content: IO[bytes],
    content_type: str,
    *,
    metadata: dict[str, Any],
) -> str:
    """Persist raw bytes to the file store with FileOrigin.INDEXING_STAGING.

    Args:
        content: Readable binary stream holding the raw file bytes.
            (Annotation tightened from bare ``IO`` to ``IO[bytes]`` to match
            the ``RawFileCallback`` alias this function backs.)
        content_type: MIME type recorded as the stored file's type.
        metadata: Attached to the file record so that downstream promotion
            (in docprocessing) and orphan reaping (TTL janitor) can locate
            the file by its originating context.

    Returns:
        The file store's id for the staged file.
    """
    file_store = get_default_file_store()
    return file_store.save_file(
        content=content,
        display_name=None,
        file_origin=FileOrigin.INDEXING_STAGING,
        file_type=content_type,
        file_metadata=metadata,
    )


def build_raw_file_callback(
    *,
    index_attempt_id: int,
    cc_pair_id: int,
    tenant_id: str,
) -> RawFileCallback:
    """Build a per-attempt callback that connectors can invoke to opt in to
    raw-file persistence.

    The closure binds the attempt-level context as the staging metadata so the
    connector only needs to pass per-call info (bytes, content_type) and gets
    back a file_id to attach to its Document.
    """
    # Built once per attempt and shared by every invocation of the callback.
    metadata: dict[str, Any] = {
        "index_attempt_id": index_attempt_id,
        "cc_pair_id": cc_pair_id,
        "tenant_id": tenant_id,
    }

    def _callback(content: IO[bytes], content_type: str) -> str:
        return stage_raw_file(
            content=content,
            content_type=content_type,
            metadata=metadata,
        )

    return _callback