Compare commits

...

3 Commits

Author SHA1 Message Date
Dane Urban
5303a2bd00 . 2026-04-17 09:58:55 -07:00
Dane Urban
ada1d4f9bc New callback 2026-04-16 18:38:46 -07:00
Dane Urban
0708832290 Add file id col to Document db model 2026-04-16 15:50:29 -07:00
7 changed files with 123 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
"""Add file_id to documents
Revision ID: 91d150c361f6
Revises: d129f37b3d87
Create Date: 2026-04-16 15:43:30.314823
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "91d150c361f6"
down_revision = "d129f37b3d87"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Add the nullable ``file_id`` column to the ``document`` table."""
    file_id_column = sa.Column("file_id", sa.String(), nullable=True)
    op.add_column("document", file_id_column)
def downgrade() -> None:
    """Drop the ``file_id`` column added by this revision."""
    op.drop_column("document", "file_id")

View File

@@ -58,6 +58,8 @@ from onyx.db.indexing_coordination import IndexingCoordination
from onyx.db.models import IndexAttempt
from onyx.file_store.document_batch_storage import DocumentBatchStorage
from onyx.file_store.document_batch_storage import get_document_batch_storage
from onyx.file_store.staging import build_raw_file_callback
from onyx.file_store.staging import RawFileCallback
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
from onyx.redis.redis_hierarchy import cache_hierarchy_nodes_batch
@@ -90,6 +92,7 @@ def _get_connector_runner(
end_time: datetime,
include_permissions: bool,
leave_connector_active: bool = LEAVE_CONNECTOR_ACTIVE_ON_INITIALIZATION_FAILURE,
raw_file_callback: RawFileCallback | None = None,
) -> ConnectorRunner:
"""
NOTE: `start_time` and `end_time` are only used for poll connectors
@@ -108,6 +111,7 @@ def _get_connector_runner(
input_type=task,
connector_specific_config=attempt.connector_credential_pair.connector.connector_specific_config,
credential=attempt.connector_credential_pair.credential,
raw_file_callback=raw_file_callback,
)
# validate the connector settings
@@ -275,6 +279,12 @@ def run_docfetching_entrypoint(
f"credentials='{credential_id}'"
)
raw_file_callback = build_raw_file_callback(
index_attempt_id=index_attempt_id,
cc_pair_id=connector_credential_pair_id,
tenant_id=tenant_id,
)
connector_document_extraction(
app,
index_attempt_id,
@@ -282,6 +292,7 @@ def run_docfetching_entrypoint(
attempt.search_settings_id,
tenant_id,
callback,
raw_file_callback=raw_file_callback,
)
logger.info(
@@ -301,6 +312,7 @@ def connector_document_extraction(
search_settings_id: int,
tenant_id: str,
callback: IndexingHeartbeatInterface | None = None,
raw_file_callback: RawFileCallback | None = None,
) -> None:
"""Extract documents from connector and queue them for indexing pipeline processing.
@@ -451,6 +463,7 @@ def connector_document_extraction(
start_time=window_start,
end_time=window_end,
include_permissions=should_fetch_permissions_during_indexing,
raw_file_callback=raw_file_callback,
)
# don't use a checkpoint if we're explicitly indexing from

View File

@@ -372,6 +372,7 @@ class FileOrigin(str, Enum):
CONNECTOR_METADATA = "connector_metadata"
GENERATED_REPORT = "generated_report"
INDEXING_CHECKPOINT = "indexing_checkpoint"
INDEXING_STAGING = "indexing_staging"
PLAINTEXT_CACHE = "plaintext_cache"
OTHER = "other"
QUERY_HISTORY_CSV = "query_history_csv"

View File

@@ -22,6 +22,7 @@ from onyx.db.credentials import backend_update_credential_json
from onyx.db.credentials import fetch_credential_by_id
from onyx.db.enums import AccessType
from onyx.db.models import Credential
from onyx.file_store.staging import RawFileCallback
from shared_configs.contextvars import get_current_tenant_id
@@ -107,6 +108,7 @@ def instantiate_connector(
input_type: InputType,
connector_specific_config: dict[str, Any],
credential: Credential,
raw_file_callback: RawFileCallback | None = None,
) -> BaseConnector:
connector_class = identify_connector_class(source, input_type)
@@ -130,6 +132,9 @@ def instantiate_connector(
connector.set_allow_images(get_image_extraction_and_analysis_enabled())
if raw_file_callback is not None:
connector.set_raw_file_callback(raw_file_callback)
return connector

View File

@@ -15,6 +15,7 @@ from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import HierarchyNode
from onyx.connectors.models import SlimDocument
from onyx.file_store.staging import RawFileCallback
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.variable_functionality import fetch_ee_implementation_or_noop
@@ -42,6 +43,9 @@ class NormalizationResult(BaseModel):
class BaseConnector(abc.ABC, Generic[CT]):
REDIS_KEY_PREFIX = "da_connector_data:"
# Optional raw-file persistence hook to save original file
raw_file_callback: RawFileCallback | None = None
@abc.abstractmethod
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
raise NotImplementedError
@@ -88,6 +92,15 @@ class BaseConnector(abc.ABC, Generic[CT]):
"""Implement if the underlying connector wants to skip/allow image downloading
based on the application level image analysis setting."""
def set_raw_file_callback(self, callback: RawFileCallback) -> None:
    """Inject the per-attempt raw-file persistence callback.

    Wired up by the docfetching entrypoint via `instantiate_connector`.
    Connectors that don't care about persisting raw bytes can ignore this
    — `raw_file_callback` simply stays `None`.
    """
    # Stored on the instance; connectors invoke it as
    # (content, content_type) -> file_id when they want raw bytes persisted.
    self.raw_file_callback = callback
@classmethod
def normalize_url(cls, url: str) -> "NormalizationResult": # noqa: ARG003
"""Normalize a URL to match the canonical Document.id format used during ingestion.

View File

@@ -952,6 +952,7 @@ class Document(Base):
semantic_id: Mapped[str] = mapped_column(NullFilteredString)
# First Section's link
link: Mapped[str | None] = mapped_column(NullFilteredString, nullable=True)
file_id: Mapped[str | None] = mapped_column(String, nullable=True)
# The updated time is also used as a measure of the last successful state of the doc
# pulled from the source (to help skip reindexing already updated docs in case of

View File

@@ -0,0 +1,63 @@
from collections.abc import Callable
from typing import Any
from typing import IO
from onyx.configs.constants import FileOrigin
from onyx.file_store.file_store import get_default_file_store
from onyx.utils.logger import setup_logger
logger = setup_logger()
# Signature connectors invoke to persist a raw file:
# (content, content_type) -> file_id
RawFileCallback = Callable[[IO[bytes], str], str]


def stage_raw_file(
    content: IO[bytes],
    content_type: str,
    *,
    metadata: dict[str, Any],
) -> str:
    """Persist raw bytes to the file store with FileOrigin.INDEXING_STAGING.

    Args:
        content: Readable binary stream holding the raw file bytes.
            (Annotation tightened from bare ``IO`` to ``IO[bytes]`` to match
            the ``RawFileCallback`` alias this function backs.)
        content_type: MIME type recorded as the stored file's type.
        metadata: Attached to the file record so that downstream promotion
            (in docprocessing) and orphan reaping (TTL janitor) can locate
            the file by its originating context.

    Returns:
        The file store's id for the staged file.
    """
    file_store = get_default_file_store()
    return file_store.save_file(
        content=content,
        display_name=None,
        file_origin=FileOrigin.INDEXING_STAGING,
        file_type=content_type,
        file_metadata=metadata,
    )


def build_raw_file_callback(
    *,
    index_attempt_id: int,
    cc_pair_id: int,
    tenant_id: str,
) -> RawFileCallback:
    """Build a per-attempt callback that connectors can invoke to opt in to
    raw-file persistence.

    The closure binds the attempt-level context as the staging metadata so the
    connector only needs to pass per-call info (bytes, content_type) and gets
    back a file_id to attach to its Document.
    """
    # Built once per attempt and shared by every invocation of the callback.
    metadata: dict[str, Any] = {
        "index_attempt_id": index_attempt_id,
        "cc_pair_id": cc_pair_id,
        "tenant_id": tenant_id,
    }

    def _callback(content: IO[bytes], content_type: str) -> str:
        return stage_raw_file(
            content=content,
            content_type=content_type,
            metadata=metadata,
        )

    return _callback