enhance file processing with content type handling (#5196)

2025-08-14 14:29:53 +05:30
parent 2af64ebf4c
commit 46f3af4f68
3 changed files with 28 additions and 9 deletions
--- a/backend/onyx/connectors/file/connector.py
+++ b/backend/onyx/connectors/file/connector.py
@@ -72,6 +72,7 @@ def _process_file(
    file: IO[Any],
    metadata: dict[str, Any] | None,
    pdf_pass: str | None,
+    file_type: str | None,
 ) -> list[Document]:
    """
    Process a file and return a list of Documents.
@@ -148,6 +149,7 @@ def _process_file(
        file=file,
        file_name=file_name,
        pdf_pass=pdf_pass,
+        content_type=file_type,
    )

    # Each file may have file-specific ONYX_METADATA https://docs.onyx.app/connectors/file
@@ -278,6 +280,7 @@ class LocalFileConnector(LoadConnector):
                file=file_io,
                metadata=metadata,
                pdf_pass=self.pdf_pass,
+                file_type=file_record.file_type,
            )
            documents.extend(new_docs)

--- a/backend/onyx/file_processing/extract_file_text.py
+++ b/backend/onyx/file_processing/extract_file_text.py
@@ -29,6 +29,7 @@ from pypdf.errors import PdfStreamError
 from onyx.configs.constants import FileOrigin
 from onyx.configs.constants import ONYX_METADATA_FILENAME
 from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
+from onyx.file_processing.file_validation import TEXT_MIME_TYPE
 from onyx.file_processing.html_utils import parse_html_page_basic
 from onyx.file_processing.unstructured import get_unstructured_api_key
 from onyx.file_processing.unstructured import unstructured_to_text
@@ -492,10 +493,23 @@ class ExtractionResult(NamedTuple):
    metadata: dict[str, Any]


+def extract_result_from_text_file(file: IO[Any]) -> ExtractionResult:  # text-only result: no images
+    encoding = detect_encoding(file)  # detected encoding is handed to read_text_file below
+    text_content_raw, file_metadata = read_text_file(
+        file, encoding=encoding, ignore_onyx_metadata=False  # presumably keeps ONYX metadata — confirm
+    )
+    return ExtractionResult(
+        text_content=text_content_raw,
+        embedded_images=[],  # always empty: this path extracts no images
+        metadata=file_metadata,
+    )
+
+
 def extract_text_and_images(
    file: IO[Any],
    file_name: str,
    pdf_pass: str | None = None,
+    content_type: str | None = None,
 ) -> ExtractionResult:
    """
    Primary new function for the updated connector.
@@ -516,6 +530,13 @@ def extract_text_and_images(
            )
            file.seek(0)  # Reset file pointer just in case

+    # When a document is uploaded via a connector or MyDocuments, files whose content type is in
+    # UploadMimeTypes.DOCUMENT_MIME_TYPES have their content extracted and stored as plain text.
+    # As a result, the file name extension may no longer match the stored content type.
+    # Check for a plain-text content type first so such files are read as text regardless of extension.
+    if content_type == TEXT_MIME_TYPE:
+        return extract_result_from_text_file(file)
+
    # Default processing
    try:
        extension = get_file_ext(file_name)
@@ -574,15 +595,7 @@ def extract_text_and_images(

        # If we reach here and it's a recognized text extension
        if is_text_file_extension(file_name):
-            encoding = detect_encoding(file)
-            text_content_raw, file_metadata = read_text_file(
-                file, encoding=encoding, ignore_onyx_metadata=False
-            )
-            return ExtractionResult(
-                text_content=text_content_raw,
-                embedded_images=[],
-                metadata=file_metadata,
-            )
+            return extract_result_from_text_file(file)

        # If it's an image file or something else, we do not parse embedded images from them
        # just return empty text
--- a/backend/onyx/file_processing/file_validation.py
+++ b/backend/onyx/file_processing/file_validation.py
@@ -21,6 +21,9 @@ EXCLUDED_IMAGE_TYPES = [
    "image/avif",
 ]

+# MIME type for plain-text files
+TEXT_MIME_TYPE = "text/plain"
+

 def is_valid_image_type(mime_type: str) -> bool:
    """