forked from github/onyx
enhance file processing with content type handling (#5196)
This commit is contained in:
@@ -72,6 +72,7 @@ def _process_file(
|
||||
file: IO[Any],
|
||||
metadata: dict[str, Any] | None,
|
||||
pdf_pass: str | None,
|
||||
file_type: str | None,
|
||||
) -> list[Document]:
|
||||
"""
|
||||
Process a file and return a list of Documents.
|
||||
@@ -148,6 +149,7 @@ def _process_file(
|
||||
file=file,
|
||||
file_name=file_name,
|
||||
pdf_pass=pdf_pass,
|
||||
content_type=file_type,
|
||||
)
|
||||
|
||||
# Each file may have file-specific ONYX_METADATA https://docs.onyx.app/connectors/file
|
||||
@@ -278,6 +280,7 @@ class LocalFileConnector(LoadConnector):
|
||||
file=file_io,
|
||||
metadata=metadata,
|
||||
pdf_pass=self.pdf_pass,
|
||||
file_type=file_record.file_type,
|
||||
)
|
||||
documents.extend(new_docs)
|
||||
|
||||
|
||||
@@ -29,6 +29,7 @@ from pypdf.errors import PdfStreamError
|
||||
from onyx.configs.constants import FileOrigin
|
||||
from onyx.configs.constants import ONYX_METADATA_FILENAME
|
||||
from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
|
||||
from onyx.file_processing.file_validation import TEXT_MIME_TYPE
|
||||
from onyx.file_processing.html_utils import parse_html_page_basic
|
||||
from onyx.file_processing.unstructured import get_unstructured_api_key
|
||||
from onyx.file_processing.unstructured import unstructured_to_text
|
||||
@@ -492,10 +493,23 @@ class ExtractionResult(NamedTuple):
|
||||
metadata: dict[str, Any]
|
||||
|
||||
|
||||
def extract_result_from_text_file(file: IO[Any]) -> ExtractionResult:
    """
    Read a plain-text file and wrap its contents in an ExtractionResult.

    The encoding is detected from the file, then the text and any metadata
    returned by `read_text_file` are passed through unchanged. The read is
    done with `ignore_onyx_metadata=False`. Text files never contribute
    embedded images, so that field is always an empty list.
    """
    text, metadata = read_text_file(
        file,
        encoding=detect_encoding(file),
        ignore_onyx_metadata=False,
    )
    no_images: list = []
    return ExtractionResult(
        text_content=text,
        embedded_images=no_images,
        metadata=metadata,
    )
|
||||
|
||||
|
||||
def extract_text_and_images(
|
||||
file: IO[Any],
|
||||
file_name: str,
|
||||
pdf_pass: str | None = None,
|
||||
content_type: str | None = None,
|
||||
) -> ExtractionResult:
|
||||
"""
|
||||
Primary new function for the updated connector.
|
||||
@@ -516,6 +530,13 @@ def extract_text_and_images(
|
||||
)
|
||||
file.seek(0) # Reset file pointer just in case
|
||||
|
||||
# When we upload a document via a connector or MyDocuments, we extract and store the content of files
|
||||
# with content types in UploadMimeTypes.DOCUMENT_MIME_TYPES as plain text files.
|
||||
# As a result, the file name extension may differ from the original content type.
|
||||
# We process files with a plain text content type first to handle this scenario.
|
||||
if content_type == TEXT_MIME_TYPE:
|
||||
return extract_result_from_text_file(file)
|
||||
|
||||
# Default processing
|
||||
try:
|
||||
extension = get_file_ext(file_name)
|
||||
@@ -574,15 +595,7 @@ def extract_text_and_images(
|
||||
|
||||
# If we reach here and it's a recognized text extension
|
||||
if is_text_file_extension(file_name):
|
||||
encoding = detect_encoding(file)
|
||||
text_content_raw, file_metadata = read_text_file(
|
||||
file, encoding=encoding, ignore_onyx_metadata=False
|
||||
)
|
||||
return ExtractionResult(
|
||||
text_content=text_content_raw,
|
||||
embedded_images=[],
|
||||
metadata=file_metadata,
|
||||
)
|
||||
return extract_result_from_text_file(file)
|
||||
|
||||
# If it's an image file or something else, we do not parse embedded images from them
|
||||
# just return empty text
|
||||
|
||||
@@ -21,6 +21,9 @@ EXCLUDED_IMAGE_TYPES = [
|
||||
"image/avif",
|
||||
]
|
||||
|
||||
# MIME type for plain-text content
|
||||
TEXT_MIME_TYPE = "text/plain"
|
||||
|
||||
|
||||
def is_valid_image_type(mime_type: str) -> bool:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user