mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-02-16 23:35:46 +00:00
Compare commits
16 Commits
experiment
...
faster_tex
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
884f9624a3 | ||
|
|
e2b0924077 | ||
|
|
5650e88ecb | ||
|
|
1df3fb5f62 | ||
|
|
5f7340ebcf | ||
|
|
99dc19ee88 | ||
|
|
b64e96a582 | ||
|
|
f7ce933759 | ||
|
|
2b3c409081 | ||
|
|
be1d5426e3 | ||
|
|
2c9c03b97b | ||
|
|
1516a43b10 | ||
|
|
5b4f8848d6 | ||
|
|
cff48f813a | ||
|
|
64ca568feb | ||
|
|
298aca158d |
@@ -4,6 +4,7 @@ from datetime import timezone
|
||||
|
||||
from googleapiclient.discovery import build # type: ignore
|
||||
from googleapiclient.errors import HttpError # type: ignore
|
||||
from markitdown import MarkItDown # type: ignore
|
||||
|
||||
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
|
||||
from onyx.configs.constants import DocumentSource
|
||||
@@ -26,8 +27,8 @@ from onyx.file_processing.unstructured import get_unstructured_api_key
|
||||
from onyx.file_processing.unstructured import unstructured_to_text
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
# these errors don't represent a failure in the connector, but simply files
|
||||
# that can't / shouldn't be indexed
|
||||
@@ -38,177 +39,41 @@ ERRORS_TO_CONTINUE_ON = [
|
||||
]
|
||||
|
||||
|
||||
def _extract_sections_basic(
    file: dict[str, str], service: GoogleDriveService
) -> list[Section]:
    """Extract the text of a Drive file as a list of sections.

    Google Sheets are read sheet by sheet through the Sheets API (one section
    per non-empty sheet, linked with ``#gid=<sheetId>``); if that fails the
    spreadsheet is exported as CSV instead. Google Docs/Slides are exported as
    plain text, plain-text/markdown files are downloaded and decoded as UTF-8,
    and DOCX/PPTX/PDF files go through unstructured when an API key is
    configured, otherwise through the local extractors.

    Args:
        file: Drive file metadata. ``mimeType``, ``webViewLink`` and ``id``
            are read; ``name`` is used for logging and for unstructured.
        service: Drive service used for downloads/exports. Its credentials
            are reused to build the Sheets client.

    Returns:
        The extracted sections. An unsupported MIME type or any extraction
        failure yields a single section holding UNSUPPORTED_FILE_TYPE_CONTENT.
    """
    mime_type = file["mimeType"]
    link = file["webViewLink"]

    if mime_type not in {item.value for item in GDriveMimeType}:
        # Unsupported file types can still have a title, finding this way is still useful
        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]

    try:
        if mime_type == GDriveMimeType.SPREADSHEET.value:
            try:
                sheets_service = build(
                    "sheets", "v4", credentials=service._http.credentials
                )
                spreadsheet = (
                    sheets_service.spreadsheets()
                    .get(spreadsheetId=file["id"])
                    .execute()
                )

                sections = []
                for sheet in spreadsheet["sheets"]:
                    properties = sheet["properties"]
                    sheet_name = properties["title"]
                    sheet_id = properties["sheetId"]

                    # Get sheet dimensions
                    grid_properties = properties.get("gridProperties", {})
                    row_count = grid_properties.get("rowCount", 1000)
                    column_count = grid_properties.get("columnCount", 26)

                    # Convert column count to letter (e.g., 26 -> Z, 27 -> AA).
                    # `> 0` rather than truthiness so a negative count cannot
                    # loop forever.
                    end_column = ""
                    while column_count > 0:
                        column_count, remainder = divmod(column_count - 1, 26)
                        end_column = chr(65 + remainder) + end_column

                    # A1 notation wraps the sheet title in single quotes, so
                    # apostrophes inside the title must be doubled.
                    quoted_name = sheet_name.replace("'", "''")
                    range_name = f"'{quoted_name}'!A1:{end_column}{row_count}"

                    try:
                        result = (
                            sheets_service.spreadsheets()
                            .values()
                            .get(spreadsheetId=file["id"], range=range_name)
                            .execute()
                        )
                    except HttpError as e:
                        logger.warning(
                            f"Error fetching data for sheet '{sheet_name}': {e}"
                        )
                        continue

                    values = result.get("values", [])
                    if values:
                        rows = "".join(
                            "\t".join(str(cell) for cell in row) + "\n"
                            for row in values
                        )
                        sections.append(
                            Section(
                                link=f"{link}#gid={sheet_id}",
                                text=f"Sheet: {sheet_name}\n" + rows,
                            )
                        )
                return sections

            except Exception as e:
                logger.warning(
                    f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'."
                    " Falling back to basic extraction."
                )

        if mime_type in [
            GDriveMimeType.DOC.value,
            GDriveMimeType.PPT.value,
            GDriveMimeType.SPREADSHEET.value,
        ]:
            export_mime_type = (
                "text/plain"
                if mime_type != GDriveMimeType.SPREADSHEET.value
                else "text/csv"
            )
            text = (
                service.files()
                .export(fileId=file["id"], mimeType=export_mime_type)
                .execute()
                .decode("utf-8")
            )
            return [Section(link=link, text=text)]

        elif mime_type in [
            GDriveMimeType.PLAIN_TEXT.value,
            GDriveMimeType.MARKDOWN.value,
        ]:
            return [
                Section(
                    link=link,
                    text=service.files()
                    .get_media(fileId=file["id"])
                    .execute()
                    .decode("utf-8"),
                )
            ]

        if mime_type in [
            GDriveMimeType.WORD_DOC.value,
            GDriveMimeType.POWERPOINT.value,
            GDriveMimeType.PDF.value,
        ]:
            response = service.files().get_media(fileId=file["id"]).execute()
            if get_unstructured_api_key():
                return [
                    Section(
                        link=link,
                        text=unstructured_to_text(
                            file=io.BytesIO(response),
                            file_name=file.get("name", file["id"]),
                        ),
                    )
                ]

            if mime_type == GDriveMimeType.WORD_DOC.value:
                return [
                    Section(link=link, text=docx_to_text(file=io.BytesIO(response)))
                ]
            elif mime_type == GDriveMimeType.PDF.value:
                text, _ = read_pdf_file(file=io.BytesIO(response))
                return [Section(link=link, text=text)]
            elif mime_type == GDriveMimeType.POWERPOINT.value:
                return [
                    Section(link=link, text=pptx_to_text(file=io.BytesIO(response)))
                ]

        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]

    except Exception as e:
        # Best-effort extraction: keep returning the placeholder section, but
        # leave a trace instead of swallowing the failure silently.
        logger.warning(
            f"Failed to extract content from file '{file.get('name')}': {e}",
            exc_info=True,
        )
        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
|
||||
|
||||
|
||||
def convert_drive_item_to_document(
|
||||
file: GoogleDriveFileType,
|
||||
drive_service: GoogleDriveService,
|
||||
docs_service: GoogleDocsService,
|
||||
) -> Document | None:
|
||||
"""
|
||||
Converts a Google Drive file into an internal Document object, extracting
|
||||
the text and organizing it into sections. Uses specialized methods for Google Docs
|
||||
to preserve structure. Falls back to basic extraction for all other formats.
|
||||
"""
|
||||
try:
|
||||
# Skip files that are shortcuts
|
||||
# Skip shortcuts and folders
|
||||
if file.get("mimeType") == DRIVE_SHORTCUT_TYPE:
|
||||
logger.info("Ignoring Drive Shortcut Filetype")
|
||||
return None
|
||||
# Skip files that are folders
|
||||
if file.get("mimeType") == DRIVE_FOLDER_TYPE:
|
||||
logger.info("Ignoring Drive Folder Filetype")
|
||||
return None
|
||||
|
||||
sections: list[Section] = []
|
||||
|
||||
# Special handling for Google Docs to preserve structure, link
|
||||
# to headers
|
||||
# Special handling for Google Docs to preserve structure
|
||||
if file.get("mimeType") == GDriveMimeType.DOC.value:
|
||||
try:
|
||||
sections = get_document_sections(docs_service, file["id"])
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Ran into exception '{e}' when pulling sections from Google Doc '{file['name']}'."
|
||||
" Falling back to basic extraction."
|
||||
f"Exception '{e}' when pulling sections from Google Doc '{file['name']}'. "
|
||||
"Falling back to basic extraction."
|
||||
)
|
||||
# NOTE: this will run for either (1) the above failed or (2) the file is not a Google Doc
|
||||
|
||||
# If not a GDoc or GDoc extraction failed
|
||||
if not sections:
|
||||
try:
|
||||
# For all other file types just extract the text
|
||||
sections = _extract_sections_basic(file, drive_service)
|
||||
|
||||
except HttpError as e:
|
||||
reason = e.error_details[0]["reason"] if e.error_details else e.reason
|
||||
message = e.error_details[0]["message"] if e.error_details else e.reason
|
||||
@@ -217,8 +82,8 @@ def convert_drive_item_to_document(
|
||||
f"Could not export file '{file['name']}' due to '{message}', skipping..."
|
||||
)
|
||||
return None
|
||||
|
||||
raise
|
||||
|
||||
if not sections:
|
||||
return None
|
||||
|
||||
@@ -238,9 +103,248 @@ def convert_drive_item_to_document(
|
||||
except Exception as e:
|
||||
if not CONTINUE_ON_CONNECTOR_FAILURE:
|
||||
raise e
|
||||
|
||||
logger.exception("Ran into exception when pulling a file from Google Drive")
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _extract_sections_basic(
    file: GoogleDriveFileType, service: GoogleDriveService
) -> list[Section]:
    """
    Dispatch text extraction for a Drive file according to its MIME type.

    MIME types outside ``GDriveMimeType`` produce a single placeholder section.
    Google Sheets are tried with ``_extract_google_sheets`` first; on any
    exception (and for every other supported type) the work is handed to
    ``_extract_general_content``.
    """
    mime_type = file["mimeType"]
    web_link = file["webViewLink"]

    supported_mime_types = {member.value for member in GDriveMimeType}
    if mime_type not in supported_mime_types:
        logger.debug(
            f"Unsupported MIME type '{mime_type}' for file '{file.get('name')}'"
        )
        return [Section(link=web_link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]

    # Everything that is not a spreadsheet goes straight to the general path.
    if mime_type != GDriveMimeType.SPREADSHEET.value:
        return _extract_general_content(file, service)

    try:
        return _extract_google_sheets(file, service)
    except Exception as e:
        logger.warning(
            f"Error extracting data from Google Sheet '{file['name']}': {e}. "
            "Falling back to basic content extraction."
        )

    # Sheets-specific extraction failed; use the general path as a fallback.
    return _extract_general_content(file, service)
|
||||
|
||||
|
||||
def _sheet_column_letters(column_number: int) -> str:
    """Convert a 1-based column number to A1 letters (1 -> A, 26 -> Z, 27 -> AA)."""
    letters = ""
    while column_number > 0:
        column_number, remainder = divmod(column_number - 1, 26)
        letters = chr(ord("A") + remainder) + letters
    return letters


def _extract_google_sheets(
    file: dict[str, str], service: GoogleDriveService
) -> list[Section]:
    """
    Specialized extraction logic for Google Sheets.

    Builds a Sheets client from the Drive service's credentials, then fetches
    the full grid (A1 through the last column/row reported in
    ``gridProperties``, defaulting to 26 columns x 1000 rows) of every sheet.

    Args:
        file: Drive file metadata; ``webViewLink`` and ``id`` are required,
            ``name`` is only used in log messages.
        service: Drive service whose credentials are reused for the Sheets API.

    Returns:
        One Section per sheet that has values. The text starts with a
        ``Sheet: <title>`` line followed by tab-separated rows, and the link
        points at the sheet via ``#gid=<sheetId>``. Sheets whose values request
        raises ``HttpError`` are logged and skipped, so the list may be empty.

    Raises:
        Whatever building the client or fetching the spreadsheet metadata
        raises; only per-sheet ``HttpError`` is handled here.
    """
    link = file["webViewLink"]
    file_id = file["id"]

    sheets_service = build("sheets", "v4", credentials=service._http.credentials)
    spreadsheet = sheets_service.spreadsheets().get(spreadsheetId=file_id).execute()

    sections: list[Section] = []
    for sheet in spreadsheet.get("sheets", []):
        properties = sheet["properties"]
        sheet_name = properties["title"]
        sheet_id = properties["sheetId"]

        grid_props = properties.get("gridProperties", {})
        row_count = grid_props.get("rowCount", 1000)
        column_count = grid_props.get("columnCount", 26)
        end_column = _sheet_column_letters(column_count)

        # A1 notation wraps the sheet title in single quotes, so apostrophes
        # inside the title must be doubled or the range is rejected.
        quoted_name = sheet_name.replace("'", "''")
        range_name = f"'{quoted_name}'!A1:{end_column}{row_count}"

        try:
            result = (
                sheets_service.spreadsheets()
                .values()
                .get(spreadsheetId=file_id, range=range_name)
                .execute()
            )
        except HttpError as e:
            logger.warning(
                f"Error fetching data for sheet '{sheet_name}' in '{file.get('name')}' : {e}"
            )
            continue

        values = result.get("values", [])
        if not values:
            continue

        rows = "".join(
            "\t".join(str(cell) for cell in row) + "\n" for row in values
        )
        sections.append(
            Section(link=f"{link}#gid={sheet_id}", text=f"Sheet: {sheet_name}\n" + rows)
        )

    return sections
|
||||
|
||||
|
||||
def _extract_general_content(
    file: dict[str, str], service: GoogleDriveService
) -> list[Section]:
    """
    Extracts general file content; also the fallback for Google Sheets whose
    Sheets-API extraction failed.

    - Google Docs/Slides: Export as text/plain and return directly.
    - Google Sheets: Export as text/csv (Google-native files have no binary
      content to download, so they must be exported).
    - PDF: read_pdf_file.
    - DOCX: Unstructured, then docx_to_text, then MarkItDown.
    - PPTX: Unstructured, then pptx_to_text, then MarkItDown.
    - TXT/Markdown: Decode the content; if empty, log.

    Args:
        file: Drive file metadata; ``webViewLink``, ``mimeType`` and ``id``
            are required, ``name`` falls back to the id.
        service: Drive service used to export or download the file.

    Returns:
        A single-element list. Empty results and any exception are replaced
        by a section containing UNSUPPORTED_FILE_TYPE_CONTENT.
    """
    link = file["webViewLink"]
    mime_type = file["mimeType"]
    file_id = file["id"]
    file_name = file.get("name", file_id)

    try:
        # Google Sheets reach this function only as a fallback; export as CSV.
        if mime_type == GDriveMimeType.SPREADSHEET.value:
            logger.debug(f"Exporting Google Sheet as CSV: {file_name}")
            content = (
                service.files().export(fileId=file_id, mimeType="text/csv").execute()
            )
            text = content.decode("utf-8", errors="replace").strip()
            if not text:
                logger.warning(f"No text extracted from Google Sheet '{file_name}'.")
                text = UNSUPPORTED_FILE_TYPE_CONTENT
            return [Section(link=link, text=text)]

        # Google Docs and Google Slides (internal GDrive formats)
        if mime_type in (GDriveMimeType.DOC.value, GDriveMimeType.PPT.value):
            logger.debug(f"Extracting Google-native doc/presentation: {file_name}")
            export_mime_type = "text/plain"
            content = (
                service.files()
                .export(fileId=file_id, mimeType=export_mime_type)
                .execute()
            )
            text = content.decode("utf-8", errors="replace").strip()
            if not text:
                logger.warning(
                    f"No text extracted from Google Docs/Slides file '{file_name}'."
                )
                text = UNSUPPORTED_FILE_TYPE_CONTENT
            return [Section(link=link, text=text)]

        # For all other formats, get raw content
        content = service.files().get_media(fileId=file_id).execute()

        if mime_type == GDriveMimeType.PDF.value:
            logger.debug(f"Extracting PDF content for '{file_name}'")
            text, _ = read_pdf_file(file=io.BytesIO(content))
            if not text:
                logger.warning(
                    f"No text extracted from PDF '{file_name}' with read_pdf_file."
                )
                text = UNSUPPORTED_FILE_TYPE_CONTENT
            return [Section(link=link, text=text)]

        if mime_type == GDriveMimeType.WORD_DOC.value:
            logger.debug(f"Extracting DOCX content for '{file_name}'")
            return [
                Section(link=link, text=_extract_docx_pptx_txt(content, file, "docx"))
            ]

        if mime_type == GDriveMimeType.POWERPOINT.value:
            logger.debug(f"Extracting PPTX content for '{file_name}'")
            return [
                Section(link=link, text=_extract_docx_pptx_txt(content, file, "pptx"))
            ]

        if mime_type in (
            GDriveMimeType.PLAIN_TEXT.value,
            GDriveMimeType.MARKDOWN.value,
        ):
            logger.debug(f"Extracting plain text/markdown content for '{file_name}'")
            text = content.decode("utf-8", errors="replace").strip()
            if not text:
                logger.warning(
                    f"No text extracted from TXT/MD '{file_name}'. Returning unsupported message."
                )
                text = UNSUPPORTED_FILE_TYPE_CONTENT
            return [Section(link=link, text=text)]

        # If we reach here, it's some other format supported by MarkItDown/unstructured
        logger.debug(f"Trying MarkItDown/unstructured fallback for '{file_name}'")
        text = _extract_docx_pptx_txt(content, file, None)  # generic fallback
        return [Section(link=link, text=text)]

    except Exception as e:
        logger.error(
            f"Error extracting file content for '{file_name}': {e}", exc_info=True
        )
        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
|
||||
|
||||
|
||||
def _extract_docx_pptx_txt(
    content: bytes, file: dict[str, str], file_type: str | None
) -> str:
    """
    Attempts to extract text from DOCX, PPTX, or any supported format using:
    1. unstructured (if configured)
    2. docx_to_text/pptx_to_text if known format
    3. MarkItDown fallback

    Each stage is best-effort: an exception or an empty/whitespace-only result
    is logged and the next stage is tried.

    Args:
        content: Raw bytes of the downloaded file.
        file: Drive file metadata; ``name`` (falling back to ``id``) is used
            for logging and passed to unstructured.
        file_type: ``"docx"``, ``"pptx"``, or ``None`` when the format is not
            known (stage 2 is skipped then).

    Returns:
        The first non-blank text produced, or UNSUPPORTED_FILE_TYPE_CONTENT
        when every stage fails.
    """
    file_name = file.get("name", file["id"])

    # 1. Try unstructured first
    if get_unstructured_api_key():
        try:
            logger.debug(f"Attempting unstructured extraction for '{file_name}'...")
            text = unstructured_to_text(io.BytesIO(content), file_name)
            if text.strip():
                return text
            logger.warning(f"Unstructured returned empty text for '{file_name}'.")
        except Exception as e:
            logger.warning(f"Unstructured extraction failed for '{file_name}': {e}")

    # 2. If format is docx or pptx, try direct extraction methods
    direct_extractors = {
        "docx": ("docx_to_text", docx_to_text),
        "pptx": ("pptx_to_text", pptx_to_text),
    }
    if file_type in direct_extractors:
        label, extractor = direct_extractors[file_type]
        try:
            logger.debug(f"Trying {label} for '{file_name}'...")
            text = extractor(file=io.BytesIO(content))
            if text.strip():
                return text
            logger.warning(f"{label} returned empty for '{file_name}'.")
        except Exception as e:
            logger.warning(f"{label} failed for '{file_name}': {e}")

    # 3. Fallback to MarkItDown
    try:
        logger.debug(f"Falling back to MarkItDown for '{file_name}'...")
        md = MarkItDown()
        # In-memory data goes through convert_stream, as elsewhere in this
        # change set. Pass the extension only when the format is known.
        stream_kwargs = {"file_extension": f".{file_type}"} if file_type else {}
        result = md.convert_stream(io.BytesIO(content), **stream_kwargs)
        if result and result.text_content and result.text_content.strip():
            return result.text_content
        logger.warning(f"MarkItDown returned empty text for '{file_name}'.")
    except Exception as e:
        logger.error(
            f"MarkItDown conversion failed for '{file_name}': {e}", exc_info=True
        )

    # If all methods fail or return empty, return unsupported message
    logger.error(
        f"All extraction methods failed for '{file_name}', returning unsupported file message."
    )
    return UNSUPPORTED_FILE_TYPE_CONTENT
|
||||
|
||||
|
||||
def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
|
||||
|
||||
@@ -14,10 +14,9 @@ from typing import IO
|
||||
|
||||
import chardet
|
||||
import docx # type: ignore
|
||||
import openpyxl # type: ignore
|
||||
import pptx # type: ignore
|
||||
from docx import Document
|
||||
from fastapi import UploadFile
|
||||
from markitdown import MarkItDown # type: ignore
|
||||
from pypdf import PdfReader
|
||||
from pypdf.errors import PdfStreamError
|
||||
|
||||
@@ -60,6 +59,9 @@ VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
|
||||
".html",
|
||||
]
|
||||
|
||||
# These are the file extensions that we use markitdown for
|
||||
MARKITDOWN_FILE_EXTENSIONS = [".docx", ".pptx", ".xlsx"]
|
||||
|
||||
|
||||
def is_text_file_extension(file_name: str) -> bool:
    """Return True when ``file_name`` ends with one of PLAIN_TEXT_FILE_EXTENSIONS."""
    for suffix in PLAIN_TEXT_FILE_EXTENSIONS:
        if file_name.endswith(suffix):
            return True
    return False
|
||||
@@ -74,6 +76,10 @@ def is_valid_file_ext(ext: str) -> bool:
|
||||
return ext in VALID_FILE_EXTENSIONS
|
||||
|
||||
|
||||
def is_markitdown_file_ext(ext: str) -> bool:
    """Return True when ``ext`` is listed in MARKITDOWN_FILE_EXTENSIONS."""
    handled_by_markitdown = ext in MARKITDOWN_FILE_EXTENSIONS
    return handled_by_markitdown
|
||||
|
||||
|
||||
def is_text_file(file: IO[bytes]) -> bool:
|
||||
"""
|
||||
checks if the first 1024 bytes only contain printable or whitespace characters
|
||||
@@ -185,13 +191,6 @@ def read_text_file(
|
||||
return file_content_raw, metadata
|
||||
|
||||
|
||||
def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
    """Return the text of a PDF file.

    Thin wrapper over ``read_pdf_file``: the first element of the pair it
    returns is kept and the second is discarded.
    """
    extracted = read_pdf_file(file, pdf_pass)
    pdf_text, _ = extracted
    return pdf_text
|
||||
|
||||
|
||||
def read_pdf_file(
|
||||
file: IO[Any],
|
||||
pdf_pass: str | None = None,
|
||||
@@ -299,16 +298,11 @@ def pptx_to_text(file: IO[Any]) -> str:
|
||||
return TEXT_SECTION_SEPARATOR.join(text_content)
|
||||
|
||||
|
||||
def xlsx_to_text(file: IO[Any]) -> str:
    """Render every worksheet of an XLSX workbook as comma-separated text.

    Each row becomes one line with its cell values passed through ``str`` and
    joined by commas; worksheets are joined with TEXT_SECTION_SEPARATOR.

    Args:
        file: Binary file-like object containing the workbook.

    Returns:
        The concatenated text of all worksheets.
    """
    workbook = openpyxl.load_workbook(file, read_only=True)
    try:
        text_content = [
            "\n".join(
                ",".join(map(str, row))
                for row in sheet.iter_rows(min_row=1, values_only=True)
            )
            for sheet in workbook.worksheets
        ]
    finally:
        # Read-only workbooks load lazily and must be closed explicitly.
        workbook.close()
    return TEXT_SECTION_SEPARATOR.join(text_content)
|
||||
def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
    """Return the text of a PDF file.

    Thin wrapper over ``read_pdf_file``: the first element of the pair it
    returns is kept and the second is discarded.
    """
    extracted = read_pdf_file(file, pdf_pass)
    pdf_text, _ = extracted
    return pdf_text
|
||||
|
||||
|
||||
def eml_to_text(file: IO[Any]) -> str:
|
||||
@@ -346,9 +340,6 @@ def extract_file_text(
|
||||
) -> str:
|
||||
extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
|
||||
".pdf": pdf_to_text,
|
||||
".docx": docx_to_text,
|
||||
".pptx": pptx_to_text,
|
||||
".xlsx": xlsx_to_text,
|
||||
".eml": eml_to_text,
|
||||
".epub": epub_to_text,
|
||||
".html": parse_html_page_basic,
|
||||
@@ -358,6 +349,8 @@ def extract_file_text(
|
||||
if get_unstructured_api_key():
|
||||
return unstructured_to_text(file, file_name)
|
||||
|
||||
md = MarkItDown()
|
||||
|
||||
if file_name or extension:
|
||||
if extension is not None:
|
||||
final_extension = extension
|
||||
@@ -365,6 +358,12 @@ def extract_file_text(
|
||||
final_extension = get_file_ext(file_name)
|
||||
|
||||
if is_valid_file_ext(final_extension):
|
||||
if is_markitdown_file_ext(final_extension):
|
||||
with BytesIO(file.read()) as file_like_object:
|
||||
result = md.convert_stream(
|
||||
file_like_object, file_extension=final_extension
|
||||
)
|
||||
return result.text_content
|
||||
return extension_to_function.get(final_extension, file_io_to_text)(file)
|
||||
|
||||
# Either the file somehow has no name or the extension is not one that we recognize
|
||||
@@ -382,29 +381,37 @@ def extract_file_text(
|
||||
return ""
|
||||
|
||||
|
||||
def convert_docx_to_txt(
|
||||
def convert_docx_to_markdown(
|
||||
file: UploadFile, file_store: FileStore, file_path: str
|
||||
) -> None:
|
||||
file.file.seek(0)
|
||||
docx_content = file.file.read()
|
||||
doc = Document(BytesIO(docx_content))
|
||||
try:
|
||||
# Read the file content
|
||||
file_content = file.file.read()
|
||||
|
||||
# Extract text from the document
|
||||
full_text = []
|
||||
for para in doc.paragraphs:
|
||||
full_text.append(para.text)
|
||||
if not file_content:
|
||||
raise ValueError(f"File {file.filename} is empty")
|
||||
|
||||
# Join the extracted text
|
||||
text_content = "\n".join(full_text)
|
||||
# Reset the file pointer to the beginning
|
||||
file.file.seek(0)
|
||||
|
||||
txt_file_path = docx_to_txt_filename(file_path)
|
||||
file_store.save_file(
|
||||
file_name=txt_file_path,
|
||||
content=BytesIO(text_content.encode("utf-8")),
|
||||
display_name=file.filename,
|
||||
file_origin=FileOrigin.CONNECTOR,
|
||||
file_type="text/plain",
|
||||
)
|
||||
text_content = extract_file_text(
|
||||
file=file.file, file_name=file.filename or "", extension=".docx"
|
||||
)
|
||||
|
||||
if not text_content:
|
||||
raise ValueError(f"Failed to extract text from {file.filename}")
|
||||
|
||||
txt_file_path = docx_to_txt_filename(file_path)
|
||||
file_store.save_file(
|
||||
file_name=txt_file_path,
|
||||
content=BytesIO(text_content.encode("utf-8")),
|
||||
display_name=file.filename,
|
||||
file_origin=FileOrigin.CONNECTOR,
|
||||
file_type="text/plain",
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error converting DOCX to Markdown: {str(e)}")
|
||||
raise RuntimeError(f"Failed to process file {file.filename}: {str(e)}") from e
|
||||
|
||||
|
||||
def docx_to_txt_filename(file_path: str) -> str:
|
||||
|
||||
@@ -87,7 +87,7 @@ from onyx.db.models import SearchSettings
|
||||
from onyx.db.models import User
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.db.search_settings import get_secondary_search_settings
|
||||
from onyx.file_processing.extract_file_text import convert_docx_to_txt
|
||||
from onyx.file_processing.extract_file_text import convert_docx_to_markdown
|
||||
from onyx.file_store.file_store import get_default_file_store
|
||||
from onyx.key_value_store.interface import KvKeyNotFoundError
|
||||
from onyx.redis.redis_connector import RedisConnector
|
||||
@@ -396,11 +396,12 @@ def upload_files(
|
||||
file_origin=FileOrigin.CONNECTOR,
|
||||
file_type=file.content_type or "text/plain",
|
||||
)
|
||||
file.file.seek(0)
|
||||
|
||||
if file.content_type and file.content_type.startswith(
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
):
|
||||
convert_docx_to_txt(file, file_store, file_path)
|
||||
convert_docx_to_markdown(file, file_store, file_path)
|
||||
|
||||
except ValueError as e:
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
|
||||
@@ -81,4 +81,5 @@ stripe==10.12.0
|
||||
urllib3==2.2.3
|
||||
mistune==0.8.4
|
||||
sentry-sdk==2.14.0
|
||||
prometheus_client==0.21.0
|
||||
prometheus_client==0.21.0
|
||||
markitdown==0.0.1a3
|
||||
@@ -6,6 +6,7 @@ import {
|
||||
} from "@/app/chat/message/MemoizedTextComponents";
|
||||
import React, { useMemo } from "react";
|
||||
import ReactMarkdown from "react-markdown";
|
||||
import rehypePrism from "rehype-prism-plus";
|
||||
import remarkGfm from "remark-gfm";
|
||||
|
||||
interface MinimalMarkdownProps {
|
||||
@@ -35,9 +36,10 @@ export const MinimalMarkdown: React.FC<MinimalMarkdownProps> = ({
|
||||
|
||||
return (
|
||||
<ReactMarkdown
|
||||
className={`w-full text-wrap break-word ${className}`}
|
||||
className={`prose max-w-full text-base ${className}`}
|
||||
components={markdownComponents}
|
||||
remarkPlugins={[remarkGfm]}
|
||||
rehypePlugins={[[rehypePrism, { ignoreMissing: true }]]}
|
||||
>
|
||||
{content}
|
||||
</ReactMarkdown>
|
||||
|
||||
@@ -21,11 +21,11 @@ export default function TextView({
|
||||
onClose,
|
||||
}: TextViewProps) {
|
||||
const [zoom, setZoom] = useState(100);
|
||||
const [fileContent, setFileContent] = useState<string>("");
|
||||
const [fileUrl, setFileUrl] = useState<string>("");
|
||||
const [fileName, setFileName] = useState<string>("");
|
||||
const [fileContent, setFileContent] = useState("");
|
||||
const [fileUrl, setFileUrl] = useState("");
|
||||
const [fileName, setFileName] = useState("");
|
||||
const [isLoading, setIsLoading] = useState(true);
|
||||
const [fileType, setFileType] = useState<string>("application/octet-stream");
|
||||
const [fileType, setFileType] = useState("application/octet-stream");
|
||||
|
||||
const isMarkdownFormat = (mimeType: string): boolean => {
|
||||
const markdownFormats = [
|
||||
@@ -51,18 +51,17 @@ export default function TextView({
|
||||
|
||||
const fetchFile = useCallback(async () => {
|
||||
setIsLoading(true);
|
||||
const fileId = presentingDocument.document_id.split("__")[1];
|
||||
try {
|
||||
const fileId = presentingDocument.document_id.split("__")[1];
|
||||
const response = await fetch(
|
||||
`/api/chat/file/${encodeURIComponent(fileId)}`,
|
||||
{
|
||||
method: "GET",
|
||||
}
|
||||
`/api/chat/file/${encodeURIComponent(fileId)}`
|
||||
);
|
||||
const blob = await response.blob();
|
||||
|
||||
const url = window.URL.createObjectURL(blob);
|
||||
setFileUrl(url);
|
||||
setFileName(presentingDocument.semantic_identifier || "document");
|
||||
|
||||
const contentType =
|
||||
response.headers.get("Content-Type") || "application/octet-stream";
|
||||
setFileType(contentType);
|
||||
@@ -70,9 +69,28 @@ export default function TextView({
|
||||
if (isMarkdownFormat(blob.type)) {
|
||||
const text = await blob.text();
|
||||
setFileContent(text);
|
||||
} else if (blob.type === "application/octet-stream") {
|
||||
try {
|
||||
const text = await blob.text();
|
||||
let nonPrintingCount = 0;
|
||||
for (let i = 0; i < text.length; i++) {
|
||||
const code = text.charCodeAt(i);
|
||||
if (code < 32 && ![9, 10, 13].includes(code)) {
|
||||
nonPrintingCount++;
|
||||
}
|
||||
}
|
||||
const ratio = nonPrintingCount / text.length;
|
||||
|
||||
if (ratio < 0.05) {
|
||||
setFileContent(text);
|
||||
setFileType("text/plain");
|
||||
}
|
||||
} catch (err) {
|
||||
console.error("Failed to parse octet-stream as text", err);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error("Error fetching file:", error);
|
||||
} catch (err) {
|
||||
console.error("Error fetching file:", err);
|
||||
} finally {
|
||||
setTimeout(() => {
|
||||
setIsLoading(false);
|
||||
@@ -137,7 +155,7 @@ export default function TextView({
|
||||
</div>
|
||||
) : (
|
||||
<div
|
||||
className={`w-full h-full transform origin-center transition-transform duration-300 ease-in-out`}
|
||||
className="w-full h-full transform origin-center transition-transform duration-300 ease-in-out"
|
||||
style={{ transform: `scale(${zoom / 100})` }}
|
||||
>
|
||||
{isSupportedIframeFormat(fileType) ? (
|
||||
@@ -146,7 +164,7 @@ export default function TextView({
|
||||
className="w-full h-full border-none"
|
||||
title="File Viewer"
|
||||
/>
|
||||
) : isMarkdownFormat(fileType) ? (
|
||||
) : isMarkdownFormat(fileType) || fileType === "text/plain" ? (
|
||||
<div className="w-full h-full p-6 overflow-y-scroll overflow-x-hidden">
|
||||
<MinimalMarkdown
|
||||
content={fileContent}
|
||||
|
||||
Reference in New Issue
Block a user