update

= -> ==
cosmetic improvements
2026-02-16 23:35:46 +00:00 · 2024-12-19 14:22:13 -08:00 · 2024-12-19 13:43:38 -08:00 · 2024-12-18 10:48:17 -08:00 · 2024-12-18 10:10:16 -08:00 · 2024-12-18 10:09:18 -08:00
6 changed files with 341 additions and 208 deletions
--- a/backend/onyx/connectors/google_drive/doc_conversion.py
+++ b/backend/onyx/connectors/google_drive/doc_conversion.py
@@ -4,6 +4,7 @@ from datetime import timezone

 from googleapiclient.discovery import build  # type: ignore
 from googleapiclient.errors import HttpError  # type: ignore
+from markitdown import MarkItDown  # type: ignore

 from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from onyx.configs.constants import DocumentSource
@@ -26,8 +27,8 @@ from onyx.file_processing.unstructured import get_unstructured_api_key
 from onyx.file_processing.unstructured import unstructured_to_text
 from onyx.utils.logger import setup_logger

-logger = setup_logger()

+logger = setup_logger()

 # these errors don't represent a failure in the connector, but simply files
 # that can't / shouldn't be indexed
@@ -38,177 +39,41 @@ ERRORS_TO_CONTINUE_ON = [
 ]


-def _extract_sections_basic(
-    file: dict[str, str], service: GoogleDriveService
-) -> list[Section]:
-    mime_type = file["mimeType"]
-    link = file["webViewLink"]
-
-    if mime_type not in set(item.value for item in GDriveMimeType):
-        # Unsupported file types can still have a title, finding this way is still useful
-        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
-
-    try:
-        if mime_type == GDriveMimeType.SPREADSHEET.value:
-            try:
-                sheets_service = build(
-                    "sheets", "v4", credentials=service._http.credentials
-                )
-                spreadsheet = (
-                    sheets_service.spreadsheets()
-                    .get(spreadsheetId=file["id"])
-                    .execute()
-                )
-
-                sections = []
-                for sheet in spreadsheet["sheets"]:
-                    sheet_name = sheet["properties"]["title"]
-                    sheet_id = sheet["properties"]["sheetId"]
-
-                    # Get sheet dimensions
-                    grid_properties = sheet["properties"].get("gridProperties", {})
-                    row_count = grid_properties.get("rowCount", 1000)
-                    column_count = grid_properties.get("columnCount", 26)
-
-                    # Convert column count to letter (e.g., 26 -> Z, 27 -> AA)
-                    end_column = ""
-                    while column_count:
-                        column_count, remainder = divmod(column_count - 1, 26)
-                        end_column = chr(65 + remainder) + end_column
-
-                    range_name = f"'{sheet_name}'!A1:{end_column}{row_count}"
-
-                    try:
-                        result = (
-                            sheets_service.spreadsheets()
-                            .values()
-                            .get(spreadsheetId=file["id"], range=range_name)
-                            .execute()
-                        )
-                        values = result.get("values", [])
-
-                        if values:
-                            text = f"Sheet: {sheet_name}\n"
-                            for row in values:
-                                text += "\t".join(str(cell) for cell in row) + "\n"
-                            sections.append(
-                                Section(
-                                    link=f"{link}#gid={sheet_id}",
-                                    text=text,
-                                )
-                            )
-                    except HttpError as e:
-                        logger.warning(
-                            f"Error fetching data for sheet '{sheet_name}': {e}"
-                        )
-                        continue
-                return sections
-
-            except Exception as e:
-                logger.warning(
-                    f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'."
-                    " Falling back to basic extraction."
-                )
-
-        if mime_type in [
-            GDriveMimeType.DOC.value,
-            GDriveMimeType.PPT.value,
-            GDriveMimeType.SPREADSHEET.value,
-        ]:
-            export_mime_type = (
-                "text/plain"
-                if mime_type != GDriveMimeType.SPREADSHEET.value
-                else "text/csv"
-            )
-            text = (
-                service.files()
-                .export(fileId=file["id"], mimeType=export_mime_type)
-                .execute()
-                .decode("utf-8")
-            )
-            return [Section(link=link, text=text)]
-
-        elif mime_type in [
-            GDriveMimeType.PLAIN_TEXT.value,
-            GDriveMimeType.MARKDOWN.value,
-        ]:
-            return [
-                Section(
-                    link=link,
-                    text=service.files()
-                    .get_media(fileId=file["id"])
-                    .execute()
-                    .decode("utf-8"),
-                )
-            ]
-        if mime_type in [
-            GDriveMimeType.WORD_DOC.value,
-            GDriveMimeType.POWERPOINT.value,
-            GDriveMimeType.PDF.value,
-        ]:
-            response = service.files().get_media(fileId=file["id"]).execute()
-            if get_unstructured_api_key():
-                return [
-                    Section(
-                        link=link,
-                        text=unstructured_to_text(
-                            file=io.BytesIO(response),
-                            file_name=file.get("name", file["id"]),
-                        ),
-                    )
-                ]
-
-            if mime_type == GDriveMimeType.WORD_DOC.value:
-                return [
-                    Section(link=link, text=docx_to_text(file=io.BytesIO(response)))
-                ]
-            elif mime_type == GDriveMimeType.PDF.value:
-                text, _ = read_pdf_file(file=io.BytesIO(response))
-                return [Section(link=link, text=text)]
-            elif mime_type == GDriveMimeType.POWERPOINT.value:
-                return [
-                    Section(link=link, text=pptx_to_text(file=io.BytesIO(response)))
-                ]
-
-        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
-
-    except Exception:
-        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
-
-
 def convert_drive_item_to_document(
    file: GoogleDriveFileType,
    drive_service: GoogleDriveService,
    docs_service: GoogleDocsService,
 ) -> Document | None:
+    """
+    Converts a Google Drive file into an internal Document object, extracting
+    the text and organizing it into sections. Uses specialized methods for Google Docs
+    to preserve structure. Falls back to basic extraction for all other formats.
+    """
    try:
-        # Skip files that are shortcuts
+        # Skip shortcuts and folders
        if file.get("mimeType") == DRIVE_SHORTCUT_TYPE:
            logger.info("Ignoring Drive Shortcut Filetype")
            return None
-        # Skip files that are folders
        if file.get("mimeType") == DRIVE_FOLDER_TYPE:
            logger.info("Ignoring Drive Folder Filetype")
            return None

        sections: list[Section] = []

-        # Special handling for Google Docs to preserve structure, link
-        # to headers
+        # Special handling for Google Docs to preserve structure
        if file.get("mimeType") == GDriveMimeType.DOC.value:
            try:
                sections = get_document_sections(docs_service, file["id"])
            except Exception as e:
                logger.warning(
-                    f"Ran into exception '{e}' when pulling sections from Google Doc '{file['name']}'."
-                    " Falling back to basic extraction."
+                    f"Exception '{e}' when pulling sections from Google Doc '{file['name']}'. "
+                    "Falling back to basic extraction."
                )
-        # NOTE: this will run for either (1) the above failed or (2) the file is not a Google Doc
+
+        # If not a GDoc or GDoc extraction failed
        if not sections:
            try:
-                # For all other file types just extract the text
                sections = _extract_sections_basic(file, drive_service)
-
            except HttpError as e:
                reason = e.error_details[0]["reason"] if e.error_details else e.reason
                message = e.error_details[0]["message"] if e.error_details else e.reason
@@ -217,8 +82,8 @@ def convert_drive_item_to_document(
                        f"Could not export file '{file['name']}' due to '{message}', skipping..."
                    )
                    return None
-
                raise
+
        if not sections:
            return None

@@ -238,9 +103,248 @@ def convert_drive_item_to_document(
    except Exception as e:
        if not CONTINUE_ON_CONNECTOR_FAILURE:
            raise e
-
        logger.exception("Ran into exception when pulling a file from Google Drive")
-    return None
+        return None
+
+
+def _extract_sections_basic(
+    file: GoogleDriveFileType, service: GoogleDriveService
+) -> list[Section]:
+    """
+    Returns the text sections of a Drive file, choosing the extractor by file["mimeType"].
+    """
+    mime_type = file["mimeType"]
+    link = file["webViewLink"]
+
+    # MIME types outside GDriveMimeType get a single placeholder section, not content
+    if mime_type not in {item.value for item in GDriveMimeType}:
+        logger.debug(
+            f"Unsupported MIME type '{mime_type}' for file '{file.get('name')}'"
+        )
+        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
+
+    # Google Sheets: per-sheet extraction; any exception is logged and falls through
+    if mime_type == GDriveMimeType.SPREADSHEET.value:
+        try:
+            return _extract_google_sheets(file, service)
+        except Exception as e:
+            logger.warning(
+                f"Error extracting data from Google Sheet '{file['name']}': {e}. "
+                "Falling back to basic content extraction."
+            )
+
+    # All other supported types, plus Sheets whose per-sheet extraction raised
+    return _extract_general_content(file, service)
+
+
+def _extract_google_sheets(
+    file: dict[str, str], service: GoogleDriveService
+) -> list[Section]:
+    """
+    Builds one Section per non-empty sheet: a "Sheet: <title>" line, then tab-joined rows.
+    Sheets whose values request raises HttpError are logged and skipped, not fatal.
+    """
+    link = file["webViewLink"]
+    file_id = file["id"]
+
+    sheets_service = build("sheets", "v4", credentials=service._http.credentials)
+    spreadsheet = sheets_service.spreadsheets().get(spreadsheetId=file_id).execute()
+
+    sections: list[Section] = []
+    for sheet in spreadsheet.get("sheets", []):
+        sheet_name = sheet["properties"]["title"]
+        sheet_id = sheet["properties"]["sheetId"]
+
+        grid_props = sheet["properties"].get("gridProperties", {})
+        row_count = grid_props.get("rowCount", 1000)  # fallback if not reported
+        column_count = grid_props.get("columnCount", 26)  # fallback if not reported
+
+        # Turn the column count into the last column's letters (1->A, 26->Z, 27->AA,...)
+        end_column = ""
+        col_count = column_count
+        while col_count > 0:
+            col_count, remainder = divmod(col_count - 1, 26)
+            end_column = chr(65 + remainder) + end_column
+
+        range_name = f"'{sheet_name}'!A1:{end_column}{row_count}"  # whole-grid A1 range
+
+        try:
+            result = (
+                sheets_service.spreadsheets()
+                .values()
+                .get(spreadsheetId=file_id, range=range_name)
+                .execute()
+            )
+            values = result.get("values", [])
+
+            if values:
+                text = f"Sheet: {sheet_name}\n"
+                for row in values:
+                    text += "\t".join(str(cell) for cell in row) + "\n"
+
+                sections.append(Section(link=f"{link}#gid={sheet_id}", text=text))
+        except HttpError as e:
+            logger.warning(
+                f"Error fetching data for sheet '{sheet_name}' in '{file.get('name')}' : {e}"
+            )
+            continue
+
+    return sections
+
+
+def _extract_general_content(
+    file: dict[str, str], service: GoogleDriveService
+) -> list[Section]:
+    """
+    Extracts a file's content as one Section; extraction errors are logged and
+    yield the unsupported-file placeholder instead of raising.
+    - Google Docs/Slides: exported as text/plain.
+    - Google Sheets (reached when per-sheet extraction raised): exported as text/csv.
+    - PDF: read_pdf_file.
+    - DOCX / PPTX / any other type: _extract_docx_pptx_txt.
+    - TXT / Markdown: decoded as UTF-8, invalid bytes replaced.
+    """
+    link = file["webViewLink"]
+    mime_type = file["mimeType"]
+    file_id = file["id"]
+    file_name = file.get("name", file_id)
+
+    try:
+        # Google-native formats cannot be fetched with get_media and must be exported
+        is_sheet = mime_type == GDriveMimeType.SPREADSHEET.value
+        native_types = (GDriveMimeType.DOC.value, GDriveMimeType.PPT.value)
+        if is_sheet or mime_type in native_types:
+            logger.debug(f"Extracting Google-native doc/presentation: {file_name}")
+            export_mime_type = "text/csv" if is_sheet else "text/plain"
+            content = (
+                service.files()
+                .export(fileId=file_id, mimeType=export_mime_type)
+                .execute()
+            )
+            text = content.decode("utf-8", errors="replace").strip()
+            if not text:
+                logger.warning(
+                    f"No text extracted from Google Docs/Slides file '{file_name}'."
+                )
+                text = UNSUPPORTED_FILE_TYPE_CONTENT
+            return [Section(link=link, text=text)]
+
+        # For all other formats, get raw content
+        content = service.files().get_media(fileId=file_id).execute()
+
+        if mime_type == GDriveMimeType.PDF.value:
+            # PDFs always go through read_pdf_file (no unstructured/MarkItDown attempt)
+            logger.debug(f"Extracting PDF content for '{file_name}'")
+            text, _ = read_pdf_file(file=io.BytesIO(content))
+            if not text:
+                logger.warning(
+                    f"No text extracted from PDF '{file_name}' with read_pdf_file."
+                )
+                text = UNSUPPORTED_FILE_TYPE_CONTENT
+            return [Section(link=link, text=text)]
+
+        if mime_type == GDriveMimeType.WORD_DOC.value:
+            logger.debug(f"Extracting DOCX content for '{file_name}'")
+            return [
+                Section(link=link, text=_extract_docx_pptx_txt(content, file, "docx"))
+            ]
+
+        if mime_type == GDriveMimeType.POWERPOINT.value:
+            logger.debug(f"Extracting PPTX content for '{file_name}'")
+            return [
+                Section(link=link, text=_extract_docx_pptx_txt(content, file, "pptx"))
+            ]
+
+        if (
+            mime_type == GDriveMimeType.PLAIN_TEXT.value
+            or mime_type == GDriveMimeType.MARKDOWN.value
+        ):
+            logger.debug(f"Extracting plain text/markdown content for '{file_name}'")
+            text = content.decode("utf-8", errors="replace").strip()
+            if not text:
+                logger.warning(
+                    f"No text extracted from TXT/MD '{file_name}'. Returning unsupported message."
+                )
+                text = UNSUPPORTED_FILE_TYPE_CONTENT
+            return [Section(link=link, text=text)]
+
+        # Any remaining supported type: generic unstructured / MarkItDown attempt
+        logger.debug(f"Trying MarkItDown/unstructured fallback for '{file_name}'")
+        text = _extract_docx_pptx_txt(content, file, None)  # generic fallback
+        return [Section(link=link, text=text)]
+
+    except Exception as e:
+        logger.error(
+            f"Error extracting file content for '{file_name}': {e}", exc_info=True
+        )
+        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
+
+
+def _extract_docx_pptx_txt(
+    content: bytes, file: dict[str, str], file_type: str | None
+) -> str:
+    """
+    Extracts text from raw file bytes, returning the first non-empty result of:
+    1. unstructured (only when an API key is configured)
+    2. docx_to_text / pptx_to_text when file_type is "docx" / "pptx"
+    3. MarkItDown; if every step fails, UNSUPPORTED_FILE_TYPE_CONTENT is returned
+    """
+    file_name = file.get("name", file["id"])
+
+    # 1. Try unstructured first
+    if get_unstructured_api_key():
+        try:
+            logger.debug(f"Attempting unstructured extraction for '{file_name}'...")
+            text = unstructured_to_text(io.BytesIO(content), file_name)
+            if text.strip():
+                return text
+            else:
+                logger.warning(f"Unstructured returned empty text for '{file_name}'.")
+        except Exception as e:
+            logger.warning(f"Unstructured extraction failed for '{file_name}': {e}")
+
+    # 2. If format is docx or pptx, try direct extraction methods
+    if file_type == "docx":
+        try:
+            logger.debug(f"Trying docx_to_text for '{file_name}'...")
+            text = docx_to_text(file=io.BytesIO(content))
+            if text.strip():
+                return text
+            else:
+                logger.warning(f"docx_to_text returned empty for '{file_name}'.")
+        except Exception as e:
+            logger.warning(f"docx_to_text failed for '{file_name}': {e}")
+
+    if file_type == "pptx":
+        try:
+            logger.debug(f"Trying pptx_to_text for '{file_name}'...")
+            text = pptx_to_text(file=io.BytesIO(content))
+            if text.strip():
+                return text
+            else:
+                logger.warning(f"pptx_to_text returned empty for '{file_name}'.")
+        except Exception as e:
+            logger.warning(f"pptx_to_text failed for '{file_name}': {e}")
+
+    # 3. Fallback to MarkItDown; in-memory bytes need convert_stream, not convert
+    try:
+        logger.debug(f"Falling back to MarkItDown for '{file_name}'...")
+        ext = f".{file_type}" if file_type else None  # None: MarkItDown sniffs the type
+        result = MarkItDown().convert_stream(io.BytesIO(content), file_extension=ext)
+        if result and result.text_content and result.text_content.strip():
+            return result.text_content
+        else:
+            logger.warning(f"MarkItDown returned empty text for '{file_name}'.")
+    except Exception as e:
+        logger.error(
+            f"MarkItDown conversion failed for '{file_name}': {e}", exc_info=True
+        )
+
+    # If all methods fail or return empty, return unsupported message
+    logger.error(
+        f"All extraction methods failed for '{file_name}', returning unsupported file message."
+    )
+    return UNSUPPORTED_FILE_TYPE_CONTENT


 def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
--- a/backend/onyx/file_processing/extract_file_text.py
+++ b/backend/onyx/file_processing/extract_file_text.py
@@ -14,10 +14,9 @@ from typing import IO

 import chardet
 import docx  # type: ignore
-import openpyxl  # type: ignore
 import pptx  # type: ignore
-from docx import Document
 from fastapi import UploadFile
+from markitdown import MarkItDown  # type: ignore
 from pypdf import PdfReader
 from pypdf.errors import PdfStreamError

@@ -60,6 +59,9 @@ VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
    ".html",
 ]

+# These are the file extensions that we use markitdown for
+MARKITDOWN_FILE_EXTENSIONS = [".docx", ".pptx", ".xlsx"]
+

 def is_text_file_extension(file_name: str) -> bool:
    return any(file_name.endswith(ext) for ext in PLAIN_TEXT_FILE_EXTENSIONS)
@@ -74,6 +76,10 @@ def is_valid_file_ext(ext: str) -> bool:
    return ext in VALID_FILE_EXTENSIONS


+def is_markitdown_file_ext(ext: str) -> bool:
+    return ext in MARKITDOWN_FILE_EXTENSIONS  # exact, case-sensitive match, e.g. ".docx"
+
+
 def is_text_file(file: IO[bytes]) -> bool:
    """
    checks if the first 1024 bytes only contain printable or whitespace characters
@@ -185,13 +191,6 @@ def read_text_file(
    return file_content_raw, metadata


-def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
-    """Extract text from a PDF file."""
-    # Return only the extracted text from read_pdf_file
-    text, _ = read_pdf_file(file, pdf_pass)
-    return text
-
-
 def read_pdf_file(
    file: IO[Any],
    pdf_pass: str | None = None,
@@ -299,16 +298,11 @@ def pptx_to_text(file: IO[Any]) -> str:
    return TEXT_SECTION_SEPARATOR.join(text_content)


-def xlsx_to_text(file: IO[Any]) -> str:
-    workbook = openpyxl.load_workbook(file, read_only=True)
-    text_content = []
-    for sheet in workbook.worksheets:
-        sheet_string = "\n".join(
-            ",".join(map(str, row))
-            for row in sheet.iter_rows(min_row=1, values_only=True)
-        )
-        text_content.append(sheet_string)
-    return TEXT_SECTION_SEPARATOR.join(text_content)
+def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
+    """Extract text from a PDF file; pdf_pass is forwarded to read_pdf_file."""
+    # read_pdf_file returns a pair; only its first element (the text) is needed here
+    text, _ = read_pdf_file(file, pdf_pass)
+    return text


 def eml_to_text(file: IO[Any]) -> str:
@@ -346,9 +340,6 @@ def extract_file_text(
 ) -> str:
    extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
        ".pdf": pdf_to_text,
-        ".docx": docx_to_text,
-        ".pptx": pptx_to_text,
-        ".xlsx": xlsx_to_text,
        ".eml": eml_to_text,
        ".epub": epub_to_text,
        ".html": parse_html_page_basic,
@@ -358,6 +349,8 @@ def extract_file_text(
        if get_unstructured_api_key():
            return unstructured_to_text(file, file_name)

+        md = MarkItDown()
+
        if file_name or extension:
            if extension is not None:
                final_extension = extension
@@ -365,6 +358,12 @@ def extract_file_text(
                final_extension = get_file_ext(file_name)

            if is_valid_file_ext(final_extension):
+                if is_markitdown_file_ext(final_extension):
+                    with BytesIO(file.read()) as file_like_object:
+                        result = md.convert_stream(
+                            file_like_object, file_extension=final_extension
+                        )
+                    return result.text_content
                return extension_to_function.get(final_extension, file_io_to_text)(file)

        # Either the file somehow has no name or the extension is not one that we recognize
@@ -382,29 +381,37 @@ def extract_file_text(
        return ""


-def convert_docx_to_txt(
+def convert_docx_to_markdown(
     file: UploadFile, file_store: FileStore, file_path: str
 ) -> None:
-    file.file.seek(0)
-    docx_content = file.file.read()
-    doc = Document(BytesIO(docx_content))
+    """Save an uploaded DOCX's extracted text as text/plain; failures raise RuntimeError."""
+    try:
+        file.file.seek(0)  # rewind: the caller may already have consumed the stream

-    # Extract text from the document
-    full_text = []
-    for para in doc.paragraphs:
-        full_text.append(para.text)
+        if not file.file.read():
+            raise ValueError(f"File {file.filename} is empty")

-    # Join the extracted text
-    text_content = "\n".join(full_text)
+        # Rewind again so extract_file_text reads the document from the start
+        file.file.seek(0)

-    txt_file_path = docx_to_txt_filename(file_path)
-    file_store.save_file(
-        file_name=txt_file_path,
-        content=BytesIO(text_content.encode("utf-8")),
-        display_name=file.filename,
-        file_origin=FileOrigin.CONNECTOR,
-        file_type="text/plain",
-    )
+        text_content = extract_file_text(
+            file=file.file, file_name=file.filename or "", extension=".docx"
+        )
+
+        if not text_content:
+            raise ValueError(f"Failed to extract text from {file.filename}")
+
+        txt_file_path = docx_to_txt_filename(file_path)
+        file_store.save_file(
+            file_name=txt_file_path,
+            content=BytesIO(text_content.encode("utf-8")),
+            display_name=file.filename,
+            file_origin=FileOrigin.CONNECTOR,
+            file_type="text/plain",
+        )
+    except Exception as e:
+        logger.error(f"Error converting DOCX to Markdown: {str(e)}")
+        raise RuntimeError(f"Failed to process file {file.filename}: {str(e)}") from e


 def docx_to_txt_filename(file_path: str) -> str:
--- a/backend/onyx/server/documents/connector.py
+++ b/backend/onyx/server/documents/connector.py
@@ -87,7 +87,7 @@ from onyx.db.models import SearchSettings
 from onyx.db.models import User
 from onyx.db.search_settings import get_current_search_settings
 from onyx.db.search_settings import get_secondary_search_settings
-from onyx.file_processing.extract_file_text import convert_docx_to_txt
+from onyx.file_processing.extract_file_text import convert_docx_to_markdown
 from onyx.file_store.file_store import get_default_file_store
 from onyx.key_value_store.interface import KvKeyNotFoundError
 from onyx.redis.redis_connector import RedisConnector
@@ -396,11 +396,12 @@ def upload_files(
                file_origin=FileOrigin.CONNECTOR,
                file_type=file.content_type or "text/plain",
            )
+            file.file.seek(0)

            if file.content_type and file.content_type.startswith(
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            ):
-                convert_docx_to_txt(file, file_store, file_path)
+                convert_docx_to_markdown(file, file_store, file_path)

    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e))
--- a/backend/requirements/default.txt
+++ b/backend/requirements/default.txt
@@ -81,4 +81,5 @@ stripe==10.12.0
 urllib3==2.2.3
 mistune==0.8.4
 sentry-sdk==2.14.0
-prometheus_client==0.21.0
+prometheus_client==0.21.0
+markitdown==0.0.1a3
--- a/web/src/components/chat_search/MinimalMarkdown.tsx
+++ b/web/src/components/chat_search/MinimalMarkdown.tsx
@@ -6,6 +6,7 @@ import {
 } from "@/app/chat/message/MemoizedTextComponents";
 import React, { useMemo } from "react";
 import ReactMarkdown from "react-markdown";
+import rehypePrism from "rehype-prism-plus";
 import remarkGfm from "remark-gfm";

 interface MinimalMarkdownProps {
@@ -35,9 +36,10 @@ export const MinimalMarkdown: React.FC<MinimalMarkdownProps> = ({

  return (
    <ReactMarkdown
-      className={`w-full text-wrap break-word ${className}`}
+      className={`prose max-w-full text-base ${className}`}
      components={markdownComponents}
      remarkPlugins={[remarkGfm]}
+      rehypePlugins={[[rehypePrism, { ignoreMissing: true }]]}
    >
      {content}
    </ReactMarkdown>
--- a/web/src/components/chat_search/TextView.tsx
+++ b/web/src/components/chat_search/TextView.tsx
@@ -21,11 +21,11 @@ export default function TextView({
  onClose,
 }: TextViewProps) {
  const [zoom, setZoom] = useState(100);
-  const [fileContent, setFileContent] = useState<string>("");
-  const [fileUrl, setFileUrl] = useState<string>("");
-  const [fileName, setFileName] = useState<string>("");
+  const [fileContent, setFileContent] = useState("");
+  const [fileUrl, setFileUrl] = useState("");
+  const [fileName, setFileName] = useState("");
  const [isLoading, setIsLoading] = useState(true);
-  const [fileType, setFileType] = useState<string>("application/octet-stream");
+  const [fileType, setFileType] = useState("application/octet-stream");

  const isMarkdownFormat = (mimeType: string): boolean => {
    const markdownFormats = [
@@ -51,18 +51,17 @@ export default function TextView({

  const fetchFile = useCallback(async () => {
    setIsLoading(true);
-    const fileId = presentingDocument.document_id.split("__")[1];
    try {
+      const fileId = presentingDocument.document_id.split("__")[1];
      const response = await fetch(
-        `/api/chat/file/${encodeURIComponent(fileId)}`,
-        {
-          method: "GET",
-        }
+        `/api/chat/file/${encodeURIComponent(fileId)}`
      );
      const blob = await response.blob();
+
      const url = window.URL.createObjectURL(blob);
      setFileUrl(url);
      setFileName(presentingDocument.semantic_identifier || "document");
+
      const contentType =
        response.headers.get("Content-Type") || "application/octet-stream";
      setFileType(contentType);
@@ -70,9 +69,28 @@ export default function TextView({
      if (isMarkdownFormat(blob.type)) {
        const text = await blob.text();
        setFileContent(text);
+      } else if (blob.type === "application/octet-stream") {
+        try {
+          const text = await blob.text();
+          let nonPrintingCount = 0;
+          for (let i = 0; i < text.length; i++) {
+            const code = text.charCodeAt(i);
+            if (code < 32 && ![9, 10, 13].includes(code)) {
+              nonPrintingCount++;
+            }
+          }
+          const ratio = nonPrintingCount / text.length;
+
+          if (ratio < 0.05) {
+            setFileContent(text);
+            setFileType("text/plain");
+          }
+        } catch (err) {
+          console.error("Failed to parse octet-stream as text", err);
+        }
      }
-    } catch (error) {
-      console.error("Error fetching file:", error);
+    } catch (err) {
+      console.error("Error fetching file:", err);
    } finally {
      setTimeout(() => {
        setIsLoading(false);
@@ -137,7 +155,7 @@ export default function TextView({
              </div>
            ) : (
              <div
-                className={`w-full h-full transform origin-center transition-transform duration-300 ease-in-out`}
+                className="w-full h-full transform origin-center transition-transform duration-300 ease-in-out"
                style={{ transform: `scale(${zoom / 100})` }}
              >
                {isSupportedIframeFormat(fileType) ? (
@@ -146,7 +164,7 @@ export default function TextView({
                    className="w-full h-full border-none"
                    title="File Viewer"
                  />
-                ) : isMarkdownFormat(fileType) ? (
+                ) : isMarkdownFormat(fileType) || fileType === "text/plain" ? (
                  <div className="w-full h-full p-6 overflow-y-scroll overflow-x-hidden">
                    <MinimalMarkdown
                      content={fileContent}
Author	SHA1	Message	Date
pablodanswer	884f9624a3	update	2024-12-19 14:22:13 -08:00
pablodanswer	e2b0924077	= -> ==	2024-12-19 13:43:38 -08:00
pablodanswer	5650e88ecb	cosmetic improvements	2024-12-18 10:48:17 -08:00
pablodanswer	1df3fb5f62	quick nit	2024-12-18 10:10:16 -08:00
pablodanswer	5f7340ebcf	import update	2024-12-18 10:09:18 -08:00
pablodanswer	99dc19ee88	update values	2024-12-18 10:09:18 -08:00
pablodanswer	b64e96a582	update requirements	2024-12-18 10:09:18 -08:00
pablodanswer	f7ce933759	minor nit	2024-12-18 10:09:18 -08:00
pablodanswer	2b3c409081	quick cleanup	2024-12-18 10:09:18 -08:00
pablodanswer	be1d5426e3	minor clean up	2024-12-18 10:09:18 -08:00
pablodanswer	2c9c03b97b	clean up	2024-12-18 10:09:18 -08:00
pablodanswer	1516a43b10	gdrive parsing updates	2024-12-18 10:09:18 -08:00
pablodanswer	5b4f8848d6	quick clean up	2024-12-18 10:09:18 -08:00
pablodanswer	cff48f813a	quick cleanup	2024-12-18 10:09:18 -08:00
pablodanswer	64ca568feb	migrate to markitdown	2024-12-18 10:09:18 -08:00
pablodanswer	298aca158d	minor updates	2024-12-18 10:09:18 -08:00