Mirror of https://github.com/onyx-dot-app/onyx.git
Synced 2026-02-17 07:45:47 +00:00

Compare commits: faster_tex...cloud_debu (9 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 09e6bd3c9c |  |
|  | c1803cdd56 |  |
|  | a5b9c76012 |  |
|  | e9b10e8b41 |  |
|  | a0fa4adb60 |  |
|  | ca9ba925bd |  |
|  | 833cc5c97c |  |
|  | f56fda27c9 |  |
|  | b1e4d4ea8d |  |
@@ -1,3 +1,5 @@
+from typing import Any
+
 from posthog import Posthog
 
 from ee.onyx.configs.app_configs import POSTHOG_API_KEY
@@ -6,13 +8,31 @@ from onyx.utils.logger import setup_logger
 
 
 logger = setup_logger()
 
-posthog = Posthog(project_api_key=POSTHOG_API_KEY, host=POSTHOG_HOST)
+
+def posthog_on_error(error: Any, items: Any) -> None:
+    logger.error(f"PostHog error: {error}, items: {items}")
+
+
+posthog = Posthog(
+    project_api_key=POSTHOG_API_KEY,
+    host=POSTHOG_HOST,
+    debug=True,
+    on_error=posthog_on_error,
+)
 
 
 def event_telemetry(
-    distinct_id: str,
-    event: str,
-    properties: dict | None = None,
+    distinct_id: str, event: str, properties: dict | None = None
 ) -> None:
     logger.info(f"Capturing Posthog event: {distinct_id} {event} {properties}")
-    posthog.capture(distinct_id, event, properties)
+    print("API KEY", POSTHOG_API_KEY)
+    print("HOST", POSTHOG_HOST)
+    try:
+        print(type(distinct_id))
+        print(type(event))
+        print(type(properties))
+        response = posthog.capture(distinct_id, event, properties)
+        posthog.flush()
+        print(response)
+    except Exception as e:
+        logger.error(f"Error capturing Posthog event: {e}")
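The hunk above wires up PostHog's client-side debugging hooks: `debug=True` makes the client log its HTTP traffic, and `on_error` surfaces batch-delivery failures that the background consumer thread otherwise swallows; `flush()` drains the queue before the process exits. A minimal standalone sketch of the same wiring (the key and host values are placeholders, not Onyx's configuration):

```python
from typing import Any

from posthog import Posthog


def posthog_on_error(error: Any, items: Any) -> None:
    # Invoked from the client's consumer thread when a batch fails to send.
    print(f"PostHog error: {error}, items: {items}")


posthog = Posthog(
    project_api_key="phc_placeholder",  # placeholder, not a real key
    host="https://us.i.posthog.com",    # assumed PostHog Cloud host
    debug=True,                         # verbose client-side logging
    on_error=posthog_on_error,          # surface delivery failures
)

posthog.capture("user-123", "debug_event", {"source": "sketch"})
posthog.flush()  # block until the queued batch is sent
```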
@@ -5,6 +5,7 @@ from datetime import datetime
 from datetime import timezone
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
+from typing import cast
 from typing import Dict
 from typing import List
 from typing import Optional
@@ -228,6 +229,11 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
         safe: bool = False,
         request: Optional[Request] = None,
     ) -> User:
+        # We verify the password here to make sure it's valid before we proceed
+        await self.validate_password(
+            user_create.password, cast(schemas.UC, user_create)
+        )
+
         user_count: int | None = None
         referral_source = (
             request.cookies.get("referral_source", None)
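For context, `validate_password` is a fastapi-users hook: it runs before the user row is created and aborts signup by raising `InvalidPasswordException`. A minimal sketch of how such a hook is defined (the length rule below is illustrative, not Onyx's actual policy):

```python
import uuid
from typing import Any

from fastapi_users import BaseUserManager, InvalidPasswordException, UUIDIDMixin


class SketchUserManager(UUIDIDMixin, BaseUserManager[Any, uuid.UUID]):
    async def validate_password(self, password: str, user: Any) -> None:
        # Raising InvalidPasswordException aborts the create() flow.
        if len(password) < 8:  # illustrative policy only
            raise InvalidPasswordException(
                reason="Password must be at least 8 characters long"
            )
```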
@@ -3,7 +3,6 @@ import multiprocessing
 import time
 from typing import Any
 
-import requests
 import sentry_sdk
 from celery import Task
 from celery.app import trace
@@ -23,6 +22,7 @@ from onyx.background.celery.apps.task_formatters import CeleryTaskPlainFormatter
 from onyx.background.celery.celery_utils import celery_is_worker_primary
 from onyx.configs.constants import OnyxRedisLocks
 from onyx.db.engine import get_sqlalchemy_engine
+from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
 from onyx.document_index.vespa_constants import VESPA_CONFIG_SERVER_URL
 from onyx.redis.redis_connector import RedisConnector
 from onyx.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
@@ -262,7 +262,8 @@ def wait_for_vespa(sender: Any, **kwargs: Any) -> None:
     logger.info("Vespa: Readiness probe starting.")
     while True:
         try:
-            response = requests.get(f"{VESPA_CONFIG_SERVER_URL}/state/v1/health")
+            client = get_vespa_http_client()
+            response = client.get(f"{VESPA_CONFIG_SERVER_URL}/state/v1/health")
             response.raise_for_status()
 
             response_dict = response.json()
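The probe loops until Vespa's config server reports healthy; the change only swaps the one-off `requests` call for the shared httpx client. A standalone version of the same loop (the URL, poll interval, and the status-code check are assumptions, not Onyx's exact values):

```python
import time

import httpx

# Assumed default config-server port; Onyx reads this from a constant.
HEALTH_URL = "http://localhost:19071/state/v1/health"


def wait_for_vespa(poll_interval_s: float = 5.0) -> None:
    while True:
        try:
            with httpx.Client(http2=False) as client:
                response = client.get(HEALTH_URL)
                response.raise_for_status()
                if response.json().get("status", {}).get("code") == "up":
                    return  # config server is ready
        except Exception as e:
            print(f"Vespa not ready yet: {e}")
        time.sleep(poll_interval_s)
```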
@@ -13,7 +13,6 @@ from onyx.db.engine import SqlEngine
 from onyx.utils.logger import setup_logger
 from onyx.utils.variable_functionality import fetch_versioned_implementation
-from shared_configs.configs import IGNORED_SYNCING_TENANT_LIST
 from shared_configs.configs import MULTI_TENANT
 
 logger = setup_logger(__name__)
@@ -154,10 +153,6 @@ def on_beat_init(sender: Any, **kwargs: Any) -> None:
     SqlEngine.set_app_name(POSTGRES_CELERY_BEAT_APP_NAME)
     SqlEngine.init_engine(pool_size=2, max_overflow=0)
 
-    # Startup checks are not needed in multi-tenant case
-    if MULTI_TENANT:
-        return
-
     app_base.wait_for_redis(sender, **kwargs)
@@ -61,13 +61,14 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_HEAVY_APP_NAME)
     SqlEngine.init_engine(pool_size=4, max_overflow=12)
 
-    # Startup checks are not needed in multi-tenant case
-    if MULTI_TENANT:
-        return
-
     app_base.wait_for_redis(sender, **kwargs)
     app_base.wait_for_db(sender, **kwargs)
     app_base.wait_for_vespa(sender, **kwargs)
 
+    # Less startup checks in multi-tenant case
+    if MULTI_TENANT:
+        return
+
     app_base.on_secondary_worker_init(sender, **kwargs)
@@ -62,13 +62,14 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME)
     SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=sender.concurrency)
 
-    # Startup checks are not needed in multi-tenant case
-    if MULTI_TENANT:
-        return
-
     app_base.wait_for_redis(sender, **kwargs)
     app_base.wait_for_db(sender, **kwargs)
     app_base.wait_for_vespa(sender, **kwargs)
 
+    # Less startup checks in multi-tenant case
+    if MULTI_TENANT:
+        return
+
     app_base.on_secondary_worker_init(sender, **kwargs)
@@ -60,13 +60,15 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_LIGHT_APP_NAME)
     SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8)
-    # Startup checks are not needed in multi-tenant case
-    if MULTI_TENANT:
-        return
-
+
     app_base.wait_for_redis(sender, **kwargs)
     app_base.wait_for_db(sender, **kwargs)
     app_base.wait_for_vespa(sender, **kwargs)
 
+    # Less startup checks in multi-tenant case
+    if MULTI_TENANT:
+        return
+
     app_base.on_secondary_worker_init(sender, **kwargs)
@@ -84,14 +84,14 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME)
     SqlEngine.init_engine(pool_size=8, max_overflow=0)
 
-    # Startup checks are not needed in multi-tenant case
-    if MULTI_TENANT:
-        return
-
     app_base.wait_for_redis(sender, **kwargs)
     app_base.wait_for_db(sender, **kwargs)
     app_base.wait_for_vespa(sender, **kwargs)
 
+    # Less startup checks in multi-tenant case
+    if MULTI_TENANT:
+        return
+
     logger.info("Running as the primary celery worker.")
 
     # This is singleton work that should be done on startup exactly once
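All four workers apply the same reordering: the `MULTI_TENANT` early-return moves from before the dependency waits to after them, so multi-tenant deployments still block on Redis, Postgres, and Vespa but skip the single-tenant-only initialization. A condensed sketch of the resulting shape (the `wait_*` helpers are stand-ins for `app_base`'s real ones):

```python
from typing import Any

from celery.signals import worker_init

MULTI_TENANT = False  # stand-in for shared_configs.configs.MULTI_TENANT


def wait_for_redis(sender: Any, **kwargs: Any) -> None: ...
def wait_for_db(sender: Any, **kwargs: Any) -> None: ...
def wait_for_vespa(sender: Any, **kwargs: Any) -> None: ...
def on_secondary_worker_init(sender: Any, **kwargs: Any) -> None: ...


@worker_init.connect
def on_worker_init(sender: Any, **kwargs: Any) -> None:
    # Backing services must be reachable in every deployment mode.
    wait_for_redis(sender, **kwargs)
    wait_for_db(sender, **kwargs)
    wait_for_vespa(sender, **kwargs)

    # Multi-tenant deployments skip the remaining single-tenant setup.
    if MULTI_TENANT:
        return

    on_secondary_worker_init(sender, **kwargs)
```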
@@ -29,7 +29,6 @@ from onyx.configs.constants import OnyxCeleryPriority
 from onyx.configs.constants import OnyxCeleryQueues
 from onyx.configs.constants import OnyxCeleryTask
 from onyx.configs.constants import OnyxRedisLocks
-from onyx.configs.constants import OnyxRedisSignals
 from onyx.db.connector import mark_ccpair_with_indexing_trigger
 from onyx.db.connector_credential_pair import fetch_connector_credential_pairs
 from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
@@ -176,7 +175,7 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
 
     # we need to use celery's redis client to access its redis data
     # (which lives on a different db number)
-    redis_client_celery: Redis = self.app.broker_connection().channel().client  # type: ignore
+    # redis_client_celery: Redis = self.app.broker_connection().channel().client  # type: ignore
 
     lock_beat: RedisLock = redis_client.lock(
         OnyxRedisLocks.CHECK_INDEXING_BEAT_LOCK,
@@ -319,20 +318,23 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
                     attempt.id, db_session, failure_reason=failure_reason
                 )
 
-        # we want to run this less frequently than the overall task
-        if not redis_client.exists(OnyxRedisSignals.VALIDATE_INDEXING_FENCES):
-            # clear any indexing fences that don't have associated celery tasks in progress
-            # tasks can be in the queue in redis, in reserved tasks (prefetched by the worker),
-            # or be currently executing
-            try:
-                task_logger.info("Validating indexing fences...")
-                validate_indexing_fences(
-                    tenant_id, self.app, redis_client, redis_client_celery, lock_beat
-                )
-            except Exception:
-                task_logger.exception("Exception while validating indexing fences")
-
-            redis_client.set(OnyxRedisSignals.VALIDATE_INDEXING_FENCES, 1, ex=60)
+        # rkuo: The following code logically appears to work, but the celery inspect code may be unstable
+        # turning off for the moment to see if it helps cloud stability
+
+        # we want to run this less frequently than the overall task
+        # if not redis_client.exists(OnyxRedisSignals.VALIDATE_INDEXING_FENCES):
+        #     # clear any indexing fences that don't have associated celery tasks in progress
+        #     # tasks can be in the queue in redis, in reserved tasks (prefetched by the worker),
+        #     # or be currently executing
+        #     try:
+        #         task_logger.info("Validating indexing fences...")
+        #         validate_indexing_fences(
+        #             tenant_id, self.app, redis_client, redis_client_celery, lock_beat
+        #         )
+        #     except Exception:
+        #         task_logger.exception("Exception while validating indexing fences")
+
+        #     redis_client.set(OnyxRedisSignals.VALIDATE_INDEXING_FENCES, 1, ex=60)
 
     except SoftTimeLimitExceeded:
         task_logger.info(
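The disabled block rate-limits itself with a Redis "signal" key: after a pass it sets a key with a 60-second TTL, and subsequent beats skip the work while the key still exists. The idiom in isolation (the key name below is illustrative; Onyx uses `OnyxRedisSignals.VALIDATE_INDEXING_FENCES`):

```python
import redis

r = redis.Redis()

SIGNAL_KEY = "signal:validate_indexing_fences"  # illustrative name


def maybe_validate_fences() -> None:
    if r.exists(SIGNAL_KEY):
        return  # ran within the last 60 seconds; skip this beat

    ...  # expensive validation work goes here

    # Suppress re-runs until the TTL expires.
    r.set(SIGNAL_KEY, 1, ex=60)
```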
@@ -4,7 +4,6 @@ from datetime import timezone
 
 from googleapiclient.discovery import build  # type: ignore
 from googleapiclient.errors import HttpError  # type: ignore
-from markitdown import MarkItDown  # type: ignore
 
 from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from onyx.configs.constants import DocumentSource
@@ -27,9 +26,9 @@ from onyx.file_processing.unstructured import get_unstructured_api_key
 from onyx.file_processing.unstructured import unstructured_to_text
 from onyx.utils.logger import setup_logger
 
 
 logger = setup_logger()
 
 
 # these errors don't represent a failure in the connector, but simply files
 # that can't / shouldn't be indexed
 ERRORS_TO_CONTINUE_ON = [
@@ -39,41 +38,177 @@ ERRORS_TO_CONTINUE_ON = [
 ]
 
 
+def _extract_sections_basic(
+    file: dict[str, str], service: GoogleDriveService
+) -> list[Section]:
+    mime_type = file["mimeType"]
+    link = file["webViewLink"]
+
+    if mime_type not in set(item.value for item in GDriveMimeType):
+        # Unsupported file types can still have a title, finding this way is still useful
+        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
+
+    try:
+        if mime_type == GDriveMimeType.SPREADSHEET.value:
+            try:
+                sheets_service = build(
+                    "sheets", "v4", credentials=service._http.credentials
+                )
+                spreadsheet = (
+                    sheets_service.spreadsheets()
+                    .get(spreadsheetId=file["id"])
+                    .execute()
+                )
+
+                sections = []
+                for sheet in spreadsheet["sheets"]:
+                    sheet_name = sheet["properties"]["title"]
+                    sheet_id = sheet["properties"]["sheetId"]
+
+                    # Get sheet dimensions
+                    grid_properties = sheet["properties"].get("gridProperties", {})
+                    row_count = grid_properties.get("rowCount", 1000)
+                    column_count = grid_properties.get("columnCount", 26)
+
+                    # Convert column count to letter (e.g., 26 -> Z, 27 -> AA)
+                    end_column = ""
+                    while column_count:
+                        column_count, remainder = divmod(column_count - 1, 26)
+                        end_column = chr(65 + remainder) + end_column
+
+                    range_name = f"'{sheet_name}'!A1:{end_column}{row_count}"
+
+                    try:
+                        result = (
+                            sheets_service.spreadsheets()
+                            .values()
+                            .get(spreadsheetId=file["id"], range=range_name)
+                            .execute()
+                        )
+                        values = result.get("values", [])
+
+                        if values:
+                            text = f"Sheet: {sheet_name}\n"
+                            for row in values:
+                                text += "\t".join(str(cell) for cell in row) + "\n"
+                            sections.append(
+                                Section(
+                                    link=f"{link}#gid={sheet_id}",
+                                    text=text,
+                                )
+                            )
+                    except HttpError as e:
+                        logger.warning(
+                            f"Error fetching data for sheet '{sheet_name}': {e}"
+                        )
+                        continue
+                return sections
+
+            except Exception as e:
+                logger.warning(
+                    f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'."
+                    " Falling back to basic extraction."
+                )
+
+        if mime_type in [
+            GDriveMimeType.DOC.value,
+            GDriveMimeType.PPT.value,
+            GDriveMimeType.SPREADSHEET.value,
+        ]:
+            export_mime_type = (
+                "text/plain"
+                if mime_type != GDriveMimeType.SPREADSHEET.value
+                else "text/csv"
+            )
+            text = (
+                service.files()
+                .export(fileId=file["id"], mimeType=export_mime_type)
+                .execute()
+                .decode("utf-8")
+            )
+            return [Section(link=link, text=text)]
+
+        elif mime_type in [
+            GDriveMimeType.PLAIN_TEXT.value,
+            GDriveMimeType.MARKDOWN.value,
+        ]:
+            return [
+                Section(
+                    link=link,
+                    text=service.files()
+                    .get_media(fileId=file["id"])
+                    .execute()
+                    .decode("utf-8"),
+                )
+            ]
+        if mime_type in [
+            GDriveMimeType.WORD_DOC.value,
+            GDriveMimeType.POWERPOINT.value,
+            GDriveMimeType.PDF.value,
+        ]:
+            response = service.files().get_media(fileId=file["id"]).execute()
+            if get_unstructured_api_key():
+                return [
+                    Section(
+                        link=link,
+                        text=unstructured_to_text(
+                            file=io.BytesIO(response),
+                            file_name=file.get("name", file["id"]),
+                        ),
+                    )
+                ]
+
+            if mime_type == GDriveMimeType.WORD_DOC.value:
+                return [
+                    Section(link=link, text=docx_to_text(file=io.BytesIO(response)))
+                ]
+            elif mime_type == GDriveMimeType.PDF.value:
+                text, _ = read_pdf_file(file=io.BytesIO(response))
+                return [Section(link=link, text=text)]
+            elif mime_type == GDriveMimeType.POWERPOINT.value:
+                return [
+                    Section(link=link, text=pptx_to_text(file=io.BytesIO(response)))
+                ]
+
+        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
+
+    except Exception:
+        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
+
+
 def convert_drive_item_to_document(
     file: GoogleDriveFileType,
     drive_service: GoogleDriveService,
     docs_service: GoogleDocsService,
 ) -> Document | None:
     """
     Converts a Google Drive file into an internal Document object, extracting
     the text and organizing it into sections. Uses specialized methods for Google Docs
     to preserve structure. Falls back to basic extraction for all other formats.
     """
     try:
-        # Skip shortcuts and folders
+        # Skip files that are shortcuts
         if file.get("mimeType") == DRIVE_SHORTCUT_TYPE:
             logger.info("Ignoring Drive Shortcut Filetype")
             return None
+        # Skip files that are folders
         if file.get("mimeType") == DRIVE_FOLDER_TYPE:
             logger.info("Ignoring Drive Folder Filetype")
             return None
 
         sections: list[Section] = []
 
-        # Special handling for Google Docs to preserve structure
+        # Special handling for Google Docs to preserve structure, link
+        # to headers
         if file.get("mimeType") == GDriveMimeType.DOC.value:
             try:
                 sections = get_document_sections(docs_service, file["id"])
             except Exception as e:
                 logger.warning(
-                    f"Exception '{e}' when pulling sections from Google Doc '{file['name']}'. "
-                    "Falling back to basic extraction."
+                    f"Ran into exception '{e}' when pulling sections from Google Doc '{file['name']}'."
+                    " Falling back to basic extraction."
                 )
 
-        # If not a GDoc or GDoc extraction failed
+        # NOTE: this will run for either (1) the above failed or (2) the file is not a Google Doc
         if not sections:
             try:
                 # For all other file types just extract the text
                 sections = _extract_sections_basic(file, drive_service)
 
             except HttpError as e:
                 reason = e.error_details[0]["reason"] if e.error_details else e.reason
                 message = e.error_details[0]["message"] if e.error_details else e.reason
@@ -82,8 +217,8 @@ def convert_drive_item_to_document(
                     f"Could not export file '{file['name']}' due to '{message}', skipping..."
                 )
                 return None
-                raise
 
+            raise
         if not sections:
             return None
 
@@ -103,248 +238,9 @@ def convert_drive_item_to_document(
     except Exception as e:
         if not CONTINUE_ON_CONNECTOR_FAILURE:
             raise e
 
         logger.exception("Ran into exception when pulling a file from Google Drive")
-        return None
-
-
-def _extract_sections_basic(
-    file: GoogleDriveFileType, service: GoogleDriveService
-) -> list[Section]:
-    """
-    Extracts text from a Google Drive file based on its MIME type.
-    """
-    mime_type = file["mimeType"]
-    link = file["webViewLink"]
-
-    # Handle unsupported MIME types
-    if mime_type not in {item.value for item in GDriveMimeType}:
-        logger.debug(
-            f"Unsupported MIME type '{mime_type}' for file '{file.get('name')}'"
-        )
-        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
-
-    # Specialized handling for Google Sheets
-    if mime_type == GDriveMimeType.SPREADSHEET.value:
-        try:
-            return _extract_google_sheets(file, service)
-        except Exception as e:
-            logger.warning(
-                f"Error extracting data from Google Sheet '{file['name']}': {e}. "
-                "Falling back to basic content extraction."
-            )
-
-    # For other types
-    return _extract_general_content(file, service)
-
-
-def _extract_google_sheets(
-    file: dict[str, str], service: GoogleDriveService
-) -> list[Section]:
-    """
-    Specialized extraction logic for Google Sheets.
-    Iterates through each sheet, fetches all data, and returns a list of Section objects.
-    """
-    link = file["webViewLink"]
-    file_id = file["id"]
-
-    sheets_service = build("sheets", "v4", credentials=service._http.credentials)
-    spreadsheet = sheets_service.spreadsheets().get(spreadsheetId=file_id).execute()
-
-    sections: list[Section] = []
-    for sheet in spreadsheet.get("sheets", []):
-        sheet_name = sheet["properties"]["title"]
-        sheet_id = sheet["properties"]["sheetId"]
-
-        grid_props = sheet["properties"].get("gridProperties", {})
-        row_count = grid_props.get("rowCount", 1000)
-        column_count = grid_props.get("columnCount", 26)
-
-        # Convert a number to a spreadsheet column letter (1->A, 26->Z, 27->AA,...)
-        end_column = ""
-        col_count = column_count
-        while col_count > 0:
-            col_count, remainder = divmod(col_count - 1, 26)
-            end_column = chr(65 + remainder) + end_column
-
-        range_name = f"'{sheet_name}'!A1:{end_column}{row_count}"
-
-        try:
-            result = (
-                sheets_service.spreadsheets()
-                .values()
-                .get(spreadsheetId=file_id, range=range_name)
-                .execute()
-            )
-            values = result.get("values", [])
-
-            if values:
-                text = f"Sheet: {sheet_name}\n"
-                for row in values:
-                    text += "\t".join(str(cell) for cell in row) + "\n"
-
-                sections.append(Section(link=f"{link}#gid={sheet_id}", text=text))
-        except HttpError as e:
-            logger.warning(
-                f"Error fetching data for sheet '{sheet_name}' in '{file.get('name')}' : {e}"
-            )
-            continue
-
-    return sections
-
-
-def _extract_general_content(
-    file: dict[str, str], service: GoogleDriveService
-) -> list[Section]:
-    """
-    Extracts general file content for files other than Google Sheets.
-    - PDF: Revert to read_pdf_file
-    - DOCX: Unstructured, then docx_to_text, then MarkItDown.
-    - PPTX: Unstructured, then pptx_to_text, then MarkItDown.
-    - TXT: Decode the content; if empty, log.
-    - Google Docs/Slides: Export as text/plain and return directly.
-    """
-    link = file["webViewLink"]
-    mime_type = file["mimeType"]
-    file_id = file["id"]
-    file_name = file.get("name", file_id)
-
-    try:
-        # Google Docs and Google Slides (internal GDrive formats)
-        if (
-            mime_type == GDriveMimeType.DOC.value
-            or mime_type == GDriveMimeType.PPT.value
-        ):
-            logger.debug(f"Extracting Google-native doc/presentation: {file_name}")
-            export_mime_type = "text/plain"
-            content = (
-                service.files()
-                .export(fileId=file_id, mimeType=export_mime_type)
-                .execute()
-            )
-            text = content.decode("utf-8", errors="replace").strip()
-            if not text:
-                logger.warning(
-                    f"No text extracted from Google Docs/Slides file '{file_name}'."
-                )
-                text = UNSUPPORTED_FILE_TYPE_CONTENT
-            return [Section(link=link, text=text)]
-
-        # For all other formats, get raw content
-        content = service.files().get_media(fileId=file_id).execute()
-
-        if mime_type == GDriveMimeType.PDF.value:
-            # Revert to original PDF extraction
-            logger.debug(f"Extracting PDF content for '{file_name}'")
-            text, _ = read_pdf_file(file=io.BytesIO(content))
-            if not text:
-                logger.warning(
-                    f"No text extracted from PDF '{file_name}' with read_pdf_file."
-                )
-                text = UNSUPPORTED_FILE_TYPE_CONTENT
-            return [Section(link=link, text=text)]
-
-        if mime_type == GDriveMimeType.WORD_DOC.value:
-            logger.debug(f"Extracting DOCX content for '{file_name}'")
-            return [
-                Section(link=link, text=_extract_docx_pptx_txt(content, file, "docx"))
-            ]
-
-        if mime_type == GDriveMimeType.POWERPOINT.value:
-            logger.debug(f"Extracting PPTX content for '{file_name}'")
-            return [
-                Section(link=link, text=_extract_docx_pptx_txt(content, file, "pptx"))
-            ]
-
-        if (
-            mime_type == GDriveMimeType.PLAIN_TEXT.value
-            or mime_type == GDriveMimeType.MARKDOWN.value
-        ):
-            logger.debug(f"Extracting plain text/markdown content for '{file_name}'")
-            text = content.decode("utf-8", errors="replace").strip()
-            if not text:
-                logger.warning(
-                    f"No text extracted from TXT/MD '{file_name}'. Returning unsupported message."
-                )
-                text = UNSUPPORTED_FILE_TYPE_CONTENT
-            return [Section(link=link, text=text)]
-
-        # If we reach here, it's some other format supported by MarkItDown/unstructured
-        logger.debug(f"Trying MarkItDown/unstructured fallback for '{file_name}'")
-        text = _extract_docx_pptx_txt(content, file, None)  # generic fallback
-        return [Section(link=link, text=text)]
-
-    except Exception as e:
-        logger.error(
-            f"Error extracting file content for '{file_name}': {e}", exc_info=True
-        )
-        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
-
-
-def _extract_docx_pptx_txt(
-    content: bytes, file: dict[str, str], file_type: str | None
-) -> str:
-    """
-    Attempts to extract text from DOCX, PPTX, or any supported format using:
-    1. unstructured (if configured)
-    2. docx_to_text/pptx_to_text if known format
-    3. MarkItDown fallback
-    """
-    file_name = file.get("name", file["id"])
-
-    # 1. Try unstructured first
-    if get_unstructured_api_key():
-        try:
-            logger.debug(f"Attempting unstructured extraction for '{file_name}'...")
-            text = unstructured_to_text(io.BytesIO(content), file_name)
-            if text.strip():
-                return text
-            else:
-                logger.warning(f"Unstructured returned empty text for '{file_name}'.")
-        except Exception as e:
-            logger.warning(f"Unstructured extraction failed for '{file_name}': {e}")
-
-    # 2. If format is docx or pptx, try direct extraction methods
-    if file_type == "docx":
-        try:
-            logger.debug(f"Trying docx_to_text for '{file_name}'...")
-            text = docx_to_text(file=io.BytesIO(content))
-            if text.strip():
-                return text
-            else:
-                logger.warning(f"docx_to_text returned empty for '{file_name}'.")
-        except Exception as e:
-            logger.warning(f"docx_to_text failed for '{file_name}': {e}")
-
-    if file_type == "pptx":
-        try:
-            logger.debug(f"Trying pptx_to_text for '{file_name}'...")
-            text = pptx_to_text(file=io.BytesIO(content))
-            if text.strip():
-                return text
-            else:
-                logger.warning(f"pptx_to_text returned empty for '{file_name}'.")
-        except Exception as e:
-            logger.warning(f"pptx_to_text failed for '{file_name}': {e}")
-
-    # 3. Fallback to MarkItDown
-    try:
-        logger.debug(f"Falling back to MarkItDown for '{file_name}'...")
-        md = MarkItDown()
-        result = md.convert(io.BytesIO(content))
-        if result and result.text_content and result.text_content.strip():
-            return result.text_content
-        else:
-            logger.warning(f"MarkItDown returned empty text for '{file_name}'.")
-    except Exception as e:
-        logger.error(
-            f"MarkItDown conversion failed for '{file_name}': {e}", exc_info=True
-        )
-
-    # If all methods fail or return empty, return unsupported message
-    logger.error(
-        f"All extraction methods failed for '{file_name}', returning unsupported file message."
-    )
-    return UNSUPPORTED_FILE_TYPE_CONTENT
+        return None
 
 
 def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
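Both versions of the Sheets logic above share the same core step: converting a 1-based column count into an A1-notation column letter via repeated divmod by 26 (bijective base 26 — the `- 1` before each divmod is what keeps Z from carrying into a second letter, since there is no zero digit). A runnable worked example:

```python
def column_letter(column_count: int) -> str:
    # 1 -> A, 26 -> Z, 27 -> AA, 52 -> AZ, 703 -> AAA
    end_column = ""
    while column_count > 0:
        column_count, remainder = divmod(column_count - 1, 26)
        end_column = chr(65 + remainder) + end_column  # 65 == ord("A")
    return end_column


for n in (1, 26, 27, 52, 703):
    print(n, column_letter(n))  # A, Z, AA, AZ, AAA

# The connector then bounds the fetch with an A1 range per sheet:
print(f"'Sheet1'!A1:{column_letter(26)}1000")  # 'Sheet1'!A1:Z1000
```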
@@ -535,7 +535,7 @@ class VespaIndex(DocumentIndex):
         if self.secondary_index_name:
             index_names.append(self.secondary_index_name)
 
-        with get_vespa_http_client() as http_client:
+        with get_vespa_http_client(http2=False) as http_client:
             for index_name in index_names:
                 params = httpx.QueryParams(
                     {
@@ -546,8 +546,12 @@ class VespaIndex(DocumentIndex):
 
             while True:
                 try:
+                    vespa_url = (
+                        f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}"
+                    )
+                    logger.debug(f'update_single PUT on URL "{vespa_url}"')
                     resp = http_client.put(
-                        f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}",
+                        vespa_url,
                         params=params,
                         headers={"Content-Type": "application/json"},
                         json=update_dict,
@@ -619,7 +623,7 @@ class VespaIndex(DocumentIndex):
         if self.secondary_index_name:
             index_names.append(self.secondary_index_name)
 
-        with get_vespa_http_client() as http_client:
+        with get_vespa_http_client(http2=False) as http_client:
             for index_name in index_names:
                 params = httpx.QueryParams(
                     {
@@ -630,8 +634,12 @@ class VespaIndex(DocumentIndex):
 
             while True:
                 try:
+                    vespa_url = (
+                        f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}"
+                    )
+                    logger.debug(f'delete_single DELETE on URL "{vespa_url}"')
                     resp = http_client.delete(
-                        f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}",
+                        vespa_url,
                         params=params,
                     )
                     resp.raise_for_status()
@@ -55,7 +55,9 @@ def remove_invalid_unicode_chars(text: str) -> str:
     return _illegal_xml_chars_RE.sub("", text)
 
 
-def get_vespa_http_client(no_timeout: bool = False) -> httpx.Client:
+def get_vespa_http_client(
+    no_timeout: bool = False, http2: bool = False
+) -> httpx.Client:
     """
     Configure and return an HTTP client for communicating with Vespa,
     including authentication if needed.
@@ -67,5 +69,5 @@ def get_vespa_http_client(no_timeout: bool = False) -> httpx.Client:
         else None,
         verify=False if not MANAGED_VESPA else True,
         timeout=None if no_timeout else VESPA_REQUEST_TIMEOUT,
-        http2=True,
+        http2=http2,
     )
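One detail worth noting about the new `http2` parameter: httpx only speaks HTTP/2 when the optional h2 dependency is installed (`pip install "httpx[http2]"`), and the protocol is negotiated per client. A minimal sketch of the pattern (the timeout value is a placeholder for `VESPA_REQUEST_TIMEOUT`):

```python
import httpx


def make_client(no_timeout: bool = False, http2: bool = False) -> httpx.Client:
    return httpx.Client(
        timeout=None if no_timeout else 10.0,  # placeholder for the real constant
        http2=http2,  # HTTP/2 is now opt-in instead of always on
    )


with make_client(http2=False) as client:
    resp = client.get("https://www.example.com/")
    print(resp.http_version)  # "HTTP/1.1" with http2=False
```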
@@ -14,9 +14,10 @@ from typing import IO
 
 import chardet
 import docx  # type: ignore
+import openpyxl  # type: ignore
 import pptx  # type: ignore
+from docx import Document
 from fastapi import UploadFile
-from markitdown import MarkItDown  # type: ignore
 from pypdf import PdfReader
 from pypdf.errors import PdfStreamError
 
@@ -59,9 +60,6 @@ VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
     ".html",
 ]
 
-# These are the file extensions that we use markitdown for
-MARKITDOWN_FILE_EXTENSIONS = [".docx", ".pptx", ".xlsx"]
-
 
 def is_text_file_extension(file_name: str) -> bool:
     return any(file_name.endswith(ext) for ext in PLAIN_TEXT_FILE_EXTENSIONS)
@@ -76,10 +74,6 @@ def is_valid_file_ext(ext: str) -> bool:
     return ext in VALID_FILE_EXTENSIONS
 
 
-def is_markitdown_file_ext(ext: str) -> bool:
-    return ext in MARKITDOWN_FILE_EXTENSIONS
-
-
 def is_text_file(file: IO[bytes]) -> bool:
     """
     checks if the first 1024 bytes only contain printable or whitespace characters
@@ -191,6 +185,13 @@ def read_text_file(
     return file_content_raw, metadata
 
 
+def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
+    """Extract text from a PDF file."""
+    # Return only the extracted text from read_pdf_file
+    text, _ = read_pdf_file(file, pdf_pass)
+    return text
+
+
 def read_pdf_file(
     file: IO[Any],
     pdf_pass: str | None = None,
@@ -298,11 +299,16 @@ def pptx_to_text(file: IO[Any]) -> str:
     return TEXT_SECTION_SEPARATOR.join(text_content)
 
 
-def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
-    """Extract text from a PDF file."""
-    # Return only the extracted text from read_pdf_file
-    text, _ = read_pdf_file(file, pdf_pass)
-    return text
+def xlsx_to_text(file: IO[Any]) -> str:
+    workbook = openpyxl.load_workbook(file, read_only=True)
+    text_content = []
+    for sheet in workbook.worksheets:
+        sheet_string = "\n".join(
+            ",".join(map(str, row))
+            for row in sheet.iter_rows(min_row=1, values_only=True)
+        )
+        text_content.append(sheet_string)
+    return TEXT_SECTION_SEPARATOR.join(text_content)
 
 
 def eml_to_text(file: IO[Any]) -> str:
@@ -340,6 +346,9 @@ def extract_file_text(
 ) -> str:
     extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
         ".pdf": pdf_to_text,
+        ".docx": docx_to_text,
+        ".pptx": pptx_to_text,
+        ".xlsx": xlsx_to_text,
         ".eml": eml_to_text,
         ".epub": epub_to_text,
         ".html": parse_html_page_basic,
@@ -349,8 +358,6 @@ def extract_file_text(
     if get_unstructured_api_key():
         return unstructured_to_text(file, file_name)
 
-    md = MarkItDown()
-
     if file_name or extension:
         if extension is not None:
             final_extension = extension
@@ -358,12 +365,6 @@ def extract_file_text(
             final_extension = get_file_ext(file_name)
 
         if is_valid_file_ext(final_extension):
-            if is_markitdown_file_ext(final_extension):
-                with BytesIO(file.read()) as file_like_object:
-                    result = md.convert_stream(
-                        file_like_object, file_extension=final_extension
-                    )
-                    return result.text_content
             return extension_to_function.get(final_extension, file_io_to_text)(file)
 
     # Either the file somehow has no name or the extension is not one that we recognize
@@ -381,37 +382,29 @@ def extract_file_text(
     return ""
 
 
-def convert_docx_to_markdown(
+def convert_docx_to_txt(
     file: UploadFile, file_store: FileStore, file_path: str
 ) -> None:
-    try:
-        # Read the file content
-        file_content = file.file.read()
-
-        if not file_content:
-            raise ValueError(f"File {file.filename} is empty")
-
-        # Reset the file pointer to the beginning
-        file.file.seek(0)
-
-        text_content = extract_file_text(
-            file=file.file, file_name=file.filename or "", extension=".docx"
-        )
-
-        if not text_content:
-            raise ValueError(f"Failed to extract text from {file.filename}")
-
-        txt_file_path = docx_to_txt_filename(file_path)
-        file_store.save_file(
-            file_name=txt_file_path,
-            content=BytesIO(text_content.encode("utf-8")),
-            display_name=file.filename,
-            file_origin=FileOrigin.CONNECTOR,
-            file_type="text/plain",
-        )
-    except Exception as e:
-        logger.error(f"Error converting DOCX to Markdown: {str(e)}")
-        raise RuntimeError(f"Failed to process file {file.filename}: {str(e)}") from e
+    file.file.seek(0)
+    docx_content = file.file.read()
+    doc = Document(BytesIO(docx_content))
+
+    # Extract text from the document
+    full_text = []
+    for para in doc.paragraphs:
+        full_text.append(para.text)
+
+    # Join the extracted text
+    text_content = "\n".join(full_text)
+
+    txt_file_path = docx_to_txt_filename(file_path)
+    file_store.save_file(
+        file_name=txt_file_path,
+        content=BytesIO(text_content.encode("utf-8")),
+        display_name=file.filename,
+        file_origin=FileOrigin.CONNECTOR,
+        file_type="text/plain",
+    )
 
 
 def docx_to_txt_filename(file_path: str) -> str:
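With the MarkItDown branch gone, `extract_file_text` goes back to a plain extension-to-callable dispatch with a text-decode fallback. The pattern in isolation (the stub converters stand in for the real ones):

```python
import io
from typing import IO, Any, Callable


def pdf_to_text(file: IO[Any]) -> str:
    return "pdf text"  # stub


def docx_to_text(file: IO[Any]) -> str:
    return "docx text"  # stub


def file_io_to_text(file: IO[Any]) -> str:
    # Fallback: treat unknown extensions as plain text.
    return file.read().decode("utf-8", errors="replace")


EXTENSION_TO_FUNCTION: dict[str, Callable[[IO[Any]], str]] = {
    ".pdf": pdf_to_text,
    ".docx": docx_to_text,
}


def extract(file: IO[Any], extension: str) -> str:
    return EXTENSION_TO_FUNCTION.get(extension, file_io_to_text)(file)


print(extract(io.BytesIO(b"hello"), ".txt"))  # -> "hello"
```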
@@ -453,7 +453,9 @@ class DefaultMultiLLM(LLM):
         if LOG_DANSWER_MODEL_INTERACTIONS:
             self.log_model_configs()
 
-        if DISABLE_LITELLM_STREAMING:
+        if (
+            DISABLE_LITELLM_STREAMING or self.config.model_name == "o1-2024-12-17"
+        ):  # TODO: remove once litellm supports streaming
             yield self.invoke(prompt, tools, tool_choice, structured_response_format)
             return
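The guard extends an existing escape hatch: when streaming is disabled, or the model (o1-2024-12-17 at the time) cannot stream through litellm, the generator yields one complete response instead of tokens, so callers can iterate either way; the TODO anticipates removing the special case once litellm supports streaming for that model. The shape of that fallback, sketched with stubs:

```python
from collections.abc import Iterator

# Point-in-time workaround set, per the TODO in the hunk above.
NON_STREAMING_MODELS = {"o1-2024-12-17"}


def stream(model: str, prompt: str, disable_streaming: bool = False) -> Iterator[str]:
    if disable_streaming or model in NON_STREAMING_MODELS:
        # One-shot: the full response arrives as a single chunk.
        yield invoke(model, prompt)
        return
    yield from stream_tokens(model, prompt)


def invoke(model: str, prompt: str) -> str:
    return f"[{model}] full response to {prompt!r}"  # stub


def stream_tokens(model: str, prompt: str) -> Iterator[str]:
    yield from f"[{model}] streamed response".split()  # stub
```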
@@ -29,6 +29,7 @@ OPENAI_PROVIDER_NAME = "openai"
|
||||
OPEN_AI_MODEL_NAMES = [
|
||||
"o1-mini",
|
||||
"o1-preview",
|
||||
"o1-2024-12-17",
|
||||
"gpt-4",
|
||||
"gpt-4o",
|
||||
"gpt-4o-mini",
|
||||
|
||||
@@ -87,7 +87,7 @@ from onyx.db.models import SearchSettings
 from onyx.db.models import User
 from onyx.db.search_settings import get_current_search_settings
 from onyx.db.search_settings import get_secondary_search_settings
-from onyx.file_processing.extract_file_text import convert_docx_to_markdown
+from onyx.file_processing.extract_file_text import convert_docx_to_txt
 from onyx.file_store.file_store import get_default_file_store
 from onyx.key_value_store.interface import KvKeyNotFoundError
 from onyx.redis.redis_connector import RedisConnector
@@ -396,12 +396,11 @@ def upload_files(
                 file_origin=FileOrigin.CONNECTOR,
                 file_type=file.content_type or "text/plain",
             )
-            file.file.seek(0)
 
             if file.content_type and file.content_type.startswith(
                 "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
             ):
-                convert_docx_to_markdown(file, file_store, file_path)
+                convert_docx_to_txt(file, file_store, file_path)
 
     except ValueError as e:
         raise HTTPException(status_code=400, detail=str(e))
@@ -22,6 +22,7 @@ from onyx.utils.variable_functionality import (
 from onyx.utils.variable_functionality import noop_fallback
 from shared_configs.configs import MULTI_TENANT
 
+
 _DANSWER_TELEMETRY_ENDPOINT = "https://telemetry.onyx.app/anonymous_telemetry"
 _CACHED_UUID: str | None = None
 _CACHED_INSTANCE_DOMAIN: str | None = None
@@ -117,9 +118,12 @@ def mt_cloud_telemetry(
     event: MilestoneRecordType,
     properties: dict | None = None,
 ) -> None:
+    print(f"mt_cloud_telemetry {distinct_id} {event} {properties}")
     if not MULTI_TENANT:
+        print("mt_cloud_telemetry not MULTI_TENANT")
         return
 
+    print("mt_cloud_telemetry MULTI_TENANT")
     # MIT version should not need to include any Posthog code
     # This is only for Onyx MT Cloud, this code should also never be hit, no reason for any orgs to
     # be running the Multi Tenant version of Onyx.
@@ -137,8 +141,11 @@ def create_milestone_and_report(
     properties: dict | None,
     db_session: Session,
 ) -> None:
+    print(f"create_milestone_and_report {user} {event_type} {db_session}")
     _, is_new = create_milestone_if_not_exists(user, event_type, db_session)
+    print(f"create_milestone_and_report {is_new}")
     if is_new:
+        print("create_milestone_and_report is_new")
         mt_cloud_telemetry(
             distinct_id=distinct_id,
             event=event_type,
@@ -29,7 +29,7 @@ trafilatura==1.12.2
 langchain==0.1.17
 langchain-core==0.1.50
 langchain-text-splitters==0.0.1
-litellm==1.54.1
+litellm==1.55.4
 lxml==5.3.0
 lxml_html_clean==0.2.2
 llama-index==0.9.45
@@ -81,5 +81,4 @@ stripe==10.12.0
 urllib3==2.2.3
 mistune==0.8.4
 sentry-sdk==2.14.0
-prometheus_client==0.21.0
-markitdown==0.0.1a3
+prometheus_client==0.21.0

@@ -12,5 +12,5 @@ torch==2.2.0
 transformers==4.39.2
 uvicorn==0.21.1
 voyageai==0.2.3
-litellm==1.54.1
+litellm==1.55.4
 sentry-sdk[fastapi,celery,starlette]==2.14.0
web/public/Amazon.svg (new executable file, 1 line)
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 7.0 KiB

web/public/Meta.svg (new executable file, 9 lines)
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 340 KiB

web/public/Microsoft.svg (new executable file, 6 lines)
@@ -0,0 +1,6 @@
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+<rect x="1.33325" y="1.3335" width="6.33333" height="6.33333" fill="#F25022"/>
+<rect x="8.33325" y="1.3335" width="6.33333" height="6.33333" fill="#80BA01"/>
+<rect x="8.33325" y="8.3335" width="6.33333" height="6.33333" fill="#FFB902"/>
+<rect x="1.33325" y="8.3335" width="6.33333" height="6.33333" fill="#02A4EF"/>
+</svg>
After Width: | Height: | Size: 425 B

web/public/Mistral.svg (new executable file, 1 line)
@@ -0,0 +1 @@
+<svg viewBox="0 0 512 512" xmlns="http://www.w3.org/2000/svg" fill-rule="evenodd" clip-rule="evenodd" stroke-linejoin="round" stroke-miterlimit="2"><path d="M189.08 303.228H94.587l.044-94.446h94.497l-.048 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M283.528 397.674h-94.493l.044-94.446h94.496l-.047 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M283.575 303.228H189.08l.046-94.446h94.496l-.047 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M378.07 303.228h-94.495l.044-94.446h94.498l-.047 94.446zM189.128 208.779H94.633l.044-94.448h94.498l-.047 94.448zM378.115 208.779h-94.494l.045-94.448h94.496l-.047 94.448zM94.587 303.227H.093l.044-96.017h94.496l-.046 96.017z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M94.633 208.779H.138l.046-94.448H94.68l-.047 94.448z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M94.68 115.902H.185L.23 19.885h94.498l-.047 96.017zM472.657 114.331h-94.495l.044-94.446h94.497l-.046 94.446zM94.54 399.244H.046l.044-97.588h94.497l-.047 97.588z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M94.495 492.123H0l.044-94.446H94.54l-.045 94.446zM472.563 303.228H378.07l.044-94.446h94.496l-.047 94.446zM472.61 208.779h-94.495l.044-94.448h94.498l-.047 94.448z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M472.517 397.674h-94.494l.044-94.446h94.497l-.047 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M472.47 492.121h-94.493l.044-96.017h94.496l-.047 96.017z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M228.375 303.22h-96.061l.046-94.446h96.067l-.052 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M322.827 397.666h-94.495l.044-96.018h94.498l-.047 96.018z" fill="#ff4900" fill-rule="nonzero"/><path d="M324.444 303.22h-97.636l.046-94.446h97.638l-.048 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M418.938 303.22h-96.064l.045-94.446h96.066l-.047 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M228.423 208.77H132.36l.045-94.445h96.066l-.05 94.446zM418.985 208.77H322.92l.044-94.445h96.069l-.048 94.446z" fill="#ffa300" fill-rule="nonzero"/><path d="M133.883 304.79H39.392l.044-96.017h94.496l-.049 96.017z" fill="#ff7000" fill-rule="nonzero"/><path d="M133.929 208.77H39.437l.044-95.445h94.496l-.048 95.445z" fill="#ffa300" fill-rule="nonzero"/><path d="M133.976 114.325H39.484l.044-94.448h94.497l-.05 94.448zM511.954 115.325h-94.493l.044-95.448h94.497l-.048 95.448z" fill="#ffce00" fill-rule="nonzero"/><path d="M133.836 399.667H39.345l.044-96.447h94.496l-.049 96.447z" fill="#ff4900" fill-rule="nonzero"/><path d="M133.79 492.117H39.3l.044-94.448h94.496l-.049 94.448z" fill="#ff0107" fill-rule="nonzero"/><path d="M511.862 303.22h-94.495l.046-94.446h94.496l-.047 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M511.907 208.77h-94.493l.044-94.445h94.496l-.047 94.446z" fill="#ffa300" fill-rule="nonzero"/><path d="M511.815 398.666h-94.493l.044-95.447h94.496l-.047 95.447z" fill="#ff4900" fill-rule="nonzero"/><path d="M511.77 492.117h-94.496l.046-94.448h94.496l-.047 94.448z" fill="#ff0107" fill-rule="nonzero"/></svg>
After Width: | Height: | Size: 2.9 KiB
@@ -1,8 +1,12 @@
 import {
   AnthropicIcon,
+  AmazonIcon,
   AWSIcon,
   AzureIcon,
   CPUIcon,
+  MicrosoftIconSVG,
+  MistralIcon,
+  MetaIcon,
   OpenAIIcon,
   GeminiIcon,
   OpenSourceIcon,
@@ -72,12 +76,25 @@ export const getProviderIcon = (providerName: string, modelName?: string) => {
   switch (providerName) {
     case "openai":
+      // Special cases for openai based on modelName
+      if (modelName?.toLowerCase().includes("amazon")) {
+        return AmazonIcon;
+      }
+      if (modelName?.toLowerCase().includes("phi")) {
+        return MicrosoftIconSVG;
+      }
+      if (modelName?.toLowerCase().includes("mistral")) {
+        return MistralIcon;
+      }
+      if (modelName?.toLowerCase().includes("llama")) {
+        return MetaIcon;
+      }
+      if (modelName?.toLowerCase().includes("gemini")) {
+        return GeminiIcon;
+      }
+      if (modelName?.toLowerCase().includes("claude")) {
+        return AnthropicIcon;
+      }
+
       return OpenAIIcon; // Default for openai
     case "anthropic":
       return AnthropicIcon;
@@ -6,7 +6,6 @@ import {
 } from "@/app/chat/message/MemoizedTextComponents";
 import React, { useMemo } from "react";
 import ReactMarkdown from "react-markdown";
-import rehypePrism from "rehype-prism-plus";
 import remarkGfm from "remark-gfm";
 
 interface MinimalMarkdownProps {
@@ -36,10 +35,9 @@ export const MinimalMarkdown: React.FC<MinimalMarkdownProps> = ({
 
   return (
     <ReactMarkdown
-      className={`prose max-w-full text-base ${className}`}
+      className={`w-full text-wrap break-word ${className}`}
       components={markdownComponents}
       remarkPlugins={[remarkGfm]}
-      rehypePlugins={[[rehypePrism, { ignoreMissing: true }]]}
     >
       {content}
     </ReactMarkdown>
@@ -21,11 +21,11 @@ export default function TextView({
   onClose,
 }: TextViewProps) {
   const [zoom, setZoom] = useState(100);
-  const [fileContent, setFileContent] = useState("");
-  const [fileUrl, setFileUrl] = useState("");
-  const [fileName, setFileName] = useState("");
+  const [fileContent, setFileContent] = useState<string>("");
+  const [fileUrl, setFileUrl] = useState<string>("");
+  const [fileName, setFileName] = useState<string>("");
   const [isLoading, setIsLoading] = useState(true);
-  const [fileType, setFileType] = useState("application/octet-stream");
+  const [fileType, setFileType] = useState<string>("application/octet-stream");
 
   const isMarkdownFormat = (mimeType: string): boolean => {
     const markdownFormats = [
@@ -51,17 +51,18 @@ export default function TextView({
 
   const fetchFile = useCallback(async () => {
     setIsLoading(true);
-    const fileId = presentingDocument.document_id.split("__")[1];
     try {
+      const fileId = presentingDocument.document_id.split("__")[1];
       const response = await fetch(
-        `/api/chat/file/${encodeURIComponent(fileId)}`
+        `/api/chat/file/${encodeURIComponent(fileId)}`,
+        {
+          method: "GET",
+        }
       );
       const blob = await response.blob();
 
       const url = window.URL.createObjectURL(blob);
       setFileUrl(url);
       setFileName(presentingDocument.semantic_identifier || "document");
 
       const contentType =
         response.headers.get("Content-Type") || "application/octet-stream";
       setFileType(contentType);
@@ -69,28 +70,9 @@ export default function TextView({
       if (isMarkdownFormat(blob.type)) {
         const text = await blob.text();
         setFileContent(text);
-      } else if (blob.type === "application/octet-stream") {
-        try {
-          const text = await blob.text();
-          let nonPrintingCount = 0;
-          for (let i = 0; i < text.length; i++) {
-            const code = text.charCodeAt(i);
-            if (code < 32 && ![9, 10, 13].includes(code)) {
-              nonPrintingCount++;
-            }
-          }
-          const ratio = nonPrintingCount / text.length;
-
-          if (ratio < 0.05) {
-            setFileContent(text);
-            setFileType("text/plain");
-          }
-        } catch (err) {
-          console.error("Failed to parse octet-stream as text", err);
-        }
       }
-    } catch (err) {
-      console.error("Error fetching file:", err);
+    } catch (error) {
+      console.error("Error fetching file:", error);
     } finally {
       setTimeout(() => {
         setIsLoading(false);
@@ -155,7 +137,7 @@ export default function TextView({
         </div>
       ) : (
         <div
-          className="w-full h-full transform origin-center transition-transform duration-300 ease-in-out"
+          className={`w-full h-full transform origin-center transition-transform duration-300 ease-in-out`}
           style={{ transform: `scale(${zoom / 100})` }}
         >
           {isSupportedIframeFormat(fileType) ? (
@@ -164,7 +146,7 @@ export default function TextView({
               className="w-full h-full border-none"
               title="File Viewer"
             />
-          ) : isMarkdownFormat(fileType) || fileType === "text/plain" ? (
+          ) : isMarkdownFormat(fileType) ? (
            <div className="w-full h-full p-6 overflow-y-scroll overflow-x-hidden">
              <MinimalMarkdown
                content={fileContent}
@@ -39,7 +39,10 @@ import Image, { StaticImageData } from "next/image";
|
||||
import jiraSVG from "../../../public/Jira.svg";
|
||||
import confluenceSVG from "../../../public/Confluence.svg";
|
||||
import openAISVG from "../../../public/Openai.svg";
|
||||
import amazonSVG from "../../../public/Amazon.svg";
|
||||
import geminiSVG from "../../../public/Gemini.svg";
|
||||
import metaSVG from "../../../public/Meta.svg";
|
||||
import mistralSVG from "../../../public/Mistral.svg";
|
||||
import openSourceIcon from "../../../public/OpenSource.png";
|
||||
import litellmIcon from "../../../public/LiteLLM.jpg";
|
||||
|
||||
@@ -49,6 +52,7 @@ import asanaIcon from "../../../public/Asana.png";
|
||||
import anthropicSVG from "../../../public/Anthropic.svg";
|
||||
import nomicSVG from "../../../public/nomic.svg";
|
||||
import microsoftIcon from "../../../public/microsoft.png";
|
||||
import microsoftSVG from "../../../public/Microsoft.svg";
|
||||
import mixedBreadSVG from "../../../public/Mixedbread.png";
|
||||
|
||||
import OCIStorageSVG from "../../../public/OCI.svg";
|
||||
@@ -1104,6 +1108,26 @@ export const GeminiIcon = ({
|
||||
className = defaultTailwindCSS,
|
||||
}: IconProps) => <LogoIcon size={size} className={className} src={geminiSVG} />;
|
||||
|
||||
export const AmazonIcon = ({
|
||||
size = 16,
|
||||
className = defaultTailwindCSS,
|
||||
}: IconProps) => <LogoIcon size={size} className={className} src={amazonSVG} />;
|
||||
|
||||
export const MetaIcon = ({
|
||||
size = 16,
|
||||
className = defaultTailwindCSS,
|
||||
}: IconProps) => <LogoIcon size={size} className={className} src={metaSVG} />;
|
||||
|
||||
export const MicrosoftIconSVG = ({
|
||||
size = 16,
|
||||
className = defaultTailwindCSS,
|
||||
}: IconProps) => <LogoIcon size={size} className={className} src={microsoftSVG} />;
|
||||
|
||||
export const MistralIcon = ({
|
||||
size = 16,
|
||||
className = defaultTailwindCSS,
|
||||
}: IconProps) => <LogoIcon size={size} className={className} src={mistralSVG} />;
|
||||
|
||||
export const VoyageIcon = ({
|
||||
size = 16,
|
||||
className = defaultTailwindCSS,
|
||||
|
||||
@@ -299,6 +299,7 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
   // OpenAI models
   "o1-mini": "O1 Mini",
   "o1-preview": "O1 Preview",
+  "o1-2024-12-17": "O1",
   "gpt-4": "GPT 4",
   "gpt-4o": "GPT 4o",
   "gpt-4o-2024-08-06": "GPT 4o (Structured Outputs)",
@@ -318,6 +319,21 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
   "gpt-3.5-turbo-16k-0613": "GPT 3.5 Turbo 16k (June 2023)",
   "gpt-3.5-turbo-0301": "GPT 3.5 Turbo (March 2023)",
 
+  // Amazon models
+  "amazon.nova-micro@v1": "Amazon Nova Micro",
+  "amazon.nova-lite@v1": "Amazon Nova Lite",
+  "amazon.nova-pro@v1": "Amazon Nova Pro",
+
+  // Meta models
+  "llama-3.2-90b-vision-instruct": "Llama 3.2 90B",
+  "llama-3.2-11b-vision-instruct": "Llama 3.2 11B",
+  "llama-3.3-70b-instruct": "Llama 3.3 70B",
+
+  // Microsoft models
+  "phi-3.5-mini-instruct": "Phi 3.5 Mini",
+  "phi-3.5-moe-instruct": "Phi 3.5 MoE",
+  "phi-3.5-vision-instruct": "Phi 3.5 Vision",
+
   // Anthropic models
   "claude-3-opus-20240229": "Claude 3 Opus",
   "claude-3-sonnet-20240229": "Claude 3 Sonnet",
@@ -329,6 +345,9 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
   "claude-3-5-sonnet-20241022": "Claude 3.5 Sonnet (New)",
   "claude-3-5-sonnet-v2@20241022": "Claude 3.5 Sonnet (New)",
+  "claude-3.5-sonnet-v2@20241022": "Claude 3.5 Sonnet (New)",
   "claude-3-5-haiku-20241022": "Claude 3.5 Haiku",
+  "claude-3-5-haiku@20241022": "Claude 3.5 Haiku",
+  "claude-3.5-haiku@20241022": "Claude 3.5 Haiku",
 
   // Google Models
   "gemini-1.5-pro": "Gemini 1.5 Pro",
@@ -337,6 +356,11 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
   "gemini-1.5-flash-001": "Gemini 1.5 Flash",
   "gemini-1.5-pro-002": "Gemini 1.5 Pro (v2)",
   "gemini-1.5-flash-002": "Gemini 1.5 Flash (v2)",
+  "gemini-2.0-flash-exp": "Gemini 2.0 Flash (Experimental)",
+
+  // Mistral Models
+  "mistral-large-2411": "Mistral Large 24.11",
+  "mistral-large@2411": "Mistral Large 24.11",
 
   // Bedrock models
   "meta.llama3-1-70b-instruct-v1:0": "Llama 3.1 70B",
@@ -74,6 +74,8 @@ const MODEL_NAMES_SUPPORTING_IMAGE_INPUT = [
   "claude-3-opus-20240229",
   "claude-3-sonnet-20240229",
   "claude-3-haiku-20240307",
+  // custom claude names
+  "claude-3.5-sonnet-v2@20241022",
   // claude names with AWS Bedrock Suffix
   "claude-3-opus-20240229-v1:0",
   "claude-3-sonnet-20240229-v1:0",
@@ -93,6 +95,13 @@ const MODEL_NAMES_SUPPORTING_IMAGE_INPUT = [
   "gemini-1.5-flash-001",
   "gemini-1.5-pro-002",
   "gemini-1.5-flash-002",
+  "gemini-2.0-flash-exp",
+  // amazon models
+  "amazon.nova-lite@v1",
+  "amazon.nova-pro@v1",
+  // meta models
+  "llama-3.2-90b-vision-instruct",
+  "llama-3.2-11b-vision-instruct"
 ];
 
 export function checkLLMSupportsImageInput(model: string) {