Mirror of https://github.com/onyx-dot-app/onyx.git
Synced 2026-02-17 07:45:47 +00:00

Compare commits: faster_tex...cloud_debu (9 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | 09e6bd3c9c |  |
|  | c1803cdd56 |  |
|  | a5b9c76012 |  |
|  | e9b10e8b41 |  |
|  | a0fa4adb60 |  |
|  | ca9ba925bd |  |
|  | 833cc5c97c |  |
|  | f56fda27c9 |  |
|  | b1e4d4ea8d |  |
@@ -1,3 +1,5 @@
+from typing import Any
+
 from posthog import Posthog
 
 from ee.onyx.configs.app_configs import POSTHOG_API_KEY
@@ -6,13 +8,31 @@ from onyx.utils.logger import setup_logger
 
 
 logger = setup_logger()
 
-posthog = Posthog(project_api_key=POSTHOG_API_KEY, host=POSTHOG_HOST)
+
+def posthog_on_error(error: Any, items: Any) -> None:
+    logger.error(f"PostHog error: {error}, items: {items}")
+
+
+posthog = Posthog(
+    project_api_key=POSTHOG_API_KEY,
+    host=POSTHOG_HOST,
+    debug=True,
+    on_error=posthog_on_error,
+)
 
 
 def event_telemetry(
-    distinct_id: str,
-    event: str,
-    properties: dict | None = None,
+    distinct_id: str, event: str, properties: dict | None = None
 ) -> None:
     logger.info(f"Capturing Posthog event: {distinct_id} {event} {properties}")
-    posthog.capture(distinct_id, event, properties)
+    print("API KEY", POSTHOG_API_KEY)
+    print("HOST", POSTHOG_HOST)
+    try:
+        print(type(distinct_id))
+        print(type(event))
+        print(type(properties))
+        response = posthog.capture(distinct_id, event, properties)
+        posthog.flush()
+        print(response)
+    except Exception as e:
+        logger.error(f"Error capturing Posthog event: {e}")
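The hunk above wires up PostHog's client-side debugging hooks: `debug=True` makes the client log its HTTP traffic, and `on_error` surfaces batch-delivery failures that the background consumer thread otherwise swallows; `flush()` drains the queue before the process exits. A minimal standalone sketch of the same wiring (the key and host values are placeholders, not Onyx's configuration):

```python
from typing import Any

from posthog import Posthog


def posthog_on_error(error: Any, items: Any) -> None:
    # Invoked from the client's consumer thread when a batch fails to send.
    print(f"PostHog error: {error}, items: {items}")


posthog = Posthog(
    project_api_key="phc_placeholder",  # placeholder, not a real key
    host="https://us.i.posthog.com",    # assumed PostHog Cloud host
    debug=True,                         # verbose client-side logging
    on_error=posthog_on_error,          # surface delivery failures
)

posthog.capture("user-123", "debug_event", {"source": "sketch"})
posthog.flush()  # block until the queued batch is sent
```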
@@ -5,6 +5,7 @@ from datetime import datetime
 from datetime import timezone
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
+from typing import cast
 from typing import Dict
 from typing import List
 from typing import Optional
@@ -228,6 +229,11 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
         safe: bool = False,
         request: Optional[Request] = None,
     ) -> User:
+        # We verify the password here to make sure it's valid before we proceed
+        await self.validate_password(
+            user_create.password, cast(schemas.UC, user_create)
+        )
+
         user_count: int | None = None
         referral_source = (
             request.cookies.get("referral_source", None)
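For context, `validate_password` is a fastapi-users hook: it runs before the user row is created and aborts signup by raising `InvalidPasswordException`. A minimal sketch of how such a hook is defined (the length rule below is illustrative, not Onyx's actual policy):

```python
import uuid
from typing import Any

from fastapi_users import BaseUserManager, InvalidPasswordException, UUIDIDMixin


class SketchUserManager(UUIDIDMixin, BaseUserManager[Any, uuid.UUID]):
    async def validate_password(self, password: str, user: Any) -> None:
        # Raising InvalidPasswordException aborts the create() flow.
        if len(password) < 8:  # illustrative policy only
            raise InvalidPasswordException(
                reason="Password must be at least 8 characters long"
            )
```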
@@ -3,7 +3,6 @@ import multiprocessing
 import time
 from typing import Any
 
-import requests
 import sentry_sdk
 from celery import Task
 from celery.app import trace
@@ -23,6 +22,7 @@ from onyx.background.celery.apps.task_formatters import CeleryTaskPlainFormatter
 from onyx.background.celery.celery_utils import celery_is_worker_primary
 from onyx.configs.constants import OnyxRedisLocks
 from onyx.db.engine import get_sqlalchemy_engine
+from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
 from onyx.document_index.vespa_constants import VESPA_CONFIG_SERVER_URL
 from onyx.redis.redis_connector import RedisConnector
 from onyx.redis.redis_connector_credential_pair import RedisConnectorCredentialPair
@@ -262,7 +262,8 @@ def wait_for_vespa(sender: Any, **kwargs: Any) -> None:
     logger.info("Vespa: Readiness probe starting.")
     while True:
         try:
-            response = requests.get(f"{VESPA_CONFIG_SERVER_URL}/state/v1/health")
+            client = get_vespa_http_client()
+            response = client.get(f"{VESPA_CONFIG_SERVER_URL}/state/v1/health")
             response.raise_for_status()
 
             response_dict = response.json()
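The probe loops until Vespa's config server reports healthy; the change only swaps the one-off `requests` call for the shared httpx client. A standalone version of the same loop (the URL, poll interval, and the status-code check are assumptions, not Onyx's exact values):

```python
import time

import httpx

# Assumed default config-server port; Onyx reads this from a constant.
HEALTH_URL = "http://localhost:19071/state/v1/health"


def wait_for_vespa(poll_interval_s: float = 5.0) -> None:
    while True:
        try:
            with httpx.Client(http2=False) as client:
                response = client.get(HEALTH_URL)
                response.raise_for_status()
                if response.json().get("status", {}).get("code") == "up":
                    return  # config server is ready
        except Exception as e:
            print(f"Vespa not ready yet: {e}")
        time.sleep(poll_interval_s)
```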
@@ -13,7 +13,6 @@ from onyx.db.engine import SqlEngine
 from onyx.utils.logger import setup_logger
 from onyx.utils.variable_functionality import fetch_versioned_implementation
-from shared_configs.configs import IGNORED_SYNCING_TENANT_LIST
 from shared_configs.configs import MULTI_TENANT
 
 logger = setup_logger(__name__)
@@ -154,10 +153,6 @@ def on_beat_init(sender: Any, **kwargs: Any) -> None:
     SqlEngine.set_app_name(POSTGRES_CELERY_BEAT_APP_NAME)
     SqlEngine.init_engine(pool_size=2, max_overflow=0)
 
-    # Startup checks are not needed in multi-tenant case
-    if MULTI_TENANT:
-        return
-
     app_base.wait_for_redis(sender, **kwargs)
@@ -61,13 +61,14 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_HEAVY_APP_NAME)
     SqlEngine.init_engine(pool_size=4, max_overflow=12)
 
-    # Startup checks are not needed in multi-tenant case
-    if MULTI_TENANT:
-        return
-
     app_base.wait_for_redis(sender, **kwargs)
     app_base.wait_for_db(sender, **kwargs)
     app_base.wait_for_vespa(sender, **kwargs)
 
+    # Less startup checks in multi-tenant case
+    if MULTI_TENANT:
+        return
+
     app_base.on_secondary_worker_init(sender, **kwargs)
@@ -62,13 +62,14 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME)
     SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=sender.concurrency)
 
-    # Startup checks are not needed in multi-tenant case
-    if MULTI_TENANT:
-        return
-
     app_base.wait_for_redis(sender, **kwargs)
     app_base.wait_for_db(sender, **kwargs)
     app_base.wait_for_vespa(sender, **kwargs)
 
+    # Less startup checks in multi-tenant case
+    if MULTI_TENANT:
+        return
+
     app_base.on_secondary_worker_init(sender, **kwargs)
@@ -60,13 +60,15 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
 
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_LIGHT_APP_NAME)
     SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8)
-    # Startup checks are not needed in multi-tenant case
-    if MULTI_TENANT:
-        return
-
+
     app_base.wait_for_redis(sender, **kwargs)
     app_base.wait_for_db(sender, **kwargs)
     app_base.wait_for_vespa(sender, **kwargs)
 
+    # Less startup checks in multi-tenant case
+    if MULTI_TENANT:
+        return
+
     app_base.on_secondary_worker_init(sender, **kwargs)
@@ -84,14 +84,14 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
     SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME)
     SqlEngine.init_engine(pool_size=8, max_overflow=0)
 
-    # Startup checks are not needed in multi-tenant case
-    if MULTI_TENANT:
-        return
-
     app_base.wait_for_redis(sender, **kwargs)
     app_base.wait_for_db(sender, **kwargs)
     app_base.wait_for_vespa(sender, **kwargs)
 
+    # Less startup checks in multi-tenant case
+    if MULTI_TENANT:
+        return
+
     logger.info("Running as the primary celery worker.")
 
     # This is singleton work that should be done on startup exactly once
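All four workers apply the same reordering: the `MULTI_TENANT` early-return moves from before the dependency waits to after them, so multi-tenant deployments still block on Redis, Postgres, and Vespa but skip the single-tenant-only initialization. A condensed sketch of the resulting shape (the `wait_*` helpers are stand-ins for `app_base`'s real ones):

```python
from typing import Any

from celery.signals import worker_init

MULTI_TENANT = False  # stand-in for shared_configs.configs.MULTI_TENANT


def wait_for_redis(sender: Any, **kwargs: Any) -> None: ...
def wait_for_db(sender: Any, **kwargs: Any) -> None: ...
def wait_for_vespa(sender: Any, **kwargs: Any) -> None: ...
def on_secondary_worker_init(sender: Any, **kwargs: Any) -> None: ...


@worker_init.connect
def on_worker_init(sender: Any, **kwargs: Any) -> None:
    # Backing services must be reachable in every deployment mode.
    wait_for_redis(sender, **kwargs)
    wait_for_db(sender, **kwargs)
    wait_for_vespa(sender, **kwargs)

    # Multi-tenant deployments skip the remaining single-tenant setup.
    if MULTI_TENANT:
        return

    on_secondary_worker_init(sender, **kwargs)
```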
@@ -29,7 +29,6 @@ from onyx.configs.constants import OnyxCeleryPriority
 from onyx.configs.constants import OnyxCeleryQueues
 from onyx.configs.constants import OnyxCeleryTask
 from onyx.configs.constants import OnyxRedisLocks
-from onyx.configs.constants import OnyxRedisSignals
 from onyx.db.connector import mark_ccpair_with_indexing_trigger
 from onyx.db.connector_credential_pair import fetch_connector_credential_pairs
 from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
@@ -176,7 +175,7 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
 
     # we need to use celery's redis client to access its redis data
     # (which lives on a different db number)
-    redis_client_celery: Redis = self.app.broker_connection().channel().client  # type: ignore
+    # redis_client_celery: Redis = self.app.broker_connection().channel().client  # type: ignore
 
     lock_beat: RedisLock = redis_client.lock(
         OnyxRedisLocks.CHECK_INDEXING_BEAT_LOCK,
@@ -319,20 +318,23 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
                     attempt.id, db_session, failure_reason=failure_reason
                 )
 
-        # we want to run this less frequently than the overall task
-        if not redis_client.exists(OnyxRedisSignals.VALIDATE_INDEXING_FENCES):
-            # clear any indexing fences that don't have associated celery tasks in progress
-            # tasks can be in the queue in redis, in reserved tasks (prefetched by the worker),
-            # or be currently executing
-            try:
-                task_logger.info("Validating indexing fences...")
-                validate_indexing_fences(
-                    tenant_id, self.app, redis_client, redis_client_celery, lock_beat
-                )
-            except Exception:
-                task_logger.exception("Exception while validating indexing fences")
-
-            redis_client.set(OnyxRedisSignals.VALIDATE_INDEXING_FENCES, 1, ex=60)
+        # rkuo: The following code logically appears to work, but the celery inspect code may be unstable
+        # turning off for the moment to see if it helps cloud stability
+
+        # we want to run this less frequently than the overall task
+        # if not redis_client.exists(OnyxRedisSignals.VALIDATE_INDEXING_FENCES):
+        #     # clear any indexing fences that don't have associated celery tasks in progress
+        #     # tasks can be in the queue in redis, in reserved tasks (prefetched by the worker),
+        #     # or be currently executing
+        #     try:
+        #         task_logger.info("Validating indexing fences...")
+        #         validate_indexing_fences(
+        #             tenant_id, self.app, redis_client, redis_client_celery, lock_beat
+        #         )
+        #     except Exception:
+        #         task_logger.exception("Exception while validating indexing fences")
+
+        #     redis_client.set(OnyxRedisSignals.VALIDATE_INDEXING_FENCES, 1, ex=60)
 
     except SoftTimeLimitExceeded:
         task_logger.info(
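The disabled block rate-limits itself with a Redis "signal" key: after a pass it sets a key with a 60-second TTL, and subsequent beats skip the work while the key still exists. The idiom in isolation (the key name below is illustrative; Onyx uses `OnyxRedisSignals.VALIDATE_INDEXING_FENCES`):

```python
import redis

r = redis.Redis()

SIGNAL_KEY = "signal:validate_indexing_fences"  # illustrative name


def maybe_validate_fences() -> None:
    if r.exists(SIGNAL_KEY):
        return  # ran within the last 60 seconds; skip this beat

    ...  # expensive validation work goes here

    # Suppress re-runs until the TTL expires.
    r.set(SIGNAL_KEY, 1, ex=60)
```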
@@ -4,7 +4,6 @@ from datetime import timezone
 
 from googleapiclient.discovery import build  # type: ignore
 from googleapiclient.errors import HttpError  # type: ignore
-from markitdown import MarkItDown  # type: ignore
 
 from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
 from onyx.configs.constants import DocumentSource
@@ -27,9 +26,9 @@ from onyx.file_processing.unstructured import get_unstructured_api_key
 from onyx.file_processing.unstructured import unstructured_to_text
 from onyx.utils.logger import setup_logger
 
 
 logger = setup_logger()
 
 
 # these errors don't represent a failure in the connector, but simply files
 # that can't / shouldn't be indexed
 ERRORS_TO_CONTINUE_ON = [
@@ -39,41 +38,177 @@ ERRORS_TO_CONTINUE_ON = [
 ]
 
 
+def _extract_sections_basic(
+    file: dict[str, str], service: GoogleDriveService
+) -> list[Section]:
+    mime_type = file["mimeType"]
+    link = file["webViewLink"]
+
+    if mime_type not in set(item.value for item in GDriveMimeType):
+        # Unsupported file types can still have a title, finding this way is still useful
+        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
+
+    try:
+        if mime_type == GDriveMimeType.SPREADSHEET.value:
+            try:
+                sheets_service = build(
+                    "sheets", "v4", credentials=service._http.credentials
+                )
+                spreadsheet = (
+                    sheets_service.spreadsheets()
+                    .get(spreadsheetId=file["id"])
+                    .execute()
+                )
+
+                sections = []
+                for sheet in spreadsheet["sheets"]:
+                    sheet_name = sheet["properties"]["title"]
+                    sheet_id = sheet["properties"]["sheetId"]
+
+                    # Get sheet dimensions
+                    grid_properties = sheet["properties"].get("gridProperties", {})
+                    row_count = grid_properties.get("rowCount", 1000)
+                    column_count = grid_properties.get("columnCount", 26)
+
+                    # Convert column count to letter (e.g., 26 -> Z, 27 -> AA)
+                    end_column = ""
+                    while column_count:
+                        column_count, remainder = divmod(column_count - 1, 26)
+                        end_column = chr(65 + remainder) + end_column
+
+                    range_name = f"'{sheet_name}'!A1:{end_column}{row_count}"
+
+                    try:
+                        result = (
+                            sheets_service.spreadsheets()
+                            .values()
+                            .get(spreadsheetId=file["id"], range=range_name)
+                            .execute()
+                        )
+                        values = result.get("values", [])
+
+                        if values:
+                            text = f"Sheet: {sheet_name}\n"
+                            for row in values:
+                                text += "\t".join(str(cell) for cell in row) + "\n"
+                            sections.append(
+                                Section(
+                                    link=f"{link}#gid={sheet_id}",
+                                    text=text,
+                                )
+                            )
+                    except HttpError as e:
+                        logger.warning(
+                            f"Error fetching data for sheet '{sheet_name}': {e}"
+                        )
+                        continue
+                return sections
+
+            except Exception as e:
+                logger.warning(
+                    f"Ran into exception '{e}' when pulling data from Google Sheet '{file['name']}'."
+                    " Falling back to basic extraction."
+                )
+
+        if mime_type in [
+            GDriveMimeType.DOC.value,
+            GDriveMimeType.PPT.value,
+            GDriveMimeType.SPREADSHEET.value,
+        ]:
+            export_mime_type = (
+                "text/plain"
+                if mime_type != GDriveMimeType.SPREADSHEET.value
+                else "text/csv"
+            )
+            text = (
+                service.files()
+                .export(fileId=file["id"], mimeType=export_mime_type)
+                .execute()
+                .decode("utf-8")
+            )
+            return [Section(link=link, text=text)]
+
+        elif mime_type in [
+            GDriveMimeType.PLAIN_TEXT.value,
+            GDriveMimeType.MARKDOWN.value,
+        ]:
+            return [
+                Section(
+                    link=link,
+                    text=service.files()
+                    .get_media(fileId=file["id"])
+                    .execute()
+                    .decode("utf-8"),
+                )
+            ]
+        if mime_type in [
+            GDriveMimeType.WORD_DOC.value,
+            GDriveMimeType.POWERPOINT.value,
+            GDriveMimeType.PDF.value,
+        ]:
+            response = service.files().get_media(fileId=file["id"]).execute()
+            if get_unstructured_api_key():
+                return [
+                    Section(
+                        link=link,
+                        text=unstructured_to_text(
+                            file=io.BytesIO(response),
+                            file_name=file.get("name", file["id"]),
+                        ),
+                    )
+                ]
+
+            if mime_type == GDriveMimeType.WORD_DOC.value:
+                return [
+                    Section(link=link, text=docx_to_text(file=io.BytesIO(response)))
+                ]
+            elif mime_type == GDriveMimeType.PDF.value:
+                text, _ = read_pdf_file(file=io.BytesIO(response))
+                return [Section(link=link, text=text)]
+            elif mime_type == GDriveMimeType.POWERPOINT.value:
+                return [
+                    Section(link=link, text=pptx_to_text(file=io.BytesIO(response)))
+                ]
+
+        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
+
+    except Exception:
+        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
+
+
 def convert_drive_item_to_document(
     file: GoogleDriveFileType,
     drive_service: GoogleDriveService,
     docs_service: GoogleDocsService,
 ) -> Document | None:
     """
     Converts a Google Drive file into an internal Document object, extracting
     the text and organizing it into sections. Uses specialized methods for Google Docs
     to preserve structure. Falls back to basic extraction for all other formats.
     """
     try:
-        # Skip shortcuts and folders
+        # Skip files that are shortcuts
         if file.get("mimeType") == DRIVE_SHORTCUT_TYPE:
             logger.info("Ignoring Drive Shortcut Filetype")
             return None
+        # Skip files that are folders
         if file.get("mimeType") == DRIVE_FOLDER_TYPE:
             logger.info("Ignoring Drive Folder Filetype")
             return None
 
         sections: list[Section] = []
 
-        # Special handling for Google Docs to preserve structure
+        # Special handling for Google Docs to preserve structure, link
+        # to headers
         if file.get("mimeType") == GDriveMimeType.DOC.value:
             try:
                 sections = get_document_sections(docs_service, file["id"])
             except Exception as e:
                 logger.warning(
-                    f"Exception '{e}' when pulling sections from Google Doc '{file['name']}'. "
-                    "Falling back to basic extraction."
+                    f"Ran into exception '{e}' when pulling sections from Google Doc '{file['name']}'."
+                    " Falling back to basic extraction."
                 )
 
-        # If not a GDoc or GDoc extraction failed
+        # NOTE: this will run for either (1) the above failed or (2) the file is not a Google Doc
         if not sections:
             try:
                 # For all other file types just extract the text
                 sections = _extract_sections_basic(file, drive_service)
 
             except HttpError as e:
                 reason = e.error_details[0]["reason"] if e.error_details else e.reason
                 message = e.error_details[0]["message"] if e.error_details else e.reason
@@ -82,8 +217,8 @@ def convert_drive_item_to_document(
                     f"Could not export file '{file['name']}' due to '{message}', skipping..."
                 )
                 return None
-                raise
 
+            raise
         if not sections:
             return None
 
@@ -103,248 +238,9 @@ def convert_drive_item_to_document(
     except Exception as e:
         if not CONTINUE_ON_CONNECTOR_FAILURE:
             raise e
 
         logger.exception("Ran into exception when pulling a file from Google Drive")
-        return None
-
-
-def _extract_sections_basic(
-    file: GoogleDriveFileType, service: GoogleDriveService
-) -> list[Section]:
-    """
-    Extracts text from a Google Drive file based on its MIME type.
-    """
-    mime_type = file["mimeType"]
-    link = file["webViewLink"]
-
-    # Handle unsupported MIME types
-    if mime_type not in {item.value for item in GDriveMimeType}:
-        logger.debug(
-            f"Unsupported MIME type '{mime_type}' for file '{file.get('name')}'"
-        )
-        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
-
-    # Specialized handling for Google Sheets
-    if mime_type == GDriveMimeType.SPREADSHEET.value:
-        try:
-            return _extract_google_sheets(file, service)
-        except Exception as e:
-            logger.warning(
-                f"Error extracting data from Google Sheet '{file['name']}': {e}. "
-                "Falling back to basic content extraction."
-            )
-
-    # For other types
-    return _extract_general_content(file, service)
-
-
-def _extract_google_sheets(
-    file: dict[str, str], service: GoogleDriveService
-) -> list[Section]:
-    """
-    Specialized extraction logic for Google Sheets.
-    Iterates through each sheet, fetches all data, and returns a list of Section objects.
-    """
-    link = file["webViewLink"]
-    file_id = file["id"]
-
-    sheets_service = build("sheets", "v4", credentials=service._http.credentials)
-    spreadsheet = sheets_service.spreadsheets().get(spreadsheetId=file_id).execute()
-
-    sections: list[Section] = []
-    for sheet in spreadsheet.get("sheets", []):
-        sheet_name = sheet["properties"]["title"]
-        sheet_id = sheet["properties"]["sheetId"]
-
-        grid_props = sheet["properties"].get("gridProperties", {})
-        row_count = grid_props.get("rowCount", 1000)
-        column_count = grid_props.get("columnCount", 26)
-
-        # Convert a number to a spreadsheet column letter (1->A, 26->Z, 27->AA,...)
-        end_column = ""
-        col_count = column_count
-        while col_count > 0:
-            col_count, remainder = divmod(col_count - 1, 26)
-            end_column = chr(65 + remainder) + end_column
-
-        range_name = f"'{sheet_name}'!A1:{end_column}{row_count}"
-
-        try:
-            result = (
-                sheets_service.spreadsheets()
-                .values()
-                .get(spreadsheetId=file_id, range=range_name)
-                .execute()
-            )
-            values = result.get("values", [])
-
-            if values:
-                text = f"Sheet: {sheet_name}\n"
-                for row in values:
-                    text += "\t".join(str(cell) for cell in row) + "\n"
-
-                sections.append(Section(link=f"{link}#gid={sheet_id}", text=text))
-        except HttpError as e:
-            logger.warning(
-                f"Error fetching data for sheet '{sheet_name}' in '{file.get('name')}' : {e}"
-            )
-            continue
-
-    return sections
-
-
-def _extract_general_content(
-    file: dict[str, str], service: GoogleDriveService
-) -> list[Section]:
-    """
-    Extracts general file content for files other than Google Sheets.
-    - PDF: Revert to read_pdf_file
-    - DOCX: Unstructured, then docx_to_text, then MarkItDown.
-    - PPTX: Unstructured, then pptx_to_text, then MarkItDown.
-    - TXT: Decode the content; if empty, log.
-    - Google Docs/Slides: Export as text/plain and return directly.
-    """
-    link = file["webViewLink"]
-    mime_type = file["mimeType"]
-    file_id = file["id"]
-    file_name = file.get("name", file_id)
-
-    try:
-        # Google Docs and Google Slides (internal GDrive formats)
-        if (
-            mime_type == GDriveMimeType.DOC.value
-            or mime_type == GDriveMimeType.PPT.value
-        ):
-            logger.debug(f"Extracting Google-native doc/presentation: {file_name}")
-            export_mime_type = "text/plain"
-            content = (
-                service.files()
-                .export(fileId=file_id, mimeType=export_mime_type)
-                .execute()
-            )
-            text = content.decode("utf-8", errors="replace").strip()
-            if not text:
-                logger.warning(
-                    f"No text extracted from Google Docs/Slides file '{file_name}'."
-                )
-                text = UNSUPPORTED_FILE_TYPE_CONTENT
-            return [Section(link=link, text=text)]
-
-        # For all other formats, get raw content
-        content = service.files().get_media(fileId=file_id).execute()
-
-        if mime_type == GDriveMimeType.PDF.value:
-            # Revert to original PDF extraction
-            logger.debug(f"Extracting PDF content for '{file_name}'")
-            text, _ = read_pdf_file(file=io.BytesIO(content))
-            if not text:
-                logger.warning(
-                    f"No text extracted from PDF '{file_name}' with read_pdf_file."
-                )
-                text = UNSUPPORTED_FILE_TYPE_CONTENT
-            return [Section(link=link, text=text)]
-
-        if mime_type == GDriveMimeType.WORD_DOC.value:
-            logger.debug(f"Extracting DOCX content for '{file_name}'")
-            return [
-                Section(link=link, text=_extract_docx_pptx_txt(content, file, "docx"))
-            ]
-
-        if mime_type == GDriveMimeType.POWERPOINT.value:
-            logger.debug(f"Extracting PPTX content for '{file_name}'")
-            return [
-                Section(link=link, text=_extract_docx_pptx_txt(content, file, "pptx"))
-            ]
-
-        if (
-            mime_type == GDriveMimeType.PLAIN_TEXT.value
-            or mime_type == GDriveMimeType.MARKDOWN.value
-        ):
-            logger.debug(f"Extracting plain text/markdown content for '{file_name}'")
-            text = content.decode("utf-8", errors="replace").strip()
-            if not text:
-                logger.warning(
-                    f"No text extracted from TXT/MD '{file_name}'. Returning unsupported message."
-                )
-                text = UNSUPPORTED_FILE_TYPE_CONTENT
-            return [Section(link=link, text=text)]
-
-        # If we reach here, it's some other format supported by MarkItDown/unstructured
-        logger.debug(f"Trying MarkItDown/unstructured fallback for '{file_name}'")
-        text = _extract_docx_pptx_txt(content, file, None)  # generic fallback
-        return [Section(link=link, text=text)]
-
-    except Exception as e:
-        logger.error(
-            f"Error extracting file content for '{file_name}': {e}", exc_info=True
-        )
-        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
-
-
-def _extract_docx_pptx_txt(
-    content: bytes, file: dict[str, str], file_type: str | None
-) -> str:
-    """
-    Attempts to extract text from DOCX, PPTX, or any supported format using:
-    1. unstructured (if configured)
-    2. docx_to_text/pptx_to_text if known format
-    3. MarkItDown fallback
-    """
-    file_name = file.get("name", file["id"])
-
-    # 1. Try unstructured first
-    if get_unstructured_api_key():
-        try:
-            logger.debug(f"Attempting unstructured extraction for '{file_name}'...")
-            text = unstructured_to_text(io.BytesIO(content), file_name)
-            if text.strip():
-                return text
-            else:
-                logger.warning(f"Unstructured returned empty text for '{file_name}'.")
-        except Exception as e:
-            logger.warning(f"Unstructured extraction failed for '{file_name}': {e}")
-
-    # 2. If format is docx or pptx, try direct extraction methods
-    if file_type == "docx":
-        try:
-            logger.debug(f"Trying docx_to_text for '{file_name}'...")
-            text = docx_to_text(file=io.BytesIO(content))
-            if text.strip():
-                return text
-            else:
-                logger.warning(f"docx_to_text returned empty for '{file_name}'.")
-        except Exception as e:
-            logger.warning(f"docx_to_text failed for '{file_name}': {e}")
-
-    if file_type == "pptx":
-        try:
-            logger.debug(f"Trying pptx_to_text for '{file_name}'...")
-            text = pptx_to_text(file=io.BytesIO(content))
-            if text.strip():
-                return text
-            else:
-                logger.warning(f"pptx_to_text returned empty for '{file_name}'.")
-        except Exception as e:
-            logger.warning(f"pptx_to_text failed for '{file_name}': {e}")
-
-    # 3. Fallback to MarkItDown
-    try:
-        logger.debug(f"Falling back to MarkItDown for '{file_name}'...")
-        md = MarkItDown()
-        result = md.convert(io.BytesIO(content))
-        if result and result.text_content and result.text_content.strip():
-            return result.text_content
-        else:
-            logger.warning(f"MarkItDown returned empty text for '{file_name}'.")
-    except Exception as e:
-        logger.error(
-            f"MarkItDown conversion failed for '{file_name}': {e}", exc_info=True
-        )
-
-    # If all methods fail or return empty, return unsupported message
-    logger.error(
-        f"All extraction methods failed for '{file_name}', returning unsupported file message."
-    )
-    return UNSUPPORTED_FILE_TYPE_CONTENT
+        return None
 
 
 def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
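Both versions of the Sheets logic above share the same core step: converting a 1-based column count into an A1-notation column letter via repeated divmod by 26 (bijective base 26 — the `- 1` before each divmod is what keeps Z from carrying into a second letter, since there is no zero digit). A runnable worked example:

```python
def column_letter(column_count: int) -> str:
    # 1 -> A, 26 -> Z, 27 -> AA, 52 -> AZ, 703 -> AAA
    end_column = ""
    while column_count > 0:
        column_count, remainder = divmod(column_count - 1, 26)
        end_column = chr(65 + remainder) + end_column  # 65 == ord("A")
    return end_column


for n in (1, 26, 27, 52, 703):
    print(n, column_letter(n))  # A, Z, AA, AZ, AAA

# The connector then bounds the fetch with an A1 range per sheet:
print(f"'Sheet1'!A1:{column_letter(26)}1000")  # 'Sheet1'!A1:Z1000
```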
@@ -535,7 +535,7 @@ class VespaIndex(DocumentIndex):
         if self.secondary_index_name:
             index_names.append(self.secondary_index_name)
 
-        with get_vespa_http_client() as http_client:
+        with get_vespa_http_client(http2=False) as http_client:
             for index_name in index_names:
                 params = httpx.QueryParams(
                     {
@@ -546,8 +546,12 @@ class VespaIndex(DocumentIndex):
 
             while True:
                 try:
+                    vespa_url = (
+                        f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}"
+                    )
+                    logger.debug(f'update_single PUT on URL "{vespa_url}"')
                     resp = http_client.put(
-                        f"{DOCUMENT_ID_ENDPOINT.format(index_name=self.index_name)}",
+                        vespa_url,
                         params=params,
                         headers={"Content-Type": "application/json"},
                         json=update_dict,
@@ -619,7 +623,7 @@ class VespaIndex(DocumentIndex):
         if self.secondary_index_name:
             index_names.append(self.secondary_index_name)
 
-        with get_vespa_http_client() as http_client:
+        with get_vespa_http_client(http2=False) as http_client:
             for index_name in index_names:
                 params = httpx.QueryParams(
                     {
@@ -630,8 +634,12 @@ class VespaIndex(DocumentIndex):
 
             while True:
                 try:
+                    vespa_url = (
+                        f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}"
+                    )
+                    logger.debug(f'delete_single DELETE on URL "{vespa_url}"')
                     resp = http_client.delete(
-                        f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}",
+                        vespa_url,
                         params=params,
                     )
                     resp.raise_for_status()
@@ -55,7 +55,9 @@ def remove_invalid_unicode_chars(text: str) -> str:
     return _illegal_xml_chars_RE.sub("", text)
 
 
-def get_vespa_http_client(no_timeout: bool = False) -> httpx.Client:
+def get_vespa_http_client(
+    no_timeout: bool = False, http2: bool = False
+) -> httpx.Client:
     """
     Configure and return an HTTP client for communicating with Vespa,
     including authentication if needed.
@@ -67,5 +69,5 @@ def get_vespa_http_client(no_timeout: bool = False) -> httpx.Client:
         else None,
         verify=False if not MANAGED_VESPA else True,
         timeout=None if no_timeout else VESPA_REQUEST_TIMEOUT,
-        http2=True,
+        http2=http2,
     )
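One detail worth noting about the new `http2` parameter: httpx only speaks HTTP/2 when the optional h2 dependency is installed (`pip install "httpx[http2]"`), and the protocol is negotiated per client. A minimal sketch of the pattern (the timeout value is a placeholder for `VESPA_REQUEST_TIMEOUT`):

```python
import httpx


def make_client(no_timeout: bool = False, http2: bool = False) -> httpx.Client:
    return httpx.Client(
        timeout=None if no_timeout else 10.0,  # placeholder for the real constant
        http2=http2,  # HTTP/2 is now opt-in instead of always on
    )


with make_client(http2=False) as client:
    resp = client.get("https://www.example.com/")
    print(resp.http_version)  # "HTTP/1.1" with http2=False
```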
@@ -14,9 +14,10 @@ from typing import IO
 
 import chardet
 import docx  # type: ignore
+import openpyxl  # type: ignore
 import pptx  # type: ignore
+from docx import Document
 from fastapi import UploadFile
-from markitdown import MarkItDown  # type: ignore
 from pypdf import PdfReader
 from pypdf.errors import PdfStreamError
 
@@ -59,9 +60,6 @@ VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
     ".html",
 ]
 
-# These are the file extensions that we use markitdown for
-MARKITDOWN_FILE_EXTENSIONS = [".docx", ".pptx", ".xlsx"]
-
 
 def is_text_file_extension(file_name: str) -> bool:
     return any(file_name.endswith(ext) for ext in PLAIN_TEXT_FILE_EXTENSIONS)
@@ -76,10 +74,6 @@ def is_valid_file_ext(ext: str) -> bool:
     return ext in VALID_FILE_EXTENSIONS
 
 
-def is_markitdown_file_ext(ext: str) -> bool:
-    return ext in MARKITDOWN_FILE_EXTENSIONS
-
-
 def is_text_file(file: IO[bytes]) -> bool:
     """
     checks if the first 1024 bytes only contain printable or whitespace characters
@@ -191,6 +185,13 @@ def read_text_file(
     return file_content_raw, metadata
 
 
+def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
+    """Extract text from a PDF file."""
+    # Return only the extracted text from read_pdf_file
+    text, _ = read_pdf_file(file, pdf_pass)
+    return text
+
+
 def read_pdf_file(
     file: IO[Any],
     pdf_pass: str | None = None,
@@ -298,11 +299,16 @@ def pptx_to_text(file: IO[Any]) -> str:
     return TEXT_SECTION_SEPARATOR.join(text_content)
 
 
-def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
-    """Extract text from a PDF file."""
-    # Return only the extracted text from read_pdf_file
-    text, _ = read_pdf_file(file, pdf_pass)
-    return text
+def xlsx_to_text(file: IO[Any]) -> str:
+    workbook = openpyxl.load_workbook(file, read_only=True)
+    text_content = []
+    for sheet in workbook.worksheets:
+        sheet_string = "\n".join(
+            ",".join(map(str, row))
+            for row in sheet.iter_rows(min_row=1, values_only=True)
+        )
+        text_content.append(sheet_string)
+    return TEXT_SECTION_SEPARATOR.join(text_content)
 
 
 def eml_to_text(file: IO[Any]) -> str:
@@ -340,6 +346,9 @@ def extract_file_text(
 ) -> str:
     extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
         ".pdf": pdf_to_text,
+        ".docx": docx_to_text,
+        ".pptx": pptx_to_text,
+        ".xlsx": xlsx_to_text,
         ".eml": eml_to_text,
         ".epub": epub_to_text,
         ".html": parse_html_page_basic,
@@ -349,8 +358,6 @@ def extract_file_text(
     if get_unstructured_api_key():
         return unstructured_to_text(file, file_name)
 
-    md = MarkItDown()
-
     if file_name or extension:
         if extension is not None:
             final_extension = extension
@@ -358,12 +365,6 @@ def extract_file_text(
             final_extension = get_file_ext(file_name)
 
         if is_valid_file_ext(final_extension):
-            if is_markitdown_file_ext(final_extension):
-                with BytesIO(file.read()) as file_like_object:
-                    result = md.convert_stream(
-                        file_like_object, file_extension=final_extension
-                    )
-                    return result.text_content
             return extension_to_function.get(final_extension, file_io_to_text)(file)
 
     # Either the file somehow has no name or the extension is not one that we recognize
@@ -381,37 +382,29 @@ def extract_file_text(
     return ""
 
 
-def convert_docx_to_markdown(
+def convert_docx_to_txt(
     file: UploadFile, file_store: FileStore, file_path: str
 ) -> None:
-    try:
-        # Read the file content
-        file_content = file.file.read()
-
-        if not file_content:
-            raise ValueError(f"File {file.filename} is empty")
-
-        # Reset the file pointer to the beginning
-        file.file.seek(0)
-
-        text_content = extract_file_text(
-            file=file.file, file_name=file.filename or "", extension=".docx"
-        )
-
-        if not text_content:
-            raise ValueError(f"Failed to extract text from {file.filename}")
-
-        txt_file_path = docx_to_txt_filename(file_path)
-        file_store.save_file(
-            file_name=txt_file_path,
-            content=BytesIO(text_content.encode("utf-8")),
-            display_name=file.filename,
-            file_origin=FileOrigin.CONNECTOR,
-            file_type="text/plain",
-        )
-    except Exception as e:
-        logger.error(f"Error converting DOCX to Markdown: {str(e)}")
-        raise RuntimeError(f"Failed to process file {file.filename}: {str(e)}") from e
+    file.file.seek(0)
+    docx_content = file.file.read()
+    doc = Document(BytesIO(docx_content))
+
+    # Extract text from the document
+    full_text = []
+    for para in doc.paragraphs:
+        full_text.append(para.text)
+
+    # Join the extracted text
+    text_content = "\n".join(full_text)
+
+    txt_file_path = docx_to_txt_filename(file_path)
+    file_store.save_file(
+        file_name=txt_file_path,
+        content=BytesIO(text_content.encode("utf-8")),
+        display_name=file.filename,
+        file_origin=FileOrigin.CONNECTOR,
+        file_type="text/plain",
+    )
 
 
 def docx_to_txt_filename(file_path: str) -> str:
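With the MarkItDown branch gone, `extract_file_text` goes back to a plain extension-to-callable dispatch with a text-decode fallback. The pattern in isolation (the stub converters stand in for the real ones):

```python
import io
from typing import IO, Any, Callable


def pdf_to_text(file: IO[Any]) -> str:
    return "pdf text"  # stub


def docx_to_text(file: IO[Any]) -> str:
    return "docx text"  # stub


def file_io_to_text(file: IO[Any]) -> str:
    # Fallback: treat unknown extensions as plain text.
    return file.read().decode("utf-8", errors="replace")


EXTENSION_TO_FUNCTION: dict[str, Callable[[IO[Any]], str]] = {
    ".pdf": pdf_to_text,
    ".docx": docx_to_text,
}


def extract(file: IO[Any], extension: str) -> str:
    return EXTENSION_TO_FUNCTION.get(extension, file_io_to_text)(file)


print(extract(io.BytesIO(b"hello"), ".txt"))  # -> "hello"
```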
@@ -453,7 +453,9 @@ class DefaultMultiLLM(LLM):
         if LOG_DANSWER_MODEL_INTERACTIONS:
             self.log_model_configs()
 
-        if DISABLE_LITELLM_STREAMING:
+        if (
+            DISABLE_LITELLM_STREAMING or self.config.model_name == "o1-2024-12-17"
+        ):  # TODO: remove once litellm supports streaming
             yield self.invoke(prompt, tools, tool_choice, structured_response_format)
             return
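The guard extends an existing escape hatch: when streaming is disabled, or the model (o1-2024-12-17 at the time) cannot stream through litellm, the generator yields one complete response instead of tokens, so callers can iterate either way; the TODO anticipates removing the special case once litellm supports streaming for that model. The shape of that fallback, sketched with stubs:

```python
from collections.abc import Iterator

# Point-in-time workaround set, per the TODO in the hunk above.
NON_STREAMING_MODELS = {"o1-2024-12-17"}


def stream(model: str, prompt: str, disable_streaming: bool = False) -> Iterator[str]:
    if disable_streaming or model in NON_STREAMING_MODELS:
        # One-shot: the full response arrives as a single chunk.
        yield invoke(model, prompt)
        return
    yield from stream_tokens(model, prompt)


def invoke(model: str, prompt: str) -> str:
    return f"[{model}] full response to {prompt!r}"  # stub


def stream_tokens(model: str, prompt: str) -> Iterator[str]:
    yield from f"[{model}] streamed response".split()  # stub
```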
@@ -29,6 +29,7 @@ OPENAI_PROVIDER_NAME = "openai"
|
||||
OPEN_AI_MODEL_NAMES = [
|
||||
"o1-mini",
|
||||
"o1-preview",
|
||||
"o1-2024-12-17",
|
||||
"gpt-4",
|
||||
"gpt-4o",
|
||||
"gpt-4o-mini",
|
||||
|
||||
@@ -87,7 +87,7 @@ from onyx.db.models import SearchSettings
 from onyx.db.models import User
 from onyx.db.search_settings import get_current_search_settings
 from onyx.db.search_settings import get_secondary_search_settings
-from onyx.file_processing.extract_file_text import convert_docx_to_markdown
+from onyx.file_processing.extract_file_text import convert_docx_to_txt
 from onyx.file_store.file_store import get_default_file_store
 from onyx.key_value_store.interface import KvKeyNotFoundError
 from onyx.redis.redis_connector import RedisConnector
@@ -396,12 +396,11 @@ def upload_files(
                 file_origin=FileOrigin.CONNECTOR,
                 file_type=file.content_type or "text/plain",
             )
-            file.file.seek(0)
 
             if file.content_type and file.content_type.startswith(
                 "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
             ):
-                convert_docx_to_markdown(file, file_store, file_path)
+                convert_docx_to_txt(file, file_store, file_path)
 
     except ValueError as e:
         raise HTTPException(status_code=400, detail=str(e))
@@ -22,6 +22,7 @@ from onyx.utils.variable_functionality import (
 from onyx.utils.variable_functionality import noop_fallback
 from shared_configs.configs import MULTI_TENANT
 
+
 _DANSWER_TELEMETRY_ENDPOINT = "https://telemetry.onyx.app/anonymous_telemetry"
 _CACHED_UUID: str | None = None
 _CACHED_INSTANCE_DOMAIN: str | None = None
@@ -117,9 +118,12 @@ def mt_cloud_telemetry(
     event: MilestoneRecordType,
     properties: dict | None = None,
 ) -> None:
+    print(f"mt_cloud_telemetry {distinct_id} {event} {properties}")
     if not MULTI_TENANT:
+        print("mt_cloud_telemetry not MULTI_TENANT")
         return
 
+    print("mt_cloud_telemetry MULTI_TENANT")
     # MIT version should not need to include any Posthog code
     # This is only for Onyx MT Cloud, this code should also never be hit, no reason for any orgs to
     # be running the Multi Tenant version of Onyx.
@@ -137,8 +141,11 @@ def create_milestone_and_report(
     properties: dict | None,
     db_session: Session,
 ) -> None:
+    print(f"create_milestone_and_report {user} {event_type} {db_session}")
     _, is_new = create_milestone_if_not_exists(user, event_type, db_session)
+    print(f"create_milestone_and_report {is_new}")
     if is_new:
+        print("create_milestone_and_report is_new")
         mt_cloud_telemetry(
             distinct_id=distinct_id,
             event=event_type,
@@ -29,7 +29,7 @@ trafilatura==1.12.2
 langchain==0.1.17
 langchain-core==0.1.50
 langchain-text-splitters==0.0.1
-litellm==1.54.1
+litellm==1.55.4
 lxml==5.3.0
 lxml_html_clean==0.2.2
 llama-index==0.9.45
@@ -81,5 +81,4 @@ stripe==10.12.0
 urllib3==2.2.3
 mistune==0.8.4
 sentry-sdk==2.14.0
-prometheus_client==0.21.0
-markitdown==0.0.1a3
+prometheus_client==0.21.0

@@ -12,5 +12,5 @@ torch==2.2.0
 transformers==4.39.2
 uvicorn==0.21.1
 voyageai==0.2.3
-litellm==1.54.1
+litellm==1.55.4
 sentry-sdk[fastapi,celery,starlette]==2.14.0
web/public/Amazon.svg (new executable file, 1 line)
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 7.0 KiB

web/public/Meta.svg (new executable file, 9 lines)
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 340 KiB

web/public/Microsoft.svg (new executable file, 6 lines)
@@ -0,0 +1,6 @@
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+<rect x="1.33325" y="1.3335" width="6.33333" height="6.33333" fill="#F25022"/>
+<rect x="8.33325" y="1.3335" width="6.33333" height="6.33333" fill="#80BA01"/>
+<rect x="8.33325" y="8.3335" width="6.33333" height="6.33333" fill="#FFB902"/>
+<rect x="1.33325" y="8.3335" width="6.33333" height="6.33333" fill="#02A4EF"/>
+</svg>
After Width: | Height: | Size: 425 B

web/public/Mistral.svg (new executable file, 1 line)
@@ -0,0 +1 @@
+<svg viewBox="0 0 512 512" xmlns="http://www.w3.org/2000/svg" fill-rule="evenodd" clip-rule="evenodd" stroke-linejoin="round" stroke-miterlimit="2"><path d="M189.08 303.228H94.587l.044-94.446h94.497l-.048 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M283.528 397.674h-94.493l.044-94.446h94.496l-.047 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M283.575 303.228H189.08l.046-94.446h94.496l-.047 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M378.07 303.228h-94.495l.044-94.446h94.498l-.047 94.446zM189.128 208.779H94.633l.044-94.448h94.498l-.047 94.448zM378.115 208.779h-94.494l.045-94.448h94.496l-.047 94.448zM94.587 303.227H.093l.044-96.017h94.496l-.046 96.017z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M94.633 208.779H.138l.046-94.448H94.68l-.047 94.448z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M94.68 115.902H.185L.23 19.885h94.498l-.047 96.017zM472.657 114.331h-94.495l.044-94.446h94.497l-.046 94.446zM94.54 399.244H.046l.044-97.588h94.497l-.047 97.588z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M94.495 492.123H0l.044-94.446H94.54l-.045 94.446zM472.563 303.228H378.07l.044-94.446h94.496l-.047 94.446zM472.61 208.779h-94.495l.044-94.448h94.498l-.047 94.448z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M472.517 397.674h-94.494l.044-94.446h94.497l-.047 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M472.47 492.121h-94.493l.044-96.017h94.496l-.047 96.017z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M228.375 303.22h-96.061l.046-94.446h96.067l-.052 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M322.827 397.666h-94.495l.044-96.018h94.498l-.047 96.018z" fill="#ff4900" fill-rule="nonzero"/><path d="M324.444 303.22h-97.636l.046-94.446h97.638l-.048 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M418.938 303.22h-96.064l.045-94.446h96.066l-.047 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M228.423 208.77H132.36l.045-94.445h96.066l-.05 94.446zM418.985 208.77H322.92l.044-94.445h96.069l-.048 94.446z" fill="#ffa300" fill-rule="nonzero"/><path d="M133.883 304.79H39.392l.044-96.017h94.496l-.049 96.017z" fill="#ff7000" fill-rule="nonzero"/><path d="M133.929 208.77H39.437l.044-95.445h94.496l-.048 95.445z" fill="#ffa300" fill-rule="nonzero"/><path d="M133.976 114.325H39.484l.044-94.448h94.497l-.05 94.448zM511.954 115.325h-94.493l.044-95.448h94.497l-.048 95.448z" fill="#ffce00" fill-rule="nonzero"/><path d="M133.836 399.667H39.345l.044-96.447h94.496l-.049 96.447z" fill="#ff4900" fill-rule="nonzero"/><path d="M133.79 492.117H39.3l.044-94.448h94.496l-.049 94.448z" fill="#ff0107" fill-rule="nonzero"/><path d="M511.862 303.22h-94.495l.046-94.446h94.496l-.047 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M511.907 208.77h-94.493l.044-94.445h94.496l-.047 94.446z" fill="#ffa300" fill-rule="nonzero"/><path d="M511.815 398.666h-94.493l.044-95.447h94.496l-.047 95.447z" fill="#ff4900" fill-rule="nonzero"/><path d="M511.77 492.117h-94.496l.046-94.448h94.496l-.047 94.448z" fill="#ff0107" fill-rule="nonzero"/></svg>
After Width: | Height: | Size: 2.9 KiB
@@ -1,8 +1,12 @@
 import {
   AnthropicIcon,
+  AmazonIcon,
   AWSIcon,
   AzureIcon,
   CPUIcon,
+  MicrosoftIconSVG,
+  MistralIcon,
+  MetaIcon,
   OpenAIIcon,
   GeminiIcon,
   OpenSourceIcon,
@@ -72,12 +76,25 @@ export const getProviderIcon = (providerName: string, modelName?: string) => {
   switch (providerName) {
     case "openai":
+      // Special cases for openai based on modelName
+      if (modelName?.toLowerCase().includes("amazon")) {
+        return AmazonIcon;
+      }
+      if (modelName?.toLowerCase().includes("phi")) {
+        return MicrosoftIconSVG;
+      }
+      if (modelName?.toLowerCase().includes("mistral")) {
+        return MistralIcon;
+      }
+      if (modelName?.toLowerCase().includes("llama")) {
+        return MetaIcon;
+      }
+      if (modelName?.toLowerCase().includes("gemini")) {
+        return GeminiIcon;
+      }
+      if (modelName?.toLowerCase().includes("claude")) {
+        return AnthropicIcon;
+      }
+
       return OpenAIIcon; // Default for openai
     case "anthropic":
       return AnthropicIcon;
@@ -6,7 +6,6 @@ import {
 } from "@/app/chat/message/MemoizedTextComponents";
 import React, { useMemo } from "react";
 import ReactMarkdown from "react-markdown";
-import rehypePrism from "rehype-prism-plus";
 import remarkGfm from "remark-gfm";
 
 interface MinimalMarkdownProps {
@@ -36,10 +35,9 @@ export const MinimalMarkdown: React.FC<MinimalMarkdownProps> = ({
 
   return (
     <ReactMarkdown
-      className={`prose max-w-full text-base ${className}`}
+      className={`w-full text-wrap break-word ${className}`}
       components={markdownComponents}
       remarkPlugins={[remarkGfm]}
-      rehypePlugins={[[rehypePrism, { ignoreMissing: true }]]}
     >
       {content}
     </ReactMarkdown>
@@ -21,11 +21,11 @@ export default function TextView({
   onClose,
 }: TextViewProps) {
   const [zoom, setZoom] = useState(100);
-  const [fileContent, setFileContent] = useState("");
-  const [fileUrl, setFileUrl] = useState("");
-  const [fileName, setFileName] = useState("");
+  const [fileContent, setFileContent] = useState<string>("");
+  const [fileUrl, setFileUrl] = useState<string>("");
+  const [fileName, setFileName] = useState<string>("");
   const [isLoading, setIsLoading] = useState(true);
-  const [fileType, setFileType] = useState("application/octet-stream");
+  const [fileType, setFileType] = useState<string>("application/octet-stream");
 
   const isMarkdownFormat = (mimeType: string): boolean => {
     const markdownFormats = [
@@ -51,17 +51,18 @@ export default function TextView({
 
   const fetchFile = useCallback(async () => {
     setIsLoading(true);
-    const fileId = presentingDocument.document_id.split("__")[1];
     try {
+      const fileId = presentingDocument.document_id.split("__")[1];
       const response = await fetch(
-        `/api/chat/file/${encodeURIComponent(fileId)}`
+        `/api/chat/file/${encodeURIComponent(fileId)}`,
+        {
+          method: "GET",
+        }
       );
       const blob = await response.blob();
 
       const url = window.URL.createObjectURL(blob);
       setFileUrl(url);
       setFileName(presentingDocument.semantic_identifier || "document");
 
       const contentType =
         response.headers.get("Content-Type") || "application/octet-stream";
       setFileType(contentType);
@@ -69,28 +70,9 @@ export default function TextView({
       if (isMarkdownFormat(blob.type)) {
         const text = await blob.text();
         setFileContent(text);
-      } else if (blob.type === "application/octet-stream") {
-        try {
-          const text = await blob.text();
-          let nonPrintingCount = 0;
-          for (let i = 0; i < text.length; i++) {
-            const code = text.charCodeAt(i);
-            if (code < 32 && ![9, 10, 13].includes(code)) {
-              nonPrintingCount++;
-            }
-          }
-          const ratio = nonPrintingCount / text.length;
-
-          if (ratio < 0.05) {
-            setFileContent(text);
-            setFileType("text/plain");
-          }
-        } catch (err) {
-          console.error("Failed to parse octet-stream as text", err);
-        }
       }
-    } catch (err) {
-      console.error("Error fetching file:", err);
+    } catch (error) {
+      console.error("Error fetching file:", error);
     } finally {
       setTimeout(() => {
         setIsLoading(false);
@@ -155,7 +137,7 @@ export default function TextView({
         </div>
       ) : (
         <div
-          className="w-full h-full transform origin-center transition-transform duration-300 ease-in-out"
+          className={`w-full h-full transform origin-center transition-transform duration-300 ease-in-out`}
           style={{ transform: `scale(${zoom / 100})` }}
         >
           {isSupportedIframeFormat(fileType) ? (
@@ -164,7 +146,7 @@ export default function TextView({
               className="w-full h-full border-none"
               title="File Viewer"
             />
-          ) : isMarkdownFormat(fileType) || fileType === "text/plain" ? (
+          ) : isMarkdownFormat(fileType) ? (
            <div className="w-full h-full p-6 overflow-y-scroll overflow-x-hidden">
              <MinimalMarkdown
                content={fileContent}
@@ -39,7 +39,10 @@ import Image, { StaticImageData } from "next/image";
|
||||
import jiraSVG from "../../../public/Jira.svg";
|
||||
import confluenceSVG from "../../../public/Confluence.svg";
|
||||
import openAISVG from "../../../public/Openai.svg";
|
||||
import amazonSVG from "../../../public/Amazon.svg";
|
||||
import geminiSVG from "../../../public/Gemini.svg";
|
||||
import metaSVG from "../../../public/Meta.svg";
|
||||
import mistralSVG from "../../../public/Mistral.svg";
|
||||
import openSourceIcon from "../../../public/OpenSource.png";
|
||||
import litellmIcon from "../../../public/LiteLLM.jpg";
|
||||
|
||||
@@ -49,6 +52,7 @@ import asanaIcon from "../../../public/Asana.png";
|
||||
import anthropicSVG from "../../../public/Anthropic.svg";
|
||||
import nomicSVG from "../../../public/nomic.svg";
|
||||
import microsoftIcon from "../../../public/microsoft.png";
|
||||
import microsoftSVG from "../../../public/Microsoft.svg";
|
||||
import mixedBreadSVG from "../../../public/Mixedbread.png";
|
||||
|
||||
import OCIStorageSVG from "../../../public/OCI.svg";
|
||||
@@ -1104,6 +1108,26 @@ export const GeminiIcon = ({
|
||||
className = defaultTailwindCSS,
|
||||
}: IconProps) => <LogoIcon size={size} className={className} src={geminiSVG} />;
|
||||
|
||||
export const AmazonIcon = ({
|
||||
size = 16,
|
||||
className = defaultTailwindCSS,
|
||||
}: IconProps) => <LogoIcon size={size} className={className} src={amazonSVG} />;
|
||||
|
||||
export const MetaIcon = ({
|
||||
size = 16,
|
||||
className = defaultTailwindCSS,
|
||||
}: IconProps) => <LogoIcon size={size} className={className} src={metaSVG} />;
|
||||
|
||||
export const MicrosoftIconSVG = ({
|
||||
size = 16,
|
||||
className = defaultTailwindCSS,
|
||||
}: IconProps) => <LogoIcon size={size} className={className} src={microsoftSVG} />;
|
||||
|
||||
export const MistralIcon = ({
|
||||
size = 16,
|
||||
className = defaultTailwindCSS,
|
||||
}: IconProps) => <LogoIcon size={size} className={className} src={mistralSVG} />;
|
||||
|
||||
export const VoyageIcon = ({
|
||||
size = 16,
|
||||
className = defaultTailwindCSS,
|
||||
|
||||
@@ -299,6 +299,7 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
   // OpenAI models
   "o1-mini": "O1 Mini",
   "o1-preview": "O1 Preview",
+  "o1-2024-12-17": "O1",
   "gpt-4": "GPT 4",
   "gpt-4o": "GPT 4o",
   "gpt-4o-2024-08-06": "GPT 4o (Structured Outputs)",
@@ -318,6 +319,21 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
   "gpt-3.5-turbo-16k-0613": "GPT 3.5 Turbo 16k (June 2023)",
   "gpt-3.5-turbo-0301": "GPT 3.5 Turbo (March 2023)",
 
+  // Amazon models
+  "amazon.nova-micro@v1": "Amazon Nova Micro",
+  "amazon.nova-lite@v1": "Amazon Nova Lite",
+  "amazon.nova-pro@v1": "Amazon Nova Pro",
+
+  // Meta models
+  "llama-3.2-90b-vision-instruct": "Llama 3.2 90B",
+  "llama-3.2-11b-vision-instruct": "Llama 3.2 11B",
+  "llama-3.3-70b-instruct": "Llama 3.3 70B",
+
+  // Microsoft models
+  "phi-3.5-mini-instruct": "Phi 3.5 Mini",
+  "phi-3.5-moe-instruct": "Phi 3.5 MoE",
+  "phi-3.5-vision-instruct": "Phi 3.5 Vision",
+
   // Anthropic models
   "claude-3-opus-20240229": "Claude 3 Opus",
   "claude-3-sonnet-20240229": "Claude 3 Sonnet",
@@ -329,6 +345,9 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
   "claude-3-5-sonnet-20241022": "Claude 3.5 Sonnet (New)",
   "claude-3-5-sonnet-v2@20241022": "Claude 3.5 Sonnet (New)",
+  "claude-3.5-sonnet-v2@20241022": "Claude 3.5 Sonnet (New)",
   "claude-3-5-haiku-20241022": "Claude 3.5 Haiku",
+  "claude-3-5-haiku@20241022": "Claude 3.5 Haiku",
+  "claude-3.5-haiku@20241022": "Claude 3.5 Haiku",
 
   // Google Models
   "gemini-1.5-pro": "Gemini 1.5 Pro",
@@ -337,6 +356,11 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
   "gemini-1.5-flash-001": "Gemini 1.5 Flash",
   "gemini-1.5-pro-002": "Gemini 1.5 Pro (v2)",
   "gemini-1.5-flash-002": "Gemini 1.5 Flash (v2)",
+  "gemini-2.0-flash-exp": "Gemini 2.0 Flash (Experimental)",
+
+  // Mistral Models
+  "mistral-large-2411": "Mistral Large 24.11",
+  "mistral-large@2411": "Mistral Large 24.11",
 
   // Bedrock models
   "meta.llama3-1-70b-instruct-v1:0": "Llama 3.1 70B",
@@ -74,6 +74,8 @@ const MODEL_NAMES_SUPPORTING_IMAGE_INPUT = [
   "claude-3-opus-20240229",
   "claude-3-sonnet-20240229",
   "claude-3-haiku-20240307",
+  // custom claude names
+  "claude-3.5-sonnet-v2@20241022",
   // claude names with AWS Bedrock Suffix
   "claude-3-opus-20240229-v1:0",
   "claude-3-sonnet-20240229-v1:0",
@@ -93,6 +95,13 @@ const MODEL_NAMES_SUPPORTING_IMAGE_INPUT = [
   "gemini-1.5-flash-001",
   "gemini-1.5-pro-002",
   "gemini-1.5-flash-002",
+  "gemini-2.0-flash-exp",
+  // amazon models
+  "amazon.nova-lite@v1",
+  "amazon.nova-pro@v1",
+  // meta models
+  "llama-3.2-90b-vision-instruct",
+  "llama-3.2-11b-vision-instruct"
 ];
 
 export function checkLLMSupportsImageInput(model: string) {