Compare commits

1 commit

Author: pablonyx | SHA1: 9ece8f68f3 | Message: k | Date: 2025-03-03 18:18:47 -08:00
91 changed files with 1290 additions and 4551 deletions


@@ -6,8 +6,7 @@ Create Date: 2025-02-26 13:07:56.217791
"""
from alembic import op
import time
from sqlalchemy import text
# revision identifiers, used by Alembic.
revision = "3bd4c84fe72f"
@@ -28,357 +27,45 @@ depends_on = None
# 4. Adds indexes to both chat_message and chat_session tables for comprehensive search
def upgrade():
# --- PART 1: chat_message table ---
# Step 1: Add nullable column (quick, minimal locking)
# op.execute("ALTER TABLE chat_message DROP COLUMN IF EXISTS message_tsv")
# op.execute("DROP TRIGGER IF EXISTS chat_message_tsv_trigger ON chat_message")
# op.execute("DROP FUNCTION IF EXISTS update_chat_message_tsv()")
# op.execute("ALTER TABLE chat_message DROP COLUMN IF EXISTS message_tsv")
# # Drop chat_session tsv trigger if it exists
# op.execute("DROP TRIGGER IF EXISTS chat_session_tsv_trigger ON chat_session")
# op.execute("DROP FUNCTION IF EXISTS update_chat_session_tsv()")
# op.execute("ALTER TABLE chat_session DROP COLUMN IF EXISTS title_tsv")
# raise Exception("Stop here")
time.time()
op.execute("ALTER TABLE chat_message ADD COLUMN IF NOT EXISTS message_tsv tsvector")
# Step 2: Create function and trigger for new/updated rows
def upgrade() -> None:
# Create a GIN index for full-text search on chat_message.message
op.execute(
"""
CREATE OR REPLACE FUNCTION update_chat_message_tsv()
RETURNS TRIGGER AS $$
BEGIN
NEW.message_tsv = to_tsvector('english', NEW.message);
RETURN NEW;
END;
$$ LANGUAGE plpgsql
"""
ALTER TABLE chat_message
ADD COLUMN message_tsv tsvector
GENERATED ALWAYS AS (to_tsvector('english', message)) STORED;
"""
)
# Create trigger in a separate execute call
# Commit the current transaction before creating concurrent indexes
op.execute("COMMIT")
op.execute(
"""
CREATE TRIGGER chat_message_tsv_trigger
BEFORE INSERT OR UPDATE ON chat_message
FOR EACH ROW EXECUTE FUNCTION update_chat_message_tsv()
"""
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_message_tsv
ON chat_message
USING GIN (message_tsv)
"""
)
# Step 3: Update existing rows in batches using Python
time.time()
# Get connection and count total rows
connection = op.get_bind()
total_count_result = connection.execute(
text("SELECT COUNT(*) FROM chat_message")
).scalar()
total_count = total_count_result if total_count_result is not None else 0
batch_size = 5000
batches = 0
# Calculate total batches needed
total_batches = (
(total_count + batch_size - 1) // batch_size if total_count > 0 else 0
# Also add a stored tsvector column for chat_session.description
op.execute(
"""
ALTER TABLE chat_session
ADD COLUMN description_tsv tsvector
GENERATED ALWAYS AS (to_tsvector('english', coalesce(description, ''))) STORED;
"""
)
# Process in batches - properly handling UUIDs by using OFFSET/LIMIT approach
for batch_num in range(total_batches):
offset = batch_num * batch_size
# Commit again before creating the second concurrent index
op.execute("COMMIT")
# Execute update for this batch using OFFSET/LIMIT which works with UUIDs
connection.execute(
text(
"""
UPDATE chat_message
SET message_tsv = to_tsvector('english', message)
WHERE id IN (
SELECT id FROM chat_message
WHERE message_tsv IS NULL
ORDER BY id
LIMIT :batch_size OFFSET :offset
)
"""
).bindparams(batch_size=batch_size, offset=offset)
)
# Commit each batch
connection.execute(text("COMMIT"))
# Start a new transaction
connection.execute(text("BEGIN"))
batches += 1
# Final check for any remaining NULL values
connection.execute(
text(
"""
UPDATE chat_message SET message_tsv = to_tsvector('english', message)
WHERE message_tsv IS NULL
"""
)
)
# Create GIN index concurrently
connection.execute(text("COMMIT"))
time.time()
connection.execute(
text(
"""
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_message_tsv
ON chat_message USING GIN (message_tsv)
"""
)
)
# First drop the trigger as it won't be needed anymore
connection.execute(
text(
"""
DROP TRIGGER IF EXISTS chat_message_tsv_trigger ON chat_message;
"""
)
)
connection.execute(
text(
"""
DROP FUNCTION IF EXISTS update_chat_message_tsv();
"""
)
)
# Add new generated column
time.time()
connection.execute(
text(
"""
ALTER TABLE chat_message
ADD COLUMN message_tsv_gen tsvector
GENERATED ALWAYS AS (to_tsvector('english', message)) STORED;
"""
)
)
connection.execute(text("COMMIT"))
time.time()
connection.execute(
text(
"""
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_message_tsv_gen
ON chat_message USING GIN (message_tsv_gen)
"""
)
)
# Drop old index and column
connection.execute(text("COMMIT"))
connection.execute(
text(
"""
DROP INDEX CONCURRENTLY IF EXISTS idx_chat_message_tsv;
"""
)
)
connection.execute(text("COMMIT"))
connection.execute(
text(
"""
ALTER TABLE chat_message DROP COLUMN message_tsv;
"""
)
)
# Rename new column to old name
connection.execute(
text(
"""
ALTER TABLE chat_message RENAME COLUMN message_tsv_gen TO message_tsv;
"""
)
)
# --- PART 2: chat_session table ---
# Step 1: Add nullable column (quick, minimal locking)
time.time()
connection.execute(
text(
"ALTER TABLE chat_session ADD COLUMN IF NOT EXISTS description_tsv tsvector"
)
)
# Step 2: Create function and trigger for new/updated rows - SPLIT INTO SEPARATE CALLS
connection.execute(
text(
"""
CREATE OR REPLACE FUNCTION update_chat_session_tsv()
RETURNS TRIGGER AS $$
BEGIN
NEW.description_tsv = to_tsvector('english', COALESCE(NEW.description, ''));
RETURN NEW;
END;
$$ LANGUAGE plpgsql
"""
)
)
# Create trigger in a separate execute call
connection.execute(
text(
"""
CREATE TRIGGER chat_session_tsv_trigger
BEFORE INSERT OR UPDATE ON chat_session
FOR EACH ROW EXECUTE FUNCTION update_chat_session_tsv()
"""
)
)
# Step 3: Update existing rows in batches using Python
time.time()
# Get the maximum ID to determine batch count
# Cast id to text for MAX function since it's a UUID
max_id_result = connection.execute(
text("SELECT COALESCE(MAX(id::text), '0') FROM chat_session")
).scalar()
max_id_result if max_id_result is not None else "0"
batch_size = 5000
batches = 0
# Get all IDs ordered to process in batches
rows = connection.execute(
text("SELECT id FROM chat_session ORDER BY id")
).fetchall()
total_rows = len(rows)
# Process in batches
for batch_num, batch_start in enumerate(range(0, total_rows, batch_size)):
batch_end = min(batch_start + batch_size, total_rows)
batch_ids = [row[0] for row in rows[batch_start:batch_end]]
if not batch_ids:
continue
# Use IN clause instead of BETWEEN for UUIDs
placeholders = ", ".join([f":id{i}" for i in range(len(batch_ids))])
params = {f"id{i}": id_val for i, id_val in enumerate(batch_ids)}
# Execute update for this batch
connection.execute(
text(
f"""
UPDATE chat_session
SET description_tsv = to_tsvector('english', COALESCE(description, ''))
WHERE id IN ({placeholders})
AND description_tsv IS NULL
"""
).bindparams(**params)
)
# Commit each batch
connection.execute(text("COMMIT"))
# Start a new transaction
connection.execute(text("BEGIN"))
batches += 1
# Final check for any remaining NULL values
connection.execute(
text(
"""
UPDATE chat_session SET description_tsv = to_tsvector('english', COALESCE(description, ''))
WHERE description_tsv IS NULL
"""
)
)
# Create GIN index concurrently
connection.execute(text("COMMIT"))
time.time()
connection.execute(
text(
"""
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_session_desc_tsv
ON chat_session USING GIN (description_tsv)
"""
)
)
# After Final check for chat_session
# First drop the trigger as it won't be needed anymore
connection.execute(
text(
"""
DROP TRIGGER IF EXISTS chat_session_tsv_trigger ON chat_session;
"""
)
)
connection.execute(
text(
"""
DROP FUNCTION IF EXISTS update_chat_session_tsv();
"""
)
)
# Add new generated column
time.time()
connection.execute(
text(
"""
ALTER TABLE chat_session
ADD COLUMN description_tsv_gen tsvector
GENERATED ALWAYS AS (to_tsvector('english', COALESCE(description, ''))) STORED;
"""
)
)
# Create new index on generated column
connection.execute(text("COMMIT"))
time.time()
connection.execute(
text(
"""
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_session_desc_tsv_gen
ON chat_session USING GIN (description_tsv_gen)
"""
)
)
# Drop old index and column
connection.execute(text("COMMIT"))
connection.execute(
text(
"""
DROP INDEX CONCURRENTLY IF EXISTS idx_chat_session_desc_tsv;
"""
)
)
connection.execute(text("COMMIT"))
connection.execute(
text(
"""
ALTER TABLE chat_session DROP COLUMN description_tsv;
"""
)
)
# Rename new column to old name
connection.execute(
text(
"""
ALTER TABLE chat_session RENAME COLUMN description_tsv_gen TO description_tsv;
"""
)
op.execute(
"""
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_session_desc_tsv
ON chat_session
USING GIN (description_tsv)
"""
)
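The new migration body above replaces the old trigger-plus-batched-backfill strategy with STORED generated columns: Postgres computes the tsvector for existing and future rows itself, so the trigger function and the Python batch loop shown in the removed code become unnecessary. A minimal sketch of the pattern (illustrative, not the verbatim migration):

from alembic import op

def upgrade() -> None:
    # A STORED generated column stays in sync automatically; Postgres
    # backfills it for existing rows when the column is added.
    op.execute(
        """
        ALTER TABLE chat_message
        ADD COLUMN message_tsv tsvector
        GENERATED ALWAYS AS (to_tsvector('english', message)) STORED;
        """
    )
    # CREATE INDEX CONCURRENTLY cannot run inside a transaction block,
    # so the migration commits first, then builds the GIN index without
    # holding a long write lock on the table.
    op.execute("COMMIT")
    op.execute(
        """
        CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_message_tsv
        ON chat_message USING GIN (message_tsv)
        """
    )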


@@ -134,9 +134,7 @@ def fetch_chat_sessions_eagerly_by_time(
limit: int | None = 500,
initial_time: datetime | None = None,
) -> list[ChatSession]:
"""Sorted by oldest to newest, then by message id"""
asc_time_order: UnaryExpression = asc(ChatSession.time_created)
time_order: UnaryExpression = desc(ChatSession.time_created)
message_order: UnaryExpression = asc(ChatMessage.id)
filters: list[ColumnElement | BinaryExpression] = [
@@ -149,7 +147,8 @@ def fetch_chat_sessions_eagerly_by_time(
subquery = (
db_session.query(ChatSession.id, ChatSession.time_created)
.filter(*filters)
.order_by(asc_time_order)
.order_by(ChatSession.id, time_order)
.distinct(ChatSession.id)
.limit(limit)
.subquery()
)
@@ -165,7 +164,7 @@ def fetch_chat_sessions_eagerly_by_time(
ChatMessage.chat_message_feedbacks
),
)
.order_by(asc_time_order, message_order)
.order_by(time_order, message_order)
)
chat_sessions = query.all()
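A note on the reordered subquery: Postgres's SELECT DISTINCT ON (col) requires the ORDER BY list to start with that same column, which is why ChatSession.id now leads the ordering. An illustrative re-statement of the subquery under that constraint:

from sqlalchemy import desc

# DISTINCT ON (id) demands that ORDER BY lead with id; the descending
# time ordering then sorts rows within each id.
subquery = (
    db_session.query(ChatSession.id, ChatSession.time_created)
    .filter(*filters)
    .order_by(ChatSession.id, desc(ChatSession.time_created))
    .distinct(ChatSession.id)
    .limit(limit)
    .subquery()
)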


@@ -16,18 +16,13 @@ from onyx.db.models import UsageReport
from onyx.file_store.file_store import get_default_file_store
# Gets skeletons of all messages in the given range
# Gets skeletons of all messages
def get_empty_chat_messages_entries__paginated(
db_session: Session,
period: tuple[datetime, datetime],
limit: int | None = 500,
initial_time: datetime | None = None,
) -> tuple[Optional[datetime], list[ChatMessageSkeleton]]:
"""Returns a tuple where:
first element is the most recent timestamp out of the sessions iterated
- this timestamp can be used to paginate forward in time
second element is a list of messages belonging to all the sessions iterated
"""
chat_sessions = fetch_chat_sessions_eagerly_by_time(
start=period[0],
end=period[1],
@@ -57,17 +52,18 @@ def get_empty_chat_messages_entries__paginated(
if len(chat_sessions) == 0:
return None, []
return chat_sessions[-1].time_created, message_skeletons
return chat_sessions[0].time_created, message_skeletons
def get_all_empty_chat_message_entries(
db_session: Session,
period: tuple[datetime, datetime],
) -> Generator[list[ChatMessageSkeleton], None, None]:
"""period is the range of time over which to fetch messages."""
initial_time: Optional[datetime] = period[0]
ind = 0
while True:
# iterate from oldest to newest
ind += 1
time_created, message_skeletons = get_empty_chat_messages_entries__paginated(
db_session,
period,
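The generator above is cut off by the diff context; a hedged sketch of how the loop plausibly completes, given that the paginated helper returns (None, []) once no sessions remain (assumed continuation, not the commit's verbatim code):

while True:
    ind += 1
    time_created, message_skeletons = get_empty_chat_messages_entries__paginated(
        db_session,
        period,
        initial_time=initial_time,
    )
    if time_created is None:
        return  # (None, []) signals no sessions left in the window
    yield message_skeletons
    initial_time = time_created  # cursor for the next page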


@@ -104,14 +104,14 @@ async def provision_tenant(tenant_id: str, email: str) -> None:
status_code=409, detail="User already belongs to an organization"
)
logger.debug(f"Provisioning tenant {tenant_id} for user {email}")
logger.info(f"Provisioning tenant: {tenant_id}")
token = None
try:
if not create_schema_if_not_exists(tenant_id):
logger.debug(f"Created schema for tenant {tenant_id}")
logger.info(f"Created schema for tenant {tenant_id}")
else:
logger.debug(f"Schema already exists for tenant {tenant_id}")
logger.info(f"Schema already exists for tenant {tenant_id}")
token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)


@@ -6,7 +6,7 @@ MODEL_WARM_UP_STRING = "hi " * 512
DEFAULT_OPENAI_MODEL = "text-embedding-3-small"
DEFAULT_COHERE_MODEL = "embed-english-light-v3.0"
DEFAULT_VOYAGE_MODEL = "voyage-large-2-instruct"
DEFAULT_VERTEX_MODEL = "text-embedding-005"
DEFAULT_VERTEX_MODEL = "text-embedding-004"
class EmbeddingModelTextType:


@@ -5,7 +5,6 @@ from types import TracebackType
from typing import cast
from typing import Optional
import aioboto3 # type: ignore
import httpx
import openai
import vertexai # type: ignore
@@ -29,13 +28,11 @@ from model_server.constants import DEFAULT_VERTEX_MODEL
from model_server.constants import DEFAULT_VOYAGE_MODEL
from model_server.constants import EmbeddingModelTextType
from model_server.constants import EmbeddingProvider
from model_server.utils import pass_aws_key
from model_server.utils import simple_log_function_time
from onyx.utils.logger import setup_logger
from shared_configs.configs import API_BASED_EMBEDDING_TIMEOUT
from shared_configs.configs import INDEXING_ONLY
from shared_configs.configs import OPENAI_EMBEDDING_TIMEOUT
from shared_configs.configs import VERTEXAI_EMBEDDING_LOCAL_BATCH_SIZE
from shared_configs.enums import EmbedTextType
from shared_configs.enums import RerankerProvider
from shared_configs.model_server_models import Embedding
@@ -185,24 +182,17 @@ class CloudEmbedding:
vertexai.init(project=project_id, credentials=credentials)
client = TextEmbeddingModel.from_pretrained(model)
inputs = [TextEmbeddingInput(text, embedding_type) for text in texts]
# Split into batches of 25 texts
max_texts_per_batch = VERTEXAI_EMBEDDING_LOCAL_BATCH_SIZE
batches = [
inputs[i : i + max_texts_per_batch]
for i in range(0, len(inputs), max_texts_per_batch)
]
# Dispatch all embedding calls asynchronously at once
tasks = [
client.get_embeddings_async(batch, auto_truncate=True) for batch in batches
]
# Wait for all tasks to complete in parallel
results = await asyncio.gather(*tasks)
return [embedding.values for batch in results for embedding in batch]
embeddings = await client.get_embeddings_async(
[
TextEmbeddingInput(
text,
embedding_type,
)
for text in texts
],
auto_truncate=True, # This is the default
)
return [embedding.values for embedding in embeddings]
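For contrast, the removed code above split the inputs into VERTEXAI_EMBEDDING_LOCAL_BATCH_SIZE-sized batches and dispatched them all concurrently; distilled into a self-contained sketch (client is assumed to be the Vertex TextEmbeddingModel from the surrounding code):

import asyncio

async def embed_in_batches(client, inputs, batch_size=25):
    # Split into fixed-size batches, dispatch every embedding call at
    # once, then flatten the per-batch results into one list of vectors.
    batches = [inputs[i : i + batch_size] for i in range(0, len(inputs), batch_size)]
    results = await asyncio.gather(
        *(client.get_embeddings_async(batch, auto_truncate=True) for batch in batches)
    )
    return [embedding.values for batch in results for embedding in batch]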
async def _embed_litellm_proxy(
self, texts: list[str], model_name: str | None
@@ -457,7 +447,7 @@ async def local_rerank(query: str, docs: list[str], model_name: str) -> list[flo
)
async def cohere_rerank_api(
async def cohere_rerank(
query: str, docs: list[str], model_name: str, api_key: str
) -> list[float]:
cohere_client = CohereAsyncClient(api_key=api_key)
@@ -467,45 +457,6 @@ async def cohere_rerank_api(
return [result.relevance_score for result in sorted_results]
async def cohere_rerank_aws(
query: str,
docs: list[str],
model_name: str,
region_name: str,
aws_access_key_id: str,
aws_secret_access_key: str,
) -> list[float]:
session = aioboto3.Session(
aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key
)
async with session.client(
"bedrock-runtime", region_name=region_name
) as bedrock_client:
body = json.dumps(
{
"query": query,
"documents": docs,
"api_version": 2,
}
)
# Invoke the Bedrock model asynchronously
response = await bedrock_client.invoke_model(
modelId=model_name,
accept="application/json",
contentType="application/json",
body=body,
)
# Read the response asynchronously
response_body = json.loads(await response["body"].read())
# Extract and sort the results
results = response_body.get("results", [])
sorted_results = sorted(results, key=lambda item: item["index"])
return [result["relevance_score"] for result in sorted_results]
async def litellm_rerank(
query: str, docs: list[str], api_url: str, model_name: str, api_key: str | None
) -> list[float]:
@@ -621,32 +572,15 @@ async def process_rerank_request(rerank_request: RerankRequest) -> RerankRespons
elif rerank_request.provider_type == RerankerProvider.COHERE:
if rerank_request.api_key is None:
raise RuntimeError("Cohere Rerank Requires an API Key")
sim_scores = await cohere_rerank_api(
sim_scores = await cohere_rerank(
query=rerank_request.query,
docs=rerank_request.documents,
model_name=rerank_request.model_name,
api_key=rerank_request.api_key,
)
return RerankResponse(scores=sim_scores)
elif rerank_request.provider_type == RerankerProvider.BEDROCK:
if rerank_request.api_key is None:
raise RuntimeError("Bedrock Rerank Requires an API Key")
aws_access_key_id, aws_secret_access_key, aws_region = pass_aws_key(
rerank_request.api_key
)
sim_scores = await cohere_rerank_aws(
query=rerank_request.query,
docs=rerank_request.documents,
model_name=rerank_request.model_name,
region_name=aws_region,
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
)
return RerankResponse(scores=sim_scores)
else:
raise ValueError(f"Unsupported provider: {rerank_request.provider_type}")
except Exception as e:
logger.exception(f"Error during reranking process:\n{str(e)}")
raise HTTPException(


@@ -70,32 +70,3 @@ def get_gpu_type() -> str:
return GPUStatus.MAC_MPS
return GPUStatus.NONE
def pass_aws_key(api_key: str) -> tuple[str, str, str]:
"""Parse AWS API key string into components.
Args:
api_key: String in format 'aws_ACCESSKEY_SECRETKEY_REGION'
Returns:
Tuple of (access_key, secret_key, region)
Raises:
ValueError: If key format is invalid
"""
if not api_key.startswith("aws"):
raise ValueError("API key must start with 'aws' prefix")
parts = api_key.split("_")
if len(parts) != 4:
raise ValueError(
f"API key must be in format 'aws_ACCESSKEY_SECRETKEY_REGION', got {len(parts) - 1} parts"
"this is an onyx specific format for formatting the aws secrets for bedrock"
)
try:
_, aws_access_key_id, aws_secret_access_key, aws_region = parts
return aws_access_key_id, aws_secret_access_key, aws_region
except Exception as e:
raise ValueError(f"Failed to parse AWS key components: {str(e)}")


@@ -98,16 +98,8 @@ def choose_tool(
# For tool calling LLMs, we want to insert the task prompt as part of this flow, this is because the LLM
# may choose to not call any tools and just generate the answer, in which case the task prompt is needed.
prompt=built_prompt,
tools=(
[tool.tool_definition() for tool in tools] or None
if using_tool_calling_llm
else None
),
tool_choice=(
"required"
if tools and force_use_tool.force_use and using_tool_calling_llm
else None
),
tools=[tool.tool_definition() for tool in tools] or None,
tool_choice=("required" if tools and force_use_tool.force_use else None),
structured_response_format=structured_response_format,
)
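One subtlety in the tools argument: the `or None` idiom normalizes an empty tool list to None, so a provider never receives an empty tools array. For instance:

tools = []  # no tools configured
tool_definitions = [tool.tool_definition() for tool in tools] or None
assert tool_definitions is None  # empty list collapses to None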


@@ -523,13 +523,12 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
try:
user_count = await get_user_count()
logger.debug(f"Current tenant user count: {user_count}")
with get_session_with_tenant(tenant_id=tenant_id) as db_session:
if user_count == 1:
create_milestone_and_report(
user=user,
distinct_id=user.email,
distinct_id=f"{user.email}_{tenant_id}",
event_type=MilestoneRecordType.USER_SIGNED_UP,
properties=None,
db_session=db_session,
@@ -537,7 +536,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
else:
create_milestone_and_report(
user=user,
distinct_id=user.email,
distinct_id=f"{user.email}_{tenant_id}",
event_type=MilestoneRecordType.MULTIPLE_USERS,
properties=None,
db_session=db_session,
@@ -545,7 +544,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
finally:
CURRENT_TENANT_ID_CONTEXTVAR.reset(token)
logger.debug(f"User {user.id} has registered.")
logger.notice(f"User {user.id} has registered.")
optional_telemetry(
record_type=RecordType.SIGN_UP,
data={"action": "create"},


@@ -756,7 +756,6 @@ def stream_chat_message_objects(
)
# LLM prompt building, response capturing, etc.
answer = Answer(
prompt_builder=prompt_builder,
is_connected=is_connected,


@@ -640,6 +640,3 @@ TEST_ENV = os.environ.get("TEST_ENV", "").lower() == "true"
MOCK_LLM_RESPONSE = (
os.environ.get("MOCK_LLM_RESPONSE") if os.environ.get("MOCK_LLM_RESPONSE") else None
)
DEFAULT_IMAGE_ANALYSIS_MAX_SIZE_MB = 20


@@ -1,38 +0,0 @@
from onyx.configs.app_configs import DEFAULT_IMAGE_ANALYSIS_MAX_SIZE_MB
from onyx.server.settings.store import load_settings
def get_image_extraction_and_analysis_enabled() -> bool:
"""Get image extraction and analysis enabled setting from workspace settings or fallback to False"""
try:
settings = load_settings()
if settings.image_extraction_and_analysis_enabled is not None:
return settings.image_extraction_and_analysis_enabled
except Exception:
pass
return False
def get_search_time_image_analysis_enabled() -> bool:
"""Get search time image analysis enabled setting from workspace settings or fallback to False"""
try:
settings = load_settings()
if settings.search_time_image_analysis_enabled is not None:
return settings.search_time_image_analysis_enabled
except Exception:
pass
return False
def get_image_analysis_max_size_mb() -> int:
"""Get image analysis max size MB setting from workspace settings or fallback to environment variable"""
try:
settings = load_settings()
if settings.image_analysis_max_size_mb is not None:
return settings.image_analysis_max_size_mb
except Exception:
pass
return DEFAULT_IMAGE_ANALYSIS_MAX_SIZE_MB
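Each deleted helper follows the same fallback shape: prefer the workspace setting when it is set, otherwise fall back to a default. A hypothetical call site (image_data is an assumed variable holding raw bytes):

max_mb = get_image_analysis_max_size_mb()
should_analyze = (
    get_image_extraction_and_analysis_enabled()
    and len(image_data) <= max_mb * 1024 * 1024
)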


@@ -200,6 +200,7 @@ class AirtableConnector(LoadConnector):
return attachment_response.content
logger.error(f"Failed to refresh attachment for {filename}")
raise
attachment_content = get_attachment_with_retry(url, record_id)


@@ -11,12 +11,13 @@ from onyx.configs.app_configs import CONFLUENCE_TIMEZONE_OFFSET
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.confluence.onyx_confluence import extract_text_from_confluence_html
from onyx.connectors.confluence.onyx_confluence import attachment_to_content
from onyx.connectors.confluence.onyx_confluence import (
extract_text_from_confluence_html,
)
from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
from onyx.connectors.confluence.utils import build_confluence_document_id
from onyx.connectors.confluence.utils import convert_attachment_to_content
from onyx.connectors.confluence.utils import datetime_from_string
from onyx.connectors.confluence.utils import process_attachment
from onyx.connectors.confluence.utils import validate_attachment_filetype
from onyx.connectors.exceptions import ConnectorValidationError
from onyx.connectors.exceptions import CredentialExpiredError
@@ -35,26 +36,28 @@ from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import SlimDocument
from onyx.connectors.vision_enabled_connector import VisionEnabledConnector
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
logger = setup_logger()
# Potential Improvements
# 1. Segment into Sections for more accurate linking, can split by headers but make sure no text/ordering is lost
# 1. Include attachments, etc
# 2. Segment into Sections for more accurate linking, can split by headers but make sure no text/ordering is lost
_COMMENT_EXPANSION_FIELDS = ["body.storage.value"]
_PAGE_EXPANSION_FIELDS = [
"body.storage.value",
"version",
"space",
"metadata.labels",
"history.lastUpdated",
]
_ATTACHMENT_EXPANSION_FIELDS = [
"version",
"space",
"metadata.labels",
]
_RESTRICTIONS_EXPANSION_FIELDS = [
"space",
"restrictions.read.restrictions.user",
@@ -84,11 +87,7 @@ _FULL_EXTENSION_FILTER_STRING = "".join(
class ConfluenceConnector(
LoadConnector,
PollConnector,
SlimConnector,
CredentialsConnector,
VisionEnabledConnector,
LoadConnector, PollConnector, SlimConnector, CredentialsConnector
):
def __init__(
self,
@@ -106,24 +105,13 @@ class ConfluenceConnector(
labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
timezone_offset: float = CONFLUENCE_TIMEZONE_OFFSET,
) -> None:
self.wiki_base = wiki_base
self.is_cloud = is_cloud
self.space = space
self.page_id = page_id
self.index_recursively = index_recursively
self.cql_query = cql_query
self.batch_size = batch_size
self.continue_on_failure = continue_on_failure
self.labels_to_skip = labels_to_skip
self.timezone_offset = timezone_offset
self._confluence_client: OnyxConfluence | None = None
self._fetched_titles: set[str] = set()
# Initialize vision LLM using the mixin
self.initialize_vision_llm()
self.is_cloud = is_cloud
# Remove trailing slash from wiki_base if present
self.wiki_base = wiki_base.rstrip("/")
"""
If nothing is provided, we default to fetching all pages
Only one or none of the following options should be specified so
@@ -165,6 +153,8 @@ class ConfluenceConnector(
"max_backoff_seconds": 60,
}
self._confluence_client: OnyxConfluence | None = None
@property
def confluence_client(self) -> OnyxConfluence:
if self._confluence_client is None:
@@ -194,6 +184,7 @@ class ConfluenceConnector(
end: SecondsSinceUnixEpoch | None = None,
) -> str:
page_query = self.base_cql_page_query + self.cql_label_filter
# Add time filters
if start:
formatted_start_time = datetime.fromtimestamp(
@@ -205,6 +196,7 @@ class ConfluenceConnector(
"%Y-%m-%d %H:%M"
)
page_query += f" and lastmodified <= '{formatted_end_time}'"
return page_query
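The time bounds are appended as CQL `lastmodified` filters at minute precision; a sketch of the formatting step, with the timezone handling assumed to mirror the connector's timezone_offset:

from datetime import datetime, timedelta, timezone

timezone_offset = 0.0  # assumed; the connector reads CONFLUENCE_TIMEZONE_OFFSET
tz = timezone(timedelta(hours=timezone_offset))
start = 1735689600.0  # example epoch seconds
formatted_start_time = datetime.fromtimestamp(start, tz=tz).strftime("%Y-%m-%d %H:%M")
page_query = "type=page" + f" and lastmodified >= '{formatted_start_time}'"
# -> "type=page and lastmodified >= '2025-01-01 00:00'"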
def _construct_attachment_query(self, confluence_page_id: str) -> str:
@@ -215,10 +207,11 @@ class ConfluenceConnector(
def _get_comment_string_for_page_id(self, page_id: str) -> str:
comment_string = ""
comment_cql = f"type=comment and container='{page_id}'"
comment_cql += self.cql_label_filter
expand = ",".join(_COMMENT_EXPANSION_FIELDS)
expand = ",".join(_COMMENT_EXPANSION_FIELDS)
for comment in self.confluence_client.paginated_cql_retrieval(
cql=comment_cql,
expand=expand,
@@ -229,177 +222,123 @@ class ConfluenceConnector(
confluence_object=comment,
fetched_titles=set(),
)
return comment_string
def _convert_page_to_document(self, page: dict[str, Any]) -> Document | None:
def _convert_object_to_document(
self,
confluence_object: dict[str, Any],
parent_content_id: str | None = None,
) -> Document | None:
"""
Converts a Confluence page to a Document object.
Includes the page content, comments, and attachments.
"""
try:
# Extract basic page information
page_id = page["id"]
page_title = page["title"]
page_url = f"{self.wiki_base}/wiki{page['_links']['webui']}"
Takes in a confluence object, extracts all metadata, and converts it into a document.
If it's a page, it extracts the text and adds the comments to the document text.
If it's an attachment, it just downloads the attachment and converts that into a document.
# Get the page content
page_content = extract_text_from_confluence_html(
self.confluence_client, page, self._fetched_titles
parent_content_id: if the object is an attachment, specifies the content id that
the attachment is attached to
"""
# The url and the id are the same
object_url = build_confluence_document_id(
self.wiki_base, confluence_object["_links"]["webui"], self.is_cloud
)
object_text = None
# Extract text from page
if confluence_object["type"] == "page":
object_text = extract_text_from_confluence_html(
confluence_client=self.confluence_client,
confluence_object=confluence_object,
fetched_titles={confluence_object.get("title", "")},
)
# Add comments to text
object_text += self._get_comment_string_for_page_id(confluence_object["id"])
elif confluence_object["type"] == "attachment":
object_text = attachment_to_content(
confluence_client=self.confluence_client,
attachment=confluence_object,
parent_content_id=parent_content_id,
)
# Create the main section for the page content
sections = [Section(text=page_content, link=page_url)]
# Process comments if available
comment_text = self._get_comment_string_for_page_id(page_id)
if comment_text:
sections.append(Section(text=comment_text, link=f"{page_url}#comments"))
# Process attachments
if "children" in page and "attachment" in page["children"]:
attachments = self.confluence_client.get_attachments_for_page(
page_id, expand="metadata"
)
for attachment in attachments.get("results", []):
# Process each attachment
result = process_attachment(
self.confluence_client,
attachment,
page_title,
self.image_analysis_llm,
)
if result.text:
# Create a section for the attachment text
attachment_section = Section(
text=result.text,
link=f"{page_url}#attachment-{attachment['id']}",
image_file_name=result.file_name,
)
sections.append(attachment_section)
elif result.error:
logger.warning(
f"Error processing attachment '{attachment.get('title')}': {result.error}"
)
# Extract metadata
metadata = {}
if "space" in page:
metadata["space"] = page["space"].get("name", "")
# Extract labels
labels = []
if "metadata" in page and "labels" in page["metadata"]:
for label in page["metadata"]["labels"].get("results", []):
labels.append(label.get("name", ""))
if labels:
metadata["labels"] = labels
# Extract owners
primary_owners = []
if "version" in page and "by" in page["version"]:
author = page["version"]["by"]
display_name = author.get("displayName", "Unknown")
primary_owners.append(BasicExpertInfo(display_name=display_name))
# Create the document
return Document(
id=build_confluence_document_id(self.wiki_base, page_id, self.is_cloud),
sections=sections,
source=DocumentSource.CONFLUENCE,
semantic_identifier=page_title,
metadata=metadata,
doc_updated_at=datetime_from_string(page["version"]["when"]),
primary_owners=primary_owners if primary_owners else None,
)
except Exception as e:
logger.error(f"Error converting page {page.get('id', 'unknown')}: {e}")
if not self.continue_on_failure:
raise
if object_text is None:
# This only happens for attachments that are not parseable
return None
# Get space name
doc_metadata: dict[str, str | list[str]] = {
"Wiki Space Name": confluence_object["space"]["name"]
}
# Get labels
label_dicts = (
confluence_object.get("metadata", {}).get("labels", {}).get("results", [])
)
page_labels = [label.get("name") for label in label_dicts if label.get("name")]
if page_labels:
doc_metadata["labels"] = page_labels
# Get last modified and author email
version_dict = confluence_object.get("version", {})
last_modified = (
datetime_from_string(version_dict.get("when"))
if version_dict.get("when")
else None
)
author_email = version_dict.get("by", {}).get("email")
title = confluence_object.get("title", "Untitled Document")
return Document(
id=object_url,
sections=[Section(link=object_url, text=object_text)],
source=DocumentSource.CONFLUENCE,
semantic_identifier=title,
doc_updated_at=last_modified,
primary_owners=(
[BasicExpertInfo(email=author_email)] if author_email else None
),
metadata=doc_metadata,
)
def _fetch_document_batches(
self,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> GenerateDocumentsOutput:
"""
Yields batches of Documents. For each page:
- Create a Document with 1 Section for the page text/comments
- Then fetch attachments. For each attachment:
- Attempt to convert it with convert_attachment_to_content(...)
- If successful, create a new Section with the extracted text or summary.
"""
doc_batch: list[Document] = []
confluence_page_ids: list[str] = []
page_query = self._construct_page_query(start, end)
logger.debug(f"page_query: {page_query}")
# Fetch pages as Documents
for page in self.confluence_client.paginated_cql_retrieval(
cql=page_query,
expand=",".join(_PAGE_EXPANSION_FIELDS),
limit=self.batch_size,
):
# Build doc from page
doc = self._convert_page_to_document(page)
if not doc:
continue
# Now get attachments for that page:
attachment_query = self._construct_attachment_query(page["id"])
# We'll use the page's XML to provide context if we summarize an image
confluence_xml = page.get("body", {}).get("storage", {}).get("value", "")
logger.debug(f"_fetch_document_batches: {page['id']}")
confluence_page_ids.append(page["id"])
doc = self._convert_object_to_document(page)
if doc is not None:
doc_batch.append(doc)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
# Fetch attachments as Documents
for confluence_page_id in confluence_page_ids:
attachment_query = self._construct_attachment_query(confluence_page_id)
# TODO: maybe should add time filter as well?
for attachment in self.confluence_client.paginated_cql_retrieval(
cql=attachment_query,
expand=",".join(_ATTACHMENT_EXPANSION_FIELDS),
):
attachment["metadata"].get("mediaType", "")
if not validate_attachment_filetype(
attachment, self.image_analysis_llm
):
continue
# Attempt to get textual content or image summarization:
try:
logger.info(f"Processing attachment: {attachment['title']}")
response = convert_attachment_to_content(
confluence_client=self.confluence_client,
attachment=attachment,
page_context=confluence_xml,
llm=self.image_analysis_llm,
)
if response is None:
continue
content_text, file_storage_name = response
object_url = build_confluence_document_id(
self.wiki_base, page["_links"]["webui"], self.is_cloud
)
if content_text:
doc.sections.append(
Section(
text=content_text,
link=object_url,
image_file_name=file_storage_name,
)
)
except Exception as e:
logger.error(
f"Failed to extract/summarize attachment {attachment['title']}",
exc_info=e,
)
if not self.continue_on_failure:
raise
doc_batch.append(doc)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
doc = self._convert_object_to_document(attachment, confluence_page_id)
if doc is not None:
doc_batch.append(doc)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
if doc_batch:
yield doc_batch
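The accumulate/yield/flush idiom used for both the page and attachment loops, distilled (produce_documents is a hypothetical stand-in for either retrieval loop):

def batched(produce_documents, batch_size):
    doc_batch = []
    for doc in produce_documents():
        doc_batch.append(doc)
        if len(doc_batch) >= batch_size:
            yield doc_batch  # emit a full batch and start a new one
            doc_batch = []
    if doc_batch:
        yield doc_batch  # flush whatever remains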
@@ -420,63 +359,55 @@ class ConfluenceConnector(
end: SecondsSinceUnixEpoch | None = None,
callback: IndexingHeartbeatInterface | None = None,
) -> GenerateSlimDocumentOutput:
"""
Return 'slim' docs (IDs + minimal permission data).
Does not fetch actual text. Used primarily for incremental permission sync.
"""
doc_metadata_list: list[SlimDocument] = []
restrictions_expand = ",".join(_RESTRICTIONS_EXPANSION_FIELDS)
# Query pages
page_query = self.base_cql_page_query + self.cql_label_filter
for page in self.confluence_client.cql_paginate_all_expansions(
cql=page_query,
expand=restrictions_expand,
limit=_SLIM_DOC_BATCH_SIZE,
):
# If the page has restrictions, add them to the perm_sync_data
# These will be used by doc_sync.py to sync permissions
page_restrictions = page.get("restrictions")
page_space_key = page.get("space", {}).get("key")
page_ancestors = page.get("ancestors", [])
page_perm_sync_data = {
"restrictions": page_restrictions or {},
"space_key": page_space_key,
"ancestors": page_ancestors,
"ancestors": page_ancestors or [],
}
doc_metadata_list.append(
SlimDocument(
id=build_confluence_document_id(
self.wiki_base, page["_links"]["webui"], self.is_cloud
self.wiki_base,
page["_links"]["webui"],
self.is_cloud,
),
perm_sync_data=page_perm_sync_data,
)
)
# Query attachments for each page
attachment_query = self._construct_attachment_query(page["id"])
for attachment in self.confluence_client.cql_paginate_all_expansions(
cql=attachment_query,
expand=restrictions_expand,
limit=_SLIM_DOC_BATCH_SIZE,
):
# If you skip images, you'll skip them in the permission sync
attachment["metadata"].get("mediaType", "")
if not validate_attachment_filetype(
attachment, self.image_analysis_llm
):
if not validate_attachment_filetype(attachment):
continue
attachment_restrictions = attachment.get("restrictions", {})
attachment_restrictions = attachment.get("restrictions")
if not attachment_restrictions:
attachment_restrictions = page_restrictions or {}
attachment_restrictions = page_restrictions
attachment_space_key = attachment.get("space", {}).get("key")
if not attachment_space_key:
attachment_space_key = page_space_key
attachment_perm_sync_data = {
"restrictions": attachment_restrictions,
"restrictions": attachment_restrictions or {},
"space_key": attachment_space_key,
}
@@ -490,16 +421,16 @@ class ConfluenceConnector(
perm_sync_data=attachment_perm_sync_data,
)
)
if len(doc_metadata_list) > _SLIM_DOC_BATCH_SIZE:
yield doc_metadata_list[:_SLIM_DOC_BATCH_SIZE]
doc_metadata_list = doc_metadata_list[_SLIM_DOC_BATCH_SIZE:]
if callback and callback.should_stop():
raise RuntimeError(
"retrieve_all_slim_documents: Stop signal detected"
)
if callback:
if callback.should_stop():
raise RuntimeError(
"retrieve_all_slim_documents: Stop signal detected"
)
callback.progress("retrieve_all_slim_documents", 1)
yield doc_metadata_list


@@ -144,12 +144,6 @@ class OnyxConfluence:
self.static_credentials = credential_json
return credential_json, False
if not OAUTH_CONFLUENCE_CLOUD_CLIENT_ID:
raise RuntimeError("OAUTH_CONFLUENCE_CLOUD_CLIENT_ID must be set!")
if not OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET:
raise RuntimeError("OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET must be set!")
# check if we should refresh tokens. we're deciding to refresh halfway
# to expiration
now = datetime.now(timezone.utc)


@@ -1,12 +1,9 @@
import io
import math
import time
from collections.abc import Callable
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from io import BytesIO
from pathlib import Path
from typing import Any
from typing import cast
from typing import TYPE_CHECKING
@@ -15,28 +12,14 @@ from urllib.parse import parse_qs
from urllib.parse import quote
from urllib.parse import urlparse
import bs4
import requests
from pydantic import BaseModel
from sqlalchemy.orm import Session
from onyx.configs.app_configs import (
CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD,
)
from onyx.configs.constants import FileOrigin
from onyx.utils.logger import setup_logger
if TYPE_CHECKING:
from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.models import PGFileStore
from onyx.db.pg_file_store import create_populate_lobj
from onyx.db.pg_file_store import save_bytes_to_pgfilestore
from onyx.db.pg_file_store import upsert_pgfilestore
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.file_validation import is_valid_image_type
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
pass
logger = setup_logger()
@@ -52,229 +35,15 @@ class TokenResponse(BaseModel):
scope: str
def validate_attachment_filetype(
attachment: dict[str, Any], llm: LLM | None = None
) -> bool:
"""
Validates if the attachment is a supported file type.
If LLM is provided, also checks if it's an image that can be processed.
"""
attachment.get("metadata", {})
media_type = attachment.get("metadata", {}).get("mediaType", "")
if media_type.startswith("image/"):
return llm is not None and is_valid_image_type(media_type)
# For non-image files, check if we support the extension
title = attachment.get("title", "")
extension = Path(title).suffix.lstrip(".").lower() if "." in title else ""
return extension in ["pdf", "doc", "docx", "txt", "md", "rtf"]
class AttachmentProcessingResult(BaseModel):
"""
A container for results after processing a Confluence attachment.
'text' is the textual content of the attachment.
'file_name' is the final file name used in PGFileStore to store the content.
'error' holds an exception or string if something failed.
"""
text: str | None
file_name: str | None
error: str | None = None
def _download_attachment(
confluence_client: "OnyxConfluence", attachment: dict[str, Any]
) -> bytes | None:
"""
Retrieves the raw bytes of an attachment from Confluence. Returns None on error.
"""
download_link = confluence_client.url + attachment["_links"]["download"]
resp = confluence_client._session.get(download_link)
if resp.status_code != 200:
logger.warning(
f"Failed to fetch {download_link} with status code {resp.status_code}"
)
return None
return resp.content
def process_attachment(
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
page_context: str,
llm: LLM | None,
) -> AttachmentProcessingResult:
"""
Processes a Confluence attachment. If it's a document, extracts text,
or if it's an image and an LLM is available, summarizes it. Returns a structured result.
"""
try:
# Get the media type from the attachment metadata
media_type = attachment.get("metadata", {}).get("mediaType", "")
# Validate the attachment type
if not validate_attachment_filetype(attachment, llm):
return AttachmentProcessingResult(
text=None,
file_name=None,
error=f"Unsupported file type: {media_type}",
)
# Download the attachment
raw_bytes = _download_attachment(confluence_client, attachment)
if raw_bytes is None:
return AttachmentProcessingResult(
text=None, file_name=None, error="Failed to download attachment"
)
# Process image attachments with LLM if available
if media_type.startswith("image/") and llm:
return _process_image_attachment(
confluence_client, attachment, page_context, llm, raw_bytes, media_type
)
# Process document attachments
try:
text = extract_file_text(
file=BytesIO(raw_bytes),
file_name=attachment["title"],
)
# Skip if the text is too long
if len(text) > CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD:
return AttachmentProcessingResult(
text=None,
file_name=None,
error=f"Attachment text too long: {len(text)} chars",
)
return AttachmentProcessingResult(text=text, file_name=None, error=None)
except Exception as e:
return AttachmentProcessingResult(
text=None, file_name=None, error=f"Failed to extract text: {e}"
)
except Exception as e:
return AttachmentProcessingResult(
text=None, file_name=None, error=f"Failed to process attachment: {e}"
)
def _process_image_attachment(
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
page_context: str,
llm: LLM,
raw_bytes: bytes,
media_type: str,
) -> AttachmentProcessingResult:
"""Process an image attachment by saving it and generating a summary."""
try:
# Use the standardized image storage and section creation
with get_session_with_current_tenant() as db_session:
section, file_name = store_image_and_create_section(
db_session=db_session,
image_data=raw_bytes,
file_name=Path(attachment["id"]).name,
display_name=attachment["title"],
media_type=media_type,
llm=llm,
file_origin=FileOrigin.CONNECTOR,
)
return AttachmentProcessingResult(
text=section.text, file_name=file_name, error=None
)
except Exception as e:
msg = f"Image summarization failed for {attachment['title']}: {e}"
logger.error(msg, exc_info=e)
return AttachmentProcessingResult(text=None, file_name=None, error=msg)
def _process_text_attachment(
attachment: dict[str, Any],
raw_bytes: bytes,
media_type: str,
) -> AttachmentProcessingResult:
"""Process a text-based attachment by extracting its content."""
try:
extracted_text = extract_file_text(
io.BytesIO(raw_bytes),
file_name=attachment["title"],
break_on_unprocessable=False,
)
except Exception as e:
msg = f"Failed to extract text for '{attachment['title']}': {e}"
logger.error(msg, exc_info=e)
return AttachmentProcessingResult(text=None, file_name=None, error=msg)
# Check length constraints
if extracted_text is None or len(extracted_text) == 0:
msg = f"No text extracted for {attachment['title']}"
logger.warning(msg)
return AttachmentProcessingResult(text=None, file_name=None, error=msg)
if len(extracted_text) > CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD:
msg = (
f"Skipping attachment {attachment['title']} due to char count "
f"({len(extracted_text)} > {CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD})"
)
logger.warning(msg)
return AttachmentProcessingResult(text=None, file_name=None, error=msg)
# Save the attachment
try:
with get_session_with_current_tenant() as db_session:
saved_record = save_bytes_to_pgfilestore(
db_session=db_session,
raw_bytes=raw_bytes,
media_type=media_type,
identifier=attachment["id"],
display_name=attachment["title"],
)
except Exception as e:
msg = f"Failed to save attachment '{attachment['title']}' to PG: {e}"
logger.error(msg, exc_info=e)
return AttachmentProcessingResult(
text=extracted_text, file_name=None, error=msg
)
return AttachmentProcessingResult(
text=extracted_text, file_name=saved_record.file_name, error=None
)
def convert_attachment_to_content(
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
page_context: str,
llm: LLM | None,
) -> tuple[str | None, str | None] | None:
"""
Facade function which:
1. Validates attachment type
2. Extracts or summarizes content
3. Returns (content_text, stored_file_name) or None if we should skip it
"""
media_type = attachment["metadata"]["mediaType"]
# Quick check for unsupported types:
if media_type.startswith("video/") or media_type == "application/gliffy+json":
logger.warning(
f"Skipping unsupported attachment type: '{media_type}' for {attachment['title']}"
)
return None
result = process_attachment(confluence_client, attachment, page_context, llm)
if result.error is not None:
logger.warning(
f"Attachment {attachment['title']} encountered error: {result.error}"
)
return None
# Return the text and the file name
return result.text, result.file_name
def validate_attachment_filetype(attachment: dict[str, Any]) -> bool:
return attachment["metadata"]["mediaType"] not in [
"image/jpeg",
"image/png",
"image/gif",
"image/svg+xml",
"video/mp4",
"video/quicktime",
]
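Note that the simplified validator is now an exclusion list: common image and video media types return False (they are skipped along with the removed vision pipeline), and any other attachment is considered processable. For example:

assert validate_attachment_filetype({"metadata": {"mediaType": "application/pdf"}})
assert not validate_attachment_filetype({"metadata": {"mediaType": "image/png"}})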
def build_confluence_document_id(
@@ -295,6 +64,23 @@ def build_confluence_document_id(
return f"{base_url}{content_url}"
def _extract_referenced_attachment_names(page_text: str) -> list[str]:
"""Parse a Confluence html page to generate a list of current
attachments in use
Args:
text (str): The page content
Returns:
list[str]: List of filenames currently in use by the page text
"""
referenced_attachment_filenames = []
soup = bs4.BeautifulSoup(page_text, "html.parser")
for attachment in soup.findAll("ri:attachment"):
referenced_attachment_filenames.append(attachment.attrs["ri:filename"])
return referenced_attachment_filenames
def datetime_from_string(datetime_string: str) -> datetime:
datetime_object = datetime.fromisoformat(datetime_string)
@@ -466,37 +252,3 @@ def update_param_in_path(path: str, param: str, value: str) -> str:
+ "?"
+ "&".join(f"{k}={quote(v[0])}" for k, v in query_params.items())
)
def attachment_to_file_record(
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
db_session: Session,
) -> tuple[PGFileStore, bytes]:
"""Save an attachment to the file store and return the file record."""
download_link = _attachment_to_download_link(confluence_client, attachment)
image_data = confluence_client.get(
download_link, absolute=True, not_json_response=True
)
# Save image to file store
file_name = f"confluence_attachment_{attachment['id']}"
lobj_oid = create_populate_lobj(BytesIO(image_data), db_session)
pgfilestore = upsert_pgfilestore(
file_name=file_name,
display_name=attachment["title"],
file_origin=FileOrigin.OTHER,
file_type=attachment["metadata"]["mediaType"],
lobj_oid=lobj_oid,
db_session=db_session,
commit=True,
)
return pgfilestore, image_data
def _attachment_to_download_link(
confluence_client: "OnyxConfluence", attachment: dict[str, Any]
) -> str:
"""Extracts the download link to images."""
return confluence_client.url + attachment["_links"]["download"]


@@ -10,23 +10,22 @@ from sqlalchemy.orm import Session
from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import FileOrigin
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.models import BasicExpertInfo
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.vision_enabled_connector import VisionEnabledConnector
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.pg_file_store import get_pgfilestore_by_file_name
from onyx.file_processing.extract_file_text import extract_text_and_images
from onyx.file_processing.extract_file_text import detect_encoding
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.extract_file_text import get_file_ext
from onyx.file_processing.extract_file_text import is_text_file_extension
from onyx.file_processing.extract_file_text import is_valid_file_ext
from onyx.file_processing.extract_file_text import load_files_from_zip
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.file_processing.extract_file_text import read_pdf_file
from onyx.file_processing.extract_file_text import read_text_file
from onyx.file_store.file_store import get_default_file_store
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
logger = setup_logger()
@@ -36,115 +35,81 @@ def _read_files_and_metadata(
file_name: str,
db_session: Session,
) -> Iterator[tuple[str, IO, dict[str, Any]]]:
"""
Reads the file from Postgres. If the file is a .zip, yields subfiles.
"""
"""Reads the file into IO, in the case of a zip file, yields each individual
file contained within, also includes the metadata dict if packaged in the zip"""
extension = get_file_ext(file_name)
metadata: dict[str, Any] = {}
directory_path = os.path.dirname(file_name)
# Read file from Postgres store
file_content = get_default_file_store(db_session).read_file(file_name, mode="b")
# If it's a zip, expand it
if extension == ".zip":
for file_info, subfile, metadata in load_files_from_zip(
for file_info, file, metadata in load_files_from_zip(
file_content, ignore_dirs=True
):
yield os.path.join(directory_path, file_info.filename), subfile, metadata
yield os.path.join(directory_path, file_info.filename), file, metadata
elif is_valid_file_ext(extension):
yield file_name, file_content, metadata
else:
logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
def _create_image_section(
llm: LLM | None,
image_data: bytes,
db_session: Session,
parent_file_name: str,
display_name: str,
idx: int = 0,
) -> tuple[Section, str | None]:
"""
Create a Section object for a single image and store the image in PGFileStore.
If summarization is enabled and we have an LLM, summarize the image.
Returns:
tuple: (Section object, file_name in PGFileStore or None if storage failed)
"""
# Create a unique file name for the embedded image
file_name = f"{parent_file_name}_embedded_{idx}"
# Use the standardized utility to store the image and create a section
return store_image_and_create_section(
db_session=db_session,
image_data=image_data,
file_name=file_name,
display_name=display_name,
llm=llm,
file_origin=FileOrigin.OTHER,
)
def _process_file(
file_name: str,
file: IO[Any],
metadata: dict[str, Any] | None,
pdf_pass: str | None,
db_session: Session,
llm: LLM | None,
metadata: dict[str, Any] | None = None,
pdf_pass: str | None = None,
) -> list[Document]:
"""
Processes a single file, returning a list of Documents (typically one).
Also handles embedded images if 'EMBEDDED_IMAGE_EXTRACTION_ENABLED' is true.
"""
extension = get_file_ext(file_name)
# Fetch the DB record so we know the ID for internal URL
pg_record = get_pgfilestore_by_file_name(file_name=file_name, db_session=db_session)
if not pg_record:
logger.warning(f"No file record found for '{file_name}' in PG; skipping.")
return []
if not is_valid_file_ext(extension):
logger.warning(
f"Skipping file '{file_name}' with unrecognized extension '{extension}'"
)
logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
return []
# Prepare doc metadata
if metadata is None:
metadata = {}
file_display_name = metadata.get("file_display_name") or os.path.basename(file_name)
file_metadata: dict[str, Any] = {}
# Timestamps
current_datetime = datetime.now(timezone.utc)
time_updated = metadata.get("time_updated", current_datetime)
if is_text_file_extension(file_name):
encoding = detect_encoding(file)
file_content_raw, file_metadata = read_text_file(
file, encoding=encoding, ignore_onyx_metadata=False
)
# Using the PDF reader function directly to pass in password cleanly
elif extension == ".pdf" and pdf_pass is not None:
file_content_raw, file_metadata = read_pdf_file(file=file, pdf_pass=pdf_pass)
else:
file_content_raw = extract_file_text(
file=file,
file_name=file_name,
break_on_unprocessable=True,
)
all_metadata = {**metadata, **file_metadata} if metadata else file_metadata
# add a prefix to avoid conflicts with other connectors
doc_id = f"FILE_CONNECTOR__{file_name}"
if metadata:
doc_id = metadata.get("document_id") or doc_id
# If this is set, we will show this in the UI as the "name" of the file
file_display_name = all_metadata.get("file_display_name") or os.path.basename(
file_name
)
title = (
all_metadata["title"] or "" if "title" in all_metadata else file_display_name
)
time_updated = all_metadata.get("time_updated", datetime.now(timezone.utc))
if isinstance(time_updated, str):
time_updated = time_str_to_utc(time_updated)
dt_str = metadata.get("doc_updated_at")
dt_str = all_metadata.get("doc_updated_at")
final_time_updated = time_str_to_utc(dt_str) if dt_str else time_updated
# Collect owners
p_owner_names = metadata.get("primary_owners")
s_owner_names = metadata.get("secondary_owners")
p_owners = (
[BasicExpertInfo(display_name=name) for name in p_owner_names]
if p_owner_names
else None
)
s_owners = (
[BasicExpertInfo(display_name=name) for name in s_owner_names]
if s_owner_names
else None
)
# Additional tags we store as doc metadata
# Metadata tags separate from the Onyx specific fields
metadata_tags = {
k: v
for k, v in metadata.items()
for k, v in all_metadata.items()
if k
not in [
"document_id",
@@ -157,142 +122,77 @@ def _process_file(
"file_display_name",
"title",
"connector_type",
"pdf_password",
]
}
source_type_str = metadata.get("connector_type")
source_type = (
DocumentSource(source_type_str) if source_type_str else DocumentSource.FILE
source_type_str = all_metadata.get("connector_type")
source_type = DocumentSource(source_type_str) if source_type_str else None
p_owner_names = all_metadata.get("primary_owners")
s_owner_names = all_metadata.get("secondary_owners")
p_owners = (
[BasicExpertInfo(display_name=name) for name in p_owner_names]
if p_owner_names
else None
)
s_owners = (
[BasicExpertInfo(display_name=name) for name in s_owner_names]
if s_owner_names
else None
)
doc_id = metadata.get("document_id") or f"FILE_CONNECTOR__{file_name}"
title = metadata.get("title") or file_display_name
# 1) If the file itself is an image, handle that scenario quickly
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp"}
if extension in IMAGE_EXTENSIONS:
# Summarize or produce empty doc
image_data = file.read()
image_section, _ = _create_image_section(
llm, image_data, db_session, pg_record.file_name, title
)
return [
Document(
id=doc_id,
sections=[image_section],
source=source_type,
semantic_identifier=file_display_name,
title=title,
doc_updated_at=final_time_updated,
primary_owners=p_owners,
secondary_owners=s_owners,
metadata=metadata_tags,
)
]
# 2) Otherwise: text-based approach. Possibly with embedded images if enabled.
# (For example .docx with inline images).
file.seek(0)
text_content = ""
embedded_images: list[tuple[bytes, str]] = []
text_content, embedded_images = extract_text_and_images(
file=file,
file_name=file_name,
pdf_pass=pdf_pass,
)
# Build sections: first the text as a single Section
sections = []
link_in_meta = metadata.get("link")
if text_content.strip():
sections.append(Section(link=link_in_meta, text=text_content.strip()))
# Then any extracted images from docx, etc.
for idx, (img_data, img_name) in enumerate(embedded_images, start=1):
# Store each embedded image as a separate file in PGFileStore
# and create a section with the image summary
image_section, _ = _create_image_section(
llm,
img_data,
db_session,
pg_record.file_name,
f"{title} - image {idx}",
idx,
)
sections.append(image_section)
return [
Document(
id=doc_id,
sections=sections,
source=source_type,
sections=[
Section(link=all_metadata.get("link"), text=file_content_raw.strip())
],
source=source_type or DocumentSource.FILE,
semantic_identifier=file_display_name,
title=title,
doc_updated_at=final_time_updated,
primary_owners=p_owners,
secondary_owners=s_owners,
# currently metadata just houses tags, other stuff like owners / updated at have dedicated fields
metadata=metadata_tags,
)
]
class LocalFileConnector(LoadConnector, VisionEnabledConnector):
"""
Connector that reads files from Postgres and yields Documents, including
optional embedded image extraction.
"""
class LocalFileConnector(LoadConnector):
def __init__(
self,
file_locations: list[Path | str],
batch_size: int = INDEX_BATCH_SIZE,
) -> None:
self.file_locations = [str(loc) for loc in file_locations]
self.file_locations = [Path(file_location) for file_location in file_locations]
self.batch_size = batch_size
self.pdf_pass: str | None = None
# Initialize vision LLM using the mixin
self.initialize_vision_llm()
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
self.pdf_pass = credentials.get("pdf_password")
return None
def load_from_state(self) -> GenerateDocumentsOutput:
"""
Iterates over each file path, fetches from Postgres, tries to parse text
or images, and yields Document batches.
"""
documents: list[Document] = []
with get_session_with_current_tenant() as db_session:
for file_path in self.file_locations:
current_datetime = datetime.now(timezone.utc)
files_iter = _read_files_and_metadata(
file_name=file_path,
db_session=db_session,
files = _read_files_and_metadata(
file_name=str(file_path), db_session=db_session
)
for actual_file_name, file, metadata in files_iter:
for file_name, file, metadata in files:
metadata["time_updated"] = metadata.get(
"time_updated", current_datetime
)
new_docs = _process_file(
file_name=actual_file_name,
file=file,
metadata=metadata,
pdf_pass=self.pdf_pass,
db_session=db_session,
llm=self.image_analysis_llm,
documents.extend(
_process_file(file_name, file, metadata, self.pdf_pass)
)
documents.extend(new_docs)
if len(documents) >= self.batch_size:
yield documents
documents = []
if documents:
@@ -301,7 +201,7 @@ class LocalFileConnector(LoadConnector, VisionEnabledConnector):
if __name__ == "__main__":
connector = LocalFileConnector(file_locations=[os.environ["TEST_FILE"]])
connector.load_credentials({"pdf_password": os.environ.get("PDF_PASSWORD")})
doc_batches = connector.load_from_state()
for batch in doc_batches:
print("BATCH:", batch)
connector.load_credentials({"pdf_password": os.environ["PDF_PASSWORD"]})
document_batches = connector.load_from_state()
print(next(document_batches))

View File

@@ -4,12 +4,14 @@ from concurrent.futures import as_completed
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from typing import Any
from typing import cast
from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore
from google.oauth2.service_account import Credentials as ServiceAccountCredentials # type: ignore
from googleapiclient.errors import HttpError # type: ignore
from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.app_configs import MAX_FILE_SIZE_BYTES
from onyx.configs.constants import DocumentSource
from onyx.connectors.exceptions import ConnectorValidationError
from onyx.connectors.exceptions import CredentialExpiredError
@@ -34,6 +36,7 @@ from onyx.connectors.google_utils.shared_constants import (
)
from onyx.connectors.google_utils.shared_constants import MISSING_SCOPES_ERROR_STR
from onyx.connectors.google_utils.shared_constants import ONYX_SCOPE_INSTRUCTIONS
from onyx.connectors.google_utils.shared_constants import SCOPE_DOC_URL
from onyx.connectors.google_utils.shared_constants import SLIM_BATCH_SIZE
from onyx.connectors.google_utils.shared_constants import USER_FIELDS
from onyx.connectors.interfaces import GenerateDocumentsOutput
@@ -43,9 +46,7 @@ from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.vision_enabled_connector import VisionEnabledConnector
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
from onyx.utils.retry_wrapper import retry_builder
@@ -65,10 +66,7 @@ def _extract_ids_from_urls(urls: list[str]) -> list[str]:
def _convert_single_file(
creds: Any,
primary_admin_email: str,
file: dict[str, Any],
image_analysis_llm: LLM | None,
creds: Any, primary_admin_email: str, file: dict[str, Any]
) -> Any:
user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email
user_drive_service = get_drive_service(creds, user_email=user_email)
@@ -77,14 +75,11 @@ def _convert_single_file(
file=file,
drive_service=user_drive_service,
docs_service=docs_service,
image_analysis_llm=image_analysis_llm, # pass the LLM so doc_conversion can summarize images
)
def _process_files_batch(
files: list[GoogleDriveFileType],
convert_func: Callable[[GoogleDriveFileType], Any],
batch_size: int,
files: list[GoogleDriveFileType], convert_func: Callable, batch_size: int
) -> GenerateDocumentsOutput:
doc_batch = []
with ThreadPoolExecutor(max_workers=min(16, len(files))) as executor:
@@ -116,9 +111,7 @@ def _clean_requested_drive_ids(
return valid_requested_drive_ids, filtered_folder_ids
class GoogleDriveConnector(
LoadConnector, PollConnector, SlimConnector, VisionEnabledConnector
):
class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector):
def __init__(
self,
include_shared_drives: bool = False,
@@ -136,23 +129,23 @@ class GoogleDriveConnector(
continue_on_failure: bool | None = None,
) -> None:
# Check for old input parameters
if folder_paths is not None:
logger.warning(
"The 'folder_paths' parameter is deprecated. Use 'shared_folder_urls' instead."
if (
folder_paths is not None
or include_shared is not None
or follow_shortcuts is not None
or only_org_public is not None
or continue_on_failure is not None
):
logger.exception(
"Google Drive connector received old input parameters. "
"Please visit the docs for help with the new setup: "
f"{SCOPE_DOC_URL}"
)
if include_shared is not None:
logger.warning(
"The 'include_shared' parameter is deprecated. Use 'include_files_shared_with_me' instead."
raise ConnectorValidationError(
"Google Drive connector received old input parameters. "
"Please visit the docs for help with the new setup: "
f"{SCOPE_DOC_URL}"
)
if follow_shortcuts is not None:
logger.warning("The 'follow_shortcuts' parameter is deprecated.")
if only_org_public is not None:
logger.warning("The 'only_org_public' parameter is deprecated.")
if continue_on_failure is not None:
logger.warning("The 'continue_on_failure' parameter is deprecated.")
# Initialize vision LLM using the mixin
self.initialize_vision_llm()
if (
not include_shared_drives
@@ -244,7 +237,6 @@ class GoogleDriveConnector(
credentials=credentials,
source=DocumentSource.GOOGLE_DRIVE,
)
return new_creds_dict
def _update_traversed_parent_ids(self, folder_id: str) -> None:
@@ -531,53 +523,37 @@ class GoogleDriveConnector(
end: SecondsSinceUnixEpoch | None = None,
) -> GenerateDocumentsOutput:
# Create a larger process pool for file conversion
with ThreadPoolExecutor(max_workers=8) as executor:
# Prepare a partial function with the credentials and admin email
convert_func = partial(
_convert_single_file,
self.creds,
self.primary_admin_email,
image_analysis_llm=self.image_analysis_llm, # Use the mixin's LLM
convert_func = partial(
_convert_single_file, self.creds, self.primary_admin_email
)
# Process files in larger batches
LARGE_BATCH_SIZE = self.batch_size * 4
files_to_process = []
# Gather the files into batches to be processed in parallel
for file in self._fetch_drive_items(is_slim=False, start=start, end=end):
if (
file.get("size")
and int(cast(str, file.get("size"))) > MAX_FILE_SIZE_BYTES
):
logger.warning(
f"Skipping file {file.get('name', 'Unknown')} as it is too large: {file.get('size')} bytes"
)
continue
files_to_process.append(file)
if len(files_to_process) >= LARGE_BATCH_SIZE:
yield from _process_files_batch(
files_to_process, convert_func, self.batch_size
)
files_to_process = []
# Process any remaining files
if files_to_process:
yield from _process_files_batch(
files_to_process, convert_func, self.batch_size
)
# Fetch files in batches
files_batch: list[GoogleDriveFileType] = []
for file in self._fetch_drive_items(is_slim=False, start=start, end=end):
files_batch.append(file)
if len(files_batch) >= self.batch_size:
# Process the batch
futures = [
executor.submit(convert_func, file) for file in files_batch
]
documents = []
for future in as_completed(futures):
try:
doc = future.result()
if doc is not None:
documents.append(doc)
except Exception as e:
logger.error(f"Error converting file: {e}")
if documents:
yield documents
files_batch = []
# Process any remaining files
if files_batch:
futures = [executor.submit(convert_func, file) for file in files_batch]
documents = []
for future in as_completed(futures):
try:
doc = future.result()
if doc is not None:
documents.append(doc)
except Exception as e:
logger.error(f"Error converting file: {e}")
if documents:
yield documents
def load_from_state(self) -> GenerateDocumentsOutput:
try:
yield from self._extract_docs_from_google_drive()
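# Illustrative sketch (an assumption, not part of this diff) of the batch-and-parallel
# pattern used above: collect items into fixed-size batches, convert them concurrently,
# and yield each batch of successful results. `convert` stands in for `convert_func`.
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Any, Callable, Iterable, Iterator

def batched_parallel_convert(
    items: Iterable[Any],
    convert: Callable[[Any], Any],
    batch_size: int,
) -> Iterator[list[Any]]:
    def drain(executor: ThreadPoolExecutor, batch: list[Any]) -> list[Any]:
        results = []
        for future in as_completed([executor.submit(convert, item) for item in batch]):
            try:
                result = future.result()
                if result is not None:
                    results.append(result)
            except Exception as e:
                print(f"Error converting item: {e}")
        return results

    batch: list[Any] = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        for item in items:
            batch.append(item)
            if len(batch) >= batch_size:
                if docs := drain(executor, batch):
                    yield docs
                batch = []
        if batch:
            if docs := drain(executor, batch):
                yield docs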

View File

@@ -9,7 +9,7 @@ from googleapiclient.errors import HttpError # type: ignore
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import FileOrigin
from onyx.configs.constants import IGNORE_FOR_QA
from onyx.connectors.google_drive.constants import DRIVE_FOLDER_TYPE
from onyx.connectors.google_drive.constants import DRIVE_SHORTCUT_TYPE
from onyx.connectors.google_drive.constants import UNSUPPORTED_FILE_TYPE_CONTENT
@@ -21,88 +21,32 @@ from onyx.connectors.google_utils.resources import GoogleDriveService
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.connectors.models import SlimDocument
from onyx.db.engine import get_session_with_current_tenant
from onyx.file_processing.extract_file_text import docx_to_text_and_images
from onyx.file_processing.extract_file_text import docx_to_text
from onyx.file_processing.extract_file_text import pptx_to_text
from onyx.file_processing.extract_file_text import read_pdf_file
from onyx.file_processing.file_validation import is_valid_image_type
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.file_processing.unstructured import get_unstructured_api_key
from onyx.file_processing.unstructured import unstructured_to_text
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
logger = setup_logger()
def _summarize_drive_image(
image_data: bytes, image_name: str, image_analysis_llm: LLM | None
) -> str:
"""
Summarize the given image using the provided LLM.
"""
if not image_analysis_llm:
return ""
return (
summarize_image_with_error_handling(
llm=image_analysis_llm,
image_data=image_data,
context_name=image_name,
)
or ""
)
def is_gdrive_image_mime_type(mime_type: str) -> bool:
"""
Return True if the mime_type is a common image type in GDrive.
(e.g. 'image/png', 'image/jpeg')
"""
return is_valid_image_type(mime_type)
# these errors don't represent a failure in the connector, but simply files
# that can't / shouldn't be indexed
ERRORS_TO_CONTINUE_ON = [
"cannotExportFile",
"exportSizeLimitExceeded",
"cannotDownloadFile",
]
def _extract_sections_basic(
file: dict[str, str],
service: GoogleDriveService,
image_analysis_llm: LLM | None = None,
file: dict[str, str], service: GoogleDriveService
) -> list[Section]:
"""
Extends the existing logic to handle either a docx with embedded images
or standalone images (PNG, JPG, etc).
"""
mime_type = file["mimeType"]
link = file["webViewLink"]
file_name = file.get("name", file["id"])
supported_file_types = set(item.value for item in GDriveMimeType)
# 1) If the file is an image, retrieve the raw bytes, optionally summarize
if is_gdrive_image_mime_type(mime_type):
try:
response = service.files().get_media(fileId=file["id"]).execute()
with get_session_with_current_tenant() as db_session:
section, _ = store_image_and_create_section(
db_session=db_session,
image_data=response,
file_name=file["id"],
display_name=file_name,
media_type=mime_type,
llm=image_analysis_llm,
file_origin=FileOrigin.CONNECTOR,
)
return [section]
except Exception as e:
logger.warning(f"Failed to fetch or summarize image: {e}")
return [
Section(
link=link,
text="",
image_file_name=link,
)
]
if mime_type not in supported_file_types:
# Unsupported file types can still have a title; finding it this way is still useful
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
@@ -241,63 +185,45 @@ def _extract_sections_basic(
GDriveMimeType.PLAIN_TEXT.value,
GDriveMimeType.MARKDOWN.value,
]:
text_data = (
service.files().get_media(fileId=file["id"]).execute().decode("utf-8")
)
return [Section(link=link, text=text_data)]
return [
Section(
link=link,
text=service.files()
.get_media(fileId=file["id"])
.execute()
.decode("utf-8"),
)
]
# ---------------------------
# Word, PowerPoint, PDF files
elif mime_type in [
if mime_type in [
GDriveMimeType.WORD_DOC.value,
GDriveMimeType.POWERPOINT.value,
GDriveMimeType.PDF.value,
]:
response_bytes = service.files().get_media(fileId=file["id"]).execute()
# Optionally use Unstructured
response = service.files().get_media(fileId=file["id"]).execute()
if get_unstructured_api_key():
text = unstructured_to_text(
file=io.BytesIO(response_bytes),
file_name=file_name,
)
return [Section(link=link, text=text)]
return [
Section(
link=link,
text=unstructured_to_text(
file=io.BytesIO(response),
file_name=file.get("name", file["id"]),
),
)
]
if mime_type == GDriveMimeType.WORD_DOC.value:
# Use docx_to_text_and_images to get text plus embedded images
text, embedded_images = docx_to_text_and_images(
file=io.BytesIO(response_bytes),
)
sections = []
if text.strip():
sections.append(Section(link=link, text=text.strip()))
# Process each embedded image using the standardized function
with get_session_with_current_tenant() as db_session:
for idx, (img_data, img_name) in enumerate(
embedded_images, start=1
):
# Create a unique identifier for the embedded image
embedded_id = f"{file['id']}_embedded_{idx}"
section, _ = store_image_and_create_section(
db_session=db_session,
image_data=img_data,
file_name=embedded_id,
display_name=img_name or f"{file_name} - image {idx}",
llm=image_analysis_llm,
file_origin=FileOrigin.CONNECTOR,
)
sections.append(section)
return sections
return [
Section(link=link, text=docx_to_text(file=io.BytesIO(response)))
]
elif mime_type == GDriveMimeType.PDF.value:
text, _pdf_meta, images = read_pdf_file(io.BytesIO(response_bytes))
text, _ = read_pdf_file(file=io.BytesIO(response))
return [Section(link=link, text=text)]
elif mime_type == GDriveMimeType.POWERPOINT.value:
text_data = pptx_to_text(io.BytesIO(response_bytes))
return [Section(link=link, text=text_data)]
return [
Section(link=link, text=pptx_to_text(file=io.BytesIO(response)))
]
# Catch-all case, should not happen since there should be specific handling
# for each of the supported file types
@@ -305,8 +231,7 @@ def _extract_sections_basic(
logger.error(error_message)
raise ValueError(error_message)
except Exception as e:
logger.exception(f"Error extracting sections from file: {e}")
except Exception:
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
@@ -314,62 +239,74 @@ def convert_drive_item_to_document(
file: GoogleDriveFileType,
drive_service: GoogleDriveService,
docs_service: GoogleDocsService,
image_analysis_llm: LLM | None,
) -> Document | None:
"""
Main entry point for converting a Google Drive file => Document object.
Now we accept an optional `llm` to pass to `_extract_sections_basic`.
"""
try:
# skip shortcuts or folders
if file.get("mimeType") in [DRIVE_SHORTCUT_TYPE, DRIVE_FOLDER_TYPE]:
logger.info("Skipping shortcut/folder.")
# Skip files that are shortcuts
if file.get("mimeType") == DRIVE_SHORTCUT_TYPE:
logger.info("Ignoring Drive Shortcut Filetype")
return None
# Skip files that are folders
if file.get("mimeType") == DRIVE_FOLDER_TYPE:
logger.info("Ignoring Drive Folder Filetype")
return None
# If it's a Google Doc, we might do advanced parsing
sections: list[Section] = []
# Special handling for Google Docs to preserve structure, link
# to headers
if file.get("mimeType") == GDriveMimeType.DOC.value:
try:
# get_document_sections is the advanced approach for Google Docs
sections = get_document_sections(docs_service, file["id"])
except Exception as e:
logger.warning(
f"Failed to pull google doc sections from '{file['name']}': {e}. "
"Falling back to basic extraction."
f"Ran into exception '{e}' when pulling sections from Google Doc '{file['name']}'."
" Falling back to basic extraction."
)
# If not a doc, or if we failed above, do our 'basic' approach
# NOTE: this runs when either (1) the above failed or (2) the file is not a Google Doc
if not sections:
sections = _extract_sections_basic(file, drive_service, image_analysis_llm)
try:
# For all other file types just extract the text
sections = _extract_sections_basic(file, drive_service)
except HttpError as e:
reason = e.error_details[0]["reason"] if e.error_details else e.reason
message = e.error_details[0]["message"] if e.error_details else e.reason
if e.status_code == 403 and reason in ERRORS_TO_CONTINUE_ON:
logger.warning(
f"Could not export file '{file['name']}' due to '{message}', skipping..."
)
return None
raise
if not sections:
return None
doc_id = file["webViewLink"]
updated_time = datetime.fromisoformat(file["modifiedTime"]).astimezone(
timezone.utc
)
return Document(
id=doc_id,
id=file["webViewLink"],
sections=sections,
source=DocumentSource.GOOGLE_DRIVE,
semantic_identifier=file["name"],
doc_updated_at=updated_time,
metadata={}, # or any metadata from 'file'
doc_updated_at=datetime.fromisoformat(file["modifiedTime"]).astimezone(
timezone.utc
),
metadata={}
if any(section.text for section in sections)
else {IGNORE_FOR_QA: "True"},
additional_info=file.get("id"),
)
except Exception as e:
logger.exception(f"Error converting file '{file.get('name')}' to Document: {e}")
if not CONTINUE_ON_CONNECTOR_FAILURE:
raise
raise e
logger.exception("Ran into exception when pulling a file from Google Drive")
return None
def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
# Skip files that are folders or shortcuts
if file.get("mimeType") in [DRIVE_FOLDER_TYPE, DRIVE_SHORTCUT_TYPE]:
return None
return SlimDocument(
id=file["webViewLink"],
perm_sync_data={

View File

@@ -28,8 +28,7 @@ class ConnectorMissingCredentialError(PermissionError):
class Section(BaseModel):
text: str
link: str | None = None
image_file_name: str | None = None
link: str | None
class BasicExpertInfo(BaseModel):

View File

@@ -1,45 +0,0 @@
"""
Mixin for connectors that need vision capabilities.
"""
from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
from onyx.llm.factory import get_default_llm_with_vision
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
logger = setup_logger()
class VisionEnabledConnector:
"""
Mixin for connectors that need vision capabilities.
This mixin provides a standard way to initialize a vision-capable LLM
for image analysis during indexing.
Usage:
class MyConnector(LoadConnector, VisionEnabledConnector):
def __init__(self, ...):
super().__init__(...)
self.initialize_vision_llm()
"""
def initialize_vision_llm(self) -> None:
"""
Initialize a vision-capable LLM if enabled by configuration.
Sets self.image_analysis_llm to the LLM instance or None if disabled.
"""
self.image_analysis_llm: LLM | None = None
if get_image_extraction_and_analysis_enabled():
try:
self.image_analysis_llm = get_default_llm_with_vision()
if self.image_analysis_llm is None:
logger.warning(
"No LLM with vision found; image summarization will be disabled"
)
except Exception as e:
logger.warning(
f"Failed to initialize vision LLM due to an error: {str(e)}. "
"Image summarization will be disabled."
)
self.image_analysis_llm = None

View File

@@ -157,7 +157,6 @@ def get_internal_links(
def start_playwright() -> Tuple[Playwright, BrowserContext]:
playwright = sync_playwright().start()
browser = playwright.chromium.launch(headless=True)
context = browser.new_context()
@@ -333,7 +332,7 @@ class WebConnector(LoadConnector):
if initial_url.split(".")[-1] == "pdf":
# PDF files are not checked for links
response = requests.get(initial_url)
page_text, metadata, images = read_pdf_file(
page_text, metadata = read_pdf_file(
file=io.BytesIO(response.content)
)
last_modified = response.headers.get("Last-Modified")

View File

@@ -1,17 +1,12 @@
import base64
from collections.abc import Callable
from collections.abc import Iterator
from typing import cast
import numpy
from langchain_core.messages import BaseMessage
from langchain_core.messages import HumanMessage
from langchain_core.messages import SystemMessage
from onyx.chat.models import SectionRelevancePiece
from onyx.configs.app_configs import BLURB_SIZE
from onyx.configs.constants import RETURN_SEPARATOR
from onyx.configs.llm_configs import get_search_time_image_analysis_enabled
from onyx.configs.model_configs import CROSS_ENCODER_RANGE_MAX
from onyx.configs.model_configs import CROSS_ENCODER_RANGE_MIN
from onyx.context.search.enums import LLMEvaluationType
@@ -23,15 +18,11 @@ from onyx.context.search.models import MAX_METRICS_CONTENT
from onyx.context.search.models import RerankingDetails
from onyx.context.search.models import RerankMetricsContainer
from onyx.context.search.models import SearchQuery
from onyx.db.engine import get_session_with_current_tenant
from onyx.document_index.document_index_utils import (
translate_boost_count_to_multiplier,
)
from onyx.file_store.file_store import get_default_file_store
from onyx.llm.interfaces import LLM
from onyx.llm.utils import message_to_string
from onyx.natural_language_processing.search_nlp_models import RerankingModel
from onyx.prompts.image_analysis import IMAGE_ANALYSIS_SYSTEM_PROMPT
from onyx.secondary_llm_flows.chunk_usefulness import llm_batch_eval_sections
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import FunctionCall
@@ -39,124 +30,6 @@ from onyx.utils.threadpool_concurrency import run_functions_in_parallel
from onyx.utils.timing import log_function_time
def update_image_sections_with_query(
sections: list[InferenceSection],
query: str,
llm: LLM,
) -> None:
"""
For each chunk in each section that has an image URL, call an LLM to produce
a new 'content' string that directly addresses the user's query about that image.
This implementation uses parallel processing for efficiency.
"""
logger = setup_logger()
logger.debug(f"Starting image section update with query: {query}")
chunks_with_images = []
for section in sections:
for chunk in section.chunks:
if chunk.image_file_name:
chunks_with_images.append(chunk)
if not chunks_with_images:
logger.debug("No images to process in the sections")
return # No images to process
logger.info(f"Found {len(chunks_with_images)} chunks with images to process")
def process_image_chunk(chunk: InferenceChunk) -> tuple[str, str]:
try:
logger.debug(
f"Processing image chunk with ID: {chunk.unique_id}, image: {chunk.image_file_name}"
)
with get_session_with_current_tenant() as db_session:
file_record = get_default_file_store(db_session).read_file(
cast(str, chunk.image_file_name), mode="b"
)
if not file_record:
logger.error(f"Image file not found: {chunk.image_file_name}")
raise Exception("File not found")
file_content = file_record.read()
image_base64 = base64.b64encode(file_content).decode()
logger.debug(
f"Successfully loaded image data for {chunk.image_file_name}"
)
messages: list[BaseMessage] = [
SystemMessage(content=IMAGE_ANALYSIS_SYSTEM_PROMPT),
HumanMessage(
content=[
{
"type": "text",
"text": (
f"The user's question is: '{query}'. "
"Please analyze the following image in that context:\n"
),
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}",
},
},
]
),
]
raw_response = llm.invoke(messages)
answer_text = message_to_string(raw_response).strip()
return (
chunk.unique_id,
answer_text if answer_text else "No relevant info found.",
)
except Exception:
logger.exception(
f"Error updating image section with query source image url: {chunk.image_file_name}"
)
return chunk.unique_id, "Error analyzing image."
image_processing_tasks = [
FunctionCall(process_image_chunk, (chunk,)) for chunk in chunks_with_images
]
logger.info(
f"Starting parallel processing of {len(image_processing_tasks)} image tasks"
)
image_processing_results = run_functions_in_parallel(image_processing_tasks)
logger.info(
f"Completed parallel processing with {len(image_processing_results)} results"
)
# Create a mapping of chunk IDs to their processed content
chunk_id_to_content = {}
success_count = 0
for task_id, result in image_processing_results.items():
if result:
chunk_id, content = result
chunk_id_to_content[chunk_id] = content
success_count += 1
else:
logger.error(f"Task {task_id} failed to return a valid result")
logger.info(
f"Successfully processed {success_count}/{len(image_processing_results)} images"
)
# Update the chunks with the processed content
updated_count = 0
for section in sections:
for chunk in section.chunks:
if chunk.unique_id in chunk_id_to_content:
chunk.content = chunk_id_to_content[chunk.unique_id]
updated_count += 1
logger.info(
f"Updated content for {updated_count} chunks with image analysis results"
)
logger = setup_logger()
@@ -413,10 +286,6 @@ def search_postprocessing(
# NOTE: if we don't rerank, we can return the chunks immediately
# since we know this is the final order.
# This way the user experience isn't delayed by the LLM step
if get_search_time_image_analysis_enabled():
update_image_sections_with_query(
retrieved_sections, search_query.query, llm
)
_log_top_section_links(search_query.search_type.value, retrieved_sections)
yield retrieved_sections
sections_yielded = True
@@ -454,13 +323,6 @@ def search_postprocessing(
)
else:
_log_top_section_links(search_query.search_type.value, reranked_sections)
# Add the image processing step here
if get_search_time_image_analysis_enabled():
update_image_sections_with_query(
reranked_sections, search_query.query, llm
)
yield reranked_sections
llm_selected_section_ids = (

View File

@@ -148,28 +148,3 @@ def upsert_pgfilestore(
db_session.commit()
return pgfilestore
def save_bytes_to_pgfilestore(
db_session: Session,
raw_bytes: bytes,
media_type: str,
identifier: str,
display_name: str,
file_origin: FileOrigin = FileOrigin.OTHER,
) -> PGFileStore:
"""
Saves raw bytes to PGFileStore and returns the resulting record.
"""
file_name = f"{file_origin.name.lower()}_{identifier}"
lobj_oid = create_populate_lobj(BytesIO(raw_bytes), db_session)
pgfilestore = upsert_pgfilestore(
file_name=file_name,
display_name=display_name,
file_origin=file_origin,
file_type=media_type,
lobj_oid=lobj_oid,
db_session=db_session,
commit=True,
)
return pgfilestore
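# Hypothetical usage sketch for the helper above (session handling and the raw
# bytes are assumptions): persist image bytes once, then reference the returned
# record's file_name from a Section.
def example_store_image(db_session: Session, image_bytes: bytes) -> str:
    record = save_bytes_to_pgfilestore(
        db_session=db_session,
        raw_bytes=image_bytes,
        media_type="image/png",
        identifier="doc123_embedded_1",  # assumed identifier scheme
        display_name="diagram.png",
        file_origin=FileOrigin.CONNECTOR,
    )
    # the stored name is what downstream Sections/chunks keep as a reference
    return record.file_name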

View File

@@ -1,53 +0,0 @@
import random
from datetime import datetime
from datetime import timedelta
from onyx.configs.constants import MessageType
from onyx.db.chat import create_chat_session
from onyx.db.chat import create_new_chat_message
from onyx.db.chat import get_or_create_root_message
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.models import ChatSession
def seed_chat_history(num_sessions: int, num_messages: int, days: int) -> None:
"""Utility function to seed chat history for testing.
num_sessions: the number of sessions to seed
num_messages: the number of messages to seed per session
days: the number of days back from the current time over which to randomize
the timestamps.
"""
with get_session_with_current_tenant() as db_session:
for y in range(0, num_sessions):
create_chat_session(db_session, f"pytest_session_{y}", None, None)
# randomize all session times
rows = db_session.query(ChatSession).all()
for row in rows:
row.time_created = datetime.utcnow() - timedelta(
days=random.randint(0, days)
)
row.time_updated = row.time_created + timedelta(
minutes=random.randint(0, 10)
)
root_message = get_or_create_root_message(row.id, db_session)
for x in range(0, num_messages):
chat_message = create_new_chat_message(
row.id,
root_message,
f"pytest_message_{x}",
None,
0,
MessageType.USER,
db_session,
)
chat_message.time_sent = row.time_created + timedelta(
minutes=random.randint(0, 10)
)
db_session.commit()
db_session.commit()

View File

@@ -55,9 +55,6 @@ schema DANSWER_CHUNK_NAME {
field blurb type string {
indexing: summary | attribute
}
field image_file_name type string {
indexing: summary | attribute
}
# https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
field source_type type string {
indexing: summary | attribute

View File

@@ -31,7 +31,6 @@ from onyx.document_index.vespa_constants import DOC_UPDATED_AT
from onyx.document_index.vespa_constants import DOCUMENT_ID
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
from onyx.document_index.vespa_constants import HIDDEN
from onyx.document_index.vespa_constants import IMAGE_FILE_NAME
from onyx.document_index.vespa_constants import LARGE_CHUNK_REFERENCE_IDS
from onyx.document_index.vespa_constants import MAX_ID_SEARCH_QUERY_SIZE
from onyx.document_index.vespa_constants import MAX_OR_CONDITIONS
@@ -131,7 +130,6 @@ def _vespa_hit_to_inference_chunk(
section_continuation=fields[SECTION_CONTINUATION],
document_id=fields[DOCUMENT_ID],
source_type=fields[SOURCE_TYPE],
image_file_name=fields.get(IMAGE_FILE_NAME),
title=fields.get(TITLE),
semantic_identifier=fields[SEMANTIC_IDENTIFIER],
boost=fields.get(BOOST, 1),
@@ -213,7 +211,6 @@ def _get_chunks_via_visit_api(
# Check if the response contains any documents
response_data = response.json()
if "documents" in response_data:
for document in response_data["documents"]:
if filters.access_control_list:

View File

@@ -32,7 +32,6 @@ from onyx.document_index.vespa_constants import DOCUMENT_ID
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
from onyx.document_index.vespa_constants import DOCUMENT_SETS
from onyx.document_index.vespa_constants import EMBEDDINGS
from onyx.document_index.vespa_constants import IMAGE_FILE_NAME
from onyx.document_index.vespa_constants import LARGE_CHUNK_REFERENCE_IDS
from onyx.document_index.vespa_constants import METADATA
from onyx.document_index.vespa_constants import METADATA_LIST
@@ -199,13 +198,13 @@ def _index_vespa_chunk(
# which only calls VespaIndex.update
ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},
DOCUMENT_SETS: {document_set: 1 for document_set in chunk.document_sets},
IMAGE_FILE_NAME: chunk.image_file_name,
BOOST: chunk.boost,
}
if multitenant:
if chunk.tenant_id:
vespa_document_fields[TENANT_ID] = chunk.tenant_id
vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_chunk_id}"
logger.debug(f'Indexing to URL "{vespa_url}"')
res = http_client.post(

View File

@@ -77,7 +77,6 @@ PRIMARY_OWNERS = "primary_owners"
SECONDARY_OWNERS = "secondary_owners"
RECENCY_BIAS = "recency_bias"
HIDDEN = "hidden"
IMAGE_FILE_NAME = "image_file_name"
# Specific to Vespa, needed for highlighting matching keywords / section
CONTENT_SUMMARY = "content_summary"
@@ -95,7 +94,6 @@ YQL_BASE = (
f"{SEMANTIC_IDENTIFIER}, "
f"{TITLE}, "
f"{SECTION_CONTINUATION}, "
f"{IMAGE_FILE_NAME}, "
f"{BOOST}, "
f"{HIDDEN}, "
f"{DOC_UPDATED_AT}, "

View File

@@ -9,17 +9,15 @@ from email.parser import Parser as EmailParser
from io import BytesIO
from pathlib import Path
from typing import Any
from typing import Dict
from typing import IO
from typing import List
from typing import Tuple
import chardet
import docx # type: ignore
import openpyxl # type: ignore
import pptx # type: ignore
from docx import Document as DocxDocument
from docx import Document
from fastapi import UploadFile
from PIL import Image
from pypdf import PdfReader
from pypdf.errors import PdfStreamError
@@ -33,8 +31,10 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()
TEXT_SECTION_SEPARATOR = "\n\n"
PLAIN_TEXT_FILE_EXTENSIONS = [
".txt",
".md",
@@ -49,6 +49,7 @@ PLAIN_TEXT_FILE_EXTENSIONS = [
".yaml",
]
VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
".pdf",
".docx",
@@ -57,16 +58,6 @@ VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
".eml",
".epub",
".html",
".png",
".jpg",
".jpeg",
".webp",
]
IMAGE_MEDIA_TYPES = [
"image/png",
"image/jpeg",
"image/webp",
]
@@ -76,13 +67,11 @@ def is_text_file_extension(file_name: str) -> bool:
def get_file_ext(file_path_or_name: str | Path) -> str:
_, extension = os.path.splitext(file_path_or_name)
# standardize all extensions to be lowercase so that checks against
# VALID_FILE_EXTENSIONS and similar will work as intended
return extension.lower()
def is_valid_media_type(media_type: str) -> bool:
return media_type in IMAGE_MEDIA_TYPES
def is_valid_file_ext(ext: str) -> bool:
return ext in VALID_FILE_EXTENSIONS
@@ -90,18 +79,17 @@ def is_valid_file_ext(ext: str) -> bool:
def is_text_file(file: IO[bytes]) -> bool:
"""
checks if the first 1024 bytes only contain printable or whitespace characters
if it does, then we say it's a plaintext file
if it does, then we say its a plaintext file
"""
raw_data = file.read(1024)
file.seek(0)
text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
return all(c in text_chars for c in raw_data)
def detect_encoding(file: IO[bytes]) -> str:
raw_data = file.read(50000)
file.seek(0)
encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
file.seek(0)
return encoding
@@ -111,14 +99,14 @@ def is_macos_resource_fork_file(file_name: str) -> bool:
)
# To include additional metadata in the search index, add a .onyx_metadata.json file
# to the zip file. This file should contain a list of objects with the following format:
# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
def load_files_from_zip(
zip_file_io: IO,
ignore_macos_resource_fork_files: bool = True,
ignore_dirs: bool = True,
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
"""
If there's a .onyx_metadata.json in the zip, attach those metadata to each subfile.
"""
with zipfile.ZipFile(zip_file_io, "r") as zip_file:
zip_metadata = {}
try:
@@ -130,31 +118,24 @@ def load_files_from_zip(
# convert list of dicts to dict of dicts
zip_metadata = {d["filename"]: d for d in zip_metadata}
except json.JSONDecodeError:
logger.warning(f"Unable to load {DANSWER_METADATA_FILENAME}")
logger.warn(f"Unable to load {DANSWER_METADATA_FILENAME}")
except KeyError:
logger.info(f"No {DANSWER_METADATA_FILENAME} file")
for file_info in zip_file.infolist():
if ignore_dirs and file_info.is_dir():
continue
with zip_file.open(file_info.filename, "r") as file:
if ignore_dirs and file_info.is_dir():
continue
if (
ignore_macos_resource_fork_files
and is_macos_resource_fork_file(file_info.filename)
) or file_info.filename == DANSWER_METADATA_FILENAME:
continue
with zip_file.open(file_info.filename, "r") as subfile:
yield file_info, subfile, zip_metadata.get(file_info.filename, {})
if (
ignore_macos_resource_fork_files
and is_macos_resource_fork_file(file_info.filename)
) or file_info.filename == DANSWER_METADATA_FILENAME:
continue
yield file_info, file, zip_metadata.get(file_info.filename, {})
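# Sketch of the metadata convention described above (assumes
# DANSWER_METADATA_FILENAME == ".onyx_metadata.json"): build a zip whose
# per-file metadata load_files_from_zip attaches to each yielded subfile.
import io
import json
import zipfile

def build_zip_with_metadata() -> io.BytesIO:
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w") as zf:
        zf.writestr("file1.txt", "hello world")
        zf.writestr(
            ".onyx_metadata.json",
            json.dumps([{"filename": "file1.txt", "link": "https://example.com/file1.txt"}]),
        )
    buf.seek(0)
    return buf

# for file_info, subfile, metadata in load_files_from_zip(build_zip_with_metadata()):
#     print(file_info.filename, metadata.get("link"))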
def _extract_onyx_metadata(line: str) -> dict | None:
"""
Example: first line has:
<!-- DANSWER_METADATA={"title": "..."} -->
or
#DANSWER_METADATA={"title":"..."}
"""
html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"
@@ -180,13 +161,9 @@ def read_text_file(
errors: str = "replace",
ignore_onyx_metadata: bool = True,
) -> tuple[str, dict]:
"""
For plain text files. Optionally extracts Onyx metadata from the first line.
"""
metadata = {}
file_content_raw = ""
for ind, line in enumerate(file):
# decode
try:
line = line.decode(encoding) if isinstance(line, bytes) else line
except UnicodeDecodeError:
@@ -196,132 +173,131 @@ def read_text_file(
else line
)
# optionally parse metadata in the first line
if ind == 0 and not ignore_onyx_metadata:
potential_meta = _extract_onyx_metadata(line)
if potential_meta is not None:
metadata = potential_meta
continue
file_content_raw += line
if ind == 0:
metadata_or_none = (
None if ignore_onyx_metadata else _extract_onyx_metadata(line)
)
if metadata_or_none is not None:
metadata = metadata_or_none
else:
file_content_raw += line
else:
file_content_raw += line
return file_content_raw, metadata
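# Sketch of the first-line metadata convention handled above: with
# ignore_onyx_metadata=False, a leading DANSWER_METADATA line is parsed into the
# returned metadata dict instead of the content (expected values are approximate).
import io

sample = io.BytesIO(
    b'#DANSWER_METADATA={"title": "My Doc"}\n'
    b"actual file contents\n"
)
# content, meta = read_text_file(sample, ignore_onyx_metadata=False)
# content -> "actual file contents\n"
# meta    -> {"title": "My Doc"}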
def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
"""
Extract text from a PDF. For embedded images, a more complex approach is needed.
This is a minimal approach returning text only.
"""
text, _, _ = read_pdf_file(file, pdf_pass)
"""Extract text from a PDF file."""
# Return only the extracted text from read_pdf_file
text, _ = read_pdf_file(file, pdf_pass)
return text
def read_pdf_file(
file: IO[Any], pdf_pass: str | None = None, extract_images: bool = False
) -> tuple[str, dict, list[tuple[bytes, str]]]:
"""
Returns the text, basic PDF metadata, and optionally extracted images.
"""
metadata: dict[str, Any] = {}
extracted_images: list[tuple[bytes, str]] = []
file: IO[Any],
pdf_pass: str | None = None,
) -> tuple[str, dict]:
metadata: Dict[str, Any] = {}
try:
pdf_reader = PdfReader(file)
# If marked as encrypted and a password is provided, try to decrypt
if pdf_reader.is_encrypted and pdf_pass is not None:
decrypt_success = False
try:
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
except Exception:
logger.error("Unable to decrypt pdf")
if pdf_pass is not None:
try:
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
except Exception:
logger.error("Unable to decrypt pdf")
if not decrypt_success:
return "", metadata, []
# By user request, keep files that are unreadable just so they
# can be discoverable by title.
return "", metadata
elif pdf_reader.is_encrypted:
logger.warning("No Password for an encrypted PDF, returning empty text.")
return "", metadata, []
logger.warning("No Password available to decrypt pdf, returning empty")
return "", metadata
# Basic PDF metadata
# Extract metadata from the PDF, removing leading '/' from keys if present
# This standardizes the metadata keys for consistency
metadata = {}
if pdf_reader.metadata is not None:
for key, value in pdf_reader.metadata.items():
clean_key = key.lstrip("/")
if isinstance(value, str) and value.strip():
metadata[clean_key] = value
elif isinstance(value, list) and all(
isinstance(item, str) for item in value
):
metadata[clean_key] = ", ".join(value)
text = TEXT_SECTION_SEPARATOR.join(
page.extract_text() for page in pdf_reader.pages
return (
TEXT_SECTION_SEPARATOR.join(
page.extract_text() for page in pdf_reader.pages
),
metadata,
)
if extract_images:
for page_num, page in enumerate(pdf_reader.pages):
for image_file_object in page.images:
image = Image.open(io.BytesIO(image_file_object.data))
img_byte_arr = io.BytesIO()
image.save(img_byte_arr, format=image.format)
img_bytes = img_byte_arr.getvalue()
image_name = (
f"page_{page_num + 1}_image_{image_file_object.name}."
f"{image.format.lower() if image.format else 'png'}"
)
extracted_images.append((img_bytes, image_name))
return text, metadata, extracted_images
except PdfStreamError:
logger.exception("Invalid PDF file")
logger.exception("PDF file is not a valid PDF")
except Exception:
logger.exception("Failed to read PDF")
return "", metadata, []
# File is still discoverable by title
# but the contents are not included as they cannot be parsed
return "", metadata
def docx_to_text_and_images(
file: IO[Any],
) -> Tuple[str, List[Tuple[bytes, str]]]:
"""
Extract text from a docx, along with any inline images.
Returns (text_content, list_of_images).
"""
def docx_to_text(file: IO[Any]) -> str:
def is_simple_table(table: docx.table.Table) -> bool:
for row in table.rows:
# No omitted cells
if row.grid_cols_before > 0 or row.grid_cols_after > 0:
return False
# No nested tables
if any(cell.tables for cell in row.cells):
return False
return True
def extract_cell_text(cell: docx.table._Cell) -> str:
cell_paragraphs = [para.text.strip() for para in cell.paragraphs]
return " ".join(p for p in cell_paragraphs if p) or "N/A"
paragraphs = []
embedded_images: List[Tuple[bytes, str]] = []
doc = docx.Document(file)
for item in doc.iter_inner_content():
if isinstance(item, docx.text.paragraph.Paragraph):
paragraphs.append(item.text)
# Grab text from paragraphs
for paragraph in doc.paragraphs:
paragraphs.append(paragraph.text)
elif isinstance(item, docx.table.Table):
if not item.rows or not is_simple_table(item):
continue
# Reset position so we can re-load the doc (python-docx has read the stream)
# Note: if python-docx has fully consumed the stream, you may need to open it again from memory.
# For large docs, a more robust approach is needed.
# This is a simplified example.
# Every row is a new line, joined with a single newline
table_content = "\n".join(
[
",\t".join(extract_cell_text(cell) for cell in row.cells)
for row in item.rows
]
)
paragraphs.append(table_content)
for rel_id, rel in doc.part.rels.items():
if "image" in rel.reltype:
# image is typically in rel.target_part.blob
image_bytes = rel.target_part.blob
image_name = rel.target_part.partname
# store
embedded_images.append((image_bytes, os.path.basename(str(image_name))))
text_content = "\n".join(paragraphs)
return text_content, embedded_images
# Docx already has good spacing between paragraphs
return "\n".join(paragraphs)
def pptx_to_text(file: IO[Any]) -> str:
presentation = pptx.Presentation(file)
text_content = []
for slide_number, slide in enumerate(presentation.slides, start=1):
slide_text = f"\nSlide {slide_number}:\n"
extracted_text = f"\nSlide {slide_number}:\n"
for shape in slide.shapes:
if hasattr(shape, "text"):
slide_text += shape.text + "\n"
text_content.append(slide_text)
extracted_text += shape.text + "\n"
text_content.append(extracted_text)
return TEXT_SECTION_SEPARATOR.join(text_content)
@@ -329,21 +305,18 @@ def xlsx_to_text(file: IO[Any]) -> str:
workbook = openpyxl.load_workbook(file, read_only=True)
text_content = []
for sheet in workbook.worksheets:
rows = []
for row in sheet.iter_rows(min_row=1, values_only=True):
row_str = ",".join(str(cell) if cell is not None else "" for cell in row)
rows.append(row_str)
sheet_str = "\n".join(rows)
text_content.append(sheet_str)
sheet_string = "\n".join(
",".join(map(str, row))
for row in sheet.iter_rows(min_row=1, values_only=True)
)
text_content.append(sheet_string)
return TEXT_SECTION_SEPARATOR.join(text_content)
def eml_to_text(file: IO[Any]) -> str:
encoding = detect_encoding(file)
text_file = io.TextIOWrapper(file, encoding=encoding)
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
parser = EmailParser()
message = parser.parse(text_file)
text_content = []
for part in message.walk():
if part.get_content_type().startswith("text/plain"):
@@ -369,8 +342,8 @@ def epub_to_text(file: IO[Any]) -> str:
def file_io_to_text(file: IO[Any]) -> str:
encoding = detect_encoding(file)
file_content, _ = read_text_file(file, encoding=encoding)
return file_content
file_content_raw, _ = read_text_file(file, encoding=encoding)
return file_content_raw
def extract_file_text(
@@ -379,13 +352,9 @@ def extract_file_text(
break_on_unprocessable: bool = True,
extension: str | None = None,
) -> str:
"""
Legacy function that returns *only text*, ignoring embedded images.
For backward-compatibility in code that only wants text.
"""
extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
".pdf": pdf_to_text,
".docx": lambda f: docx_to_text_and_images(f)[0], # no images
".docx": docx_to_text,
".pptx": pptx_to_text,
".xlsx": xlsx_to_text,
".eml": eml_to_text,
@@ -399,23 +368,24 @@ def extract_file_text(
return unstructured_to_text(file, file_name)
except Exception as unstructured_error:
logger.error(
f"Failed to process with Unstructured: {str(unstructured_error)}. "
"Falling back to normal processing."
f"Failed to process with Unstructured: {str(unstructured_error)}. Falling back to normal processing."
)
if extension is None:
extension = get_file_ext(file_name)
# Fall through to normal processing
final_extension: str
if file_name or extension:
if extension is not None:
final_extension = extension
elif file_name is not None:
final_extension = get_file_ext(file_name)
if is_valid_file_ext(extension):
func = extension_to_function.get(extension, file_io_to_text)
file.seek(0)
return func(file)
if is_valid_file_ext(final_extension):
return extension_to_function.get(final_extension, file_io_to_text)(file)
# If unknown extension, maybe it's a text file
file.seek(0)
# Either the file somehow has no name or the extension is not one that we recognize
if is_text_file(file):
return file_io_to_text(file)
raise ValueError("Unknown file extension or not recognized as text data")
raise ValueError("Unknown file extension and unknown text encoding")
except Exception as e:
if break_on_unprocessable:
@@ -426,93 +396,20 @@ def extract_file_text(
return ""
def extract_text_and_images(
file: IO[Any],
file_name: str,
pdf_pass: str | None = None,
) -> Tuple[str, List[Tuple[bytes, str]]]:
"""
Primary new function for the updated connector.
Returns (text_content, [(embedded_img_bytes, embedded_img_name), ...]).
"""
try:
# Attempt unstructured if env var is set
if get_unstructured_api_key():
# If the user doesn't want embedded images, unstructured is fine
file.seek(0)
text_content = unstructured_to_text(file, file_name)
return (text_content, [])
extension = get_file_ext(file_name)
# docx example for embedded images
if extension == ".docx":
file.seek(0)
text_content, images = docx_to_text_and_images(file)
return (text_content, images)
# PDF example: we do not show complicated PDF image extraction here
# so we simply extract text for now and skip images.
if extension == ".pdf":
file.seek(0)
text_content, _, images = read_pdf_file(file, pdf_pass, extract_images=True)
return (text_content, images)
# For PPTX, XLSX, EML, etc., we do not show embedded image logic here.
# You can do something similar to docx if needed.
if extension == ".pptx":
file.seek(0)
return (pptx_to_text(file), [])
if extension == ".xlsx":
file.seek(0)
return (xlsx_to_text(file), [])
if extension == ".eml":
file.seek(0)
return (eml_to_text(file), [])
if extension == ".epub":
file.seek(0)
return (epub_to_text(file), [])
if extension == ".html":
file.seek(0)
return (parse_html_page_basic(file), [])
# If we reach here and it's a recognized text extension
if is_text_file_extension(file_name):
file.seek(0)
encoding = detect_encoding(file)
text_content_raw, _ = read_text_file(
file, encoding=encoding, ignore_onyx_metadata=False
)
return (text_content_raw, [])
# If it's an image file or something else, we do not parse embedded images from them
# just return empty text
file.seek(0)
return ("", [])
except Exception as e:
logger.exception(f"Failed to extract text/images from {file_name}: {e}")
return ("", [])
def convert_docx_to_txt(
file: UploadFile, file_store: FileStore, file_path: str
) -> None:
"""
Helper to convert docx to a .txt file in the same filestore.
"""
file.file.seek(0)
docx_content = file.file.read()
doc = DocxDocument(BytesIO(docx_content))
doc = Document(BytesIO(docx_content))
# Extract text from the document
all_paras = [p.text for p in doc.paragraphs]
text_content = "\n".join(all_paras)
full_text = []
for para in doc.paragraphs:
full_text.append(para.text)
# Join the extracted text
text_content = "\n".join(full_text)
txt_file_path = docx_to_txt_filename(file_path)
file_store.save_file(
@@ -525,4 +422,7 @@ def convert_docx_to_txt(
def docx_to_txt_filename(file_path: str) -> str:
"""
Convert a .docx file path to its corresponding .txt file path.
"""
return file_path.rsplit(".", 1)[0] + ".txt"

View File

@@ -1,46 +0,0 @@
"""
Centralized file type validation utilities.
"""
# Standard image MIME types supported by most vision LLMs
IMAGE_MIME_TYPES = [
"image/png",
"image/jpeg",
"image/jpg",
"image/webp",
]
# Image types that should be excluded from processing
EXCLUDED_IMAGE_TYPES = [
"image/bmp",
"image/tiff",
"image/gif",
"image/svg+xml",
]
def is_valid_image_type(mime_type: str) -> bool:
"""
Check if mime_type is a valid image type.
Args:
mime_type: The MIME type to check
Returns:
True if the MIME type is a valid image type, False otherwise
"""
if not mime_type:
return False
return mime_type.startswith("image/") and mime_type not in EXCLUDED_IMAGE_TYPES
def is_supported_by_vision_llm(mime_type: str) -> bool:
"""
Check if this image type can be processed by vision LLMs.
Args:
mime_type: The MIME type to check
Returns:
True if the MIME type is supported by vision LLMs, False otherwise
"""
return mime_type in IMAGE_MIME_TYPES
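# Quick sanity checks for the helpers above (expected values follow directly from
# the lists defined in this module):
assert is_valid_image_type("image/png") is True
assert is_valid_image_type("image/gif") is False  # explicitly excluded
assert is_supported_by_vision_llm("image/webp") is True
assert is_supported_by_vision_llm("image/tiff") is False  # not in IMAGE_MIME_TYPES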

View File

@@ -1,129 +0,0 @@
import base64
from io import BytesIO
from langchain_core.messages import BaseMessage
from langchain_core.messages import HumanMessage
from langchain_core.messages import SystemMessage
from PIL import Image
from onyx.llm.interfaces import LLM
from onyx.llm.utils import message_to_string
from onyx.prompts.image_analysis import IMAGE_SUMMARIZATION_SYSTEM_PROMPT
from onyx.prompts.image_analysis import IMAGE_SUMMARIZATION_USER_PROMPT
from onyx.utils.logger import setup_logger
logger = setup_logger()
def prepare_image_bytes(image_data: bytes) -> str:
"""Prepare image bytes for summarization.
Resizes image if it's larger than 20MB. Encodes image as a base64 string."""
image_data = _resize_image_if_needed(image_data)
# encode image (base64)
encoded_image = _encode_image_for_llm_prompt(image_data)
return encoded_image
def summarize_image_pipeline(
llm: LLM,
image_data: bytes,
query: str | None = None,
system_prompt: str | None = None,
) -> str:
"""Pipeline to generate a summary of an image.
Resizes the image if it is bigger than 20MB, encodes it as a base64 string,
and finally uses the default LLM to generate a textual summary of the image."""
# resize image if it's bigger than 20MB
encoded_image = prepare_image_bytes(image_data)
summary = _summarize_image(
encoded_image,
llm,
query,
system_prompt,
)
return summary
def summarize_image_with_error_handling(
llm: LLM | None,
image_data: bytes,
context_name: str,
system_prompt: str = IMAGE_SUMMARIZATION_SYSTEM_PROMPT,
user_prompt_template: str = IMAGE_SUMMARIZATION_USER_PROMPT,
) -> str | None:
"""Wrapper function that handles error cases and configuration consistently.
Args:
llm: The LLM with vision capabilities to use for summarization
image_data: The raw image bytes
context_name: Name or title of the image for context
system_prompt: System prompt to use for the LLM
user_prompt_template: Template for the user prompt, should contain {title} placeholder
Returns:
The image summary text, or None if summarization failed or is disabled
"""
if llm is None:
return None
user_prompt = user_prompt_template.format(title=context_name)
return summarize_image_pipeline(llm, image_data, user_prompt, system_prompt)
def _summarize_image(
encoded_image: str,
llm: LLM,
query: str | None = None,
system_prompt: str | None = None,
) -> str:
"""Use default LLM (if it is multimodal) to generate a summary of an image."""
messages: list[BaseMessage] = []
if system_prompt:
messages.append(SystemMessage(content=system_prompt))
messages.append(
HumanMessage(
content=[
{"type": "text", "text": query},
{"type": "image_url", "image_url": {"url": encoded_image}},
],
),
)
try:
return message_to_string(llm.invoke(messages))
except Exception as e:
raise ValueError(f"Summarization failed. Messages: {messages}") from e
def _encode_image_for_llm_prompt(image_data: bytes) -> str:
"""Getting the base64 string."""
base64_encoded_data = base64.b64encode(image_data).decode("utf-8")
return f"data:image/jpeg;base64,{base64_encoded_data}"
def _resize_image_if_needed(image_data: bytes, max_size_mb: int = 20) -> bytes:
"""Resize image if it's larger than the specified max size in MB."""
max_size_bytes = max_size_mb * 1024 * 1024
if len(image_data) > max_size_bytes:
with Image.open(BytesIO(image_data)) as img:
# Reduce dimensions for better size reduction
img.thumbnail((1024, 1024), Image.Resampling.LANCZOS)
output = BytesIO()
# Save with lower quality for compression
img.save(output, format="JPEG", quality=85)
resized_data = output.getvalue()
return resized_data
return image_data
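# Sketch of the resize-then-encode path used by prepare_image_bytes, exercised with
# a small synthetic image (Pillow only; no LLM call involved):
from io import BytesIO
from PIL import Image

img = Image.new("RGB", (64, 64), color="red")
buf = BytesIO()
img.save(buf, format="JPEG")
encoded = prepare_image_bytes(buf.getvalue())
assert encoded.startswith("data:image/jpeg;base64,")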

View File

@@ -1,70 +0,0 @@
from typing import Tuple
from sqlalchemy.orm import Session
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from onyx.configs.constants import FileOrigin
from onyx.connectors.models import Section
from onyx.db.pg_file_store import save_bytes_to_pgfilestore
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger
logger = setup_logger()
def store_image_and_create_section(
db_session: Session,
image_data: bytes,
file_name: str,
display_name: str,
media_type: str = "image/unknown",
llm: LLM | None = None,
file_origin: FileOrigin = FileOrigin.OTHER,
) -> Tuple[Section, str | None]:
"""
Stores an image in PGFileStore and creates a Section object with optional summarization.
Args:
db_session: Database session
image_data: Raw image bytes
file_name: Base identifier for the file
display_name: Human-readable name for the image
media_type: MIME type of the image
llm: Optional LLM with vision capabilities for summarization
file_origin: Origin of the file (e.g., CONFLUENCE, GOOGLE_DRIVE, etc.)
Returns:
Tuple containing:
- Section object with image reference and optional summary text
- The file_name in PGFileStore or None if storage failed
"""
# Storage logic
stored_file_name = None
try:
pgfilestore = save_bytes_to_pgfilestore(
db_session=db_session,
raw_bytes=image_data,
media_type=media_type,
identifier=file_name,
display_name=display_name,
file_origin=file_origin,
)
stored_file_name = pgfilestore.file_name
except Exception as e:
logger.error(f"Failed to store image: {e}")
if not CONTINUE_ON_CONNECTOR_FAILURE:
raise
return Section(text=""), None
# Summarization logic
summary_text = ""
if llm:
summary_text = (
summarize_image_with_error_handling(llm, image_data, display_name) or ""
)
return (
Section(text=summary_text, image_file_name=stored_file_name),
stored_file_name,
)
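# Hypothetical usage sketch for the helper above (the session and image bytes are
# assumptions; with llm=None the image is stored but the summary text stays empty):
def example_image_section(db_session: Session, image_bytes: bytes) -> Section:
    section, stored_name = store_image_and_create_section(
        db_session=db_session,
        image_data=image_bytes,
        file_name="doc123_embedded_1",  # assumed identifier scheme
        display_name="architecture diagram",
        media_type="image/png",
        llm=None,
        file_origin=FileOrigin.CONNECTOR,
    )
    # section.image_file_name == stored_name when storage succeeded
    return section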

View File

@@ -23,9 +23,12 @@ from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
CHUNK_OVERLAP = 0
# Fairly arbitrary numbers but the general concept is we don't want the title/metadata to
# overwhelm the actual contents of the chunk
# For example, in a rare case the metadata could take 128 tokens of a 512-token chunk and
# the title prefix another 128 tokens, leaving only 256 for the actual contents
MAX_METADATA_PERCENTAGE = 0.25
CHUNK_MIN_CONTENT = 256
logger = setup_logger()
@@ -33,8 +36,16 @@ def _get_metadata_suffix_for_document_index(
metadata: dict[str, str | list[str]], include_separator: bool = False
) -> tuple[str, str]:
"""
Returns the metadata as a natural language string representation with all of the keys and values
for the vector embedding and a string of all of the values for the keyword search.
Returns the metadata as a natural language string representation with all of the keys and values for the vector embedding
and a string of all of the values for the keyword search
For example, if we have the following metadata:
{
"author": "John Doe",
"space": "Engineering"
}
The vector embedding string should include the relation between the key and value, whereas for keyword search we only
want "John Doe" and "Engineering". The keys are repeated and much more noisy.
"""
if not metadata:
return "", ""
@@ -63,17 +74,12 @@ def _get_metadata_suffix_for_document_index(
def _combine_chunks(chunks: list[DocAwareChunk], large_chunk_id: int) -> DocAwareChunk:
"""
Combines multiple DocAwareChunks into one large chunk (for “multipass” mode),
appending the content and adjusting source_links accordingly.
"""
merged_chunk = DocAwareChunk(
source_document=chunks[0].source_document,
chunk_id=chunks[0].chunk_id,
blurb=chunks[0].blurb,
content=chunks[0].content,
source_links=chunks[0].source_links or {},
image_file_name=None,
section_continuation=(chunks[0].chunk_id > 0),
title_prefix=chunks[0].title_prefix,
metadata_suffix_semantic=chunks[0].metadata_suffix_semantic,
@@ -97,9 +103,6 @@ def _combine_chunks(chunks: list[DocAwareChunk], large_chunk_id: int) -> DocAwar
def generate_large_chunks(chunks: list[DocAwareChunk]) -> list[DocAwareChunk]:
"""
Generates larger “grouped” chunks by combining sets of smaller chunks.
"""
large_chunks = []
for idx, i in enumerate(range(0, len(chunks), LARGE_CHUNK_RATIO)):
chunk_group = chunks[i : i + LARGE_CHUNK_RATIO]
@@ -169,60 +172,23 @@ class Chunker:
while start < total_tokens:
end = min(start + content_token_limit, total_tokens)
token_chunk = tokens[start:end]
# Join the tokens to reconstruct the text
chunk_text = " ".join(token_chunk)
chunks.append(chunk_text)
start = end
return chunks
def _extract_blurb(self, text: str) -> str:
"""
Extract a short blurb from the text (first chunk of size `blurb_size`).
"""
texts = self.blurb_splitter.split_text(text)
if not texts:
return ""
return texts[0]
def _get_mini_chunk_texts(self, chunk_text: str) -> list[str] | None:
"""
For “multipass” mode: additional sub-chunks (mini-chunks) for use in certain embeddings.
"""
if self.mini_chunk_splitter and chunk_text.strip():
return self.mini_chunk_splitter.split_text(chunk_text)
return None
# ADDED: extra param image_file_name to store in the chunk
def _create_chunk(
self,
document: Document,
chunks_list: list[DocAwareChunk],
text: str,
links: dict[int, str],
is_continuation: bool = False,
title_prefix: str = "",
metadata_suffix_semantic: str = "",
metadata_suffix_keyword: str = "",
image_file_name: str | None = None,
) -> None:
"""
Helper to create a new DocAwareChunk, append it to chunks_list.
"""
new_chunk = DocAwareChunk(
source_document=document,
chunk_id=len(chunks_list),
blurb=self._extract_blurb(text),
content=text,
source_links=links or {0: ""},
image_file_name=image_file_name,
section_continuation=is_continuation,
title_prefix=title_prefix,
metadata_suffix_semantic=metadata_suffix_semantic,
metadata_suffix_keyword=metadata_suffix_keyword,
mini_chunk_texts=self._get_mini_chunk_texts(text),
large_chunk_id=None,
)
chunks_list.append(new_chunk)
def _chunk_document(
self,
document: Document,
@@ -232,156 +198,122 @@ class Chunker:
content_token_limit: int,
) -> list[DocAwareChunk]:
"""
Loops through sections of the document, converting them into one or more chunks.
If a section has an image_link, we treat it as a dedicated chunk.
Loops through sections of the document, adds metadata and converts them into chunks.
"""
chunks: list[DocAwareChunk] = []
link_offsets: dict[int, str] = {}
chunk_text = ""
def _create_chunk(
text: str,
links: dict[int, str],
is_continuation: bool = False,
) -> DocAwareChunk:
return DocAwareChunk(
source_document=document,
chunk_id=len(chunks),
blurb=self._extract_blurb(text),
content=text,
source_links=links or {0: ""},
section_continuation=is_continuation,
title_prefix=title_prefix,
metadata_suffix_semantic=metadata_suffix_semantic,
metadata_suffix_keyword=metadata_suffix_keyword,
mini_chunk_texts=self._get_mini_chunk_texts(text),
large_chunk_id=None,
)
section_link_text: str
for section_idx, section in enumerate(document.sections):
section_text = clean_text(section.text)
section_link_text = section.link or ""
# ADDED: if the Section has an image link
image_url = section.image_file_name
# If there is no useful content, skip
# If there is no useful content, not even the title, just drop it
if not section_text and (not document.title or section_idx > 0):
# If a section is empty and the document has no title, we can just drop it. We return a list of
# DocAwareChunks where each one contains the necessary information needed down the line for indexing.
# There is no concern about dropping whole documents from this list; it should not cause any indexing failures.
logger.warning(
f"Skipping empty or irrelevant section in doc "
f"{document.semantic_identifier}, link={section_link_text}"
f"Skipping section {section.text} from document "
f"{document.semantic_identifier} due to empty text after cleaning "
f"with link {section_link_text}"
)
continue
# CASE 1: If this is an image section, force a separate chunk
if image_url:
# First, if we have any partially built text chunk, finalize it
if chunk_text.strip():
self._create_chunk(
document,
chunks,
chunk_text,
link_offsets,
is_continuation=False,
title_prefix=title_prefix,
metadata_suffix_semantic=metadata_suffix_semantic,
metadata_suffix_keyword=metadata_suffix_keyword,
)
chunk_text = ""
link_offsets = {}
# Create a chunk specifically for this image
# (If the section has text describing the image, use that as content)
self._create_chunk(
document,
chunks,
section_text,
links={0: section_link_text}
if section_link_text
else {}, # No text offsets needed for images
image_file_name=image_url,
title_prefix=title_prefix,
metadata_suffix_semantic=metadata_suffix_semantic,
metadata_suffix_keyword=metadata_suffix_keyword,
)
# Continue to next section
continue
# CASE 2: Normal text section
section_token_count = len(self.tokenizer.tokenize(section_text))
# If the section is large on its own, split it separately
# Large sections are considered self-contained/unique
# Therefore, they start a new chunk and are not concatenated
# at the end by other sections
if section_token_count > content_token_limit:
if chunk_text.strip():
self._create_chunk(
document,
chunks,
chunk_text,
link_offsets,
False,
title_prefix,
metadata_suffix_semantic,
metadata_suffix_keyword,
)
chunk_text = ""
if chunk_text:
chunks.append(_create_chunk(chunk_text, link_offsets))
link_offsets = {}
chunk_text = ""
split_texts = self.chunk_splitter.split_text(section_text)
for i, split_text in enumerate(split_texts):
# If even the split_text is bigger than strict limit, further split
if (
STRICT_CHUNK_TOKEN_LIMIT
# Tokenizer only runs if STRICT_CHUNK_TOKEN_LIMIT is true
and len(self.tokenizer.tokenize(split_text)) > content_token_limit
):
# If STRICT_CHUNK_TOKEN_LIMIT is true, manually check
# the token count of each split text to ensure it is
# not larger than the content_token_limit
smaller_chunks = self._split_oversized_chunk(
split_text, content_token_limit
)
for j, small_chunk in enumerate(smaller_chunks):
self._create_chunk(
document,
chunks,
small_chunk,
{0: section_link_text},
is_continuation=(j != 0),
title_prefix=title_prefix,
metadata_suffix_semantic=metadata_suffix_semantic,
metadata_suffix_keyword=metadata_suffix_keyword,
for i, small_chunk in enumerate(smaller_chunks):
chunks.append(
_create_chunk(
text=small_chunk,
links={0: section_link_text},
is_continuation=(i != 0),
)
)
else:
self._create_chunk(
document,
chunks,
split_text,
{0: section_link_text},
is_continuation=(i != 0),
title_prefix=title_prefix,
metadata_suffix_semantic=metadata_suffix_semantic,
metadata_suffix_keyword=metadata_suffix_keyword,
chunks.append(
_create_chunk(
text=split_text,
links={0: section_link_text},
is_continuation=(i != 0),
)
)
continue
# If we can still fit this section into the current chunk, do so
current_token_count = len(self.tokenizer.tokenize(chunk_text))
current_offset = len(shared_precompare_cleanup(chunk_text))
# In the case where the whole section is shorter than a chunk, either add
# it to the current chunk or start a new one
next_section_tokens = (
len(self.tokenizer.tokenize(SECTION_SEPARATOR)) + section_token_count
)
if next_section_tokens + current_token_count <= content_token_limit:
if chunk_text:
chunk_text += SECTION_SEPARATOR
chunk_text += section_text
link_offsets[current_offset] = section_link_text
else:
# finalize the existing chunk
self._create_chunk(
document,
chunks,
chunk_text,
link_offsets,
False,
title_prefix,
metadata_suffix_semantic,
metadata_suffix_keyword,
)
# start a new chunk
chunks.append(_create_chunk(chunk_text, link_offsets))
link_offsets = {0: section_link_text}
chunk_text = section_text
# finalize any leftover text chunk
# Once we hit the end, if we're still in the process of building a chunk, add what we have.
# If there is only whitespace left then don't include it. If there are no chunks at all
# from the doc, we can just create a single chunk with the title.
if chunk_text.strip() or not chunks:
self._create_chunk(
document,
chunks,
chunk_text,
link_offsets or {0: ""}, # safe default
False,
title_prefix,
metadata_suffix_semantic,
metadata_suffix_keyword,
chunks.append(
_create_chunk(
chunk_text,
link_offsets or {0: section_link_text},
)
)
# If the chunk does not have any usable content, it will not be indexed
return chunks
def _handle_single_document(self, document: Document) -> list[DocAwareChunk]:
@@ -389,12 +321,10 @@ class Chunker:
if document.source == DocumentSource.GMAIL:
logger.debug(f"Chunking {document.semantic_identifier}")
# Title prep
title = self._extract_blurb(document.get_title_for_document_index() or "")
title_prefix = title + RETURN_SEPARATOR if title else ""
title_tokens = len(self.tokenizer.tokenize(title_prefix))
# Metadata prep
metadata_suffix_semantic = ""
metadata_suffix_keyword = ""
metadata_tokens = 0
@@ -407,20 +337,19 @@ class Chunker:
)
metadata_tokens = len(self.tokenizer.tokenize(metadata_suffix_semantic))
# If metadata is too large, skip it in the semantic content
if metadata_tokens >= self.chunk_token_limit * MAX_METADATA_PERCENTAGE:
# Note: we can keep the keyword suffix even if the semantic suffix is too long to fit in the model
# context; there is no limit for the keyword component
metadata_suffix_semantic = ""
metadata_tokens = 0
# Adjust content token limit to accommodate title + metadata
content_token_limit = self.chunk_token_limit - title_tokens - metadata_tokens
# If there is not enough context remaining then just index the chunk with no prefix/suffix
if content_token_limit <= CHUNK_MIN_CONTENT:
# Not enough space left, so revert to full chunk without the prefix
content_token_limit = self.chunk_token_limit
title_prefix = ""
metadata_suffix_semantic = ""
# Chunk the document
normal_chunks = self._chunk_document(
document,
title_prefix,
@@ -429,7 +358,6 @@ class Chunker:
content_token_limit,
)
# Optional “multipass” large chunk creation
if self.enable_multipass and self.enable_large_chunks:
large_chunks = generate_large_chunks(normal_chunks)
normal_chunks.extend(large_chunks)
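Worked numbers for the title/metadata budget above (the token counts are assumed for illustration):

chunk_token_limit = 512  # assumed chunk size
CHUNK_MIN_CONTENT = 256
MAX_METADATA_PERCENTAGE = 0.25

title_tokens, metadata_tokens = 12, 48
# 48 < 512 * 0.25 = 128, so the semantic metadata suffix is kept
if metadata_tokens >= chunk_token_limit * MAX_METADATA_PERCENTAGE:
    metadata_tokens = 0
content_token_limit = chunk_token_limit - title_tokens - metadata_tokens  # 452
# 452 > 256, so the prefix/suffix stay; otherwise we would revert to 512
if content_token_limit <= CHUNK_MIN_CONTENT:
    content_token_limit = chunk_token_limit
print(content_token_limit)  # 452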
@@ -443,8 +371,9 @@ class Chunker:
"""
final_chunks: list[DocAwareChunk] = []
for document in documents:
if self.callback and self.callback.should_stop():
raise RuntimeError("Chunker.chunk: Stop signal detected")
if self.callback:
if self.callback.should_stop():
raise RuntimeError("Chunker.chunk: Stop signal detected")
chunks = self._handle_single_document(document)
final_chunks.extend(chunks)
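Stripped of link offsets and metadata, the section-packing logic in _chunk_document reduces to a greedy fill. A sketch under two simplifications: token counts are approximated with a whitespace split, and oversized sections (which the real code splits further) are passed through whole:

SECTION_SEPARATOR = "\n\n"

def pack_sections(sections: list[str], limit: int) -> list[str]:
    chunks: list[str] = []
    current = ""
    for section in sections:
        candidate = current + (SECTION_SEPARATOR if current else "") + section
        if len(candidate.split()) <= limit:
            current = candidate  # the section still fits in the current chunk
        else:
            if current:
                chunks.append(current)  # finalize the full chunk
            current = section  # start a new chunk with this section
    if current.strip() or not chunks:
        chunks.append(current)  # leftover text, or a lone title-only chunk
    return chunks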

View File

@@ -29,7 +29,6 @@ class BaseChunk(BaseModel):
content: str
# Holds the link and the offsets into the raw Chunk text
source_links: dict[int, str] | None
image_file_name: str | None
# True if this Chunk's start is not at the start of a Section
section_continuation: bool

View File

@@ -6,14 +6,12 @@ from onyx.configs.model_configs import GEN_AI_MODEL_FALLBACK_MAX_TOKENS
from onyx.configs.model_configs import GEN_AI_TEMPERATURE
from onyx.db.engine import get_session_context_manager
from onyx.db.llm import fetch_default_provider
from onyx.db.llm import fetch_existing_llm_providers
from onyx.db.llm import fetch_provider
from onyx.db.models import Persona
from onyx.llm.chat_llm import DefaultMultiLLM
from onyx.llm.exceptions import GenAIDisabledException
from onyx.llm.interfaces import LLM
from onyx.llm.override_models import LLMOverride
from onyx.llm.utils import model_supports_image_input
from onyx.utils.headers import build_llm_extra_headers
from onyx.utils.logger import setup_logger
from onyx.utils.long_term_log import LongTermLogger
@@ -88,48 +86,6 @@ def get_llms_for_persona(
return _create_llm(model), _create_llm(fast_model)
def get_default_llm_with_vision(
timeout: int | None = None,
temperature: float | None = None,
additional_headers: dict[str, str] | None = None,
long_term_logger: LongTermLogger | None = None,
) -> LLM | None:
if DISABLE_GENERATIVE_AI:
raise GenAIDisabledException()
with get_session_context_manager() as db_session:
llm_providers = fetch_existing_llm_providers(db_session)
if not llm_providers:
return None
for provider in llm_providers:
model_name = provider.default_model_name
fast_model_name = (
provider.fast_default_model_name or provider.default_model_name
)
if not model_name or not fast_model_name:
continue
if model_supports_image_input(model_name, provider.provider):
return get_llm(
provider=provider.provider,
model=model_name,
deployment_name=provider.deployment_name,
api_key=provider.api_key,
api_base=provider.api_base,
api_version=provider.api_version,
custom_config=provider.custom_config,
timeout=timeout,
temperature=temperature,
additional_headers=additional_headers,
long_term_logger=long_term_logger,
)
raise ValueError("No LLM provider found that supports image input")
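The selection logic of the removed function, reduced to a sketch; the provider class and the capability check below are stand-ins, not the real onyx types:

class Provider:  # stand-in for the real provider model
    def __init__(self, name: str, default_model: str | None) -> None:
        self.name = name
        self.default_model = default_model

def supports_vision(model: str) -> bool:
    # stand-in for model_supports_image_input
    return model in {"gpt-4o", "claude-3-5-sonnet"}

def pick_vision_provider(providers: list[Provider]) -> Provider:
    # Return the first provider whose default model accepts image input
    for provider in providers:
        if provider.default_model and supports_vision(provider.default_model):
            return provider
    raise ValueError("No LLM provider found that supports image input")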
def get_default_llms(
timeout: int | None = None,
temperature: float | None = None,

View File

@@ -1,22 +0,0 @@
# Used for creating embeddings of images for vector search
IMAGE_SUMMARIZATION_SYSTEM_PROMPT = """
You are an assistant for summarizing images for retrieval.
Summarize the content of the following image and be as precise as possible.
The summary will be embedded and used to retrieve the original image.
Therefore, write a concise summary of the image that is optimized for retrieval.
"""
# Prompt for generating image descriptions with filename context
IMAGE_SUMMARIZATION_USER_PROMPT = """
The image has the file name '{title}'.
Describe precisely and concisely what the image shows.
"""
# Used for analyzing images in response to user queries at search time
IMAGE_ANALYSIS_SYSTEM_PROMPT = (
"You are an AI assistant specialized in describing images.\n"
"You will receive a user question plus an image URL. Provide a concise textual answer.\n"
"Focus on aspects of the image that are relevant to the user's question.\n"
"Be specific and detailed about visual elements that directly address the query.\n"
)
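At call time these prompts would typically be paired into a chat request, the system prompt plus a user message carrying the image; a hedged sketch assuming an OpenAI-style message schema (the real call path in the codebase may differ):

def build_summarization_messages(title: str, image_b64: str) -> list[dict]:
    # IMAGE_SUMMARIZATION_* are the constants defined above
    return [
        {"role": "system", "content": IMAGE_SUMMARIZATION_SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": IMAGE_SUMMARIZATION_USER_PROMPT.format(title=title),
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
                },
            ],
        },
    ]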

View File

@@ -55,11 +55,7 @@ def _create_indexable_chunks(
# The section is not really used past this point since we have already done the other processing
# for the chunking and embedding.
sections=[
Section(
text=preprocessed_doc["content"],
link=preprocessed_doc["url"],
image_file_name=None,
)
Section(text=preprocessed_doc["content"], link=preprocessed_doc["url"])
],
source=DocumentSource.WEB,
semantic_identifier=preprocessed_doc["title"],
@@ -97,7 +93,6 @@ def _create_indexable_chunks(
document_sets=set(),
boost=DEFAULT_BOOST,
large_chunk_id=None,
image_file_name=None,
)
chunks.append(chunk)

View File

@@ -53,11 +53,6 @@ class Settings(BaseModel):
auto_scroll: bool | None = False
query_history_type: QueryHistoryType | None = None
# Image processing settings
image_extraction_and_analysis_enabled: bool | None = False
search_time_image_analysis_enabled: bool | None = False
image_analysis_max_size_mb: int | None = 20
class UserSettings(Settings):
notifications: list[Notification]

View File

@@ -47,7 +47,6 @@ def load_settings() -> Settings:
settings.anonymous_user_enabled = anonymous_user_enabled
settings.query_history_type = ONYX_QUERY_HISTORY_TYPE
return settings

View File

@@ -1,23 +0,0 @@
"""
Standardized error handling utilities.
"""
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from onyx.utils.logger import setup_logger
logger = setup_logger()
def handle_connector_error(e: Exception, context: str) -> None:
"""
Standard error handling for connectors.
Args:
e: The exception that was raised
context: A description of where the error occurred
Raises:
The original exception if CONTINUE_ON_CONNECTOR_FAILURE is False
"""
logger.error(f"Error in {context}: {e}", exc_info=e)
if not CONTINUE_ON_CONNECTOR_FAILURE:
raise
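Typical usage of this helper wraps per-item work in a connector loop so one bad document does not abort the whole run; the loop body below is hypothetical:

def process_documents(docs: list[dict]) -> None:
    for doc in docs:
        try:
            index_document(doc)  # hypothetical per-document work
        except Exception as e:
            # Logs the failure; re-raises only when
            # CONTINUE_ON_CONNECTOR_FAILURE is False
            handle_connector_error(e, f"indexing {doc.get('id')}")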

View File

@@ -1,10 +1,9 @@
aioboto3==14.0.0
aiohttp==3.10.2
alembic==1.10.4
asyncpg==0.27.0
atlassian-python-api==3.41.16
beautifulsoup4==4.12.3
boto3==1.36.23
boto3==1.34.84
celery==5.5.0b4
chardet==5.2.0
dask==2023.8.1

View File

@@ -13,5 +13,4 @@ transformers==4.39.2
uvicorn==0.21.1
voyageai==0.2.3
litellm==1.61.16
sentry-sdk[fastapi,celery,starlette]==2.14.0
aioboto3==13.4.0
sentry-sdk[fastapi,celery,starlette]==2.14.0

View File

@@ -1,45 +0,0 @@
import argparse
import logging
from logging import getLogger
from onyx.db.seeding.chat_history_seeding import seed_chat_history
# Configure the logger
logging.basicConfig(
level=logging.INFO, # Set the log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", # Log format
handlers=[logging.StreamHandler()], # Output logs to console
)
logger = getLogger(__name__)
def go_main(num_sessions: int, num_messages: int, num_days: int) -> None:
seed_chat_history(num_sessions, num_messages, num_days)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Seed chat history")
parser.add_argument(
"--sessions",
type=int,
default=2048,
help="Number of chat sessions to seed",
)
parser.add_argument(
"--messages",
type=int,
default=4,
help="Number of chat messages to seed per session",
)
parser.add_argument(
"--days",
type=int,
default=90,
help="Number of days looking backwards over which to seed the timestamps with",
)
args = parser.parse_args()
go_main(args.sessions, args.messages, args.days)

View File

@@ -207,7 +207,7 @@ def query_vespa(
yql: str, tenant_id: Optional[str] = None, limit: int = 10
) -> List[Dict[str, Any]]:
# Perform a Vespa query using YQL syntax.
filters = IndexFilters(tenant_id=None, access_control_list=[])
filters = IndexFilters(tenant_id=tenant_id, access_control_list=[])
filter_string = build_vespa_filters(filters, remove_trailing_and=True)
full_yql = yql.strip()
if filter_string:
@@ -472,7 +472,9 @@ def get_document_acls(
print("-" * 80)
def get_current_chunk_count(document_id: str) -> int | None:
def get_current_chunk_count(
document_id: str, index_name: str, tenant_id: str
) -> int | None:
with get_session_with_current_tenant() as session:
return (
session.query(Document.chunk_count)
@@ -484,7 +486,7 @@ def get_current_chunk_count(document_id: str) -> int | None:
def get_number_of_chunks_we_think_exist(
document_id: str, index_name: str, tenant_id: str
) -> int:
current_chunk_count = get_current_chunk_count(document_id)
current_chunk_count = get_current_chunk_count(document_id, index_name, tenant_id)
print(f"Current chunk count: {current_chunk_count}")
doc_info = VespaIndex.enrich_basic_chunk_info(
@@ -634,7 +636,6 @@ def delete_where(
Removes visited documents in `cluster` where the given selection
is true, using Vespa's 'delete where' endpoint.
:param index_name: Typically <namespace>/<document-type> from your schema
:param selection: The selection string, e.g., "true" or "foo contains 'bar'"
:param cluster: The name of the cluster where documents reside
@@ -798,7 +799,7 @@ def main() -> None:
args = parser.parse_args()
vespa_debug = VespaDebugging(args.tenant_id)
CURRENT_TENANT_ID_CONTEXTVAR.set(args.tenant_id or "public")
CURRENT_TENANT_ID_CONTEXTVAR.set(args.tenant_id)
if args.action == "delete-all-documents":
if not args.tenant_id:
parser.error("--tenant-id is required for delete-all-documents action")
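The tenant fix above matters because the filter string is appended directly to the YQL before the query runs; a sketch of that composition (the filter syntax is illustrative, not exact Vespa YQL):

def build_query(yql: str, filter_string: str, limit: int) -> str:
    full_yql = yql.strip()
    if filter_string:
        full_yql = f"{full_yql} and {filter_string}"  # assumed composition
    return f"{full_yql} limit {limit}"

print(build_query(
    "select * from sources chat_doc where true",
    'tenant_id contains "acme"',
    10,
))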

View File

@@ -71,7 +71,6 @@ def generate_dummy_chunk(
title_embedding=generate_random_embedding(embedding_dim),
large_chunk_id=None,
large_chunk_reference_ids=[],
image_file_name=None,
)
document_set_names = []

View File

@@ -68,12 +68,6 @@ LOG_LEVEL = os.environ.get("LOG_LEVEL", "info")
# allow us to specify a custom timeout
API_BASED_EMBEDDING_TIMEOUT = int(os.environ.get("API_BASED_EMBEDDING_TIMEOUT", "600"))
# Local batch size for VertexAI embedding models currently calibrated for item size of 512 tokens
# NOTE: increasing this value may lead to API errors due to token limit exhaustion per call.
VERTEXAI_EMBEDDING_LOCAL_BATCH_SIZE = int(
os.environ.get("VERTEXAI_EMBEDDING_LOCAL_BATCH_SIZE", "25")
)
# Only used for OpenAI
OPENAI_EMBEDDING_TIMEOUT = int(
os.environ.get("OPENAI_EMBEDDING_TIMEOUT", API_BASED_EMBEDDING_TIMEOUT)
@@ -206,12 +200,12 @@ SUPPORTED_EMBEDDING_MODELS = [
index_name="danswer_chunk_text_embedding_3_small",
),
SupportedEmbeddingModel(
name="google/text-embedding-005",
name="google/text-embedding-004",
dim=768,
index_name="danswer_chunk_google_text_embedding_004",
),
SupportedEmbeddingModel(
name="google/text-embedding-005",
name="google/text-embedding-004",
dim=768,
index_name="danswer_chunk_text_embedding_004",
),

View File

@@ -13,7 +13,6 @@ class EmbeddingProvider(str, Enum):
class RerankerProvider(str, Enum):
COHERE = "cohere"
LITELLM = "litellm"
BEDROCK = "bedrock"
class EmbedTextType(str, Enum):

View File

@@ -1,46 +0,0 @@
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from ee.onyx.db.usage_export import get_all_empty_chat_message_entries
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.seeding.chat_history_seeding import seed_chat_history
def test_usage_reports(reset: None) -> None:
EXPECTED_SESSIONS = 2048
MESSAGES_PER_SESSION = 4
EXPECTED_MESSAGES = EXPECTED_SESSIONS * MESSAGES_PER_SESSION
seed_chat_history(EXPECTED_SESSIONS, MESSAGES_PER_SESSION, 90)
with get_session_with_current_tenant() as db_session:
# count of all entries should be exact
period = (
datetime.fromtimestamp(0, tz=timezone.utc),
datetime.now(tz=timezone.utc),
)
count = 0
for entry_batch in get_all_empty_chat_message_entries(db_session, period):
for entry in entry_batch:
count += 1
assert count == EXPECTED_MESSAGES
# the count in a one-month time range should statistically fall within a predictable range
# this can be tightened if we seed the chat history data deterministically
period = (
datetime.now(tz=timezone.utc) - timedelta(days=30),
datetime.now(tz=timezone.utc),
)
count = 0
for entry_batch in get_all_empty_chat_message_entries(db_session, period):
for entry in entry_batch:
count += 1
lower = EXPECTED_MESSAGES // 3 - (EXPECTED_MESSAGES // (3 * 3))
upper = EXPECTED_MESSAGES // 3 + (EXPECTED_MESSAGES // (3 * 3))
assert count > lower
assert count < upper
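With the defaults above, the statistical bounds work out as follows; seeding is roughly uniform over 90 days, so about a third of all messages should land in any 30-day window:

EXPECTED_MESSAGES = 2048 * 4            # 8192
expected = EXPECTED_MESSAGES // 3       # 2730 in a 30-day window
slack = EXPECTED_MESSAGES // (3 * 3)    # 910
lower, upper = expected - slack, expected + slack
print(lower, upper)                     # 1820 3640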

View File

@@ -31,7 +31,6 @@ def create_test_chunk(
metadata={},
match_highlights=[],
updated_at=datetime.now(),
image_file_name=None,
)

View File

@@ -80,7 +80,6 @@ def mock_inference_sections() -> list[InferenceSection]:
updated_at=datetime(2023, 1, 1),
source_links={0: "https://example.com/doc1"},
match_highlights=[],
image_file_name=None,
),
chunks=MagicMock(),
),
@@ -103,7 +102,6 @@ def mock_inference_sections() -> list[InferenceSection]:
updated_at=datetime(2023, 1, 2),
source_links={0: "https://example.com/doc2"},
match_highlights=[],
image_file_name=None,
),
chunks=MagicMock(),
),

View File

@@ -150,7 +150,6 @@ def test_fuzzy_match_quotes_to_docs() -> None:
metadata={},
match_highlights=[],
updated_at=None,
image_file_name=None,
)
test_chunk_1 = InferenceChunk(
document_id="test doc 1",
@@ -169,7 +168,6 @@ def test_fuzzy_match_quotes_to_docs() -> None:
metadata={},
match_highlights=[],
updated_at=None,
image_file_name=None,
)
test_quotes = [

View File

@@ -37,7 +37,6 @@ def create_inference_chunk(
metadata={},
match_highlights=[],
updated_at=None,
image_file_name=None,
)

View File

@@ -62,7 +62,6 @@ def test_default_indexing_embedder_embed_chunks(mock_embedding_model: Mock) -> N
mini_chunk_texts=None,
large_chunk_reference_ids=[],
large_chunk_id=None,
image_file_name=None,
)
]

View File

@@ -118,7 +118,6 @@ services:
- API_KEY_HASH_ROUNDS=${API_KEY_HASH_ROUNDS:-}
# Seeding configuration
- USE_IAM_AUTH=${USE_IAM_AUTH:-}
- ONYX_QUERY_HISTORY_TYPE=${ONYX_QUERY_HISTORY_TYPE:-}
# Uncomment the line below to use if IAM_AUTH is true and you are using iam auth for postgres
# volumes:
# - ./bundle.pem:/app/bundle.pem:ro

View File

@@ -95,7 +95,6 @@ services:
# Enterprise Edition only
- API_KEY_HASH_ROUNDS=${API_KEY_HASH_ROUNDS:-}
- ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=${ENABLE_PAID_ENTERPRISE_EDITION_FEATURES:-false}
- ONYX_QUERY_HISTORY_TYPE=${ONYX_QUERY_HISTORY_TYPE:-}
# Uncomment the line below to use if IAM_AUTH is true and you are using iam auth for postgres
# volumes:
# - ./bundle.pem:/app/bundle.pem:ro

web/package-lock.json (generated, 433 changed lines)
View File

@@ -44,7 +44,6 @@
"autoprefixer": "^10.4.14",
"class-variance-authority": "^0.7.0",
"clsx": "^2.1.1",
"cmdk": "^1.0.0",
"date-fns": "^3.6.0",
"favicon-fetch": "^1.0.0",
"formik": "^2.2.9",
@@ -9314,438 +9313,6 @@
"node": ">=6"
}
},
"node_modules/cmdk": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/cmdk/-/cmdk-1.0.0.tgz",
"integrity": "sha512-gDzVf0a09TvoJ5jnuPvygTB77+XdOSwEmJ88L6XPFPlv7T3RxbP9jgenfylrAMD0+Le1aO0nVjQUzl2g+vjz5Q==",
"license": "MIT",
"dependencies": {
"@radix-ui/react-dialog": "1.0.5",
"@radix-ui/react-primitive": "1.0.3"
},
"peerDependencies": {
"react": "^18.0.0",
"react-dom": "^18.0.0"
}
},
"node_modules/cmdk/node_modules/@radix-ui/primitive": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.0.1.tgz",
"integrity": "sha512-yQ8oGX2GVsEYMWGxcovu1uGWPCxV5BFfeeYxqPmuAzUyLT9qmaMXSAhXpb0WrspIeqYzdJpkh2vHModJPgRIaw==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10"
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-compose-refs": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-compose-refs/-/react-compose-refs-1.0.1.tgz",
"integrity": "sha512-fDSBgd44FKHa1FRMU59qBMPFcl2PZE+2nmqunj+BWFyYYjnhIDWL2ItDs3rrbJDQOtzt5nIebLCQc4QRfz6LJw==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-context": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-context/-/react-context-1.0.1.tgz",
"integrity": "sha512-ebbrdFoYTcuZ0v4wG5tedGnp9tzcV8awzsxYph7gXUyvnNLuTIcCk1q17JEbnVhXAKG9oX3KtchwiMIAYp9NLg==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-dialog": {
"version": "1.0.5",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dialog/-/react-dialog-1.0.5.tgz",
"integrity": "sha512-GjWJX/AUpB703eEBanuBnIWdIXg6NvJFCXcNlSZk4xdszCdhrJgBoUd1cGk67vFO+WdA2pfI/plOpqz/5GUP6Q==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10",
"@radix-ui/primitive": "1.0.1",
"@radix-ui/react-compose-refs": "1.0.1",
"@radix-ui/react-context": "1.0.1",
"@radix-ui/react-dismissable-layer": "1.0.5",
"@radix-ui/react-focus-guards": "1.0.1",
"@radix-ui/react-focus-scope": "1.0.4",
"@radix-ui/react-id": "1.0.1",
"@radix-ui/react-portal": "1.0.4",
"@radix-ui/react-presence": "1.0.1",
"@radix-ui/react-primitive": "1.0.3",
"@radix-ui/react-slot": "1.0.2",
"@radix-ui/react-use-controllable-state": "1.0.1",
"aria-hidden": "^1.1.1",
"react-remove-scroll": "2.5.5"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0",
"react-dom": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-dismissable-layer": {
"version": "1.0.5",
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.0.5.tgz",
"integrity": "sha512-aJeDjQhywg9LBu2t/At58hCvr7pEm0o2Ke1x33B+MhjNmmZ17sy4KImo0KPLgsnc/zN7GPdce8Cnn0SWvwZO7g==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10",
"@radix-ui/primitive": "1.0.1",
"@radix-ui/react-compose-refs": "1.0.1",
"@radix-ui/react-primitive": "1.0.3",
"@radix-ui/react-use-callback-ref": "1.0.1",
"@radix-ui/react-use-escape-keydown": "1.0.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0",
"react-dom": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-dismissable-layer/node_modules/@radix-ui/react-use-callback-ref": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-callback-ref/-/react-use-callback-ref-1.0.1.tgz",
"integrity": "sha512-D94LjX4Sp0xJFVaoQOd3OO9k7tpBYNOXdVhkltUbGv2Qb9OXdrg/CpsjlZv7ia14Sylv398LswWBVVu5nqKzAQ==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-dismissable-layer/node_modules/@radix-ui/react-use-escape-keydown": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-escape-keydown/-/react-use-escape-keydown-1.0.3.tgz",
"integrity": "sha512-vyL82j40hcFicA+M4Ex7hVkB9vHgSse1ZWomAqV2Je3RleKGO5iM8KMOEtfoSB0PnIelMd2lATjTGMYqN5ylTg==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10",
"@radix-ui/react-use-callback-ref": "1.0.1"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-focus-guards": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.0.1.tgz",
"integrity": "sha512-Rect2dWbQ8waGzhMavsIbmSVCgYxkXLxxR3ZvCX79JOglzdEy4JXMb98lq4hPxUbLr77nP0UOGf4rcMU+s1pUA==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-focus-scope": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-scope/-/react-focus-scope-1.0.4.tgz",
"integrity": "sha512-sL04Mgvf+FmyvZeYfNu1EPAaaxD+aw7cYeIB9L9Fvq8+urhltTRaEo5ysKOpHuKPclsZcSUMKlN05x4u+CINpA==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10",
"@radix-ui/react-compose-refs": "1.0.1",
"@radix-ui/react-primitive": "1.0.3",
"@radix-ui/react-use-callback-ref": "1.0.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0",
"react-dom": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-focus-scope/node_modules/@radix-ui/react-use-callback-ref": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-callback-ref/-/react-use-callback-ref-1.0.1.tgz",
"integrity": "sha512-D94LjX4Sp0xJFVaoQOd3OO9k7tpBYNOXdVhkltUbGv2Qb9OXdrg/CpsjlZv7ia14Sylv398LswWBVVu5nqKzAQ==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-id": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-id/-/react-id-1.0.1.tgz",
"integrity": "sha512-tI7sT/kqYp8p96yGWY1OAnLHrqDgzHefRBKQ2YAkBS5ja7QLcZ9Z/uY7bEjPUatf8RomoXM8/1sMj1IJaE5UzQ==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10",
"@radix-ui/react-use-layout-effect": "1.0.1"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-id/node_modules/@radix-ui/react-use-layout-effect": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-layout-effect/-/react-use-layout-effect-1.0.1.tgz",
"integrity": "sha512-v/5RegiJWYdoCvMnITBkNNx6bCj20fiaJnWtRkU18yITptraXjffz5Qbn05uOiQnOvi+dbkznkoaMltz1GnszQ==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-portal": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.0.4.tgz",
"integrity": "sha512-Qki+C/EuGUVCQTOTD5vzJzJuMUlewbzuKyUy+/iHM2uwGiru9gZeBJtHAPKAEkB5KWGi9mP/CHKcY0wt1aW45Q==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10",
"@radix-ui/react-primitive": "1.0.3"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0",
"react-dom": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-presence": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-presence/-/react-presence-1.0.1.tgz",
"integrity": "sha512-UXLW4UAbIY5ZjcvzjfRFo5gxva8QirC9hF7wRE4U5gz+TP0DbRk+//qyuAQ1McDxBt1xNMBTaciFGvEmJvAZCg==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10",
"@radix-ui/react-compose-refs": "1.0.1",
"@radix-ui/react-use-layout-effect": "1.0.1"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0",
"react-dom": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-presence/node_modules/@radix-ui/react-use-layout-effect": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-layout-effect/-/react-use-layout-effect-1.0.1.tgz",
"integrity": "sha512-v/5RegiJWYdoCvMnITBkNNx6bCj20fiaJnWtRkU18yITptraXjffz5Qbn05uOiQnOvi+dbkznkoaMltz1GnszQ==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-primitive": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-1.0.3.tgz",
"integrity": "sha512-yi58uVyoAcK/Nq1inRY56ZSjKypBNKTa/1mcL8qdl6oJeEaDbOldlzrGn7P6Q3Id5d+SYNGc5AJgc4vGhjs5+g==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10",
"@radix-ui/react-slot": "1.0.2"
},
"peerDependencies": {
"@types/react": "*",
"@types/react-dom": "*",
"react": "^16.8 || ^17.0 || ^18.0",
"react-dom": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
},
"@types/react-dom": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-slot": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.0.2.tgz",
"integrity": "sha512-YeTpuq4deV+6DusvVUW4ivBgnkHwECUu0BiN43L5UCDFgdhsRUWAghhTF5MbvNTPzmiFOx90asDSUjWuCNapwg==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10",
"@radix-ui/react-compose-refs": "1.0.1"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-use-controllable-state": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-controllable-state/-/react-use-controllable-state-1.0.1.tgz",
"integrity": "sha512-Svl5GY5FQeN758fWKrjM6Qb7asvXeiZltlT4U2gVfl8Gx5UAv2sMR0LWo8yhsIZh2oQ0eFdZ59aoOOMV7b47VA==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10",
"@radix-ui/react-use-callback-ref": "1.0.1"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/@radix-ui/react-use-controllable-state/node_modules/@radix-ui/react-use-callback-ref": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-callback-ref/-/react-use-callback-ref-1.0.1.tgz",
"integrity": "sha512-D94LjX4Sp0xJFVaoQOd3OO9k7tpBYNOXdVhkltUbGv2Qb9OXdrg/CpsjlZv7ia14Sylv398LswWBVVu5nqKzAQ==",
"license": "MIT",
"dependencies": {
"@babel/runtime": "^7.13.10"
},
"peerDependencies": {
"@types/react": "*",
"react": "^16.8 || ^17.0 || ^18.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/cmdk/node_modules/react-remove-scroll": {
"version": "2.5.5",
"resolved": "https://registry.npmjs.org/react-remove-scroll/-/react-remove-scroll-2.5.5.tgz",
"integrity": "sha512-ImKhrzJJsyXJfBZ4bzu8Bwpka14c/fQt0k+cyFp/PBhTfyDnU5hjOtM4AG/0AMyy8oKzOTR0lDgJIM7pYXI0kw==",
"license": "MIT",
"dependencies": {
"react-remove-scroll-bar": "^2.3.3",
"react-style-singleton": "^2.2.1",
"tslib": "^2.1.0",
"use-callback-ref": "^1.3.0",
"use-sidecar": "^1.1.2"
},
"engines": {
"node": ">=10"
},
"peerDependencies": {
"@types/react": "^16.8.0 || ^17.0.0 || ^18.0.0",
"react": "^16.8.0 || ^17.0.0 || ^18.0.0"
},
"peerDependenciesMeta": {
"@types/react": {
"optional": true
}
}
},
"node_modules/co": {
"version": "4.6.0",
"resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",

View File

@@ -47,7 +47,6 @@
"autoprefixer": "^10.4.14",
"class-variance-authority": "^0.7.0",
"clsx": "^2.1.1",
"cmdk": "^1.0.0",
"date-fns": "^3.6.0",
"favicon-fetch": "^1.0.0",
"formik": "^2.2.9",

File diff suppressed because one or more lines are too long

Before: 2.9 KiB | After: 7.0 KiB

View File

@@ -1 +1,8 @@
<svg xmlns="http://www.w3.org/2000/svg" shape-rendering="geometricPrecision" text-rendering="geometricPrecision" image-rendering="optimizeQuality" fill-rule="evenodd" clip-rule="evenodd" viewBox="0 0 512 509.64"><path fill="#D77655" d="M115.612 0h280.775C459.974 0 512 52.026 512 115.612v278.415c0 63.587-52.026 115.612-115.613 115.612H115.612C52.026 509.639 0 457.614 0 394.027V115.612C0 52.026 52.026 0 115.612 0z"/><path fill="#FCF2EE" fill-rule="nonzero" d="M142.27 316.619l73.655-41.326 1.238-3.589-1.238-1.996-3.589-.001-12.31-.759-42.084-1.138-36.498-1.516-35.361-1.896-8.897-1.895-8.34-10.995.859-5.484 7.482-5.03 10.717.935 23.683 1.617 35.537 2.452 25.782 1.517 38.193 3.968h6.064l.86-2.451-2.073-1.517-1.618-1.517-36.776-24.922-39.81-26.338-20.852-15.166-11.273-7.683-5.687-7.204-2.451-15.721 10.237-11.273 13.75.935 3.513.936 13.928 10.716 29.749 23.027 38.848 28.612 5.687 4.727 2.275-1.617.278-1.138-2.553-4.271-21.13-38.193-22.546-38.848-10.035-16.101-2.654-9.655c-.935-3.968-1.617-7.304-1.617-11.374l11.652-15.823 6.445-2.073 15.545 2.073 6.547 5.687 9.655 22.092 15.646 34.78 24.265 47.291 7.103 14.028 3.791 12.992 1.416 3.968 2.449-.001v-2.275l1.997-26.641 3.69-32.707 3.589-42.084 1.239-11.854 5.863-14.206 11.652-7.683 9.099 4.348 7.482 10.716-1.036 6.926-4.449 28.915-8.72 45.294-5.687 30.331h3.313l3.792-3.791 15.342-20.372 25.782-32.227 11.374-12.789 13.27-14.129 8.517-6.724 16.1-.001 11.854 17.617-5.307 18.199-16.581 21.029-13.75 17.819-19.716 26.54-12.309 21.231 1.138 1.694 2.932-.278 44.536-9.479 24.062-4.347 28.714-4.928 12.992 6.066 1.416 6.167-5.106 12.613-30.71 7.583-36.018 7.204-53.636 12.689-.657.48.758.935 24.164 2.275 10.337.556h25.301l47.114 3.514 12.309 8.139 7.381 9.959-1.238 7.583-18.957 9.655-25.579-6.066-59.702-14.205-20.474-5.106-2.83-.001v1.694l17.061 16.682 31.266 28.233 39.152 36.397 1.997 8.999-5.03 7.102-5.307-.758-34.401-25.883-13.27-11.651-30.053-25.302-1.996-.001v2.654l6.926 10.136 36.574 54.975 1.895 16.859-2.653 5.485-9.479 3.311-10.414-1.895-21.408-30.054-22.092-33.844-17.819-30.331-2.173 1.238-10.515 113.261-4.929 5.788-11.374 4.348-9.478-7.204-5.03-11.652 5.03-23.027 6.066-30.052 4.928-23.886 4.449-29.674 2.654-9.858-.177-.657-2.173.278-22.37 30.71-34.021 45.977-26.919 28.815-6.445 2.553-11.173-5.789 1.037-10.337 6.243-9.2 37.257-47.392 22.47-29.371 14.508-16.961-.101-2.451h-.859l-98.954 64.251-17.618 2.275-7.583-7.103.936-11.652 3.589-3.791 29.749-20.474-.101.102.024.101z"/></svg>
<?xml version="1.0" encoding="UTF-8"?>
<svg width="256px" height="176px" viewBox="0 0 256 176" version="1.1" xmlns="http://www.w3.org/2000/svg" preserveAspectRatio="xMidYMid">
<title>Anthropic</title>
<g fill="#181818">
<path d="M147.486878,0 C147.486878,0 217.568251,175.780074 217.568251,175.780074 C217.568251,175.780074 256,175.780074 256,175.780074 C256,175.780074 185.918621,0 185.918621,0 C185.918621,0 147.486878,0 147.486878,0 C147.486878,0 147.486878,0 147.486878,0 Z"></path>
<path d="M66.1828124,106.221191 C66.1828124,106.221191 90.1624677,44.4471185 90.1624677,44.4471185 C90.1624677,44.4471185 114.142128,106.221191 114.142128,106.221191 C114.142128,106.221191 66.1828124,106.221191 66.1828124,106.221191 C66.1828124,106.221191 66.1828124,106.221191 66.1828124,106.221191 Z M70.0705318,0 C70.0705318,0 0,175.780074 0,175.780074 C0,175.780074 39.179211,175.780074 39.179211,175.780074 C39.179211,175.780074 53.5097704,138.86606 53.5097704,138.86606 C53.5097704,138.86606 126.817544,138.86606 126.817544,138.86606 C126.817544,138.86606 141.145724,175.780074 141.145724,175.780074 C141.145724,175.780074 180.324935,175.780074 180.324935,175.780074 C180.324935,175.780074 110.254409,0 110.254409,0 C110.254409,0 70.0705318,0 70.0705318,0 C70.0705318,0 70.0705318,0 70.0705318,0 Z"></path>
</g>
</svg>

Before: 2.4 KiB | After: 1.3 KiB

View File

@@ -1 +0,0 @@
<svg height="1em" style="flex:none;line-height:1" viewBox="0 0 24 24" width="1em" xmlns="http://www.w3.org/2000/svg"><title>DeepSeek</title><path d="M23.748 4.482c-.254-.124-.364.113-.512.234-.051.039-.094.09-.137.136-.372.397-.806.657-1.373.626-.829-.046-1.537.214-2.163.848-.133-.782-.575-1.248-1.247-1.548-.352-.156-.708-.311-.955-.65-.172-.241-.219-.51-.305-.774-.055-.16-.11-.323-.293-.35-.2-.031-.278.136-.356.276-.313.572-.434 1.202-.422 1.84.027 1.436.633 2.58 1.838 3.393.137.093.172.187.129.323-.082.28-.18.552-.266.833-.055.179-.137.217-.329.14a5.526 5.526 0 01-1.736-1.18c-.857-.828-1.631-1.742-2.597-2.458a11.365 11.365 0 00-.689-.471c-.985-.957.13-1.743.388-1.836.27-.098.093-.432-.779-.428-.872.004-1.67.295-2.687.684a3.055 3.055 0 01-.465.137 9.597 9.597 0 00-2.883-.102c-1.885.21-3.39 1.102-4.497 2.623C.082 8.606-.231 10.684.152 12.85c.403 2.284 1.569 4.175 3.36 5.653 1.858 1.533 3.997 2.284 6.438 2.14 1.482-.085 3.133-.284 4.994-1.86.47.234.962.327 1.78.397.63.059 1.236-.03 1.705-.128.735-.156.684-.837.419-.961-2.155-1.004-1.682-.595-2.113-.926 1.096-1.296 2.746-2.642 3.392-7.003.05-.347.007-.565 0-.845-.004-.17.035-.237.23-.256a4.173 4.173 0 001.545-.475c1.396-.763 1.96-2.015 2.093-3.517.02-.23-.004-.467-.247-.588zM11.581 18c-2.089-1.642-3.102-2.183-3.52-2.16-.392.024-.321.471-.235.763.09.288.207.486.371.739.114.167.192.416-.113.603-.673.416-1.842-.14-1.897-.167-1.361-.802-2.5-1.86-3.301-3.307-.774-1.393-1.224-2.887-1.298-4.482-.02-.386.093-.522.477-.592a4.696 4.696 0 011.529-.039c2.132.312 3.946 1.265 5.468 2.774.868.86 1.525 1.887 2.202 2.891.72 1.066 1.494 2.082 2.48 2.914.348.292.625.514.891.677-.802.09-2.14.11-3.054-.614zm1-6.44a.306.306 0 01.415-.287.302.302 0 01.2.288.306.306 0 01-.31.307.303.303 0 01-.304-.308zm3.11 1.596c-.2.081-.399.151-.59.16a1.245 1.245 0 01-.798-.254c-.274-.23-.47-.358-.552-.758a1.73 1.73 0 01.016-.588c.07-.327-.008-.537-.239-.727-.187-.156-.426-.199-.688-.199a.559.559 0 01-.254-.078c-.11-.054-.2-.19-.114-.358.028-.054.16-.186.192-.21.356-.202.767-.136 1.146.016.352.144.618.408 1.001.782.391.451.462.576.685.914.176.265.336.537.445.848.067.195-.019.354-.25.452z" fill="#4D6BFE"></path></svg>

Before: 2.1 KiB (file removed)

View File

@@ -7,12 +7,14 @@ import {
MicrosoftIconSVG,
MistralIcon,
MetaIcon,
OpenAIIcon,
GeminiIcon,
OpenSourceIcon,
AnthropicSVG,
IconProps,
OpenAIISVG,
DeepseekIcon,
} from "@/components/icons/icons";
import { FaRobot } from "react-icons/fa";
export interface CustomConfigKey {
name: string;
@@ -74,31 +76,30 @@ export interface LLMProviderDescriptor {
}
export const getProviderIcon = (providerName: string, modelName?: string) => {
const modelIconMap: Record<
string,
({ size, className }: IconProps) => JSX.Element
> = {
amazon: AmazonIcon,
phi: MicrosoftIconSVG,
mistral: MistralIcon,
ministral: MistralIcon,
llama: MetaIcon,
gemini: GeminiIcon,
deepseek: DeepseekIcon,
claude: AnthropicIcon,
};
const modelNameToIcon = (
modelName: string,
fallbackIcon: ({ size, className }: IconProps) => JSX.Element
): (({ size, className }: IconProps) => JSX.Element) => {
const lowerModelName = modelName?.toLowerCase();
for (const [key, icon] of Object.entries(modelIconMap)) {
if (lowerModelName?.includes(key)) {
return icon;
}
if (modelName?.toLowerCase().includes("amazon")) {
return AmazonIcon;
}
if (modelName?.toLowerCase().includes("phi")) {
return MicrosoftIconSVG;
}
if (modelName?.toLowerCase().includes("mistral")) {
return MistralIcon;
}
if (modelName?.toLowerCase().includes("llama")) {
return MetaIcon;
}
if (modelName?.toLowerCase().includes("gemini")) {
return GeminiIcon;
}
if (modelName?.toLowerCase().includes("claude")) {
return AnthropicIcon;
} else {
return fallbackIcon;
}
return fallbackIcon;
};
switch (providerName) {

View File

@@ -1,6 +1,6 @@
"use client";
import { Form, Formik } from "formik";
import { ArrayHelpers, FieldArray, Form, Formik } from "formik";
import * as Yup from "yup";
import { PopupSpec } from "@/components/admin/connectors/Popup";
import {
@@ -10,14 +10,13 @@ import {
} from "./lib";
import { ConnectorStatus, DocumentSet, UserGroup, UserRole } from "@/lib/types";
import { TextFormField } from "@/components/admin/connectors/Field";
import { ConnectorTitle } from "@/components/admin/connectors/ConnectorTitle";
import { Separator } from "@/components/ui/separator";
import { Button } from "@/components/ui/button";
import { usePaidEnterpriseFeaturesEnabled } from "@/components/settings/usePaidEnterpriseFeaturesEnabled";
import { IsPublicGroupSelector } from "@/components/IsPublicGroupSelector";
import React, { useEffect, useState } from "react";
import { useUser } from "@/components/user/UserProvider";
import { ConnectorMultiSelect } from "@/components/ConnectorMultiSelect";
import { NonSelectableConnectors } from "@/components/NonSelectableConnectors";
interface SetCreationPopupProps {
ccPairs: ConnectorStatus<any, any>[];
@@ -46,7 +45,7 @@ export const DocumentSetCreationForm = ({
}, [existingDocumentSet?.is_public]);
return (
<div className="max-w-full mx-auto">
<div>
<Formik<DocumentSetCreationRequest>
initialValues={{
name: existingDocumentSet?.name ?? "",
@@ -105,122 +104,243 @@ export const DocumentSetCreationForm = ({
}}
>
{(props) => {
// Filter visible cc pairs for curator role
const visibleCcPairs =
user?.role === UserRole.CURATOR
? localCcPairs.filter(
(ccPair) =>
ccPair.access_type === "public" ||
(ccPair.groups.length > 0 &&
props.values.groups.every((group) =>
ccPair.groups.includes(group)
))
)
: localCcPairs;
// Filter non-visible cc pairs for curator role
const nonVisibleCcPairs =
user?.role === UserRole.CURATOR
? localCcPairs.filter(
(ccPair) =>
!(ccPair.access_type === "public") &&
(ccPair.groups.length === 0 ||
!props.values.groups.every((group) =>
ccPair.groups.includes(group)
))
)
: [];
// Deselect filtered out cc pairs
if (user?.role === UserRole.CURATOR) {
const visibleCcPairIds = visibleCcPairs.map(
(ccPair) => ccPair.cc_pair_id
);
props.values.cc_pair_ids = props.values.cc_pair_ids.filter((id) =>
visibleCcPairIds.includes(id)
);
}
return (
<Form className="space-y-6 w-full ">
<div className="space-y-4 w-full">
<TextFormField
name="name"
label="Name:"
placeholder="A name for the document set"
disabled={isUpdate}
autoCompleteDisabled={true}
/>
<TextFormField
name="description"
label="Description:"
placeholder="Describe what the document set represents"
autoCompleteDisabled={true}
optional={true}
<Form>
<TextFormField
name="name"
label="Name:"
placeholder="A name for the document set"
disabled={isUpdate}
autoCompleteDisabled={true}
/>
<TextFormField
name="description"
label="Description:"
placeholder="Describe what the document set represents"
autoCompleteDisabled={true}
optional={true}
/>
{isPaidEnterpriseFeaturesEnabled && (
<IsPublicGroupSelector
formikProps={props}
objectName="document set"
/>
)}
{isPaidEnterpriseFeaturesEnabled && (
<IsPublicGroupSelector
formikProps={props}
objectName="document set"
/>
)}
</div>
<Separator />
<Separator className="my-6" />
{user?.role === UserRole.CURATOR ? (
<>
<div className="flex flex-col gap-y-1">
<h2 className="mb-1 font-medium text-base">
These are the connectors available to{" "}
{userGroups && userGroups.length > 1
? "the selected group"
: "the group you curate"}
:
</h2>
<div className="space-y-6">
{user?.role === UserRole.CURATOR ? (
<>
<ConnectorMultiSelect
<p className="mb-text-sm">
All documents indexed by these selected connectors will be
a part of this document set.
</p>
<FieldArray
name="cc_pair_ids"
label={`Connectors available to ${
userGroups && userGroups.length > 1
? "the selected group"
: "the group you curate"
}`}
connectors={visibleCcPairs}
selectedIds={props.values.cc_pair_ids}
onChange={(selectedIds) => {
props.setFieldValue("cc_pair_ids", selectedIds);
render={(arrayHelpers: ArrayHelpers) => {
// Filter visible cc pairs
const visibleCcPairs = localCcPairs.filter(
(ccPair) =>
ccPair.access_type === "public" ||
(ccPair.groups.length > 0 &&
props.values.groups.every((group) =>
ccPair.groups.includes(group)
))
);
// Deselect filtered out cc pairs
const visibleCcPairIds = visibleCcPairs.map(
(ccPair) => ccPair.cc_pair_id
);
props.values.cc_pair_ids =
props.values.cc_pair_ids.filter((id) =>
visibleCcPairIds.includes(id)
);
return (
<div className="mb-3 flex gap-2 flex-wrap">
{visibleCcPairs.map((ccPair) => {
const ind = props.values.cc_pair_ids.indexOf(
ccPair.cc_pair_id
);
const isSelected = ind !== -1;
return (
<div
key={`${ccPair.connector.id}-${ccPair.credential.id}`}
className={
`
px-3
py-1
rounded-lg
border
border-border
w-fit
flex
cursor-pointer ` +
(isSelected
? " bg-background-200"
: " hover:bg-accent-background-hovered")
}
onClick={() => {
if (isSelected) {
arrayHelpers.remove(ind);
} else {
arrayHelpers.push(ccPair.cc_pair_id);
}
}}
>
<div className="my-auto">
<ConnectorTitle
connector={ccPair.connector}
ccPairId={ccPair.cc_pair_id}
ccPairName={ccPair.name}
isLink={false}
showMetadata={false}
/>
</div>
</div>
);
})}
</div>
);
}}
placeholder="Search for connectors..."
/>
</div>
<NonSelectableConnectors
connectors={nonVisibleCcPairs}
title={`Connectors not available to the ${
userGroups && userGroups.length > 1
? `group${
props.values.groups.length > 1 ? "s" : ""
} you have selected`
: "group you curate"
}`}
description="Only connectors that are directly assigned to the group you are trying to add the document set to will be available."
<div>
<FieldArray
name="cc_pair_ids"
render={() => {
// Filter non-visible cc pairs
const nonVisibleCcPairs = localCcPairs.filter(
(ccPair) =>
!(ccPair.access_type === "public") &&
(ccPair.groups.length === 0 ||
!props.values.groups.every((group) =>
ccPair.groups.includes(group)
))
);
return nonVisibleCcPairs.length > 0 ? (
<>
<Separator />
<h2 className="mb-1 font-medium text-base">
These connectors are not available to the{" "}
{userGroups && userGroups.length > 1
? `group${
props.values.groups.length > 1 ? "s" : ""
} you have selected`
: "group you curate"}
:
</h2>
<p className="mb-3 text-sm">
Only connectors that are directly assigned to the
group you are trying to add the document set to
will be available.
</p>
<div className="mb-3 flex gap-2 flex-wrap">
{nonVisibleCcPairs.map((ccPair) => (
<div
key={`${ccPair.connector.id}-${ccPair.credential.id}`}
className="px-3 py-1 rounded-lg border border-non-selectable-border w-fit flex cursor-not-allowed"
>
<div className="my-auto">
<ConnectorTitle
connector={ccPair.connector}
ccPairId={ccPair.cc_pair_id}
ccPairName={ccPair.name}
isLink={false}
showMetadata={false}
/>
</div>
</div>
))}
</div>
</>
) : null;
}}
/>
</>
) : (
<ConnectorMultiSelect
</div>
</>
) : (
<div>
<h2 className="mb-1 font-medium text-base">
Pick your connectors:
</h2>
<p className="mb-3 text-xs">
All documents indexed by the selected connectors will be a
part of this document set.
</p>
<FieldArray
name="cc_pair_ids"
label="Pick your connectors"
connectors={visibleCcPairs}
selectedIds={props.values.cc_pair_ids}
onChange={(selectedIds) => {
props.setFieldValue("cc_pair_ids", selectedIds);
}}
placeholder="Search for connectors..."
render={(arrayHelpers: ArrayHelpers) => (
<div className="mb-3 flex gap-2 flex-wrap">
{ccPairs.map((ccPair) => {
const ind = props.values.cc_pair_ids.indexOf(
ccPair.cc_pair_id
);
const isSelected = ind !== -1;
return (
<div
key={`${ccPair.connector.id}-${ccPair.credential.id}`}
className={
`
px-3
py-1
rounded-lg
border
border-border
w-fit
flex
cursor-pointer ` +
(isSelected
? " bg-background-200"
: " hover:bg-accent-background-hovered")
}
onClick={() => {
if (isSelected) {
arrayHelpers.remove(ind);
} else {
arrayHelpers.push(ccPair.cc_pair_id);
}
}}
>
<div className="my-auto">
<ConnectorTitle
connector={ccPair.connector}
ccPairId={ccPair.cc_pair_id}
ccPairName={ccPair.name}
isLink={false}
showMetadata={false}
/>
</div>
</div>
);
})}
</div>
)}
/>
)}
</div>
</div>
)}
<div className="flex mt-6 pt-4 border-t border-neutral-200">
<div className="flex mt-6">
<Button
type="submit"
variant="submit"
disabled={props.isSubmitting}
className="w-56 mx-auto py-1.5 h-auto text-sm"
className="w-64 mx-auto"
>
{isUpdate ? "Update Document Set" : "Create Document Set"}
{isUpdate ? "Update!" : "Create!"}
</Button>
</div>
</Form>

View File

@@ -15,7 +15,6 @@ import {
} from "./interfaces";
import { FiExternalLink } from "react-icons/fi";
import {
AmazonIcon,
CohereIcon,
LiteLLMIcon,
MixedBreadIcon,
@@ -243,11 +242,6 @@ const RerankingDetailsForm = forwardRef<
card.rerank_provider_type == RerankerProvider.COHERE
) {
setIsApiKeyModalOpen(true);
} else if (
card.rerank_provider_type ==
RerankerProvider.BEDROCK
) {
setIsApiKeyModalOpen(true);
} else if (
card.rerank_provider_type ==
RerankerProvider.LITELLM
@@ -284,9 +278,6 @@ const RerankingDetailsForm = forwardRef<
) : card.rerank_provider_type ===
RerankerProvider.COHERE ? (
<CohereIcon size={24} className="mr-2" />
) : card.rerank_provider_type ===
RerankerProvider.BEDROCK ? (
<AmazonIcon size={24} className="mr-2" />
) : (
<MixedBreadIcon size={24} className="mr-2" />
)}
@@ -446,10 +437,7 @@ const RerankingDetailsForm = forwardRef<
placeholder={
values.rerank_api_key
? "*".repeat(values.rerank_api_key.length)
: values.rerank_provider_type ===
RerankerProvider.BEDROCK
? "aws_ACCESSKEY_SECRETKEY_REGION"
: "Enter your API key"
: undefined
}
onChange={(e: React.ChangeEvent<HTMLInputElement>) => {
const value = e.target.value;
@@ -460,12 +448,7 @@ const RerankingDetailsForm = forwardRef<
setFieldValue("api_key", value);
}}
type="password"
label={
values.rerank_provider_type ===
RerankerProvider.BEDROCK
? "AWS Credentials in format: aws_ACCESSKEY_SECRETKEY_REGION"
: "Cohere API Key"
}
label="Cohere API Key"
name="rerank_api_key"
/>
<div className="flex w-full justify-end mt-4">

View File

@@ -18,7 +18,6 @@ export interface RerankingDetails {
export enum RerankerProvider {
COHERE = "cohere",
LITELLM = "litellm",
BEDROCK = "bedrock",
}
export enum EmbeddingPrecision {
@@ -101,15 +100,6 @@ export const rerankingModels: RerankingModel[] = [
description: "Powerful multilingual reranking model.",
link: "https://docs.cohere.com/v2/reference/rerank",
},
{
cloud: true,
rerank_provider_type: RerankerProvider.BEDROCK,
modelName: "cohere.rerank-v3-5:0",
displayName: "Cohere Rerank 3.5",
description:
"Powerful multilingual reranking model invoked through AWS Bedrock.",
link: "https://aws.amazon.com/blogs/machine-learning/cohere-rerank-3-5-is-now-available-in-amazon-bedrock-through-rerank-api",
},
];
export const getCurrentModelCopy = (

View File

@@ -26,8 +26,6 @@ import {
FiUnlock,
FiRefreshCw,
FiPauseCircle,
FiFilter,
FiX,
} from "react-icons/fi";
import {
Tooltip,
@@ -43,7 +41,6 @@ import Cookies from "js-cookie";
import { TOGGLED_CONNECTORS_COOKIE_NAME } from "@/lib/constants";
import { usePaidEnterpriseFeaturesEnabled } from "@/components/settings/usePaidEnterpriseFeaturesEnabled";
import { ConnectorCredentialPairStatus } from "../../connector/[ccPairId]/types";
import { FilterComponent, FilterOptions } from "./FilterComponent";
function SummaryRow({
source,
@@ -288,26 +285,7 @@ export function CCPairIndexingStatusTable({
return savedState ? JSON.parse(savedState) : {};
});
const [filterOptions, setFilterOptions] = useState<FilterOptions>({
accessType: null,
docsCountFilter: {
operator: null,
value: null,
},
lastStatus: null,
});
// Reference to the FilterComponent for resetting its state
const filterComponentRef = useRef<{
resetFilters: () => void;
} | null>(null);
const {
groupedStatuses,
sortedSources,
groupSummaries,
filteredGroupedStatuses,
} = useMemo(() => {
const { groupedStatuses, sortedSources, groupSummaries } = useMemo(() => {
const grouped: Record<ValidSources, ConnectorIndexingStatus<any, any>[]> =
{} as Record<ValidSources, ConnectorIndexingStatus<any, any>[]>;
@@ -359,139 +337,12 @@ export function CCPairIndexingStatusTable({
};
});
// Apply filters to create filtered grouped statuses
const filteredGrouped: Record<
ValidSources,
ConnectorIndexingStatus<any, any>[]
> = {} as Record<ValidSources, ConnectorIndexingStatus<any, any>[]>;
sorted.forEach((source) => {
const statuses = grouped[source];
// Apply filters
const filteredStatuses = statuses.filter((status) => {
// Filter by access type
if (filterOptions.accessType && filterOptions.accessType.length > 0) {
if (!filterOptions.accessType.includes(status.access_type)) {
return false;
}
}
// Filter by last status
if (filterOptions.lastStatus && filterOptions.lastStatus.length > 0) {
if (
!filterOptions.lastStatus.includes(
status.last_finished_status as any
)
) {
return false;
}
}
// Filter by docs count
if (filterOptions.docsCountFilter.operator) {
const { operator, value } = filterOptions.docsCountFilter;
// If only operator is selected (no value), show all
if (value === null) {
return true;
}
if (operator === ">" && !(status.docs_indexed > value)) {
return false;
} else if (operator === "<" && !(status.docs_indexed < value)) {
return false;
} else if (operator === "=" && status.docs_indexed !== value) {
return false;
}
}
return true;
});
if (filteredStatuses.length > 0) {
filteredGrouped[source] = filteredStatuses;
}
});
return {
groupedStatuses: grouped,
sortedSources: sorted,
groupSummaries: summaries,
filteredGroupedStatuses: filteredGrouped,
};
}, [ccPairsIndexingStatuses, editableCcPairsIndexingStatuses, filterOptions]);
// Determine which sources to display based on filters and search
const displaySources = useMemo(() => {
const hasActiveFilters =
(filterOptions.accessType && filterOptions.accessType.length > 0) ||
(filterOptions.lastStatus && filterOptions.lastStatus.length > 0) ||
filterOptions.docsCountFilter.operator !== null;
if (hasActiveFilters) {
return Object.keys(filteredGroupedStatuses) as ValidSources[];
}
return sortedSources;
}, [sortedSources, filteredGroupedStatuses, filterOptions]);
const handleFilterChange = (newFilters: FilterOptions) => {
setFilterOptions(newFilters);
// Auto-expand sources when filters are applied
if (
(newFilters.accessType && newFilters.accessType.length > 0) ||
(newFilters.lastStatus && newFilters.lastStatus.length > 0) ||
newFilters.docsCountFilter.operator !== null
) {
// We need to wait for the filteredGroupedStatuses to be updated
// before we can expand the sources
setTimeout(() => {
const sourcesToExpand = Object.keys(
filteredGroupedStatuses
) as ValidSources[];
const newConnectorsToggled = { ...connectorsToggled };
sourcesToExpand.forEach((source) => {
newConnectorsToggled[source] = true;
});
setConnectorsToggled(newConnectorsToggled);
Cookies.set(
TOGGLED_CONNECTORS_COOKIE_NAME,
JSON.stringify(newConnectorsToggled)
);
}, 0);
}
};
const clearAllFilters = () => {
const emptyFilters: FilterOptions = {
accessType: null,
docsCountFilter: {
operator: null,
value: null,
},
lastStatus: null,
};
setFilterOptions(emptyFilters);
// Reset the FilterComponent's internal state
if (filterComponentRef.current) {
filterComponentRef.current.resetFilters();
}
};
// Check if filters are active
const hasActiveFilters = useMemo(() => {
return (
(filterOptions.accessType && filterOptions.accessType.length > 0) ||
(filterOptions.lastStatus && filterOptions.lastStatus.length > 0) ||
filterOptions.docsCountFilter.operator !== null
);
}, [filterOptions]);
}, [ccPairsIndexingStatuses, editableCcPairsIndexingStatuses]);
const toggleSource = (
source: ValidSources,
@@ -525,194 +376,127 @@ export function CCPairIndexingStatusTable({
sortedSources.length;
return (
<>
<Table>
<TableHeader>
<ConnectorRow
invisible
ccPairsIndexingStatus={{
cc_pair_id: 1,
<Table>
<TableHeader>
<ConnectorRow
invisible
ccPairsIndexingStatus={{
cc_pair_id: 1,
name: "Sample File Connector",
cc_pair_status: ConnectorCredentialPairStatus.ACTIVE,
last_status: "success",
connector: {
name: "Sample File Connector",
cc_pair_status: ConnectorCredentialPairStatus.ACTIVE,
last_status: "success",
connector: {
name: "Sample File Connector",
source: ValidSources.File,
input_type: "poll",
connector_specific_config: {
file_locations: ["/path/to/sample/file.txt"],
},
refresh_freq: 86400,
prune_freq: null,
indexing_start: new Date("2023-07-01T12:00:00Z"),
id: 1,
credential_ids: [],
access_type: "public",
time_created: "2023-07-01T12:00:00Z",
time_updated: "2023-07-01T12:00:00Z",
},
credential: {
id: 1,
name: "Sample Credential",
source: ValidSources.File,
user_id: "1",
time_created: "2023-07-01T12:00:00Z",
time_updated: "2023-07-01T12:00:00Z",
credential_json: {},
admin_public: false,
source: ValidSources.File,
input_type: "poll",
connector_specific_config: {
file_locations: ["/path/to/sample/file.txt"],
},
refresh_freq: 86400,
prune_freq: null,
indexing_start: new Date("2023-07-01T12:00:00Z"),
id: 1,
credential_ids: [],
access_type: "public",
docs_indexed: 1000,
last_success: "2023-07-01T12:00:00Z",
last_finished_status: "success",
latest_index_attempt: null,
groups: [], // Add this line
}}
isEditable={false}
/>
</TableHeader>
<div className="flex -mt-12 items-center w-0 m4 gap-x-2">
<input
type="text"
ref={searchInputRef}
placeholder="Search connectors..."
value={searchTerm}
onChange={(e) => setSearchTerm(e.target.value)}
className="ml-1 w-96 h-9 border border-border flex-none rounded-md bg-background-50 px-3 py-1 text-sm shadow-sm transition-colors placeholder:text-muted-foreground focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring"
/>
time_created: "2023-07-01T12:00:00Z",
time_updated: "2023-07-01T12:00:00Z",
},
credential: {
id: 1,
name: "Sample Credential",
source: ValidSources.File,
user_id: "1",
time_created: "2023-07-01T12:00:00Z",
time_updated: "2023-07-01T12:00:00Z",
credential_json: {},
admin_public: false,
},
access_type: "public",
docs_indexed: 1000,
last_success: "2023-07-01T12:00:00Z",
last_finished_status: "success",
latest_index_attempt: null,
groups: [], // Add this line
}}
isEditable={false}
/>
</TableHeader>
<div className="flex -mt-12 items-center w-0 m4 gap-x-2">
<input
type="text"
ref={searchInputRef}
placeholder="Search connectors..."
value={searchTerm}
onChange={(e) => setSearchTerm(e.target.value)}
className="ml-1 w-96 h-9 border border-border flex-none rounded-md bg-background-50 px-3 py-1 text-sm shadow-sm transition-colors placeholder:text-muted-foreground focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring"
/>
<Button className="h-9" onClick={() => toggleSources()}>
{!shouldExpand ? "Collapse All" : "Expand All"}
</Button>
<div className="flex items-center gap-2">
<FilterComponent
onFilterChange={handleFilterChange}
ref={filterComponentRef}
/>
{hasActiveFilters && (
<div className="flex flex-none items-center gap-1 ml-2 max-w-[500px]">
{filterOptions.accessType &&
filterOptions.accessType.length > 0 && (
<Badge variant="secondary" className="px-2 py-0.5 text-xs">
Access: {filterOptions.accessType.join(", ")}
</Badge>
)}
{filterOptions.lastStatus &&
filterOptions.lastStatus.length > 0 && (
<Badge variant="secondary" className="px-2 py-0.5 text-xs">
Status:{" "}
{filterOptions.lastStatus
.map((s) => s.replace(/_/g, " "))
.join(", ")}
</Badge>
)}
{filterOptions.docsCountFilter.operator &&
filterOptions.docsCountFilter.value !== null && (
<Badge variant="secondary" className="px-2 py-0.5 text-xs">
Docs {filterOptions.docsCountFilter.operator}{" "}
{filterOptions.docsCountFilter.value}
</Badge>
)}
{filterOptions.docsCountFilter.operator &&
filterOptions.docsCountFilter.value === null && (
<Badge variant="secondary" className="px-2 py-0.5 text-xs">
Docs {filterOptions.docsCountFilter.operator} any
</Badge>
)}
<Badge
variant="outline"
className="px-2 py-0.5 text-xs border-red-400 bg-red-100 hover:border-red-600 cursor-pointer hover:bg-red-100 dark:hover:bg-red-900"
onClick={() => {
if (filterComponentRef.current) {
filterComponentRef.current.resetFilters();
setFilterOptions({
accessType: null,
docsCountFilter: {
operator: null,
value: null,
},
lastStatus: null,
});
}
}}
>
<span className="text-red-500 dark:text-red-400">Clear</span>
</Badge>
</div>
)}
</div>
</div>
<TableBody>
{displaySources
.filter(
(source) =>
source != "not_applicable" && source != "ingestion_api"
)
.map((source, ind) => {
const sourceMatches = source
.toLowerCase()
.includes(searchTerm.toLowerCase());
const statuses =
filteredGroupedStatuses[source] || groupedStatuses[source];
const matchingConnectors = statuses.filter((status) =>
<Button className="h-9" onClick={() => toggleSources()}>
{!shouldExpand ? "Collapse All" : "Expand All"}
</Button>
</div>
<TableBody>
{sortedSources
.filter(
(source) => source != "not_applicable" && source != "ingestion_api"
)
.map((source, ind) => {
const sourceMatches = source
.toLowerCase()
.includes(searchTerm.toLowerCase());
const matchingConnectors = groupedStatuses[source].filter(
(status) =>
(status.name || "")
.toLowerCase()
.includes(searchTerm.toLowerCase())
);
if (sourceMatches || matchingConnectors.length > 0) {
return (
<React.Fragment key={ind}>
<br className="mt-4" />
<SummaryRow
source={source}
summary={groupSummaries[source]}
isOpen={connectorsToggled[source] || false}
onToggle={() => toggleSource(source)}
/>
{connectorsToggled[source] && (
<>
<TableRow className="border border-border dark:border-neutral-700">
<TableHead>Name</TableHead>
<TableHead>Last Indexed</TableHead>
<TableHead>Activity</TableHead>
{isPaidEnterpriseFeaturesEnabled && (
<TableHead>Permissions</TableHead>
)}
<TableHead>Total Docs</TableHead>
<TableHead>Last Status</TableHead>
<TableHead></TableHead>
</TableRow>
{(sourceMatches ? statuses : matchingConnectors).map(
(ccPairsIndexingStatus) => (
<ConnectorRow
key={ccPairsIndexingStatus.cc_pair_id}
ccPairsIndexingStatus={ccPairsIndexingStatus}
isEditable={editableCcPairsIndexingStatuses.some(
(e) =>
e.cc_pair_id ===
ccPairsIndexingStatus.cc_pair_id
)}
/>
)
);
if (sourceMatches || matchingConnectors.length > 0) {
return (
<React.Fragment key={ind}>
<br className="mt-4" />
<SummaryRow
source={source}
summary={groupSummaries[source]}
isOpen={connectorsToggled[source] || false}
onToggle={() => toggleSource(source)}
/>
{connectorsToggled[source] && (
<>
<TableRow
noHover
className="border ! border-border dark:border-neutral-700"
>
<TableHead>Name</TableHead>
<TableHead>Last Indexed</TableHead>
<TableHead>Activity</TableHead>
{isPaidEnterpriseFeaturesEnabled && (
<TableHead>Permissions</TableHead>
)}
</>
)}
</React.Fragment>
);
}
return null;
})}
</TableBody>
</Table>
</>
<TableHead>Total Docs</TableHead>
<TableHead>Last Status</TableHead>
<TableHead></TableHead>
</TableRow>
{(sourceMatches
? groupedStatuses[source]
: matchingConnectors
).map((ccPairsIndexingStatus) => (
<ConnectorRow
key={ccPairsIndexingStatus.cc_pair_id}
ccPairsIndexingStatus={ccPairsIndexingStatus}
isEditable={editableCcPairsIndexingStatuses.some(
(e) =>
e.cc_pair_id === ccPairsIndexingStatus.cc_pair_id
)}
/>
))}
</>
)}
</React.Fragment>
);
}
return null;
})}
</TableBody>
</Table>
);
}
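The filtering logic deleted from this component is easiest to see in isolation. Below is a standalone sketch of the docs-count predicate, using only the FilterOptions shape visible in this diff; the helper name is hypothetical.

// Sketch of the docs-count check removed above: an operator with no value
// matches everything, as does the absence of any operator.
type DocsCountFilterSketch = {
  operator: ">" | "<" | "=" | null;
  value: number | null;
};

function matchesDocsCount(
  docsIndexed: number,
  filter: DocsCountFilterSketch
): boolean {
  const { operator, value } = filter;
  if (operator === null || value === null) return true; // incomplete filter: show all
  if (operator === ">") return docsIndexed > value;
  if (operator === "<") return docsIndexed < value;
  return docsIndexed === value; // "="
}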

View File

@@ -1,375 +0,0 @@
"use client";
import React, { useState, useImperativeHandle, forwardRef } from "react";
import {
DropdownMenu,
DropdownMenuContent,
DropdownMenuGroup,
DropdownMenuItem,
DropdownMenuLabel,
DropdownMenuSeparator,
DropdownMenuTrigger,
DropdownMenuCheckboxItem,
DropdownMenuRadioGroup,
DropdownMenuRadioItem,
} from "@/components/ui/dropdown-menu";
import { SortIcon } from "@/components/icons/icons";
import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
import { Badge } from "@/components/ui/badge";
import { AccessType, ValidStatuses } from "@/lib/types";
import { FiFilter, FiX, FiCheck } from "react-icons/fi";
export interface FilterOptions {
accessType: AccessType[] | null;
docsCountFilter: {
operator: ">" | "<" | "=" | null;
value: number | null;
};
lastStatus: ValidStatuses[] | null;
}
interface FilterComponentProps {
onFilterChange: (filters: FilterOptions) => void;
}
export const FilterComponent = forwardRef<
{ resetFilters: () => void },
FilterComponentProps
>(({ onFilterChange }, ref) => {
const [isOpen, setIsOpen] = useState(false);
const [filters, setFilters] = useState<FilterOptions>({
accessType: null,
docsCountFilter: {
operator: null,
value: null,
},
lastStatus: null,
});
// Local state for tracking selected filters before applying
const [docsOperator, setDocsOperator] = useState<">" | "<" | "=" | null>(
null
);
const [docsValue, setDocsValue] = useState<string>("");
const [selectedAccessTypes, setSelectedAccessTypes] = useState<AccessType[]>(
[]
);
const [selectedStatuses, setSelectedStatuses] = useState<ValidStatuses[]>([]);
// Expose resetFilters method via ref
useImperativeHandle(ref, () => ({
resetFilters: () => {
setDocsOperator(null);
setDocsValue("");
setSelectedAccessTypes([]);
setSelectedStatuses([]);
setFilters({
accessType: null,
docsCountFilter: {
operator: null,
value: null,
},
lastStatus: null,
});
},
}));
const handleAccessTypeChange = (accessType: AccessType) => {
const newAccessTypes = selectedAccessTypes.includes(accessType)
? selectedAccessTypes.filter((type) => type !== accessType)
: [...selectedAccessTypes, accessType];
setSelectedAccessTypes(newAccessTypes);
};
const handleStatusChange = (status: ValidStatuses) => {
const newStatuses = selectedStatuses.includes(status)
? selectedStatuses.filter((s) => s !== status)
: [...selectedStatuses, status];
setSelectedStatuses(newStatuses);
};
const handleDocsFilterChange = () => {
if (docsOperator && docsValue) {
const newFilters = {
...filters,
accessType: selectedAccessTypes.length > 0 ? selectedAccessTypes : null,
lastStatus: selectedStatuses.length > 0 ? selectedStatuses : null,
docsCountFilter: {
operator: docsOperator,
value: parseInt(docsValue),
},
};
setFilters(newFilters);
onFilterChange(newFilters);
setIsOpen(false);
}
};
const applyFilters = () => {
const newFilters = {
...filters,
accessType: selectedAccessTypes.length > 0 ? selectedAccessTypes : null,
lastStatus: selectedStatuses.length > 0 ? selectedStatuses : null,
docsCountFilter: {
operator: docsOperator,
value: docsValue ? parseInt(docsValue) : null,
},
};
setFilters(newFilters);
onFilterChange(newFilters);
setIsOpen(false);
};
const clearFilters = () => {
setSelectedAccessTypes([]);
setSelectedStatuses([]);
setDocsOperator(null);
setDocsValue("");
const newFilters = {
accessType: null,
docsCountFilter: {
operator: null,
value: null,
},
lastStatus: null,
};
setFilters(newFilters);
onFilterChange(newFilters);
};
// Sync local state with filters when dropdown opens
const handleOpenChange = (open: boolean) => {
if (open) {
// When opening, initialize local state from current filters
setSelectedAccessTypes(filters.accessType || []);
setSelectedStatuses(filters.lastStatus || []);
setDocsOperator(filters.docsCountFilter.operator);
setDocsValue(
filters.docsCountFilter.value !== null
? filters.docsCountFilter.value.toString()
: ""
);
}
setIsOpen(open);
};
const hasActiveFilters =
(filters.accessType && filters.accessType.length > 0) ||
(filters.lastStatus && filters.lastStatus.length > 0) ||
filters.docsCountFilter.operator !== null;
// Get active filter count for badge
const getActiveFilterCount = () => {
let count = 0;
if (filters.accessType && filters.accessType.length > 0) count++;
if (filters.lastStatus && filters.lastStatus.length > 0) count++;
if (filters.docsCountFilter.operator !== null) count++;
return count;
};
return (
<div className="relative">
<DropdownMenu open={isOpen} onOpenChange={handleOpenChange}>
<DropdownMenuTrigger asChild>
<Button
variant="outline"
size="sm"
className={`p-2 h-9 ${
hasActiveFilters ? "border-primary bg-primary/5" : ""
}`}
>
<SortIcon size={20} className="text-neutral-800" />
</Button>
</DropdownMenuTrigger>
<DropdownMenuContent
align="end"
className="w-72"
onCloseAutoFocus={(e) => e.preventDefault()}
>
<div className="flex items-center justify-between px-2 py-1.5">
<DropdownMenuLabel className="text-base font-medium">
Filter Connectors
</DropdownMenuLabel>
</div>
<DropdownMenuSeparator />
<DropdownMenuGroup>
<DropdownMenuLabel className="px-2 py-1.5 text-xs text-muted-foreground">
Access Type
</DropdownMenuLabel>
<div onClick={(e) => e.stopPropagation()}>
<DropdownMenuCheckboxItem
checked={selectedAccessTypes.includes("public")}
onCheckedChange={() => handleAccessTypeChange("public")}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
Public
</DropdownMenuCheckboxItem>
<DropdownMenuCheckboxItem
checked={selectedAccessTypes.includes("private")}
onCheckedChange={() => handleAccessTypeChange("private")}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
Private
</DropdownMenuCheckboxItem>
<DropdownMenuCheckboxItem
checked={selectedAccessTypes.includes("sync")}
onCheckedChange={() => handleAccessTypeChange("sync")}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
Auto-Sync
</DropdownMenuCheckboxItem>
</div>
</DropdownMenuGroup>
<DropdownMenuSeparator />
<DropdownMenuGroup>
<DropdownMenuLabel className="px-2 py-1.5 text-xs text-muted-foreground">
Last Status
</DropdownMenuLabel>
<div onClick={(e) => e.stopPropagation()}>
<DropdownMenuCheckboxItem
checked={selectedStatuses.includes("success")}
onCheckedChange={() => handleStatusChange("success")}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
Success
</DropdownMenuCheckboxItem>
<DropdownMenuCheckboxItem
checked={selectedStatuses.includes("failed")}
onCheckedChange={() => handleStatusChange("failed")}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
Failed
</DropdownMenuCheckboxItem>
<DropdownMenuCheckboxItem
checked={selectedStatuses.includes("in_progress")}
onCheckedChange={() => handleStatusChange("in_progress")}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
In Progress
</DropdownMenuCheckboxItem>
<DropdownMenuCheckboxItem
checked={selectedStatuses.includes("not_started")}
onCheckedChange={() => handleStatusChange("not_started")}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
Not Started
</DropdownMenuCheckboxItem>
<DropdownMenuCheckboxItem
checked={selectedStatuses.includes("completed_with_errors")}
onCheckedChange={() =>
handleStatusChange("completed_with_errors")
}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
Completed with Errors
</DropdownMenuCheckboxItem>
</div>
</DropdownMenuGroup>
<DropdownMenuSeparator />
<DropdownMenuGroup>
<DropdownMenuLabel className="px-2 py-1.5 text-xs text-muted-foreground">
Document Count
</DropdownMenuLabel>
<div
className="flex items-center px-2 py-2 gap-2"
onClick={(e) => e.stopPropagation()}
>
<div className="flex gap-2">
<Button
variant={docsOperator === ">" ? "default" : "outline"}
size="sm"
className="h-8 px-2"
onClick={(e) => {
e.preventDefault();
e.stopPropagation();
setDocsOperator(docsOperator === ">" ? null : ">");
}}
type="button"
>
&gt;
</Button>
<Button
variant={docsOperator === "<" ? "default" : "outline"}
size="sm"
className="h-8 px-2"
onClick={(e) => {
e.preventDefault();
e.stopPropagation();
setDocsOperator(docsOperator === "<" ? null : "<");
}}
type="button"
>
&lt;
</Button>
<Button
variant={docsOperator === "=" ? "default" : "outline"}
size="sm"
className="h-8 px-2"
onClick={(e) => {
e.preventDefault();
e.stopPropagation();
setDocsOperator(docsOperator === "=" ? null : "=");
}}
type="button"
>
=
</Button>
</div>
<Input
type="number"
placeholder="Count"
value={docsValue}
onChange={(e) => setDocsValue(e.target.value)}
className="h-8 w-full"
onClick={(e) => e.stopPropagation()}
/>
</div>
<div className="px-2 py-1.5">
<Button
size="sm"
className="w-full h-8"
disabled={false}
onClick={(e) => {
e.preventDefault();
e.stopPropagation();
applyFilters();
}}
type="button"
>
Apply
</Button>
</div>
</DropdownMenuGroup>
</DropdownMenuContent>
</DropdownMenu>
{hasActiveFilters && (
<div className="absolute -top-1 -right-1">
<Badge className="h-2 bg-red-400 border-red-400 w-2 p-0 border-2 flex items-center justify-center" />
</div>
)}
</div>
);
});
FilterComponent.displayName = "FilterComponent";
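The component above exposes an imperative resetFilters handle via useImperativeHandle, and the table earlier in this diff invokes it through a ref. A minimal usage sketch with a hypothetical parent (not repository code):

// Hypothetical parent mirroring how CCPairIndexingStatusTable (earlier in
// this diff) drives the handle; <FilterComponent ref={filterRef} ... />
// would be rendered alongside the button.
import React, { useRef } from "react";

function FilterToolbarSketch() {
  const filterRef = useRef<{ resetFilters: () => void } | null>(null);
  return (
    <button onClick={() => filterRef.current?.resetFilters()}>
      Clear all filters
    </button>
  );
}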

View File

@@ -26,7 +26,7 @@ export function Checkbox({
onChange: (e: React.ChangeEvent<HTMLInputElement>) => void;
}) {
return (
<label className="flex text-xs cursor-pointer">
<label className="flex text-sm cursor-pointer">
<input
checked={checked}
onChange={onChange}
@@ -34,7 +34,7 @@ export function Checkbox({
className="mr-2 w-3.5 h-3.5 my-auto"
/>
<div>
<Label small>{label}</Label>
<Label>{label}</Label>
{sublabel && <SubLabel>{sublabel}</SubLabel>}
</div>
</label>
@@ -208,7 +208,7 @@ export function SettingsForm() {
}
return (
<div className="flex flex-col pb-8">
<div>
{popup}
<Title className="mb-4">Workspace Settings</Title>
<Checkbox
@@ -307,51 +307,6 @@ export function SettingsForm() {
</Button>
</>
)}
{/* Image Processing Settings */}
<Title className="mt-8 mb-4">Image Processing</Title>
<div className="flex flex-col gap-2">
<Checkbox
label="Enable Image Extraction and Analysis"
sublabel="Extract and analyze images from documents during indexing. This allows the system to process images and create searchable descriptions of them."
checked={settings.image_extraction_and_analysis_enabled ?? false}
onChange={(e) =>
handleToggleSettingsField(
"image_extraction_and_analysis_enabled",
e.target.checked
)
}
/>
<Checkbox
label="Enable Search-time Image Analysis"
sublabel="Analyze images at search time when a user asks about images. This provides more detailed and query-specific image analysis but may increase search-time latency."
checked={settings.search_time_image_analysis_enabled ?? false}
onChange={(e) =>
handleToggleSettingsField(
"search_time_image_analysis_enabled",
e.target.checked
)
}
/>
<IntegerInput
label="Maximum Image Size for Analysis (MB)"
sublabel="Images larger than this size will not be analyzed to prevent excessive resource usage."
value={settings.image_analysis_max_size_mb ?? null}
onChange={(e) => {
const value = e.target.value ? parseInt(e.target.value) : null;
if (value !== null && !isNaN(value) && value > 0) {
updateSettingField([
{ fieldName: "image_analysis_max_size_mb", newValue: value },
]);
}
}}
id="image-analysis-max-size"
placeholder="Enter maximum size in MB"
/>
</div>
</div>
);
}
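The removed IntegerInput handler guards against empty, non-numeric, and non-positive input before persisting the setting. The same validation as a standalone sketch (hypothetical helper name):

// Sketch of the validation performed inline above.
function parsePositiveInt(raw: string): number | null {
  const value = raw ? parseInt(raw, 10) : NaN;
  return !isNaN(value) && value > 0 ? value : null;
}

// parsePositiveInt("12") === 12; parsePositiveInt("") === null;
// parsePositiveInt("-3") === null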

View File

@@ -21,11 +21,6 @@ export interface Settings {
auto_scroll: boolean;
temperature_override_enabled: boolean;
query_history_type: QueryHistoryType;
// Image processing settings
image_extraction_and_analysis_enabled?: boolean;
search_time_image_analysis_enabled?: boolean;
image_analysis_max_size_mb?: number;
}
export enum NotificationType {

View File

@@ -204,6 +204,7 @@ export function ChatPage({
const [documentSidebarVisible, setDocumentSidebarVisible] = useState(false);
const [proSearchEnabled, setProSearchEnabled] = useState(proSearchToggled);
const [streamingAllowed, setStreamingAllowed] = useState(false);
const toggleProSearch = () => {
Cookies.set(
PRO_SEARCH_TOGGLED_COOKIE_NAME,
@@ -1978,6 +1979,8 @@ export function ChatPage({
const innerSidebarElementRef = useRef<HTMLDivElement>(null);
const [settingsToggled, setSettingsToggled] = useState(false);
const [showDeleteAllModal, setShowDeleteAllModal] = useState(false);
const currentPersona = alternativeAssistant || liveAssistant;
const HORIZON_DISTANCE = 800;
@@ -2139,6 +2142,32 @@ export function ChatPage({
<ChatPopup />
{showDeleteAllModal && (
<ConfirmEntityModal
entityType="All Chats"
entityName="all your chat sessions"
onClose={() => setShowDeleteAllModal(false)}
additionalDetails="This action cannot be undone. All your chat sessions will be deleted."
onSubmit={async () => {
const response = await deleteAllChatSessions("Chat");
if (response.ok) {
setShowDeleteAllModal(false);
setPopup({
message: "All your chat sessions have been deleted.",
type: "success",
});
refreshChatSessions();
router.push("/chat");
} else {
setPopup({
message: "Failed to delete all chat sessions.",
type: "error",
});
}
}}
/>
)}
{currentFeedback && (
<FeedbackModal
feedbackType={currentFeedback[0]}
@@ -2296,6 +2325,7 @@ export function ChatPage({
folders={folders}
removeToggle={removeToggle}
showShareModal={showShareModal}
showDeleteAllModal={() => setShowDeleteAllModal(true)}
/>
</div>
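The modal added here wraps the API call in a success/error popup plus a refresh-and-redirect on success. Factored out of the JSX, the flow is roughly the following sketch with injected callbacks; it is not repository code.

// Sketch: the onSubmit flow from the ConfirmEntityModal above,
// dependency-injected so it is testable in isolation.
async function deleteAllAndNotify(
  deleteAll: () => Promise<Response>,
  notify: (message: string, type: "success" | "error") => void,
  onSuccess: () => void
): Promise<void> {
  const response = await deleteAll();
  if (response.ok) {
    notify("All your chat sessions have been deleted.", "success");
    onSuccess(); // close modal, refresh sessions, navigate to /chat
  } else {
    notify("Failed to delete all chat sessions.", "error");
  }
}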

View File

@@ -403,8 +403,7 @@ export function ChatInputBar({
setTabbingIconIndex((tabbingIconIndex) =>
Math.min(
tabbingIconIndex + 1,
// showPrompts ? filteredPrompts.length :
assistantTagOptions.length
showPrompts ? filteredPrompts.length : assistantTagOptions.length
)
);
} else if (e.key === "ArrowUp") {
@@ -437,8 +436,8 @@ export function ChatInputBar({
<button
key={index}
className={`px-2 ${
tabbingIconIndex == index && "bg-neutral-200"
} rounded items-center rounded-lg content-start flex gap-x-1 py-2 w-full hover:bg-neutral-200/90 cursor-pointer`}
tabbingIconIndex == index && "bg-background-dark/75"
} rounded items-center rounded-lg content-start flex gap-x-1 py-2 w-full hover:bg-background-dark/90 cursor-pointer`}
onClick={() => {
updatedTaggedAssistant(currentAssistant);
}}
@@ -460,8 +459,8 @@ export function ChatInputBar({
target="_self"
className={`${
tabbingIconIndex == assistantTagOptions.length &&
"bg-neutral-200"
} rounded rounded-lg px-3 flex gap-x-1 py-2 w-full items-center hover:bg-neutral-200/90 cursor-pointer`}
"bg-background-dark/75"
} rounded rounded-lg px-3 flex gap-x-1 py-2 w-full items-center hover:bg-background-dark/90 cursor-pointer`}
href="/assistants/new"
>
<FiPlus size={17} />

View File

@@ -329,7 +329,7 @@ export async function deleteChatSession(chatSessionId: string) {
return response;
}
export async function deleteAllChatSessions() {
export async function deleteAllChatSessions(sessionType: "Chat" | "Search") {
const response = await fetch(`/api/chat/delete-all-chat-sessions`, {
method: "DELETE",
headers: {

View File

@@ -24,9 +24,6 @@ import { Monitor, Moon, Sun } from "lucide-react";
import { useTheme } from "next-themes";
import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
import { FiTrash2 } from "react-icons/fi";
import { deleteAllChatSessions } from "../lib";
import { useChatContext } from "@/components/context/ChatContext";
type SettingsSection = "settings" | "password";
@@ -50,8 +47,6 @@ export function UserSettingsModal({
updateUserShortcuts,
updateUserTemperatureOverrideEnabled,
} = useUser();
const { refreshChatSessions } = useChatContext();
const router = useRouter();
const containerRef = useRef<HTMLDivElement>(null);
const messageRef = useRef<HTMLDivElement>(null);
const { theme, setTheme } = useTheme();
@@ -62,8 +57,6 @@ export function UserSettingsModal({
const [isLoading, setIsLoading] = useState(false);
const [activeSection, setActiveSection] =
useState<SettingsSection>("settings");
const [isDeleteAllLoading, setIsDeleteAllLoading] = useState(false);
const [showDeleteConfirmation, setShowDeleteConfirmation] = useState(false);
useEffect(() => {
const container = containerRef.current;
@@ -139,6 +132,7 @@ export function UserSettingsModal({
);
});
const router = useRouter();
const handleChangedefaultModel = async (defaultModel: string | null) => {
try {
const response = await setUserDefaultModel(defaultModel);
@@ -211,31 +205,6 @@ export function UserSettingsModal({
};
const showPasswordSection = user?.password_configured;
const handleDeleteAllChats = async () => {
setIsDeleteAllLoading(true);
try {
const response = await deleteAllChatSessions();
if (response.ok) {
setPopup({
message: "All your chat sessions have been deleted.",
type: "success",
});
refreshChatSessions();
router.push("/chat");
} else {
throw new Error("Failed to delete all chat sessions");
}
} catch (error) {
setPopup({
message: "Failed to delete all chat sessions",
type: "error",
});
} finally {
setIsDeleteAllLoading(false);
setShowDeleteConfirmation(false);
}
};
return (
<Modal
onOutsideClick={onClose}
@@ -381,51 +350,6 @@ export function UserSettingsModal({
}}
/>
</div>
<div className="pt-4 border-t border-border">
{!showDeleteConfirmation ? (
<div className="space-y-3">
<p className="text-sm text-neutral-600 ">
This will permanently delete all your chat sessions and
cannot be undone.
</p>
<Button
variant="destructive"
className="w-full flex items-center justify-center"
onClick={() => setShowDeleteConfirmation(true)}
>
<FiTrash2 className="mr-2" size={14} />
Delete All Chats
</Button>
</div>
) : (
<div className="space-y-3">
<p className="text-sm text-neutral-600 ">
Are you sure you want to delete all your chat sessions?
</p>
<div className="flex gap-2">
<Button
type="button"
variant="destructive"
className="flex-1 flex items-center justify-center"
onClick={handleDeleteAllChats}
disabled={isDeleteAllLoading}
>
{isDeleteAllLoading
? "Deleting..."
: "Yes, Delete All"}
</Button>
<Button
variant="outline"
className="flex-1"
onClick={() => setShowDeleteConfirmation(false)}
disabled={isDeleteAllLoading}
>
Cancel
</Button>
</div>
</div>
)}
</div>
</div>
)}
{activeSection === "password" && (

View File

@@ -64,6 +64,7 @@ interface HistorySidebarProps {
showShareModal?: (chatSession: ChatSession) => void;
showDeleteModal?: (chatSession: ChatSession) => void;
explicitlyUntoggle: () => void;
showDeleteAllModal?: () => void;
setShowAssistantsModal: (show: boolean) => void;
toggleChatSessionSearchModal?: () => void;
}
@@ -182,6 +183,7 @@ export const HistorySidebar = forwardRef<HTMLDivElement, HistorySidebarProps>(
showShareModal,
toggleChatSessionSearchModal,
showDeleteModal,
showDeleteAllModal,
},
ref: ForwardedRef<HTMLDivElement>
) => {
@@ -401,6 +403,7 @@ export const HistorySidebar = forwardRef<HTMLDivElement, HistorySidebarProps>(
existingChats={existingChats}
currentChatId={currentChatId}
folders={folders}
showDeleteAllModal={showDeleteAllModal}
/>
</div>
</div>

View File

@@ -10,6 +10,7 @@ import { Folder } from "../folders/interfaces";
import { usePopup } from "@/components/admin/connectors/Popup";
import { useRouter } from "next/navigation";
import { FiPlus, FiTrash2, FiCheck, FiX } from "react-icons/fi";
import { NEXT_PUBLIC_DELETE_ALL_CHATS_ENABLED } from "@/lib/constants";
import { FolderDropdown } from "../folders/FolderDropdown";
import { ChatSessionDisplay } from "./ChatSessionDisplay";
import { useState, useCallback, useRef, useContext, useEffect } from "react";
@@ -106,6 +107,7 @@ export function PagesTab({
closeSidebar,
showShareModal,
showDeleteModal,
showDeleteAllModal,
toggleChatSessionSearchModal,
}: {
existingChats?: ChatSession[];
@@ -115,6 +117,7 @@ export function PagesTab({
closeSidebar?: () => void;
showShareModal?: (chatSession: ChatSession) => void;
showDeleteModal?: (chatSession: ChatSession) => void;
showDeleteAllModal?: () => void;
}) {
const { setPopup, popup } = usePopup();
const router = useRouter();
@@ -435,7 +438,11 @@ export function PagesTab({
</DndContext>
)}
<div className="pl-4 pr-3">
<div
className={`pl-4 pr-3 ${
NEXT_PUBLIC_DELETE_ALL_CHATS_ENABLED && "pb-20"
}`}
>
{!isHistoryEmpty && (
<>
{Object.entries(groupedChatSesssions)
@@ -472,6 +479,17 @@ export function PagesTab({
</p>
)}
</div>
{showDeleteAllModal && NEXT_PUBLIC_DELETE_ALL_CHATS_ENABLED && (
<div className="absolute w-full border-t border-t-border bg-background-100 bottom-0 left-0 p-4">
<button
className="px-4 w-full py-2 px-4 text-text-600 hover:text-text-800 bg-background-125 border border-border-strong/50 shadow-sm rounded-md transition-colors duration-200 flex items-center justify-center text-sm"
onClick={showDeleteAllModal}
>
<FiTrash2 className="mr-2" size={14} />
Clear All History
</button>
</div>
)}
</div>
);
}

View File

@@ -206,7 +206,7 @@ export function SharedChatDisplay({
{chatSession.description || `Unnamed Chat`}
</h1>
<p className=" text-text-darker">
{humanReadableFormat(chatSession.time_created)}
{humanReadableFormat(chatSession.time_updated)}
</p>
<div
className={`

View File

@@ -1,5 +1,5 @@
import { ConnectorStatus } from "@/lib/types";
import { ConnectorMultiSelect } from "@/components/ConnectorMultiSelect";
import { ConnectorIndexingStatus, ConnectorStatus } from "@/lib/types";
import { ConnectorTitle } from "@/components/admin/connectors/ConnectorTitle";
interface ConnectorEditorProps {
selectedCCPairIds: number[];
@@ -12,20 +12,55 @@ export const ConnectorEditor = ({
setSetCCPairIds,
allCCPairs,
}: ConnectorEditorProps) => {
// Filter out public docs, since they don't make sense as part of a group
const privateCCPairs = allCCPairs.filter(
(ccPair) => ccPair.access_type === "private"
);
return (
<ConnectorMultiSelect
name="connectors"
label="Connectors"
connectors={privateCCPairs}
selectedIds={selectedCCPairIds}
onChange={setSetCCPairIds}
placeholder="Search for connectors..."
showError={true}
/>
<div className="mb-3 flex gap-2 flex-wrap">
{allCCPairs
// remove public docs, since they don't make sense as part of a group
.filter((ccPair) => !(ccPair.access_type === "public"))
.map((ccPair) => {
const ind = selectedCCPairIds.indexOf(ccPair.cc_pair_id);
const isSelected = ind !== -1;
return (
<div
key={`${ccPair.connector.id}-${ccPair.credential.id}`}
className={
`
px-3
py-1
rounded-lg
border
border-border
w-fit
flex
cursor-pointer ` +
(isSelected
? " bg-accent-background-hovered"
: " hover:bg-accent-background")
}
onClick={() => {
if (isSelected) {
setSetCCPairIds(
selectedCCPairIds.filter(
(ccPairId) => ccPairId !== ccPair.cc_pair_id
)
);
} else {
setSetCCPairIds([...selectedCCPairIds, ccPair.cc_pair_id]);
}
}}
>
<div className="my-auto">
<ConnectorTitle
connector={ccPair.connector}
ccPairId={ccPair.cc_pair_id}
ccPairName={ccPair.name}
isLink={false}
showMetadata={false}
/>
</div>
</div>
);
})}
</div>
);
};
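Both the old and new versions of this editor toggle a cc_pair id in and out of the selected list. The toggle as a pure helper (sketch, hypothetical name):

// Sketch of the select/deselect logic shown in both versions above.
function toggleCCPairId(selected: number[], ccPairId: number): number[] {
  return selected.includes(ccPairId)
    ? selected.filter((id) => id !== ccPairId)
    : [...selected, ccPairId];
}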

View File

@@ -1,15 +1,13 @@
import { Button } from "@/components/Button";
import { SearchMultiSelectDropdown } from "@/components/Dropdown";
import { Modal } from "@/components/Modal";
import { useState } from "react";
import { FiX } from "react-icons/fi";
import { FiPlus, FiX } from "react-icons/fi";
import { updateUserGroup } from "./lib";
import { PopupSpec } from "@/components/admin/connectors/Popup";
import { ConnectorStatus, UserGroup } from "@/lib/types";
import { ConnectorTitle } from "@/components/admin/connectors/ConnectorTitle";
import { Connector } from "@/lib/connectors/connectors";
import { ConnectorMultiSelect } from "@/components/ConnectorMultiSelect";
import { Form } from "formik";
interface AddConnectorFormProps {
ccPairs: ConnectorStatus<any, any>[];
userGroup: UserGroup;
@@ -25,68 +23,132 @@ export const AddConnectorForm: React.FC<AddConnectorFormProps> = ({
}) => {
const [selectedCCPairIds, setSelectedCCPairIds] = useState<number[]>([]);
// Filter out ccPairs that are already in the user group and are not private
const availableCCPairs = ccPairs
.filter(
(ccPair) =>
!userGroup.cc_pairs
.map((userGroupCCPair) => userGroupCCPair.id)
.includes(ccPair.cc_pair_id)
)
.filter((ccPair) => ccPair.access_type === "private");
const selectedCCPairs = ccPairs.filter((ccPair) =>
selectedCCPairIds.includes(ccPair.cc_pair_id)
);
return (
<Modal
className="max-w-3xl"
title="Add New Connector"
onOutsideClick={() => onClose()}
>
<div className="px-6 pt-4">
<ConnectorMultiSelect
name="connectors"
label="Select Connectors"
connectors={availableCCPairs}
selectedIds={selectedCCPairIds}
onChange={setSelectedCCPairIds}
placeholder="Search for connectors to add..."
showError={false}
/>
<Modal title="Add New Connector" onOutsideClick={() => onClose()}>
<div className="px-6 pt-4 pb-12">
<div className="mb-2 flex flex-wrap gap-x-2">
{selectedCCPairs.length > 0 &&
selectedCCPairs.map((ccPair) => (
<div
key={ccPair.cc_pair_id}
onClick={() => {
setSelectedCCPairIds(
selectedCCPairIds.filter(
(ccPairId) => ccPairId !== ccPair.cc_pair_id
)
);
}}
className={`
flex
rounded-lg
px-2
py-1
my-1
border
border-border
hover:bg-accent-background-hovered
cursor-pointer`}
>
<ConnectorTitle
ccPairId={ccPair.cc_pair_id}
ccPairName={ccPair.name}
connector={ccPair.connector}
isLink={false}
showMetadata={false}
/>
<FiX className="ml-1 my-auto" />
</div>
))}
</div>
<Button
className="mt-4 flex-nowrap w-48"
onClick={async () => {
const newCCPairIds = [
...Array.from(
new Set(
userGroup.cc_pairs
.map((ccPair) => ccPair.id)
.concat(selectedCCPairIds)
)
),
];
const response = await updateUserGroup(userGroup.id, {
user_ids: userGroup.users.map((user) => user.id),
cc_pair_ids: newCCPairIds,
});
if (response.ok) {
setPopup({
message: "Successfully added connectors to group",
type: "success",
<div className="flex">
<SearchMultiSelectDropdown
options={ccPairs
.filter(
(ccPair) =>
!selectedCCPairIds.includes(ccPair.cc_pair_id) &&
!userGroup.cc_pairs
.map((userGroupCCPair) => userGroupCCPair.id)
.includes(ccPair.cc_pair_id)
)
// remove public and synced docs, since they don't make sense as part of a group
.filter((ccPair) => ccPair.access_type === "private")
.map((ccPair) => {
return {
name: ccPair.name?.toString() || "",
value: ccPair.cc_pair_id?.toString(),
metadata: {
ccPairId: ccPair.cc_pair_id,
connector: ccPair.connector,
},
};
})}
onSelect={(option) => {
setSelectedCCPairIds([
...Array.from(
new Set([
...selectedCCPairIds,
parseInt(option.value as string),
])
),
]);
}}
itemComponent={({ option }) => (
<div className="flex px-4 py-2.5 hover:bg-accent-background-hovered cursor-pointer">
<div className="my-auto">
<ConnectorTitle
ccPairId={option?.metadata?.ccPairId as number}
ccPairName={option.name}
connector={option?.metadata?.connector as Connector<any>}
isLink={false}
showMetadata={false}
/>
</div>
<div className="ml-auto my-auto">
<FiPlus />
</div>
</div>
)}
/>
<Button
className="ml-3 flex-nowrap w-48"
onClick={async () => {
const newCCPairIds = [
...Array.from(
new Set(
userGroup.cc_pairs
.map((ccPair) => ccPair.id)
.concat(selectedCCPairIds)
)
),
];
const response = await updateUserGroup(userGroup.id, {
user_ids: userGroup.users.map((user) => user.id),
cc_pair_ids: newCCPairIds,
});
onClose();
} else {
const responseJson = await response.json();
const errorMsg = responseJson.detail || responseJson.message;
setPopup({
message: `Failed to add connectors to group - ${errorMsg}`,
type: "error",
});
onClose();
}
}}
>
Add Connectors
</Button>
if (response.ok) {
setPopup({
message: "Successfully added users to group",
type: "success",
});
onClose();
} else {
const responseJson = await response.json();
const errorMsg = responseJson.detail || responseJson.message;
setPopup({
message: `Failed to add users to group - ${errorMsg}`,
type: "error",
});
onClose();
}
}}
>
Add Connectors
</Button>
</div>
</div>
</Modal>
);
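Both versions of this form merge the group's existing cc_pair ids with the new selection and deduplicate through a Set before calling updateUserGroup. The merge as a pure helper (sketch):

// Sketch of the id merge used in both versions above.
function mergeCCPairIds(existing: number[], selected: number[]): number[] {
  return Array.from(new Set([...existing, ...selected]));
}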

View File

@@ -1,232 +0,0 @@
import React, { useState, useRef, useEffect } from "react";
import { ConnectorStatus } from "@/lib/types";
import { ConnectorTitle } from "@/components/admin/connectors/ConnectorTitle";
import { X, Search } from "lucide-react";
import { Label } from "@/components/ui/label";
import { ErrorMessage } from "formik";
interface ConnectorMultiSelectProps {
name: string;
label: string;
connectors: ConnectorStatus<any, any>[];
selectedIds: number[];
onChange: (selectedIds: number[]) => void;
disabled?: boolean;
placeholder?: string;
showError?: boolean;
}
export const ConnectorMultiSelect = ({
name,
label,
connectors,
selectedIds,
onChange,
disabled = false,
placeholder = "Search connectors...",
showError = false,
}: ConnectorMultiSelectProps) => {
const [open, setOpen] = useState(false);
const [searchQuery, setSearchQuery] = useState("");
const dropdownRef = useRef<HTMLDivElement>(null);
const inputRef = useRef<HTMLInputElement>(null);
const selectedConnectors = connectors.filter((connector) =>
selectedIds.includes(connector.cc_pair_id)
);
const unselectedConnectors = connectors.filter(
(connector) => !selectedIds.includes(connector.cc_pair_id)
);
const allConnectorsSelected = unselectedConnectors.length === 0;
const filteredUnselectedConnectors = unselectedConnectors.filter(
(connector) => {
const connectorName = connector.name || connector.connector.source;
return connectorName.toLowerCase().includes(searchQuery.toLowerCase());
}
);
useEffect(() => {
if (allConnectorsSelected && open) {
setOpen(false);
inputRef.current?.blur();
setSearchQuery("");
}
}, [allConnectorsSelected, open]);
useEffect(() => {
if (allConnectorsSelected) {
inputRef.current?.blur();
setSearchQuery("");
}
}, [allConnectorsSelected, selectedIds]);
const selectConnector = (connectorId: number) => {
const newSelectedIds = [...selectedIds, connectorId];
onChange(newSelectedIds);
setSearchQuery("");
const willAllBeSelected = connectors.length === newSelectedIds.length;
if (!willAllBeSelected) {
setTimeout(() => {
inputRef.current?.focus();
}, 0);
}
};
const removeConnector = (connectorId: number) => {
onChange(selectedIds.filter((id) => id !== connectorId));
};
useEffect(() => {
const handleClickOutside = (event: MouseEvent) => {
if (
dropdownRef.current &&
!dropdownRef.current.contains(event.target as Node) &&
inputRef.current !== event.target &&
!inputRef.current?.contains(event.target as Node)
) {
setOpen(false);
}
};
document.addEventListener("mousedown", handleClickOutside);
return () => {
document.removeEventListener("mousedown", handleClickOutside);
};
}, []);
const handleKeyDown = (e: React.KeyboardEvent) => {
if (e.key === "Escape") {
setOpen(false);
}
};
const effectivePlaceholder = allConnectorsSelected
? "All connectors selected"
: placeholder;
const isInputDisabled = disabled || allConnectorsSelected;
return (
<div className="flex flex-col w-full space-y-2 mb-4">
{label && <Label className="text-base font-medium">{label}</Label>}
<p className="text-xs text-neutral-500 ">
All documents indexed by the selected connectors will be part of this
document set.
</p>
<div className="relative">
<div
className={`flex items-center border border-input rounded-md border border-neutral-200 ${
allConnectorsSelected ? "bg-neutral-50" : ""
} focus-within:ring-1 focus-within:ring-ring focus-within:border-neutral-400 transition-colors`}
>
<Search className="absolute left-3 h-4 w-4 text-neutral-500" />
<input
ref={inputRef}
type="text"
value={searchQuery}
onChange={(e) => {
setSearchQuery(e.target.value);
setOpen(true);
}}
onFocus={() => {
if (!allConnectorsSelected) {
setOpen(true);
}
}}
onKeyDown={handleKeyDown}
placeholder={effectivePlaceholder}
className={`h-9 w-full pl-9 pr-10 py-2 bg-transparent text-sm outline-none disabled:cursor-not-allowed disabled:opacity-50 ${
allConnectorsSelected ? "text-neutral-500" : ""
}`}
disabled={isInputDisabled}
/>
</div>
{open && !allConnectorsSelected && (
<div
ref={dropdownRef}
className="absolute z-50 w-full mt-1 rounded-md border border-neutral-200 bg-white shadow-md default-scrollbar max-h-[300px] overflow-auto"
>
{filteredUnselectedConnectors.length === 0 ? (
<div className="py-4 text-center text-xs text-neutral-500">
{searchQuery
? "No matching connectors found"
: "No more connectors available"}
</div>
) : (
<div>
{filteredUnselectedConnectors.map((connector) => (
<div
key={connector.cc_pair_id}
className="flex items-center justify-between py-2 px-3 cursor-pointer hover:bg-neutral-50 text-xs"
onClick={() => selectConnector(connector.cc_pair_id)}
>
<div className="flex items-center truncate mr-2">
<ConnectorTitle
connector={connector.connector}
ccPairId={connector.cc_pair_id}
ccPairName={connector.name}
isLink={false}
showMetadata={false}
/>
</div>
</div>
))}
</div>
)}
</div>
)}
</div>
{selectedConnectors.length > 0 ? (
<div className="mt-3 ">
<div className="flex flex-wrap gap-1.5">
{selectedConnectors.map((connector) => (
<div
key={connector.cc_pair_id}
className="flex items-center bg-white rounded-md border border-neutral-300 transition-all px-2 py-1 max-w-full group text-xs"
>
<div className="flex items-center overflow-hidden">
<div className="flex-shrink-0 text-xs">
<ConnectorTitle
connector={connector.connector}
ccPairId={connector.cc_pair_id}
ccPairName={connector.name}
isLink={false}
showMetadata={false}
/>
</div>
</div>
<button
className="ml-1 flex-shrink-0 rounded-full w-4 h-4 flex items-center justify-center bg-neutral-100 text-neutral-500 hover:bg-neutral-200 hover:text-neutral-700 transition-colors group-hover:bg-neutral-200"
onClick={() => removeConnector(connector.cc_pair_id)}
aria-label="Remove connector"
>
<X className="h-2.5 w-2.5" />
</button>
</div>
))}
</div>
</div>
) : (
<div className="mt-3 p-3 border border-dashed border-neutral-300 rounded-md bg-neutral-50 text-neutral-500 text-xs">
No connectors selected. Search and select connectors above.
</div>
)}
{showError && (
<ErrorMessage
name={name}
component="div"
className="text-red-500 text-xs mt-1"
/>
)}
</div>
);
};
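The removed component inlines a click-outside listener to close its dropdown. Extracted as a reusable hook, the same logic would look roughly like this; no such hook appears in this diff, so this is an assumed refactoring sketch.

import { useEffect, RefObject } from "react";

// Sketch: invoke onOutside when a mousedown lands outside every tracked element.
function useClickOutside(
  refs: Array<RefObject<HTMLElement>>,
  onOutside: () => void
): void {
  useEffect(() => {
    const handler = (event: MouseEvent) => {
      const target = event.target as Node;
      const inside = refs.some((r) => r.current?.contains(target));
      if (!inside) onOutside();
    };
    document.addEventListener("mousedown", handler);
    return () => document.removeEventListener("mousedown", handler);
  }, [refs, onOutside]);
}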

View File

@@ -1,52 +0,0 @@
import React from "react";
import { ConnectorStatus } from "@/lib/types";
import { ConnectorTitle } from "@/components/admin/connectors/ConnectorTitle";
import { Label } from "@/components/ui/label";
import { LockIcon } from "lucide-react";
interface NonSelectableConnectorsProps {
connectors: ConnectorStatus<any, any>[];
title: string;
description: string;
}
export const NonSelectableConnectors = ({
connectors,
title,
description,
}: NonSelectableConnectorsProps) => {
if (connectors.length === 0) {
return null;
}
return (
<div className="mt-6 mb-4">
<Label className="text-base font-medium mb-1">{title}</Label>
<p className="text-xs text-neutral-500 mb-3">{description}</p>
<div className="p-3 border border-dashed border-neutral-300 rounded-md bg-neutral-50">
<div className="text-xs font-medium text-neutral-700 mb-2 flex items-center">
<LockIcon className="h-3.5 w-3.5 mr-1.5 text-neutral-500" />
Unavailable connectors:
</div>
<div className="flex flex-wrap gap-1.5">
{connectors.map((connector) => (
<div
key={`${connector.connector.id}-${connector.credential.id}`}
className="flex items-center px-2 py-1 cursor-not-allowed opacity-80 bg-white border border-neutral-300 rounded-md text-xs"
>
<div className="flex items-center max-w-[200px] text-xs">
<ConnectorTitle
connector={connector.connector}
ccPairId={connector.cc_pair_id}
ccPairName={connector.name}
isLink={false}
showMetadata={false}
/>
</div>
</div>
))}
</div>
</div>
</div>
);
};

View File

@@ -20,7 +20,6 @@ interface ConnectorTitleProps {
owner?: string;
isLink?: boolean;
showMetadata?: boolean;
className?: string;
}
export const ConnectorTitle = ({
@@ -31,7 +30,6 @@ export const ConnectorTitle = ({
isPublic = true,
isLink = true,
showMetadata = true,
className = "",
}: ConnectorTitleProps) => {
const sourceMetadata = getSourceMetadata(connector.source);
@@ -90,17 +88,17 @@ export const ConnectorTitle = ({
);
}
const mainSectionClassName = `text-blue-500 dark:text-blue-100 flex w-fit ${className}`;
const mainSectionClassName = "text-blue-500 dark:text-blue-100 flex w-fit";
const mainDisplay = (
<>
{sourceMetadata.icon({ size: 16 })}
<div className="ml-1 my-auto text-xs font-medium truncate">
{sourceMetadata.icon({ size: 20 })}
<div className="ml-1 my-auto">
{ccPairName || sourceMetadata.displayName}
</div>
</>
);
return (
<div className="my-auto max-w-full">
<div className="my-auto">
{isLink ? (
<Link
className={mainSectionClassName}
@@ -112,10 +110,10 @@ export const ConnectorTitle = ({
<div className={mainSectionClassName}>{mainDisplay}</div>
)}
{showMetadata && additionalMetadata.size > 0 && (
<div className="text-[10px] mt-0.5 text-gray-600 dark:text-gray-400">
<div className="text-xs mt-1">
{Array.from(additionalMetadata.entries()).map(([key, value]) => {
return (
<div key={key} className="truncate">
<div key={key}>
<i>{key}:</i> {value}
</div>
);

View File

@@ -181,7 +181,7 @@ const SignedUpUserTable = ({
: "All Roles"}
</SelectValue>
</SelectTrigger>
<SelectContent className="bg-background">
<SelectContent className="bg-background-50">
{Object.entries(USER_ROLE_LABELS)
.filter(([role]) => role !== UserRole.EXT_PERM_USER)
.map(([role, label]) => (

View File

@@ -41,15 +41,6 @@ export default function TextView({
return markdownFormats.some((format) => mimeType.startsWith(format));
};
const isImageFormat = (mimeType: string) => {
const imageFormats = [
"image/png",
"image/jpeg",
"image/gif",
"image/svg+xml",
];
return imageFormats.some((format) => mimeType.startsWith(format));
};
// Detect if a given MIME type can be rendered in an <iframe>
const isSupportedIframeFormat = (mimeType: string): boolean => {
const supportedFormats = [
@@ -135,7 +126,6 @@ export default function TextView({
<DialogTitle className="text-lg font-medium truncate">
{fileName}
</DialogTitle>
<div className="flex items-center space-x-2">
<Button variant="ghost" size="icon" onClick={handleZoomOut}>
<ZoomOut className="h-4 w-4" />
@@ -171,13 +161,7 @@ export default function TextView({
className="w-full h-full transform origin-center transition-transform duration-300 ease-in-out"
style={{ transform: `scale(${zoom / 100})` }}
>
{isImageFormat(fileType) ? (
<img
src={fileUrl}
alt={fileName}
className="w-full h-full object-contain object-center"
/>
) : isSupportedIframeFormat(fileType) ? (
{isSupportedIframeFormat(fileType) ? (
<iframe
src={`${fileUrl}#toolbar=0`}
className="w-full h-full border-none"

View File

@@ -268,7 +268,7 @@ export const AVAILABLE_CLOUD_PROVIDERS: CloudEmbeddingProvider[] = [
embedding_models: [
{
provider_type: EmbeddingProvider.GOOGLE,
model_name: "text-embedding-005",
model_name: "text-embedding-004",
description: "Google's most recent text embedding model.",
pricePerMillion: 0.025,
model_dim: 768,

View File

@@ -25,8 +25,8 @@ export function mockedRefreshToken(): CustomRefreshTokenResponse {
*/
const mockExp = Date.now() + 3600000; // 1 hour from now in milliseconds
const data: CustomRefreshTokenResponse = {
access_token: "Mock access token",
refresh_token: "Mock refresh token",
access_token: "asdf Mock access token",
refresh_token: "asdf Mock refresh token",
session: { exp: mockExp },
userinfo: {
sub: "Mock email",

View File

@@ -38,7 +38,6 @@ import { SiBookstack } from "react-icons/si";
import Image, { StaticImageData } from "next/image";
import jiraSVG from "../../../public/Jira.svg";
import confluenceSVG from "../../../public/Confluence.svg";
import deepseekSVG from "../../../public/Deepseek.svg";
import openAISVG from "../../../public/Openai.svg";
import amazonSVG from "../../../public/Amazon.svg";
import geminiSVG from "../../../public/Gemini.svg";
@@ -1150,13 +1149,6 @@ export const MetaIcon = ({
className = defaultTailwindCSS,
}: IconProps) => <LogoIcon size={size} className={className} src={metaSVG} />;
export const DeepseekIcon = ({
size = 16,
className = defaultTailwindCSS,
}: IconProps) => (
<LogoIcon size={size} className={className} src={deepseekSVG} />
);
export const MicrosoftIconSVG = ({
size = 16,
className = defaultTailwindCSS,
@@ -3286,25 +3278,18 @@ export const CirclingArrowIcon = ({
</g>
</g>
</svg>
);
};
export const SortIcon = ({
size = 24,
className = defaultTailwindCSS,
}: IconProps) => {
return (
<svg
style={{ width: `${size}px`, height: `${size}px` }}
className={`w-[${size}px] h-[${size}px] ` + className}
fill="currentColor"
xmlns="http://www.w3.org/2000/svg"
viewBox="0 0 24 24"
>
<path
fill="currentColor"
d="M22 18.605a.75.75 0 0 1-.75.75h-5.1a2.93 2.93 0 0 1-5.66 0H2.75a.75.75 0 1 1 0-1.5h7.74a2.93 2.93 0 0 1 5.66 0h5.1a.75.75 0 0 1 .75.75m0-13.21a.75.75 0 0 1-.75.75H18.8a2.93 2.93 0 0 1-5.66 0H2.75a.75.75 0 1 1 0-1.5h10.39a2.93 2.93 0 0 1 5.66 0h2.45a.74.74 0 0 1 .75.75m0 6.6a.74.74 0 0 1-.75.75H9.55a2.93 2.93 0 0 1-5.66 0H2.75a.75.75 0 1 1 0-1.5h1.14a2.93 2.93 0 0 1 5.66 0h11.7a.75.75 0 0 1 .75.75"
/>
</svg>
// <svg
// style={{ width: `${size}px`, height: `${size}px` }}
// className={`w-[${size}px] h-[${size}px] ` + className}
// viewBox="0 0 112.62 120.72"
// data-name="Layer 1"
// xmlns="http://www.w3.org/2000/svg"
// >
// <path
// strokeWidth={100}
// d="M11.64,100.12l-.4-.47-1.06,8.63a5.08,5.08,0,0,1-1.92,3.41A5.11,5.11,0,0,1,0,107L2.79,84.65v-.07a3.28,3.28,0,0,1,.08-.41h0A5.09,5.09,0,0,1,9,80.39q11.22,2.53,22.42,5.15a5,5,0,0,1,3.17,2.25,5.14,5.14,0,0,1,.64,3.84v0a5,5,0,0,1-2.25,3.16,5.08,5.08,0,0,1-3.83.65c-3.31-.75-6.62-1.52-9.92-2.28a40.71,40.71,0,0,0,2.84,3,50.09,50.09,0,0,0,26.23,13.49,48.67,48.67,0,0,0,14.71.34A47.35,47.35,0,0,0,77,106h0q2.52-1.19,4.83-2.54c1.56-.93,3.07-1.92,4.51-3a50.8,50.8,0,0,0,8.56-7.88,48.92,48.92,0,0,0,6.39-9.45l.56-1.1,10,2.69-.8,1.66a58.64,58.64,0,0,1-7.9,12.24,61.28,61.28,0,0,1-10.81,10.1c-1.68,1.23-3.46,2.4-5.32,3.5s-3.73,2.07-5.74,3a58,58,0,0,1-17,5,58.56,58.56,0,0,1-17.79-.39,60.21,60.21,0,0,1-31.58-16.26c-1.2-1.16-2.26-2.31-3.24-3.45ZM101,20.6l.4.47,1-8.63a5.11,5.11,0,1,1,10.14,1.26l-2.74,22.37,0,.07c0,.13,0,.27-.07.41h0a5.09,5.09,0,0,1-6.08,3.78c-7.47-1.69-15-3.4-22.42-5.15a5,5,0,0,1-3.16-2.25,5.1,5.1,0,0,1-.65-3.84v0a5,5,0,0,1,2.25-3.16,5.1,5.1,0,0,1,3.84-.65c3.31.75,6.61,1.52,9.92,2.28-.84-1-1.77-2-2.84-3.05a50.09,50.09,0,0,0-12.13-8.73A49.49,49.49,0,0,0,64.37,11a48.6,48.6,0,0,0-14.7-.34,47.26,47.26,0,0,0-14,4.1h0q-2.53,1.18-4.83,2.54c-1.57.93-3.07,1.92-4.52,3a50.34,50.34,0,0,0-8.55,7.88,48,48,0,0,0-6.39,9.45l-.57,1.1L.76,36l.8-1.66A58.9,58.9,0,0,1,9.46,22.1,61.63,61.63,0,0,1,20.27,12q2.54-1.85,5.32-3.5c1.81-1.06,3.73-2.07,5.74-3a58,58,0,0,1,17-5A58.56,58.56,0,0,1,66.16.89a59.77,59.77,0,0,1,17,5.74A60.4,60.4,0,0,1,97.75,17.15c1.19,1.16,2.26,2.31,3.24,3.45Z"
// />
// </svg>
);
};

View File

@@ -85,7 +85,7 @@ export const LLMSelector: React.FC<LLMSelectorProps> = ({
<span>{userSettings ? "System Default" : "User Default"}</span>
{userSettings && (
<span className=" my-auto font-normal ml-1">
({defaultModelDisplayName})
({defaultModelDisplayName}) asdf
</span>
)}
</SelectItem>

View File

@@ -1,153 +0,0 @@
"use client";
import * as React from "react";
import { type DialogProps } from "@radix-ui/react-dialog";
import { Command as CommandPrimitive } from "cmdk";
import { Search } from "lucide-react";
import { cn } from "@/lib/utils";
import { Dialog, DialogContent } from "@/components/ui/dialog";
const Command = React.forwardRef<
React.ElementRef<typeof CommandPrimitive>,
React.ComponentPropsWithoutRef<typeof CommandPrimitive>
>(({ className, ...props }, ref) => (
<CommandPrimitive
ref={ref}
className={cn(
"flex h-full w-full flex-col overflow-hidden rounded-md bg-white text-neutral-950 dark:bg-neutral-950 dark:text-neutral-50",
className
)}
{...props}
/>
));
Command.displayName = CommandPrimitive.displayName;
const CommandDialog = ({ children, ...props }: DialogProps) => {
return (
<Dialog {...props}>
<DialogContent className="overflow-hidden p-0 shadow-lg">
<Command className="[&_[cmdk-group-heading]]:px-2 [&_[cmdk-group-heading]]:font-medium [&_[cmdk-group-heading]]:text-neutral-500 [&_[cmdk-group]:not([hidden])_~[cmdk-group]]:pt-0 [&_[cmdk-group]]:px-2 [&_[cmdk-input-wrapper]_svg]:h-5 [&_[cmdk-input-wrapper]_svg]:w-5 [&_[cmdk-input]]:h-12 [&_[cmdk-item]]:px-2 [&_[cmdk-item]]:py-3 [&_[cmdk-item]_svg]:h-5 [&_[cmdk-item]_svg]:w-5 dark:[&_[cmdk-group-heading]]:text-neutral-400">
{children}
</Command>
</DialogContent>
</Dialog>
);
};
const CommandInput = React.forwardRef<
React.ElementRef<typeof CommandPrimitive.Input>,
React.ComponentPropsWithoutRef<typeof CommandPrimitive.Input>
>(({ className, ...props }, ref) => (
<div className="flex items-center border-b px-3" cmdk-input-wrapper="">
<Search className="mr-2 h-4 w-4 shrink-0 opacity-50" />
<CommandPrimitive.Input
ref={ref}
className={cn(
"flex h-11 w-full rounded-md bg-transparent py-3 text-sm outline-none placeholder:text-neutral-500 disabled:cursor-not-allowed disabled:opacity-50 dark:placeholder:text-neutral-400",
className
)}
{...props}
/>
</div>
));
CommandInput.displayName = CommandPrimitive.Input.displayName;
const CommandList = React.forwardRef<
React.ElementRef<typeof CommandPrimitive.List>,
React.ComponentPropsWithoutRef<typeof CommandPrimitive.List>
>(({ className, ...props }, ref) => (
<CommandPrimitive.List
ref={ref}
className={cn("max-h-[300px] overflow-y-auto overflow-x-hidden", className)}
{...props}
/>
));
CommandList.displayName = CommandPrimitive.List.displayName;
const CommandEmpty = React.forwardRef<
React.ElementRef<typeof CommandPrimitive.Empty>,
React.ComponentPropsWithoutRef<typeof CommandPrimitive.Empty>
>((props, ref) => (
<CommandPrimitive.Empty
ref={ref}
className="py-6 text-center text-sm"
{...props}
/>
));
CommandEmpty.displayName = CommandPrimitive.Empty.displayName;
const CommandGroup = React.forwardRef<
React.ElementRef<typeof CommandPrimitive.Group>,
React.ComponentPropsWithoutRef<typeof CommandPrimitive.Group>
>(({ className, ...props }, ref) => (
<CommandPrimitive.Group
ref={ref}
className={cn(
"overflow-hidden p-1 text-neutral-950 [&_[cmdk-group-heading]]:px-2 [&_[cmdk-group-heading]]:py-1.5 [&_[cmdk-group-heading]]:text-xs [&_[cmdk-group-heading]]:font-medium [&_[cmdk-group-heading]]:text-neutral-500 dark:text-neutral-50 dark:[&_[cmdk-group-heading]]:text-neutral-400",
className
)}
{...props}
/>
));
CommandGroup.displayName = CommandPrimitive.Group.displayName;
const CommandSeparator = React.forwardRef<
React.ElementRef<typeof CommandPrimitive.Separator>,
React.ComponentPropsWithoutRef<typeof CommandPrimitive.Separator>
>(({ className, ...props }, ref) => (
<CommandPrimitive.Separator
ref={ref}
className={cn("-mx-1 h-px bg-neutral-200 dark:bg-neutral-800", className)}
{...props}
/>
));
CommandSeparator.displayName = CommandPrimitive.Separator.displayName;
const CommandItem = React.forwardRef<
React.ElementRef<typeof CommandPrimitive.Item>,
React.ComponentPropsWithoutRef<typeof CommandPrimitive.Item>
>(({ className, ...props }, ref) => (
<CommandPrimitive.Item
ref={ref}
className={cn(
"relative flex cursor-default gap-2 select-none items-center rounded-sm px-2 py-1.5 text-sm outline-none data-[disabled=true]:pointer-events-none data-[selected='true']:bg-neutral-100 data-[selected=true]:text-neutral-900 data-[disabled=true]:opacity-50 [&_svg]:pointer-events-none [&_svg]:size-4 [&_svg]:shrink-0 dark:data-[selected='true']:bg-neutral-800 dark:data-[selected=true]:text-neutral-50",
className
)}
{...props}
/>
));
CommandItem.displayName = CommandPrimitive.Item.displayName;
const CommandShortcut = ({
className,
...props
}: React.HTMLAttributes<HTMLSpanElement>) => {
return (
<span
className={cn(
"ml-auto text-xs tracking-widest text-neutral-500 dark:text-neutral-400",
className
)}
{...props}
/>
);
};
CommandShortcut.displayName = "CommandShortcut";
export {
Command,
CommandDialog,
CommandInput,
CommandList,
CommandEmpty,
CommandGroup,
CommandItem,
CommandShortcut,
CommandSeparator,
};
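For reference, a minimal usage sketch of how these exported pieces compose (the import path, the `open` state, and the item labels are illustrative assumptions, not code from this PR): CommandDialog wraps the palette in a modal, CommandInput captures the query, and cmdk renders the matching CommandItem rows inside CommandList.

// Hypothetical usage sketch — import path, state, and labels are assumptions.
import { useState } from "react";
import {
  CommandDialog,
  CommandInput,
  CommandList,
  CommandEmpty,
  CommandGroup,
  CommandItem,
  CommandShortcut,
} from "@/components/ui/command";

export function ExamplePalette() {
  const [open, setOpen] = useState(false);
  return (
    <CommandDialog open={open} onOpenChange={setOpen}>
      <CommandInput placeholder="Type a command..." />
      <CommandList>
        <CommandEmpty>No results found.</CommandEmpty>
        <CommandGroup heading="Chats">
          {/* onSelect comes from cmdk's CommandPrimitive.Item */}
          <CommandItem onSelect={() => setOpen(false)}>
            New chat
            <CommandShortcut>⌘N</CommandShortcut>
          </CommandItem>
        </CommandGroup>
      </CommandList>
    </CommandDialog>
  );
}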

View File

@@ -82,6 +82,9 @@ export const NEXT_PUBLIC_FORGOT_PASSWORD_ENABLED =
export const NEXT_PUBLIC_TEST_ENV =
process.env.NEXT_PUBLIC_TEST_ENV?.toLowerCase() === "true";
export const NEXT_PUBLIC_DELETE_ALL_CHATS_ENABLED =
process.env.NEXT_PUBLIC_DELETE_ALL_CHATS_ENABLED?.toLowerCase() === "true";
export const NEXT_PUBLIC_ENABLE_CHROME_EXTENSION =
process.env.NEXT_PUBLIC_ENABLE_CHROME_EXTENSION?.toLowerCase() === "true";
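These flags are evaluated once at module load from build-time NEXT_PUBLIC_* env vars, so consumers treat them as plain booleans. A minimal sketch of how the new Chrome-extension flag might gate UI (the component name and import path are illustrative assumptions, not from this PR):

// Hypothetical sketch — `ChromeExtensionPromo` and the import path are assumptions.
import { NEXT_PUBLIC_ENABLE_CHROME_EXTENSION } from "@/lib/constants";

export function ChromeExtensionPromo() {
  // Build-time constant, not a runtime env lookup: Next.js inlines
  // NEXT_PUBLIC_* values at compile time.
  if (!NEXT_PUBLIC_ENABLE_CHROME_EXTENSION) return null;
  return <a href="/chrome-extension">Get the Chrome extension</a>;
}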

View File

@@ -706,10 +706,6 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
"phi-3.5-mini-instruct": "Phi 3.5 Mini",
"phi-3.5-moe-instruct": "Phi 3.5 MoE",
"phi-3.5-vision-instruct": "Phi 3.5 Vision",
"phi-4": "Phi 4",
// Deepseek Models
"deepseek-r1": "DeepSeek R1",
// Anthropic models
"claude-3-opus-20240229": "Claude 3 Opus",
@@ -726,8 +722,6 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
"claude-3-5-haiku-20241022": "Claude 3.5 Haiku",
"claude-3-5-haiku@20241022": "Claude 3.5 Haiku",
"claude-3.5-haiku@20241022": "Claude 3.5 Haiku",
"claude-3.7-sonnet@202502019": "Claude 3.7 Sonnet",
"claude-3-7-sonnet-202502019": "Claude 3.7 Sonnet",
// Google Models
"gemini-1.5-pro": "Gemini 1.5 Pro",
@@ -737,16 +731,10 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
"gemini-1.5-pro-002": "Gemini 1.5 Pro (v2)",
"gemini-1.5-flash-002": "Gemini 1.5 Flash (v2)",
"gemini-2.0-flash-exp": "Gemini 2.0 Flash (Experimental)",
"gemini-2.0-flash-001": "Gemini 2.0 Flash",
"gemini-2.0-flash-lite-preview-02-05": "Gemini 2.0 Flash Lite (Prv)",
"gemini-2.0-flash-thinking-exp-01-02": "Gemini 2.0 Flash Thinking (Exp)",
"gemini-2.0-pro-exp-02-05": "Gemini 2.0 Pro (Exp)",
"gemini-2.0-flash": "Gemini 2.0 Flash",
"gemini-2.0-flash-thinking-exp-01-21": "Gemini 2.0 Flash Thinking",
// Mistral Models
"mistral-large-2411": "Mistral Large 24.11",
"mistral-large@2411": "Mistral Large 24.11",
"ministral-3b": "Ministral 3B",
// Bedrock models
"meta.llama3-1-70b-instruct-v1:0": "Llama 3.1 70B",
@@ -767,8 +755,6 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
"anthropic.claude-v2:1": "Claude v2.1",
"anthropic.claude-v2": "Claude v2",
"anthropic.claude-v1": "Claude v1",
"anthropic.claude-3-7-sonnet-20250219-v1:0": "Claude 3.7 Sonnet",
"us.anthropic.claude-3-7-sonnet-20250219-v1:0": "Claude 3.7 Sonnet",
"anthropic.claude-3-opus-20240229-v1:0": "Claude 3 Opus",
"anthropic.claude-3-haiku-20240307-v1:0": "Claude 3 Haiku",
"anthropic.claude-3-5-sonnet-20240620-v1:0": "Claude 3.5 Sonnet",
@@ -802,7 +788,6 @@ export const defaultModelsByProvider: { [name: string]: string[] } = {
"anthropic.claude-3-opus-20240229-v1:0",
"mistral.mistral-large-2402-v1:0",
"anthropic.claude-3-5-sonnet-20241022-v2:0",
"anthropic.claude-3-7-sonnet-20250219-v1:0",
],
anthropic: ["claude-3-opus-20240229", "claude-3-5-sonnet-20241022"],
};
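Since several keys (e.g. "phi-4", "deepseek-r1", and the Claude 3.7 variants) are dropped from MODEL_DISPLAY_NAMES above, lookups for those models will no longer hit the map. A minimal sketch of a fallback-to-raw-identifier lookup, assuming such a helper is wanted (it is not shown in this PR):

// Hypothetical helper — falls back to the raw model identifier when the
// display-name map has no entry (e.g. for the models removed above).
function getModelDisplayName(modelName: string): string {
  return MODEL_DISPLAY_NAMES[modelName] ?? modelName;
}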

View File

@@ -90,8 +90,6 @@ const MODEL_NAMES_SUPPORTING_IMAGE_INPUT = [
"anthropic.claude-3-5-sonnet-20240620-v1:0",
"anthropic.claude-3-5-sonnet-20241022-v2:0",
"anthropic.claude-3-7-sonnet-20250219-v1:0",
"claude-3.7-sonnet@202502019",
"claude-3-7-sonnet-202502019",
// google gemini model names
"gemini-1.5-pro",
"gemini-1.5-flash",
@@ -100,8 +98,6 @@ const MODEL_NAMES_SUPPORTING_IMAGE_INPUT = [
"gemini-1.5-pro-002",
"gemini-1.5-flash-002",
"gemini-2.0-flash-exp",
"gemini-2.0-flash-001",
"gemini-2.0-pro-exp-02-05",
// amazon models
"amazon.nova-lite@v1",
"amazon.nova-pro@v1",