Mirror of https://github.com/onyx-dot-app/onyx.git (synced 2026-02-17 15:55:45 +00:00)

Compare commits: batch_proc ... user_inves
1 commit: 9ece8f68f3
@@ -6,8 +6,7 @@ Create Date: 2025-02-26 13:07:56.217791

"""
from alembic import op
import time
from sqlalchemy import text

# revision identifiers, used by Alembic.
revision = "3bd4c84fe72f"

@@ -28,357 +27,45 @@ depends_on = None

# 4. Adds indexes to both chat_message and chat_session tables for comprehensive search

def upgrade():
# --- PART 1: chat_message table ---
# Step 1: Add nullable column (quick, minimal locking)
# op.execute("ALTER TABLE chat_message DROP COLUMN IF EXISTS message_tsv")
# op.execute("DROP TRIGGER IF EXISTS chat_message_tsv_trigger ON chat_message")
# op.execute("DROP FUNCTION IF EXISTS update_chat_message_tsv()")
# op.execute("ALTER TABLE chat_message DROP COLUMN IF EXISTS message_tsv")
# # Drop chat_session tsv trigger if it exists
# op.execute("DROP TRIGGER IF EXISTS chat_session_tsv_trigger ON chat_session")
# op.execute("DROP FUNCTION IF EXISTS update_chat_session_tsv()")
# op.execute("ALTER TABLE chat_session DROP COLUMN IF EXISTS title_tsv")
# raise Exception("Stop here")
time.time()
op.execute("ALTER TABLE chat_message ADD COLUMN IF NOT EXISTS message_tsv tsvector")

# Step 2: Create function and trigger for new/updated rows
def upgrade() -> None:
# Create a GIN index for full-text search on chat_message.message
op.execute(
"""
CREATE OR REPLACE FUNCTION update_chat_message_tsv()
RETURNS TRIGGER AS $$
BEGIN
NEW.message_tsv = to_tsvector('english', NEW.message);
RETURN NEW;
END;
$$ LANGUAGE plpgsql
"""
ALTER TABLE chat_message
ADD COLUMN message_tsv tsvector
GENERATED ALWAYS AS (to_tsvector('english', message)) STORED;
"""
)

# Create trigger in a separate execute call
# Commit the current transaction before creating concurrent indexes
op.execute("COMMIT")

op.execute(
"""
CREATE TRIGGER chat_message_tsv_trigger
BEFORE INSERT OR UPDATE ON chat_message
FOR EACH ROW EXECUTE FUNCTION update_chat_message_tsv()
"""
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_message_tsv
ON chat_message
USING GIN (message_tsv)
"""
)

# Step 3: Update existing rows in batches using Python
time.time()

# Get connection and count total rows
connection = op.get_bind()
total_count_result = connection.execute(
text("SELECT COUNT(*) FROM chat_message")
).scalar()
total_count = total_count_result if total_count_result is not None else 0
batch_size = 5000
batches = 0

# Calculate total batches needed
total_batches = (
(total_count + batch_size - 1) // batch_size if total_count > 0 else 0
# Also add a stored tsvector column for chat_session.description
op.execute(
"""
ALTER TABLE chat_session
ADD COLUMN description_tsv tsvector
GENERATED ALWAYS AS (to_tsvector('english', coalesce(description, ''))) STORED;
"""
)

# Process in batches - properly handling UUIDs by using OFFSET/LIMIT approach
for batch_num in range(total_batches):
offset = batch_num * batch_size
# Commit again before creating the second concurrent index
op.execute("COMMIT")

# Execute update for this batch using OFFSET/LIMIT which works with UUIDs
connection.execute(
text(
"""
UPDATE chat_message
SET message_tsv = to_tsvector('english', message)
WHERE id IN (
SELECT id FROM chat_message
WHERE message_tsv IS NULL
ORDER BY id
LIMIT :batch_size OFFSET :offset
)
"""
).bindparams(batch_size=batch_size, offset=offset)
)

# Commit each batch
connection.execute(text("COMMIT"))
# Start a new transaction
connection.execute(text("BEGIN"))

batches += 1

# Final check for any remaining NULL values
connection.execute(
text(
"""
UPDATE chat_message SET message_tsv = to_tsvector('english', message)
WHERE message_tsv IS NULL
"""
)
)

# Create GIN index concurrently
connection.execute(text("COMMIT"))

time.time()

connection.execute(
text(
"""
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_message_tsv
ON chat_message USING GIN (message_tsv)
"""
)
)

# First drop the trigger as it won't be needed anymore
connection.execute(
text(
"""
DROP TRIGGER IF EXISTS chat_message_tsv_trigger ON chat_message;
"""
)
)

connection.execute(
text(
"""
DROP FUNCTION IF EXISTS update_chat_message_tsv();
"""
)
)

# Add new generated column
time.time()
connection.execute(
text(
"""
ALTER TABLE chat_message
ADD COLUMN message_tsv_gen tsvector
GENERATED ALWAYS AS (to_tsvector('english', message)) STORED;
"""
)
)

connection.execute(text("COMMIT"))

time.time()

connection.execute(
text(
"""
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_message_tsv_gen
ON chat_message USING GIN (message_tsv_gen)
"""
)
)

# Drop old index and column
connection.execute(text("COMMIT"))

connection.execute(
text(
"""
DROP INDEX CONCURRENTLY IF EXISTS idx_chat_message_tsv;
"""
)
)
connection.execute(text("COMMIT"))
connection.execute(
text(
"""
ALTER TABLE chat_message DROP COLUMN message_tsv;
"""
)
)

# Rename new column to old name
connection.execute(
text(
"""
ALTER TABLE chat_message RENAME COLUMN message_tsv_gen TO message_tsv;
"""
)
)

# --- PART 2: chat_session table ---

# Step 1: Add nullable column (quick, minimal locking)
time.time()
connection.execute(
text(
"ALTER TABLE chat_session ADD COLUMN IF NOT EXISTS description_tsv tsvector"
)
)

# Step 2: Create function and trigger for new/updated rows - SPLIT INTO SEPARATE CALLS
connection.execute(
text(
"""
CREATE OR REPLACE FUNCTION update_chat_session_tsv()
RETURNS TRIGGER AS $$
BEGIN
NEW.description_tsv = to_tsvector('english', COALESCE(NEW.description, ''));
RETURN NEW;
END;
$$ LANGUAGE plpgsql
"""
)
)

# Create trigger in a separate execute call
connection.execute(
text(
"""
CREATE TRIGGER chat_session_tsv_trigger
BEFORE INSERT OR UPDATE ON chat_session
FOR EACH ROW EXECUTE FUNCTION update_chat_session_tsv()
"""
)
)

# Step 3: Update existing rows in batches using Python
time.time()

# Get the maximum ID to determine batch count
# Cast id to text for MAX function since it's a UUID
max_id_result = connection.execute(
text("SELECT COALESCE(MAX(id::text), '0') FROM chat_session")
).scalar()
max_id_result if max_id_result is not None else "0"
batch_size = 5000
batches = 0

# Get all IDs ordered to process in batches
rows = connection.execute(
text("SELECT id FROM chat_session ORDER BY id")
).fetchall()
total_rows = len(rows)

# Process in batches
for batch_num, batch_start in enumerate(range(0, total_rows, batch_size)):
batch_end = min(batch_start + batch_size, total_rows)
batch_ids = [row[0] for row in rows[batch_start:batch_end]]

if not batch_ids:
continue

# Use IN clause instead of BETWEEN for UUIDs
placeholders = ", ".join([f":id{i}" for i in range(len(batch_ids))])
params = {f"id{i}": id_val for i, id_val in enumerate(batch_ids)}

# Execute update for this batch
connection.execute(
text(
f"""
UPDATE chat_session
SET description_tsv = to_tsvector('english', COALESCE(description, ''))
WHERE id IN ({placeholders})
AND description_tsv IS NULL
"""
).bindparams(**params)
)

# Commit each batch
connection.execute(text("COMMIT"))
# Start a new transaction
connection.execute(text("BEGIN"))

batches += 1

# Final check for any remaining NULL values
connection.execute(
text(
"""
UPDATE chat_session SET description_tsv = to_tsvector('english', COALESCE(description, ''))
WHERE description_tsv IS NULL
"""
)
)

# Create GIN index concurrently
connection.execute(text("COMMIT"))

time.time()
connection.execute(
text(
"""
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_session_desc_tsv
ON chat_session USING GIN (description_tsv)
"""
)
)

# After Final check for chat_session
# First drop the trigger as it won't be needed anymore
connection.execute(
text(
"""
DROP TRIGGER IF EXISTS chat_session_tsv_trigger ON chat_session;
"""
)
)

connection.execute(
text(
"""
DROP FUNCTION IF EXISTS update_chat_session_tsv();
"""
)
)
# Add new generated column
time.time()
connection.execute(
text(
"""
ALTER TABLE chat_session
ADD COLUMN description_tsv_gen tsvector
GENERATED ALWAYS AS (to_tsvector('english', COALESCE(description, ''))) STORED;
"""
)
)

# Create new index on generated column
connection.execute(text("COMMIT"))

time.time()
connection.execute(
text(
"""
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_session_desc_tsv_gen
ON chat_session USING GIN (description_tsv_gen)
"""
)
)

# Drop old index and column
connection.execute(text("COMMIT"))

connection.execute(
text(
"""
DROP INDEX CONCURRENTLY IF EXISTS idx_chat_session_desc_tsv;
"""
)
)
connection.execute(text("COMMIT"))
connection.execute(
text(
"""
ALTER TABLE chat_session DROP COLUMN description_tsv;
"""
)
)

# Rename new column to old name
connection.execute(
text(
"""
ALTER TABLE chat_session RENAME COLUMN description_tsv_gen TO description_tsv;
"""
)
op.execute(
"""
CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_session_desc_tsv
ON chat_session
USING GIN (description_tsv)
"""
)
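Read as a whole, this hunk swaps a trigger-plus-Python-backfill migration for stored generated columns and concurrently built GIN indexes. A minimal standalone sketch of that end state follows; the table, column, and index names are taken from the diff, but the surrounding revision boilerplate is illustrative only, not the author's exact file.

from alembic import op

# Sketch: generated tsvector columns plus GIN indexes built CONCURRENTLY.
# Generated columns stay in sync automatically, so no triggers or row batching are needed.
def upgrade() -> None:
    op.execute(
        """
        ALTER TABLE chat_message
        ADD COLUMN message_tsv tsvector
        GENERATED ALWAYS AS (to_tsvector('english', message)) STORED;
        """
    )
    op.execute(
        """
        ALTER TABLE chat_session
        ADD COLUMN description_tsv tsvector
        GENERATED ALWAYS AS (to_tsvector('english', coalesce(description, ''))) STORED;
        """
    )

    # CREATE INDEX CONCURRENTLY cannot run inside the migration transaction,
    # hence the explicit COMMIT before each index, mirroring the diff.
    op.execute("COMMIT")
    op.execute(
        "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_message_tsv "
        "ON chat_message USING GIN (message_tsv)"
    )
    op.execute("COMMIT")
    op.execute(
        "CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_chat_session_desc_tsv "
        "ON chat_session USING GIN (description_tsv)"
    )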
@@ -134,9 +134,7 @@ def fetch_chat_sessions_eagerly_by_time(
limit: int | None = 500,
initial_time: datetime | None = None,
) -> list[ChatSession]:
"""Sorted by oldest to newest, then by message id"""

asc_time_order: UnaryExpression = asc(ChatSession.time_created)
time_order: UnaryExpression = desc(ChatSession.time_created)
message_order: UnaryExpression = asc(ChatMessage.id)

filters: list[ColumnElement | BinaryExpression] = [
@@ -149,7 +147,8 @@ def fetch_chat_sessions_eagerly_by_time(
subquery = (
db_session.query(ChatSession.id, ChatSession.time_created)
.filter(*filters)
.order_by(asc_time_order)
.order_by(ChatSession.id, time_order)
.distinct(ChatSession.id)
.limit(limit)
.subquery()
)
@@ -165,7 +164,7 @@ def fetch_chat_sessions_eagerly_by_time(
ChatMessage.chat_message_feedbacks
),
)
.order_by(asc_time_order, message_order)
.order_by(time_order, message_order)
)

chat_sessions = query.all()
@@ -16,18 +16,13 @@ from onyx.db.models import UsageReport
from onyx.file_store.file_store import get_default_file_store

# Gets skeletons of all messages in the given range
# Gets skeletons of all message
def get_empty_chat_messages_entries__paginated(
db_session: Session,
period: tuple[datetime, datetime],
limit: int | None = 500,
initial_time: datetime | None = None,
) -> tuple[Optional[datetime], list[ChatMessageSkeleton]]:
"""Returns a tuple where:
first element is the most recent timestamp out of the sessions iterated
- this timestamp can be used to paginate forward in time
second element is a list of messages belonging to all the sessions iterated
"""
chat_sessions = fetch_chat_sessions_eagerly_by_time(
start=period[0],
end=period[1],
@@ -57,17 +52,18 @@ def get_empty_chat_messages_entries__paginated(
if len(chat_sessions) == 0:
return None, []

return chat_sessions[-1].time_created, message_skeletons
return chat_sessions[0].time_created, message_skeletons

def get_all_empty_chat_message_entries(
db_session: Session,
period: tuple[datetime, datetime],
) -> Generator[list[ChatMessageSkeleton], None, None]:
"""period is the range of time over which to fetch messages."""
initial_time: Optional[datetime] = period[0]
ind = 0
while True:
# iterate from oldest to newest
ind += 1

time_created, message_skeletons = get_empty_chat_messages_entries__paginated(
db_session,
period,
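The two changes in this hunk go together: the session ordering becomes descending with a DISTINCT subquery, and the paginated helper now hands back chat_sessions[0].time_created rather than chat_sessions[-1]. A hedged sketch of a driver loop that consumes that returned timestamp is below; it mirrors get_all_empty_chat_message_entries from the diff, with the exact cursor semantics treated as an assumption.

def iterate_message_skeletons(db_session, period):
    """Minimal sketch: page through messages for a time period by feeding the
    timestamp returned by the paginated helper back in as the cursor."""
    cursor = period[0]
    while True:
        cursor, skeletons = get_empty_chat_messages_entries__paginated(
            db_session,
            period,
            initial_time=cursor,
        )
        if not skeletons:
            break
        yield skeletons
        if cursor is None:  # the helper returns (None, []) when no sessions remain
            break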
@@ -104,14 +104,14 @@ async def provision_tenant(tenant_id: str, email: str) -> None:
status_code=409, detail="User already belongs to an organization"
)

logger.debug(f"Provisioning tenant {tenant_id} for user {email}")
logger.info(f"Provisioning tenant: {tenant_id}")
token = None

try:
if not create_schema_if_not_exists(tenant_id):
logger.debug(f"Created schema for tenant {tenant_id}")
logger.info(f"Created schema for tenant {tenant_id}")
else:
logger.debug(f"Schema already exists for tenant {tenant_id}")
logger.info(f"Schema already exists for tenant {tenant_id}")

token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
@@ -6,7 +6,7 @@ MODEL_WARM_UP_STRING = "hi " * 512
DEFAULT_OPENAI_MODEL = "text-embedding-3-small"
DEFAULT_COHERE_MODEL = "embed-english-light-v3.0"
DEFAULT_VOYAGE_MODEL = "voyage-large-2-instruct"
DEFAULT_VERTEX_MODEL = "text-embedding-005"
DEFAULT_VERTEX_MODEL = "text-embedding-004"

class EmbeddingModelTextType:
@@ -5,7 +5,6 @@ from types import TracebackType
from typing import cast
from typing import Optional

import aioboto3 # type: ignore
import httpx
import openai
import vertexai # type: ignore
@@ -29,13 +28,11 @@ from model_server.constants import DEFAULT_VERTEX_MODEL
from model_server.constants import DEFAULT_VOYAGE_MODEL
from model_server.constants import EmbeddingModelTextType
from model_server.constants import EmbeddingProvider
from model_server.utils import pass_aws_key
from model_server.utils import simple_log_function_time
from onyx.utils.logger import setup_logger
from shared_configs.configs import API_BASED_EMBEDDING_TIMEOUT
from shared_configs.configs import INDEXING_ONLY
from shared_configs.configs import OPENAI_EMBEDDING_TIMEOUT
from shared_configs.configs import VERTEXAI_EMBEDDING_LOCAL_BATCH_SIZE
from shared_configs.enums import EmbedTextType
from shared_configs.enums import RerankerProvider
from shared_configs.model_server_models import Embedding
@@ -185,24 +182,17 @@ class CloudEmbedding:
vertexai.init(project=project_id, credentials=credentials)
client = TextEmbeddingModel.from_pretrained(model)

inputs = [TextEmbeddingInput(text, embedding_type) for text in texts]

# Split into batches of 25 texts
max_texts_per_batch = VERTEXAI_EMBEDDING_LOCAL_BATCH_SIZE
batches = [
inputs[i : i + max_texts_per_batch]
for i in range(0, len(inputs), max_texts_per_batch)
]

# Dispatch all embedding calls asynchronously at once
tasks = [
client.get_embeddings_async(batch, auto_truncate=True) for batch in batches
]

# Wait for all tasks to complete in parallel
results = await asyncio.gather(*tasks)

return [embedding.values for batch in results for embedding in batch]
embeddings = await client.get_embeddings_async(
[
TextEmbeddingInput(
text,
embedding_type,
)
for text in texts
],
auto_truncate=True,  # This is the default
)
return [embedding.values for embedding in embeddings]

async def _embed_litellm_proxy(
self, texts: list[str], model_name: str | None
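One side of this hunk embeds all texts in a single get_embeddings_async call; the other splits the inputs into VERTEXAI_EMBEDDING_LOCAL_BATCH_SIZE-sized batches and dispatches them concurrently with asyncio.gather. A self-contained sketch of the batched pattern is shown below; `client` is assumed to expose get_embeddings_async(batch, auto_truncate=...) like the Vertex TextEmbeddingModel used in the diff.

import asyncio

async def embed_in_batches(client, inputs, batch_size):
    """Sketch of the batched variant: fixed-size batches, all dispatched at
    once, results flattened back into a single list of embedding vectors."""
    batches = [inputs[i : i + batch_size] for i in range(0, len(inputs), batch_size)]
    tasks = [client.get_embeddings_async(batch, auto_truncate=True) for batch in batches]
    results = await asyncio.gather(*tasks)  # all batches run concurrently
    return [embedding.values for batch in results for embedding in batch]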
@@ -457,7 +447,7 @@ async def local_rerank(query: str, docs: list[str], model_name: str) -> list[flo
)

async def cohere_rerank_api(
async def cohere_rerank(
query: str, docs: list[str], model_name: str, api_key: str
) -> list[float]:
cohere_client = CohereAsyncClient(api_key=api_key)
@@ -467,45 +457,6 @@ async def cohere_rerank_api(
return [result.relevance_score for result in sorted_results]

async def cohere_rerank_aws(
query: str,
docs: list[str],
model_name: str,
region_name: str,
aws_access_key_id: str,
aws_secret_access_key: str,
) -> list[float]:
session = aioboto3.Session(
aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key
)
async with session.client(
"bedrock-runtime", region_name=region_name
) as bedrock_client:
body = json.dumps(
{
"query": query,
"documents": docs,
"api_version": 2,
}
)
# Invoke the Bedrock model asynchronously
response = await bedrock_client.invoke_model(
modelId=model_name,
accept="application/json",
contentType="application/json",
body=body,
)

# Read the response asynchronously
response_body = json.loads(await response["body"].read())

# Extract and sort the results
results = response_body.get("results", [])
sorted_results = sorted(results, key=lambda item: item["index"])

return [result["relevance_score"] for result in sorted_results]

async def litellm_rerank(
query: str, docs: list[str], api_url: str, model_name: str, api_key: str | None
) -> list[float]:
@@ -621,32 +572,15 @@ async def process_rerank_request(rerank_request: RerankRequest) -> RerankRespons
elif rerank_request.provider_type == RerankerProvider.COHERE:
if rerank_request.api_key is None:
raise RuntimeError("Cohere Rerank Requires an API Key")
sim_scores = await cohere_rerank_api(
sim_scores = await cohere_rerank(
query=rerank_request.query,
docs=rerank_request.documents,
model_name=rerank_request.model_name,
api_key=rerank_request.api_key,
)
return RerankResponse(scores=sim_scores)

elif rerank_request.provider_type == RerankerProvider.BEDROCK:
if rerank_request.api_key is None:
raise RuntimeError("Bedrock Rerank Requires an API Key")
aws_access_key_id, aws_secret_access_key, aws_region = pass_aws_key(
rerank_request.api_key
)
sim_scores = await cohere_rerank_aws(
query=rerank_request.query,
docs=rerank_request.documents,
model_name=rerank_request.model_name,
region_name=aws_region,
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
)
return RerankResponse(scores=sim_scores)
else:
raise ValueError(f"Unsupported provider: {rerank_request.provider_type}")

except Exception as e:
logger.exception(f"Error during reranking process:\n{str(e)}")
raise HTTPException(
@@ -70,32 +70,3 @@ def get_gpu_type() -> str:
return GPUStatus.MAC_MPS

return GPUStatus.NONE

def pass_aws_key(api_key: str) -> tuple[str, str, str]:
"""Parse AWS API key string into components.

Args:
api_key: String in format 'aws_ACCESSKEY_SECRETKEY_REGION'

Returns:
Tuple of (access_key, secret_key, region)

Raises:
ValueError: If key format is invalid
"""
if not api_key.startswith("aws"):
raise ValueError("API key must start with 'aws' prefix")

parts = api_key.split("_")
if len(parts) != 4:
raise ValueError(
f"API key must be in format 'aws_ACCESSKEY_SECRETKEY_REGION', got {len(parts) - 1} parts"
"this is an onyx specific format for formatting the aws secrets for bedrock"
)

try:
_, aws_access_key_id, aws_secret_access_key, aws_region = parts
return aws_access_key_id, aws_secret_access_key, aws_region
except Exception as e:
raise ValueError(f"Failed to parse AWS key components: {str(e)}")
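For reference, pass_aws_key above expects the Onyx-specific 'aws_ACCESSKEY_SECRETKEY_REGION' packing described in its docstring. A short usage sketch with obviously fake credentials:

# Hypothetical key, purely to illustrate the expected underscore-delimited format.
api_key = "aws_AKIAEXAMPLE_superSecretKey_us-east-1"

aws_access_key_id, aws_secret_access_key, aws_region = pass_aws_key(api_key)
# -> ("AKIAEXAMPLE", "superSecretKey", "us-east-1")

Note that because the parser does a plain split("_") and requires exactly four parts, a real secret key that itself contains an underscore would fail this check.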
@@ -98,16 +98,8 @@ def choose_tool(
# For tool calling LLMs, we want to insert the task prompt as part of this flow, this is because the LLM
# may choose to not call any tools and just generate the answer, in which case the task prompt is needed.
prompt=built_prompt,
tools=(
[tool.tool_definition() for tool in tools] or None
if using_tool_calling_llm
else None
),
tool_choice=(
"required"
if tools and force_use_tool.force_use and using_tool_calling_llm
else None
),
tools=[tool.tool_definition() for tool in tools] or None,
tool_choice=("required" if tools and force_use_tool.force_use else None),
structured_response_format=structured_response_format,
)
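The two variants in this hunk differ only in whether the tool arguments are gated on using_tool_calling_llm. Restated as standalone expressions (variable names taken from the diff; everything else is illustrative):

# Gated variant: only pass tool definitions when the LLM supports tool calling.
# Note the parse: ([...] or None) if using_tool_calling_llm else None.
tools_arg = (
    [tool.tool_definition() for tool in tools] or None
    if using_tool_calling_llm
    else None
)
tool_choice_arg = (
    "required"
    if tools and force_use_tool.force_use and using_tool_calling_llm
    else None
)

# Ungated variant from the other side of the hunk:
# tools_arg = [tool.tool_definition() for tool in tools] or None
# tool_choice_arg = "required" if tools and force_use_tool.force_use else None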
@@ -523,13 +523,12 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
token = CURRENT_TENANT_ID_CONTEXTVAR.set(tenant_id)
try:
user_count = await get_user_count()
logger.debug(f"Current tenant user count: {user_count}")

with get_session_with_tenant(tenant_id=tenant_id) as db_session:
if user_count == 1:
create_milestone_and_report(
user=user,
distinct_id=user.email,
distinct_id=f"{user.email}_{tenant_id}",
event_type=MilestoneRecordType.USER_SIGNED_UP,
properties=None,
db_session=db_session,
@@ -537,7 +536,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
else:
create_milestone_and_report(
user=user,
distinct_id=user.email,
distinct_id=f"{user.email}_{tenant_id}",
event_type=MilestoneRecordType.MULTIPLE_USERS,
properties=None,
db_session=db_session,
@@ -545,7 +544,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
finally:
CURRENT_TENANT_ID_CONTEXTVAR.reset(token)

logger.debug(f"User {user.id} has registered.")
logger.notice(f"User {user.id} has registered.")
optional_telemetry(
record_type=RecordType.SIGN_UP,
data={"action": "create"},
@@ -756,7 +756,6 @@ def stream_chat_message_objects(
)

# LLM prompt building, response capturing, etc.

answer = Answer(
prompt_builder=prompt_builder,
is_connected=is_connected,

@@ -640,6 +640,3 @@ TEST_ENV = os.environ.get("TEST_ENV", "").lower() == "true"
MOCK_LLM_RESPONSE = (
os.environ.get("MOCK_LLM_RESPONSE") if os.environ.get("MOCK_LLM_RESPONSE") else None
)

DEFAULT_IMAGE_ANALYSIS_MAX_SIZE_MB = 20
@@ -1,38 +0,0 @@
from onyx.configs.app_configs import DEFAULT_IMAGE_ANALYSIS_MAX_SIZE_MB
from onyx.server.settings.store import load_settings

def get_image_extraction_and_analysis_enabled() -> bool:
"""Get image extraction and analysis enabled setting from workspace settings or fallback to False"""
try:
settings = load_settings()
if settings.image_extraction_and_analysis_enabled is not None:
return settings.image_extraction_and_analysis_enabled
except Exception:
pass

return False

def get_search_time_image_analysis_enabled() -> bool:
"""Get search time image analysis enabled setting from workspace settings or fallback to False"""
try:
settings = load_settings()
if settings.search_time_image_analysis_enabled is not None:
return settings.search_time_image_analysis_enabled
except Exception:
pass

return False

def get_image_analysis_max_size_mb() -> int:
"""Get image analysis max size MB setting from workspace settings or fallback to environment variable"""
try:
settings = load_settings()
if settings.image_analysis_max_size_mb is not None:
return settings.image_analysis_max_size_mb
except Exception:
pass

return DEFAULT_IMAGE_ANALYSIS_MAX_SIZE_MB
@@ -200,6 +200,7 @@ class AirtableConnector(LoadConnector):
return attachment_response.content

logger.error(f"Failed to refresh attachment for {filename}")

raise

attachment_content = get_attachment_with_retry(url, record_id)
@@ -11,12 +11,13 @@ from onyx.configs.app_configs import CONFLUENCE_TIMEZONE_OFFSET
|
||||
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
|
||||
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.confluence.onyx_confluence import extract_text_from_confluence_html
|
||||
from onyx.connectors.confluence.onyx_confluence import attachment_to_content
|
||||
from onyx.connectors.confluence.onyx_confluence import (
|
||||
extract_text_from_confluence_html,
|
||||
)
|
||||
from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
|
||||
from onyx.connectors.confluence.utils import build_confluence_document_id
|
||||
from onyx.connectors.confluence.utils import convert_attachment_to_content
|
||||
from onyx.connectors.confluence.utils import datetime_from_string
|
||||
from onyx.connectors.confluence.utils import process_attachment
|
||||
from onyx.connectors.confluence.utils import validate_attachment_filetype
|
||||
from onyx.connectors.exceptions import ConnectorValidationError
|
||||
from onyx.connectors.exceptions import CredentialExpiredError
|
||||
@@ -35,26 +36,28 @@ from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.vision_enabled_connector import VisionEnabledConnector
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
# Potential Improvements
|
||||
# 1. Segment into Sections for more accurate linking, can split by headers but make sure no text/ordering is lost
|
||||
# 1. Include attachments, etc
|
||||
# 2. Segment into Sections for more accurate linking, can split by headers but make sure no text/ordering is lost
|
||||
|
||||
_COMMENT_EXPANSION_FIELDS = ["body.storage.value"]
|
||||
_PAGE_EXPANSION_FIELDS = [
|
||||
"body.storage.value",
|
||||
"version",
|
||||
"space",
|
||||
"metadata.labels",
|
||||
"history.lastUpdated",
|
||||
]
|
||||
_ATTACHMENT_EXPANSION_FIELDS = [
|
||||
"version",
|
||||
"space",
|
||||
"metadata.labels",
|
||||
]
|
||||
|
||||
_RESTRICTIONS_EXPANSION_FIELDS = [
|
||||
"space",
|
||||
"restrictions.read.restrictions.user",
|
||||
@@ -84,11 +87,7 @@ _FULL_EXTENSION_FILTER_STRING = "".join(
|
||||
|
||||
|
||||
class ConfluenceConnector(
|
||||
LoadConnector,
|
||||
PollConnector,
|
||||
SlimConnector,
|
||||
CredentialsConnector,
|
||||
VisionEnabledConnector,
|
||||
LoadConnector, PollConnector, SlimConnector, CredentialsConnector
|
||||
):
|
||||
def __init__(
|
||||
self,
|
||||
@@ -106,24 +105,13 @@ class ConfluenceConnector(
|
||||
labels_to_skip: list[str] = CONFLUENCE_CONNECTOR_LABELS_TO_SKIP,
|
||||
timezone_offset: float = CONFLUENCE_TIMEZONE_OFFSET,
|
||||
) -> None:
|
||||
self.wiki_base = wiki_base
|
||||
self.is_cloud = is_cloud
|
||||
self.space = space
|
||||
self.page_id = page_id
|
||||
self.index_recursively = index_recursively
|
||||
self.cql_query = cql_query
|
||||
self.batch_size = batch_size
|
||||
self.continue_on_failure = continue_on_failure
|
||||
self.labels_to_skip = labels_to_skip
|
||||
self.timezone_offset = timezone_offset
|
||||
self._confluence_client: OnyxConfluence | None = None
|
||||
self._fetched_titles: set[str] = set()
|
||||
|
||||
# Initialize vision LLM using the mixin
|
||||
self.initialize_vision_llm()
|
||||
self.is_cloud = is_cloud
|
||||
|
||||
# Remove trailing slash from wiki_base if present
|
||||
self.wiki_base = wiki_base.rstrip("/")
|
||||
|
||||
"""
|
||||
If nothing is provided, we default to fetching all pages
|
||||
Only one or none of the following options should be specified so
|
||||
@@ -165,6 +153,8 @@ class ConfluenceConnector(
|
||||
"max_backoff_seconds": 60,
|
||||
}
|
||||
|
||||
self._confluence_client: OnyxConfluence | None = None
|
||||
|
||||
@property
|
||||
def confluence_client(self) -> OnyxConfluence:
|
||||
if self._confluence_client is None:
|
||||
@@ -194,6 +184,7 @@ class ConfluenceConnector(
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> str:
|
||||
page_query = self.base_cql_page_query + self.cql_label_filter
|
||||
|
||||
# Add time filters
|
||||
if start:
|
||||
formatted_start_time = datetime.fromtimestamp(
|
||||
@@ -205,6 +196,7 @@ class ConfluenceConnector(
|
||||
"%Y-%m-%d %H:%M"
|
||||
)
|
||||
page_query += f" and lastmodified <= '{formatted_end_time}'"
|
||||
|
||||
return page_query
|
||||
|
||||
def _construct_attachment_query(self, confluence_page_id: str) -> str:
|
||||
@@ -215,10 +207,11 @@ class ConfluenceConnector(
|
||||
|
||||
def _get_comment_string_for_page_id(self, page_id: str) -> str:
|
||||
comment_string = ""
|
||||
|
||||
comment_cql = f"type=comment and container='{page_id}'"
|
||||
comment_cql += self.cql_label_filter
|
||||
expand = ",".join(_COMMENT_EXPANSION_FIELDS)
|
||||
|
||||
expand = ",".join(_COMMENT_EXPANSION_FIELDS)
|
||||
for comment in self.confluence_client.paginated_cql_retrieval(
|
||||
cql=comment_cql,
|
||||
expand=expand,
|
||||
@@ -229,177 +222,123 @@ class ConfluenceConnector(
|
||||
confluence_object=comment,
|
||||
fetched_titles=set(),
|
||||
)
|
||||
|
||||
return comment_string
|
||||
|
||||
def _convert_page_to_document(self, page: dict[str, Any]) -> Document | None:
|
||||
def _convert_object_to_document(
|
||||
self,
|
||||
confluence_object: dict[str, Any],
|
||||
parent_content_id: str | None = None,
|
||||
) -> Document | None:
|
||||
"""
|
||||
Converts a Confluence page to a Document object.
|
||||
Includes the page content, comments, and attachments.
|
||||
"""
|
||||
try:
|
||||
# Extract basic page information
|
||||
page_id = page["id"]
|
||||
page_title = page["title"]
|
||||
page_url = f"{self.wiki_base}/wiki{page['_links']['webui']}"
|
||||
Takes in a confluence object, extracts all metadata, and converts it into a document.
|
||||
If its a page, it extracts the text, adds the comments for the document text.
|
||||
If its an attachment, it just downloads the attachment and converts that into a document.
|
||||
|
||||
# Get the page content
|
||||
page_content = extract_text_from_confluence_html(
|
||||
self.confluence_client, page, self._fetched_titles
|
||||
parent_content_id: if the object is an attachment, specifies the content id that
|
||||
the attachment is attached to
|
||||
"""
|
||||
# The url and the id are the same
|
||||
object_url = build_confluence_document_id(
|
||||
self.wiki_base, confluence_object["_links"]["webui"], self.is_cloud
|
||||
)
|
||||
|
||||
object_text = None
|
||||
# Extract text from page
|
||||
if confluence_object["type"] == "page":
|
||||
object_text = extract_text_from_confluence_html(
|
||||
confluence_client=self.confluence_client,
|
||||
confluence_object=confluence_object,
|
||||
fetched_titles={confluence_object.get("title", "")},
|
||||
)
|
||||
# Add comments to text
|
||||
object_text += self._get_comment_string_for_page_id(confluence_object["id"])
|
||||
elif confluence_object["type"] == "attachment":
|
||||
object_text = attachment_to_content(
|
||||
confluence_client=self.confluence_client,
|
||||
attachment=confluence_object,
|
||||
parent_content_id=parent_content_id,
|
||||
)
|
||||
|
||||
# Create the main section for the page content
|
||||
sections = [Section(text=page_content, link=page_url)]
|
||||
|
||||
# Process comments if available
|
||||
comment_text = self._get_comment_string_for_page_id(page_id)
|
||||
if comment_text:
|
||||
sections.append(Section(text=comment_text, link=f"{page_url}#comments"))
|
||||
|
||||
# Process attachments
|
||||
if "children" in page and "attachment" in page["children"]:
|
||||
attachments = self.confluence_client.get_attachments_for_page(
|
||||
page_id, expand="metadata"
|
||||
)
|
||||
|
||||
for attachment in attachments.get("results", []):
|
||||
# Process each attachment
|
||||
result = process_attachment(
|
||||
self.confluence_client,
|
||||
attachment,
|
||||
page_title,
|
||||
self.image_analysis_llm,
|
||||
)
|
||||
|
||||
if result.text:
|
||||
# Create a section for the attachment text
|
||||
attachment_section = Section(
|
||||
text=result.text,
|
||||
link=f"{page_url}#attachment-{attachment['id']}",
|
||||
image_file_name=result.file_name,
|
||||
)
|
||||
sections.append(attachment_section)
|
||||
elif result.error:
|
||||
logger.warning(
|
||||
f"Error processing attachment '{attachment.get('title')}': {result.error}"
|
||||
)
|
||||
|
||||
# Extract metadata
|
||||
metadata = {}
|
||||
if "space" in page:
|
||||
metadata["space"] = page["space"].get("name", "")
|
||||
|
||||
# Extract labels
|
||||
labels = []
|
||||
if "metadata" in page and "labels" in page["metadata"]:
|
||||
for label in page["metadata"]["labels"].get("results", []):
|
||||
labels.append(label.get("name", ""))
|
||||
if labels:
|
||||
metadata["labels"] = labels
|
||||
|
||||
# Extract owners
|
||||
primary_owners = []
|
||||
if "version" in page and "by" in page["version"]:
|
||||
author = page["version"]["by"]
|
||||
display_name = author.get("displayName", "Unknown")
|
||||
primary_owners.append(BasicExpertInfo(display_name=display_name))
|
||||
|
||||
# Create the document
|
||||
return Document(
|
||||
id=build_confluence_document_id(self.wiki_base, page_id, self.is_cloud),
|
||||
sections=sections,
|
||||
source=DocumentSource.CONFLUENCE,
|
||||
semantic_identifier=page_title,
|
||||
metadata=metadata,
|
||||
doc_updated_at=datetime_from_string(page["version"]["when"]),
|
||||
primary_owners=primary_owners if primary_owners else None,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Error converting page {page.get('id', 'unknown')}: {e}")
|
||||
if not self.continue_on_failure:
|
||||
raise
|
||||
if object_text is None:
|
||||
# This only happens for attachments that are not parseable
|
||||
return None
|
||||
|
||||
# Get space name
|
||||
doc_metadata: dict[str, str | list[str]] = {
|
||||
"Wiki Space Name": confluence_object["space"]["name"]
|
||||
}
|
||||
|
||||
# Get labels
|
||||
label_dicts = (
|
||||
confluence_object.get("metadata", {}).get("labels", {}).get("results", [])
|
||||
)
|
||||
page_labels = [label.get("name") for label in label_dicts if label.get("name")]
|
||||
if page_labels:
|
||||
doc_metadata["labels"] = page_labels
|
||||
|
||||
# Get last modified and author email
|
||||
version_dict = confluence_object.get("version", {})
|
||||
last_modified = (
|
||||
datetime_from_string(version_dict.get("when"))
|
||||
if version_dict.get("when")
|
||||
else None
|
||||
)
|
||||
author_email = version_dict.get("by", {}).get("email")
|
||||
|
||||
title = confluence_object.get("title", "Untitled Document")
|
||||
|
||||
return Document(
|
||||
id=object_url,
|
||||
sections=[Section(link=object_url, text=object_text)],
|
||||
source=DocumentSource.CONFLUENCE,
|
||||
semantic_identifier=title,
|
||||
doc_updated_at=last_modified,
|
||||
primary_owners=(
|
||||
[BasicExpertInfo(email=author_email)] if author_email else None
|
||||
),
|
||||
metadata=doc_metadata,
|
||||
)
|
||||
|
||||
def _fetch_document_batches(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch | None = None,
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> GenerateDocumentsOutput:
|
||||
"""
|
||||
Yields batches of Documents. For each page:
|
||||
- Create a Document with 1 Section for the page text/comments
|
||||
- Then fetch attachments. For each attachment:
|
||||
- Attempt to convert it with convert_attachment_to_content(...)
|
||||
- If successful, create a new Section with the extracted text or summary.
|
||||
"""
|
||||
doc_batch: list[Document] = []
|
||||
confluence_page_ids: list[str] = []
|
||||
|
||||
page_query = self._construct_page_query(start, end)
|
||||
logger.debug(f"page_query: {page_query}")
|
||||
|
||||
# Fetch pages as Documents
|
||||
for page in self.confluence_client.paginated_cql_retrieval(
|
||||
cql=page_query,
|
||||
expand=",".join(_PAGE_EXPANSION_FIELDS),
|
||||
limit=self.batch_size,
|
||||
):
|
||||
# Build doc from page
|
||||
doc = self._convert_page_to_document(page)
|
||||
if not doc:
|
||||
continue
|
||||
|
||||
# Now get attachments for that page:
|
||||
attachment_query = self._construct_attachment_query(page["id"])
|
||||
# We'll use the page's XML to provide context if we summarize an image
|
||||
confluence_xml = page.get("body", {}).get("storage", {}).get("value", "")
|
||||
logger.debug(f"_fetch_document_batches: {page['id']}")
|
||||
confluence_page_ids.append(page["id"])
|
||||
doc = self._convert_object_to_document(page)
|
||||
if doc is not None:
|
||||
doc_batch.append(doc)
|
||||
if len(doc_batch) >= self.batch_size:
|
||||
yield doc_batch
|
||||
doc_batch = []
|
||||
|
||||
# Fetch attachments as Documents
|
||||
for confluence_page_id in confluence_page_ids:
|
||||
attachment_query = self._construct_attachment_query(confluence_page_id)
|
||||
# TODO: maybe should add time filter as well?
|
||||
for attachment in self.confluence_client.paginated_cql_retrieval(
|
||||
cql=attachment_query,
|
||||
expand=",".join(_ATTACHMENT_EXPANSION_FIELDS),
|
||||
):
|
||||
attachment["metadata"].get("mediaType", "")
|
||||
if not validate_attachment_filetype(
|
||||
attachment, self.image_analysis_llm
|
||||
):
|
||||
continue
|
||||
|
||||
# Attempt to get textual content or image summarization:
|
||||
try:
|
||||
logger.info(f"Processing attachment: {attachment['title']}")
|
||||
response = convert_attachment_to_content(
|
||||
confluence_client=self.confluence_client,
|
||||
attachment=attachment,
|
||||
page_context=confluence_xml,
|
||||
llm=self.image_analysis_llm,
|
||||
)
|
||||
if response is None:
|
||||
continue
|
||||
|
||||
content_text, file_storage_name = response
|
||||
|
||||
object_url = build_confluence_document_id(
|
||||
self.wiki_base, page["_links"]["webui"], self.is_cloud
|
||||
)
|
||||
|
||||
if content_text:
|
||||
doc.sections.append(
|
||||
Section(
|
||||
text=content_text,
|
||||
link=object_url,
|
||||
image_file_name=file_storage_name,
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Failed to extract/summarize attachment {attachment['title']}",
|
||||
exc_info=e,
|
||||
)
|
||||
if not self.continue_on_failure:
|
||||
raise
|
||||
|
||||
doc_batch.append(doc)
|
||||
|
||||
if len(doc_batch) >= self.batch_size:
|
||||
yield doc_batch
|
||||
doc_batch = []
|
||||
doc = self._convert_object_to_document(attachment, confluence_page_id)
|
||||
if doc is not None:
|
||||
doc_batch.append(doc)
|
||||
if len(doc_batch) >= self.batch_size:
|
||||
yield doc_batch
|
||||
doc_batch = []
|
||||
|
||||
if doc_batch:
|
||||
yield doc_batch
|
||||
@@ -420,63 +359,55 @@ class ConfluenceConnector(
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
callback: IndexingHeartbeatInterface | None = None,
|
||||
) -> GenerateSlimDocumentOutput:
|
||||
"""
|
||||
Return 'slim' docs (IDs + minimal permission data).
|
||||
Does not fetch actual text. Used primarily for incremental permission sync.
|
||||
"""
|
||||
doc_metadata_list: list[SlimDocument] = []
|
||||
|
||||
restrictions_expand = ",".join(_RESTRICTIONS_EXPANSION_FIELDS)
|
||||
|
||||
# Query pages
|
||||
page_query = self.base_cql_page_query + self.cql_label_filter
|
||||
for page in self.confluence_client.cql_paginate_all_expansions(
|
||||
cql=page_query,
|
||||
expand=restrictions_expand,
|
||||
limit=_SLIM_DOC_BATCH_SIZE,
|
||||
):
|
||||
# If the page has restrictions, add them to the perm_sync_data
|
||||
# These will be used by doc_sync.py to sync permissions
|
||||
page_restrictions = page.get("restrictions")
|
||||
page_space_key = page.get("space", {}).get("key")
|
||||
page_ancestors = page.get("ancestors", [])
|
||||
|
||||
page_perm_sync_data = {
|
||||
"restrictions": page_restrictions or {},
|
||||
"space_key": page_space_key,
|
||||
"ancestors": page_ancestors,
|
||||
"ancestors": page_ancestors or [],
|
||||
}
|
||||
|
||||
doc_metadata_list.append(
|
||||
SlimDocument(
|
||||
id=build_confluence_document_id(
|
||||
self.wiki_base, page["_links"]["webui"], self.is_cloud
|
||||
self.wiki_base,
|
||||
page["_links"]["webui"],
|
||||
self.is_cloud,
|
||||
),
|
||||
perm_sync_data=page_perm_sync_data,
|
||||
)
|
||||
)
|
||||
|
||||
# Query attachments for each page
|
||||
attachment_query = self._construct_attachment_query(page["id"])
|
||||
for attachment in self.confluence_client.cql_paginate_all_expansions(
|
||||
cql=attachment_query,
|
||||
expand=restrictions_expand,
|
||||
limit=_SLIM_DOC_BATCH_SIZE,
|
||||
):
|
||||
# If you skip images, you'll skip them in the permission sync
|
||||
attachment["metadata"].get("mediaType", "")
|
||||
if not validate_attachment_filetype(
|
||||
attachment, self.image_analysis_llm
|
||||
):
|
||||
if not validate_attachment_filetype(attachment):
|
||||
continue
|
||||
|
||||
attachment_restrictions = attachment.get("restrictions", {})
|
||||
attachment_restrictions = attachment.get("restrictions")
|
||||
if not attachment_restrictions:
|
||||
attachment_restrictions = page_restrictions or {}
|
||||
attachment_restrictions = page_restrictions
|
||||
|
||||
attachment_space_key = attachment.get("space", {}).get("key")
|
||||
if not attachment_space_key:
|
||||
attachment_space_key = page_space_key
|
||||
|
||||
attachment_perm_sync_data = {
|
||||
"restrictions": attachment_restrictions,
|
||||
"restrictions": attachment_restrictions or {},
|
||||
"space_key": attachment_space_key,
|
||||
}
|
||||
|
||||
@@ -490,16 +421,16 @@ class ConfluenceConnector(
|
||||
perm_sync_data=attachment_perm_sync_data,
|
||||
)
|
||||
)
|
||||
|
||||
if len(doc_metadata_list) > _SLIM_DOC_BATCH_SIZE:
|
||||
yield doc_metadata_list[:_SLIM_DOC_BATCH_SIZE]
|
||||
doc_metadata_list = doc_metadata_list[_SLIM_DOC_BATCH_SIZE:]
|
||||
|
||||
if callback and callback.should_stop():
|
||||
raise RuntimeError(
|
||||
"retrieve_all_slim_documents: Stop signal detected"
|
||||
)
|
||||
if callback:
|
||||
if callback.should_stop():
|
||||
raise RuntimeError(
|
||||
"retrieve_all_slim_documents: Stop signal detected"
|
||||
)
|
||||
|
||||
callback.progress("retrieve_all_slim_documents", 1)
|
||||
|
||||
yield doc_metadata_list
|
||||
|
||||
@@ -144,12 +144,6 @@ class OnyxConfluence:
|
||||
self.static_credentials = credential_json
|
||||
return credential_json, False
|
||||
|
||||
if not OAUTH_CONFLUENCE_CLOUD_CLIENT_ID:
|
||||
raise RuntimeError("OAUTH_CONFLUENCE_CLOUD_CLIENT_ID must be set!")
|
||||
|
||||
if not OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET:
|
||||
raise RuntimeError("OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET must be set!")
|
||||
|
||||
# check if we should refresh tokens. we're deciding to refresh halfway
|
||||
# to expiration
|
||||
now = datetime.now(timezone.utc)
|
||||
|
||||
@@ -1,12 +1,9 @@
|
||||
import io
|
||||
import math
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from datetime import timezone
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
from typing import TYPE_CHECKING
|
||||
@@ -15,28 +12,14 @@ from urllib.parse import parse_qs
|
||||
from urllib.parse import quote
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import bs4
|
||||
import requests
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.configs.app_configs import (
|
||||
CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD,
|
||||
)
|
||||
from onyx.configs.constants import FileOrigin
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
|
||||
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.db.models import PGFileStore
|
||||
from onyx.db.pg_file_store import create_populate_lobj
|
||||
from onyx.db.pg_file_store import save_bytes_to_pgfilestore
|
||||
from onyx.db.pg_file_store import upsert_pgfilestore
|
||||
from onyx.file_processing.extract_file_text import extract_file_text
|
||||
from onyx.file_processing.file_validation import is_valid_image_type
|
||||
from onyx.file_processing.image_utils import store_image_and_create_section
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.utils.logger import setup_logger
|
||||
pass
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
@@ -52,229 +35,15 @@ class TokenResponse(BaseModel):
|
||||
scope: str
|
||||
|
||||
|
||||
def validate_attachment_filetype(
|
||||
attachment: dict[str, Any], llm: LLM | None = None
|
||||
) -> bool:
|
||||
"""
|
||||
Validates if the attachment is a supported file type.
|
||||
If LLM is provided, also checks if it's an image that can be processed.
|
||||
"""
|
||||
attachment.get("metadata", {})
|
||||
media_type = attachment.get("metadata", {}).get("mediaType", "")
|
||||
|
||||
if media_type.startswith("image/"):
|
||||
return llm is not None and is_valid_image_type(media_type)
|
||||
|
||||
# For non-image files, check if we support the extension
|
||||
title = attachment.get("title", "")
|
||||
extension = Path(title).suffix.lstrip(".").lower() if "." in title else ""
|
||||
return extension in ["pdf", "doc", "docx", "txt", "md", "rtf"]
|
||||
|
||||
|
||||
class AttachmentProcessingResult(BaseModel):
|
||||
"""
|
||||
A container for results after processing a Confluence attachment.
|
||||
'text' is the textual content of the attachment.
|
||||
'file_name' is the final file name used in PGFileStore to store the content.
|
||||
'error' holds an exception or string if something failed.
|
||||
"""
|
||||
|
||||
text: str | None
|
||||
file_name: str | None
|
||||
error: str | None = None
|
||||
|
||||
|
||||
def _download_attachment(
|
||||
confluence_client: "OnyxConfluence", attachment: dict[str, Any]
|
||||
) -> bytes | None:
|
||||
"""
|
||||
Retrieves the raw bytes of an attachment from Confluence. Returns None on error.
|
||||
"""
|
||||
download_link = confluence_client.url + attachment["_links"]["download"]
|
||||
resp = confluence_client._session.get(download_link)
|
||||
if resp.status_code != 200:
|
||||
logger.warning(
|
||||
f"Failed to fetch {download_link} with status code {resp.status_code}"
|
||||
)
|
||||
return None
|
||||
return resp.content
|
||||
|
||||
|
||||
def process_attachment(
|
||||
confluence_client: "OnyxConfluence",
|
||||
attachment: dict[str, Any],
|
||||
page_context: str,
|
||||
llm: LLM | None,
|
||||
) -> AttachmentProcessingResult:
|
||||
"""
|
||||
Processes a Confluence attachment. If it's a document, extracts text,
|
||||
or if it's an image and an LLM is available, summarizes it. Returns a structured result.
|
||||
"""
|
||||
try:
|
||||
# Get the media type from the attachment metadata
|
||||
media_type = attachment.get("metadata", {}).get("mediaType", "")
|
||||
|
||||
# Validate the attachment type
|
||||
if not validate_attachment_filetype(attachment, llm):
|
||||
return AttachmentProcessingResult(
|
||||
text=None,
|
||||
file_name=None,
|
||||
error=f"Unsupported file type: {media_type}",
|
||||
)
|
||||
|
||||
# Download the attachment
|
||||
raw_bytes = _download_attachment(confluence_client, attachment)
|
||||
if raw_bytes is None:
|
||||
return AttachmentProcessingResult(
|
||||
text=None, file_name=None, error="Failed to download attachment"
|
||||
)
|
||||
|
||||
# Process image attachments with LLM if available
|
||||
if media_type.startswith("image/") and llm:
|
||||
return _process_image_attachment(
|
||||
confluence_client, attachment, page_context, llm, raw_bytes, media_type
|
||||
)
|
||||
|
||||
# Process document attachments
|
||||
try:
|
||||
text = extract_file_text(
|
||||
file=BytesIO(raw_bytes),
|
||||
file_name=attachment["title"],
|
||||
)
|
||||
|
||||
# Skip if the text is too long
|
||||
if len(text) > CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD:
|
||||
return AttachmentProcessingResult(
|
||||
text=None,
|
||||
file_name=None,
|
||||
error=f"Attachment text too long: {len(text)} chars",
|
||||
)
|
||||
|
||||
return AttachmentProcessingResult(text=text, file_name=None, error=None)
|
||||
except Exception as e:
|
||||
return AttachmentProcessingResult(
|
||||
text=None, file_name=None, error=f"Failed to extract text: {e}"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
return AttachmentProcessingResult(
|
||||
text=None, file_name=None, error=f"Failed to process attachment: {e}"
|
||||
)
|
||||
|
||||
|
||||
def _process_image_attachment(
|
||||
confluence_client: "OnyxConfluence",
|
||||
attachment: dict[str, Any],
|
||||
page_context: str,
|
||||
llm: LLM,
|
||||
raw_bytes: bytes,
|
||||
media_type: str,
|
||||
) -> AttachmentProcessingResult:
|
||||
"""Process an image attachment by saving it and generating a summary."""
|
||||
try:
|
||||
# Use the standardized image storage and section creation
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
section, file_name = store_image_and_create_section(
|
||||
db_session=db_session,
|
||||
image_data=raw_bytes,
|
||||
file_name=Path(attachment["id"]).name,
|
||||
display_name=attachment["title"],
|
||||
media_type=media_type,
|
||||
llm=llm,
|
||||
file_origin=FileOrigin.CONNECTOR,
|
||||
)
|
||||
|
||||
return AttachmentProcessingResult(
|
||||
text=section.text, file_name=file_name, error=None
|
||||
)
|
||||
except Exception as e:
|
||||
msg = f"Image summarization failed for {attachment['title']}: {e}"
|
||||
logger.error(msg, exc_info=e)
|
||||
return AttachmentProcessingResult(text=None, file_name=None, error=msg)
|
||||
|
||||
|
||||
def _process_text_attachment(
|
||||
attachment: dict[str, Any],
|
||||
raw_bytes: bytes,
|
||||
media_type: str,
|
||||
) -> AttachmentProcessingResult:
|
||||
"""Process a text-based attachment by extracting its content."""
|
||||
try:
|
||||
extracted_text = extract_file_text(
|
||||
io.BytesIO(raw_bytes),
|
||||
file_name=attachment["title"],
|
||||
break_on_unprocessable=False,
|
||||
)
|
||||
except Exception as e:
|
||||
msg = f"Failed to extract text for '{attachment['title']}': {e}"
|
||||
logger.error(msg, exc_info=e)
|
||||
return AttachmentProcessingResult(text=None, file_name=None, error=msg)
|
||||
|
||||
# Check length constraints
|
||||
if extracted_text is None or len(extracted_text) == 0:
|
||||
msg = f"No text extracted for {attachment['title']}"
|
||||
logger.warning(msg)
|
||||
return AttachmentProcessingResult(text=None, file_name=None, error=msg)
|
||||
|
||||
if len(extracted_text) > CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD:
|
||||
msg = (
|
||||
f"Skipping attachment {attachment['title']} due to char count "
|
||||
f"({len(extracted_text)} > {CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD})"
|
||||
)
|
||||
logger.warning(msg)
|
||||
return AttachmentProcessingResult(text=None, file_name=None, error=msg)
|
||||
|
||||
# Save the attachment
|
||||
try:
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
saved_record = save_bytes_to_pgfilestore(
|
||||
db_session=db_session,
|
||||
raw_bytes=raw_bytes,
|
||||
media_type=media_type,
|
||||
identifier=attachment["id"],
|
||||
display_name=attachment["title"],
|
||||
)
|
||||
except Exception as e:
|
||||
msg = f"Failed to save attachment '{attachment['title']}' to PG: {e}"
|
||||
logger.error(msg, exc_info=e)
|
||||
return AttachmentProcessingResult(
|
||||
text=extracted_text, file_name=None, error=msg
|
||||
)
|
||||
|
||||
return AttachmentProcessingResult(
|
||||
text=extracted_text, file_name=saved_record.file_name, error=None
|
||||
)
|
||||
|
||||
|
||||
def convert_attachment_to_content(
|
||||
confluence_client: "OnyxConfluence",
|
||||
attachment: dict[str, Any],
|
||||
page_context: str,
|
||||
llm: LLM | None,
|
||||
) -> tuple[str | None, str | None] | None:
|
||||
"""
|
||||
Facade function which:
|
||||
1. Validates attachment type
|
||||
2. Extracts or summarizes content
|
||||
3. Returns (content_text, stored_file_name) or None if we should skip it
|
||||
"""
|
||||
media_type = attachment["metadata"]["mediaType"]
|
||||
# Quick check for unsupported types:
|
||||
if media_type.startswith("video/") or media_type == "application/gliffy+json":
|
||||
logger.warning(
|
||||
f"Skipping unsupported attachment type: '{media_type}' for {attachment['title']}"
|
||||
)
|
||||
return None
|
||||
|
||||
result = process_attachment(confluence_client, attachment, page_context, llm)
|
||||
if result.error is not None:
|
||||
logger.warning(
|
||||
f"Attachment {attachment['title']} encountered error: {result.error}"
|
||||
)
|
||||
return None
|
||||
|
||||
# Return the text and the file name
|
||||
return result.text, result.file_name
|
||||
def validate_attachment_filetype(attachment: dict[str, Any]) -> bool:
|
||||
return attachment["metadata"]["mediaType"] not in [
|
||||
"image/jpeg",
|
||||
"image/png",
|
||||
"image/gif",
|
||||
"image/svg+xml",
|
||||
"video/mp4",
|
||||
"video/quicktime",
|
||||
]
|
||||
|
||||
|
||||
def build_confluence_document_id(
@@ -295,6 +64,23 @@ def build_confluence_document_id(
    return f"{base_url}{content_url}"


def _extract_referenced_attachment_names(page_text: str) -> list[str]:
    """Parse a Confluence html page to generate a list of current
    attachments in use

    Args:
        page_text (str): The page content

    Returns:
        list[str]: List of filenames currently in use by the page text
    """
    referenced_attachment_filenames = []
    soup = bs4.BeautifulSoup(page_text, "html.parser")
    for attachment in soup.findAll("ri:attachment"):
        referenced_attachment_filenames.append(attachment.attrs["ri:filename"])
    return referenced_attachment_filenames


def datetime_from_string(datetime_string: str) -> datetime:
    datetime_object = datetime.fromisoformat(datetime_string)

@@ -466,37 +252,3 @@ def update_param_in_path(path: str, param: str, value: str) -> str:
        + "?"
        + "&".join(f"{k}={quote(v[0])}" for k, v in query_params.items())
    )


def attachment_to_file_record(
    confluence_client: "OnyxConfluence",
    attachment: dict[str, Any],
    db_session: Session,
) -> tuple[PGFileStore, bytes]:
    """Save an attachment to the file store and return the file record."""
    download_link = _attachment_to_download_link(confluence_client, attachment)
    image_data = confluence_client.get(
        download_link, absolute=True, not_json_response=True
    )

    # Save image to file store
    file_name = f"confluence_attachment_{attachment['id']}"
    lobj_oid = create_populate_lobj(BytesIO(image_data), db_session)
    pgfilestore = upsert_pgfilestore(
        file_name=file_name,
        display_name=attachment["title"],
        file_origin=FileOrigin.OTHER,
        file_type=attachment["metadata"]["mediaType"],
        lobj_oid=lobj_oid,
        db_session=db_session,
        commit=True,
    )

    return pgfilestore, image_data


def _attachment_to_download_link(
    confluence_client: "OnyxConfluence", attachment: dict[str, Any]
) -> str:
    """Extracts the download link to images."""
    return confluence_client.url + attachment["_links"]["download"]

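# Illustrative sketch (not part of the diff): _extract_referenced_attachment_names
# walks Confluence "storage format" XHTML, where attachments are referenced with
# <ri:attachment ri:filename="..."/> tags. The sample markup below is invented.
sample_storage_xhtml = (
    "<p>See the attached design doc:</p>"
    '<ac:image><ri:attachment ri:filename="architecture.png"/></ac:image>'
    '<ac:link><ri:attachment ri:filename="spec.pdf"/></ac:link>'
)
# For the markup above this returns ["architecture.png", "spec.pdf"]
referenced = _extract_referenced_attachment_names(sample_storage_xhtml)
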
@@ -10,23 +10,22 @@ from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.configs.constants import FileOrigin
|
||||
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
|
||||
from onyx.connectors.interfaces import GenerateDocumentsOutput
|
||||
from onyx.connectors.interfaces import LoadConnector
|
||||
from onyx.connectors.models import BasicExpertInfo
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.vision_enabled_connector import VisionEnabledConnector
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.db.pg_file_store import get_pgfilestore_by_file_name
|
||||
from onyx.file_processing.extract_file_text import extract_text_and_images
|
||||
from onyx.file_processing.extract_file_text import detect_encoding
|
||||
from onyx.file_processing.extract_file_text import extract_file_text
|
||||
from onyx.file_processing.extract_file_text import get_file_ext
|
||||
from onyx.file_processing.extract_file_text import is_text_file_extension
|
||||
from onyx.file_processing.extract_file_text import is_valid_file_ext
|
||||
from onyx.file_processing.extract_file_text import load_files_from_zip
|
||||
from onyx.file_processing.image_utils import store_image_and_create_section
|
||||
from onyx.file_processing.extract_file_text import read_pdf_file
|
||||
from onyx.file_processing.extract_file_text import read_text_file
|
||||
from onyx.file_store.file_store import get_default_file_store
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
@@ -36,115 +35,81 @@ def _read_files_and_metadata(
|
||||
file_name: str,
|
||||
db_session: Session,
|
||||
) -> Iterator[tuple[str, IO, dict[str, Any]]]:
|
||||
"""
|
||||
Reads the file from Postgres. If the file is a .zip, yields subfiles.
|
||||
"""
|
||||
"""Reads the file into IO, in the case of a zip file, yields each individual
|
||||
file contained within, also includes the metadata dict if packaged in the zip"""
|
||||
extension = get_file_ext(file_name)
|
||||
metadata: dict[str, Any] = {}
|
||||
directory_path = os.path.dirname(file_name)
|
||||
|
||||
# Read file from Postgres store
|
||||
file_content = get_default_file_store(db_session).read_file(file_name, mode="b")
|
||||
|
||||
# If it's a zip, expand it
|
||||
if extension == ".zip":
|
||||
for file_info, subfile, metadata in load_files_from_zip(
|
||||
for file_info, file, metadata in load_files_from_zip(
|
||||
file_content, ignore_dirs=True
|
||||
):
|
||||
yield os.path.join(directory_path, file_info.filename), subfile, metadata
|
||||
yield os.path.join(directory_path, file_info.filename), file, metadata
|
||||
elif is_valid_file_ext(extension):
|
||||
yield file_name, file_content, metadata
|
||||
else:
|
||||
logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
|
||||
|
||||
|
||||
def _create_image_section(
|
||||
llm: LLM | None,
|
||||
image_data: bytes,
|
||||
db_session: Session,
|
||||
parent_file_name: str,
|
||||
display_name: str,
|
||||
idx: int = 0,
|
||||
) -> tuple[Section, str | None]:
|
||||
"""
|
||||
Create a Section object for a single image and store the image in PGFileStore.
|
||||
If summarization is enabled and we have an LLM, summarize the image.
|
||||
|
||||
Returns:
|
||||
tuple: (Section object, file_name in PGFileStore or None if storage failed)
|
||||
"""
|
||||
# Create a unique file name for the embedded image
|
||||
file_name = f"{parent_file_name}_embedded_{idx}"
|
||||
|
||||
# Use the standardized utility to store the image and create a section
|
||||
return store_image_and_create_section(
|
||||
db_session=db_session,
|
||||
image_data=image_data,
|
||||
file_name=file_name,
|
||||
display_name=display_name,
|
||||
llm=llm,
|
||||
file_origin=FileOrigin.OTHER,
|
||||
)
|
||||
|
||||
|
||||
def _process_file(
|
||||
file_name: str,
|
||||
file: IO[Any],
|
||||
metadata: dict[str, Any] | None,
|
||||
pdf_pass: str | None,
|
||||
db_session: Session,
|
||||
llm: LLM | None,
|
||||
metadata: dict[str, Any] | None = None,
|
||||
pdf_pass: str | None = None,
|
||||
) -> list[Document]:
|
||||
"""
|
||||
Processes a single file, returning a list of Documents (typically one).
|
||||
Also handles embedded images if 'EMBEDDED_IMAGE_EXTRACTION_ENABLED' is true.
|
||||
"""
|
||||
extension = get_file_ext(file_name)
|
||||
|
||||
# Fetch the DB record so we know the ID for internal URL
|
||||
pg_record = get_pgfilestore_by_file_name(file_name=file_name, db_session=db_session)
|
||||
if not pg_record:
|
||||
logger.warning(f"No file record found for '{file_name}' in PG; skipping.")
|
||||
return []
|
||||
|
||||
if not is_valid_file_ext(extension):
|
||||
logger.warning(
|
||||
f"Skipping file '{file_name}' with unrecognized extension '{extension}'"
|
||||
)
|
||||
logger.warning(f"Skipping file '{file_name}' with extension '{extension}'")
|
||||
return []
|
||||
|
||||
# Prepare doc metadata
|
||||
if metadata is None:
|
||||
metadata = {}
|
||||
file_display_name = metadata.get("file_display_name") or os.path.basename(file_name)
|
||||
file_metadata: dict[str, Any] = {}
|
||||
|
||||
# Timestamps
|
||||
current_datetime = datetime.now(timezone.utc)
|
||||
time_updated = metadata.get("time_updated", current_datetime)
|
||||
if is_text_file_extension(file_name):
|
||||
encoding = detect_encoding(file)
|
||||
file_content_raw, file_metadata = read_text_file(
|
||||
file, encoding=encoding, ignore_onyx_metadata=False
|
||||
)
|
||||
|
||||
# Using the PDF reader function directly to pass in password cleanly
|
||||
elif extension == ".pdf" and pdf_pass is not None:
|
||||
file_content_raw, file_metadata = read_pdf_file(file=file, pdf_pass=pdf_pass)
|
||||
|
||||
else:
|
||||
file_content_raw = extract_file_text(
|
||||
file=file,
|
||||
file_name=file_name,
|
||||
break_on_unprocessable=True,
|
||||
)
|
||||
|
||||
all_metadata = {**metadata, **file_metadata} if metadata else file_metadata
|
||||
|
||||
# add a prefix to avoid conflicts with other connectors
|
||||
doc_id = f"FILE_CONNECTOR__{file_name}"
|
||||
if metadata:
|
||||
doc_id = metadata.get("document_id") or doc_id
|
||||
|
||||
# If this is set, we will show this in the UI as the "name" of the file
|
||||
file_display_name = all_metadata.get("file_display_name") or os.path.basename(
|
||||
file_name
|
||||
)
|
||||
title = (
|
||||
all_metadata["title"] or "" if "title" in all_metadata else file_display_name
|
||||
)
|
||||
|
||||
time_updated = all_metadata.get("time_updated", datetime.now(timezone.utc))
|
||||
if isinstance(time_updated, str):
|
||||
time_updated = time_str_to_utc(time_updated)
|
||||
|
||||
dt_str = metadata.get("doc_updated_at")
|
||||
dt_str = all_metadata.get("doc_updated_at")
|
||||
final_time_updated = time_str_to_utc(dt_str) if dt_str else time_updated
|
||||
|
||||
# Collect owners
|
||||
p_owner_names = metadata.get("primary_owners")
|
||||
s_owner_names = metadata.get("secondary_owners")
|
||||
p_owners = (
|
||||
[BasicExpertInfo(display_name=name) for name in p_owner_names]
|
||||
if p_owner_names
|
||||
else None
|
||||
)
|
||||
s_owners = (
|
||||
[BasicExpertInfo(display_name=name) for name in s_owner_names]
|
||||
if s_owner_names
|
||||
else None
|
||||
)
|
||||
|
||||
# Additional tags we store as doc metadata
|
||||
# Metadata tags separate from the Onyx specific fields
|
||||
metadata_tags = {
|
||||
k: v
|
||||
for k, v in metadata.items()
|
||||
for k, v in all_metadata.items()
|
||||
if k
|
||||
not in [
|
||||
"document_id",
|
||||
@@ -157,142 +122,77 @@ def _process_file(
|
||||
"file_display_name",
|
||||
"title",
|
||||
"connector_type",
|
||||
"pdf_password",
|
||||
]
|
||||
}
|
||||
|
||||
source_type_str = metadata.get("connector_type")
|
||||
source_type = (
|
||||
DocumentSource(source_type_str) if source_type_str else DocumentSource.FILE
|
||||
source_type_str = all_metadata.get("connector_type")
|
||||
source_type = DocumentSource(source_type_str) if source_type_str else None
|
||||
|
||||
p_owner_names = all_metadata.get("primary_owners")
|
||||
s_owner_names = all_metadata.get("secondary_owners")
|
||||
p_owners = (
|
||||
[BasicExpertInfo(display_name=name) for name in p_owner_names]
|
||||
if p_owner_names
|
||||
else None
|
||||
)
|
||||
s_owners = (
|
||||
[BasicExpertInfo(display_name=name) for name in s_owner_names]
|
||||
if s_owner_names
|
||||
else None
|
||||
)
|
||||
|
||||
doc_id = metadata.get("document_id") or f"FILE_CONNECTOR__{file_name}"
|
||||
title = metadata.get("title") or file_display_name
|
||||
|
||||
# 1) If the file itself is an image, handle that scenario quickly
|
||||
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp"}
|
||||
if extension in IMAGE_EXTENSIONS:
|
||||
# Summarize or produce empty doc
|
||||
image_data = file.read()
|
||||
image_section, _ = _create_image_section(
|
||||
llm, image_data, db_session, pg_record.file_name, title
|
||||
)
|
||||
return [
|
||||
Document(
|
||||
id=doc_id,
|
||||
sections=[image_section],
|
||||
source=source_type,
|
||||
semantic_identifier=file_display_name,
|
||||
title=title,
|
||||
doc_updated_at=final_time_updated,
|
||||
primary_owners=p_owners,
|
||||
secondary_owners=s_owners,
|
||||
metadata=metadata_tags,
|
||||
)
|
||||
]
|
||||
|
||||
# 2) Otherwise: text-based approach. Possibly with embedded images if enabled.
|
||||
# (For example .docx with inline images).
|
||||
file.seek(0)
|
||||
text_content = ""
|
||||
embedded_images: list[tuple[bytes, str]] = []
|
||||
|
||||
text_content, embedded_images = extract_text_and_images(
|
||||
file=file,
|
||||
file_name=file_name,
|
||||
pdf_pass=pdf_pass,
|
||||
)
|
||||
|
||||
# Build sections: first the text as a single Section
|
||||
sections = []
|
||||
link_in_meta = metadata.get("link")
|
||||
if text_content.strip():
|
||||
sections.append(Section(link=link_in_meta, text=text_content.strip()))
|
||||
|
||||
# Then any extracted images from docx, etc.
|
||||
for idx, (img_data, img_name) in enumerate(embedded_images, start=1):
|
||||
# Store each embedded image as a separate file in PGFileStore
|
||||
# and create a section with the image summary
|
||||
image_section, _ = _create_image_section(
|
||||
llm,
|
||||
img_data,
|
||||
db_session,
|
||||
pg_record.file_name,
|
||||
f"{title} - image {idx}",
|
||||
idx,
|
||||
)
|
||||
sections.append(image_section)
|
||||
return [
|
||||
Document(
|
||||
id=doc_id,
|
||||
sections=sections,
|
||||
source=source_type,
|
||||
sections=[
|
||||
Section(link=all_metadata.get("link"), text=file_content_raw.strip())
|
||||
],
|
||||
source=source_type or DocumentSource.FILE,
|
||||
semantic_identifier=file_display_name,
|
||||
title=title,
|
||||
doc_updated_at=final_time_updated,
|
||||
primary_owners=p_owners,
|
||||
secondary_owners=s_owners,
|
||||
# currently metadata just houses tags, other stuff like owners / updated at have dedicated fields
|
||||
metadata=metadata_tags,
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
class LocalFileConnector(LoadConnector, VisionEnabledConnector):
|
||||
"""
|
||||
Connector that reads files from Postgres and yields Documents, including
|
||||
optional embedded image extraction.
|
||||
"""
|
||||
|
||||
class LocalFileConnector(LoadConnector):
|
||||
def __init__(
|
||||
self,
|
||||
file_locations: list[Path | str],
|
||||
batch_size: int = INDEX_BATCH_SIZE,
|
||||
) -> None:
|
||||
self.file_locations = [str(loc) for loc in file_locations]
|
||||
self.file_locations = [Path(file_location) for file_location in file_locations]
|
||||
self.batch_size = batch_size
|
||||
self.pdf_pass: str | None = None
|
||||
|
||||
# Initialize vision LLM using the mixin
|
||||
self.initialize_vision_llm()
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||
self.pdf_pass = credentials.get("pdf_password")
|
||||
|
||||
return None
|
||||
|
||||
def load_from_state(self) -> GenerateDocumentsOutput:
|
||||
"""
|
||||
Iterates over each file path, fetches from Postgres, tries to parse text
|
||||
or images, and yields Document batches.
|
||||
"""
|
||||
documents: list[Document] = []
|
||||
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
for file_path in self.file_locations:
|
||||
current_datetime = datetime.now(timezone.utc)
|
||||
|
||||
files_iter = _read_files_and_metadata(
|
||||
file_name=file_path,
|
||||
db_session=db_session,
|
||||
files = _read_files_and_metadata(
|
||||
file_name=str(file_path), db_session=db_session
|
||||
)
|
||||
|
||||
for actual_file_name, file, metadata in files_iter:
|
||||
for file_name, file, metadata in files:
|
||||
metadata["time_updated"] = metadata.get(
|
||||
"time_updated", current_datetime
|
||||
)
|
||||
new_docs = _process_file(
|
||||
file_name=actual_file_name,
|
||||
file=file,
|
||||
metadata=metadata,
|
||||
pdf_pass=self.pdf_pass,
|
||||
db_session=db_session,
|
||||
llm=self.image_analysis_llm,
|
||||
documents.extend(
|
||||
_process_file(file_name, file, metadata, self.pdf_pass)
|
||||
)
|
||||
documents.extend(new_docs)
|
||||
|
||||
if len(documents) >= self.batch_size:
|
||||
yield documents
|
||||
|
||||
documents = []
|
||||
|
||||
if documents:
|
||||
@@ -301,7 +201,7 @@ class LocalFileConnector(LoadConnector, VisionEnabledConnector):
|
||||
|
||||
if __name__ == "__main__":
|
||||
connector = LocalFileConnector(file_locations=[os.environ["TEST_FILE"]])
|
||||
connector.load_credentials({"pdf_password": os.environ.get("PDF_PASSWORD")})
|
||||
doc_batches = connector.load_from_state()
|
||||
for batch in doc_batches:
|
||||
print("BATCH:", batch)
|
||||
connector.load_credentials({"pdf_password": os.environ["PDF_PASSWORD"]})
|
||||
|
||||
document_batches = connector.load_from_state()
|
||||
print(next(document_batches))
|
||||
|
||||
@@ -4,12 +4,14 @@ from concurrent.futures import as_completed
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from functools import partial
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
|
||||
from google.oauth2.credentials import Credentials as OAuthCredentials # type: ignore
|
||||
from google.oauth2.service_account import Credentials as ServiceAccountCredentials # type: ignore
|
||||
from googleapiclient.errors import HttpError # type: ignore
|
||||
|
||||
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
||||
from onyx.configs.app_configs import MAX_FILE_SIZE_BYTES
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.connectors.exceptions import ConnectorValidationError
|
||||
from onyx.connectors.exceptions import CredentialExpiredError
|
||||
@@ -34,6 +36,7 @@ from onyx.connectors.google_utils.shared_constants import (
|
||||
)
|
||||
from onyx.connectors.google_utils.shared_constants import MISSING_SCOPES_ERROR_STR
|
||||
from onyx.connectors.google_utils.shared_constants import ONYX_SCOPE_INSTRUCTIONS
|
||||
from onyx.connectors.google_utils.shared_constants import SCOPE_DOC_URL
|
||||
from onyx.connectors.google_utils.shared_constants import SLIM_BATCH_SIZE
|
||||
from onyx.connectors.google_utils.shared_constants import USER_FIELDS
|
||||
from onyx.connectors.interfaces import GenerateDocumentsOutput
|
||||
@@ -43,9 +46,7 @@ from onyx.connectors.interfaces import PollConnector
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.interfaces import SlimConnector
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.vision_enabled_connector import VisionEnabledConnector
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.retry_wrapper import retry_builder
|
||||
|
||||
@@ -65,10 +66,7 @@ def _extract_ids_from_urls(urls: list[str]) -> list[str]:
|
||||
|
||||
|
||||
def _convert_single_file(
|
||||
creds: Any,
|
||||
primary_admin_email: str,
|
||||
file: dict[str, Any],
|
||||
image_analysis_llm: LLM | None,
|
||||
creds: Any, primary_admin_email: str, file: dict[str, Any]
|
||||
) -> Any:
|
||||
user_email = file.get("owners", [{}])[0].get("emailAddress") or primary_admin_email
|
||||
user_drive_service = get_drive_service(creds, user_email=user_email)
|
||||
@@ -77,14 +75,11 @@ def _convert_single_file(
|
||||
file=file,
|
||||
drive_service=user_drive_service,
|
||||
docs_service=docs_service,
|
||||
image_analysis_llm=image_analysis_llm, # pass the LLM so doc_conversion can summarize images
|
||||
)
|
||||
|
||||
|
||||
def _process_files_batch(
|
||||
files: list[GoogleDriveFileType],
|
||||
convert_func: Callable[[GoogleDriveFileType], Any],
|
||||
batch_size: int,
|
||||
files: list[GoogleDriveFileType], convert_func: Callable, batch_size: int
|
||||
) -> GenerateDocumentsOutput:
|
||||
doc_batch = []
|
||||
with ThreadPoolExecutor(max_workers=min(16, len(files))) as executor:
|
||||
@@ -116,9 +111,7 @@ def _clean_requested_drive_ids(
|
||||
return valid_requested_drive_ids, filtered_folder_ids
|
||||
|
||||
|
||||
class GoogleDriveConnector(
|
||||
LoadConnector, PollConnector, SlimConnector, VisionEnabledConnector
|
||||
):
|
||||
class GoogleDriveConnector(LoadConnector, PollConnector, SlimConnector):
|
||||
def __init__(
|
||||
self,
|
||||
include_shared_drives: bool = False,
|
||||
@@ -136,23 +129,23 @@ class GoogleDriveConnector(
|
||||
continue_on_failure: bool | None = None,
|
||||
) -> None:
|
||||
# Check for old input parameters
|
||||
if folder_paths is not None:
|
||||
logger.warning(
|
||||
"The 'folder_paths' parameter is deprecated. Use 'shared_folder_urls' instead."
|
||||
if (
|
||||
folder_paths is not None
|
||||
or include_shared is not None
|
||||
or follow_shortcuts is not None
|
||||
or only_org_public is not None
|
||||
or continue_on_failure is not None
|
||||
):
|
||||
logger.exception(
|
||||
"Google Drive connector received old input parameters. "
|
||||
"Please visit the docs for help with the new setup: "
|
||||
f"{SCOPE_DOC_URL}"
|
||||
)
|
||||
if include_shared is not None:
|
||||
logger.warning(
|
||||
"The 'include_shared' parameter is deprecated. Use 'include_files_shared_with_me' instead."
|
||||
raise ConnectorValidationError(
|
||||
"Google Drive connector received old input parameters. "
|
||||
"Please visit the docs for help with the new setup: "
|
||||
f"{SCOPE_DOC_URL}"
|
||||
)
|
||||
if follow_shortcuts is not None:
|
||||
logger.warning("The 'follow_shortcuts' parameter is deprecated.")
|
||||
if only_org_public is not None:
|
||||
logger.warning("The 'only_org_public' parameter is deprecated.")
|
||||
if continue_on_failure is not None:
|
||||
logger.warning("The 'continue_on_failure' parameter is deprecated.")
|
||||
|
||||
# Initialize vision LLM using the mixin
|
||||
self.initialize_vision_llm()
|
||||
|
||||
if (
|
||||
not include_shared_drives
|
||||
@@ -244,7 +237,6 @@ class GoogleDriveConnector(
|
||||
credentials=credentials,
|
||||
source=DocumentSource.GOOGLE_DRIVE,
|
||||
)
|
||||
|
||||
return new_creds_dict
|
||||
|
||||
def _update_traversed_parent_ids(self, folder_id: str) -> None:
|
||||
@@ -531,53 +523,37 @@ class GoogleDriveConnector(
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
) -> GenerateDocumentsOutput:
|
||||
# Create a larger process pool for file conversion
|
||||
with ThreadPoolExecutor(max_workers=8) as executor:
|
||||
# Prepare a partial function with the credentials and admin email
|
||||
convert_func = partial(
|
||||
_convert_single_file,
|
||||
self.creds,
|
||||
self.primary_admin_email,
|
||||
image_analysis_llm=self.image_analysis_llm, # Use the mixin's LLM
|
||||
convert_func = partial(
|
||||
_convert_single_file, self.creds, self.primary_admin_email
|
||||
)
|
||||
|
||||
# Process files in larger batches
|
||||
LARGE_BATCH_SIZE = self.batch_size * 4
|
||||
files_to_process = []
|
||||
# Gather the files into batches to be processed in parallel
|
||||
for file in self._fetch_drive_items(is_slim=False, start=start, end=end):
|
||||
if (
|
||||
file.get("size")
|
||||
and int(cast(str, file.get("size"))) > MAX_FILE_SIZE_BYTES
|
||||
):
|
||||
logger.warning(
|
||||
f"Skipping file {file.get('name', 'Unknown')} as it is too large: {file.get('size')} bytes"
|
||||
)
|
||||
continue
|
||||
|
||||
files_to_process.append(file)
|
||||
if len(files_to_process) >= LARGE_BATCH_SIZE:
|
||||
yield from _process_files_batch(
|
||||
files_to_process, convert_func, self.batch_size
|
||||
)
|
||||
files_to_process = []
|
||||
|
||||
# Process any remaining files
|
||||
if files_to_process:
|
||||
yield from _process_files_batch(
|
||||
files_to_process, convert_func, self.batch_size
|
||||
)
|
||||
|
||||
# Fetch files in batches
|
||||
files_batch: list[GoogleDriveFileType] = []
|
||||
for file in self._fetch_drive_items(is_slim=False, start=start, end=end):
|
||||
files_batch.append(file)
|
||||
|
||||
if len(files_batch) >= self.batch_size:
|
||||
# Process the batch
|
||||
futures = [
|
||||
executor.submit(convert_func, file) for file in files_batch
|
||||
]
|
||||
documents = []
|
||||
for future in as_completed(futures):
|
||||
try:
|
||||
doc = future.result()
|
||||
if doc is not None:
|
||||
documents.append(doc)
|
||||
except Exception as e:
|
||||
logger.error(f"Error converting file: {e}")
|
||||
|
||||
if documents:
|
||||
yield documents
|
||||
files_batch = []
|
||||
|
||||
# Process any remaining files
|
||||
if files_batch:
|
||||
futures = [executor.submit(convert_func, file) for file in files_batch]
|
||||
documents = []
|
||||
for future in as_completed(futures):
|
||||
try:
|
||||
doc = future.result()
|
||||
if doc is not None:
|
||||
documents.append(doc)
|
||||
except Exception as e:
|
||||
logger.error(f"Error converting file: {e}")
|
||||
|
||||
if documents:
|
||||
yield documents
|
||||
|
||||
def load_from_state(self) -> GenerateDocumentsOutput:
|
||||
try:
|
||||
yield from self._extract_docs_from_google_drive()
|
||||
|
||||
@@ -9,7 +9,7 @@ from googleapiclient.errors import HttpError # type: ignore
|
||||
|
||||
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.configs.constants import FileOrigin
|
||||
from onyx.configs.constants import IGNORE_FOR_QA
|
||||
from onyx.connectors.google_drive.constants import DRIVE_FOLDER_TYPE
|
||||
from onyx.connectors.google_drive.constants import DRIVE_SHORTCUT_TYPE
|
||||
from onyx.connectors.google_drive.constants import UNSUPPORTED_FILE_TYPE_CONTENT
|
||||
@@ -21,88 +21,32 @@ from onyx.connectors.google_utils.resources import GoogleDriveService
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.file_processing.extract_file_text import docx_to_text_and_images
|
||||
from onyx.file_processing.extract_file_text import docx_to_text
|
||||
from onyx.file_processing.extract_file_text import pptx_to_text
|
||||
from onyx.file_processing.extract_file_text import read_pdf_file
|
||||
from onyx.file_processing.file_validation import is_valid_image_type
|
||||
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
|
||||
from onyx.file_processing.image_utils import store_image_and_create_section
|
||||
from onyx.file_processing.unstructured import get_unstructured_api_key
|
||||
from onyx.file_processing.unstructured import unstructured_to_text
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def _summarize_drive_image(
|
||||
image_data: bytes, image_name: str, image_analysis_llm: LLM | None
|
||||
) -> str:
|
||||
"""
|
||||
Summarize the given image using the provided LLM.
|
||||
"""
|
||||
if not image_analysis_llm:
|
||||
return ""
|
||||
|
||||
return (
|
||||
summarize_image_with_error_handling(
|
||||
llm=image_analysis_llm,
|
||||
image_data=image_data,
|
||||
context_name=image_name,
|
||||
)
|
||||
or ""
|
||||
)
|
||||
|
||||
|
||||
def is_gdrive_image_mime_type(mime_type: str) -> bool:
|
||||
"""
|
||||
Return True if the mime_type is a common image type in GDrive.
|
||||
(e.g. 'image/png', 'image/jpeg')
|
||||
"""
|
||||
return is_valid_image_type(mime_type)
|
||||
# these errors don't represent a failure in the connector, but simply files
|
||||
# that can't / shouldn't be indexed
|
||||
ERRORS_TO_CONTINUE_ON = [
|
||||
"cannotExportFile",
|
||||
"exportSizeLimitExceeded",
|
||||
"cannotDownloadFile",
|
||||
]
|
||||
|
||||
|
||||
def _extract_sections_basic(
|
||||
file: dict[str, str],
|
||||
service: GoogleDriveService,
|
||||
image_analysis_llm: LLM | None = None,
|
||||
file: dict[str, str], service: GoogleDriveService
|
||||
) -> list[Section]:
|
||||
"""
|
||||
Extends the existing logic to handle either a docx with embedded images
|
||||
or standalone images (PNG, JPG, etc).
|
||||
"""
|
||||
mime_type = file["mimeType"]
|
||||
link = file["webViewLink"]
|
||||
file_name = file.get("name", file["id"])
|
||||
supported_file_types = set(item.value for item in GDriveMimeType)
|
||||
|
||||
# 1) If the file is an image, retrieve the raw bytes, optionally summarize
|
||||
if is_gdrive_image_mime_type(mime_type):
|
||||
try:
|
||||
response = service.files().get_media(fileId=file["id"]).execute()
|
||||
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
section, _ = store_image_and_create_section(
|
||||
db_session=db_session,
|
||||
image_data=response,
|
||||
file_name=file["id"],
|
||||
display_name=file_name,
|
||||
media_type=mime_type,
|
||||
llm=image_analysis_llm,
|
||||
file_origin=FileOrigin.CONNECTOR,
|
||||
)
|
||||
return [section]
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to fetch or summarize image: {e}")
|
||||
return [
|
||||
Section(
|
||||
link=link,
|
||||
text="",
|
||||
image_file_name=link,
|
||||
)
|
||||
]
|
||||
|
||||
if mime_type not in supported_file_types:
|
||||
# Unsupported file types can still have a title, finding this way is still useful
|
||||
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
|
||||
@@ -241,63 +185,45 @@ def _extract_sections_basic(
|
||||
GDriveMimeType.PLAIN_TEXT.value,
|
||||
GDriveMimeType.MARKDOWN.value,
|
||||
]:
|
||||
text_data = (
|
||||
service.files().get_media(fileId=file["id"]).execute().decode("utf-8")
|
||||
)
|
||||
return [Section(link=link, text=text_data)]
|
||||
|
||||
return [
|
||||
Section(
|
||||
link=link,
|
||||
text=service.files()
|
||||
.get_media(fileId=file["id"])
|
||||
.execute()
|
||||
.decode("utf-8"),
|
||||
)
|
||||
]
|
||||
# ---------------------------
|
||||
# Word, PowerPoint, PDF files
|
||||
elif mime_type in [
|
||||
if mime_type in [
|
||||
GDriveMimeType.WORD_DOC.value,
|
||||
GDriveMimeType.POWERPOINT.value,
|
||||
GDriveMimeType.PDF.value,
|
||||
]:
|
||||
response_bytes = service.files().get_media(fileId=file["id"]).execute()
|
||||
|
||||
# Optionally use Unstructured
|
||||
response = service.files().get_media(fileId=file["id"]).execute()
|
||||
if get_unstructured_api_key():
|
||||
text = unstructured_to_text(
|
||||
file=io.BytesIO(response_bytes),
|
||||
file_name=file_name,
|
||||
)
|
||||
return [Section(link=link, text=text)]
|
||||
return [
|
||||
Section(
|
||||
link=link,
|
||||
text=unstructured_to_text(
|
||||
file=io.BytesIO(response),
|
||||
file_name=file.get("name", file["id"]),
|
||||
),
|
||||
)
|
||||
]
|
||||
|
||||
if mime_type == GDriveMimeType.WORD_DOC.value:
|
||||
# Use docx_to_text_and_images to get text plus embedded images
|
||||
text, embedded_images = docx_to_text_and_images(
|
||||
file=io.BytesIO(response_bytes),
|
||||
)
|
||||
sections = []
|
||||
if text.strip():
|
||||
sections.append(Section(link=link, text=text.strip()))
|
||||
|
||||
# Process each embedded image using the standardized function
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
for idx, (img_data, img_name) in enumerate(
|
||||
embedded_images, start=1
|
||||
):
|
||||
# Create a unique identifier for the embedded image
|
||||
embedded_id = f"{file['id']}_embedded_{idx}"
|
||||
|
||||
section, _ = store_image_and_create_section(
|
||||
db_session=db_session,
|
||||
image_data=img_data,
|
||||
file_name=embedded_id,
|
||||
display_name=img_name or f"{file_name} - image {idx}",
|
||||
llm=image_analysis_llm,
|
||||
file_origin=FileOrigin.CONNECTOR,
|
||||
)
|
||||
sections.append(section)
|
||||
return sections
|
||||
|
||||
return [
|
||||
Section(link=link, text=docx_to_text(file=io.BytesIO(response)))
|
||||
]
|
||||
elif mime_type == GDriveMimeType.PDF.value:
|
||||
text, _pdf_meta, images = read_pdf_file(io.BytesIO(response_bytes))
|
||||
text, _ = read_pdf_file(file=io.BytesIO(response))
|
||||
return [Section(link=link, text=text)]
|
||||
|
||||
elif mime_type == GDriveMimeType.POWERPOINT.value:
|
||||
text_data = pptx_to_text(io.BytesIO(response_bytes))
|
||||
return [Section(link=link, text=text_data)]
|
||||
return [
|
||||
Section(link=link, text=pptx_to_text(file=io.BytesIO(response)))
|
||||
]
|
||||
|
||||
# Catch-all case, should not happen since there should be specific handling
|
||||
# for each of the supported file types
|
||||
@@ -305,8 +231,7 @@ def _extract_sections_basic(
|
||||
logger.error(error_message)
|
||||
raise ValueError(error_message)
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Error extracting sections from file: {e}")
|
||||
except Exception:
|
||||
return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
|
||||
|
||||
|
||||
@@ -314,62 +239,74 @@ def convert_drive_item_to_document(
|
||||
file: GoogleDriveFileType,
|
||||
drive_service: GoogleDriveService,
|
||||
docs_service: GoogleDocsService,
|
||||
image_analysis_llm: LLM | None,
|
||||
) -> Document | None:
|
||||
"""
|
||||
Main entry point for converting a Google Drive file => Document object.
|
||||
Now we accept an optional `llm` to pass to `_extract_sections_basic`.
|
||||
"""
|
||||
try:
|
||||
# skip shortcuts or folders
|
||||
if file.get("mimeType") in [DRIVE_SHORTCUT_TYPE, DRIVE_FOLDER_TYPE]:
|
||||
logger.info("Skipping shortcut/folder.")
|
||||
# Skip files that are shortcuts
|
||||
if file.get("mimeType") == DRIVE_SHORTCUT_TYPE:
|
||||
logger.info("Ignoring Drive Shortcut Filetype")
|
||||
return None
|
||||
# Skip files that are folders
|
||||
if file.get("mimeType") == DRIVE_FOLDER_TYPE:
|
||||
logger.info("Ignoring Drive Folder Filetype")
|
||||
return None
|
||||
|
||||
# If it's a Google Doc, we might do advanced parsing
|
||||
sections: list[Section] = []
|
||||
|
||||
# Special handling for Google Docs to preserve structure, link
|
||||
# to headers
|
||||
if file.get("mimeType") == GDriveMimeType.DOC.value:
|
||||
try:
|
||||
# get_document_sections is the advanced approach for Google Docs
|
||||
sections = get_document_sections(docs_service, file["id"])
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Failed to pull google doc sections from '{file['name']}': {e}. "
|
||||
"Falling back to basic extraction."
|
||||
f"Ran into exception '{e}' when pulling sections from Google Doc '{file['name']}'."
|
||||
" Falling back to basic extraction."
|
||||
)
|
||||
|
||||
# If not a doc, or if we failed above, do our 'basic' approach
|
||||
# NOTE: this will run for either (1) the above failed or (2) the file is not a Google Doc
|
||||
if not sections:
|
||||
sections = _extract_sections_basic(file, drive_service, image_analysis_llm)
|
||||
try:
|
||||
# For all other file types just extract the text
|
||||
sections = _extract_sections_basic(file, drive_service)
|
||||
|
||||
except HttpError as e:
|
||||
reason = e.error_details[0]["reason"] if e.error_details else e.reason
|
||||
message = e.error_details[0]["message"] if e.error_details else e.reason
|
||||
if e.status_code == 403 and reason in ERRORS_TO_CONTINUE_ON:
|
||||
logger.warning(
|
||||
f"Could not export file '{file['name']}' due to '{message}', skipping..."
|
||||
)
|
||||
return None
|
||||
|
||||
raise
|
||||
if not sections:
|
||||
return None
|
||||
|
||||
doc_id = file["webViewLink"]
|
||||
updated_time = datetime.fromisoformat(file["modifiedTime"]).astimezone(
|
||||
timezone.utc
|
||||
)
|
||||
|
||||
return Document(
|
||||
id=doc_id,
|
||||
id=file["webViewLink"],
|
||||
sections=sections,
|
||||
source=DocumentSource.GOOGLE_DRIVE,
|
||||
semantic_identifier=file["name"],
|
||||
doc_updated_at=updated_time,
|
||||
metadata={}, # or any metadata from 'file'
|
||||
doc_updated_at=datetime.fromisoformat(file["modifiedTime"]).astimezone(
|
||||
timezone.utc
|
||||
),
|
||||
metadata={}
|
||||
if any(section.text for section in sections)
|
||||
else {IGNORE_FOR_QA: "True"},
|
||||
additional_info=file.get("id"),
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Error converting file '{file.get('name')}' to Document: {e}")
|
||||
if not CONTINUE_ON_CONNECTOR_FAILURE:
|
||||
raise
|
||||
raise e
|
||||
|
||||
logger.exception("Ran into exception when pulling a file from Google Drive")
|
||||
return None
|
||||
|
||||
|
||||
def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
|
||||
# Skip files that are folders or shortcuts
|
||||
if file.get("mimeType") in [DRIVE_FOLDER_TYPE, DRIVE_SHORTCUT_TYPE]:
|
||||
return None
|
||||
|
||||
return SlimDocument(
|
||||
id=file["webViewLink"],
|
||||
perm_sync_data={
|
||||
|
||||
@@ -28,8 +28,7 @@ class ConnectorMissingCredentialError(PermissionError):

class Section(BaseModel):
    text: str
    link: str | None = None
    image_file_name: str | None = None
    link: str | None


class BasicExpertInfo(BaseModel):

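# Illustrative sketch (not part of the diff): on the side of this change where
# Section carries an image_file_name field, a Section can describe either plain
# text or an image stored in the file store. The concrete values are invented.
text_section = Section(
    text="Quarterly revenue grew 12%.",
    link="https://example.com/report",
)
image_section = Section(
    text="Chart: revenue by quarter",  # e.g. an LLM-generated image summary
    link="https://example.com/report",
    image_file_name="file_connector__report.docx_embedded_1",
)
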
@@ -1,45 +0,0 @@
"""
Mixin for connectors that need vision capabilities.
"""
from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
from onyx.llm.factory import get_default_llm_with_vision
from onyx.llm.interfaces import LLM
from onyx.utils.logger import setup_logger

logger = setup_logger()


class VisionEnabledConnector:
    """
    Mixin for connectors that need vision capabilities.

    This mixin provides a standard way to initialize a vision-capable LLM
    for image analysis during indexing.

    Usage:
        class MyConnector(LoadConnector, VisionEnabledConnector):
            def __init__(self, ...):
                super().__init__(...)
                self.initialize_vision_llm()
    """

    def initialize_vision_llm(self) -> None:
        """
        Initialize a vision-capable LLM if enabled by configuration.

        Sets self.image_analysis_llm to the LLM instance or None if disabled.
        """
        self.image_analysis_llm: LLM | None = None
        if get_image_extraction_and_analysis_enabled():
            try:
                self.image_analysis_llm = get_default_llm_with_vision()
                if self.image_analysis_llm is None:
                    logger.warning(
                        "No LLM with vision found; image summarization will be disabled"
                    )
            except Exception as e:
                logger.warning(
                    f"Failed to initialize vision LLM due to an error: {str(e)}. "
                    "Image summarization will be disabled."
                )
                self.image_analysis_llm = None

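# Illustrative sketch (not part of the diff): a minimal connector following the
# usage shown in the mixin's docstring. "ToyConnector" and its method are invented
# for this example; the point is only that initialize_vision_llm() populates
# self.image_analysis_llm (an LLM instance, or None when vision is disabled).
class ToyConnector(VisionEnabledConnector):
    def __init__(self) -> None:
        self.initialize_vision_llm()

    def vision_status(self) -> str:
        return "enabled" if self.image_analysis_llm is not None else "disabled"
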
@@ -157,7 +157,6 @@ def get_internal_links(

def start_playwright() -> Tuple[Playwright, BrowserContext]:
    playwright = sync_playwright().start()

    browser = playwright.chromium.launch(headless=True)

    context = browser.new_context()
@@ -333,7 +332,7 @@ class WebConnector(LoadConnector):
        if initial_url.split(".")[-1] == "pdf":
            # PDF files are not checked for links
            response = requests.get(initial_url)
            page_text, metadata, images = read_pdf_file(
            page_text, metadata = read_pdf_file(
                file=io.BytesIO(response.content)
            )
            last_modified = response.headers.get("Last-Modified")

@@ -1,17 +1,12 @@
|
||||
import base64
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterator
|
||||
from typing import cast
|
||||
|
||||
import numpy
|
||||
from langchain_core.messages import BaseMessage
|
||||
from langchain_core.messages import HumanMessage
|
||||
from langchain_core.messages import SystemMessage
|
||||
|
||||
from onyx.chat.models import SectionRelevancePiece
|
||||
from onyx.configs.app_configs import BLURB_SIZE
|
||||
from onyx.configs.constants import RETURN_SEPARATOR
|
||||
from onyx.configs.llm_configs import get_search_time_image_analysis_enabled
|
||||
from onyx.configs.model_configs import CROSS_ENCODER_RANGE_MAX
|
||||
from onyx.configs.model_configs import CROSS_ENCODER_RANGE_MIN
|
||||
from onyx.context.search.enums import LLMEvaluationType
|
||||
@@ -23,15 +18,11 @@ from onyx.context.search.models import MAX_METRICS_CONTENT
|
||||
from onyx.context.search.models import RerankingDetails
|
||||
from onyx.context.search.models import RerankMetricsContainer
|
||||
from onyx.context.search.models import SearchQuery
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.document_index.document_index_utils import (
|
||||
translate_boost_count_to_multiplier,
|
||||
)
|
||||
from onyx.file_store.file_store import get_default_file_store
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.llm.utils import message_to_string
|
||||
from onyx.natural_language_processing.search_nlp_models import RerankingModel
|
||||
from onyx.prompts.image_analysis import IMAGE_ANALYSIS_SYSTEM_PROMPT
|
||||
from onyx.secondary_llm_flows.chunk_usefulness import llm_batch_eval_sections
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.threadpool_concurrency import FunctionCall
|
||||
@@ -39,124 +30,6 @@ from onyx.utils.threadpool_concurrency import run_functions_in_parallel
|
||||
from onyx.utils.timing import log_function_time
|
||||
|
||||
|
||||
def update_image_sections_with_query(
|
||||
sections: list[InferenceSection],
|
||||
query: str,
|
||||
llm: LLM,
|
||||
) -> None:
|
||||
"""
|
||||
For each chunk in each section that has an image URL, call an LLM to produce
|
||||
a new 'content' string that directly addresses the user's query about that image.
|
||||
This implementation uses parallel processing for efficiency.
|
||||
"""
|
||||
logger = setup_logger()
|
||||
logger.debug(f"Starting image section update with query: {query}")
|
||||
|
||||
chunks_with_images = []
|
||||
for section in sections:
|
||||
for chunk in section.chunks:
|
||||
if chunk.image_file_name:
|
||||
chunks_with_images.append(chunk)
|
||||
|
||||
if not chunks_with_images:
|
||||
logger.debug("No images to process in the sections")
|
||||
return # No images to process
|
||||
|
||||
logger.info(f"Found {len(chunks_with_images)} chunks with images to process")
|
||||
|
||||
def process_image_chunk(chunk: InferenceChunk) -> tuple[str, str]:
|
||||
try:
|
||||
logger.debug(
|
||||
f"Processing image chunk with ID: {chunk.unique_id}, image: {chunk.image_file_name}"
|
||||
)
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
file_record = get_default_file_store(db_session).read_file(
|
||||
cast(str, chunk.image_file_name), mode="b"
|
||||
)
|
||||
if not file_record:
|
||||
logger.error(f"Image file not found: {chunk.image_file_name}")
|
||||
raise Exception("File not found")
|
||||
file_content = file_record.read()
|
||||
image_base64 = base64.b64encode(file_content).decode()
|
||||
logger.debug(
|
||||
f"Successfully loaded image data for {chunk.image_file_name}"
|
||||
)
|
||||
|
||||
messages: list[BaseMessage] = [
|
||||
SystemMessage(content=IMAGE_ANALYSIS_SYSTEM_PROMPT),
|
||||
HumanMessage(
|
||||
content=[
|
||||
{
|
||||
"type": "text",
|
||||
"text": (
|
||||
f"The user's question is: '{query}'. "
|
||||
"Please analyze the following image in that context:\n"
|
||||
),
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{image_base64}",
|
||||
},
|
||||
},
|
||||
]
|
||||
),
|
||||
]
|
||||
|
||||
raw_response = llm.invoke(messages)
|
||||
|
||||
answer_text = message_to_string(raw_response).strip()
|
||||
return (
|
||||
chunk.unique_id,
|
||||
answer_text if answer_text else "No relevant info found.",
|
||||
)
|
||||
|
||||
except Exception:
|
||||
logger.exception(
|
||||
f"Error updating image section with query source image url: {chunk.image_file_name}"
|
||||
)
|
||||
return chunk.unique_id, "Error analyzing image."
|
||||
|
||||
image_processing_tasks = [
|
||||
FunctionCall(process_image_chunk, (chunk,)) for chunk in chunks_with_images
|
||||
]
|
||||
|
||||
logger.info(
|
||||
f"Starting parallel processing of {len(image_processing_tasks)} image tasks"
|
||||
)
|
||||
image_processing_results = run_functions_in_parallel(image_processing_tasks)
|
||||
logger.info(
|
||||
f"Completed parallel processing with {len(image_processing_results)} results"
|
||||
)
|
||||
|
||||
# Create a mapping of chunk IDs to their processed content
|
||||
chunk_id_to_content = {}
|
||||
success_count = 0
|
||||
for task_id, result in image_processing_results.items():
|
||||
if result:
|
||||
chunk_id, content = result
|
||||
chunk_id_to_content[chunk_id] = content
|
||||
success_count += 1
|
||||
else:
|
||||
logger.error(f"Task {task_id} failed to return a valid result")
|
||||
|
||||
logger.info(
|
||||
f"Successfully processed {success_count}/{len(image_processing_results)} images"
|
||||
)
|
||||
|
||||
# Update the chunks with the processed content
|
||||
updated_count = 0
|
||||
for section in sections:
|
||||
for chunk in section.chunks:
|
||||
if chunk.unique_id in chunk_id_to_content:
|
||||
chunk.content = chunk_id_to_content[chunk.unique_id]
|
||||
updated_count += 1
|
||||
|
||||
logger.info(
|
||||
f"Updated content for {updated_count} chunks with image analysis results"
|
||||
)
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@@ -413,10 +286,6 @@ def search_postprocessing(
|
||||
# NOTE: if we don't rerank, we can return the chunks immediately
|
||||
# since we know this is the final order.
|
||||
# This way the user experience isn't delayed by the LLM step
|
||||
if get_search_time_image_analysis_enabled():
|
||||
update_image_sections_with_query(
|
||||
retrieved_sections, search_query.query, llm
|
||||
)
|
||||
_log_top_section_links(search_query.search_type.value, retrieved_sections)
|
||||
yield retrieved_sections
|
||||
sections_yielded = True
|
||||
@@ -454,13 +323,6 @@ def search_postprocessing(
|
||||
)
|
||||
else:
|
||||
_log_top_section_links(search_query.search_type.value, reranked_sections)
|
||||
|
||||
# Add the image processing step here
|
||||
if get_search_time_image_analysis_enabled():
|
||||
update_image_sections_with_query(
|
||||
reranked_sections, search_query.query, llm
|
||||
)
|
||||
|
||||
yield reranked_sections
|
||||
|
||||
llm_selected_section_ids = (
|
||||
|
||||
@@ -148,28 +148,3 @@ def upsert_pgfilestore(
    db_session.commit()

    return pgfilestore


def save_bytes_to_pgfilestore(
    db_session: Session,
    raw_bytes: bytes,
    media_type: str,
    identifier: str,
    display_name: str,
    file_origin: FileOrigin = FileOrigin.OTHER,
) -> PGFileStore:
    """
    Saves raw bytes to PGFileStore and returns the resulting record.
    """
    file_name = f"{file_origin.name.lower()}_{identifier}"
    lobj_oid = create_populate_lobj(BytesIO(raw_bytes), db_session)
    pgfilestore = upsert_pgfilestore(
        file_name=file_name,
        display_name=display_name,
        file_origin=file_origin,
        file_type=media_type,
        lobj_oid=lobj_oid,
        db_session=db_session,
        commit=True,
    )
    return pgfilestore

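# Illustrative sketch (not part of the diff): persisting raw attachment bytes with
# the helper above. The identifier and display name are invented; the session
# helper is the one used elsewhere in this changeset.
from onyx.db.engine import get_session_with_current_tenant


def _store_attachment_bytes_sketch(raw_bytes: bytes) -> str:
    with get_session_with_current_tenant() as db_session:
        record = save_bytes_to_pgfilestore(
            db_session=db_session,
            raw_bytes=raw_bytes,
            media_type="application/pdf",
            identifier="attachment-12345",
            display_name="design-spec.pdf",
        )
    # The generated file_name (e.g. "other_attachment-12345" for FileOrigin.OTHER)
    # is what callers keep to reference the stored file later.
    return record.file_name
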
@@ -1,53 +0,0 @@
import random
from datetime import datetime
from datetime import timedelta

from onyx.configs.constants import MessageType
from onyx.db.chat import create_chat_session
from onyx.db.chat import create_new_chat_message
from onyx.db.chat import get_or_create_root_message
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.models import ChatSession


def seed_chat_history(num_sessions: int, num_messages: int, days: int) -> None:
    """Utility function to seed chat history for testing.

    num_sessions: the number of sessions to seed
    num_messages: the number of messages to seed per session
    days: the number of days looking backwards from the current time over which to randomize
        the times.
    """
    with get_session_with_current_tenant() as db_session:
        for y in range(0, num_sessions):
            create_chat_session(db_session, f"pytest_session_{y}", None, None)

        # randomize all session times
        rows = db_session.query(ChatSession).all()
        for row in rows:
            row.time_created = datetime.utcnow() - timedelta(
                days=random.randint(0, days)
            )
            row.time_updated = row.time_created + timedelta(
                minutes=random.randint(0, 10)
            )

            root_message = get_or_create_root_message(row.id, db_session)

            for x in range(0, num_messages):
                chat_message = create_new_chat_message(
                    row.id,
                    root_message,
                    f"pytest_message_{x}",
                    None,
                    0,
                    MessageType.USER,
                    db_session,
                )

                chat_message.time_sent = row.time_created + timedelta(
                    minutes=random.randint(0, 10)
                )
                db_session.commit()

        db_session.commit()

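# Illustrative sketch (not part of the diff): seeding a small amount of history for
# a local test run. The concrete numbers are invented for this example.
if __name__ == "__main__":
    # 5 sessions, 10 user messages per session, spread over the last 30 days
    seed_chat_history(num_sessions=5, num_messages=10, days=30)
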
@@ -55,9 +55,6 @@ schema DANSWER_CHUNK_NAME {
        field blurb type string {
            indexing: summary | attribute
        }
        field image_file_name type string {
            indexing: summary | attribute
        }
        # https://docs.vespa.ai/en/attributes.html potential enum store for speed, but probably not worth it
        field source_type type string {
            indexing: summary | attribute

@@ -31,7 +31,6 @@ from onyx.document_index.vespa_constants import DOC_UPDATED_AT
from onyx.document_index.vespa_constants import DOCUMENT_ID
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
from onyx.document_index.vespa_constants import HIDDEN
from onyx.document_index.vespa_constants import IMAGE_FILE_NAME
from onyx.document_index.vespa_constants import LARGE_CHUNK_REFERENCE_IDS
from onyx.document_index.vespa_constants import MAX_ID_SEARCH_QUERY_SIZE
from onyx.document_index.vespa_constants import MAX_OR_CONDITIONS
@@ -131,7 +130,6 @@ def _vespa_hit_to_inference_chunk(
        section_continuation=fields[SECTION_CONTINUATION],
        document_id=fields[DOCUMENT_ID],
        source_type=fields[SOURCE_TYPE],
        image_file_name=fields.get(IMAGE_FILE_NAME),
        title=fields.get(TITLE),
        semantic_identifier=fields[SEMANTIC_IDENTIFIER],
        boost=fields.get(BOOST, 1),
@@ -213,7 +211,6 @@ def _get_chunks_via_visit_api(

    # Check if the response contains any documents
    response_data = response.json()

    if "documents" in response_data:
        for document in response_data["documents"]:
            if filters.access_control_list:

@@ -32,7 +32,6 @@ from onyx.document_index.vespa_constants import DOCUMENT_ID
from onyx.document_index.vespa_constants import DOCUMENT_ID_ENDPOINT
from onyx.document_index.vespa_constants import DOCUMENT_SETS
from onyx.document_index.vespa_constants import EMBEDDINGS
from onyx.document_index.vespa_constants import IMAGE_FILE_NAME
from onyx.document_index.vespa_constants import LARGE_CHUNK_REFERENCE_IDS
from onyx.document_index.vespa_constants import METADATA
from onyx.document_index.vespa_constants import METADATA_LIST
@@ -199,13 +198,13 @@ def _index_vespa_chunk(
        # which only calls VespaIndex.update
        ACCESS_CONTROL_LIST: {acl_entry: 1 for acl_entry in chunk.access.to_acl()},
        DOCUMENT_SETS: {document_set: 1 for document_set in chunk.document_sets},
        IMAGE_FILE_NAME: chunk.image_file_name,
        BOOST: chunk.boost,
    }

    if multitenant:
        if chunk.tenant_id:
            vespa_document_fields[TENANT_ID] = chunk.tenant_id

    vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_chunk_id}"
    logger.debug(f'Indexing to URL "{vespa_url}"')
    res = http_client.post(

@@ -77,7 +77,6 @@ PRIMARY_OWNERS = "primary_owners"
SECONDARY_OWNERS = "secondary_owners"
RECENCY_BIAS = "recency_bias"
HIDDEN = "hidden"
IMAGE_FILE_NAME = "image_file_name"

# Specific to Vespa, needed for highlighting matching keywords / section
CONTENT_SUMMARY = "content_summary"
@@ -95,7 +94,6 @@ YQL_BASE = (
    f"{SEMANTIC_IDENTIFIER}, "
    f"{TITLE}, "
    f"{SECTION_CONTINUATION}, "
    f"{IMAGE_FILE_NAME}, "
    f"{BOOST}, "
    f"{HIDDEN}, "
    f"{DOC_UPDATED_AT}, "

@@ -9,17 +9,15 @@ from email.parser import Parser as EmailParser
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import Dict
|
||||
from typing import IO
|
||||
from typing import List
|
||||
from typing import Tuple
|
||||
|
||||
import chardet
|
||||
import docx # type: ignore
|
||||
import openpyxl # type: ignore
|
||||
import pptx # type: ignore
|
||||
from docx import Document as DocxDocument
|
||||
from docx import Document
|
||||
from fastapi import UploadFile
|
||||
from PIL import Image
|
||||
from pypdf import PdfReader
|
||||
from pypdf.errors import PdfStreamError
|
||||
|
||||
@@ -33,8 +31,10 @@ from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
TEXT_SECTION_SEPARATOR = "\n\n"
|
||||
|
||||
|
||||
PLAIN_TEXT_FILE_EXTENSIONS = [
|
||||
".txt",
|
||||
".md",
|
||||
@@ -49,6 +49,7 @@ PLAIN_TEXT_FILE_EXTENSIONS = [
|
||||
".yaml",
|
||||
]
|
||||
|
||||
|
||||
VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
|
||||
".pdf",
|
||||
".docx",
|
||||
@@ -57,16 +58,6 @@ VALID_FILE_EXTENSIONS = PLAIN_TEXT_FILE_EXTENSIONS + [
|
||||
".eml",
|
||||
".epub",
|
||||
".html",
|
||||
".png",
|
||||
".jpg",
|
||||
".jpeg",
|
||||
".webp",
|
||||
]
|
||||
|
||||
IMAGE_MEDIA_TYPES = [
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/webp",
|
||||
]
|
||||
|
||||
|
||||
@@ -76,13 +67,11 @@ def is_text_file_extension(file_name: str) -> bool:
|
||||
|
||||
def get_file_ext(file_path_or_name: str | Path) -> str:
|
||||
_, extension = os.path.splitext(file_path_or_name)
|
||||
# standardize all extensions to be lowercase so that checks against
|
||||
# VALID_FILE_EXTENSIONS and similar will work as intended
|
||||
return extension.lower()
|
||||
|
||||
|
||||
def is_valid_media_type(media_type: str) -> bool:
|
||||
return media_type in IMAGE_MEDIA_TYPES
|
||||
|
||||
|
||||
def is_valid_file_ext(ext: str) -> bool:
|
||||
return ext in VALID_FILE_EXTENSIONS
|
||||
|
||||
@@ -90,18 +79,17 @@ def is_valid_file_ext(ext: str) -> bool:
|
||||
def is_text_file(file: IO[bytes]) -> bool:
|
||||
"""
|
||||
checks if the first 1024 bytes only contain printable or whitespace characters
|
||||
if it does, then we say it's a plaintext file
|
||||
if it does, then we say its a plaintext file
|
||||
"""
|
||||
raw_data = file.read(1024)
|
||||
file.seek(0)
|
||||
text_chars = bytearray({7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F})
|
||||
return all(c in text_chars for c in raw_data)
|
||||
|
||||
|
||||
def detect_encoding(file: IO[bytes]) -> str:
|
||||
raw_data = file.read(50000)
|
||||
file.seek(0)
|
||||
encoding = chardet.detect(raw_data)["encoding"] or "utf-8"
|
||||
file.seek(0)
|
||||
return encoding
|
||||
|
||||
|
||||
@@ -111,14 +99,14 @@ def is_macos_resource_fork_file(file_name: str) -> bool:
|
||||
)
|
||||
|
||||
|
||||
# To include additional metadata in the search index, add a .onyx_metadata.json file
|
||||
# to the zip file. This file should contain a list of objects with the following format:
|
||||
# [{ "filename": "file1.txt", "link": "https://example.com/file1.txt" }]
|
||||
def load_files_from_zip(
|
||||
zip_file_io: IO,
|
||||
ignore_macos_resource_fork_files: bool = True,
|
||||
ignore_dirs: bool = True,
|
||||
) -> Iterator[tuple[zipfile.ZipInfo, IO[Any], dict[str, Any]]]:
|
||||
"""
|
||||
If there's a .onyx_metadata.json in the zip, attach those metadata to each subfile.
|
||||
"""
|
||||
with zipfile.ZipFile(zip_file_io, "r") as zip_file:
|
||||
zip_metadata = {}
|
||||
try:
|
||||
@@ -130,31 +118,24 @@ def load_files_from_zip(
|
||||
# convert list of dicts to dict of dicts
|
||||
zip_metadata = {d["filename"]: d for d in zip_metadata}
|
||||
except json.JSONDecodeError:
|
||||
logger.warning(f"Unable to load {DANSWER_METADATA_FILENAME}")
|
||||
logger.warn(f"Unable to load {DANSWER_METADATA_FILENAME}")
|
||||
except KeyError:
|
||||
logger.info(f"No {DANSWER_METADATA_FILENAME} file")
|
||||
|
||||
for file_info in zip_file.infolist():
|
||||
if ignore_dirs and file_info.is_dir():
|
||||
continue
|
||||
with zip_file.open(file_info.filename, "r") as file:
|
||||
if ignore_dirs and file_info.is_dir():
|
||||
continue
|
||||
|
||||
if (
|
||||
ignore_macos_resource_fork_files
|
||||
and is_macos_resource_fork_file(file_info.filename)
|
||||
) or file_info.filename == DANSWER_METADATA_FILENAME:
|
||||
continue
|
||||
|
||||
with zip_file.open(file_info.filename, "r") as subfile:
|
||||
yield file_info, subfile, zip_metadata.get(file_info.filename, {})
|
||||
if (
|
||||
ignore_macos_resource_fork_files
|
||||
and is_macos_resource_fork_file(file_info.filename)
|
||||
) or file_info.filename == DANSWER_METADATA_FILENAME:
|
||||
continue
|
||||
yield file_info, file, zip_metadata.get(file_info.filename, {})
|
||||
|
||||
|
||||
def _extract_onyx_metadata(line: str) -> dict | None:
|
||||
"""
|
||||
Example: first line has:
|
||||
<!-- DANSWER_METADATA={"title": "..."} -->
|
||||
or
|
||||
#DANSWER_METADATA={"title":"..."}
|
||||
"""
|
||||
html_comment_pattern = r"<!--\s*DANSWER_METADATA=\{(.*?)\}\s*-->"
|
||||
hashtag_pattern = r"#DANSWER_METADATA=\{(.*?)\}"
|
||||
|
||||
@@ -180,13 +161,9 @@ def read_text_file(
|
||||
errors: str = "replace",
|
||||
ignore_onyx_metadata: bool = True,
|
||||
) -> tuple[str, dict]:
|
||||
"""
|
||||
For plain text files. Optionally extracts Onyx metadata from the first line.
|
||||
"""
|
||||
metadata = {}
|
||||
file_content_raw = ""
|
||||
for ind, line in enumerate(file):
|
||||
# decode
|
||||
try:
|
||||
line = line.decode(encoding) if isinstance(line, bytes) else line
|
||||
except UnicodeDecodeError:
|
||||
@@ -196,132 +173,131 @@ def read_text_file(
|
||||
else line
|
||||
)
|
||||
|
||||
# optionally parse metadata in the first line
|
||||
if ind == 0 and not ignore_onyx_metadata:
|
||||
potential_meta = _extract_onyx_metadata(line)
|
||||
if potential_meta is not None:
|
||||
metadata = potential_meta
|
||||
continue
|
||||
|
||||
file_content_raw += line
|
||||
if ind == 0:
|
||||
metadata_or_none = (
|
||||
None if ignore_onyx_metadata else _extract_onyx_metadata(line)
|
||||
)
|
||||
if metadata_or_none is not None:
|
||||
metadata = metadata_or_none
|
||||
else:
|
||||
file_content_raw += line
|
||||
else:
|
||||
file_content_raw += line
|
||||
|
||||
return file_content_raw, metadata
|
||||
|
||||
|
||||
def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
    """
    Extract text from a PDF. For embedded images, a more complex approach is needed.
    This is a minimal approach returning text only.
    """
    text, _, _ = read_pdf_file(file, pdf_pass)
    """Extract text from a PDF file."""
    # Return only the extracted text from read_pdf_file
    text, _ = read_pdf_file(file, pdf_pass)
    return text

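A hedged usage sketch for this wrapper follows; the file path is hypothetical, and it relies on the text-only behavior shown above.

# Sketch only: extract text from an unencrypted PDF on disk.
with open("report.pdf", "rb") as f:  # hypothetical file
    text = pdf_to_text(f, pdf_pass=None)
print(text[:200])
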
def read_pdf_file(
|
||||
file: IO[Any], pdf_pass: str | None = None, extract_images: bool = False
|
||||
) -> tuple[str, dict, list[tuple[bytes, str]]]:
|
||||
"""
|
||||
Returns the text, basic PDF metadata, and optionally extracted images.
|
||||
"""
|
||||
metadata: dict[str, Any] = {}
|
||||
extracted_images: list[tuple[bytes, str]] = []
|
||||
file: IO[Any],
|
||||
pdf_pass: str | None = None,
|
||||
) -> tuple[str, dict]:
|
||||
metadata: Dict[str, Any] = {}
|
||||
try:
|
||||
pdf_reader = PdfReader(file)
|
||||
|
||||
# If marked as encrypted and a password is provided, try to decrypt
|
||||
if pdf_reader.is_encrypted and pdf_pass is not None:
|
||||
decrypt_success = False
|
||||
try:
|
||||
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
|
||||
except Exception:
|
||||
logger.error("Unable to decrypt pdf")
|
||||
if pdf_pass is not None:
|
||||
try:
|
||||
decrypt_success = pdf_reader.decrypt(pdf_pass) != 0
|
||||
except Exception:
|
||||
logger.error("Unable to decrypt pdf")
|
||||
|
||||
if not decrypt_success:
|
||||
return "", metadata, []
|
||||
# By user request, keep files that are unreadable just so they
|
||||
# can be discoverable by title.
|
||||
return "", metadata
|
||||
elif pdf_reader.is_encrypted:
|
||||
logger.warning("No Password for an encrypted PDF, returning empty text.")
|
||||
return "", metadata, []
|
||||
logger.warning("No Password available to decrypt pdf, returning empty")
|
||||
return "", metadata
|
||||
|
||||
# Basic PDF metadata
|
||||
# Extract metadata from the PDF, removing leading '/' from keys if present
|
||||
# This standardizes the metadata keys for consistency
|
||||
metadata = {}
|
||||
if pdf_reader.metadata is not None:
|
||||
for key, value in pdf_reader.metadata.items():
|
||||
clean_key = key.lstrip("/")
|
||||
if isinstance(value, str) and value.strip():
|
||||
metadata[clean_key] = value
|
||||
|
||||
elif isinstance(value, list) and all(
|
||||
isinstance(item, str) for item in value
|
||||
):
|
||||
metadata[clean_key] = ", ".join(value)
|
||||
|
||||
text = TEXT_SECTION_SEPARATOR.join(
|
||||
page.extract_text() for page in pdf_reader.pages
|
||||
return (
|
||||
TEXT_SECTION_SEPARATOR.join(
|
||||
page.extract_text() for page in pdf_reader.pages
|
||||
),
|
||||
metadata,
|
||||
)
|
||||
|
||||
if extract_images:
|
||||
for page_num, page in enumerate(pdf_reader.pages):
|
||||
for image_file_object in page.images:
|
||||
image = Image.open(io.BytesIO(image_file_object.data))
|
||||
img_byte_arr = io.BytesIO()
|
||||
image.save(img_byte_arr, format=image.format)
|
||||
img_bytes = img_byte_arr.getvalue()
|
||||
|
||||
image_name = (
|
||||
f"page_{page_num + 1}_image_{image_file_object.name}."
|
||||
f"{image.format.lower() if image.format else 'png'}"
|
||||
)
|
||||
extracted_images.append((img_bytes, image_name))
|
||||
|
||||
return text, metadata, extracted_images
|
||||
|
||||
except PdfStreamError:
|
||||
logger.exception("Invalid PDF file")
|
||||
logger.exception("PDF file is not a valid PDF")
|
||||
except Exception:
|
||||
logger.exception("Failed to read PDF")
|
||||
|
||||
return "", metadata, []
|
||||
# File is still discoverable by title
|
||||
# but the contents are not included as they cannot be parsed
|
||||
return "", metadata
|
||||
|
||||
|
||||
def docx_to_text_and_images(
|
||||
file: IO[Any],
|
||||
) -> Tuple[str, List[Tuple[bytes, str]]]:
|
||||
"""
|
||||
Extract text from a docx. If embed_images=True, also extract inline images.
|
||||
Return (text_content, list_of_images).
|
||||
"""
|
||||
def docx_to_text(file: IO[Any]) -> str:
|
||||
def is_simple_table(table: docx.table.Table) -> bool:
|
||||
for row in table.rows:
|
||||
# No omitted cells
|
||||
if row.grid_cols_before > 0 or row.grid_cols_after > 0:
|
||||
return False
|
||||
|
||||
# No nested tables
|
||||
if any(cell.tables for cell in row.cells):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def extract_cell_text(cell: docx.table._Cell) -> str:
|
||||
cell_paragraphs = [para.text.strip() for para in cell.paragraphs]
|
||||
return " ".join(p for p in cell_paragraphs if p) or "N/A"
|
||||
|
||||
paragraphs = []
|
||||
embedded_images: List[Tuple[bytes, str]] = []
|
||||
|
||||
doc = docx.Document(file)
|
||||
for item in doc.iter_inner_content():
|
||||
if isinstance(item, docx.text.paragraph.Paragraph):
|
||||
paragraphs.append(item.text)
|
||||
|
||||
# Grab text from paragraphs
|
||||
for paragraph in doc.paragraphs:
|
||||
paragraphs.append(paragraph.text)
|
||||
elif isinstance(item, docx.table.Table):
|
||||
if not item.rows or not is_simple_table(item):
|
||||
continue
|
||||
|
||||
# Reset position so we can re-load the doc (python-docx has read the stream)
|
||||
# Note: if python-docx has fully consumed the stream, you may need to open it again from memory.
|
||||
# For large docs, a more robust approach is needed.
|
||||
# This is a simplified example.
|
||||
# Every row is a new line, joined with a single newline
|
||||
table_content = "\n".join(
|
||||
[
|
||||
",\t".join(extract_cell_text(cell) for cell in row.cells)
|
||||
for row in item.rows
|
||||
]
|
||||
)
|
||||
paragraphs.append(table_content)
|
||||
|
||||
for rel_id, rel in doc.part.rels.items():
|
||||
if "image" in rel.reltype:
|
||||
# image is typically in rel.target_part.blob
|
||||
image_bytes = rel.target_part.blob
|
||||
image_name = rel.target_part.partname
|
||||
# store
|
||||
embedded_images.append((image_bytes, os.path.basename(str(image_name))))
|
||||
|
||||
text_content = "\n".join(paragraphs)
|
||||
return text_content, embedded_images
|
||||
# Docx already has good spacing between paragraphs
|
||||
return "\n".join(paragraphs)
|
||||
|
||||
|
||||
def pptx_to_text(file: IO[Any]) -> str:
|
||||
presentation = pptx.Presentation(file)
|
||||
text_content = []
|
||||
for slide_number, slide in enumerate(presentation.slides, start=1):
|
||||
slide_text = f"\nSlide {slide_number}:\n"
|
||||
extracted_text = f"\nSlide {slide_number}:\n"
|
||||
for shape in slide.shapes:
|
||||
if hasattr(shape, "text"):
|
||||
slide_text += shape.text + "\n"
|
||||
text_content.append(slide_text)
|
||||
extracted_text += shape.text + "\n"
|
||||
text_content.append(extracted_text)
|
||||
return TEXT_SECTION_SEPARATOR.join(text_content)
|
||||
|
||||
|
||||
@@ -329,21 +305,18 @@ def xlsx_to_text(file: IO[Any]) -> str:
|
||||
workbook = openpyxl.load_workbook(file, read_only=True)
|
||||
text_content = []
|
||||
for sheet in workbook.worksheets:
|
||||
rows = []
|
||||
for row in sheet.iter_rows(min_row=1, values_only=True):
|
||||
row_str = ",".join(str(cell) if cell is not None else "" for cell in row)
|
||||
rows.append(row_str)
|
||||
sheet_str = "\n".join(rows)
|
||||
text_content.append(sheet_str)
|
||||
sheet_string = "\n".join(
|
||||
",".join(map(str, row))
|
||||
for row in sheet.iter_rows(min_row=1, values_only=True)
|
||||
)
|
||||
text_content.append(sheet_string)
|
||||
return TEXT_SECTION_SEPARATOR.join(text_content)
|
||||
|
||||
|
||||
def eml_to_text(file: IO[Any]) -> str:
|
||||
encoding = detect_encoding(file)
|
||||
text_file = io.TextIOWrapper(file, encoding=encoding)
|
||||
text_file = io.TextIOWrapper(file, encoding=detect_encoding(file))
|
||||
parser = EmailParser()
|
||||
message = parser.parse(text_file)
|
||||
|
||||
text_content = []
|
||||
for part in message.walk():
|
||||
if part.get_content_type().startswith("text/plain"):
|
||||
@@ -369,8 +342,8 @@ def epub_to_text(file: IO[Any]) -> str:
|
||||
|
||||
def file_io_to_text(file: IO[Any]) -> str:
|
||||
encoding = detect_encoding(file)
|
||||
file_content, _ = read_text_file(file, encoding=encoding)
|
||||
return file_content
|
||||
file_content_raw, _ = read_text_file(file, encoding=encoding)
|
||||
return file_content_raw
|
||||
|
||||
|
||||
def extract_file_text(
|
||||
@@ -379,13 +352,9 @@ def extract_file_text(
|
||||
break_on_unprocessable: bool = True,
|
||||
extension: str | None = None,
|
||||
) -> str:
|
||||
"""
|
||||
Legacy function that returns *only text*, ignoring embedded images.
|
||||
For backward-compatibility in code that only wants text.
|
||||
"""
|
||||
extension_to_function: dict[str, Callable[[IO[Any]], str]] = {
|
||||
".pdf": pdf_to_text,
|
||||
".docx": lambda f: docx_to_text_and_images(f)[0], # no images
|
||||
".docx": docx_to_text,
|
||||
".pptx": pptx_to_text,
|
||||
".xlsx": xlsx_to_text,
|
||||
".eml": eml_to_text,
|
||||
@@ -399,23 +368,24 @@ def extract_file_text(
|
||||
return unstructured_to_text(file, file_name)
|
||||
except Exception as unstructured_error:
|
||||
logger.error(
|
||||
f"Failed to process with Unstructured: {str(unstructured_error)}. "
|
||||
"Falling back to normal processing."
|
||||
f"Failed to process with Unstructured: {str(unstructured_error)}. Falling back to normal processing."
|
||||
)
|
||||
if extension is None:
|
||||
extension = get_file_ext(file_name)
|
||||
# Fall through to normal processing
|
||||
final_extension: str
|
||||
if file_name or extension:
|
||||
if extension is not None:
|
||||
final_extension = extension
|
||||
elif file_name is not None:
|
||||
final_extension = get_file_ext(file_name)
|
||||
|
||||
if is_valid_file_ext(extension):
|
||||
func = extension_to_function.get(extension, file_io_to_text)
|
||||
file.seek(0)
|
||||
return func(file)
|
||||
if is_valid_file_ext(final_extension):
|
||||
return extension_to_function.get(final_extension, file_io_to_text)(file)
|
||||
|
||||
# If unknown extension, maybe it's a text file
|
||||
file.seek(0)
|
||||
# Either the file somehow has no name or the extension is not one that we recognize
|
||||
if is_text_file(file):
|
||||
return file_io_to_text(file)
|
||||
|
||||
raise ValueError("Unknown file extension or not recognized as text data")
|
||||
raise ValueError("Unknown file extension and unknown text encoding")
|
||||
|
||||
except Exception as e:
|
||||
if break_on_unprocessable:
|
||||
@@ -426,93 +396,20 @@ def extract_file_text(
|
||||
return ""
|
||||
|
||||
|
||||
def extract_text_and_images(
|
||||
file: IO[Any],
|
||||
file_name: str,
|
||||
pdf_pass: str | None = None,
|
||||
) -> Tuple[str, List[Tuple[bytes, str]]]:
|
||||
"""
|
||||
Primary new function for the updated connector.
|
||||
Returns (text_content, [(embedded_img_bytes, embedded_img_name), ...]).
|
||||
"""
|
||||
|
||||
try:
|
||||
# Attempt unstructured if env var is set
|
||||
if get_unstructured_api_key():
|
||||
# If the user doesn't want embedded images, unstructured is fine
|
||||
file.seek(0)
|
||||
text_content = unstructured_to_text(file, file_name)
|
||||
return (text_content, [])
|
||||
|
||||
extension = get_file_ext(file_name)
|
||||
|
||||
# docx example for embedded images
|
||||
if extension == ".docx":
|
||||
file.seek(0)
|
||||
text_content, images = docx_to_text_and_images(file)
|
||||
return (text_content, images)
|
||||
|
||||
# PDF example: we do not show complicated PDF image extraction here
|
||||
# so we simply extract text for now and skip images.
|
||||
if extension == ".pdf":
|
||||
file.seek(0)
|
||||
text_content, _, images = read_pdf_file(file, pdf_pass, extract_images=True)
|
||||
return (text_content, images)
|
||||
|
||||
# For PPTX, XLSX, EML, etc., we do not show embedded image logic here.
|
||||
# You can do something similar to docx if needed.
|
||||
if extension == ".pptx":
|
||||
file.seek(0)
|
||||
return (pptx_to_text(file), [])
|
||||
|
||||
if extension == ".xlsx":
|
||||
file.seek(0)
|
||||
return (xlsx_to_text(file), [])
|
||||
|
||||
if extension == ".eml":
|
||||
file.seek(0)
|
||||
return (eml_to_text(file), [])
|
||||
|
||||
if extension == ".epub":
|
||||
file.seek(0)
|
||||
return (epub_to_text(file), [])
|
||||
|
||||
if extension == ".html":
|
||||
file.seek(0)
|
||||
return (parse_html_page_basic(file), [])
|
||||
|
||||
# If we reach here and it's a recognized text extension
|
||||
if is_text_file_extension(file_name):
|
||||
file.seek(0)
|
||||
encoding = detect_encoding(file)
|
||||
text_content_raw, _ = read_text_file(
|
||||
file, encoding=encoding, ignore_onyx_metadata=False
|
||||
)
|
||||
return (text_content_raw, [])
|
||||
|
||||
# If it's an image file or something else, we do not parse embedded images from them
|
||||
# just return empty text
|
||||
file.seek(0)
|
||||
return ("", [])
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Failed to extract text/images from {file_name}: {e}")
|
||||
return ("", [])
|
||||
|
||||
|
||||
def convert_docx_to_txt(
|
||||
file: UploadFile, file_store: FileStore, file_path: str
|
||||
) -> None:
|
||||
"""
|
||||
Helper to convert docx to a .txt file in the same filestore.
|
||||
"""
|
||||
file.file.seek(0)
|
||||
docx_content = file.file.read()
|
||||
doc = DocxDocument(BytesIO(docx_content))
|
||||
doc = Document(BytesIO(docx_content))
|
||||
|
||||
# Extract text from the document
|
||||
all_paras = [p.text for p in doc.paragraphs]
|
||||
text_content = "\n".join(all_paras)
|
||||
full_text = []
|
||||
for para in doc.paragraphs:
|
||||
full_text.append(para.text)
|
||||
|
||||
# Join the extracted text
|
||||
text_content = "\n".join(full_text)
|
||||
|
||||
txt_file_path = docx_to_txt_filename(file_path)
|
||||
file_store.save_file(
|
||||
@@ -525,4 +422,7 @@ def convert_docx_to_txt(
|
||||
|
||||
|
||||
def docx_to_txt_filename(file_path: str) -> str:
|
||||
"""
|
||||
Convert a .docx file path to its corresponding .txt file path.
|
||||
"""
|
||||
return file_path.rsplit(".", 1)[0] + ".txt"
|
||||
|
||||
@@ -1,46 +0,0 @@
"""
Centralized file type validation utilities.
"""
# Standard image MIME types supported by most vision LLMs
IMAGE_MIME_TYPES = [
    "image/png",
    "image/jpeg",
    "image/jpg",
    "image/webp",
]

# Image types that should be excluded from processing
EXCLUDED_IMAGE_TYPES = [
    "image/bmp",
    "image/tiff",
    "image/gif",
    "image/svg+xml",
]


def is_valid_image_type(mime_type: str) -> bool:
    """
    Check if mime_type is a valid image type.

    Args:
        mime_type: The MIME type to check

    Returns:
        True if the MIME type is a valid image type, False otherwise
    """
    if not mime_type:
        return False
    return mime_type.startswith("image/") and mime_type not in EXCLUDED_IMAGE_TYPES


def is_supported_by_vision_llm(mime_type: str) -> bool:
    """
    Check if this image type can be processed by vision LLMs.

    Args:
        mime_type: The MIME type to check

    Returns:
        True if the MIME type is supported by vision LLMs, False otherwise
    """
    return mime_type in IMAGE_MIME_TYPES

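For quick reference, a minimal illustration of what these helpers return for a few of the MIME types listed above:

# Sketch only: expected results given the constant lists above.
assert is_valid_image_type("image/png") is True
assert is_valid_image_type("image/gif") is False          # explicitly excluded
assert is_supported_by_vision_llm("image/webp") is True
assert is_supported_by_vision_llm("image/tiff") is False  # not in IMAGE_MIME_TYPES
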
@@ -1,129 +0,0 @@
|
||||
import base64
|
||||
from io import BytesIO
|
||||
|
||||
from langchain_core.messages import BaseMessage
|
||||
from langchain_core.messages import HumanMessage
|
||||
from langchain_core.messages import SystemMessage
|
||||
from PIL import Image
|
||||
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.llm.utils import message_to_string
|
||||
from onyx.prompts.image_analysis import IMAGE_SUMMARIZATION_SYSTEM_PROMPT
|
||||
from onyx.prompts.image_analysis import IMAGE_SUMMARIZATION_USER_PROMPT
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def prepare_image_bytes(image_data: bytes) -> str:
|
||||
"""Prepare image bytes for summarization.
|
||||
Resizes image if it's larger than 20MB. Encodes image as a base64 string."""
|
||||
image_data = _resize_image_if_needed(image_data)
|
||||
|
||||
# encode image (base64)
|
||||
encoded_image = _encode_image_for_llm_prompt(image_data)
|
||||
|
||||
return encoded_image
|
||||
|
||||
|
||||
def summarize_image_pipeline(
|
||||
llm: LLM,
|
||||
image_data: bytes,
|
||||
query: str | None = None,
|
||||
system_prompt: str | None = None,
|
||||
) -> str:
|
||||
"""Pipeline to generate a summary of an image.
|
||||
Resizes images if it is bigger than 20MB. Encodes image as a base64 string.
|
||||
And finally uses the Default LLM to generate a textual summary of the image."""
|
||||
# resize image if it's bigger than 20MB
|
||||
encoded_image = prepare_image_bytes(image_data)
|
||||
|
||||
summary = _summarize_image(
|
||||
encoded_image,
|
||||
llm,
|
||||
query,
|
||||
system_prompt,
|
||||
)
|
||||
|
||||
return summary
|
||||
|
||||
|
||||
def summarize_image_with_error_handling(
|
||||
llm: LLM | None,
|
||||
image_data: bytes,
|
||||
context_name: str,
|
||||
system_prompt: str = IMAGE_SUMMARIZATION_SYSTEM_PROMPT,
|
||||
user_prompt_template: str = IMAGE_SUMMARIZATION_USER_PROMPT,
|
||||
) -> str | None:
|
||||
"""Wrapper function that handles error cases and configuration consistently.
|
||||
|
||||
Args:
|
||||
llm: The LLM with vision capabilities to use for summarization
|
||||
image_data: The raw image bytes
|
||||
context_name: Name or title of the image for context
|
||||
system_prompt: System prompt to use for the LLM
|
||||
user_prompt_template: Template for the user prompt, should contain {title} placeholder
|
||||
|
||||
Returns:
|
||||
The image summary text, or None if summarization failed or is disabled
|
||||
"""
|
||||
if llm is None:
|
||||
return None
|
||||
|
||||
user_prompt = user_prompt_template.format(title=context_name)
|
||||
return summarize_image_pipeline(llm, image_data, user_prompt, system_prompt)
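
A hedged sketch of how this wrapper might be called; the file name is hypothetical, `llm` stands for any vision-capable onyx LLM handle, and the import path is taken from elsewhere in this diff.

# Sketch only: summarize raw image bytes with a vision-capable LLM.
from onyx.file_processing.image_summarization import summarize_image_with_error_handling

with open("diagram.png", "rb") as f:  # hypothetical file
    image_bytes = f.read()

summary = summarize_image_with_error_handling(
    llm=llm,                     # assumed: an LLM instance with vision support
    image_data=image_bytes,
    context_name="diagram.png",
)
print(summary)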
|
||||
|
||||
|
||||
def _summarize_image(
|
||||
encoded_image: str,
|
||||
llm: LLM,
|
||||
query: str | None = None,
|
||||
system_prompt: str | None = None,
|
||||
) -> str:
|
||||
"""Use default LLM (if it is multimodal) to generate a summary of an image."""
|
||||
|
||||
messages: list[BaseMessage] = []
|
||||
|
||||
if system_prompt:
|
||||
messages.append(SystemMessage(content=system_prompt))
|
||||
|
||||
messages.append(
|
||||
HumanMessage(
|
||||
content=[
|
||||
{"type": "text", "text": query},
|
||||
{"type": "image_url", "image_url": {"url": encoded_image}},
|
||||
],
|
||||
),
|
||||
)
|
||||
|
||||
try:
|
||||
return message_to_string(llm.invoke(messages))
|
||||
|
||||
except Exception as e:
|
||||
raise ValueError(f"Summarization failed. Messages: {messages}") from e
|
||||
|
||||
|
||||
def _encode_image_for_llm_prompt(image_data: bytes) -> str:
|
||||
"""Getting the base64 string."""
|
||||
base64_encoded_data = base64.b64encode(image_data).decode("utf-8")
|
||||
|
||||
return f"data:image/jpeg;base64,{base64_encoded_data}"
|
||||
|
||||
|
||||
def _resize_image_if_needed(image_data: bytes, max_size_mb: int = 20) -> bytes:
|
||||
"""Resize image if it's larger than the specified max size in MB."""
|
||||
max_size_bytes = max_size_mb * 1024 * 1024
|
||||
|
||||
if len(image_data) > max_size_bytes:
|
||||
with Image.open(BytesIO(image_data)) as img:
|
||||
# Reduce dimensions for better size reduction
|
||||
img.thumbnail((1024, 1024), Image.Resampling.LANCZOS)
|
||||
output = BytesIO()
|
||||
|
||||
# Save with lower quality for compression
|
||||
img.save(output, format="JPEG", quality=85)
|
||||
resized_data = output.getvalue()
|
||||
|
||||
return resized_data
|
||||
|
||||
return image_data
|
||||
@@ -1,70 +0,0 @@
|
||||
from typing import Tuple
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
|
||||
from onyx.configs.constants import FileOrigin
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.db.pg_file_store import save_bytes_to_pgfilestore
|
||||
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def store_image_and_create_section(
|
||||
db_session: Session,
|
||||
image_data: bytes,
|
||||
file_name: str,
|
||||
display_name: str,
|
||||
media_type: str = "image/unknown",
|
||||
llm: LLM | None = None,
|
||||
file_origin: FileOrigin = FileOrigin.OTHER,
|
||||
) -> Tuple[Section, str | None]:
|
||||
"""
|
||||
Stores an image in PGFileStore and creates a Section object with optional summarization.
|
||||
|
||||
Args:
|
||||
db_session: Database session
|
||||
image_data: Raw image bytes
|
||||
file_name: Base identifier for the file
|
||||
display_name: Human-readable name for the image
|
||||
media_type: MIME type of the image
|
||||
llm: Optional LLM with vision capabilities for summarization
|
||||
file_origin: Origin of the file (e.g., CONFLUENCE, GOOGLE_DRIVE, etc.)
|
||||
|
||||
Returns:
|
||||
Tuple containing:
|
||||
- Section object with image reference and optional summary text
|
||||
- The file_name in PGFileStore or None if storage failed
|
||||
"""
|
||||
# Storage logic
|
||||
stored_file_name = None
|
||||
try:
|
||||
pgfilestore = save_bytes_to_pgfilestore(
|
||||
db_session=db_session,
|
||||
raw_bytes=image_data,
|
||||
media_type=media_type,
|
||||
identifier=file_name,
|
||||
display_name=display_name,
|
||||
file_origin=file_origin,
|
||||
)
|
||||
stored_file_name = pgfilestore.file_name
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to store image: {e}")
|
||||
if not CONTINUE_ON_CONNECTOR_FAILURE:
|
||||
raise
|
||||
return Section(text=""), None
|
||||
|
||||
# Summarization logic
|
||||
summary_text = ""
|
||||
if llm:
|
||||
summary_text = (
|
||||
summarize_image_with_error_handling(llm, image_data, display_name) or ""
|
||||
)
|
||||
|
||||
return (
|
||||
Section(text=summary_text, image_file_name=stored_file_name),
|
||||
stored_file_name,
|
||||
)
|
||||
@@ -23,9 +23,12 @@ from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
|
||||
CHUNK_OVERLAP = 0
|
||||
# Fairly arbitrary numbers but the general concept is we don't want the title/metadata to
|
||||
# overwhelm the actual contents of the chunk
|
||||
# For example in a rare case, this could be 128 tokens for the 512 chunk and title prefix
|
||||
# could be another 128 tokens leaving 256 for the actual contents
|
||||
MAX_METADATA_PERCENTAGE = 0.25
|
||||
CHUNK_MIN_CONTENT = 256
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
@@ -33,8 +36,16 @@ def _get_metadata_suffix_for_document_index(
    metadata: dict[str, str | list[str]], include_separator: bool = False
) -> tuple[str, str]:
    """
    Returns the metadata as a natural language string representation with all of the keys and values
    for the vector embedding and a string of all of the values for the keyword search.
    Returns the metadata as a natural language string representation with all of the keys and values for the vector embedding
    and a string of all of the values for the keyword search

    For example, if we have the following metadata:
    {
        "author": "John Doe",
        "space": "Engineering"
    }
    The vector embedding string should include the relation between the key and value whereas for keyword we only want John Doe
    and Engineering. The keys are repeated and much more noisy.
    """
    if not metadata:
        return "", ""

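To make that split concrete, a hedged illustration of the two return values for the example metadata above; the exact separators are defined further down in the function and are not shown in this excerpt.

metadata = {"author": "John Doe", "space": "Engineering"}
# semantic (vector-embedding) form keeps the key-value relation, roughly:
#   "author: John Doe, space: Engineering"
# keyword form keeps only the values:
#   "John Doe Engineering"
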
@@ -63,17 +74,12 @@ def _get_metadata_suffix_for_document_index(
|
||||
|
||||
|
||||
def _combine_chunks(chunks: list[DocAwareChunk], large_chunk_id: int) -> DocAwareChunk:
|
||||
"""
|
||||
Combines multiple DocAwareChunks into one large chunk (for “multipass” mode),
|
||||
appending the content and adjusting source_links accordingly.
|
||||
"""
|
||||
merged_chunk = DocAwareChunk(
|
||||
source_document=chunks[0].source_document,
|
||||
chunk_id=chunks[0].chunk_id,
|
||||
blurb=chunks[0].blurb,
|
||||
content=chunks[0].content,
|
||||
source_links=chunks[0].source_links or {},
|
||||
image_file_name=None,
|
||||
section_continuation=(chunks[0].chunk_id > 0),
|
||||
title_prefix=chunks[0].title_prefix,
|
||||
metadata_suffix_semantic=chunks[0].metadata_suffix_semantic,
|
||||
@@ -97,9 +103,6 @@ def _combine_chunks(chunks: list[DocAwareChunk], large_chunk_id: int) -> DocAwar
|
||||
|
||||
|
||||
def generate_large_chunks(chunks: list[DocAwareChunk]) -> list[DocAwareChunk]:
|
||||
"""
|
||||
Generates larger “grouped” chunks by combining sets of smaller chunks.
|
||||
"""
|
||||
large_chunks = []
|
||||
for idx, i in enumerate(range(0, len(chunks), LARGE_CHUNK_RATIO)):
|
||||
chunk_group = chunks[i : i + LARGE_CHUNK_RATIO]
|
||||
@@ -169,60 +172,23 @@ class Chunker:
        while start < total_tokens:
            end = min(start + content_token_limit, total_tokens)
            token_chunk = tokens[start:end]
            # Join the tokens to reconstruct the text
            chunk_text = " ".join(token_chunk)
            chunks.append(chunk_text)
            start = end
        return chunks
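A small self-contained illustration of the windowing above; the token strings are made up, while the real code operates on tokenizer output.

# Sketch only: with a limit of 4 and 10 tokens, the loop yields three windows.
tokens = ["t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9"]
content_token_limit = 4
chunks = []
start = 0
while start < len(tokens):
    end = min(start + content_token_limit, len(tokens))
    chunks.append(" ".join(tokens[start:end]))
    start = end
print(chunks)  # ['t0 t1 t2 t3', 't4 t5 t6 t7', 't8 t9']
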
|
||||
def _extract_blurb(self, text: str) -> str:
|
||||
"""
|
||||
Extract a short blurb from the text (first chunk of size `blurb_size`).
|
||||
"""
|
||||
texts = self.blurb_splitter.split_text(text)
|
||||
if not texts:
|
||||
return ""
|
||||
return texts[0]
|
||||
|
||||
def _get_mini_chunk_texts(self, chunk_text: str) -> list[str] | None:
|
||||
"""
|
||||
For “multipass” mode: additional sub-chunks (mini-chunks) for use in certain embeddings.
|
||||
"""
|
||||
if self.mini_chunk_splitter and chunk_text.strip():
|
||||
return self.mini_chunk_splitter.split_text(chunk_text)
|
||||
return None
|
||||
|
||||
# ADDED: extra param image_url to store in the chunk
|
||||
def _create_chunk(
|
||||
self,
|
||||
document: Document,
|
||||
chunks_list: list[DocAwareChunk],
|
||||
text: str,
|
||||
links: dict[int, str],
|
||||
is_continuation: bool = False,
|
||||
title_prefix: str = "",
|
||||
metadata_suffix_semantic: str = "",
|
||||
metadata_suffix_keyword: str = "",
|
||||
image_file_name: str | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Helper to create a new DocAwareChunk, append it to chunks_list.
|
||||
"""
|
||||
new_chunk = DocAwareChunk(
|
||||
source_document=document,
|
||||
chunk_id=len(chunks_list),
|
||||
blurb=self._extract_blurb(text),
|
||||
content=text,
|
||||
source_links=links or {0: ""},
|
||||
image_file_name=image_file_name,
|
||||
section_continuation=is_continuation,
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
metadata_suffix_keyword=metadata_suffix_keyword,
|
||||
mini_chunk_texts=self._get_mini_chunk_texts(text),
|
||||
large_chunk_id=None,
|
||||
)
|
||||
chunks_list.append(new_chunk)
|
||||
|
||||
def _chunk_document(
|
||||
self,
|
||||
document: Document,
|
||||
@@ -232,156 +198,122 @@ class Chunker:
|
||||
content_token_limit: int,
|
||||
) -> list[DocAwareChunk]:
|
||||
"""
|
||||
Loops through sections of the document, converting them into one or more chunks.
|
||||
If a section has an image_link, we treat it as a dedicated chunk.
|
||||
Loops through sections of the document, adds metadata and converts them into chunks.
|
||||
"""
|
||||
|
||||
chunks: list[DocAwareChunk] = []
|
||||
link_offsets: dict[int, str] = {}
|
||||
chunk_text = ""
|
||||
|
||||
def _create_chunk(
|
||||
text: str,
|
||||
links: dict[int, str],
|
||||
is_continuation: bool = False,
|
||||
) -> DocAwareChunk:
|
||||
return DocAwareChunk(
|
||||
source_document=document,
|
||||
chunk_id=len(chunks),
|
||||
blurb=self._extract_blurb(text),
|
||||
content=text,
|
||||
source_links=links or {0: ""},
|
||||
section_continuation=is_continuation,
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
metadata_suffix_keyword=metadata_suffix_keyword,
|
||||
mini_chunk_texts=self._get_mini_chunk_texts(text),
|
||||
large_chunk_id=None,
|
||||
)
|
||||
|
||||
section_link_text: str
|
||||
|
||||
for section_idx, section in enumerate(document.sections):
|
||||
section_text = clean_text(section.text)
|
||||
section_link_text = section.link or ""
|
||||
# ADDED: if the Section has an image link
|
||||
image_url = section.image_file_name
|
||||
|
||||
# If there is no useful content, skip
|
||||
# If there is no useful content, not even the title, just drop it
|
||||
if not section_text and (not document.title or section_idx > 0):
|
||||
# If a section is empty and the document has no title, we can just drop it. We return a list of
|
||||
# DocAwareChunks where each one contains the necessary information needed down the line for indexing.
|
||||
# There is no concern about dropping whole documents from this list, it should not cause any indexing failures.
|
||||
logger.warning(
|
||||
f"Skipping empty or irrelevant section in doc "
|
||||
f"{document.semantic_identifier}, link={section_link_text}"
|
||||
f"Skipping section {section.text} from document "
|
||||
f"{document.semantic_identifier} due to empty text after cleaning "
|
||||
f"with link {section_link_text}"
|
||||
)
|
||||
continue
|
||||
|
||||
# CASE 1: If this is an image section, force a separate chunk
|
||||
if image_url:
|
||||
# First, if we have any partially built text chunk, finalize it
|
||||
if chunk_text.strip():
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
chunk_text,
|
||||
link_offsets,
|
||||
is_continuation=False,
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
metadata_suffix_keyword=metadata_suffix_keyword,
|
||||
)
|
||||
chunk_text = ""
|
||||
link_offsets = {}
|
||||
|
||||
# Create a chunk specifically for this image
|
||||
# (If the section has text describing the image, use that as content)
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
section_text,
|
||||
links={0: section_link_text}
|
||||
if section_link_text
|
||||
else {}, # No text offsets needed for images
|
||||
image_file_name=image_url,
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
metadata_suffix_keyword=metadata_suffix_keyword,
|
||||
)
|
||||
# Continue to next section
|
||||
continue
|
||||
|
||||
# CASE 2: Normal text section
|
||||
section_token_count = len(self.tokenizer.tokenize(section_text))
|
||||
|
||||
# If the section is large on its own, split it separately
|
||||
# Large sections are considered self-contained/unique
|
||||
# Therefore, they start a new chunk and are not concatenated
|
||||
# at the end by other sections
|
||||
if section_token_count > content_token_limit:
|
||||
if chunk_text.strip():
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
chunk_text,
|
||||
link_offsets,
|
||||
False,
|
||||
title_prefix,
|
||||
metadata_suffix_semantic,
|
||||
metadata_suffix_keyword,
|
||||
)
|
||||
chunk_text = ""
|
||||
if chunk_text:
|
||||
chunks.append(_create_chunk(chunk_text, link_offsets))
|
||||
link_offsets = {}
|
||||
chunk_text = ""
|
||||
|
||||
split_texts = self.chunk_splitter.split_text(section_text)
|
||||
|
||||
for i, split_text in enumerate(split_texts):
|
||||
# If even the split_text is bigger than strict limit, further split
|
||||
if (
|
||||
STRICT_CHUNK_TOKEN_LIMIT
|
||||
and len(self.tokenizer.tokenize(split_text))
|
||||
> content_token_limit
|
||||
and
|
||||
# Tokenizer only runs if STRICT_CHUNK_TOKEN_LIMIT is true
|
||||
len(self.tokenizer.tokenize(split_text)) > content_token_limit
|
||||
):
|
||||
# If STRICT_CHUNK_TOKEN_LIMIT is true, manually check
|
||||
# the token count of each split text to ensure it is
|
||||
# not larger than the content_token_limit
|
||||
smaller_chunks = self._split_oversized_chunk(
|
||||
split_text, content_token_limit
|
||||
)
|
||||
for j, small_chunk in enumerate(smaller_chunks):
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
small_chunk,
|
||||
{0: section_link_text},
|
||||
is_continuation=(j != 0),
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
metadata_suffix_keyword=metadata_suffix_keyword,
|
||||
for i, small_chunk in enumerate(smaller_chunks):
|
||||
chunks.append(
|
||||
_create_chunk(
|
||||
text=small_chunk,
|
||||
links={0: section_link_text},
|
||||
is_continuation=(i != 0),
|
||||
)
|
||||
)
|
||||
else:
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
split_text,
|
||||
{0: section_link_text},
|
||||
is_continuation=(i != 0),
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
metadata_suffix_keyword=metadata_suffix_keyword,
|
||||
chunks.append(
|
||||
_create_chunk(
|
||||
text=split_text,
|
||||
links={0: section_link_text},
|
||||
is_continuation=(i != 0),
|
||||
)
|
||||
)
|
||||
|
||||
continue
|
||||
|
||||
# If we can still fit this section into the current chunk, do so
|
||||
current_token_count = len(self.tokenizer.tokenize(chunk_text))
|
||||
current_offset = len(shared_precompare_cleanup(chunk_text))
|
||||
# In the case where the whole section is shorter than a chunk, either add
|
||||
# to chunk or start a new one
|
||||
next_section_tokens = (
|
||||
len(self.tokenizer.tokenize(SECTION_SEPARATOR)) + section_token_count
|
||||
)
|
||||
|
||||
if next_section_tokens + current_token_count <= content_token_limit:
|
||||
if chunk_text:
|
||||
chunk_text += SECTION_SEPARATOR
|
||||
chunk_text += section_text
|
||||
link_offsets[current_offset] = section_link_text
|
||||
else:
|
||||
# finalize the existing chunk
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
chunk_text,
|
||||
link_offsets,
|
||||
False,
|
||||
title_prefix,
|
||||
metadata_suffix_semantic,
|
||||
metadata_suffix_keyword,
|
||||
)
|
||||
# start a new chunk
|
||||
chunks.append(_create_chunk(chunk_text, link_offsets))
|
||||
link_offsets = {0: section_link_text}
|
||||
chunk_text = section_text
|
||||
|
||||
# finalize any leftover text chunk
|
||||
# Once we hit the end, if we're still in the process of building a chunk, add what we have.
|
||||
# If there is only whitespace left then don't include it. If there are no chunks at all
|
||||
# from the doc, we can just create a single chunk with the title.
|
||||
if chunk_text.strip() or not chunks:
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
chunk_text,
|
||||
link_offsets or {0: ""}, # safe default
|
||||
False,
|
||||
title_prefix,
|
||||
metadata_suffix_semantic,
|
||||
metadata_suffix_keyword,
|
||||
chunks.append(
|
||||
_create_chunk(
|
||||
chunk_text,
|
||||
link_offsets or {0: section_link_text},
|
||||
)
|
||||
)
|
||||
|
||||
# If the chunk does not have any useable content, it will not be indexed
|
||||
return chunks
|
||||
|
||||
def _handle_single_document(self, document: Document) -> list[DocAwareChunk]:
|
||||
@@ -389,12 +321,10 @@ class Chunker:
|
||||
if document.source == DocumentSource.GMAIL:
|
||||
logger.debug(f"Chunking {document.semantic_identifier}")
|
||||
|
||||
# Title prep
|
||||
title = self._extract_blurb(document.get_title_for_document_index() or "")
|
||||
title_prefix = title + RETURN_SEPARATOR if title else ""
|
||||
title_tokens = len(self.tokenizer.tokenize(title_prefix))
|
||||
|
||||
# Metadata prep
|
||||
metadata_suffix_semantic = ""
|
||||
metadata_suffix_keyword = ""
|
||||
metadata_tokens = 0
|
||||
@@ -407,20 +337,19 @@ class Chunker:
|
||||
)
|
||||
metadata_tokens = len(self.tokenizer.tokenize(metadata_suffix_semantic))
|
||||
|
||||
# If metadata is too large, skip it in the semantic content
|
||||
if metadata_tokens >= self.chunk_token_limit * MAX_METADATA_PERCENTAGE:
|
||||
# Note: we can keep the keyword suffix even if the semantic suffix is too long to fit in the model
|
||||
# context, there is no limit for the keyword component
|
||||
metadata_suffix_semantic = ""
|
||||
metadata_tokens = 0
|
||||
|
||||
        # Adjust content token limit to accommodate title + metadata
        content_token_limit = self.chunk_token_limit - title_tokens - metadata_tokens
        # If there is not enough context remaining then just index the chunk with no prefix/suffix
        if content_token_limit <= CHUNK_MIN_CONTENT:
            # Not enough space left, so revert to full chunk without the prefix
            content_token_limit = self.chunk_token_limit
            title_prefix = ""
            metadata_suffix_semantic = ""
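A worked example with hypothetical numbers: if chunk_token_limit = 512, title_tokens = 20 and metadata_tokens = 40, the content budget is 512 - 20 - 40 = 452, which stays above CHUNK_MIN_CONTENT (256), so the title prefix and metadata suffix are kept. If the prefix and suffix together consumed 300 tokens instead, the budget would fall to 212 <= 256 and the branch above would revert to the full 512-token chunk with no prefix or suffix.
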
|
||||
# Chunk the document
|
||||
normal_chunks = self._chunk_document(
|
||||
document,
|
||||
title_prefix,
|
||||
@@ -429,7 +358,6 @@ class Chunker:
|
||||
content_token_limit,
|
||||
)
|
||||
|
||||
# Optional “multipass” large chunk creation
|
||||
if self.enable_multipass and self.enable_large_chunks:
|
||||
large_chunks = generate_large_chunks(normal_chunks)
|
||||
normal_chunks.extend(large_chunks)
|
||||
@@ -443,8 +371,9 @@ class Chunker:
|
||||
"""
|
||||
final_chunks: list[DocAwareChunk] = []
|
||||
for document in documents:
|
||||
if self.callback and self.callback.should_stop():
|
||||
raise RuntimeError("Chunker.chunk: Stop signal detected")
|
||||
if self.callback:
|
||||
if self.callback.should_stop():
|
||||
raise RuntimeError("Chunker.chunk: Stop signal detected")
|
||||
|
||||
chunks = self._handle_single_document(document)
|
||||
final_chunks.extend(chunks)
|
||||
|
||||
@@ -29,7 +29,6 @@ class BaseChunk(BaseModel):
|
||||
content: str
|
||||
# Holds the link and the offsets into the raw Chunk text
|
||||
source_links: dict[int, str] | None
|
||||
image_file_name: str | None
|
||||
# True if this Chunk's start is not at the start of a Section
|
||||
section_continuation: bool
|
||||
|
||||
|
||||
@@ -6,14 +6,12 @@ from onyx.configs.model_configs import GEN_AI_MODEL_FALLBACK_MAX_TOKENS
|
||||
from onyx.configs.model_configs import GEN_AI_TEMPERATURE
|
||||
from onyx.db.engine import get_session_context_manager
|
||||
from onyx.db.llm import fetch_default_provider
|
||||
from onyx.db.llm import fetch_existing_llm_providers
|
||||
from onyx.db.llm import fetch_provider
|
||||
from onyx.db.models import Persona
|
||||
from onyx.llm.chat_llm import DefaultMultiLLM
|
||||
from onyx.llm.exceptions import GenAIDisabledException
|
||||
from onyx.llm.interfaces import LLM
|
||||
from onyx.llm.override_models import LLMOverride
|
||||
from onyx.llm.utils import model_supports_image_input
|
||||
from onyx.utils.headers import build_llm_extra_headers
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.long_term_log import LongTermLogger
|
||||
@@ -88,48 +86,6 @@ def get_llms_for_persona(
|
||||
return _create_llm(model), _create_llm(fast_model)
|
||||
|
||||
|
||||
def get_default_llm_with_vision(
|
||||
timeout: int | None = None,
|
||||
temperature: float | None = None,
|
||||
additional_headers: dict[str, str] | None = None,
|
||||
long_term_logger: LongTermLogger | None = None,
|
||||
) -> LLM | None:
|
||||
if DISABLE_GENERATIVE_AI:
|
||||
raise GenAIDisabledException()
|
||||
|
||||
with get_session_context_manager() as db_session:
|
||||
llm_providers = fetch_existing_llm_providers(db_session)
|
||||
|
||||
if not llm_providers:
|
||||
return None
|
||||
|
||||
for provider in llm_providers:
|
||||
model_name = provider.default_model_name
|
||||
fast_model_name = (
|
||||
provider.fast_default_model_name or provider.default_model_name
|
||||
)
|
||||
|
||||
if not model_name or not fast_model_name:
|
||||
continue
|
||||
|
||||
if model_supports_image_input(model_name, provider.provider):
|
||||
return get_llm(
|
||||
provider=provider.provider,
|
||||
model=model_name,
|
||||
deployment_name=provider.deployment_name,
|
||||
api_key=provider.api_key,
|
||||
api_base=provider.api_base,
|
||||
api_version=provider.api_version,
|
||||
custom_config=provider.custom_config,
|
||||
timeout=timeout,
|
||||
temperature=temperature,
|
||||
additional_headers=additional_headers,
|
||||
long_term_logger=long_term_logger,
|
||||
)
|
||||
|
||||
raise ValueError("No LLM provider found that supports image input")
|
||||
|
||||
|
||||
def get_default_llms(
|
||||
timeout: int | None = None,
|
||||
temperature: float | None = None,
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
# Used for creating embeddings of images for vector search
|
||||
IMAGE_SUMMARIZATION_SYSTEM_PROMPT = """
|
||||
You are an assistant for summarizing images for retrieval.
|
||||
Summarize the content of the following image and be as precise as possible.
|
||||
The summary will be embedded and used to retrieve the original image.
|
||||
Therefore, write a concise summary of the image that is optimized for retrieval.
|
||||
"""
|
||||
|
||||
# Prompt for generating image descriptions with filename context
|
||||
IMAGE_SUMMARIZATION_USER_PROMPT = """
|
||||
The image has the file name '{title}'.
|
||||
Describe precisely and concisely what the image shows.
|
||||
"""
|
||||
|
||||
|
||||
# Used for analyzing images in response to user queries at search time
|
||||
IMAGE_ANALYSIS_SYSTEM_PROMPT = (
|
||||
"You are an AI assistant specialized in describing images.\n"
|
||||
"You will receive a user question plus an image URL. Provide a concise textual answer.\n"
|
||||
"Focus on aspects of the image that are relevant to the user's question.\n"
|
||||
"Be specific and detailed about visual elements that directly address the query.\n"
|
||||
)
|
||||
@@ -55,11 +55,7 @@ def _create_indexable_chunks(
|
||||
# The section is not really used past this point since we have already done the other processing
|
||||
# for the chunking and embedding.
|
||||
sections=[
|
||||
Section(
|
||||
text=preprocessed_doc["content"],
|
||||
link=preprocessed_doc["url"],
|
||||
image_file_name=None,
|
||||
)
|
||||
Section(text=preprocessed_doc["content"], link=preprocessed_doc["url"])
|
||||
],
|
||||
source=DocumentSource.WEB,
|
||||
semantic_identifier=preprocessed_doc["title"],
|
||||
@@ -97,7 +93,6 @@ def _create_indexable_chunks(
|
||||
document_sets=set(),
|
||||
boost=DEFAULT_BOOST,
|
||||
large_chunk_id=None,
|
||||
image_file_name=None,
|
||||
)
|
||||
|
||||
chunks.append(chunk)
|
||||
|
||||
@@ -53,11 +53,6 @@ class Settings(BaseModel):
|
||||
auto_scroll: bool | None = False
|
||||
query_history_type: QueryHistoryType | None = None
|
||||
|
||||
# Image processing settings
|
||||
image_extraction_and_analysis_enabled: bool | None = False
|
||||
search_time_image_analysis_enabled: bool | None = False
|
||||
image_analysis_max_size_mb: int | None = 20
|
||||
|
||||
|
||||
class UserSettings(Settings):
|
||||
notifications: list[Notification]
|
||||
|
||||
@@ -47,7 +47,6 @@ def load_settings() -> Settings:
|
||||
|
||||
settings.anonymous_user_enabled = anonymous_user_enabled
|
||||
settings.query_history_type = ONYX_QUERY_HISTORY_TYPE
|
||||
|
||||
return settings
|
||||
|
||||
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
"""
|
||||
Standardized error handling utilities.
|
||||
"""
|
||||
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def handle_connector_error(e: Exception, context: str) -> None:
|
||||
"""
|
||||
Standard error handling for connectors.
|
||||
|
||||
Args:
|
||||
e: The exception that was raised
|
||||
context: A description of where the error occurred
|
||||
|
||||
Raises:
|
||||
The original exception if CONTINUE_ON_CONNECTOR_FAILURE is False
|
||||
"""
|
||||
logger.error(f"Error in {context}: {e}", exc_info=e)
|
||||
if not CONTINUE_ON_CONNECTOR_FAILURE:
|
||||
raise
|
||||
@@ -1,10 +1,9 @@
|
||||
aioboto3==14.0.0
|
||||
aiohttp==3.10.2
|
||||
alembic==1.10.4
|
||||
asyncpg==0.27.0
|
||||
atlassian-python-api==3.41.16
|
||||
beautifulsoup4==4.12.3
|
||||
boto3==1.36.23
|
||||
boto3==1.34.84
|
||||
celery==5.5.0b4
|
||||
chardet==5.2.0
|
||||
dask==2023.8.1
|
||||
|
||||
@@ -13,5 +13,4 @@ transformers==4.39.2
|
||||
uvicorn==0.21.1
|
||||
voyageai==0.2.3
|
||||
litellm==1.61.16
|
||||
sentry-sdk[fastapi,celery,starlette]==2.14.0
|
||||
aioboto3==13.4.0
|
||||
sentry-sdk[fastapi,celery,starlette]==2.14.0
|
||||
@@ -1,45 +0,0 @@
|
||||
import argparse
|
||||
import logging
|
||||
from logging import getLogger
|
||||
|
||||
from onyx.db.seeding.chat_history_seeding import seed_chat_history
|
||||
|
||||
# Configure the logger
|
||||
logging.basicConfig(
|
||||
level=logging.INFO, # Set the log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", # Log format
|
||||
handlers=[logging.StreamHandler()], # Output logs to console
|
||||
)
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
||||
def go_main(num_sessions: int, num_messages: int, num_days: int) -> None:
|
||||
seed_chat_history(num_sessions, num_messages, num_days)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Seed chat history")
|
||||
parser.add_argument(
|
||||
"--sessions",
|
||||
type=int,
|
||||
default=2048,
|
||||
help="Number of chat sessions to seed",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--messages",
|
||||
type=int,
|
||||
default=4,
|
||||
help="Number of chat messages to seed per session",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--days",
|
||||
type=int,
|
||||
default=90,
|
||||
help="Number of days looking backwards over which to seed the timestamps with",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
go_main(args.sessions, args.messages, args.days)
|
||||
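For reference, a hedged example of invoking this seeding script with the defaults declared above (the script path is hypothetical; the direct call mirrors the CLI flags).

# Sketch only: seed 2048 sessions of 4 messages spread over the last 90 days.
#   python backend/scripts/seed_chat_history.py --sessions 2048 --messages 4 --days 90
go_main(num_sessions=2048, num_messages=4, num_days=90)
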
@@ -207,7 +207,7 @@ def query_vespa(
|
||||
yql: str, tenant_id: Optional[str] = None, limit: int = 10
|
||||
) -> List[Dict[str, Any]]:
|
||||
# Perform a Vespa query using YQL syntax.
|
||||
filters = IndexFilters(tenant_id=None, access_control_list=[])
|
||||
filters = IndexFilters(tenant_id=tenant_id, access_control_list=[])
|
||||
filter_string = build_vespa_filters(filters, remove_trailing_and=True)
|
||||
full_yql = yql.strip()
|
||||
if filter_string:
|
||||
@@ -472,7 +472,9 @@ def get_document_acls(
|
||||
print("-" * 80)
|
||||
|
||||
|
||||
def get_current_chunk_count(document_id: str) -> int | None:
|
||||
def get_current_chunk_count(
|
||||
document_id: str, index_name: str, tenant_id: str
|
||||
) -> int | None:
|
||||
with get_session_with_current_tenant() as session:
|
||||
return (
|
||||
session.query(Document.chunk_count)
|
||||
@@ -484,7 +486,7 @@ def get_current_chunk_count(document_id: str) -> int | None:
|
||||
def get_number_of_chunks_we_think_exist(
|
||||
document_id: str, index_name: str, tenant_id: str
|
||||
) -> int:
|
||||
current_chunk_count = get_current_chunk_count(document_id)
|
||||
current_chunk_count = get_current_chunk_count(document_id, index_name, tenant_id)
|
||||
print(f"Current chunk count: {current_chunk_count}")
|
||||
|
||||
doc_info = VespaIndex.enrich_basic_chunk_info(
|
||||
@@ -634,7 +636,6 @@ def delete_where(
|
||||
Removes visited documents in `cluster` where the given selection
|
||||
is true, using Vespa's 'delete where' endpoint.
|
||||
|
||||
|
||||
:param index_name: Typically <namespace>/<document-type> from your schema
|
||||
:param selection: The selection string, e.g., "true" or "foo contains 'bar'"
|
||||
:param cluster: The name of the cluster where documents reside
|
||||
@@ -798,7 +799,7 @@ def main() -> None:
|
||||
args = parser.parse_args()
|
||||
vespa_debug = VespaDebugging(args.tenant_id)
|
||||
|
||||
CURRENT_TENANT_ID_CONTEXTVAR.set(args.tenant_id or "public")
|
||||
CURRENT_TENANT_ID_CONTEXTVAR.set(args.tenant_id)
|
||||
if args.action == "delete-all-documents":
|
||||
if not args.tenant_id:
|
||||
parser.error("--tenant-id is required for delete-all-documents action")
|
||||
|
||||
@@ -71,7 +71,6 @@ def generate_dummy_chunk(
|
||||
title_embedding=generate_random_embedding(embedding_dim),
|
||||
large_chunk_id=None,
|
||||
large_chunk_reference_ids=[],
|
||||
image_file_name=None,
|
||||
)
|
||||
|
||||
document_set_names = []
|
||||
|
||||
@@ -68,12 +68,6 @@ LOG_LEVEL = os.environ.get("LOG_LEVEL", "info")
|
||||
# allow us to specify a custom timeout
|
||||
API_BASED_EMBEDDING_TIMEOUT = int(os.environ.get("API_BASED_EMBEDDING_TIMEOUT", "600"))
|
||||
|
||||
# Local batch size for VertexAI embedding models currently calibrated for item size of 512 tokens
|
||||
# NOTE: increasing this value may lead to API errors due to token limit exhaustion per call.
|
||||
VERTEXAI_EMBEDDING_LOCAL_BATCH_SIZE = int(
|
||||
os.environ.get("VERTEXAI_EMBEDDING_LOCAL_BATCH_SIZE", "25")
|
||||
)
|
||||
|
||||
# Only used for OpenAI
|
||||
OPENAI_EMBEDDING_TIMEOUT = int(
|
||||
os.environ.get("OPENAI_EMBEDDING_TIMEOUT", API_BASED_EMBEDDING_TIMEOUT)
|
||||
@@ -206,12 +200,12 @@ SUPPORTED_EMBEDDING_MODELS = [
|
||||
index_name="danswer_chunk_text_embedding_3_small",
|
||||
),
|
||||
SupportedEmbeddingModel(
|
||||
name="google/text-embedding-005",
|
||||
name="google/text-embedding-004",
|
||||
dim=768,
|
||||
index_name="danswer_chunk_google_text_embedding_004",
|
||||
),
|
||||
SupportedEmbeddingModel(
|
||||
name="google/text-embedding-005",
|
||||
name="google/text-embedding-004",
|
||||
dim=768,
|
||||
index_name="danswer_chunk_text_embedding_004",
|
||||
),
|
||||
|
||||
@@ -13,7 +13,6 @@ class EmbeddingProvider(str, Enum):
|
||||
class RerankerProvider(str, Enum):
|
||||
COHERE = "cohere"
|
||||
LITELLM = "litellm"
|
||||
BEDROCK = "bedrock"
|
||||
|
||||
|
||||
class EmbedTextType(str, Enum):
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from datetime import timezone
|
||||
|
||||
from ee.onyx.db.usage_export import get_all_empty_chat_message_entries
|
||||
from onyx.db.engine import get_session_with_current_tenant
|
||||
from onyx.db.seeding.chat_history_seeding import seed_chat_history
|
||||
|
||||
|
||||
def test_usage_reports(reset: None) -> None:
|
||||
EXPECTED_SESSIONS = 2048
|
||||
MESSAGES_PER_SESSION = 4
|
||||
EXPECTED_MESSAGES = EXPECTED_SESSIONS * MESSAGES_PER_SESSION
|
||||
|
||||
seed_chat_history(EXPECTED_SESSIONS, MESSAGES_PER_SESSION, 90)
|
||||
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
# count of all entries should be exact
|
||||
period = (
|
||||
datetime.fromtimestamp(0, tz=timezone.utc),
|
||||
datetime.now(tz=timezone.utc),
|
||||
)
|
||||
|
||||
count = 0
|
||||
for entry_batch in get_all_empty_chat_message_entries(db_session, period):
|
||||
for entry in entry_batch:
|
||||
count += 1
|
||||
|
||||
assert count == EXPECTED_MESSAGES
|
||||
|
||||
# count in a one month time range should be within a certain range statistically
|
||||
# this can be improved if we seed the chat history data deterministically
|
||||
period = (
|
||||
datetime.now(tz=timezone.utc) - timedelta(days=30),
|
||||
datetime.now(tz=timezone.utc),
|
||||
)
|
||||
|
||||
count = 0
|
||||
for entry_batch in get_all_empty_chat_message_entries(db_session, period):
|
||||
for entry in entry_batch:
|
||||
count += 1
|
||||
|
||||
lower = EXPECTED_MESSAGES // 3 - (EXPECTED_MESSAGES // (3 * 3))
|
||||
upper = EXPECTED_MESSAGES // 3 + (EXPECTED_MESSAGES // (3 * 3))
|
||||
assert count > lower
|
||||
assert count < upper
|
||||
@@ -31,7 +31,6 @@ def create_test_chunk(
|
||||
metadata={},
|
||||
match_highlights=[],
|
||||
updated_at=datetime.now(),
|
||||
image_file_name=None,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -80,7 +80,6 @@ def mock_inference_sections() -> list[InferenceSection]:
|
||||
updated_at=datetime(2023, 1, 1),
|
||||
source_links={0: "https://example.com/doc1"},
|
||||
match_highlights=[],
|
||||
image_file_name=None,
|
||||
),
|
||||
chunks=MagicMock(),
|
||||
),
|
||||
@@ -103,7 +102,6 @@ def mock_inference_sections() -> list[InferenceSection]:
|
||||
updated_at=datetime(2023, 1, 2),
|
||||
source_links={0: "https://example.com/doc2"},
|
||||
match_highlights=[],
|
||||
image_file_name=None,
|
||||
),
|
||||
chunks=MagicMock(),
|
||||
),
|
||||
|
||||
@@ -150,7 +150,6 @@ def test_fuzzy_match_quotes_to_docs() -> None:
|
||||
metadata={},
|
||||
match_highlights=[],
|
||||
updated_at=None,
|
||||
image_file_name=None,
|
||||
)
|
||||
test_chunk_1 = InferenceChunk(
|
||||
document_id="test doc 1",
|
||||
@@ -169,7 +168,6 @@ def test_fuzzy_match_quotes_to_docs() -> None:
|
||||
metadata={},
|
||||
match_highlights=[],
|
||||
updated_at=None,
|
||||
image_file_name=None,
|
||||
)
|
||||
|
||||
test_quotes = [
|
||||
|
||||
@@ -37,7 +37,6 @@ def create_inference_chunk(
|
||||
metadata={},
|
||||
match_highlights=[],
|
||||
updated_at=None,
|
||||
image_file_name=None,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -62,7 +62,6 @@ def test_default_indexing_embedder_embed_chunks(mock_embedding_model: Mock) -> N
|
||||
mini_chunk_texts=None,
|
||||
large_chunk_reference_ids=[],
|
||||
large_chunk_id=None,
|
||||
image_file_name=None,
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
@@ -118,7 +118,6 @@ services:
|
||||
- API_KEY_HASH_ROUNDS=${API_KEY_HASH_ROUNDS:-}
|
||||
# Seeding configuration
|
||||
- USE_IAM_AUTH=${USE_IAM_AUTH:-}
|
||||
- ONYX_QUERY_HISTORY_TYPE=${ONYX_QUERY_HISTORY_TYPE:-}
|
||||
# Uncomment the line below to use if IAM_AUTH is true and you are using iam auth for postgres
|
||||
# volumes:
|
||||
# - ./bundle.pem:/app/bundle.pem:ro
|
||||
|
||||
@@ -95,7 +95,6 @@ services:
|
||||
# Enterprise Edition only
|
||||
- API_KEY_HASH_ROUNDS=${API_KEY_HASH_ROUNDS:-}
|
||||
- ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=${ENABLE_PAID_ENTERPRISE_EDITION_FEATURES:-false}
|
||||
- ONYX_QUERY_HISTORY_TYPE=${ONYX_QUERY_HISTORY_TYPE:-}
|
||||
# Uncomment the line below to use if IAM_AUTH is true and you are using iam auth for postgres
|
||||
# volumes:
|
||||
# - ./bundle.pem:/app/bundle.pem:ro
|
||||
|
||||
433 web/package-lock.json (generated)
@@ -44,7 +44,6 @@
|
||||
"autoprefixer": "^10.4.14",
|
||||
"class-variance-authority": "^0.7.0",
|
||||
"clsx": "^2.1.1",
|
||||
"cmdk": "^1.0.0",
|
||||
"date-fns": "^3.6.0",
|
||||
"favicon-fetch": "^1.0.0",
|
||||
"formik": "^2.2.9",
|
||||
@@ -9314,438 +9313,6 @@
|
||||
"node": ">=6"
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk": {
|
||||
"version": "1.0.0",
|
||||
"resolved": "https://registry.npmjs.org/cmdk/-/cmdk-1.0.0.tgz",
|
||||
"integrity": "sha512-gDzVf0a09TvoJ5jnuPvygTB77+XdOSwEmJ88L6XPFPlv7T3RxbP9jgenfylrAMD0+Le1aO0nVjQUzl2g+vjz5Q==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@radix-ui/react-dialog": "1.0.5",
|
||||
"@radix-ui/react-primitive": "1.0.3"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"react": "^18.0.0",
|
||||
"react-dom": "^18.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/primitive": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/primitive/-/primitive-1.0.1.tgz",
|
||||
"integrity": "sha512-yQ8oGX2GVsEYMWGxcovu1uGWPCxV5BFfeeYxqPmuAzUyLT9qmaMXSAhXpb0WrspIeqYzdJpkh2vHModJPgRIaw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10"
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-compose-refs": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-compose-refs/-/react-compose-refs-1.0.1.tgz",
|
||||
"integrity": "sha512-fDSBgd44FKHa1FRMU59qBMPFcl2PZE+2nmqunj+BWFyYYjnhIDWL2ItDs3rrbJDQOtzt5nIebLCQc4QRfz6LJw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-context": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-context/-/react-context-1.0.1.tgz",
|
||||
"integrity": "sha512-ebbrdFoYTcuZ0v4wG5tedGnp9tzcV8awzsxYph7gXUyvnNLuTIcCk1q17JEbnVhXAKG9oX3KtchwiMIAYp9NLg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-dialog": {
|
||||
"version": "1.0.5",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-dialog/-/react-dialog-1.0.5.tgz",
|
||||
"integrity": "sha512-GjWJX/AUpB703eEBanuBnIWdIXg6NvJFCXcNlSZk4xdszCdhrJgBoUd1cGk67vFO+WdA2pfI/plOpqz/5GUP6Q==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10",
|
||||
"@radix-ui/primitive": "1.0.1",
|
||||
"@radix-ui/react-compose-refs": "1.0.1",
|
||||
"@radix-ui/react-context": "1.0.1",
|
||||
"@radix-ui/react-dismissable-layer": "1.0.5",
|
||||
"@radix-ui/react-focus-guards": "1.0.1",
|
||||
"@radix-ui/react-focus-scope": "1.0.4",
|
||||
"@radix-ui/react-id": "1.0.1",
|
||||
"@radix-ui/react-portal": "1.0.4",
|
||||
"@radix-ui/react-presence": "1.0.1",
|
||||
"@radix-ui/react-primitive": "1.0.3",
|
||||
"@radix-ui/react-slot": "1.0.2",
|
||||
"@radix-ui/react-use-controllable-state": "1.0.1",
|
||||
"aria-hidden": "^1.1.1",
|
||||
"react-remove-scroll": "2.5.5"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"@types/react-dom": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0",
|
||||
"react-dom": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
},
|
||||
"@types/react-dom": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-dismissable-layer": {
|
||||
"version": "1.0.5",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-dismissable-layer/-/react-dismissable-layer-1.0.5.tgz",
|
||||
"integrity": "sha512-aJeDjQhywg9LBu2t/At58hCvr7pEm0o2Ke1x33B+MhjNmmZ17sy4KImo0KPLgsnc/zN7GPdce8Cnn0SWvwZO7g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10",
|
||||
"@radix-ui/primitive": "1.0.1",
|
||||
"@radix-ui/react-compose-refs": "1.0.1",
|
||||
"@radix-ui/react-primitive": "1.0.3",
|
||||
"@radix-ui/react-use-callback-ref": "1.0.1",
|
||||
"@radix-ui/react-use-escape-keydown": "1.0.3"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"@types/react-dom": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0",
|
||||
"react-dom": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
},
|
||||
"@types/react-dom": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-dismissable-layer/node_modules/@radix-ui/react-use-callback-ref": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-callback-ref/-/react-use-callback-ref-1.0.1.tgz",
|
||||
"integrity": "sha512-D94LjX4Sp0xJFVaoQOd3OO9k7tpBYNOXdVhkltUbGv2Qb9OXdrg/CpsjlZv7ia14Sylv398LswWBVVu5nqKzAQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-dismissable-layer/node_modules/@radix-ui/react-use-escape-keydown": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-escape-keydown/-/react-use-escape-keydown-1.0.3.tgz",
|
||||
"integrity": "sha512-vyL82j40hcFicA+M4Ex7hVkB9vHgSse1ZWomAqV2Je3RleKGO5iM8KMOEtfoSB0PnIelMd2lATjTGMYqN5ylTg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10",
|
||||
"@radix-ui/react-use-callback-ref": "1.0.1"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-focus-guards": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-guards/-/react-focus-guards-1.0.1.tgz",
|
||||
"integrity": "sha512-Rect2dWbQ8waGzhMavsIbmSVCgYxkXLxxR3ZvCX79JOglzdEy4JXMb98lq4hPxUbLr77nP0UOGf4rcMU+s1pUA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-focus-scope": {
|
||||
"version": "1.0.4",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-focus-scope/-/react-focus-scope-1.0.4.tgz",
|
||||
"integrity": "sha512-sL04Mgvf+FmyvZeYfNu1EPAaaxD+aw7cYeIB9L9Fvq8+urhltTRaEo5ysKOpHuKPclsZcSUMKlN05x4u+CINpA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10",
|
||||
"@radix-ui/react-compose-refs": "1.0.1",
|
||||
"@radix-ui/react-primitive": "1.0.3",
|
||||
"@radix-ui/react-use-callback-ref": "1.0.1"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"@types/react-dom": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0",
|
||||
"react-dom": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
},
|
||||
"@types/react-dom": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-focus-scope/node_modules/@radix-ui/react-use-callback-ref": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-callback-ref/-/react-use-callback-ref-1.0.1.tgz",
|
||||
"integrity": "sha512-D94LjX4Sp0xJFVaoQOd3OO9k7tpBYNOXdVhkltUbGv2Qb9OXdrg/CpsjlZv7ia14Sylv398LswWBVVu5nqKzAQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-id": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-id/-/react-id-1.0.1.tgz",
|
||||
"integrity": "sha512-tI7sT/kqYp8p96yGWY1OAnLHrqDgzHefRBKQ2YAkBS5ja7QLcZ9Z/uY7bEjPUatf8RomoXM8/1sMj1IJaE5UzQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10",
|
||||
"@radix-ui/react-use-layout-effect": "1.0.1"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-id/node_modules/@radix-ui/react-use-layout-effect": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-layout-effect/-/react-use-layout-effect-1.0.1.tgz",
|
||||
"integrity": "sha512-v/5RegiJWYdoCvMnITBkNNx6bCj20fiaJnWtRkU18yITptraXjffz5Qbn05uOiQnOvi+dbkznkoaMltz1GnszQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-portal": {
|
||||
"version": "1.0.4",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-portal/-/react-portal-1.0.4.tgz",
|
||||
"integrity": "sha512-Qki+C/EuGUVCQTOTD5vzJzJuMUlewbzuKyUy+/iHM2uwGiru9gZeBJtHAPKAEkB5KWGi9mP/CHKcY0wt1aW45Q==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10",
|
||||
"@radix-ui/react-primitive": "1.0.3"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"@types/react-dom": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0",
|
||||
"react-dom": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
},
|
||||
"@types/react-dom": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-presence": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-presence/-/react-presence-1.0.1.tgz",
|
||||
"integrity": "sha512-UXLW4UAbIY5ZjcvzjfRFo5gxva8QirC9hF7wRE4U5gz+TP0DbRk+//qyuAQ1McDxBt1xNMBTaciFGvEmJvAZCg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10",
|
||||
"@radix-ui/react-compose-refs": "1.0.1",
|
||||
"@radix-ui/react-use-layout-effect": "1.0.1"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"@types/react-dom": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0",
|
||||
"react-dom": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
},
|
||||
"@types/react-dom": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-presence/node_modules/@radix-ui/react-use-layout-effect": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-layout-effect/-/react-use-layout-effect-1.0.1.tgz",
|
||||
"integrity": "sha512-v/5RegiJWYdoCvMnITBkNNx6bCj20fiaJnWtRkU18yITptraXjffz5Qbn05uOiQnOvi+dbkznkoaMltz1GnszQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-primitive": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-primitive/-/react-primitive-1.0.3.tgz",
|
||||
"integrity": "sha512-yi58uVyoAcK/Nq1inRY56ZSjKypBNKTa/1mcL8qdl6oJeEaDbOldlzrGn7P6Q3Id5d+SYNGc5AJgc4vGhjs5+g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10",
|
||||
"@radix-ui/react-slot": "1.0.2"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"@types/react-dom": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0",
|
||||
"react-dom": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
},
|
||||
"@types/react-dom": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-slot": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-slot/-/react-slot-1.0.2.tgz",
|
||||
"integrity": "sha512-YeTpuq4deV+6DusvVUW4ivBgnkHwECUu0BiN43L5UCDFgdhsRUWAghhTF5MbvNTPzmiFOx90asDSUjWuCNapwg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10",
|
||||
"@radix-ui/react-compose-refs": "1.0.1"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-use-controllable-state": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-controllable-state/-/react-use-controllable-state-1.0.1.tgz",
|
||||
"integrity": "sha512-Svl5GY5FQeN758fWKrjM6Qb7asvXeiZltlT4U2gVfl8Gx5UAv2sMR0LWo8yhsIZh2oQ0eFdZ59aoOOMV7b47VA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10",
|
||||
"@radix-ui/react-use-callback-ref": "1.0.1"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/@radix-ui/react-use-controllable-state/node_modules/@radix-ui/react-use-callback-ref": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@radix-ui/react-use-callback-ref/-/react-use-callback-ref-1.0.1.tgz",
|
||||
"integrity": "sha512-D94LjX4Sp0xJFVaoQOd3OO9k7tpBYNOXdVhkltUbGv2Qb9OXdrg/CpsjlZv7ia14Sylv398LswWBVVu5nqKzAQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@babel/runtime": "^7.13.10"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "*",
|
||||
"react": "^16.8 || ^17.0 || ^18.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/cmdk/node_modules/react-remove-scroll": {
|
||||
"version": "2.5.5",
|
||||
"resolved": "https://registry.npmjs.org/react-remove-scroll/-/react-remove-scroll-2.5.5.tgz",
|
||||
"integrity": "sha512-ImKhrzJJsyXJfBZ4bzu8Bwpka14c/fQt0k+cyFp/PBhTfyDnU5hjOtM4AG/0AMyy8oKzOTR0lDgJIM7pYXI0kw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"react-remove-scroll-bar": "^2.3.3",
|
||||
"react-style-singleton": "^2.2.1",
|
||||
"tslib": "^2.1.0",
|
||||
"use-callback-ref": "^1.3.0",
|
||||
"use-sidecar": "^1.1.2"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=10"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@types/react": "^16.8.0 || ^17.0.0 || ^18.0.0",
|
||||
"react": "^16.8.0 || ^17.0.0 || ^18.0.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"@types/react": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/co": {
|
||||
"version": "4.6.0",
|
||||
"resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
@@ -47,7 +47,6 @@
"autoprefixer": "^10.4.14",
"class-variance-authority": "^0.7.0",
"clsx": "^2.1.1",
"cmdk": "^1.0.0",
"date-fns": "^3.6.0",
"favicon-fetch": "^1.0.0",
"formik": "^2.2.9",
File diff suppressed because one or more lines are too long
Before Size: 2.9 KiB | After Size: 7.0 KiB
@@ -1 +1,8 @@
<svg xmlns="http://www.w3.org/2000/svg" shape-rendering="geometricPrecision" text-rendering="geometricPrecision" image-rendering="optimizeQuality" fill-rule="evenodd" clip-rule="evenodd" viewBox="0 0 512 509.64"><path fill="#D77655" d="M115.612 0h280.775C459.974 0 512 52.026 512 115.612v278.415c0 63.587-52.026 115.612-115.613 115.612H115.612C52.026 509.639 0 457.614 0 394.027V115.612C0 52.026 52.026 0 115.612 0z"/><path fill="#FCF2EE" fill-rule="nonzero" d="M142.27 316.619l73.655-41.326 1.238-3.589-1.238-1.996-3.589-.001-12.31-.759-42.084-1.138-36.498-1.516-35.361-1.896-8.897-1.895-8.34-10.995.859-5.484 7.482-5.03 10.717.935 23.683 1.617 35.537 2.452 25.782 1.517 38.193 3.968h6.064l.86-2.451-2.073-1.517-1.618-1.517-36.776-24.922-39.81-26.338-20.852-15.166-11.273-7.683-5.687-7.204-2.451-15.721 10.237-11.273 13.75.935 3.513.936 13.928 10.716 29.749 23.027 38.848 28.612 5.687 4.727 2.275-1.617.278-1.138-2.553-4.271-21.13-38.193-22.546-38.848-10.035-16.101-2.654-9.655c-.935-3.968-1.617-7.304-1.617-11.374l11.652-15.823 6.445-2.073 15.545 2.073 6.547 5.687 9.655 22.092 15.646 34.78 24.265 47.291 7.103 14.028 3.791 12.992 1.416 3.968 2.449-.001v-2.275l1.997-26.641 3.69-32.707 3.589-42.084 1.239-11.854 5.863-14.206 11.652-7.683 9.099 4.348 7.482 10.716-1.036 6.926-4.449 28.915-8.72 45.294-5.687 30.331h3.313l3.792-3.791 15.342-20.372 25.782-32.227 11.374-12.789 13.27-14.129 8.517-6.724 16.1-.001 11.854 17.617-5.307 18.199-16.581 21.029-13.75 17.819-19.716 26.54-12.309 21.231 1.138 1.694 2.932-.278 44.536-9.479 24.062-4.347 28.714-4.928 12.992 6.066 1.416 6.167-5.106 12.613-30.71 7.583-36.018 7.204-53.636 12.689-.657.48.758.935 24.164 2.275 10.337.556h25.301l47.114 3.514 12.309 8.139 7.381 9.959-1.238 7.583-18.957 9.655-25.579-6.066-59.702-14.205-20.474-5.106-2.83-.001v1.694l17.061 16.682 31.266 28.233 39.152 36.397 1.997 8.999-5.03 7.102-5.307-.758-34.401-25.883-13.27-11.651-30.053-25.302-1.996-.001v2.654l6.926 10.136 36.574 54.975 1.895 16.859-2.653 5.485-9.479 3.311-10.414-1.895-21.408-30.054-22.092-33.844-17.819-30.331-2.173 1.238-10.515 113.261-4.929 5.788-11.374 4.348-9.478-7.204-5.03-11.652 5.03-23.027 6.066-30.052 4.928-23.886 4.449-29.674 2.654-9.858-.177-.657-2.173.278-22.37 30.71-34.021 45.977-26.919 28.815-6.445 2.553-11.173-5.789 1.037-10.337 6.243-9.2 37.257-47.392 22.47-29.371 14.508-16.961-.101-2.451h-.859l-98.954 64.251-17.618 2.275-7.583-7.103.936-11.652 3.589-3.791 29.749-20.474-.101.102.024.101z"/></svg>
<?xml version="1.0" encoding="UTF-8"?>
<svg width="256px" height="176px" viewBox="0 0 256 176" version="1.1" xmlns="http://www.w3.org/2000/svg" preserveAspectRatio="xMidYMid">
<title>Anthropic</title>
<g fill="#181818">
<path d="M147.486878,0 C147.486878,0 217.568251,175.780074 217.568251,175.780074 C217.568251,175.780074 256,175.780074 256,175.780074 C256,175.780074 185.918621,0 185.918621,0 C185.918621,0 147.486878,0 147.486878,0 C147.486878,0 147.486878,0 147.486878,0 Z"></path>
<path d="M66.1828124,106.221191 C66.1828124,106.221191 90.1624677,44.4471185 90.1624677,44.4471185 C90.1624677,44.4471185 114.142128,106.221191 114.142128,106.221191 C114.142128,106.221191 66.1828124,106.221191 66.1828124,106.221191 C66.1828124,106.221191 66.1828124,106.221191 66.1828124,106.221191 Z M70.0705318,0 C70.0705318,0 0,175.780074 0,175.780074 C0,175.780074 39.179211,175.780074 39.179211,175.780074 C39.179211,175.780074 53.5097704,138.86606 53.5097704,138.86606 C53.5097704,138.86606 126.817544,138.86606 126.817544,138.86606 C126.817544,138.86606 141.145724,175.780074 141.145724,175.780074 C141.145724,175.780074 180.324935,175.780074 180.324935,175.780074 C180.324935,175.780074 110.254409,0 110.254409,0 C110.254409,0 70.0705318,0 70.0705318,0 C70.0705318,0 70.0705318,0 70.0705318,0 Z"></path>
</g>
</svg>
Before Size: 2.4 KiB | After Size: 1.3 KiB
@@ -1 +0,0 @@
<svg height="1em" style="flex:none;line-height:1" viewBox="0 0 24 24" width="1em" xmlns="http://www.w3.org/2000/svg"><title>DeepSeek</title><path d="M23.748 4.482c-.254-.124-.364.113-.512.234-.051.039-.094.09-.137.136-.372.397-.806.657-1.373.626-.829-.046-1.537.214-2.163.848-.133-.782-.575-1.248-1.247-1.548-.352-.156-.708-.311-.955-.65-.172-.241-.219-.51-.305-.774-.055-.16-.11-.323-.293-.35-.2-.031-.278.136-.356.276-.313.572-.434 1.202-.422 1.84.027 1.436.633 2.58 1.838 3.393.137.093.172.187.129.323-.082.28-.18.552-.266.833-.055.179-.137.217-.329.14a5.526 5.526 0 01-1.736-1.18c-.857-.828-1.631-1.742-2.597-2.458a11.365 11.365 0 00-.689-.471c-.985-.957.13-1.743.388-1.836.27-.098.093-.432-.779-.428-.872.004-1.67.295-2.687.684a3.055 3.055 0 01-.465.137 9.597 9.597 0 00-2.883-.102c-1.885.21-3.39 1.102-4.497 2.623C.082 8.606-.231 10.684.152 12.85c.403 2.284 1.569 4.175 3.36 5.653 1.858 1.533 3.997 2.284 6.438 2.14 1.482-.085 3.133-.284 4.994-1.86.47.234.962.327 1.78.397.63.059 1.236-.03 1.705-.128.735-.156.684-.837.419-.961-2.155-1.004-1.682-.595-2.113-.926 1.096-1.296 2.746-2.642 3.392-7.003.05-.347.007-.565 0-.845-.004-.17.035-.237.23-.256a4.173 4.173 0 001.545-.475c1.396-.763 1.96-2.015 2.093-3.517.02-.23-.004-.467-.247-.588zM11.581 18c-2.089-1.642-3.102-2.183-3.52-2.16-.392.024-.321.471-.235.763.09.288.207.486.371.739.114.167.192.416-.113.603-.673.416-1.842-.14-1.897-.167-1.361-.802-2.5-1.86-3.301-3.307-.774-1.393-1.224-2.887-1.298-4.482-.02-.386.093-.522.477-.592a4.696 4.696 0 011.529-.039c2.132.312 3.946 1.265 5.468 2.774.868.86 1.525 1.887 2.202 2.891.72 1.066 1.494 2.082 2.48 2.914.348.292.625.514.891.677-.802.09-2.14.11-3.054-.614zm1-6.44a.306.306 0 01.415-.287.302.302 0 01.2.288.306.306 0 01-.31.307.303.303 0 01-.304-.308zm3.11 1.596c-.2.081-.399.151-.59.16a1.245 1.245 0 01-.798-.254c-.274-.23-.47-.358-.552-.758a1.73 1.73 0 01.016-.588c.07-.327-.008-.537-.239-.727-.187-.156-.426-.199-.688-.199a.559.559 0 01-.254-.078c-.11-.054-.2-.19-.114-.358.028-.054.16-.186.192-.21.356-.202.767-.136 1.146.016.352.144.618.408 1.001.782.391.451.462.576.685.914.176.265.336.537.445.848.067.195-.019.354-.25.452z" fill="#4D6BFE"></path></svg>
Before Size: 2.1 KiB
@@ -7,12 +7,14 @@ import {
MicrosoftIconSVG,
MistralIcon,
MetaIcon,
OpenAIIcon,
GeminiIcon,
OpenSourceIcon,
AnthropicSVG,
IconProps,
OpenAIISVG,
DeepseekIcon,
} from "@/components/icons/icons";
import { FaRobot } from "react-icons/fa";

export interface CustomConfigKey {
name: string;
@@ -74,31 +76,30 @@ export interface LLMProviderDescriptor {
}

export const getProviderIcon = (providerName: string, modelName?: string) => {
const modelIconMap: Record<
string,
({ size, className }: IconProps) => JSX.Element
> = {
amazon: AmazonIcon,
phi: MicrosoftIconSVG,
mistral: MistralIcon,
ministral: MistralIcon,
llama: MetaIcon,
gemini: GeminiIcon,
deepseek: DeepseekIcon,
claude: AnthropicIcon,
};

const modelNameToIcon = (
modelName: string,
fallbackIcon: ({ size, className }: IconProps) => JSX.Element
): (({ size, className }: IconProps) => JSX.Element) => {
const lowerModelName = modelName?.toLowerCase();
for (const [key, icon] of Object.entries(modelIconMap)) {
if (lowerModelName?.includes(key)) {
return icon;
}
if (modelName?.toLowerCase().includes("amazon")) {
return AmazonIcon;
}
if (modelName?.toLowerCase().includes("phi")) {
return MicrosoftIconSVG;
}
if (modelName?.toLowerCase().includes("mistral")) {
return MistralIcon;
}
if (modelName?.toLowerCase().includes("llama")) {
return MetaIcon;
}
if (modelName?.toLowerCase().includes("gemini")) {
return GeminiIcon;
}
if (modelName?.toLowerCase().includes("claude")) {
return AnthropicIcon;
} else {
return fallbackIcon;
}
return fallbackIcon;
};

switch (providerName) {
@@ -1,6 +1,6 @@
"use client";

import { Form, Formik } from "formik";
import { ArrayHelpers, FieldArray, Form, Formik } from "formik";
import * as Yup from "yup";
import { PopupSpec } from "@/components/admin/connectors/Popup";
import {
@@ -10,14 +10,13 @@ import {
} from "./lib";
import { ConnectorStatus, DocumentSet, UserGroup, UserRole } from "@/lib/types";
import { TextFormField } from "@/components/admin/connectors/Field";
import { ConnectorTitle } from "@/components/admin/connectors/ConnectorTitle";
import { Separator } from "@/components/ui/separator";
import { Button } from "@/components/ui/button";
import { usePaidEnterpriseFeaturesEnabled } from "@/components/settings/usePaidEnterpriseFeaturesEnabled";
import { IsPublicGroupSelector } from "@/components/IsPublicGroupSelector";
import React, { useEffect, useState } from "react";
import { useUser } from "@/components/user/UserProvider";
import { ConnectorMultiSelect } from "@/components/ConnectorMultiSelect";
import { NonSelectableConnectors } from "@/components/NonSelectableConnectors";

interface SetCreationPopupProps {
ccPairs: ConnectorStatus<any, any>[];
@@ -46,7 +45,7 @@ export const DocumentSetCreationForm = ({
}, [existingDocumentSet?.is_public]);

return (
<div className="max-w-full mx-auto">
<div>
<Formik<DocumentSetCreationRequest>
initialValues={{
name: existingDocumentSet?.name ?? "",
@@ -105,122 +104,243 @@ export const DocumentSetCreationForm = ({
|
||||
}}
|
||||
>
|
||||
{(props) => {
|
||||
// Filter visible cc pairs for curator role
|
||||
const visibleCcPairs =
|
||||
user?.role === UserRole.CURATOR
|
||||
? localCcPairs.filter(
|
||||
(ccPair) =>
|
||||
ccPair.access_type === "public" ||
|
||||
(ccPair.groups.length > 0 &&
|
||||
props.values.groups.every((group) =>
|
||||
ccPair.groups.includes(group)
|
||||
))
|
||||
)
|
||||
: localCcPairs;
|
||||
|
||||
// Filter non-visible cc pairs for curator role
|
||||
const nonVisibleCcPairs =
|
||||
user?.role === UserRole.CURATOR
|
||||
? localCcPairs.filter(
|
||||
(ccPair) =>
|
||||
!(ccPair.access_type === "public") &&
|
||||
(ccPair.groups.length === 0 ||
|
||||
!props.values.groups.every((group) =>
|
||||
ccPair.groups.includes(group)
|
||||
))
|
||||
)
|
||||
: [];
|
||||
|
||||
// Deselect filtered out cc pairs
|
||||
if (user?.role === UserRole.CURATOR) {
|
||||
const visibleCcPairIds = visibleCcPairs.map(
|
||||
(ccPair) => ccPair.cc_pair_id
|
||||
);
|
||||
props.values.cc_pair_ids = props.values.cc_pair_ids.filter((id) =>
|
||||
visibleCcPairIds.includes(id)
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<Form className="space-y-6 w-full ">
|
||||
<div className="space-y-4 w-full">
|
||||
<TextFormField
|
||||
name="name"
|
||||
label="Name:"
|
||||
placeholder="A name for the document set"
|
||||
disabled={isUpdate}
|
||||
autoCompleteDisabled={true}
|
||||
/>
|
||||
<TextFormField
|
||||
name="description"
|
||||
label="Description:"
|
||||
placeholder="Describe what the document set represents"
|
||||
autoCompleteDisabled={true}
|
||||
optional={true}
|
||||
<Form>
|
||||
<TextFormField
|
||||
name="name"
|
||||
label="Name:"
|
||||
placeholder="A name for the document set"
|
||||
disabled={isUpdate}
|
||||
autoCompleteDisabled={true}
|
||||
/>
|
||||
<TextFormField
|
||||
name="description"
|
||||
label="Description:"
|
||||
placeholder="Describe what the document set represents"
|
||||
autoCompleteDisabled={true}
|
||||
optional={true}
|
||||
/>
|
||||
|
||||
{isPaidEnterpriseFeaturesEnabled && (
|
||||
<IsPublicGroupSelector
|
||||
formikProps={props}
|
||||
objectName="document set"
|
||||
/>
|
||||
)}
|
||||
|
||||
{isPaidEnterpriseFeaturesEnabled && (
|
||||
<IsPublicGroupSelector
|
||||
formikProps={props}
|
||||
objectName="document set"
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
<Separator />
|
||||
|
||||
<Separator className="my-6" />
|
||||
{user?.role === UserRole.CURATOR ? (
|
||||
<>
|
||||
<div className="flex flex-col gap-y-1">
|
||||
<h2 className="mb-1 font-medium text-base">
|
||||
These are the connectors available to{" "}
|
||||
{userGroups && userGroups.length > 1
|
||||
? "the selected group"
|
||||
: "the group you curate"}
|
||||
:
|
||||
</h2>
|
||||
|
||||
<div className="space-y-6">
|
||||
{user?.role === UserRole.CURATOR ? (
|
||||
<>
|
||||
<ConnectorMultiSelect
|
||||
<p className="mb-text-sm">
|
||||
All documents indexed by these selected connectors will be
|
||||
a part of this document set.
|
||||
</p>
|
||||
<FieldArray
|
||||
name="cc_pair_ids"
|
||||
label={`Connectors available to ${
|
||||
userGroups && userGroups.length > 1
|
||||
? "the selected group"
|
||||
: "the group you curate"
|
||||
}`}
|
||||
connectors={visibleCcPairs}
|
||||
selectedIds={props.values.cc_pair_ids}
|
||||
onChange={(selectedIds) => {
|
||||
props.setFieldValue("cc_pair_ids", selectedIds);
|
||||
render={(arrayHelpers: ArrayHelpers) => {
|
||||
// Filter visible cc pairs
|
||||
const visibleCcPairs = localCcPairs.filter(
|
||||
(ccPair) =>
|
||||
ccPair.access_type === "public" ||
|
||||
(ccPair.groups.length > 0 &&
|
||||
props.values.groups.every((group) =>
|
||||
ccPair.groups.includes(group)
|
||||
))
|
||||
);
|
||||
|
||||
// Deselect filtered out cc pairs
|
||||
const visibleCcPairIds = visibleCcPairs.map(
|
||||
(ccPair) => ccPair.cc_pair_id
|
||||
);
|
||||
props.values.cc_pair_ids =
|
||||
props.values.cc_pair_ids.filter((id) =>
|
||||
visibleCcPairIds.includes(id)
|
||||
);
|
||||
|
||||
return (
|
||||
<div className="mb-3 flex gap-2 flex-wrap">
|
||||
{visibleCcPairs.map((ccPair) => {
|
||||
const ind = props.values.cc_pair_ids.indexOf(
|
||||
ccPair.cc_pair_id
|
||||
);
|
||||
const isSelected = ind !== -1;
|
||||
return (
|
||||
<div
|
||||
key={`${ccPair.connector.id}-${ccPair.credential.id}`}
|
||||
className={
|
||||
`
|
||||
px-3
|
||||
py-1
|
||||
rounded-lg
|
||||
border
|
||||
border-border
|
||||
w-fit
|
||||
flex
|
||||
cursor-pointer ` +
|
||||
(isSelected
|
||||
? " bg-background-200"
|
||||
: " hover:bg-accent-background-hovered")
|
||||
}
|
||||
onClick={() => {
|
||||
if (isSelected) {
|
||||
arrayHelpers.remove(ind);
|
||||
} else {
|
||||
arrayHelpers.push(ccPair.cc_pair_id);
|
||||
}
|
||||
}}
|
||||
>
|
||||
<div className="my-auto">
|
||||
<ConnectorTitle
|
||||
connector={ccPair.connector}
|
||||
ccPairId={ccPair.cc_pair_id}
|
||||
ccPairName={ccPair.name}
|
||||
isLink={false}
|
||||
showMetadata={false}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
);
|
||||
}}
|
||||
placeholder="Search for connectors..."
|
||||
/>
|
||||
</div>
|
||||
|
||||
<NonSelectableConnectors
|
||||
connectors={nonVisibleCcPairs}
|
||||
title={`Connectors not available to the ${
|
||||
userGroups && userGroups.length > 1
|
||||
? `group${
|
||||
props.values.groups.length > 1 ? "s" : ""
|
||||
} you have selected`
|
||||
: "group you curate"
|
||||
}`}
|
||||
description="Only connectors that are directly assigned to the group you are trying to add the document set to will be available."
|
||||
<div>
|
||||
<FieldArray
|
||||
name="cc_pair_ids"
|
||||
render={() => {
|
||||
// Filter non-visible cc pairs
|
||||
const nonVisibleCcPairs = localCcPairs.filter(
|
||||
(ccPair) =>
|
||||
!(ccPair.access_type === "public") &&
|
||||
(ccPair.groups.length === 0 ||
|
||||
!props.values.groups.every((group) =>
|
||||
ccPair.groups.includes(group)
|
||||
))
|
||||
);
|
||||
|
||||
return nonVisibleCcPairs.length > 0 ? (
|
||||
<>
|
||||
<Separator />
|
||||
<h2 className="mb-1 font-medium text-base">
|
||||
These connectors are not available to the{" "}
|
||||
{userGroups && userGroups.length > 1
|
||||
? `group${
|
||||
props.values.groups.length > 1 ? "s" : ""
|
||||
} you have selected`
|
||||
: "group you curate"}
|
||||
:
|
||||
</h2>
|
||||
<p className="mb-3 text-sm">
|
||||
Only connectors that are directly assigned to the
|
||||
group you are trying to add the document set to
|
||||
will be available.
|
||||
</p>
|
||||
<div className="mb-3 flex gap-2 flex-wrap">
|
||||
{nonVisibleCcPairs.map((ccPair) => (
|
||||
<div
|
||||
key={`${ccPair.connector.id}-${ccPair.credential.id}`}
|
||||
className="px-3 py-1 rounded-lg border border-non-selectable-border w-fit flex cursor-not-allowed"
|
||||
>
|
||||
<div className="my-auto">
|
||||
<ConnectorTitle
|
||||
connector={ccPair.connector}
|
||||
ccPairId={ccPair.cc_pair_id}
|
||||
ccPairName={ccPair.name}
|
||||
isLink={false}
|
||||
showMetadata={false}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</>
|
||||
) : null;
|
||||
}}
|
||||
/>
|
||||
</>
|
||||
) : (
|
||||
<ConnectorMultiSelect
|
||||
</div>
|
||||
</>
|
||||
) : (
|
||||
<div>
|
||||
<h2 className="mb-1 font-medium text-base">
|
||||
Pick your connectors:
|
||||
</h2>
|
||||
<p className="mb-3 text-xs">
|
||||
All documents indexed by the selected connectors will be a
|
||||
part of this document set.
|
||||
</p>
|
||||
<FieldArray
|
||||
name="cc_pair_ids"
|
||||
label="Pick your connectors"
|
||||
connectors={visibleCcPairs}
|
||||
selectedIds={props.values.cc_pair_ids}
|
||||
onChange={(selectedIds) => {
|
||||
props.setFieldValue("cc_pair_ids", selectedIds);
|
||||
}}
|
||||
placeholder="Search for connectors..."
|
||||
render={(arrayHelpers: ArrayHelpers) => (
|
||||
<div className="mb-3 flex gap-2 flex-wrap">
|
||||
{ccPairs.map((ccPair) => {
|
||||
const ind = props.values.cc_pair_ids.indexOf(
|
||||
ccPair.cc_pair_id
|
||||
);
|
||||
const isSelected = ind !== -1;
|
||||
return (
|
||||
<div
|
||||
key={`${ccPair.connector.id}-${ccPair.credential.id}`}
|
||||
className={
|
||||
`
|
||||
px-3
|
||||
py-1
|
||||
rounded-lg
|
||||
border
|
||||
border-border
|
||||
w-fit
|
||||
flex
|
||||
cursor-pointer ` +
|
||||
(isSelected
|
||||
? " bg-background-200"
|
||||
: " hover:bg-accent-background-hovered")
|
||||
}
|
||||
onClick={() => {
|
||||
if (isSelected) {
|
||||
arrayHelpers.remove(ind);
|
||||
} else {
|
||||
arrayHelpers.push(ccPair.cc_pair_id);
|
||||
}
|
||||
}}
|
||||
>
|
||||
<div className="my-auto">
|
||||
<ConnectorTitle
|
||||
connector={ccPair.connector}
|
||||
ccPairId={ccPair.cc_pair_id}
|
||||
ccPairName={ccPair.name}
|
||||
isLink={false}
|
||||
showMetadata={false}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
)}
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
<div className="flex mt-6 pt-4 border-t border-neutral-200">
|
||||
<div className="flex mt-6">
|
||||
<Button
|
||||
type="submit"
|
||||
variant="submit"
|
||||
disabled={props.isSubmitting}
|
||||
className="w-56 mx-auto py-1.5 h-auto text-sm"
|
||||
className="w-64 mx-auto"
|
||||
>
|
||||
{isUpdate ? "Update Document Set" : "Create Document Set"}
|
||||
{isUpdate ? "Update!" : "Create!"}
|
||||
</Button>
|
||||
</div>
|
||||
</Form>
|
||||
|
||||
@@ -15,7 +15,6 @@ import {
} from "./interfaces";
import { FiExternalLink } from "react-icons/fi";
import {
AmazonIcon,
CohereIcon,
LiteLLMIcon,
MixedBreadIcon,
@@ -243,11 +242,6 @@ const RerankingDetailsForm = forwardRef<
card.rerank_provider_type == RerankerProvider.COHERE
) {
setIsApiKeyModalOpen(true);
} else if (
card.rerank_provider_type ==
RerankerProvider.BEDROCK
) {
setIsApiKeyModalOpen(true);
} else if (
card.rerank_provider_type ==
RerankerProvider.LITELLM
@@ -284,9 +278,6 @@ const RerankingDetailsForm = forwardRef<
) : card.rerank_provider_type ===
RerankerProvider.COHERE ? (
<CohereIcon size={24} className="mr-2" />
) : card.rerank_provider_type ===
RerankerProvider.BEDROCK ? (
<AmazonIcon size={24} className="mr-2" />
) : (
<MixedBreadIcon size={24} className="mr-2" />
)}
@@ -446,10 +437,7 @@ const RerankingDetailsForm = forwardRef<
placeholder={
values.rerank_api_key
? "*".repeat(values.rerank_api_key.length)
: values.rerank_provider_type ===
RerankerProvider.BEDROCK
? "aws_ACCESSKEY_SECRETKEY_REGION"
: "Enter your API key"
: undefined
}
onChange={(e: React.ChangeEvent<HTMLInputElement>) => {
const value = e.target.value;
@@ -460,12 +448,7 @@ const RerankingDetailsForm = forwardRef<
setFieldValue("api_key", value);
}}
type="password"
label={
values.rerank_provider_type ===
RerankerProvider.BEDROCK
? "AWS Credentials in format: aws_ACCESSKEY_SECRETKEY_REGION"
: "Cohere API Key"
}
label="Cohere API Key"
name="rerank_api_key"
/>
<div className="flex w-full justify-end mt-4">
@@ -18,7 +18,6 @@ export interface RerankingDetails {
export enum RerankerProvider {
COHERE = "cohere",
LITELLM = "litellm",
BEDROCK = "bedrock",
}

export enum EmbeddingPrecision {
@@ -101,15 +100,6 @@ export const rerankingModels: RerankingModel[] = [
description: "Powerful multilingual reranking model.",
link: "https://docs.cohere.com/v2/reference/rerank",
},
{
cloud: true,
rerank_provider_type: RerankerProvider.BEDROCK,
modelName: "cohere.rerank-v3-5:0",
displayName: "Cohere Rerank 3.5",
description:
"Powerful multilingual reranking model invoked through AWS Bedrock.",
link: "https://aws.amazon.com/blogs/machine-learning/cohere-rerank-3-5-is-now-available-in-amazon-bedrock-through-rerank-api",
},
];

export const getCurrentModelCopy = (
@@ -26,8 +26,6 @@ import {
FiUnlock,
FiRefreshCw,
FiPauseCircle,
FiFilter,
FiX,
} from "react-icons/fi";
import {
Tooltip,
@@ -43,7 +41,6 @@ import Cookies from "js-cookie";
import { TOGGLED_CONNECTORS_COOKIE_NAME } from "@/lib/constants";
import { usePaidEnterpriseFeaturesEnabled } from "@/components/settings/usePaidEnterpriseFeaturesEnabled";
import { ConnectorCredentialPairStatus } from "../../connector/[ccPairId]/types";
import { FilterComponent, FilterOptions } from "./FilterComponent";

function SummaryRow({
source,
@@ -288,26 +285,7 @@ export function CCPairIndexingStatusTable({
|
||||
return savedState ? JSON.parse(savedState) : {};
|
||||
});
|
||||
|
||||
const [filterOptions, setFilterOptions] = useState<FilterOptions>({
|
||||
accessType: null,
|
||||
docsCountFilter: {
|
||||
operator: null,
|
||||
value: null,
|
||||
},
|
||||
lastStatus: null,
|
||||
});
|
||||
|
||||
// Reference to the FilterComponent for resetting its state
|
||||
const filterComponentRef = useRef<{
|
||||
resetFilters: () => void;
|
||||
} | null>(null);
|
||||
|
||||
const {
|
||||
groupedStatuses,
|
||||
sortedSources,
|
||||
groupSummaries,
|
||||
filteredGroupedStatuses,
|
||||
} = useMemo(() => {
|
||||
const { groupedStatuses, sortedSources, groupSummaries } = useMemo(() => {
|
||||
const grouped: Record<ValidSources, ConnectorIndexingStatus<any, any>[]> =
|
||||
{} as Record<ValidSources, ConnectorIndexingStatus<any, any>[]>;
|
||||
|
||||
@@ -359,139 +337,12 @@ export function CCPairIndexingStatusTable({
|
||||
};
|
||||
});
|
||||
|
||||
// Apply filters to create filtered grouped statuses
|
||||
const filteredGrouped: Record<
|
||||
ValidSources,
|
||||
ConnectorIndexingStatus<any, any>[]
|
||||
> = {} as Record<ValidSources, ConnectorIndexingStatus<any, any>[]>;
|
||||
|
||||
sorted.forEach((source) => {
|
||||
const statuses = grouped[source];
|
||||
|
||||
// Apply filters
|
||||
const filteredStatuses = statuses.filter((status) => {
|
||||
// Filter by access type
|
||||
if (filterOptions.accessType && filterOptions.accessType.length > 0) {
|
||||
if (!filterOptions.accessType.includes(status.access_type)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Filter by last status
|
||||
if (filterOptions.lastStatus && filterOptions.lastStatus.length > 0) {
|
||||
if (
|
||||
!filterOptions.lastStatus.includes(
|
||||
status.last_finished_status as any
|
||||
)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Filter by docs count
|
||||
if (filterOptions.docsCountFilter.operator) {
|
||||
const { operator, value } = filterOptions.docsCountFilter;
|
||||
|
||||
// If only operator is selected (no value), show all
|
||||
if (value === null) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (operator === ">" && !(status.docs_indexed > value)) {
|
||||
return false;
|
||||
} else if (operator === "<" && !(status.docs_indexed < value)) {
|
||||
return false;
|
||||
} else if (operator === "=" && status.docs_indexed !== value) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
});
|
||||
|
||||
if (filteredStatuses.length > 0) {
|
||||
filteredGrouped[source] = filteredStatuses;
|
||||
}
|
||||
});
|
||||
|
||||
return {
|
||||
groupedStatuses: grouped,
|
||||
sortedSources: sorted,
|
||||
groupSummaries: summaries,
|
||||
filteredGroupedStatuses: filteredGrouped,
|
||||
};
|
||||
}, [ccPairsIndexingStatuses, editableCcPairsIndexingStatuses, filterOptions]);
|
||||
|
||||
// Determine which sources to display based on filters and search
|
||||
const displaySources = useMemo(() => {
|
||||
const hasActiveFilters =
|
||||
(filterOptions.accessType && filterOptions.accessType.length > 0) ||
|
||||
(filterOptions.lastStatus && filterOptions.lastStatus.length > 0) ||
|
||||
filterOptions.docsCountFilter.operator !== null;
|
||||
|
||||
if (hasActiveFilters) {
|
||||
return Object.keys(filteredGroupedStatuses) as ValidSources[];
|
||||
}
|
||||
|
||||
return sortedSources;
|
||||
}, [sortedSources, filteredGroupedStatuses, filterOptions]);
|
||||
|
||||
const handleFilterChange = (newFilters: FilterOptions) => {
|
||||
setFilterOptions(newFilters);
|
||||
|
||||
// Auto-expand sources when filters are applied
|
||||
if (
|
||||
(newFilters.accessType && newFilters.accessType.length > 0) ||
|
||||
(newFilters.lastStatus && newFilters.lastStatus.length > 0) ||
|
||||
newFilters.docsCountFilter.operator !== null
|
||||
) {
|
||||
// We need to wait for the filteredGroupedStatuses to be updated
|
||||
// before we can expand the sources
|
||||
setTimeout(() => {
|
||||
const sourcesToExpand = Object.keys(
|
||||
filteredGroupedStatuses
|
||||
) as ValidSources[];
|
||||
const newConnectorsToggled = { ...connectorsToggled };
|
||||
|
||||
sourcesToExpand.forEach((source) => {
|
||||
newConnectorsToggled[source] = true;
|
||||
});
|
||||
|
||||
setConnectorsToggled(newConnectorsToggled);
|
||||
Cookies.set(
|
||||
TOGGLED_CONNECTORS_COOKIE_NAME,
|
||||
JSON.stringify(newConnectorsToggled)
|
||||
);
|
||||
}, 0);
|
||||
}
|
||||
};
|
||||
|
||||
const clearAllFilters = () => {
|
||||
const emptyFilters: FilterOptions = {
|
||||
accessType: null,
|
||||
docsCountFilter: {
|
||||
operator: null,
|
||||
value: null,
|
||||
},
|
||||
lastStatus: null,
|
||||
};
|
||||
|
||||
setFilterOptions(emptyFilters);
|
||||
|
||||
// Reset the FilterComponent's internal state
|
||||
if (filterComponentRef.current) {
|
||||
filterComponentRef.current.resetFilters();
|
||||
}
|
||||
};
|
||||
|
||||
// Check if filters are active
|
||||
const hasActiveFilters = useMemo(() => {
|
||||
return (
|
||||
(filterOptions.accessType && filterOptions.accessType.length > 0) ||
|
||||
(filterOptions.lastStatus && filterOptions.lastStatus.length > 0) ||
|
||||
filterOptions.docsCountFilter.operator !== null
|
||||
);
|
||||
}, [filterOptions]);
|
||||
}, [ccPairsIndexingStatuses, editableCcPairsIndexingStatuses]);
|
||||
|
||||
const toggleSource = (
|
||||
source: ValidSources,
|
||||
@@ -525,194 +376,127 @@ export function CCPairIndexingStatusTable({
|
||||
sortedSources.length;
|
||||
|
||||
return (
|
||||
<>
|
||||
<Table>
|
||||
<TableHeader>
|
||||
<ConnectorRow
|
||||
invisible
|
||||
ccPairsIndexingStatus={{
|
||||
cc_pair_id: 1,
|
||||
<Table>
|
||||
<TableHeader>
|
||||
<ConnectorRow
|
||||
invisible
|
||||
ccPairsIndexingStatus={{
|
||||
cc_pair_id: 1,
|
||||
name: "Sample File Connector",
|
||||
cc_pair_status: ConnectorCredentialPairStatus.ACTIVE,
|
||||
last_status: "success",
|
||||
connector: {
|
||||
name: "Sample File Connector",
|
||||
cc_pair_status: ConnectorCredentialPairStatus.ACTIVE,
|
||||
last_status: "success",
|
||||
connector: {
|
||||
name: "Sample File Connector",
|
||||
source: ValidSources.File,
|
||||
input_type: "poll",
|
||||
connector_specific_config: {
|
||||
file_locations: ["/path/to/sample/file.txt"],
|
||||
},
|
||||
refresh_freq: 86400,
|
||||
prune_freq: null,
|
||||
indexing_start: new Date("2023-07-01T12:00:00Z"),
|
||||
id: 1,
|
||||
credential_ids: [],
|
||||
access_type: "public",
|
||||
time_created: "2023-07-01T12:00:00Z",
|
||||
time_updated: "2023-07-01T12:00:00Z",
|
||||
},
|
||||
credential: {
|
||||
id: 1,
|
||||
name: "Sample Credential",
|
||||
source: ValidSources.File,
|
||||
user_id: "1",
|
||||
time_created: "2023-07-01T12:00:00Z",
|
||||
time_updated: "2023-07-01T12:00:00Z",
|
||||
credential_json: {},
|
||||
admin_public: false,
|
||||
source: ValidSources.File,
|
||||
input_type: "poll",
|
||||
connector_specific_config: {
|
||||
file_locations: ["/path/to/sample/file.txt"],
|
||||
},
|
||||
refresh_freq: 86400,
|
||||
prune_freq: null,
|
||||
indexing_start: new Date("2023-07-01T12:00:00Z"),
|
||||
id: 1,
|
||||
credential_ids: [],
|
||||
access_type: "public",
|
||||
docs_indexed: 1000,
|
||||
last_success: "2023-07-01T12:00:00Z",
|
||||
last_finished_status: "success",
|
||||
latest_index_attempt: null,
|
||||
groups: [], // Add this line
|
||||
}}
|
||||
isEditable={false}
|
||||
/>
|
||||
</TableHeader>
|
||||
<div className="flex -mt-12 items-center w-0 m4 gap-x-2">
|
||||
<input
|
||||
type="text"
|
||||
ref={searchInputRef}
|
||||
placeholder="Search connectors..."
|
||||
value={searchTerm}
|
||||
onChange={(e) => setSearchTerm(e.target.value)}
|
||||
className="ml-1 w-96 h-9 border border-border flex-none rounded-md bg-background-50 px-3 py-1 text-sm shadow-sm transition-colors placeholder:text-muted-foreground focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring"
|
||||
/>
|
||||
time_created: "2023-07-01T12:00:00Z",
|
||||
time_updated: "2023-07-01T12:00:00Z",
|
||||
},
|
||||
credential: {
|
||||
id: 1,
|
||||
name: "Sample Credential",
|
||||
source: ValidSources.File,
|
||||
user_id: "1",
|
||||
time_created: "2023-07-01T12:00:00Z",
|
||||
time_updated: "2023-07-01T12:00:00Z",
|
||||
credential_json: {},
|
||||
admin_public: false,
|
||||
},
|
||||
access_type: "public",
|
||||
docs_indexed: 1000,
|
||||
last_success: "2023-07-01T12:00:00Z",
|
||||
last_finished_status: "success",
|
||||
latest_index_attempt: null,
|
||||
groups: [], // Add this line
|
||||
}}
|
||||
isEditable={false}
|
||||
/>
|
||||
</TableHeader>
|
||||
<div className="flex -mt-12 items-center w-0 m4 gap-x-2">
|
||||
<input
|
||||
type="text"
|
||||
ref={searchInputRef}
|
||||
placeholder="Search connectors..."
|
||||
value={searchTerm}
|
||||
onChange={(e) => setSearchTerm(e.target.value)}
|
||||
className="ml-1 w-96 h-9 border border-border flex-none rounded-md bg-background-50 px-3 py-1 text-sm shadow-sm transition-colors placeholder:text-muted-foreground focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring"
|
||||
/>
|
||||
|
||||
<Button className="h-9" onClick={() => toggleSources()}>
|
||||
{!shouldExpand ? "Collapse All" : "Expand All"}
|
||||
</Button>
|
||||
|
||||
<div className="flex items-center gap-2">
|
||||
<FilterComponent
|
||||
onFilterChange={handleFilterChange}
|
||||
ref={filterComponentRef}
|
||||
/>
|
||||
|
||||
{hasActiveFilters && (
|
||||
<div className="flex flex-none items-center gap-1 ml-2 max-w-[500px]">
|
||||
{filterOptions.accessType &&
|
||||
filterOptions.accessType.length > 0 && (
|
||||
<Badge variant="secondary" className="px-2 py-0.5 text-xs">
|
||||
Access: {filterOptions.accessType.join(", ")}
|
||||
</Badge>
|
||||
)}
|
||||
|
||||
{filterOptions.lastStatus &&
|
||||
filterOptions.lastStatus.length > 0 && (
|
||||
<Badge variant="secondary" className="px-2 py-0.5 text-xs">
|
||||
Status:{" "}
|
||||
{filterOptions.lastStatus
|
||||
.map((s) => s.replace(/_/g, " "))
|
||||
.join(", ")}
|
||||
</Badge>
|
||||
)}
|
||||
|
||||
{filterOptions.docsCountFilter.operator &&
|
||||
filterOptions.docsCountFilter.value !== null && (
|
||||
<Badge variant="secondary" className="px-2 py-0.5 text-xs">
|
||||
Docs {filterOptions.docsCountFilter.operator}{" "}
|
||||
{filterOptions.docsCountFilter.value}
|
||||
</Badge>
|
||||
)}
|
||||
|
||||
{filterOptions.docsCountFilter.operator &&
|
||||
filterOptions.docsCountFilter.value === null && (
|
||||
<Badge variant="secondary" className="px-2 py-0.5 text-xs">
|
||||
Docs {filterOptions.docsCountFilter.operator} any
|
||||
</Badge>
|
||||
)}
|
||||
|
||||
<Badge
|
||||
variant="outline"
|
||||
className="px-2 py-0.5 text-xs border-red-400 bg-red-100 hover:border-red-600 cursor-pointer hover:bg-red-100 dark:hover:bg-red-900"
|
||||
onClick={() => {
|
||||
if (filterComponentRef.current) {
|
||||
filterComponentRef.current.resetFilters();
|
||||
setFilterOptions({
|
||||
accessType: null,
|
||||
docsCountFilter: {
|
||||
operator: null,
|
||||
value: null,
|
||||
},
|
||||
lastStatus: null,
|
||||
});
|
||||
}
|
||||
}}
|
||||
>
|
||||
<span className="text-red-500 dark:text-red-400">Clear</span>
|
||||
</Badge>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
<TableBody>
|
||||
{displaySources
|
||||
.filter(
|
||||
(source) =>
|
||||
source != "not_applicable" && source != "ingestion_api"
|
||||
)
|
||||
.map((source, ind) => {
|
||||
const sourceMatches = source
|
||||
.toLowerCase()
|
||||
.includes(searchTerm.toLowerCase());
|
||||
|
||||
const statuses =
|
||||
filteredGroupedStatuses[source] || groupedStatuses[source];
|
||||
|
||||
const matchingConnectors = statuses.filter((status) =>
|
||||
<Button className="h-9" onClick={() => toggleSources()}>
|
||||
{!shouldExpand ? "Collapse All" : "Expand All"}
|
||||
</Button>
|
||||
</div>
|
||||
<TableBody>
|
||||
{sortedSources
|
||||
.filter(
|
||||
(source) => source != "not_applicable" && source != "ingestion_api"
|
||||
)
|
||||
.map((source, ind) => {
|
||||
const sourceMatches = source
|
||||
.toLowerCase()
|
||||
.includes(searchTerm.toLowerCase());
|
||||
const matchingConnectors = groupedStatuses[source].filter(
|
||||
(status) =>
|
||||
(status.name || "")
|
||||
.toLowerCase()
|
||||
.includes(searchTerm.toLowerCase())
|
||||
);
|
||||
|
||||
if (sourceMatches || matchingConnectors.length > 0) {
|
||||
return (
|
||||
<React.Fragment key={ind}>
|
||||
<br className="mt-4" />
|
||||
<SummaryRow
|
||||
source={source}
|
||||
summary={groupSummaries[source]}
|
||||
isOpen={connectorsToggled[source] || false}
|
||||
onToggle={() => toggleSource(source)}
|
||||
/>
|
||||
{connectorsToggled[source] && (
|
||||
<>
|
||||
<TableRow className="border border-border dark:border-neutral-700">
|
||||
<TableHead>Name</TableHead>
|
||||
<TableHead>Last Indexed</TableHead>
|
||||
<TableHead>Activity</TableHead>
|
||||
{isPaidEnterpriseFeaturesEnabled && (
|
||||
<TableHead>Permissions</TableHead>
|
||||
)}
|
||||
<TableHead>Total Docs</TableHead>
|
||||
<TableHead>Last Status</TableHead>
|
||||
<TableHead></TableHead>
|
||||
</TableRow>
|
||||
{(sourceMatches ? statuses : matchingConnectors).map(
|
||||
(ccPairsIndexingStatus) => (
|
||||
<ConnectorRow
|
||||
key={ccPairsIndexingStatus.cc_pair_id}
|
||||
ccPairsIndexingStatus={ccPairsIndexingStatus}
|
||||
isEditable={editableCcPairsIndexingStatuses.some(
|
||||
(e) =>
|
||||
e.cc_pair_id ===
|
||||
ccPairsIndexingStatus.cc_pair_id
|
||||
)}
|
||||
/>
|
||||
)
|
||||
);
|
||||
if (sourceMatches || matchingConnectors.length > 0) {
|
||||
return (
|
||||
<React.Fragment key={ind}>
|
||||
<br className="mt-4" />
|
||||
<SummaryRow
|
||||
source={source}
|
||||
summary={groupSummaries[source]}
|
||||
isOpen={connectorsToggled[source] || false}
|
||||
onToggle={() => toggleSource(source)}
|
||||
/>
|
||||
{connectorsToggled[source] && (
|
||||
<>
|
||||
<TableRow
|
||||
noHover
|
||||
className="border ! border-border dark:border-neutral-700"
|
||||
>
|
||||
<TableHead>Name</TableHead>
|
||||
<TableHead>Last Indexed</TableHead>
|
||||
<TableHead>Activity</TableHead>
|
||||
{isPaidEnterpriseFeaturesEnabled && (
|
||||
<TableHead>Permissions</TableHead>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</React.Fragment>
|
||||
);
|
||||
}
|
||||
return null;
|
||||
})}
|
||||
</TableBody>
|
||||
</Table>
|
||||
</>
|
||||
<TableHead>Total Docs</TableHead>
|
||||
<TableHead>Last Status</TableHead>
|
||||
<TableHead></TableHead>
|
||||
</TableRow>
|
||||
{(sourceMatches
|
||||
? groupedStatuses[source]
|
||||
: matchingConnectors
|
||||
).map((ccPairsIndexingStatus) => (
|
||||
<ConnectorRow
|
||||
key={ccPairsIndexingStatus.cc_pair_id}
|
||||
ccPairsIndexingStatus={ccPairsIndexingStatus}
|
||||
isEditable={editableCcPairsIndexingStatuses.some(
|
||||
(e) =>
|
||||
e.cc_pair_id === ccPairsIndexingStatus.cc_pair_id
|
||||
)}
|
||||
/>
|
||||
))}
|
||||
</>
|
||||
)}
|
||||
</React.Fragment>
|
||||
);
|
||||
}
|
||||
return null;
|
||||
})}
|
||||
</TableBody>
|
||||
</Table>
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1,375 +0,0 @@
|
||||
"use client";
|
||||
|
||||
import React, { useState, useImperativeHandle, forwardRef } from "react";
|
||||
import {
|
||||
DropdownMenu,
|
||||
DropdownMenuContent,
|
||||
DropdownMenuGroup,
|
||||
DropdownMenuItem,
|
||||
DropdownMenuLabel,
|
||||
DropdownMenuSeparator,
|
||||
DropdownMenuTrigger,
|
||||
DropdownMenuCheckboxItem,
|
||||
DropdownMenuRadioGroup,
|
||||
DropdownMenuRadioItem,
|
||||
} from "@/components/ui/dropdown-menu";
|
||||
import { SortIcon } from "@/components/icons/icons";
|
||||
import { Button } from "@/components/ui/button";
|
||||
import { Input } from "@/components/ui/input";
|
||||
import { Badge } from "@/components/ui/badge";
|
||||
import { AccessType, ValidStatuses } from "@/lib/types";
|
||||
import { FiFilter, FiX, FiCheck } from "react-icons/fi";
|
||||
|
||||
export interface FilterOptions {
|
||||
accessType: AccessType[] | null;
|
||||
docsCountFilter: {
|
||||
operator: ">" | "<" | "=" | null;
|
||||
value: number | null;
|
||||
};
|
||||
lastStatus: ValidStatuses[] | null;
|
||||
}
|
||||
|
||||
interface FilterComponentProps {
|
||||
onFilterChange: (filters: FilterOptions) => void;
|
||||
}
|
||||
|
||||
export const FilterComponent = forwardRef<
{ resetFilters: () => void },
FilterComponentProps
>(({ onFilterChange }, ref) => {
const [isOpen, setIsOpen] = useState(false);
const [filters, setFilters] = useState<FilterOptions>({
accessType: null,
docsCountFilter: {
operator: null,
value: null,
},
lastStatus: null,
});

// Local state for tracking selected filters before applying
const [docsOperator, setDocsOperator] = useState<">" | "<" | "=" | null>(
null
);
const [docsValue, setDocsValue] = useState<string>("");
const [selectedAccessTypes, setSelectedAccessTypes] = useState<AccessType[]>(
[]
);
const [selectedStatuses, setSelectedStatuses] = useState<ValidStatuses[]>([]);

// Expose resetFilters method via ref
useImperativeHandle(ref, () => ({
resetFilters: () => {
setDocsOperator(null);
setDocsValue("");
setSelectedAccessTypes([]);
setSelectedStatuses([]);
setFilters({
accessType: null,
docsCountFilter: {
operator: null,
value: null,
},
lastStatus: null,
});
},
}));

const handleAccessTypeChange = (accessType: AccessType) => {
const newAccessTypes = selectedAccessTypes.includes(accessType)
? selectedAccessTypes.filter((type) => type !== accessType)
: [...selectedAccessTypes, accessType];

setSelectedAccessTypes(newAccessTypes);
};

const handleStatusChange = (status: ValidStatuses) => {
const newStatuses = selectedStatuses.includes(status)
? selectedStatuses.filter((s) => s !== status)
: [...selectedStatuses, status];

setSelectedStatuses(newStatuses);
};

const handleDocsFilterChange = () => {
if (docsOperator && docsValue) {
const newFilters = {
...filters,
accessType: selectedAccessTypes.length > 0 ? selectedAccessTypes : null,
lastStatus: selectedStatuses.length > 0 ? selectedStatuses : null,
docsCountFilter: {
operator: docsOperator,
value: parseInt(docsValue),
},
};

setFilters(newFilters);
onFilterChange(newFilters);
setIsOpen(false);
}
};

const applyFilters = () => {
const newFilters = {
...filters,
accessType: selectedAccessTypes.length > 0 ? selectedAccessTypes : null,
lastStatus: selectedStatuses.length > 0 ? selectedStatuses : null,
docsCountFilter: {
operator: docsOperator,
value: docsValue ? parseInt(docsValue) : null,
},
};

setFilters(newFilters);
onFilterChange(newFilters);
setIsOpen(false);
};

const clearFilters = () => {
setSelectedAccessTypes([]);
setSelectedStatuses([]);
setDocsOperator(null);
setDocsValue("");

const newFilters = {
accessType: null,
docsCountFilter: {
operator: null,
value: null,
},
lastStatus: null,
};

setFilters(newFilters);
onFilterChange(newFilters);
};

// Sync local state with filters when dropdown opens
const handleOpenChange = (open: boolean) => {
if (open) {
// When opening, initialize local state from current filters
setSelectedAccessTypes(filters.accessType || []);
setSelectedStatuses(filters.lastStatus || []);
setDocsOperator(filters.docsCountFilter.operator);
setDocsValue(
filters.docsCountFilter.value !== null
? filters.docsCountFilter.value.toString()
: ""
);
}
setIsOpen(open);
};

const hasActiveFilters =
(filters.accessType && filters.accessType.length > 0) ||
(filters.lastStatus && filters.lastStatus.length > 0) ||
filters.docsCountFilter.operator !== null;

// Get active filter count for badge
const getActiveFilterCount = () => {
let count = 0;
if (filters.accessType && filters.accessType.length > 0) count++;
if (filters.lastStatus && filters.lastStatus.length > 0) count++;
if (filters.docsCountFilter.operator !== null) count++;
return count;
};

return (
<div className="relative">
<DropdownMenu open={isOpen} onOpenChange={handleOpenChange}>
<DropdownMenuTrigger asChild>
<Button
variant="outline"
size="sm"
className={`p-2 h-9 ${
hasActiveFilters ? "border-primary bg-primary/5" : ""
}`}
>
<SortIcon size={20} className="text-neutral-800" />
</Button>
</DropdownMenuTrigger>
<DropdownMenuContent
align="end"
className="w-72"
onCloseAutoFocus={(e) => e.preventDefault()}
>
<div className="flex items-center justify-between px-2 py-1.5">
<DropdownMenuLabel className="text-base font-medium">
Filter Connectors
</DropdownMenuLabel>
</div>
<DropdownMenuSeparator />

<DropdownMenuGroup>
<DropdownMenuLabel className="px-2 py-1.5 text-xs text-muted-foreground">
Access Type
</DropdownMenuLabel>
<div onClick={(e) => e.stopPropagation()}>
<DropdownMenuCheckboxItem
checked={selectedAccessTypes.includes("public")}
onCheckedChange={() => handleAccessTypeChange("public")}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
Public
</DropdownMenuCheckboxItem>
<DropdownMenuCheckboxItem
checked={selectedAccessTypes.includes("private")}
onCheckedChange={() => handleAccessTypeChange("private")}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
Private
</DropdownMenuCheckboxItem>
<DropdownMenuCheckboxItem
checked={selectedAccessTypes.includes("sync")}
onCheckedChange={() => handleAccessTypeChange("sync")}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
Auto-Sync
</DropdownMenuCheckboxItem>
</div>
</DropdownMenuGroup>

<DropdownMenuSeparator />

<DropdownMenuGroup>
<DropdownMenuLabel className="px-2 py-1.5 text-xs text-muted-foreground">
Last Status
</DropdownMenuLabel>
<div onClick={(e) => e.stopPropagation()}>
<DropdownMenuCheckboxItem
checked={selectedStatuses.includes("success")}
onCheckedChange={() => handleStatusChange("success")}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
Success
</DropdownMenuCheckboxItem>
<DropdownMenuCheckboxItem
checked={selectedStatuses.includes("failed")}
onCheckedChange={() => handleStatusChange("failed")}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
Failed
</DropdownMenuCheckboxItem>
<DropdownMenuCheckboxItem
checked={selectedStatuses.includes("in_progress")}
onCheckedChange={() => handleStatusChange("in_progress")}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
In Progress
</DropdownMenuCheckboxItem>
<DropdownMenuCheckboxItem
checked={selectedStatuses.includes("not_started")}
onCheckedChange={() => handleStatusChange("not_started")}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
Not Started
</DropdownMenuCheckboxItem>
<DropdownMenuCheckboxItem
checked={selectedStatuses.includes("completed_with_errors")}
onCheckedChange={() =>
handleStatusChange("completed_with_errors")
}
className="flex items-center justify-between"
onSelect={(e) => e.preventDefault()}
>
Completed with Errors
</DropdownMenuCheckboxItem>
</div>
</DropdownMenuGroup>

<DropdownMenuSeparator />

<DropdownMenuGroup>
<DropdownMenuLabel className="px-2 py-1.5 text-xs text-muted-foreground">
Document Count
</DropdownMenuLabel>
<div
className="flex items-center px-2 py-2 gap-2"
onClick={(e) => e.stopPropagation()}
>
<div className="flex gap-2">
<Button
variant={docsOperator === ">" ? "default" : "outline"}
size="sm"
className="h-8 px-2"
onClick={(e) => {
e.preventDefault();
e.stopPropagation();
setDocsOperator(docsOperator === ">" ? null : ">");
}}
type="button"
>
>
</Button>
<Button
variant={docsOperator === "<" ? "default" : "outline"}
size="sm"
className="h-8 px-2"
onClick={(e) => {
e.preventDefault();
e.stopPropagation();
setDocsOperator(docsOperator === "<" ? null : "<");
}}
type="button"
>
<
</Button>
<Button
variant={docsOperator === "=" ? "default" : "outline"}
size="sm"
className="h-8 px-2"
onClick={(e) => {
e.preventDefault();
e.stopPropagation();
setDocsOperator(docsOperator === "=" ? null : "=");
}}
type="button"
>
=
</Button>
</div>
<Input
type="number"
placeholder="Count"
value={docsValue}
onChange={(e) => setDocsValue(e.target.value)}
className="h-8 w-full"
onClick={(e) => e.stopPropagation()}
/>
</div>
<div className="px-2 py-1.5">
<Button
size="sm"
className="w-full h-8"
disabled={false}
onClick={(e) => {
e.preventDefault();
e.stopPropagation();
applyFilters();
}}
type="button"
>
Apply
</Button>
</div>
</DropdownMenuGroup>
</DropdownMenuContent>
</DropdownMenu>

{hasActiveFilters && (
<div className="absolute -top-1 -right-1">
<Badge className="h-2 bg-red-400 border-red-400 w-2 p-0 border-2 flex items-center justify-center" />
</div>
)}
</div>
);
});

FilterComponent.displayName = "FilterComponent";
@@ -26,7 +26,7 @@ export function Checkbox({
onChange: (e: React.ChangeEvent<HTMLInputElement>) => void;
}) {
return (
<label className="flex text-xs cursor-pointer">
<label className="flex text-sm cursor-pointer">
<input
checked={checked}
onChange={onChange}
@@ -34,7 +34,7 @@ export function Checkbox({
className="mr-2 w-3.5 h-3.5 my-auto"
/>
<div>
<Label small>{label}</Label>
<Label>{label}</Label>
{sublabel && <SubLabel>{sublabel}</SubLabel>}
</div>
</label>
@@ -208,7 +208,7 @@ export function SettingsForm() {
}

return (
<div className="flex flex-col pb-8">
<div>
{popup}
<Title className="mb-4">Workspace Settings</Title>
<Checkbox
@@ -307,51 +307,6 @@ export function SettingsForm() {
</Button>
</>
)}

{/* Image Processing Settings */}
<Title className="mt-8 mb-4">Image Processing</Title>

<div className="flex flex-col gap-2">
<Checkbox
label="Enable Image Extraction and Analysis"
sublabel="Extract and analyze images from documents during indexing. This allows the system to process images and create searchable descriptions of them."
checked={settings.image_extraction_and_analysis_enabled ?? false}
onChange={(e) =>
handleToggleSettingsField(
"image_extraction_and_analysis_enabled",
e.target.checked
)
}
/>

<Checkbox
label="Enable Search-time Image Analysis"
sublabel="Analyze images at search time when a user asks about images. This provides more detailed and query-specific image analysis but may increase search-time latency."
checked={settings.search_time_image_analysis_enabled ?? false}
onChange={(e) =>
handleToggleSettingsField(
"search_time_image_analysis_enabled",
e.target.checked
)
}
/>

<IntegerInput
label="Maximum Image Size for Analysis (MB)"
sublabel="Images larger than this size will not be analyzed to prevent excessive resource usage."
value={settings.image_analysis_max_size_mb ?? null}
onChange={(e) => {
const value = e.target.value ? parseInt(e.target.value) : null;
if (value !== null && !isNaN(value) && value > 0) {
updateSettingField([
{ fieldName: "image_analysis_max_size_mb", newValue: value },
]);
}
}}
id="image-analysis-max-size"
placeholder="Enter maximum size in MB"
/>
</div>
</div>
);
}

@@ -21,11 +21,6 @@ export interface Settings {
auto_scroll: boolean;
temperature_override_enabled: boolean;
query_history_type: QueryHistoryType;

// Image processing settings
image_extraction_and_analysis_enabled?: boolean;
search_time_image_analysis_enabled?: boolean;
image_analysis_max_size_mb?: number;
}

export enum NotificationType {

@@ -204,6 +204,7 @@ export function ChatPage({

const [documentSidebarVisible, setDocumentSidebarVisible] = useState(false);
const [proSearchEnabled, setProSearchEnabled] = useState(proSearchToggled);
const [streamingAllowed, setStreamingAllowed] = useState(false);
const toggleProSearch = () => {
Cookies.set(
PRO_SEARCH_TOGGLED_COOKIE_NAME,
@@ -1978,6 +1979,8 @@ export function ChatPage({

const innerSidebarElementRef = useRef<HTMLDivElement>(null);
const [settingsToggled, setSettingsToggled] = useState(false);
const [showDeleteAllModal, setShowDeleteAllModal] = useState(false);

const currentPersona = alternativeAssistant || liveAssistant;

const HORIZON_DISTANCE = 800;
@@ -2139,6 +2142,32 @@ export function ChatPage({

<ChatPopup />

{showDeleteAllModal && (
<ConfirmEntityModal
entityType="All Chats"
entityName="all your chat sessions"
onClose={() => setShowDeleteAllModal(false)}
additionalDetails="This action cannot be undone. All your chat sessions will be deleted."
onSubmit={async () => {
const response = await deleteAllChatSessions("Chat");
if (response.ok) {
setShowDeleteAllModal(false);
setPopup({
message: "All your chat sessions have been deleted.",
type: "success",
});
refreshChatSessions();
router.push("/chat");
} else {
setPopup({
message: "Failed to delete all chat sessions.",
type: "error",
});
}
}}
/>
)}

{currentFeedback && (
<FeedbackModal
feedbackType={currentFeedback[0]}
@@ -2296,6 +2325,7 @@ export function ChatPage({
folders={folders}
removeToggle={removeToggle}
showShareModal={showShareModal}
showDeleteAllModal={() => setShowDeleteAllModal(true)}
/>
</div>

@@ -403,8 +403,7 @@ export function ChatInputBar({
setTabbingIconIndex((tabbingIconIndex) =>
Math.min(
tabbingIconIndex + 1,
// showPrompts ? filteredPrompts.length :
assistantTagOptions.length
showPrompts ? filteredPrompts.length : assistantTagOptions.length
)
);
} else if (e.key === "ArrowUp") {
@@ -437,8 +436,8 @@ export function ChatInputBar({
<button
key={index}
className={`px-2 ${
tabbingIconIndex == index && "bg-neutral-200"
} rounded items-center rounded-lg content-start flex gap-x-1 py-2 w-full hover:bg-neutral-200/90 cursor-pointer`}
tabbingIconIndex == index && "bg-background-dark/75"
} rounded items-center rounded-lg content-start flex gap-x-1 py-2 w-full hover:bg-background-dark/90 cursor-pointer`}
onClick={() => {
updatedTaggedAssistant(currentAssistant);
}}
@@ -460,8 +459,8 @@ export function ChatInputBar({
target="_self"
className={`${
tabbingIconIndex == assistantTagOptions.length &&
"bg-neutral-200"
} rounded rounded-lg px-3 flex gap-x-1 py-2 w-full items-center hover:bg-neutral-200/90 cursor-pointer`}
"bg-background-dark/75"
} rounded rounded-lg px-3 flex gap-x-1 py-2 w-full items-center hover:bg-background-dark/90 cursor-pointer`}
href="/assistants/new"
>
<FiPlus size={17} />

@@ -329,7 +329,7 @@ export async function deleteChatSession(chatSessionId: string) {
return response;
}

export async function deleteAllChatSessions() {
export async function deleteAllChatSessions(sessionType: "Chat" | "Search") {
const response = await fetch(`/api/chat/delete-all-chat-sessions`, {
method: "DELETE",
headers: {

@@ -24,9 +24,6 @@ import { Monitor, Moon, Sun } from "lucide-react";
import { useTheme } from "next-themes";
import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
import { FiTrash2 } from "react-icons/fi";
import { deleteAllChatSessions } from "../lib";
import { useChatContext } from "@/components/context/ChatContext";

type SettingsSection = "settings" | "password";

@@ -50,8 +47,6 @@ export function UserSettingsModal({
updateUserShortcuts,
updateUserTemperatureOverrideEnabled,
} = useUser();
const { refreshChatSessions } = useChatContext();
const router = useRouter();
const containerRef = useRef<HTMLDivElement>(null);
const messageRef = useRef<HTMLDivElement>(null);
const { theme, setTheme } = useTheme();
@@ -62,8 +57,6 @@ export function UserSettingsModal({
const [isLoading, setIsLoading] = useState(false);
const [activeSection, setActiveSection] =
useState<SettingsSection>("settings");
const [isDeleteAllLoading, setIsDeleteAllLoading] = useState(false);
const [showDeleteConfirmation, setShowDeleteConfirmation] = useState(false);

useEffect(() => {
const container = containerRef.current;
@@ -139,6 +132,7 @@ export function UserSettingsModal({
);
});

const router = useRouter();
const handleChangedefaultModel = async (defaultModel: string | null) => {
try {
const response = await setUserDefaultModel(defaultModel);
@@ -211,31 +205,6 @@ export function UserSettingsModal({
};
const showPasswordSection = user?.password_configured;

const handleDeleteAllChats = async () => {
setIsDeleteAllLoading(true);
try {
const response = await deleteAllChatSessions();
if (response.ok) {
setPopup({
message: "All your chat sessions have been deleted.",
type: "success",
});
refreshChatSessions();
router.push("/chat");
} else {
throw new Error("Failed to delete all chat sessions");
}
} catch (error) {
setPopup({
message: "Failed to delete all chat sessions",
type: "error",
});
} finally {
setIsDeleteAllLoading(false);
setShowDeleteConfirmation(false);
}
};

return (
<Modal
onOutsideClick={onClose}
@@ -381,51 +350,6 @@ export function UserSettingsModal({
}}
/>
</div>
<div className="pt-4 border-t border-border">
{!showDeleteConfirmation ? (
<div className="space-y-3">
<p className="text-sm text-neutral-600 ">
This will permanently delete all your chat sessions and
cannot be undone.
</p>
<Button
variant="destructive"
className="w-full flex items-center justify-center"
onClick={() => setShowDeleteConfirmation(true)}
>
<FiTrash2 className="mr-2" size={14} />
Delete All Chats
</Button>
</div>
) : (
<div className="space-y-3">
<p className="text-sm text-neutral-600 ">
Are you sure you want to delete all your chat sessions?
</p>
<div className="flex gap-2">
<Button
type="button"
variant="destructive"
className="flex-1 flex items-center justify-center"
onClick={handleDeleteAllChats}
disabled={isDeleteAllLoading}
>
{isDeleteAllLoading
? "Deleting..."
: "Yes, Delete All"}
</Button>
<Button
variant="outline"
className="flex-1"
onClick={() => setShowDeleteConfirmation(false)}
disabled={isDeleteAllLoading}
>
Cancel
</Button>
</div>
</div>
)}
</div>
</div>
)}
{activeSection === "password" && (

@@ -64,6 +64,7 @@ interface HistorySidebarProps {
showShareModal?: (chatSession: ChatSession) => void;
showDeleteModal?: (chatSession: ChatSession) => void;
explicitlyUntoggle: () => void;
showDeleteAllModal?: () => void;
setShowAssistantsModal: (show: boolean) => void;
toggleChatSessionSearchModal?: () => void;
}
@@ -182,6 +183,7 @@ export const HistorySidebar = forwardRef<HTMLDivElement, HistorySidebarProps>(
showShareModal,
toggleChatSessionSearchModal,
showDeleteModal,
showDeleteAllModal,
},
ref: ForwardedRef<HTMLDivElement>
) => {
@@ -401,6 +403,7 @@ export const HistorySidebar = forwardRef<HTMLDivElement, HistorySidebarProps>(
existingChats={existingChats}
currentChatId={currentChatId}
folders={folders}
showDeleteAllModal={showDeleteAllModal}
/>
</div>
</div>

@@ -10,6 +10,7 @@ import { Folder } from "../folders/interfaces";
import { usePopup } from "@/components/admin/connectors/Popup";
import { useRouter } from "next/navigation";
import { FiPlus, FiTrash2, FiCheck, FiX } from "react-icons/fi";
import { NEXT_PUBLIC_DELETE_ALL_CHATS_ENABLED } from "@/lib/constants";
import { FolderDropdown } from "../folders/FolderDropdown";
import { ChatSessionDisplay } from "./ChatSessionDisplay";
import { useState, useCallback, useRef, useContext, useEffect } from "react";
@@ -106,6 +107,7 @@ export function PagesTab({
closeSidebar,
showShareModal,
showDeleteModal,
showDeleteAllModal,
toggleChatSessionSearchModal,
}: {
existingChats?: ChatSession[];
@@ -115,6 +117,7 @@ export function PagesTab({
closeSidebar?: () => void;
showShareModal?: (chatSession: ChatSession) => void;
showDeleteModal?: (chatSession: ChatSession) => void;
showDeleteAllModal?: () => void;
}) {
const { setPopup, popup } = usePopup();
const router = useRouter();
@@ -435,7 +438,11 @@ export function PagesTab({
</DndContext>
)}

<div className="pl-4 pr-3">
<div
className={`pl-4 pr-3 ${
NEXT_PUBLIC_DELETE_ALL_CHATS_ENABLED && "pb-20"
}`}
>
{!isHistoryEmpty && (
<>
{Object.entries(groupedChatSesssions)
@@ -472,6 +479,17 @@ export function PagesTab({
</p>
)}
</div>
{showDeleteAllModal && NEXT_PUBLIC_DELETE_ALL_CHATS_ENABLED && (
<div className="absolute w-full border-t border-t-border bg-background-100 bottom-0 left-0 p-4">
<button
className="px-4 w-full py-2 px-4 text-text-600 hover:text-text-800 bg-background-125 border border-border-strong/50 shadow-sm rounded-md transition-colors duration-200 flex items-center justify-center text-sm"
onClick={showDeleteAllModal}
>
<FiTrash2 className="mr-2" size={14} />
Clear All History
</button>
</div>
)}
</div>
);
}

@@ -206,7 +206,7 @@ export function SharedChatDisplay({
{chatSession.description || `Unnamed Chat`}
</h1>
<p className=" text-text-darker">
{humanReadableFormat(chatSession.time_created)}
{humanReadableFormat(chatSession.time_updated)}
</p>
<div
className={`

@@ -1,5 +1,5 @@
import { ConnectorStatus } from "@/lib/types";
import { ConnectorMultiSelect } from "@/components/ConnectorMultiSelect";
import { ConnectorIndexingStatus, ConnectorStatus } from "@/lib/types";
import { ConnectorTitle } from "@/components/admin/connectors/ConnectorTitle";

interface ConnectorEditorProps {
selectedCCPairIds: number[];
@@ -12,20 +12,55 @@ export const ConnectorEditor = ({
setSetCCPairIds,
allCCPairs,
}: ConnectorEditorProps) => {
// Filter out public docs, since they don't make sense as part of a group
const privateCCPairs = allCCPairs.filter(
(ccPair) => ccPair.access_type === "private"
);

return (
<ConnectorMultiSelect
name="connectors"
label="Connectors"
connectors={privateCCPairs}
selectedIds={selectedCCPairIds}
onChange={setSetCCPairIds}
placeholder="Search for connectors..."
showError={true}
/>
<div className="mb-3 flex gap-2 flex-wrap">
{allCCPairs
// remove public docs, since they don't make sense as part of a group
.filter((ccPair) => !(ccPair.access_type === "public"))
.map((ccPair) => {
const ind = selectedCCPairIds.indexOf(ccPair.cc_pair_id);
const isSelected = ind !== -1;
return (
<div
key={`${ccPair.connector.id}-${ccPair.credential.id}`}
className={
`
px-3
py-1
rounded-lg
border
border-border
w-fit
flex
cursor-pointer ` +
(isSelected
? " bg-accent-background-hovered"
: " hover:bg-accent-background")
}
onClick={() => {
if (isSelected) {
setSetCCPairIds(
selectedCCPairIds.filter(
(ccPairId) => ccPairId !== ccPair.cc_pair_id
)
);
} else {
setSetCCPairIds([...selectedCCPairIds, ccPair.cc_pair_id]);
}
}}
>
<div className="my-auto">
<ConnectorTitle
connector={ccPair.connector}
ccPairId={ccPair.cc_pair_id}
ccPairName={ccPair.name}
isLink={false}
showMetadata={false}
/>
</div>
</div>
);
})}
</div>
);
};

@@ -1,15 +1,13 @@
import { Button } from "@/components/Button";
import { SearchMultiSelectDropdown } from "@/components/Dropdown";
import { Modal } from "@/components/Modal";
import { useState } from "react";
import { FiX } from "react-icons/fi";
import { FiPlus, FiX } from "react-icons/fi";
import { updateUserGroup } from "./lib";
import { PopupSpec } from "@/components/admin/connectors/Popup";
import { ConnectorStatus, UserGroup } from "@/lib/types";
import { ConnectorTitle } from "@/components/admin/connectors/ConnectorTitle";
import { Connector } from "@/lib/connectors/connectors";
import { ConnectorMultiSelect } from "@/components/ConnectorMultiSelect";
import { Form } from "formik";

interface AddConnectorFormProps {
ccPairs: ConnectorStatus<any, any>[];
userGroup: UserGroup;
@@ -25,68 +23,132 @@ export const AddConnectorForm: React.FC<AddConnectorFormProps> = ({
}) => {
const [selectedCCPairIds, setSelectedCCPairIds] = useState<number[]>([]);

// Filter out ccPairs that are already in the user group and are not private
const availableCCPairs = ccPairs
.filter(
(ccPair) =>
!userGroup.cc_pairs
.map((userGroupCCPair) => userGroupCCPair.id)
.includes(ccPair.cc_pair_id)
)
.filter((ccPair) => ccPair.access_type === "private");

const selectedCCPairs = ccPairs.filter((ccPair) =>
selectedCCPairIds.includes(ccPair.cc_pair_id)
);
return (
<Modal
className="max-w-3xl"
title="Add New Connector"
onOutsideClick={() => onClose()}
>
<div className="px-6 pt-4">
<ConnectorMultiSelect
name="connectors"
label="Select Connectors"
connectors={availableCCPairs}
selectedIds={selectedCCPairIds}
onChange={setSelectedCCPairIds}
placeholder="Search for connectors to add..."
showError={false}
/>
<Modal title="Add New Connector" onOutsideClick={() => onClose()}>
<div className="px-6 pt-4 pb-12">
<div className="mb-2 flex flex-wrap gap-x-2">
{selectedCCPairs.length > 0 &&
selectedCCPairs.map((ccPair) => (
<div
key={ccPair.cc_pair_id}
onClick={() => {
setSelectedCCPairIds(
selectedCCPairIds.filter(
(ccPairId) => ccPairId !== ccPair.cc_pair_id
)
);
}}
className={`
flex
rounded-lg
px-2
py-1
my-1
border
border-border
hover:bg-accent-background-hovered
cursor-pointer`}
>
<ConnectorTitle
ccPairId={ccPair.cc_pair_id}
ccPairName={ccPair.name}
connector={ccPair.connector}
isLink={false}
showMetadata={false}
/>
<FiX className="ml-1 my-auto" />
</div>
))}
</div>

<Button
className="mt-4 flex-nowrap w-48"
onClick={async () => {
const newCCPairIds = [
...Array.from(
new Set(
userGroup.cc_pairs
.map((ccPair) => ccPair.id)
.concat(selectedCCPairIds)
)
),
];
const response = await updateUserGroup(userGroup.id, {
user_ids: userGroup.users.map((user) => user.id),
cc_pair_ids: newCCPairIds,
});
if (response.ok) {
setPopup({
message: "Successfully added connectors to group",
type: "success",
<div className="flex">
<SearchMultiSelectDropdown
options={ccPairs
.filter(
(ccPair) =>
!selectedCCPairIds.includes(ccPair.cc_pair_id) &&
!userGroup.cc_pairs
.map((userGroupCCPair) => userGroupCCPair.id)
.includes(ccPair.cc_pair_id)
)
// remove public and synced docs, since they don't make sense as part of a group
.filter((ccPair) => ccPair.access_type === "private")
.map((ccPair) => {
return {
name: ccPair.name?.toString() || "",
value: ccPair.cc_pair_id?.toString(),
metadata: {
ccPairId: ccPair.cc_pair_id,
connector: ccPair.connector,
},
};
})}
onSelect={(option) => {
setSelectedCCPairIds([
...Array.from(
new Set([
...selectedCCPairIds,
parseInt(option.value as string),
])
),
]);
}}
itemComponent={({ option }) => (
<div className="flex px-4 py-2.5 hover:bg-accent-background-hovered cursor-pointer">
<div className="my-auto">
<ConnectorTitle
ccPairId={option?.metadata?.ccPairId as number}
ccPairName={option.name}
connector={option?.metadata?.connector as Connector<any>}
isLink={false}
showMetadata={false}
/>
</div>
<div className="ml-auto my-auto">
<FiPlus />
</div>
</div>
)}
/>
<Button
className="ml-3 flex-nowrap w-48"
onClick={async () => {
const newCCPairIds = [
...Array.from(
new Set(
userGroup.cc_pairs
.map((ccPair) => ccPair.id)
.concat(selectedCCPairIds)
)
),
];
const response = await updateUserGroup(userGroup.id, {
user_ids: userGroup.users.map((user) => user.id),
cc_pair_ids: newCCPairIds,
});
onClose();
} else {
const responseJson = await response.json();
const errorMsg = responseJson.detail || responseJson.message;
setPopup({
message: `Failed to add connectors to group - ${errorMsg}`,
type: "error",
});
onClose();
}
}}
>
Add Connectors
</Button>
if (response.ok) {
setPopup({
message: "Successfully added users to group",
type: "success",
});
onClose();
} else {
const responseJson = await response.json();
const errorMsg = responseJson.detail || responseJson.message;
setPopup({
message: `Failed to add users to group - ${errorMsg}`,
type: "error",
});
onClose();
}
}}
>
Add Connectors
</Button>
</div>
</div>
</Modal>
);

@@ -1,232 +0,0 @@
import React, { useState, useRef, useEffect } from "react";
import { ConnectorStatus } from "@/lib/types";
import { ConnectorTitle } from "@/components/admin/connectors/ConnectorTitle";
import { X, Search } from "lucide-react";
import { Label } from "@/components/ui/label";
import { ErrorMessage } from "formik";

interface ConnectorMultiSelectProps {
name: string;
label: string;
connectors: ConnectorStatus<any, any>[];
selectedIds: number[];
onChange: (selectedIds: number[]) => void;
disabled?: boolean;
placeholder?: string;
showError?: boolean;
}

export const ConnectorMultiSelect = ({
name,
label,
connectors,
selectedIds,
onChange,
disabled = false,
placeholder = "Search connectors...",
showError = false,
}: ConnectorMultiSelectProps) => {
const [open, setOpen] = useState(false);
const [searchQuery, setSearchQuery] = useState("");
const dropdownRef = useRef<HTMLDivElement>(null);
const inputRef = useRef<HTMLInputElement>(null);

const selectedConnectors = connectors.filter((connector) =>
selectedIds.includes(connector.cc_pair_id)
);

const unselectedConnectors = connectors.filter(
(connector) => !selectedIds.includes(connector.cc_pair_id)
);

const allConnectorsSelected = unselectedConnectors.length === 0;

const filteredUnselectedConnectors = unselectedConnectors.filter(
(connector) => {
const connectorName = connector.name || connector.connector.source;
return connectorName.toLowerCase().includes(searchQuery.toLowerCase());
}
);

useEffect(() => {
if (allConnectorsSelected && open) {
setOpen(false);
inputRef.current?.blur();
setSearchQuery("");
}
}, [allConnectorsSelected, open]);

useEffect(() => {
if (allConnectorsSelected) {
inputRef.current?.blur();
setSearchQuery("");
}
}, [allConnectorsSelected, selectedIds]);

const selectConnector = (connectorId: number) => {
const newSelectedIds = [...selectedIds, connectorId];
onChange(newSelectedIds);
setSearchQuery("");

const willAllBeSelected = connectors.length === newSelectedIds.length;

if (!willAllBeSelected) {
setTimeout(() => {
inputRef.current?.focus();
}, 0);
}
};

const removeConnector = (connectorId: number) => {
onChange(selectedIds.filter((id) => id !== connectorId));
};

useEffect(() => {
const handleClickOutside = (event: MouseEvent) => {
if (
dropdownRef.current &&
!dropdownRef.current.contains(event.target as Node) &&
inputRef.current !== event.target &&
!inputRef.current?.contains(event.target as Node)
) {
setOpen(false);
}
};

document.addEventListener("mousedown", handleClickOutside);
return () => {
document.removeEventListener("mousedown", handleClickOutside);
};
}, []);

const handleKeyDown = (e: React.KeyboardEvent) => {
if (e.key === "Escape") {
setOpen(false);
}
};

const effectivePlaceholder = allConnectorsSelected
? "All connectors selected"
: placeholder;

const isInputDisabled = disabled || allConnectorsSelected;

return (
<div className="flex flex-col w-full space-y-2 mb-4">
{label && <Label className="text-base font-medium">{label}</Label>}

<p className="text-xs text-neutral-500 ">
All documents indexed by the selected connectors will be part of this
document set.
</p>
<div className="relative">
<div
className={`flex items-center border border-input rounded-md border border-neutral-200 ${
allConnectorsSelected ? "bg-neutral-50" : ""
} focus-within:ring-1 focus-within:ring-ring focus-within:border-neutral-400 transition-colors`}
>
<Search className="absolute left-3 h-4 w-4 text-neutral-500" />
<input
ref={inputRef}
type="text"
value={searchQuery}
onChange={(e) => {
setSearchQuery(e.target.value);
setOpen(true);
}}
onFocus={() => {
if (!allConnectorsSelected) {
setOpen(true);
}
}}
onKeyDown={handleKeyDown}
placeholder={effectivePlaceholder}
className={`h-9 w-full pl-9 pr-10 py-2 bg-transparent text-sm outline-none disabled:cursor-not-allowed disabled:opacity-50 ${
allConnectorsSelected ? "text-neutral-500" : ""
}`}
disabled={isInputDisabled}
/>
</div>

{open && !allConnectorsSelected && (
<div
ref={dropdownRef}
className="absolute z-50 w-full mt-1 rounded-md border border-neutral-200 bg-white shadow-md default-scrollbar max-h-[300px] overflow-auto"
>
{filteredUnselectedConnectors.length === 0 ? (
<div className="py-4 text-center text-xs text-neutral-500">
{searchQuery
? "No matching connectors found"
: "No more connectors available"}
</div>
) : (
<div>
{filteredUnselectedConnectors.map((connector) => (
<div
key={connector.cc_pair_id}
className="flex items-center justify-between py-2 px-3 cursor-pointer hover:bg-neutral-50 text-xs"
onClick={() => selectConnector(connector.cc_pair_id)}
>
<div className="flex items-center truncate mr-2">
<ConnectorTitle
connector={connector.connector}
ccPairId={connector.cc_pair_id}
ccPairName={connector.name}
isLink={false}
showMetadata={false}
/>
</div>
</div>
))}
</div>
)}
</div>
)}
</div>

{selectedConnectors.length > 0 ? (
<div className="mt-3 ">
<div className="flex flex-wrap gap-1.5">
{selectedConnectors.map((connector) => (
<div
key={connector.cc_pair_id}
className="flex items-center bg-white rounded-md border border-neutral-300 transition-all px-2 py-1 max-w-full group text-xs"
>
<div className="flex items-center overflow-hidden">
<div className="flex-shrink-0 text-xs">
<ConnectorTitle
connector={connector.connector}
ccPairId={connector.cc_pair_id}
ccPairName={connector.name}
isLink={false}
showMetadata={false}
/>
</div>
</div>
<button
className="ml-1 flex-shrink-0 rounded-full w-4 h-4 flex items-center justify-center bg-neutral-100 text-neutral-500 hover:bg-neutral-200 hover:text-neutral-700 transition-colors group-hover:bg-neutral-200"
onClick={() => removeConnector(connector.cc_pair_id)}
aria-label="Remove connector"
>
<X className="h-2.5 w-2.5" />
</button>
</div>
))}
</div>
</div>
) : (
<div className="mt-3 p-3 border border-dashed border-neutral-300 rounded-md bg-neutral-50 text-neutral-500 text-xs">
No connectors selected. Search and select connectors above.
</div>
)}

{showError && (
<ErrorMessage
name={name}
component="div"
className="text-red-500 text-xs mt-1"
/>
)}
</div>
);
};
@@ -1,52 +0,0 @@
import React from "react";
import { ConnectorStatus } from "@/lib/types";
import { ConnectorTitle } from "@/components/admin/connectors/ConnectorTitle";
import { Label } from "@/components/ui/label";
import { LockIcon } from "lucide-react";

interface NonSelectableConnectorsProps {
connectors: ConnectorStatus<any, any>[];
title: string;
description: string;
}

export const NonSelectableConnectors = ({
connectors,
title,
description,
}: NonSelectableConnectorsProps) => {
if (connectors.length === 0) {
return null;
}

return (
<div className="mt-6 mb-4">
<Label className="text-base font-medium mb-1">{title}</Label>
<p className="text-xs text-neutral-500 mb-3">{description}</p>
<div className="p-3 border border-dashed border-neutral-300 rounded-md bg-neutral-50">
<div className="text-xs font-medium text-neutral-700 mb-2 flex items-center">
<LockIcon className="h-3.5 w-3.5 mr-1.5 text-neutral-500" />
Unavailable connectors:
</div>
<div className="flex flex-wrap gap-1.5">
{connectors.map((connector) => (
<div
key={`${connector.connector.id}-${connector.credential.id}`}
className="flex items-center px-2 py-1 cursor-not-allowed opacity-80 bg-white border border-neutral-300 rounded-md text-xs"
>
<div className="flex items-center max-w-[200px] text-xs">
<ConnectorTitle
connector={connector.connector}
ccPairId={connector.cc_pair_id}
ccPairName={connector.name}
isLink={false}
showMetadata={false}
/>
</div>
</div>
))}
</div>
</div>
</div>
);
};
@@ -20,7 +20,6 @@ interface ConnectorTitleProps {
owner?: string;
isLink?: boolean;
showMetadata?: boolean;
className?: string;
}

export const ConnectorTitle = ({
@@ -31,7 +30,6 @@ export const ConnectorTitle = ({
isPublic = true,
isLink = true,
showMetadata = true,
className = "",
}: ConnectorTitleProps) => {
const sourceMetadata = getSourceMetadata(connector.source);

@@ -90,17 +88,17 @@ export const ConnectorTitle = ({
);
}

const mainSectionClassName = `text-blue-500 dark:text-blue-100 flex w-fit ${className}`;
const mainSectionClassName = "text-blue-500 dark:text-blue-100 flex w-fit";
const mainDisplay = (
<>
{sourceMetadata.icon({ size: 16 })}
<div className="ml-1 my-auto text-xs font-medium truncate">
{sourceMetadata.icon({ size: 20 })}
<div className="ml-1 my-auto">
{ccPairName || sourceMetadata.displayName}
</div>
</>
);
return (
<div className="my-auto max-w-full">
<div className="my-auto">
{isLink ? (
<Link
className={mainSectionClassName}
@@ -112,10 +110,10 @@ export const ConnectorTitle = ({
<div className={mainSectionClassName}>{mainDisplay}</div>
)}
{showMetadata && additionalMetadata.size > 0 && (
<div className="text-[10px] mt-0.5 text-gray-600 dark:text-gray-400">
<div className="text-xs mt-1">
{Array.from(additionalMetadata.entries()).map(([key, value]) => {
return (
<div key={key} className="truncate">
<div key={key}>
<i>{key}:</i> {value}
</div>
);

@@ -181,7 +181,7 @@ const SignedUpUserTable = ({
: "All Roles"}
</SelectValue>
</SelectTrigger>
<SelectContent className="bg-background">
<SelectContent className="bg-background-50">
{Object.entries(USER_ROLE_LABELS)
.filter(([role]) => role !== UserRole.EXT_PERM_USER)
.map(([role, label]) => (

@@ -41,15 +41,6 @@ export default function TextView({
return markdownFormats.some((format) => mimeType.startsWith(format));
};

const isImageFormat = (mimeType: string) => {
const imageFormats = [
"image/png",
"image/jpeg",
"image/gif",
"image/svg+xml",
];
return imageFormats.some((format) => mimeType.startsWith(format));
};
// Detect if a given MIME type can be rendered in an <iframe>
const isSupportedIframeFormat = (mimeType: string): boolean => {
const supportedFormats = [
@@ -135,7 +126,6 @@ export default function TextView({
<DialogTitle className="text-lg font-medium truncate">
{fileName}
</DialogTitle>

<div className="flex items-center space-x-2">
<Button variant="ghost" size="icon" onClick={handleZoomOut}>
<ZoomOut className="h-4 w-4" />
@@ -171,13 +161,7 @@ export default function TextView({
className="w-full h-full transform origin-center transition-transform duration-300 ease-in-out"
style={{ transform: `scale(${zoom / 100})` }}
>
{isImageFormat(fileType) ? (
<img
src={fileUrl}
alt={fileName}
className="w-full h-full object-contain object-center"
/>
) : isSupportedIframeFormat(fileType) ? (
{isSupportedIframeFormat(fileType) ? (
<iframe
src={`${fileUrl}#toolbar=0`}
className="w-full h-full border-none"

@@ -268,7 +268,7 @@ export const AVAILABLE_CLOUD_PROVIDERS: CloudEmbeddingProvider[] = [
embedding_models: [
{
provider_type: EmbeddingProvider.GOOGLE,
model_name: "text-embedding-005",
model_name: "text-embedding-004",
description: "Google's most recent text embedding model.",
pricePerMillion: 0.025,
model_dim: 768,

@@ -25,8 +25,8 @@ export function mockedRefreshToken(): CustomRefreshTokenResponse {
*/
const mockExp = Date.now() + 3600000; // 1 hour from now in milliseconds
const data: CustomRefreshTokenResponse = {
access_token: "Mock access token",
refresh_token: "Mock refresh token",
access_token: "asdf Mock access token",
refresh_token: "asdf Mock refresh token",
session: { exp: mockExp },
userinfo: {
sub: "Mock email",

@@ -38,7 +38,6 @@ import { SiBookstack } from "react-icons/si";
import Image, { StaticImageData } from "next/image";
import jiraSVG from "../../../public/Jira.svg";
import confluenceSVG from "../../../public/Confluence.svg";
import deepseekSVG from "../../../public/Deepseek.svg";
import openAISVG from "../../../public/Openai.svg";
import amazonSVG from "../../../public/Amazon.svg";
import geminiSVG from "../../../public/Gemini.svg";
@@ -1150,13 +1149,6 @@ export const MetaIcon = ({
className = defaultTailwindCSS,
}: IconProps) => <LogoIcon size={size} className={className} src={metaSVG} />;

export const DeepseekIcon = ({
size = 16,
className = defaultTailwindCSS,
}: IconProps) => (
<LogoIcon size={size} className={className} src={deepseekSVG} />
);

export const MicrosoftIconSVG = ({
size = 16,
className = defaultTailwindCSS,
@@ -3286,25 +3278,18 @@ export const CirclingArrowIcon = ({
</g>
</g>
</svg>
);
};

export const SortIcon = ({
size = 24,
className = defaultTailwindCSS,
}: IconProps) => {
return (
<svg
style={{ width: `${size}px`, height: `${size}px` }}
className={`w-[${size}px] h-[${size}px] ` + className}
fill="currentColor"
xmlns="http://www.w3.org/2000/svg"
viewBox="0 0 24 24"
>
<path
fill="currentColor"
d="M22 18.605a.75.75 0 0 1-.75.75h-5.1a2.93 2.93 0 0 1-5.66 0H2.75a.75.75 0 1 1 0-1.5h7.74a2.93 2.93 0 0 1 5.66 0h5.1a.75.75 0 0 1 .75.75m0-13.21a.75.75 0 0 1-.75.75H18.8a2.93 2.93 0 0 1-5.66 0H2.75a.75.75 0 1 1 0-1.5h10.39a2.93 2.93 0 0 1 5.66 0h2.45a.74.74 0 0 1 .75.75m0 6.6a.74.74 0 0 1-.75.75H9.55a2.93 2.93 0 0 1-5.66 0H2.75a.75.75 0 1 1 0-1.5h1.14a2.93 2.93 0 0 1 5.66 0h11.7a.75.75 0 0 1 .75.75"
/>
</svg>
// <svg
//   style={{ width: `${size}px`, height: `${size}px` }}
//   className={`w-[${size}px] h-[${size}px] ` + className}
//   viewBox="0 0 112.62 120.72"
//   data-name="Layer 1"
//   xmlns="http://www.w3.org/2000/svg"
// >
//   <path
//     strokeWidth={100}
//     d="M11.64,100.12l-.4-.47-1.06,8.63a5.08,5.08,0,0,1-1.92,3.41A5.11,5.11,0,0,1,0,107L2.79,84.65v-.07a3.28,3.28,0,0,1,.08-.41h0A5.09,5.09,0,0,1,9,80.39q11.22,2.53,22.42,5.15a5,5,0,0,1,3.17,2.25,5.14,5.14,0,0,1,.64,3.84v0a5,5,0,0,1-2.25,3.16,5.08,5.08,0,0,1-3.83.65c-3.31-.75-6.62-1.52-9.92-2.28a40.71,40.71,0,0,0,2.84,3,50.09,50.09,0,0,0,26.23,13.49,48.67,48.67,0,0,0,14.71.34A47.35,47.35,0,0,0,77,106h0q2.52-1.19,4.83-2.54c1.56-.93,3.07-1.92,4.51-3a50.8,50.8,0,0,0,8.56-7.88,48.92,48.92,0,0,0,6.39-9.45l.56-1.1,10,2.69-.8,1.66a58.64,58.64,0,0,1-7.9,12.24,61.28,61.28,0,0,1-10.81,10.1c-1.68,1.23-3.46,2.4-5.32,3.5s-3.73,2.07-5.74,3a58,58,0,0,1-17,5,58.56,58.56,0,0,1-17.79-.39,60.21,60.21,0,0,1-31.58-16.26c-1.2-1.16-2.26-2.31-3.24-3.45ZM101,20.6l.4.47,1-8.63a5.11,5.11,0,1,1,10.14,1.26l-2.74,22.37,0,.07c0,.13,0,.27-.07.41h0a5.09,5.09,0,0,1-6.08,3.78c-7.47-1.69-15-3.4-22.42-5.15a5,5,0,0,1-3.16-2.25,5.1,5.1,0,0,1-.65-3.84v0a5,5,0,0,1,2.25-3.16,5.1,5.1,0,0,1,3.84-.65c3.31.75,6.61,1.52,9.92,2.28-.84-1-1.77-2-2.84-3.05a50.09,50.09,0,0,0-12.13-8.73A49.49,49.49,0,0,0,64.37,11a48.6,48.6,0,0,0-14.7-.34,47.26,47.26,0,0,0-14,4.1h0q-2.53,1.18-4.83,2.54c-1.57.93-3.07,1.92-4.52,3a50.34,50.34,0,0,0-8.55,7.88,48,48,0,0,0-6.39,9.45l-.57,1.1L.76,36l.8-1.66A58.9,58.9,0,0,1,9.46,22.1,61.63,61.63,0,0,1,20.27,12q2.54-1.85,5.32-3.5c1.81-1.06,3.73-2.07,5.74-3a58,58,0,0,1,17-5A58.56,58.56,0,0,1,66.16.89a59.77,59.77,0,0,1,17,5.74A60.4,60.4,0,0,1,97.75,17.15c1.19,1.16,2.26,2.31,3.24,3.45Z"
//   />
// </svg>
);
};

@@ -85,7 +85,7 @@ export const LLMSelector: React.FC<LLMSelectorProps> = ({
<span>{userSettings ? "System Default" : "User Default"}</span>
{userSettings && (
<span className=" my-auto font-normal ml-1">
({defaultModelDisplayName})
({defaultModelDisplayName}) asdf
</span>
)}
</SelectItem>

@@ -1,153 +0,0 @@
"use client";

import * as React from "react";
import { type DialogProps } from "@radix-ui/react-dialog";
import { Command as CommandPrimitive } from "cmdk";
import { Search } from "lucide-react";

import { cn } from "@/lib/utils";
import { Dialog, DialogContent } from "@/components/ui/dialog";

const Command = React.forwardRef<
React.ElementRef<typeof CommandPrimitive>,
React.ComponentPropsWithoutRef<typeof CommandPrimitive>
>(({ className, ...props }, ref) => (
<CommandPrimitive
ref={ref}
className={cn(
"flex h-full w-full flex-col overflow-hidden rounded-md bg-white text-neutral-950 dark:bg-neutral-950 dark:text-neutral-50",
className
)}
{...props}
/>
));
Command.displayName = CommandPrimitive.displayName;

const CommandDialog = ({ children, ...props }: DialogProps) => {
return (
<Dialog {...props}>
<DialogContent className="overflow-hidden p-0 shadow-lg">
<Command className="[&_[cmdk-group-heading]]:px-2 [&_[cmdk-group-heading]]:font-medium [&_[cmdk-group-heading]]:text-neutral-500 [&_[cmdk-group]:not([hidden])_~[cmdk-group]]:pt-0 [&_[cmdk-group]]:px-2 [&_[cmdk-input-wrapper]_svg]:h-5 [&_[cmdk-input-wrapper]_svg]:w-5 [&_[cmdk-input]]:h-12 [&_[cmdk-item]]:px-2 [&_[cmdk-item]]:py-3 [&_[cmdk-item]_svg]:h-5 [&_[cmdk-item]_svg]:w-5 dark:[&_[cmdk-group-heading]]:text-neutral-400">
{children}
</Command>
</DialogContent>
</Dialog>
);
};

const CommandInput = React.forwardRef<
React.ElementRef<typeof CommandPrimitive.Input>,
React.ComponentPropsWithoutRef<typeof CommandPrimitive.Input>
>(({ className, ...props }, ref) => (
<div className="flex items-center border-b px-3" cmdk-input-wrapper="">
<Search className="mr-2 h-4 w-4 shrink-0 opacity-50" />
<CommandPrimitive.Input
ref={ref}
className={cn(
"flex h-11 w-full rounded-md bg-transparent py-3 text-sm outline-none placeholder:text-neutral-500 disabled:cursor-not-allowed disabled:opacity-50 dark:placeholder:text-neutral-400",
className
)}
{...props}
/>
</div>
));

CommandInput.displayName = CommandPrimitive.Input.displayName;

const CommandList = React.forwardRef<
React.ElementRef<typeof CommandPrimitive.List>,
React.ComponentPropsWithoutRef<typeof CommandPrimitive.List>
>(({ className, ...props }, ref) => (
<CommandPrimitive.List
ref={ref}
className={cn("max-h-[300px] overflow-y-auto overflow-x-hidden", className)}
{...props}
/>
));

CommandList.displayName = CommandPrimitive.List.displayName;

const CommandEmpty = React.forwardRef<
React.ElementRef<typeof CommandPrimitive.Empty>,
React.ComponentPropsWithoutRef<typeof CommandPrimitive.Empty>
>((props, ref) => (
<CommandPrimitive.Empty
ref={ref}
className="py-6 text-center text-sm"
{...props}
/>
));

CommandEmpty.displayName = CommandPrimitive.Empty.displayName;

const CommandGroup = React.forwardRef<
React.ElementRef<typeof CommandPrimitive.Group>,
React.ComponentPropsWithoutRef<typeof CommandPrimitive.Group>
>(({ className, ...props }, ref) => (
<CommandPrimitive.Group
ref={ref}
className={cn(
"overflow-hidden p-1 text-neutral-950 [&_[cmdk-group-heading]]:px-2 [&_[cmdk-group-heading]]:py-1.5 [&_[cmdk-group-heading]]:text-xs [&_[cmdk-group-heading]]:font-medium [&_[cmdk-group-heading]]:text-neutral-500 dark:text-neutral-50 dark:[&_[cmdk-group-heading]]:text-neutral-400",
className
)}
{...props}
/>
));

CommandGroup.displayName = CommandPrimitive.Group.displayName;

const CommandSeparator = React.forwardRef<
React.ElementRef<typeof CommandPrimitive.Separator>,
React.ComponentPropsWithoutRef<typeof CommandPrimitive.Separator>
>(({ className, ...props }, ref) => (
<CommandPrimitive.Separator
ref={ref}
className={cn("-mx-1 h-px bg-neutral-200 dark:bg-neutral-800", className)}
{...props}
/>
));
CommandSeparator.displayName = CommandPrimitive.Separator.displayName;

const CommandItem = React.forwardRef<
React.ElementRef<typeof CommandPrimitive.Item>,
React.ComponentPropsWithoutRef<typeof CommandPrimitive.Item>
>(({ className, ...props }, ref) => (
<CommandPrimitive.Item
ref={ref}
className={cn(
"relative flex cursor-default gap-2 select-none items-center rounded-sm px-2 py-1.5 text-sm outline-none data-[disabled=true]:pointer-events-none data-[selected='true']:bg-neutral-100 data-[selected=true]:text-neutral-900 data-[disabled=true]:opacity-50 [&_svg]:pointer-events-none [&_svg]:size-4 [&_svg]:shrink-0 dark:data-[selected='true']:bg-neutral-800 dark:data-[selected=true]:text-neutral-50",
className
)}
{...props}
/>
));

CommandItem.displayName = CommandPrimitive.Item.displayName;

const CommandShortcut = ({
className,
...props
}: React.HTMLAttributes<HTMLSpanElement>) => {
return (
<span
className={cn(
"ml-auto text-xs tracking-widest text-neutral-500 dark:text-neutral-400",
className
)}
{...props}
/>
);
};
CommandShortcut.displayName = "CommandShortcut";

export {
Command,
CommandDialog,
CommandInput,
CommandList,
CommandEmpty,
CommandGroup,
CommandItem,
CommandShortcut,
CommandSeparator,
};
@@ -82,6 +82,9 @@ export const NEXT_PUBLIC_FORGOT_PASSWORD_ENABLED =
export const NEXT_PUBLIC_TEST_ENV =
process.env.NEXT_PUBLIC_TEST_ENV?.toLowerCase() === "true";

export const NEXT_PUBLIC_DELETE_ALL_CHATS_ENABLED =
process.env.NEXT_PUBLIC_DELETE_ALL_CHATS_ENABLED?.toLowerCase() === "true";

export const NEXT_PUBLIC_ENABLE_CHROME_EXTENSION =
process.env.NEXT_PUBLIC_ENABLE_CHROME_EXTENSION?.toLowerCase() === "true";

@@ -706,10 +706,6 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
"phi-3.5-mini-instruct": "Phi 3.5 Mini",
"phi-3.5-moe-instruct": "Phi 3.5 MoE",
"phi-3.5-vision-instruct": "Phi 3.5 Vision",
"phi-4": "Phi 4",

// Deepseek Models
"deepseek-r1": "DeepSeek R1",

// Anthropic models
"claude-3-opus-20240229": "Claude 3 Opus",
@@ -726,8 +722,6 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
"claude-3-5-haiku-20241022": "Claude 3.5 Haiku",
"claude-3-5-haiku@20241022": "Claude 3.5 Haiku",
"claude-3.5-haiku@20241022": "Claude 3.5 Haiku",
"claude-3.7-sonnet@202502019": "Claude 3.7 Sonnet",
"claude-3-7-sonnet-202502019": "Claude 3.7 Sonnet",

// Google Models
"gemini-1.5-pro": "Gemini 1.5 Pro",
@@ -737,16 +731,10 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
"gemini-1.5-pro-002": "Gemini 1.5 Pro (v2)",
"gemini-1.5-flash-002": "Gemini 1.5 Flash (v2)",
"gemini-2.0-flash-exp": "Gemini 2.0 Flash (Experimental)",
"gemini-2.0-flash-001": "Gemini 2.0 Flash",
"gemini-2.0-flash-lite-preview-02-05": "Gemini 2.0 Flash Lite (Prv)",
"gemini-2.0-flash-thinking-exp-01-02": "Gemini 2.0 Flash Thinking (Exp)",
"gemini-2.0-pro-exp-02-05": "Gemini 2.0 Pro (Exp)",
"gemini-2.0-flash": "Gemini 2.0 Flash",
"gemini-2.0-flash-thinking-exp-01-21": "Gemini 2.0 Flash Thinking",

// Mistral Models
"mistral-large-2411": "Mistral Large 24.11",
"mistral-large@2411": "Mistral Large 24.11",
"ministral-3b": "Ministral 3B",

// Bedrock models
"meta.llama3-1-70b-instruct-v1:0": "Llama 3.1 70B",
@@ -767,8 +755,6 @@ const MODEL_DISPLAY_NAMES: { [key: string]: string } = {
"anthropic.claude-v2:1": "Claude v2.1",
"anthropic.claude-v2": "Claude v2",
"anthropic.claude-v1": "Claude v1",
"anthropic.claude-3-7-sonnet-20250219-v1:0": "Claude 3.7 Sonnet",
"us.anthropic.claude-3-7-sonnet-20250219-v1:0": "Claude 3.7 Sonnet",
"anthropic.claude-3-opus-20240229-v1:0": "Claude 3 Opus",
"anthropic.claude-3-haiku-20240307-v1:0": "Claude 3 Haiku",
"anthropic.claude-3-5-sonnet-20240620-v1:0": "Claude 3.5 Sonnet",
@@ -802,7 +788,6 @@ export const defaultModelsByProvider: { [name: string]: string[] } = {
"anthropic.claude-3-opus-20240229-v1:0",
"mistral.mistral-large-2402-v1:0",
"anthropic.claude-3-5-sonnet-20241022-v2:0",
"anthropic.claude-3-7-sonnet-20250219-v1:0",
],
anthropic: ["claude-3-opus-20240229", "claude-3-5-sonnet-20241022"],
};

@@ -90,8 +90,6 @@ const MODEL_NAMES_SUPPORTING_IMAGE_INPUT = [
"anthropic.claude-3-5-sonnet-20240620-v1:0",
"anthropic.claude-3-5-sonnet-20241022-v2:0",
"anthropic.claude-3-7-sonnet-20250219-v1:0",
"claude-3.7-sonnet@202502019",
"claude-3-7-sonnet-202502019",
// google gemini model names
"gemini-1.5-pro",
"gemini-1.5-flash",
@@ -100,8 +98,6 @@ const MODEL_NAMES_SUPPORTING_IMAGE_INPUT = [
"gemini-1.5-pro-002",
"gemini-1.5-flash-002",
"gemini-2.0-flash-exp",
"gemini-2.0-flash-001",
"gemini-2.0-pro-exp-02-05",
// amazon models
"amazon.nova-lite@v1",
"amazon.nova-pro@v1",