Compare commits

..

10 Commits

Author SHA1 Message Date
Dane Urban
cbf781b658 . 2026-03-19 16:47:37 -07:00
Dane Urban
ea15d7e338 Max chunks 2026-03-18 18:52:07 -07:00
Dane Urban
72ef0b1cac Add extra tests 2026-03-18 18:46:24 -07:00
Dane Urban
d2cff54a40 Add comments 2026-03-18 18:43:12 -07:00
Dane Urban
fee2e7f8ca Remove restriction comment 2026-03-18 18:43:12 -07:00
Dane Urban
2e542f820b mypy fixes 2026-03-18 18:43:12 -07:00
Dane Urban
4787a075c5 . 2026-03-18 18:43:12 -07:00
Dane Urban
a443ae5b21 Vespa change 2026-03-18 18:43:12 -07:00
Dane Urban
28aa75a2dd Open-search iterable refactor 2026-03-18 18:43:12 -07:00
Dane Urban
24d0d3b0b1 Add tests 2026-03-18 18:41:19 -07:00
140 changed files with 2909 additions and 5810 deletions

View File

@@ -317,7 +317,6 @@ celery_app.autodiscover_tasks(
"onyx.background.celery.tasks.docprocessing",
"onyx.background.celery.tasks.evals",
"onyx.background.celery.tasks.hierarchyfetching",
"onyx.background.celery.tasks.hooks",
"onyx.background.celery.tasks.periodic",
"onyx.background.celery.tasks.pruning",
"onyx.background.celery.tasks.shared",

View File

@@ -9,7 +9,6 @@ from onyx.configs.app_configs import AUTO_LLM_UPDATE_INTERVAL_SECONDS
from onyx.configs.app_configs import DISABLE_VECTOR_DB
from onyx.configs.app_configs import ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
from onyx.configs.app_configs import ENTERPRISE_EDITION_ENABLED
from onyx.configs.app_configs import HOOK_ENABLED
from onyx.configs.app_configs import SCHEDULED_EVAL_DATASET_NAMES
from onyx.configs.constants import ONYX_CLOUD_CELERY_TASK_PREFIX
from onyx.configs.constants import OnyxCeleryPriority
@@ -362,19 +361,6 @@ if not MULTI_TENANT:
tasks_to_schedule.extend(beat_task_templates)
if not MULTI_TENANT and HOOK_ENABLED:
tasks_to_schedule.append(
{
"name": "hook-execution-log-cleanup",
"task": OnyxCeleryTask.HOOK_EXECUTION_LOG_CLEANUP_TASK,
"schedule": timedelta(days=1),
"options": {
"priority": OnyxCeleryPriority.LOW,
"expires": BEAT_EXPIRES_DEFAULT,
},
}
)
def generate_cloud_tasks(
beat_tasks: list[dict], beat_templates: list[dict], beat_multiplier: float

View File

@@ -1,35 +0,0 @@
from celery import shared_task
from onyx.configs.app_configs import JOB_TIMEOUT
from onyx.configs.constants import OnyxCeleryTask
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.hook import cleanup_old_execution_logs__no_commit
from onyx.utils.logger import setup_logger
logger = setup_logger()
_HOOK_EXECUTION_LOG_RETENTION_DAYS: int = 30
@shared_task(
name=OnyxCeleryTask.HOOK_EXECUTION_LOG_CLEANUP_TASK,
ignore_result=True,
soft_time_limit=JOB_TIMEOUT,
trail=False,
)
def hook_execution_log_cleanup_task(*, tenant_id: str) -> None: # noqa: ARG001
try:
with get_session_with_current_tenant() as db_session:
deleted: int = cleanup_old_execution_logs__no_commit(
db_session=db_session,
max_age_days=_HOOK_EXECUTION_LOG_RETENTION_DAYS,
)
db_session.commit()
if deleted:
logger.info(
f"Deleted {deleted} hook execution log(s) older than "
f"{_HOOK_EXECUTION_LOG_RETENTION_DAYS} days."
)
except Exception:
logger.exception("Failed to clean up hook execution logs")
raise

View File

@@ -24,7 +24,6 @@ from onyx.configs.app_configs import MANAGED_VESPA
from onyx.configs.app_configs import VESPA_CLOUD_CERT_PATH
from onyx.configs.app_configs import VESPA_CLOUD_KEY_PATH
from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
from onyx.configs.constants import CELERY_USER_FILE_DELETE_TASK_EXPIRES
from onyx.configs.constants import CELERY_USER_FILE_PROCESSING_LOCK_TIMEOUT
from onyx.configs.constants import CELERY_USER_FILE_PROCESSING_TASK_EXPIRES
from onyx.configs.constants import CELERY_USER_FILE_PROJECT_SYNC_LOCK_TIMEOUT
@@ -34,7 +33,6 @@ from onyx.configs.constants import OnyxCeleryPriority
from onyx.configs.constants import OnyxCeleryQueues
from onyx.configs.constants import OnyxCeleryTask
from onyx.configs.constants import OnyxRedisLocks
from onyx.configs.constants import USER_FILE_DELETE_MAX_QUEUE_DEPTH
from onyx.configs.constants import USER_FILE_PROCESSING_MAX_QUEUE_DEPTH
from onyx.configs.constants import USER_FILE_PROJECT_SYNC_MAX_QUEUE_DEPTH
from onyx.connectors.file.connector import LocalFileConnector
@@ -93,17 +91,6 @@ def _user_file_delete_lock_key(user_file_id: str | UUID) -> str:
return f"{OnyxRedisLocks.USER_FILE_DELETE_LOCK_PREFIX}:{user_file_id}"
def _user_file_delete_queued_key(user_file_id: str | UUID) -> str:
"""Key that exists while a delete_single_user_file task is sitting in the queue.
The beat generator sets this with a TTL equal to CELERY_USER_FILE_DELETE_TASK_EXPIRES
before enqueuing and the worker deletes it as its first action. This prevents
the beat from adding duplicate tasks for files that already have a live task
in flight.
"""
return f"{OnyxRedisLocks.USER_FILE_DELETE_QUEUED_PREFIX}:{user_file_id}"
def get_user_file_project_sync_queue_depth(celery_app: Celery) -> int:
redis_celery: Redis = celery_app.broker_connection().channel().client # type: ignore
return celery_get_queue_length(
@@ -559,23 +546,7 @@ def process_single_user_file(
ignore_result=True,
)
def check_for_user_file_delete(self: Task, *, tenant_id: str) -> None:
"""Scan for user files with DELETING status and enqueue per-file tasks.
Three mechanisms prevent queue runaway (mirrors check_user_file_processing):
1. **Queue depth backpressure** if the broker queue already has more than
USER_FILE_DELETE_MAX_QUEUE_DEPTH items we skip this beat cycle entirely.
2. **Per-file queued guard** before enqueuing a task we set a short-lived
Redis key (TTL = CELERY_USER_FILE_DELETE_TASK_EXPIRES). If that key
already exists the file already has a live task in the queue, so we skip
it. The worker deletes the key the moment it picks up the task so the
next beat cycle can re-enqueue if the file is still DELETING.
3. **Task expiry** every enqueued task carries an `expires` value equal to
CELERY_USER_FILE_DELETE_TASK_EXPIRES. If a task is still sitting in
the queue after that deadline, Celery discards it without touching the DB.
"""
"""Scan for user files with DELETING status and enqueue per-file tasks."""
task_logger.info("check_for_user_file_delete - Starting")
redis_client = get_redis_client(tenant_id=tenant_id)
lock: RedisLock = redis_client.lock(
@@ -584,23 +555,8 @@ def check_for_user_file_delete(self: Task, *, tenant_id: str) -> None:
)
if not lock.acquire(blocking=False):
return None
enqueued = 0
skipped_guard = 0
try:
# --- Protection 1: queue depth backpressure ---
# NOTE: must use the broker's Redis client (not redis_client) because
# Celery queues live on a separate Redis DB with CELERY_SEPARATOR keys.
r_celery: Redis = self.app.broker_connection().channel().client # type: ignore
queue_len = celery_get_queue_length(OnyxCeleryQueues.USER_FILE_DELETE, r_celery)
if queue_len > USER_FILE_DELETE_MAX_QUEUE_DEPTH:
task_logger.warning(
f"check_for_user_file_delete - Queue depth {queue_len} exceeds "
f"{USER_FILE_DELETE_MAX_QUEUE_DEPTH}, skipping enqueue for "
f"tenant={tenant_id}"
)
return None
with get_session_with_current_tenant() as db_session:
user_file_ids = (
db_session.execute(
@@ -612,40 +568,23 @@ def check_for_user_file_delete(self: Task, *, tenant_id: str) -> None:
.all()
)
for user_file_id in user_file_ids:
# --- Protection 2: per-file queued guard ---
queued_key = _user_file_delete_queued_key(user_file_id)
guard_set = redis_client.set(
queued_key,
1,
ex=CELERY_USER_FILE_DELETE_TASK_EXPIRES,
nx=True,
self.app.send_task(
OnyxCeleryTask.DELETE_SINGLE_USER_FILE,
kwargs={"user_file_id": str(user_file_id), "tenant_id": tenant_id},
queue=OnyxCeleryQueues.USER_FILE_DELETE,
priority=OnyxCeleryPriority.HIGH,
)
if not guard_set:
skipped_guard += 1
continue
# --- Protection 3: task expiry ---
try:
self.app.send_task(
OnyxCeleryTask.DELETE_SINGLE_USER_FILE,
kwargs={
"user_file_id": str(user_file_id),
"tenant_id": tenant_id,
},
queue=OnyxCeleryQueues.USER_FILE_DELETE,
priority=OnyxCeleryPriority.HIGH,
expires=CELERY_USER_FILE_DELETE_TASK_EXPIRES,
)
except Exception:
redis_client.delete(queued_key)
raise
enqueued += 1
except Exception as e:
task_logger.exception(
f"check_for_user_file_delete - Error enqueuing deletes - {e.__class__.__name__}"
)
return None
finally:
if lock.owned():
lock.release()
task_logger.info(
f"check_for_user_file_delete - Enqueued {enqueued} tasks, skipped_guard={skipped_guard} for tenant={tenant_id}"
f"check_for_user_file_delete - Enqueued {enqueued} tasks for tenant={tenant_id}"
)
return None
@@ -663,9 +602,6 @@ def delete_user_file_impl(
file_lock: RedisLock | None = None
if redis_locking:
redis_client = get_redis_client(tenant_id=tenant_id)
# Clear the queued guard so the beat can re-enqueue if deletion fails
# and the file remains in DELETING status.
redis_client.delete(_user_file_delete_queued_key(user_file_id))
file_lock = redis_client.lock(
_user_file_delete_lock_key(user_file_id),
timeout=CELERY_GENERIC_BEAT_LOCK_TIMEOUT,

View File

@@ -297,9 +297,7 @@ class PostgresCacheBackend(CacheBackend):
def _lock_id_for(self, name: str) -> int:
"""Map *name* to a 64-bit signed int for ``pg_advisory_lock``."""
h = hashlib.md5(
f"{self._tenant_id}:{name}".encode(), usedforsecurity=False
).digest()
h = hashlib.md5(f"{self._tenant_id}:{name}".encode()).digest()
return struct.unpack("q", h[:8])[0]

View File

@@ -278,17 +278,14 @@ USING_AWS_MANAGED_OPENSEARCH = (
OPENSEARCH_PROFILING_DISABLED = (
os.environ.get("OPENSEARCH_PROFILING_DISABLED", "").lower() == "true"
)
# Whether to disable match highlights for OpenSearch. Defaults to True for now
# as we investigate query performance.
OPENSEARCH_MATCH_HIGHLIGHTS_DISABLED = (
os.environ.get("OPENSEARCH_MATCH_HIGHLIGHTS_DISABLED", "true").lower() == "true"
)
# When enabled, OpenSearch returns detailed score breakdowns for each hit.
# Useful for debugging and tuning search relevance. Has ~10-30% performance overhead according to documentation.
# Seems for Hybrid Search in practice, the impact is actually more like 1000x slower.
OPENSEARCH_EXPLAIN_ENABLED = (
os.environ.get("OPENSEARCH_EXPLAIN_ENABLED", "").lower() == "true"
)
# Analyzer used for full-text fields (title, content). Use OpenSearch built-in analyzer
# names (e.g. "english", "standard", "german"). Affects stemming and tokenization;
# existing indices need reindexing after a change.
@@ -321,16 +318,8 @@ VERIFY_CREATE_OPENSEARCH_INDEX_ON_INIT_MT = (
OPENSEARCH_MIGRATION_GET_VESPA_CHUNKS_PAGE_SIZE = int(
os.environ.get("OPENSEARCH_MIGRATION_GET_VESPA_CHUNKS_PAGE_SIZE") or 500
)
# If set, will override the default number of shards and replicas for the index.
OPENSEARCH_INDEX_NUM_SHARDS: int | None = (
int(os.environ["OPENSEARCH_INDEX_NUM_SHARDS"])
if os.environ.get("OPENSEARCH_INDEX_NUM_SHARDS", None) is not None
else None
)
OPENSEARCH_INDEX_NUM_REPLICAS: int | None = (
int(os.environ["OPENSEARCH_INDEX_NUM_REPLICAS"])
if os.environ.get("OPENSEARCH_INDEX_NUM_REPLICAS", None) is not None
else None
OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES = int(
os.environ.get("OPENSEARCH_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES") or 0
)
VESPA_HOST = os.environ.get("VESPA_HOST") or "localhost"
@@ -787,6 +776,9 @@ MINI_CHUNK_SIZE = 150
# This is the number of regular chunks per large chunk
LARGE_CHUNK_RATIO = 4
# The number of chunks in an indexing batch
CHUNKS_PER_BATCH = 1000
# Include the document level metadata in each chunk. If the metadata is too long, then it is thrown out
# We don't want the metadata to overwhelm the actual contents of the chunk
SKIP_METADATA_IN_CHUNK = os.environ.get("SKIP_METADATA_IN_CHUNK", "").lower() == "true"

View File

@@ -177,14 +177,6 @@ USER_FILE_PROJECT_SYNC_MAX_QUEUE_DEPTH = 500
CELERY_USER_FILE_PROJECT_SYNC_LOCK_TIMEOUT = 5 * 60 # 5 minutes (in seconds)
# How long a queued user-file-delete task is valid before workers discard it.
# Mirrors the processing task expiry to prevent indefinite queue growth when
# files are stuck in DELETING status and the beat keeps re-enqueuing them.
CELERY_USER_FILE_DELETE_TASK_EXPIRES = 60 # 1 minute (in seconds)
# Max queue depth before the delete beat stops enqueuing more delete tasks.
USER_FILE_DELETE_MAX_QUEUE_DEPTH = 500
CELERY_SANDBOX_FILE_SYNC_LOCK_TIMEOUT = 5 * 60 # 5 minutes (in seconds)
DANSWER_REDIS_FUNCTION_LOCK_PREFIX = "da_function_lock:"
@@ -477,9 +469,6 @@ class OnyxRedisLocks:
USER_FILE_PROJECT_SYNC_QUEUED_PREFIX = "da_lock:user_file_project_sync_queued"
USER_FILE_DELETE_BEAT_LOCK = "da_lock:check_user_file_delete_beat"
USER_FILE_DELETE_LOCK_PREFIX = "da_lock:user_file_delete"
# Short-lived key set when a delete task is enqueued; cleared when the worker picks it up.
# Prevents the beat from re-enqueuing the same file while a delete task is already queued.
USER_FILE_DELETE_QUEUED_PREFIX = "da_lock:user_file_delete_queued"
# Release notes
RELEASE_NOTES_FETCH_LOCK = "da_lock:release_notes_fetch"
@@ -608,9 +597,6 @@ class OnyxCeleryTask:
EXPORT_QUERY_HISTORY_TASK = "export_query_history_task"
EXPORT_QUERY_HISTORY_CLEANUP_TASK = "export_query_history_cleanup_task"
# Hook execution log retention
HOOK_EXECUTION_LOG_CLEANUP_TASK = "hook_execution_log_cleanup_task"
# Sandbox cleanup
CLEANUP_IDLE_SANDBOXES = "cleanup_idle_sandboxes"
CLEANUP_OLD_SNAPSHOTS = "cleanup_old_snapshots"

View File

@@ -157,7 +157,9 @@ def _execute_single_retrieval(
logger.error(f"Error executing request: {e}")
raise e
elif _is_rate_limit_error(e):
results = _execute_with_retry(retrieval_function(**request_kwargs))
results = _execute_with_retry(
lambda: retrieval_function(**request_kwargs).execute()
)
elif e.resp.status == 404 or e.resp.status == 403:
if continue_on_404_or_403:
logger.debug(f"Error executing request: {e}")

View File

@@ -1,233 +0,0 @@
import datetime
from uuid import UUID
from sqlalchemy import delete
from sqlalchemy import select
from sqlalchemy.engine import CursorResult
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import selectinload
from sqlalchemy.orm import Session
from onyx.db.constants import UNSET
from onyx.db.constants import UnsetType
from onyx.db.enums import HookFailStrategy
from onyx.db.enums import HookPoint
from onyx.db.models import Hook
from onyx.db.models import HookExecutionLog
from onyx.error_handling.error_codes import OnyxErrorCode
from onyx.error_handling.exceptions import OnyxError
# ── Hook CRUD ────────────────────────────────────────────────────────────
def get_hook_by_id(
*,
db_session: Session,
hook_id: int,
include_deleted: bool = False,
include_creator: bool = False,
) -> Hook | None:
stmt = select(Hook).where(Hook.id == hook_id)
if not include_deleted:
stmt = stmt.where(Hook.deleted.is_(False))
if include_creator:
stmt = stmt.options(selectinload(Hook.creator))
return db_session.scalar(stmt)
def get_non_deleted_hook_by_hook_point(
*,
db_session: Session,
hook_point: HookPoint,
include_creator: bool = False,
) -> Hook | None:
stmt = (
select(Hook).where(Hook.hook_point == hook_point).where(Hook.deleted.is_(False))
)
if include_creator:
stmt = stmt.options(selectinload(Hook.creator))
return db_session.scalar(stmt)
def get_hooks(
*,
db_session: Session,
include_deleted: bool = False,
include_creator: bool = False,
) -> list[Hook]:
stmt = select(Hook)
if not include_deleted:
stmt = stmt.where(Hook.deleted.is_(False))
if include_creator:
stmt = stmt.options(selectinload(Hook.creator))
stmt = stmt.order_by(Hook.hook_point, Hook.created_at.desc())
return list(db_session.scalars(stmt).all())
def create_hook__no_commit(
*,
db_session: Session,
name: str,
hook_point: HookPoint,
endpoint_url: str | None = None,
api_key: str | None = None,
fail_strategy: HookFailStrategy,
timeout_seconds: float,
is_active: bool = False,
creator_id: UUID | None = None,
) -> Hook:
"""Create a new hook for the given hook point.
At most one non-deleted hook per hook point is allowed. Raises
OnyxError(CONFLICT) if a hook already exists, including under concurrent
duplicate creates where the partial unique index fires an IntegrityError.
"""
existing = get_non_deleted_hook_by_hook_point(
db_session=db_session, hook_point=hook_point
)
if existing:
raise OnyxError(
OnyxErrorCode.CONFLICT,
f"A hook for '{hook_point.value}' already exists (id={existing.id}).",
)
hook = Hook(
name=name,
hook_point=hook_point,
endpoint_url=endpoint_url,
api_key=api_key,
fail_strategy=fail_strategy,
timeout_seconds=timeout_seconds,
is_active=is_active,
creator_id=creator_id,
)
# Use a savepoint so that a failed insert only rolls back this operation,
# not the entire outer transaction.
savepoint = db_session.begin_nested()
try:
db_session.add(hook)
savepoint.commit()
except IntegrityError as exc:
savepoint.rollback()
if "ix_hook_one_non_deleted_per_point" in str(exc.orig):
raise OnyxError(
OnyxErrorCode.CONFLICT,
f"A hook for '{hook_point.value}' already exists.",
)
raise # re-raise unrelated integrity errors (FK violations, etc.)
return hook
def update_hook__no_commit(
*,
db_session: Session,
hook_id: int,
name: str | None = None,
endpoint_url: str | None | UnsetType = UNSET,
api_key: str | None | UnsetType = UNSET,
fail_strategy: HookFailStrategy | None = None,
timeout_seconds: float | None = None,
is_active: bool | None = None,
is_reachable: bool | None = None,
include_creator: bool = False,
) -> Hook:
"""Update hook fields.
Sentinel conventions:
- endpoint_url, api_key: pass UNSET to leave unchanged; pass None to clear.
- name, fail_strategy, timeout_seconds, is_active, is_reachable: pass None to leave unchanged.
"""
hook = get_hook_by_id(
db_session=db_session, hook_id=hook_id, include_creator=include_creator
)
if hook is None:
raise OnyxError(OnyxErrorCode.NOT_FOUND, f"Hook with id {hook_id} not found.")
if name is not None:
hook.name = name
if not isinstance(endpoint_url, UnsetType):
hook.endpoint_url = endpoint_url
if not isinstance(api_key, UnsetType):
hook.api_key = api_key # type: ignore[assignment] # EncryptedString coerces str → SensitiveValue at the ORM level
if fail_strategy is not None:
hook.fail_strategy = fail_strategy
if timeout_seconds is not None:
hook.timeout_seconds = timeout_seconds
if is_active is not None:
hook.is_active = is_active
if is_reachable is not None:
hook.is_reachable = is_reachable
db_session.flush()
return hook
def delete_hook__no_commit(
*,
db_session: Session,
hook_id: int,
) -> None:
hook = get_hook_by_id(db_session=db_session, hook_id=hook_id)
if hook is None:
raise OnyxError(OnyxErrorCode.NOT_FOUND, f"Hook with id {hook_id} not found.")
hook.deleted = True
hook.is_active = False
db_session.flush()
# ── HookExecutionLog CRUD ────────────────────────────────────────────────
def create_hook_execution_log__no_commit(
*,
db_session: Session,
hook_id: int,
is_success: bool,
error_message: str | None = None,
status_code: int | None = None,
duration_ms: int | None = None,
) -> HookExecutionLog:
log = HookExecutionLog(
hook_id=hook_id,
is_success=is_success,
error_message=error_message,
status_code=status_code,
duration_ms=duration_ms,
)
db_session.add(log)
db_session.flush()
return log
def get_hook_execution_logs(
*,
db_session: Session,
hook_id: int,
limit: int,
) -> list[HookExecutionLog]:
stmt = (
select(HookExecutionLog)
.where(HookExecutionLog.hook_id == hook_id)
.order_by(HookExecutionLog.created_at.desc())
.limit(limit)
)
return list(db_session.scalars(stmt).all())
def cleanup_old_execution_logs__no_commit(
*,
db_session: Session,
max_age_days: int,
) -> int:
"""Delete execution logs older than max_age_days. Returns the number of rows deleted."""
cutoff = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(
days=max_age_days
)
result: CursorResult = db_session.execute( # type: ignore[assignment]
delete(HookExecutionLog)
.where(HookExecutionLog.created_at < cutoff)
.execution_options(synchronize_session=False)
)
return result.rowcount

View File

@@ -12,7 +12,6 @@ from sqlalchemy.orm import Session
from starlette.background import BackgroundTasks
from onyx.configs.app_configs import DISABLE_VECTOR_DB
from onyx.configs.constants import CELERY_USER_FILE_PROCESSING_TASK_EXPIRES
from onyx.configs.constants import FileOrigin
from onyx.configs.constants import OnyxCeleryPriority
from onyx.configs.constants import OnyxCeleryQueues
@@ -145,7 +144,6 @@ def upload_files_to_user_files_with_indexing(
kwargs={"user_file_id": user_file.id, "tenant_id": tenant_id},
queue=OnyxCeleryQueues.USER_FILE_PROCESSING,
priority=OnyxCeleryPriority.HIGH,
expires=CELERY_USER_FILE_PROCESSING_TASK_EXPIRES,
)
logger.info(
f"Triggered indexing for user_file_id={user_file.id} with task_id={task.id}"

View File

@@ -5,6 +5,7 @@ accidentally reaches the vector DB layer will fail loudly instead of timing
out against a nonexistent Vespa/OpenSearch instance.
"""
from collections.abc import Iterable
from typing import Any
from onyx.context.search.models import IndexFilters
@@ -66,7 +67,7 @@ class DisabledDocumentIndex(DocumentIndex):
# ------------------------------------------------------------------
def index(
self,
chunks: list[DocMetadataAwareIndexChunk], # noqa: ARG002
chunks: Iterable[DocMetadataAwareIndexChunk], # noqa: ARG002
index_batch_params: IndexBatchParams, # noqa: ARG002
) -> set[DocumentInsertionRecord]:
raise RuntimeError(VECTOR_DB_DISABLED_ERROR)

View File

@@ -1,4 +1,5 @@
import abc
from collections.abc import Iterable
from dataclasses import dataclass
from datetime import datetime
from typing import Any
@@ -206,7 +207,7 @@ class Indexable(abc.ABC):
@abc.abstractmethod
def index(
self,
chunks: list[DocMetadataAwareIndexChunk],
chunks: Iterable[DocMetadataAwareIndexChunk],
index_batch_params: IndexBatchParams,
) -> set[DocumentInsertionRecord]:
"""
@@ -226,8 +227,8 @@ class Indexable(abc.ABC):
it is done automatically outside of this code.
Parameters:
- chunks: Document chunks with all of the information needed for indexing to the document
index.
- chunks: Document chunks with all of the information needed for
indexing to the document index.
- tenant_id: The tenant id of the user whose chunks are being indexed
- large_chunks_enabled: Whether large chunks are enabled

View File

@@ -1,4 +1,5 @@
import abc
from collections.abc import Iterable
from typing import Self
from pydantic import BaseModel
@@ -209,10 +210,10 @@ class Indexable(abc.ABC):
@abc.abstractmethod
def index(
self,
chunks: list[DocMetadataAwareIndexChunk],
chunks: Iterable[DocMetadataAwareIndexChunk],
indexing_metadata: IndexingMetadata,
) -> list[DocumentInsertionRecord]:
"""Indexes a list of document chunks into the document index.
"""Indexes an iterable of document chunks into the document index.
This is often a batch operation including chunks from multiple
documents.

View File

@@ -18,7 +18,6 @@ from onyx.configs.app_configs import OPENSEARCH_HOST
from onyx.configs.app_configs import OPENSEARCH_REST_API_PORT
from onyx.document_index.interfaces_new import TenantState
from onyx.document_index.opensearch.schema import DocumentChunk
from onyx.document_index.opensearch.schema import DocumentChunkWithoutVectors
from onyx.document_index.opensearch.schema import get_opensearch_doc_chunk_id
from onyx.document_index.opensearch.search import DEFAULT_OPENSEARCH_MAX_RESULT_WINDOW
from onyx.utils.logger import setup_logger
@@ -57,8 +56,8 @@ class SearchHit(BaseModel, Generic[SchemaDocumentModel]):
# Maps schema property name to a list of highlighted snippets with match
# terms wrapped in tags (e.g. "something <hi>keyword</hi> other thing").
match_highlights: dict[str, list[str]] = {}
# Score explanation from OpenSearch when "explain": true is set in the
# query. Contains detailed breakdown of how the score was calculated.
# Score explanation from OpenSearch when "explain": true is set in the query.
# Contains detailed breakdown of how the score was calculated.
explanation: dict[str, Any] | None = None
@@ -834,13 +833,9 @@ class OpenSearchIndexClient(OpenSearchClient):
@log_function_time(print_only=True, debug_only=True)
def search(
self, body: dict[str, Any], search_pipeline_id: str | None
) -> list[SearchHit[DocumentChunkWithoutVectors]]:
) -> list[SearchHit[DocumentChunk]]:
"""Searches the index.
NOTE: Does not return vector fields. In order to take advantage of
performance benefits, the search body should exclude the schema's vector
fields.
TODO(andrei): Ideally we could check that every field in the body is
present in the index, to avoid a class of runtime bugs that could easily
be caught during development. Or change the function signature to accept
@@ -888,7 +883,7 @@ class OpenSearchIndexClient(OpenSearchClient):
raise_on_timeout=True,
)
search_hits: list[SearchHit[DocumentChunkWithoutVectors]] = []
search_hits: list[SearchHit[DocumentChunk]] = []
for hit in hits:
document_chunk_source: dict[str, Any] | None = hit.get("_source")
if not document_chunk_source:
@@ -898,10 +893,8 @@ class OpenSearchIndexClient(OpenSearchClient):
document_chunk_score = hit.get("_score", None)
match_highlights: dict[str, list[str]] = hit.get("highlight", {})
explanation: dict[str, Any] | None = hit.get("_explanation", None)
search_hit = SearchHit[DocumentChunkWithoutVectors](
document_chunk=DocumentChunkWithoutVectors.model_validate(
document_chunk_source
),
search_hit = SearchHit[DocumentChunk](
document_chunk=DocumentChunk.model_validate(document_chunk_source),
score=document_chunk_score,
match_highlights=match_highlights,
explanation=explanation,

View File

@@ -3,6 +3,10 @@
import os
from enum import Enum
from onyx.configs.app_configs import (
OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
)
DEFAULT_MAX_CHUNK_SIZE = 512
@@ -37,10 +41,10 @@ M = 32 # Set relatively high for better accuracy.
# we have a much higher chance of all 10 of the final desired docs showing up
# and getting scored. In worse situations, the final 10 docs don't even show up
# as the final 10 (worse than just a miss at the reranking step).
# Defaults to 100 for now. Initially this defaulted to 750 but we were seeing
# poor search performance.
DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES = int(
os.environ.get("DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES", 100)
DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES = (
OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES
if OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES > 0
else 750
)
# Number of vectors to examine to decide the top k neighbors for the HNSW
@@ -50,7 +54,7 @@ DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES = int(
# larger than k, you can provide the size parameter to limit the final number of
# results to k." from
# https://docs.opensearch.org/latest/query-dsl/specialized/k-nn/index/#ef_search
EF_SEARCH = DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES
EF_SEARCH = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES
class HybridSearchSubqueryConfiguration(Enum):

View File

@@ -1,11 +1,12 @@
import json
from collections import defaultdict
from collections.abc import Iterable
from typing import Any
import httpx
from opensearchpy import NotFoundError
from onyx.access.models import DocumentAccess
from onyx.configs.app_configs import CHUNKS_PER_BATCH
from onyx.configs.app_configs import VERIFY_CREATE_OPENSEARCH_INDEX_ON_INIT_MT
from onyx.configs.chat_configs import NUM_RETURNED_HITS
from onyx.configs.chat_configs import TITLE_CONTENT_RATIO
@@ -47,7 +48,6 @@ from onyx.document_index.opensearch.schema import ACCESS_CONTROL_LIST_FIELD_NAME
from onyx.document_index.opensearch.schema import CONTENT_FIELD_NAME
from onyx.document_index.opensearch.schema import DOCUMENT_SETS_FIELD_NAME
from onyx.document_index.opensearch.schema import DocumentChunk
from onyx.document_index.opensearch.schema import DocumentChunkWithoutVectors
from onyx.document_index.opensearch.schema import DocumentSchema
from onyx.document_index.opensearch.schema import get_opensearch_doc_chunk_id
from onyx.document_index.opensearch.schema import GLOBAL_BOOST_FIELD_NAME
@@ -118,7 +118,7 @@ def set_cluster_state(client: OpenSearchClient) -> None:
def _convert_retrieved_opensearch_chunk_to_inference_chunk_uncleaned(
chunk: DocumentChunkWithoutVectors,
chunk: DocumentChunk,
score: float | None,
highlights: dict[str, list[str]],
) -> InferenceChunkUncleaned:
@@ -350,7 +350,7 @@ class OpenSearchOldDocumentIndex(OldDocumentIndex):
def index(
self,
chunks: list[DocMetadataAwareIndexChunk],
chunks: Iterable[DocMetadataAwareIndexChunk],
index_batch_params: IndexBatchParams,
) -> set[OldDocumentInsertionRecord]:
"""
@@ -646,8 +646,8 @@ class OpenSearchDocumentIndex(DocumentIndex):
def index(
self,
chunks: list[DocMetadataAwareIndexChunk],
indexing_metadata: IndexingMetadata, # noqa: ARG002
chunks: Iterable[DocMetadataAwareIndexChunk],
indexing_metadata: IndexingMetadata,
) -> list[DocumentInsertionRecord]:
"""Indexes a list of document chunks into the document index.
@@ -672,29 +672,32 @@ class OpenSearchDocumentIndex(DocumentIndex):
document is newly indexed or had already existed and was just
updated.
"""
# Group chunks by document ID.
doc_id_to_chunks: dict[str, list[DocMetadataAwareIndexChunk]] = defaultdict(
list
total_chunks = sum(
cc.new_chunk_cnt
for cc in indexing_metadata.doc_id_to_chunk_cnt_diff.values()
)
for chunk in chunks:
doc_id_to_chunks[chunk.source_document.id].append(chunk)
logger.debug(
f"[OpenSearchDocumentIndex] Indexing {len(chunks)} chunks from {len(doc_id_to_chunks)} "
f"[OpenSearchDocumentIndex] Indexing {total_chunks} chunks from {len(indexing_metadata.doc_id_to_chunk_cnt_diff)} "
f"documents for index {self._index_name}."
)
document_indexing_results: list[DocumentInsertionRecord] = []
# Try to index per-document.
for _, chunks in doc_id_to_chunks.items():
deleted_doc_ids: set[str] = set()
# Buffer chunks per document as they arrive from the iterable.
# When the document ID changes flush the buffered chunks.
current_doc_id: str | None = None
current_chunks: list[DocMetadataAwareIndexChunk] = []
def _flush_chunks(doc_chunks: list[DocMetadataAwareIndexChunk]) -> None:
# Create a batch of OpenSearch-formatted chunks for bulk insertion.
# Do this before deleting existing chunks to reduce the amount of
# time the document index has no content for a given document, and
# to reduce the chance of entering a state where we delete chunks,
# then some error happens, and never successfully index new chunks.
# Since we are doing this in batches, an error occurring midway
# can result in a state where chunks are deleted and not all the
# new chunks have been indexed.
chunk_batch: list[DocumentChunk] = [
_convert_onyx_chunk_to_opensearch_document(chunk) for chunk in chunks
_convert_onyx_chunk_to_opensearch_document(chunk)
for chunk in doc_chunks
]
onyx_document: Document = chunks[0].source_document
onyx_document: Document = doc_chunks[0].source_document
# First delete the doc's chunks from the index. This is so that
# there are no dangling chunks in the index, in the event that the
# new document's content contains fewer chunks than the previous
@@ -703,22 +706,43 @@ class OpenSearchDocumentIndex(DocumentIndex):
# if the chunk count has actually decreased. This assumes that
# overlapping chunks are perfectly overwritten. If we can't
# guarantee that then we need the code as-is.
num_chunks_deleted = self.delete(
onyx_document.id, onyx_document.chunk_count
)
# If we see that chunks were deleted we assume the doc already
# existed.
document_insertion_record = DocumentInsertionRecord(
document_id=onyx_document.id,
already_existed=num_chunks_deleted > 0,
)
if onyx_document.id not in deleted_doc_ids:
num_chunks_deleted = self.delete(
onyx_document.id, onyx_document.chunk_count
)
deleted_doc_ids.add(onyx_document.id)
# If we see that chunks were deleted we assume the doc already
# existed. We record the result before bulk_index_documents
# runs. If indexing raises, this entire result list is discarded
# by the caller's retry logic, so early recording is safe.
document_indexing_results.append(
DocumentInsertionRecord(
document_id=onyx_document.id,
already_existed=num_chunks_deleted > 0,
)
)
# Now index. This will raise if a chunk of the same ID exists, which
# we do not expect because we should have deleted all chunks.
self._client.bulk_index_documents(
documents=chunk_batch,
tenant_state=self._tenant_state,
)
document_indexing_results.append(document_insertion_record)
for chunk in chunks:
doc_id = chunk.source_document.id
if doc_id != current_doc_id:
if current_chunks:
_flush_chunks(current_chunks)
current_doc_id = doc_id
current_chunks = [chunk]
elif len(current_chunks) >= CHUNKS_PER_BATCH:
_flush_chunks(current_chunks)
current_chunks = [chunk]
else:
current_chunks.append(chunk)
if current_chunks:
_flush_chunks(current_chunks)
return document_indexing_results
@@ -881,7 +905,7 @@ class OpenSearchDocumentIndex(DocumentIndex):
)
results: list[InferenceChunk] = []
for chunk_request in chunk_requests:
search_hits: list[SearchHit[DocumentChunkWithoutVectors]] = []
search_hits: list[SearchHit[DocumentChunk]] = []
query_body = DocumentQuery.get_from_document_id_query(
document_id=chunk_request.document_id,
tenant_state=self._tenant_state,
@@ -945,7 +969,7 @@ class OpenSearchDocumentIndex(DocumentIndex):
include_hidden=False,
)
normalization_pipeline_name, _ = get_normalization_pipeline_name_and_config()
search_hits: list[SearchHit[DocumentChunkWithoutVectors]] = self._client.search(
search_hits: list[SearchHit[DocumentChunk]] = self._client.search(
body=query_body,
search_pipeline_id=normalization_pipeline_name,
)
@@ -977,7 +1001,7 @@ class OpenSearchDocumentIndex(DocumentIndex):
index_filters=filters,
num_to_retrieve=num_to_retrieve,
)
search_hits: list[SearchHit[DocumentChunkWithoutVectors]] = self._client.search(
search_hits: list[SearchHit[DocumentChunk]] = self._client.search(
body=query_body,
search_pipeline_id=None,
)

View File

@@ -11,8 +11,6 @@ from pydantic import model_serializer
from pydantic import model_validator
from pydantic import SerializerFunctionWrapHandler
from onyx.configs.app_configs import OPENSEARCH_INDEX_NUM_REPLICAS
from onyx.configs.app_configs import OPENSEARCH_INDEX_NUM_SHARDS
from onyx.configs.app_configs import OPENSEARCH_TEXT_ANALYZER
from onyx.configs.app_configs import USING_AWS_MANAGED_OPENSEARCH
from onyx.document_index.interfaces_new import TenantState
@@ -102,9 +100,9 @@ def set_or_convert_timezone_to_utc(value: datetime) -> datetime:
return value
class DocumentChunkWithoutVectors(BaseModel):
class DocumentChunk(BaseModel):
"""
Represents a chunk of a document in the OpenSearch index without vectors.
Represents a chunk of a document in the OpenSearch index.
The names of these fields are based on the OpenSearch schema. Changes to the
schema require changes here. See get_document_schema.
@@ -126,7 +124,9 @@ class DocumentChunkWithoutVectors(BaseModel):
# Either both should be None or both should be non-None.
title: str | None = None
title_vector: list[float] | None = None
content: str
content_vector: list[float]
source_type: str
# A list of key-value pairs separated by INDEX_SEPARATOR. See
@@ -176,9 +176,19 @@ class DocumentChunkWithoutVectors(BaseModel):
def __str__(self) -> str:
return (
f"DocumentChunk(document_id={self.document_id}, chunk_index={self.chunk_index}, "
f"content length={len(self.content)}, tenant_id={self.tenant_id.tenant_id})."
f"content length={len(self.content)}, content vector length={len(self.content_vector)}, "
f"tenant_id={self.tenant_id.tenant_id})"
)
@model_validator(mode="after")
def check_title_and_title_vector_are_consistent(self) -> Self:
# title and title_vector should both either be None or not.
if self.title is not None and self.title_vector is None:
raise ValueError("Bug: Title vector must not be None if title is not None.")
if self.title_vector is not None and self.title is None:
raise ValueError("Bug: Title must not be None if title vector is not None.")
return self
@model_serializer(mode="wrap")
def serialize_model(
self, handler: SerializerFunctionWrapHandler
@@ -295,35 +305,6 @@ class DocumentChunkWithoutVectors(BaseModel):
return TenantState(tenant_id=value, multitenant=MULTI_TENANT)
class DocumentChunk(DocumentChunkWithoutVectors):
"""Represents a chunk of a document in the OpenSearch index.
The names of these fields are based on the OpenSearch schema. Changes to the
schema require changes here. See get_document_schema.
"""
model_config = {"frozen": True}
title_vector: list[float] | None = None
content_vector: list[float]
def __str__(self) -> str:
return (
f"DocumentChunk(document_id={self.document_id}, chunk_index={self.chunk_index}, "
f"content length={len(self.content)}, content vector length={len(self.content_vector)}, "
f"tenant_id={self.tenant_id.tenant_id})"
)
@model_validator(mode="after")
def check_title_and_title_vector_are_consistent(self) -> Self:
# title and title_vector should both either be None or not.
if self.title is not None and self.title_vector is None:
raise ValueError("Bug: Title vector must not be None if title is not None.")
if self.title_vector is not None and self.title is None:
raise ValueError("Bug: Title must not be None if title vector is not None.")
return self
class DocumentSchema:
"""
Represents the schema and indexing strategies of the OpenSearch index.
@@ -536,34 +517,77 @@ class DocumentSchema:
return schema
@staticmethod
def get_index_settings_based_on_environment() -> dict[str, Any]:
def get_index_settings() -> dict[str, Any]:
"""
Returns the index settings based on the environment.
Standard settings for reasonable local index and search performance.
"""
if USING_AWS_MANAGED_OPENSEARCH:
# NOTE: The number of data copies, including the primary (not a
# replica) copy, must be divisible by the number of AZs.
if MULTI_TENANT:
number_of_shards = 324
number_of_replicas = 2
else:
number_of_shards = 3
number_of_replicas = 2
else:
number_of_shards = 1
number_of_replicas = 1
if OPENSEARCH_INDEX_NUM_SHARDS is not None:
number_of_shards = OPENSEARCH_INDEX_NUM_SHARDS
if OPENSEARCH_INDEX_NUM_REPLICAS is not None:
number_of_replicas = OPENSEARCH_INDEX_NUM_REPLICAS
return {
"index": {
"number_of_shards": number_of_shards,
"number_of_replicas": number_of_replicas,
"number_of_shards": 1,
"number_of_replicas": 1,
# Required for vector search.
"knn": True,
"knn.algo_param.ef_search": EF_SEARCH,
}
}
@staticmethod
def get_index_settings_for_aws_managed_opensearch_st_dev() -> dict[str, Any]:
"""
Settings for AWS-managed OpenSearch.
Our AWS-managed OpenSearch cluster has 3 data nodes in 3 availability
zones.
- We use 3 shards to distribute load across all data nodes.
- We use 2 replicas to ensure each shard has a copy in each
availability zone. This is a hard requirement from AWS. The number
of data copies, including the primary (not a replica) copy, must be
divisible by the number of AZs.
"""
return {
"index": {
"number_of_shards": 3,
"number_of_replicas": 2,
# Required for vector search.
"knn": True,
"knn.algo_param.ef_search": EF_SEARCH,
}
}
@staticmethod
def get_index_settings_for_aws_managed_opensearch_mt_cloud() -> dict[str, Any]:
"""
Settings for AWS-managed OpenSearch in multi-tenant cloud.
324 shards very roughly targets a storage load of ~30Gb per shard, which
according to AWS OpenSearch documentation is within a good target range.
As documented above we need 2 replicas for a total of 3 copies of the
data because the cluster is configured with 3-AZ awareness.
"""
return {
"index": {
"number_of_shards": 324,
"number_of_replicas": 2,
# Required for vector search.
"knn": True,
"knn.algo_param.ef_search": EF_SEARCH,
}
}
@staticmethod
def get_index_settings_based_on_environment() -> dict[str, Any]:
"""
Returns the index settings based on the environment.
"""
if USING_AWS_MANAGED_OPENSEARCH:
if MULTI_TENANT:
return (
DocumentSchema.get_index_settings_for_aws_managed_opensearch_mt_cloud()
)
else:
return (
DocumentSchema.get_index_settings_for_aws_managed_opensearch_st_dev()
)
else:
return DocumentSchema.get_index_settings()

View File

@@ -7,7 +7,6 @@ from uuid import UUID
from onyx.configs.app_configs import DEFAULT_OPENSEARCH_QUERY_TIMEOUT_S
from onyx.configs.app_configs import OPENSEARCH_EXPLAIN_ENABLED
from onyx.configs.app_configs import OPENSEARCH_MATCH_HIGHLIGHTS_DISABLED
from onyx.configs.app_configs import OPENSEARCH_PROFILING_DISABLED
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import INDEX_SEPARATOR
@@ -16,7 +15,7 @@ from onyx.context.search.models import Tag
from onyx.document_index.interfaces_new import TenantState
from onyx.document_index.opensearch.constants import ASSUMED_DOCUMENT_AGE_DAYS
from onyx.document_index.opensearch.constants import (
DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES,
DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
)
from onyx.document_index.opensearch.constants import (
DEFAULT_OPENSEARCH_MAX_RESULT_WINDOW,
@@ -236,17 +235,9 @@ class DocumentQuery:
# returning some number of results less than the index max allowed
# return size.
"size": DEFAULT_OPENSEARCH_MAX_RESULT_WINDOW,
# By default exclude retrieving the vector fields in order to save
# on retrieval cost as we don't need them upstream.
"_source": {
"excludes": [TITLE_VECTOR_FIELD_NAME, CONTENT_VECTOR_FIELD_NAME]
},
"_source": get_full_document,
"timeout": f"{DEFAULT_OPENSEARCH_QUERY_TIMEOUT_S}s",
}
if not get_full_document:
# If we explicitly do not want the underlying document, we will only
# retrieve IDs.
final_get_ids_query["_source"] = False
if not OPENSEARCH_PROFILING_DISABLED:
final_get_ids_query["profile"] = True
@@ -341,7 +332,7 @@ class DocumentQuery:
# TODO(andrei, yuhong): We can tune this more dynamically based on
# num_hits.
max_results_per_subquery = DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES
max_results_per_subquery = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES
hybrid_search_subqueries = DocumentQuery._get_hybrid_search_subqueries(
query_text, query_vector, vector_candidates=max_results_per_subquery
@@ -365,6 +356,9 @@ class DocumentQuery:
attached_document_ids=index_filters.attached_document_ids,
hierarchy_node_ids=index_filters.hierarchy_node_ids,
)
match_highlights_configuration = (
DocumentQuery._get_match_highlights_configuration()
)
# See https://docs.opensearch.org/latest/query-dsl/compound/hybrid/
hybrid_search_query: dict[str, Any] = {
@@ -391,19 +385,10 @@ class DocumentQuery:
final_hybrid_search_body: dict[str, Any] = {
"query": hybrid_search_query,
"size": num_hits,
"highlight": match_highlights_configuration,
"timeout": f"{DEFAULT_OPENSEARCH_QUERY_TIMEOUT_S}s",
# Exclude retrieving the vector fields in order to save on
# retrieval cost as we don't need them upstream.
"_source": {
"excludes": [TITLE_VECTOR_FIELD_NAME, CONTENT_VECTOR_FIELD_NAME]
},
}
if not OPENSEARCH_MATCH_HIGHLIGHTS_DISABLED:
final_hybrid_search_body["highlight"] = (
DocumentQuery._get_match_highlights_configuration()
)
# Explain is for scoring breakdowns.
if OPENSEARCH_EXPLAIN_ENABLED:
final_hybrid_search_body["explain"] = True
@@ -461,11 +446,6 @@ class DocumentQuery:
},
"size": num_to_retrieve,
"timeout": f"{DEFAULT_OPENSEARCH_QUERY_TIMEOUT_S}s",
# Exclude retrieving the vector fields in order to save on
# retrieval cost as we don't need them upstream.
"_source": {
"excludes": [TITLE_VECTOR_FIELD_NAME, CONTENT_VECTOR_FIELD_NAME]
},
}
if not OPENSEARCH_PROFILING_DISABLED:
final_random_search_query["profile"] = True
@@ -480,7 +460,7 @@ class DocumentQuery:
# search. This is higher than the number of results because the scoring
# is hybrid. For a detailed breakdown, see where the default value is
# set.
vector_candidates: int = DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES,
vector_candidates: int = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
) -> list[dict[str, Any]]:
"""Returns subqueries for hybrid search.
@@ -566,7 +546,7 @@ class DocumentQuery:
@staticmethod
def _get_title_vector_similarity_search_query(
query_vector: list[float],
vector_candidates: int = DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES,
vector_candidates: int = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
) -> dict[str, Any]:
return {
"knn": {
@@ -580,7 +560,7 @@ class DocumentQuery:
@staticmethod
def _get_content_vector_similarity_search_query(
query_vector: list[float],
vector_candidates: int = DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES,
vector_candidates: int = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
) -> dict[str, Any]:
return {
"knn": {

View File

@@ -6,6 +6,7 @@ import re
import time
import urllib
import zipfile
from collections.abc import Iterable
from dataclasses import dataclass
from datetime import datetime
from datetime import timedelta
@@ -461,7 +462,7 @@ class VespaIndex(DocumentIndex):
def index(
self,
chunks: list[DocMetadataAwareIndexChunk],
chunks: Iterable[DocMetadataAwareIndexChunk],
index_batch_params: IndexBatchParams,
) -> set[OldDocumentInsertionRecord]:
"""

View File

@@ -1,6 +1,8 @@
import concurrent.futures
import logging
import random
from collections.abc import Generator
from collections.abc import Iterable
from typing import Any
from uuid import UUID
@@ -8,6 +10,7 @@ import httpx
from pydantic import BaseModel
from retry import retry
from onyx.configs.app_configs import CHUNKS_PER_BATCH
from onyx.configs.app_configs import RECENCY_BIAS_MULTIPLIER
from onyx.configs.app_configs import RERANK_COUNT
from onyx.configs.chat_configs import DOC_TIME_DECAY
@@ -318,7 +321,7 @@ class VespaDocumentIndex(DocumentIndex):
def index(
self,
chunks: list[DocMetadataAwareIndexChunk],
chunks: Iterable[DocMetadataAwareIndexChunk],
indexing_metadata: IndexingMetadata,
) -> list[DocumentInsertionRecord]:
doc_id_to_chunk_cnt_diff = indexing_metadata.doc_id_to_chunk_cnt_diff
@@ -338,22 +341,31 @@ class VespaDocumentIndex(DocumentIndex):
# Vespa has restrictions on valid characters, yet document IDs come from
# external w.r.t. this class. We need to sanitize them.
cleaned_chunks: list[DocMetadataAwareIndexChunk] = [
clean_chunk_id_copy(chunk) for chunk in chunks
]
assert len(cleaned_chunks) == len(
chunks
), "Bug: Cleaned chunks and input chunks have different lengths."
#
# Instead of materializing all cleaned chunks upfront, we stream them
# through a generator that cleans IDs and builds the original-ID mapping
# incrementally as chunks flow into Vespa.
def _clean_and_track(
chunks_iter: Iterable[DocMetadataAwareIndexChunk],
id_map: dict[str, str],
seen_ids: set[str],
) -> Generator[DocMetadataAwareIndexChunk, None, None]:
"""Cleans chunk IDs and builds the original-ID mapping
incrementally as chunks flow through, avoiding a separate
materialization pass."""
for chunk in chunks_iter:
original_id = chunk.source_document.id
cleaned = clean_chunk_id_copy(chunk)
cleaned_id = cleaned.source_document.id
# Needed so the final DocumentInsertionRecord returned can have
# the original document ID. cleaned_chunks might not contain IDs
# exactly as callers supplied them.
id_map[cleaned_id] = original_id
seen_ids.add(cleaned_id)
yield cleaned
# Needed so the final DocumentInsertionRecord returned can have the
# original document ID. cleaned_chunks might not contain IDs exactly as
# callers supplied them.
new_document_id_to_original_document_id: dict[str, str] = dict()
for i, cleaned_chunk in enumerate(cleaned_chunks):
old_chunk = chunks[i]
new_document_id_to_original_document_id[
cleaned_chunk.source_document.id
] = old_chunk.source_document.id
new_document_id_to_original_document_id: dict[str, str] = {}
all_cleaned_doc_ids: set[str] = set()
existing_docs: set[str] = set()
@@ -409,8 +421,16 @@ class VespaDocumentIndex(DocumentIndex):
executor=executor,
)
# Insert new Vespa documents.
for chunk_batch in batch_generator(cleaned_chunks, BATCH_SIZE):
# Insert new Vespa documents, streaming through the cleaning
# pipeline so chunks are never fully materialized.
cleaned_chunks = _clean_and_track(
chunks,
new_document_id_to_original_document_id,
all_cleaned_doc_ids,
)
for chunk_batch in batch_generator(
cleaned_chunks, min(BATCH_SIZE, CHUNKS_PER_BATCH)
):
batch_index_vespa_chunks(
chunks=chunk_batch,
index_name=self._index_name,
@@ -419,10 +439,6 @@ class VespaDocumentIndex(DocumentIndex):
executor=executor,
)
all_cleaned_doc_ids: set[str] = {
chunk.source_document.id for chunk in cleaned_chunks
}
return [
DocumentInsertionRecord(
document_id=new_document_id_to_original_document_id[cleaned_doc_id],

View File

@@ -88,13 +88,9 @@ def summarize_image_with_error_handling(
try:
return summarize_image_pipeline(llm, image_data, user_prompt, system_prompt)
except UnsupportedImageFormatError:
magic_hex = image_data[:8].hex() if image_data else "empty"
logger.info(
"Skipping image summarization due to unsupported MIME type "
"for %s (magic_bytes=%s, size=%d bytes)",
"Skipping image summarization due to unsupported MIME type for %s",
context_name,
magic_hex,
len(image_data),
)
return None
@@ -138,23 +134,9 @@ def _summarize_image(
return summary
except Exception as e:
# Extract structured details from LiteLLM exceptions when available,
# rather than dumping the full messages payload (which contains base64
# image data and produces enormous, unreadable error logs).
str_e = str(e)
if len(str_e) > 512:
str_e = str_e[:512] + "... (truncated)"
parts = [f"Summarization failed: {type(e).__name__}: {str_e}"]
status_code = getattr(e, "status_code", None)
llm_provider = getattr(e, "llm_provider", None)
model = getattr(e, "model", None)
if status_code is not None:
parts.append(f"status_code={status_code}")
if llm_provider is not None:
parts.append(f"llm_provider={llm_provider}")
if model is not None:
parts.append(f"model={model}")
raise ValueError(" | ".join(parts)) from e
error_msg = f"Summarization failed. Messages: {messages}"
error_msg = error_msg[:1024]
raise ValueError(error_msg) from e
def _encode_image_for_llm_prompt(image_data: bytes) -> str:

View File

@@ -1,127 +0,0 @@
from datetime import datetime
from enum import Enum
from typing import Annotated
from typing import Any
from pydantic import BaseModel
from pydantic import Field
from pydantic import field_validator
from pydantic import model_validator
from pydantic import SecretStr
from onyx.db.enums import HookFailStrategy
from onyx.db.enums import HookPoint
NonEmptySecretStr = Annotated[SecretStr, Field(min_length=1)]
# ---------------------------------------------------------------------------
# Request models
# ---------------------------------------------------------------------------
class HookCreateRequest(BaseModel):
name: str = Field(min_length=1)
hook_point: HookPoint
endpoint_url: str = Field(min_length=1)
api_key: NonEmptySecretStr | None = None
fail_strategy: HookFailStrategy | None = None # if None, uses HookPointSpec default
timeout_seconds: float | None = Field(
default=None, gt=0
) # if None, uses HookPointSpec default
@field_validator("name", "endpoint_url")
@classmethod
def no_whitespace_only(cls, v: str) -> str:
if not v.strip():
raise ValueError("cannot be whitespace-only.")
return v
class HookUpdateRequest(BaseModel):
name: str | None = None
endpoint_url: str | None = None
api_key: NonEmptySecretStr | None = None
fail_strategy: HookFailStrategy | None = (
None # if None in model_fields_set, reset to spec default
)
timeout_seconds: float | None = Field(
default=None, gt=0
) # if None in model_fields_set, reset to spec default
@model_validator(mode="after")
def require_at_least_one_field(self) -> "HookUpdateRequest":
if not self.model_fields_set:
raise ValueError("At least one field must be provided for an update.")
if "name" in self.model_fields_set and not (self.name or "").strip():
raise ValueError("name cannot be cleared.")
if (
"endpoint_url" in self.model_fields_set
and not (self.endpoint_url or "").strip()
):
raise ValueError("endpoint_url cannot be cleared.")
return self
# ---------------------------------------------------------------------------
# Response models
# ---------------------------------------------------------------------------
class HookPointMetaResponse(BaseModel):
hook_point: HookPoint
display_name: str
description: str
docs_url: str | None
input_schema: dict[str, Any]
output_schema: dict[str, Any]
default_timeout_seconds: float
default_fail_strategy: HookFailStrategy
fail_hard_description: str
class HookResponse(BaseModel):
id: int
name: str
hook_point: HookPoint
# Nullable to match the DB column — endpoint_url is required on creation but
# future hook point types may not use an external endpoint (e.g. built-in handlers).
endpoint_url: str | None
fail_strategy: HookFailStrategy
timeout_seconds: float # always resolved — None from request is replaced with spec default before DB write
is_active: bool
creator_email: str | None
created_at: datetime
updated_at: datetime
class HookValidateResponse(BaseModel):
success: bool
error_message: str | None = None
# ---------------------------------------------------------------------------
# Health models
# ---------------------------------------------------------------------------
class HookHealthStatus(str, Enum):
healthy = "healthy" # green — reachable, no failures in last 1h
degraded = "degraded" # yellow — reachable, failures in last 1h
unreachable = "unreachable" # red — is_reachable=false or null
class HookFailureRecord(BaseModel):
error_message: str | None = None
status_code: int | None = None
duration_ms: int | None = None
created_at: datetime
class HookHealthResponse(BaseModel):
status: HookHealthStatus
recent_failures: list[HookFailureRecord] = Field(
default_factory=list,
description="Last 10 failures, newest first",
max_length=10,
)

View File

@@ -1,59 +0,0 @@
from abc import ABC
from abc import abstractmethod
from typing import Any
from onyx.db.enums import HookFailStrategy
from onyx.db.enums import HookPoint
_REQUIRED_ATTRS = (
"hook_point",
"display_name",
"description",
"default_timeout_seconds",
"fail_hard_description",
"default_fail_strategy",
)
class HookPointSpec(ABC):
"""Static metadata and contract for a pipeline hook point.
This is NOT a regular class meant for direct instantiation by callers.
Each concrete subclass represents exactly one hook point and is instantiated
once at startup, registered in onyx.hooks.registry._REGISTRY. No caller
should ever create instances directly — use get_hook_point_spec() or
get_all_specs() from the registry instead.
Each hook point is a concrete subclass of this class. Onyx engineers
own these definitions — customers never touch this code.
Subclasses must define all attributes as class-level constants.
"""
hook_point: HookPoint
display_name: str
description: str
default_timeout_seconds: float
fail_hard_description: str
default_fail_strategy: HookFailStrategy
docs_url: str | None = None
def __init_subclass__(cls, **kwargs: object) -> None:
super().__init_subclass__(**kwargs)
# Skip intermediate abstract subclasses — they may still be partially defined.
if getattr(cls, "__abstractmethods__", None):
return
missing = [attr for attr in _REQUIRED_ATTRS if not hasattr(cls, attr)]
if missing:
raise TypeError(f"{cls.__name__} must define class attributes: {missing}")
@property
@abstractmethod
def input_schema(self) -> dict[str, Any]:
"""JSON schema describing the request payload sent to the customer's endpoint."""
@property
@abstractmethod
def output_schema(self) -> dict[str, Any]:
"""JSON schema describing the expected response from the customer's endpoint."""

View File

@@ -1,29 +0,0 @@
from typing import Any
from onyx.db.enums import HookFailStrategy
from onyx.db.enums import HookPoint
from onyx.hooks.points.base import HookPointSpec
class DocumentIngestionSpec(HookPointSpec):
"""Hook point that runs during document ingestion.
# TODO(@Bo-Onyx): define call site, input/output schema, and timeout budget.
"""
hook_point = HookPoint.DOCUMENT_INGESTION
display_name = "Document Ingestion"
description = "Runs during document ingestion. Allows filtering or transforming documents before indexing."
default_timeout_seconds = 30.0
fail_hard_description = "The document will not be indexed."
default_fail_strategy = HookFailStrategy.HARD
@property
def input_schema(self) -> dict[str, Any]:
# TODO(@Bo-Onyx): define input schema
return {"type": "object", "properties": {}}
@property
def output_schema(self) -> dict[str, Any]:
# TODO(@Bo-Onyx): define output schema
return {"type": "object", "properties": {}}

View File

@@ -1,83 +0,0 @@
from typing import Any
from onyx.db.enums import HookFailStrategy
from onyx.db.enums import HookPoint
from onyx.hooks.points.base import HookPointSpec
class QueryProcessingSpec(HookPointSpec):
"""Hook point that runs on every user query before it enters the pipeline.
Call site: inside handle_stream_message_objects() in
backend/onyx/chat/process_message.py, immediately after message_text is
assigned from the request and before create_new_chat_message() saves it.
This is the earliest possible point in the query pipeline:
- Raw query — unmodified, exactly as the user typed it
- No side effects yet — message has not been saved to DB
- User identity is available for user-specific logic
Supported use cases:
- Query rejection: block queries based on content or user context
- Query rewriting: normalize, expand, or modify the query
- PII removal: scrub sensitive data before the LLM sees it
- Access control: reject queries from certain users or groups
- Query auditing: log or track queries based on business rules
"""
hook_point = HookPoint.QUERY_PROCESSING
display_name = "Query Processing"
description = (
"Runs on every user query before it enters the pipeline. "
"Allows rewriting, filtering, or rejecting queries."
)
default_timeout_seconds = 5.0 # user is actively waiting — keep tight
fail_hard_description = (
"The query will be blocked and the user will see an error message."
)
default_fail_strategy = HookFailStrategy.HARD
@property
def input_schema(self) -> dict[str, Any]:
return {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The raw query string exactly as the user typed it.",
},
"user_email": {
"type": ["string", "null"],
"description": "Email of the user submitting the query, or null if unauthenticated.",
},
"chat_session_id": {
"type": "string",
"description": "UUID of the chat session. Always present — the session is guaranteed to exist by the time this hook fires.",
},
},
"required": ["query", "user_email", "chat_session_id"],
"additionalProperties": False,
}
@property
def output_schema(self) -> dict[str, Any]:
return {
"type": "object",
"properties": {
"query": {
"type": ["string", "null"],
"description": (
"The (optionally modified) query to use. "
"Set to null to reject the query."
),
},
"rejection_message": {
"type": ["string", "null"],
"description": (
"Message shown to the user when query is null. "
"Falls back to a generic message if not provided."
),
},
},
"required": ["query"],
}

View File

@@ -1,45 +0,0 @@
from onyx.db.enums import HookPoint
from onyx.hooks.points.base import HookPointSpec
from onyx.hooks.points.document_ingestion import DocumentIngestionSpec
from onyx.hooks.points.query_processing import QueryProcessingSpec
# Internal: use `monkeypatch.setattr(registry_module, "_REGISTRY", {...})` to override in tests.
_REGISTRY: dict[HookPoint, HookPointSpec] = {
HookPoint.DOCUMENT_INGESTION: DocumentIngestionSpec(),
HookPoint.QUERY_PROCESSING: QueryProcessingSpec(),
}
def validate_registry() -> None:
"""Assert that every HookPoint enum value has a registered spec.
Call once at application startup (e.g. from the FastAPI lifespan hook).
Raises RuntimeError if any hook point is missing a spec.
"""
missing = set(HookPoint) - set(_REGISTRY)
if missing:
raise RuntimeError(
f"Hook point(s) have no registered spec: {missing}. "
"Add an entry to onyx.hooks.registry._REGISTRY."
)
def get_hook_point_spec(hook_point: HookPoint) -> HookPointSpec:
"""Returns the spec for a given hook point.
Raises ValueError if the hook point has no registered spec — this is a
programmer error; every HookPoint enum value must have a corresponding spec
in _REGISTRY.
"""
try:
return _REGISTRY[hook_point]
except KeyError:
raise ValueError(
f"No spec registered for hook point {hook_point!r}. "
"Add an entry to onyx.hooks.registry._REGISTRY."
)
def get_all_specs() -> list[HookPointSpec]:
"""Returns the specs for all registered hook points."""
return list(_REGISTRY.values())

View File

@@ -395,12 +395,6 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
llm = get_default_llm_with_vision()
if not llm:
if get_image_extraction_and_analysis_enabled():
logger.warning(
"Image analysis is enabled but no vision-capable LLM is "
"available — images will not be summarized. Configure a "
"vision model in the admin LLM settings."
)
# Even without LLM, we still convert to IndexingDocument with base Sections
return [
IndexingDocument(

View File

@@ -168,23 +168,10 @@ def get_default_llm_with_vision(
if model_supports_image_input(
default_model.name, default_model.llm_provider.provider
):
logger.info(
"Using default vision model: %s (provider=%s)",
default_model.name,
default_model.llm_provider.provider,
)
return create_vision_llm(
LLMProviderView.from_model(default_model.llm_provider),
default_model.name,
)
else:
logger.warning(
"Default vision model %s (provider=%s) does not support "
"image input — falling back to searching all providers",
default_model.name,
default_model.llm_provider.provider,
)
# Fall back to searching all providers
models = fetch_existing_models(
db_session=db_session,
@@ -192,10 +179,6 @@ def get_default_llm_with_vision(
)
if not models:
logger.warning(
"No LLM models with VISION or CHAT flow type found — "
"image summarization will be disabled"
)
return None
for model in models:
@@ -217,25 +200,11 @@ def get_default_llm_with_vision(
for model in sorted_models:
if model_supports_image_input(model.name, model.llm_provider.provider):
logger.info(
"Using fallback vision model: %s (provider=%s)",
model.name,
model.llm_provider.provider,
)
return create_vision_llm(
provider_map[model.llm_provider_id],
model.name,
)
checked_models = [
f"{m.name} (provider={m.llm_provider.provider})" for m in sorted_models
]
logger.warning(
"No vision-capable model found among %d candidates: %s"
"image summarization will be disabled",
len(sorted_models),
", ".join(checked_models),
)
return None

View File

@@ -530,11 +530,6 @@ class LitellmLLM(LLM):
):
messages = _strip_tool_content_from_messages(messages)
# Only pass tool_choice when tools are present — some providers (e.g. Fireworks)
# reject requests where tool_choice is explicitly null.
if tools and tool_choice is not None:
optional_kwargs["tool_choice"] = tool_choice
response = litellm.completion(
mock_response=get_llm_mock_response() or MOCK_LLM_RESPONSE,
model=model,
@@ -543,6 +538,7 @@ class LitellmLLM(LLM):
custom_llm_provider=self._custom_llm_provider or None,
messages=messages,
tools=tools,
tool_choice=tool_choice,
stream=stream,
temperature=temperature,
timeout=timeout_override or self._timeout,

View File

@@ -62,7 +62,6 @@ from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.engine.sql_engine import SqlEngine
from onyx.error_handling.exceptions import register_onyx_exception_handlers
from onyx.file_store.file_store import get_default_file_store
from onyx.hooks.registry import validate_registry
from onyx.server.api_key.api import router as api_key_router
from onyx.server.auth_check import check_router_auth
from onyx.server.documents.cc_pair import router as cc_pair_router
@@ -309,7 +308,6 @@ def validate_no_vector_db_settings() -> None:
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: # noqa: ARG001
validate_no_vector_db_settings()
validate_cache_backend_settings()
validate_registry()
# Set recursion limit
if SYSTEM_RECURSION_LIMIT is not None:

View File

@@ -479,9 +479,7 @@ def is_zip_file(file: UploadFile) -> bool:
def upload_files(
files: list[UploadFile],
file_origin: FileOrigin = FileOrigin.CONNECTOR,
unzip: bool = True,
files: list[UploadFile], file_origin: FileOrigin = FileOrigin.CONNECTOR
) -> FileUploadResponse:
# Skip directories and known macOS metadata entries
@@ -504,46 +502,31 @@ def upload_files(
if seen_zip:
raise HTTPException(status_code=400, detail=SEEN_ZIP_DETAIL)
seen_zip = True
# Validate the zip by opening it (catches corrupt/non-zip files)
with zipfile.ZipFile(file.file, "r") as zf:
if unzip:
zip_metadata_file_id = save_zip_metadata_to_file_store(
zf, file_store
zip_metadata_file_id = save_zip_metadata_to_file_store(
zf, file_store
)
for file_info in zf.namelist():
if zf.getinfo(file_info).is_dir():
continue
if not should_process_file(file_info):
continue
sub_file_bytes = zf.read(file_info)
mime_type, __ = mimetypes.guess_type(file_info)
if mime_type is None:
mime_type = "application/octet-stream"
file_id = file_store.save_file(
content=BytesIO(sub_file_bytes),
display_name=os.path.basename(file_info),
file_origin=file_origin,
file_type=mime_type,
)
for file_info in zf.namelist():
if zf.getinfo(file_info).is_dir():
continue
if not should_process_file(file_info):
continue
sub_file_bytes = zf.read(file_info)
mime_type, __ = mimetypes.guess_type(file_info)
if mime_type is None:
mime_type = "application/octet-stream"
file_id = file_store.save_file(
content=BytesIO(sub_file_bytes),
display_name=os.path.basename(file_info),
file_origin=file_origin,
file_type=mime_type,
)
deduped_file_paths.append(file_id)
deduped_file_names.append(os.path.basename(file_info))
continue
# Store the zip as-is (unzip=False)
file.file.seek(0)
file_id = file_store.save_file(
content=file.file,
display_name=file.filename,
file_origin=file_origin,
file_type=file.content_type or "application/zip",
)
deduped_file_paths.append(file_id)
deduped_file_names.append(file.filename)
deduped_file_paths.append(file_id)
deduped_file_names.append(os.path.basename(file_info))
continue
# Since we can't render docx files in the UI,
@@ -630,10 +613,9 @@ def _fetch_and_check_file_connector_cc_pair_permissions(
@router.post("/admin/connector/file/upload", tags=PUBLIC_API_TAGS)
def upload_files_api(
files: list[UploadFile],
unzip: bool = True,
_: User = Depends(current_curator_or_admin_user),
) -> FileUploadResponse:
return upload_files(files, FileOrigin.OTHER, unzip=unzip)
return upload_files(files, FileOrigin.OTHER)
@router.get("/admin/connector/{connector_id}/files", tags=PUBLIC_API_TAGS)

View File

@@ -1,4 +1,3 @@
import hashlib
import mimetypes
from io import BytesIO
from typing import Any
@@ -84,14 +83,6 @@ class PythonTool(Tool[PythonToolOverrideKwargs]):
def __init__(self, tool_id: int, emitter: Emitter) -> None:
super().__init__(emitter=emitter)
self._id = tool_id
# Cache of (filename, content_hash) -> ci_file_id to avoid re-uploading
# the same file on every tool call iteration within the same agent session.
# Filename is included in the key so two files with identical bytes but
# different names each get their own upload slot.
# TTL assumption: code-interpreter file TTLs (typically hours) greatly
# exceed the lifetime of a single agent session (at most MAX_LLM_CYCLES
# iterations, typically a few minutes), so stale-ID eviction is not needed.
self._uploaded_file_cache: dict[tuple[str, str], str] = {}
@property
def id(self) -> int:
@@ -191,13 +182,8 @@ class PythonTool(Tool[PythonToolOverrideKwargs]):
for ind, chat_file in enumerate(chat_files):
file_name = chat_file.filename or f"file_{ind}"
try:
content_hash = hashlib.sha256(chat_file.content).hexdigest()
cache_key = (file_name, content_hash)
ci_file_id = self._uploaded_file_cache.get(cache_key)
if ci_file_id is None:
# Upload to Code Interpreter
ci_file_id = client.upload_file(chat_file.content, file_name)
self._uploaded_file_cache[cache_key] = ci_file_id
# Upload to Code Interpreter
ci_file_id = client.upload_file(chat_file.content, file_name)
# Stage for execution
files_to_stage.append({"path": file_name, "file_id": ci_file_id})
@@ -313,10 +299,14 @@ class PythonTool(Tool[PythonToolOverrideKwargs]):
f"Failed to delete Code Interpreter generated file {ci_file_id}: {e}"
)
# Note: staged input files are intentionally not deleted here because
# _uploaded_file_cache reuses their file_ids across iterations. They are
# orphaned when the session ends, but the code interpreter cleans up
# stale files on its own TTL.
# Cleanup staged input files
for file_mapping in files_to_stage:
try:
client.delete_file(file_mapping["file_id"])
except Exception as e:
logger.error(
f"Failed to delete Code Interpreter staged file {file_mapping['file_id']}: {e}"
)
# Emit file_ids once files are processed
if generated_file_ids:

View File

@@ -74,7 +74,7 @@ def make_structured_onyx_request_id(prefix: str, request_url: str) -> str:
def _make_onyx_request_id(prefix: str, hash_input: str) -> str:
"""helper function to return an id given a string input"""
hash_obj = hashlib.md5(hash_input.encode("utf-8"), usedforsecurity=False)
hash_obj = hashlib.md5(hash_input.encode("utf-8"))
hash_bytes = hash_obj.digest()[:6] # Truncate to 6 bytes
# 6 bytes becomes 8 bytes. we shouldn't need to strip but just in case

View File

@@ -752,7 +752,7 @@ pypandoc-binary==1.16.2
# via onyx
pyparsing==3.2.5
# via httplib2
pypdf==6.9.1
pypdf==6.8.0
# via
# onyx
# unstructured-client

View File

@@ -1,274 +0,0 @@
"""
External dependency unit tests for user file delete queue protections.
Verifies that the three mechanisms added to check_for_user_file_delete work
correctly:
1. Queue depth backpressure when the broker queue exceeds
USER_FILE_DELETE_MAX_QUEUE_DEPTH, no new tasks are enqueued.
2. Per-file Redis guard key if the guard key for a file already exists in
Redis, that file is skipped even though it is still in DELETING status.
3. Task expiry every send_task call carries expires=
CELERY_USER_FILE_DELETE_TASK_EXPIRES so that stale queued tasks are
discarded by workers automatically.
Also verifies that delete_user_file_impl clears the guard key the moment
it is picked up by a worker.
Uses real Redis (DB 0 via get_redis_client) and real PostgreSQL for UserFile
rows. The Celery app is provided as a MagicMock injected via a PropertyMock
on the task class so no real broker is needed.
"""
from collections.abc import Generator
from contextlib import contextmanager
from typing import Any
from unittest.mock import MagicMock
from unittest.mock import patch
from unittest.mock import PropertyMock
from uuid import uuid4
from sqlalchemy.orm import Session
from onyx.background.celery.tasks.user_file_processing.tasks import (
_user_file_delete_lock_key,
)
from onyx.background.celery.tasks.user_file_processing.tasks import (
_user_file_delete_queued_key,
)
from onyx.background.celery.tasks.user_file_processing.tasks import (
check_for_user_file_delete,
)
from onyx.background.celery.tasks.user_file_processing.tasks import (
process_single_user_file_delete,
)
from onyx.configs.constants import CELERY_USER_FILE_DELETE_TASK_EXPIRES
from onyx.configs.constants import OnyxCeleryQueues
from onyx.configs.constants import OnyxCeleryTask
from onyx.configs.constants import USER_FILE_DELETE_MAX_QUEUE_DEPTH
from onyx.db.enums import UserFileStatus
from onyx.db.models import UserFile
from onyx.redis.redis_pool import get_redis_client
from tests.external_dependency_unit.conftest import create_test_user
from tests.external_dependency_unit.constants import TEST_TENANT_ID
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
_PATCH_QUEUE_LEN = (
"onyx.background.celery.tasks.user_file_processing.tasks.celery_get_queue_length"
)
def _create_deleting_user_file(db_session: Session, user_id: object) -> UserFile:
"""Insert a UserFile in DELETING status and return it."""
uf = UserFile(
id=uuid4(),
user_id=user_id,
file_id=f"test_file_{uuid4().hex[:8]}",
name=f"test_{uuid4().hex[:8]}.txt",
file_type="text/plain",
status=UserFileStatus.DELETING,
)
db_session.add(uf)
db_session.commit()
db_session.refresh(uf)
return uf
@contextmanager
def _patch_task_app(task: Any, mock_app: MagicMock) -> Generator[None, None, None]:
"""Patch the ``app`` property on *task*'s class so that ``self.app``
inside the task function returns *mock_app*.
With ``bind=True``, ``task.run`` is a bound method whose ``__self__`` is
the actual task instance. We patch ``app`` on that instance's class
(a unique Celery-generated Task subclass) so the mock is scoped to this
task only.
"""
task_instance = task.run.__self__
with patch.object(
type(task_instance), "app", new_callable=PropertyMock, return_value=mock_app
):
yield
# ---------------------------------------------------------------------------
# Test classes
# ---------------------------------------------------------------------------
class TestDeleteQueueDepthBackpressure:
"""Protection 1: skip all enqueuing when the broker queue is too deep."""
def test_no_tasks_enqueued_when_queue_over_limit(
self,
db_session: Session,
tenant_context: None, # noqa: ARG002
) -> None:
"""When the queue depth exceeds the limit the beat cycle is skipped."""
user = create_test_user(db_session, "del_bp_user")
_create_deleting_user_file(db_session, user.id)
mock_app = MagicMock()
with (
_patch_task_app(check_for_user_file_delete, mock_app),
patch(_PATCH_QUEUE_LEN, return_value=USER_FILE_DELETE_MAX_QUEUE_DEPTH + 1),
):
check_for_user_file_delete.run(tenant_id=TEST_TENANT_ID)
mock_app.send_task.assert_not_called()
class TestDeletePerFileGuardKey:
"""Protection 2: per-file Redis guard key prevents duplicate enqueue."""
def test_guarded_file_not_re_enqueued(
self,
db_session: Session,
tenant_context: None, # noqa: ARG002
) -> None:
"""A file whose guard key is already set in Redis is skipped."""
user = create_test_user(db_session, "del_guard_user")
uf = _create_deleting_user_file(db_session, user.id)
redis_client = get_redis_client(tenant_id=TEST_TENANT_ID)
guard_key = _user_file_delete_queued_key(uf.id)
redis_client.setex(guard_key, CELERY_USER_FILE_DELETE_TASK_EXPIRES, 1)
mock_app = MagicMock()
try:
with (
_patch_task_app(check_for_user_file_delete, mock_app),
patch(_PATCH_QUEUE_LEN, return_value=0),
):
check_for_user_file_delete.run(tenant_id=TEST_TENANT_ID)
# send_task must not have been called with this specific file's ID
for call in mock_app.send_task.call_args_list:
kwargs = call.kwargs.get("kwargs", {})
assert kwargs.get("user_file_id") != str(
uf.id
), f"File {uf.id} should have been skipped because its guard key exists"
finally:
redis_client.delete(guard_key)
def test_guard_key_exists_in_redis_after_enqueue(
self,
db_session: Session,
tenant_context: None, # noqa: ARG002
) -> None:
"""After a file is enqueued its guard key is present in Redis with a TTL."""
user = create_test_user(db_session, "del_guard_set_user")
uf = _create_deleting_user_file(db_session, user.id)
redis_client = get_redis_client(tenant_id=TEST_TENANT_ID)
guard_key = _user_file_delete_queued_key(uf.id)
redis_client.delete(guard_key) # clean slate
mock_app = MagicMock()
try:
with (
_patch_task_app(check_for_user_file_delete, mock_app),
patch(_PATCH_QUEUE_LEN, return_value=0),
):
check_for_user_file_delete.run(tenant_id=TEST_TENANT_ID)
assert redis_client.exists(
guard_key
), "Guard key should be set in Redis after enqueue"
ttl = int(redis_client.ttl(guard_key)) # type: ignore[arg-type]
assert (
0 < ttl <= CELERY_USER_FILE_DELETE_TASK_EXPIRES
), f"Guard key TTL {ttl}s is outside the expected range (0, {CELERY_USER_FILE_DELETE_TASK_EXPIRES}]"
finally:
redis_client.delete(guard_key)
class TestDeleteTaskExpiry:
"""Protection 3: every send_task call includes an expires value."""
def test_send_task_called_with_expires(
self,
db_session: Session,
tenant_context: None, # noqa: ARG002
) -> None:
"""send_task is called with the correct queue, task name, and expires."""
user = create_test_user(db_session, "del_expires_user")
uf = _create_deleting_user_file(db_session, user.id)
redis_client = get_redis_client(tenant_id=TEST_TENANT_ID)
guard_key = _user_file_delete_queued_key(uf.id)
redis_client.delete(guard_key)
mock_app = MagicMock()
try:
with (
_patch_task_app(check_for_user_file_delete, mock_app),
patch(_PATCH_QUEUE_LEN, return_value=0),
):
check_for_user_file_delete.run(tenant_id=TEST_TENANT_ID)
# At least one task should have been submitted (for our file)
assert (
mock_app.send_task.call_count >= 1
), "Expected at least one task to be submitted"
# Every submitted task must carry expires
for call in mock_app.send_task.call_args_list:
assert call.args[0] == OnyxCeleryTask.DELETE_SINGLE_USER_FILE
assert call.kwargs.get("queue") == OnyxCeleryQueues.USER_FILE_DELETE
assert (
call.kwargs.get("expires") == CELERY_USER_FILE_DELETE_TASK_EXPIRES
), "Task must be submitted with the correct expires value to prevent stale task accumulation"
finally:
redis_client.delete(guard_key)
class TestDeleteWorkerClearsGuardKey:
"""process_single_user_file_delete removes the guard key when it picks up a task."""
def test_guard_key_deleted_on_pickup(
self,
tenant_context: None, # noqa: ARG002
) -> None:
"""The guard key is deleted before the worker does any real work.
We simulate an already-locked file so delete_user_file_impl returns
early but crucially, after the guard key deletion.
"""
user_file_id = str(uuid4())
redis_client = get_redis_client(tenant_id=TEST_TENANT_ID)
guard_key = _user_file_delete_queued_key(user_file_id)
# Simulate the guard key set when the beat enqueued the task
redis_client.setex(guard_key, CELERY_USER_FILE_DELETE_TASK_EXPIRES, 1)
assert redis_client.exists(guard_key), "Guard key must exist before pickup"
# Hold the per-file delete lock so the worker exits early without
# touching the database or file store.
lock_key = _user_file_delete_lock_key(user_file_id)
delete_lock = redis_client.lock(lock_key, timeout=10)
acquired = delete_lock.acquire(blocking=False)
assert acquired, "Should be able to acquire the delete lock for this test"
try:
process_single_user_file_delete.run(
user_file_id=user_file_id,
tenant_id=TEST_TENANT_ID,
)
finally:
if delete_lock.owned():
delete_lock.release()
assert not redis_client.exists(
guard_key
), "Guard key should be deleted when the worker picks up the task"

View File

@@ -297,10 +297,6 @@ def index_batch_params(
class TestDocumentIndexOld:
"""Tests the old DocumentIndex interface."""
# TODO(ENG-3864)(andrei): Re-enable this test.
@pytest.mark.xfail(
reason="Flaky test: Retrieved chunks vary non-deterministically before and after changing user projects and personas. Likely a timing issue with the index being updated."
)
def test_update_single_can_clear_user_projects_and_personas(
self,
document_indices: list[DocumentIndex],

View File

@@ -29,7 +29,6 @@ from onyx.document_index.opensearch.opensearch_document_index import (
)
from onyx.document_index.opensearch.schema import CONTENT_FIELD_NAME
from onyx.document_index.opensearch.schema import DocumentChunk
from onyx.document_index.opensearch.schema import DocumentChunkWithoutVectors
from onyx.document_index.opensearch.schema import DocumentSchema
from onyx.document_index.opensearch.schema import get_opensearch_doc_chunk_id
from onyx.document_index.opensearch.search import DocumentQuery
@@ -97,23 +96,6 @@ def _patch_hybrid_search_normalization_pipeline(
)
def _patch_opensearch_match_highlights_disabled(
monkeypatch: pytest.MonkeyPatch, disabled: bool
) -> None:
"""
Patches OPENSEARCH_MATCH_HIGHLIGHTS_DISABLED wherever necessary for this
test file.
"""
monkeypatch.setattr(
"onyx.configs.app_configs.OPENSEARCH_MATCH_HIGHLIGHTS_DISABLED",
disabled,
)
monkeypatch.setattr(
"onyx.document_index.opensearch.search.OPENSEARCH_MATCH_HIGHLIGHTS_DISABLED",
disabled,
)
def _create_test_document_chunk(
document_id: str,
content: str,
@@ -244,7 +226,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=True
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
# Under test.
# Should not raise.
@@ -260,7 +242,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=True
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Under test.
@@ -289,7 +271,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=True
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
@@ -303,7 +285,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=True
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
# Under test and postcondition.
# Should return False before creation.
@@ -323,7 +305,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=True
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Under test.
@@ -358,7 +340,7 @@ class TestOpenSearchClient:
},
},
}
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=initial_mappings, settings=settings)
# Under test.
@@ -401,7 +383,7 @@ class TestOpenSearchClient:
"test_field": {"type": "keyword"},
},
}
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=initial_mappings, settings=settings)
# Under test and postcondition.
@@ -436,7 +418,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=True
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
# Create once - should succeed.
test_client.create_index(mappings=mappings, settings=settings)
@@ -479,7 +461,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
doc = _create_test_document_chunk(
@@ -507,7 +489,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
docs = [
@@ -538,7 +520,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
doc = _create_test_document_chunk(
@@ -566,7 +548,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
original_doc = _create_test_document_chunk(
@@ -601,7 +583,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=False
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Under test and postcondition.
@@ -620,7 +602,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
doc = _create_test_document_chunk(
@@ -656,7 +638,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Under test.
@@ -677,7 +659,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Index multiple documents.
@@ -753,7 +735,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Create a document to update.
@@ -802,7 +784,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Under test and postcondition.
@@ -822,12 +804,11 @@ class TestOpenSearchClient:
"""Tests all hybrid search configurations and pipelines."""
# Precondition.
_patch_global_tenant_state(monkeypatch, False)
_patch_opensearch_match_highlights_disabled(monkeypatch, False)
tenant_state = TenantState(tenant_id=POSTGRES_DEFAULT_SCHEMA, multitenant=False)
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Index documents.
docs = {
@@ -900,12 +881,8 @@ class TestOpenSearchClient:
)
# Make sure the chunk contents are preserved.
for i, chunk in enumerate(results):
expected = docs[chunk.document_chunk.document_id]
assert chunk.document_chunk == DocumentChunkWithoutVectors(
**{
k: getattr(expected, k)
for k in DocumentChunkWithoutVectors.model_fields
}
assert (
chunk.document_chunk == docs[chunk.document_chunk.document_id]
)
# Make sure score reporting seems reasonable (it should not be None
# or 0).
@@ -929,7 +906,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Note no documents were indexed.
@@ -965,13 +942,12 @@ class TestOpenSearchClient:
"""
# Precondition.
_patch_global_tenant_state(monkeypatch, True)
_patch_opensearch_match_highlights_disabled(monkeypatch, False)
tenant_x = TenantState(tenant_id="tenant-x", multitenant=True)
tenant_y = TenantState(tenant_id="tenant-y", multitenant=True)
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_x.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Index documents with different public/hidden and tenant states.
@@ -1062,12 +1038,7 @@ class TestOpenSearchClient:
# ordered; we're just assuming which doc will be the first result here.
assert results[0].document_chunk.document_id == "public-doc"
# Make sure the chunk contents are preserved.
assert results[0].document_chunk == DocumentChunkWithoutVectors(
**{
k: getattr(docs["public-doc"], k)
for k in DocumentChunkWithoutVectors.model_fields
}
)
assert results[0].document_chunk == docs["public-doc"]
# Make sure score reporting seems reasonable (it should not be None
# or 0).
assert results[0].score
@@ -1075,12 +1046,7 @@ class TestOpenSearchClient:
assert results[0].match_highlights.get(CONTENT_FIELD_NAME, [])
# Same for the second result.
assert results[1].document_chunk.document_id == "private-doc-user-a"
assert results[1].document_chunk == DocumentChunkWithoutVectors(
**{
k: getattr(docs["private-doc-user-a"], k)
for k in DocumentChunkWithoutVectors.model_fields
}
)
assert results[1].document_chunk == docs["private-doc-user-a"]
assert results[1].score
assert results[1].match_highlights.get(CONTENT_FIELD_NAME, [])
@@ -1096,12 +1062,11 @@ class TestOpenSearchClient:
"""
# Precondition.
_patch_global_tenant_state(monkeypatch, True)
_patch_opensearch_match_highlights_disabled(monkeypatch, False)
tenant_x = TenantState(tenant_id="tenant-x", multitenant=True)
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_x.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Index documents with varying relevance to the query.
@@ -1228,7 +1193,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_x.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Although very unlikely in practice, let's use the same doc ID just to
@@ -1321,7 +1286,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Don't index any documents.
@@ -1348,7 +1313,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Index chunks for two different documents.
@@ -1416,7 +1381,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Index documents with different public/hidden and tenant states.
@@ -1493,7 +1458,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Index docs with various ages.
@@ -1585,7 +1550,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings_based_on_environment()
settings = DocumentSchema.get_index_settings()
test_client.create_index(mappings=mappings, settings=settings)
# Index chunks for two different documents, one hidden one not.
@@ -1634,9 +1599,4 @@ class TestOpenSearchClient:
for result in results:
# Note each result must be from doc 1, which is not hidden.
expected_result = doc1_chunks[result.document_chunk.chunk_index]
assert result.document_chunk == DocumentChunkWithoutVectors(
**{
k: getattr(expected_result, k)
for k in DocumentChunkWithoutVectors.model_fields
}
)
assert result.document_chunk == expected_result

View File

@@ -31,6 +31,7 @@ from onyx.background.celery.tasks.opensearch_migration.transformer import (
)
from onyx.configs.constants import PUBLIC_DOC_PAT
from onyx.configs.constants import SOURCE_TYPE
from onyx.context.search.models import IndexFilters
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.models import Document
from onyx.db.models import OpenSearchDocumentMigrationRecord
@@ -43,7 +44,6 @@ from onyx.document_index.opensearch.client import OpenSearchIndexClient
from onyx.document_index.opensearch.client import wait_for_opensearch_with_timeout
from onyx.document_index.opensearch.constants import DEFAULT_MAX_CHUNK_SIZE
from onyx.document_index.opensearch.schema import DocumentChunk
from onyx.document_index.opensearch.schema import get_opensearch_doc_chunk_id
from onyx.document_index.opensearch.search import DocumentQuery
from onyx.document_index.vespa.shared_utils.utils import wait_for_vespa_with_timeout
from onyx.document_index.vespa.vespa_document_index import VespaDocumentIndex
@@ -70,7 +70,6 @@ from onyx.document_index.vespa_constants import SOURCE_LINKS
from onyx.document_index.vespa_constants import TITLE
from onyx.document_index.vespa_constants import TITLE_EMBEDDING
from onyx.document_index.vespa_constants import USER_PROJECT
from shared_configs.configs import MULTI_TENANT
from shared_configs.contextvars import get_current_tenant_id
from tests.external_dependency_unit.full_setup import ensure_full_deployment_setup
@@ -79,22 +78,24 @@ CHUNK_COUNT = 5
def _get_document_chunks_from_opensearch(
opensearch_client: OpenSearchIndexClient,
document_id: str,
tenant_state: TenantState,
opensearch_client: OpenSearchIndexClient, document_id: str, current_tenant_id: str
) -> list[DocumentChunk]:
opensearch_client.refresh_index()
results: list[DocumentChunk] = []
for i in range(CHUNK_COUNT):
document_chunk_id: str = get_opensearch_doc_chunk_id(
tenant_state=tenant_state,
document_id=document_id,
chunk_index=i,
max_chunk_size=DEFAULT_MAX_CHUNK_SIZE,
)
result = opensearch_client.get_document(document_chunk_id)
results.append(result)
return results
filters = IndexFilters(access_control_list=None, tenant_id=current_tenant_id)
query_body = DocumentQuery.get_from_document_id_query(
document_id=document_id,
tenant_state=TenantState(tenant_id=current_tenant_id, multitenant=False),
index_filters=filters,
include_hidden=False,
max_chunk_size=DEFAULT_MAX_CHUNK_SIZE,
min_chunk_index=None,
max_chunk_index=None,
)
search_hits = opensearch_client.search(
body=query_body,
search_pipeline_id=None,
)
return [search_hit.document_chunk for search_hit in search_hits]
def _delete_document_chunks_from_opensearch(
@@ -451,13 +452,10 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
for chunks in document_chunks.values():
all_chunks.extend(chunks)
vespa_document_index.index_raw_chunks(all_chunks)
tenant_state = TenantState(
tenant_id=get_current_tenant_id(), multitenant=MULTI_TENANT
)
# Under test.
result = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=tenant_state.tenant_id
tenant_id=get_current_tenant_id()
)
# Postcondition.
@@ -479,7 +477,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Verify chunks were indexed in OpenSearch.
for document in test_documents:
opensearch_chunks = _get_document_chunks_from_opensearch(
opensearch_client, document.id, tenant_state
opensearch_client, document.id, get_current_tenant_id()
)
assert len(opensearch_chunks) == CHUNK_COUNT
opensearch_chunks.sort(key=lambda x: x.chunk_index)
@@ -524,9 +522,6 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
for chunks in document_chunks.values():
all_chunks.extend(chunks)
vespa_document_index.index_raw_chunks(all_chunks)
tenant_state = TenantState(
tenant_id=get_current_tenant_id(), multitenant=MULTI_TENANT
)
# Run the initial batch. To simulate partial progress we will mock the
# redis lock to return True for the first invocation of .owned() and
@@ -541,7 +536,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
return_value=mock_redis_client,
):
result_1 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=tenant_state.tenant_id
tenant_id=get_current_tenant_id()
)
assert result_1 is True
@@ -564,7 +559,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Under test.
# Run the remainder of the migration.
result_2 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=tenant_state.tenant_id
tenant_id=get_current_tenant_id()
)
# Postcondition.
@@ -588,7 +583,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Verify chunks were indexed in OpenSearch.
for document in test_documents:
opensearch_chunks = _get_document_chunks_from_opensearch(
opensearch_client, document.id, tenant_state
opensearch_client, document.id, get_current_tenant_id()
)
assert len(opensearch_chunks) == CHUNK_COUNT
opensearch_chunks.sort(key=lambda x: x.chunk_index)
@@ -635,9 +630,6 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
for chunks in document_chunks.values():
all_chunks.extend(chunks)
vespa_document_index.index_raw_chunks(all_chunks)
tenant_state = TenantState(
tenant_id=get_current_tenant_id(), multitenant=MULTI_TENANT
)
# Run the initial batch. To simulate partial progress we will mock the
# redis lock to return True for the first invocation of .owned() and
@@ -654,7 +646,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
return_value=mock_redis_client,
):
result_1 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=tenant_state.tenant_id
tenant_id=get_current_tenant_id()
)
assert result_1 is True
@@ -699,7 +691,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
),
):
result_2 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=tenant_state.tenant_id
tenant_id=get_current_tenant_id()
)
# Postcondition.
@@ -736,7 +728,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
),
):
result_3 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=tenant_state.tenant_id
tenant_id=get_current_tenant_id()
)
# Postcondition.
@@ -760,7 +752,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Verify chunks were indexed in OpenSearch.
for document in test_documents:
opensearch_chunks = _get_document_chunks_from_opensearch(
opensearch_client, document.id, tenant_state
opensearch_client, document.id, get_current_tenant_id()
)
assert len(opensearch_chunks) == CHUNK_COUNT
opensearch_chunks.sort(key=lambda x: x.chunk_index)
@@ -848,25 +840,24 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
chunk["content"] = (
f"Different content {chunk[CHUNK_ID]} for {test_documents[0].id}"
)
tenant_state = TenantState(
tenant_id=get_current_tenant_id(), multitenant=MULTI_TENANT
)
chunks_for_document_in_opensearch, _ = (
transform_vespa_chunks_to_opensearch_chunks(
document_in_opensearch,
tenant_state,
TenantState(tenant_id=get_current_tenant_id(), multitenant=False),
{},
)
)
opensearch_client.bulk_index_documents(
documents=chunks_for_document_in_opensearch,
tenant_state=tenant_state,
tenant_state=TenantState(
tenant_id=get_current_tenant_id(), multitenant=False
),
update_if_exists=True,
)
# Under test.
result = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=tenant_state.tenant_id
tenant_id=get_current_tenant_id()
)
# Postcondition.
@@ -887,7 +878,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Verify chunks were indexed in OpenSearch.
for document in test_documents:
opensearch_chunks = _get_document_chunks_from_opensearch(
opensearch_client, document.id, tenant_state
opensearch_client, document.id, get_current_tenant_id()
)
assert len(opensearch_chunks) == CHUNK_COUNT
opensearch_chunks.sort(key=lambda x: x.chunk_index)
@@ -931,14 +922,11 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
for chunks in document_chunks.values():
all_chunks.extend(chunks)
vespa_document_index.index_raw_chunks(all_chunks)
tenant_state = TenantState(
tenant_id=get_current_tenant_id(), multitenant=MULTI_TENANT
)
# Under test.
# First run.
result_1 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=tenant_state.tenant_id
tenant_id=get_current_tenant_id()
)
# Postcondition.
@@ -959,7 +947,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Verify chunks were indexed in OpenSearch.
for document in test_documents:
opensearch_chunks = _get_document_chunks_from_opensearch(
opensearch_client, document.id, tenant_state
opensearch_client, document.id, get_current_tenant_id()
)
assert len(opensearch_chunks) == CHUNK_COUNT
opensearch_chunks.sort(key=lambda x: x.chunk_index)
@@ -972,7 +960,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Under test.
# Second run.
result_2 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=tenant_state.tenant_id
tenant_id=get_current_tenant_id()
)
# Postcondition.
@@ -994,7 +982,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Verify chunks were indexed in OpenSearch.
for document in test_documents:
opensearch_chunks = _get_document_chunks_from_opensearch(
opensearch_client, document.id, tenant_state
opensearch_client, document.id, get_current_tenant_id()
)
assert len(opensearch_chunks) == CHUNK_COUNT
opensearch_chunks.sort(key=lambda x: x.chunk_index)

View File

@@ -1219,16 +1219,15 @@ def test_code_interpreter_receives_chat_files(
finally:
ci_mod.CodeInterpreterClient.__init__.__defaults__ = original_defaults
# Verify: file uploaded and code executed via streaming.
# Verify: file uploaded, code executed via streaming, staged file cleaned up
assert len(mock_ci_server.get_requests(method="POST", path="/v1/files")) == 1
assert (
len(mock_ci_server.get_requests(method="POST", path="/v1/execute/stream")) == 1
)
# Staged input files are intentionally NOT deleted — PythonTool caches their
# file IDs across agent-loop iterations to avoid re-uploading on every call.
# The code interpreter cleans them up via its own TTL.
assert len(mock_ci_server.get_requests(method="DELETE")) == 0
delete_requests = mock_ci_server.get_requests(method="DELETE")
assert len(delete_requests) == 1
assert delete_requests[0].path.startswith("/v1/files/")
execute_body = mock_ci_server.get_requests(
method="POST", path="/v1/execute/stream"

View File

@@ -1,63 +0,0 @@
"""
Unit test verifying that the upload API path sends tasks with expires=.
The upload_files_to_user_files_with_indexing function must include expires=
on every send_task call to prevent phantom task accumulation if the worker
is down or slow.
"""
from unittest.mock import MagicMock
from unittest.mock import patch
from uuid import uuid4
from onyx.configs.constants import CELERY_USER_FILE_PROCESSING_TASK_EXPIRES
from onyx.configs.constants import OnyxCeleryQueues
from onyx.configs.constants import OnyxCeleryTask
from onyx.db.models import UserFile
from onyx.db.projects import upload_files_to_user_files_with_indexing
def _make_mock_user_file() -> MagicMock:
uf = MagicMock(spec=UserFile)
uf.id = str(uuid4())
return uf
@patch("onyx.db.projects.get_current_tenant_id", return_value="test_tenant")
@patch("onyx.db.projects.create_user_files")
@patch(
"onyx.background.celery.versioned_apps.client.app",
new_callable=MagicMock,
)
def test_send_task_includes_expires(
mock_client_app: MagicMock,
mock_create: MagicMock,
mock_tenant: MagicMock, # noqa: ARG001
) -> None:
"""Every send_task call from the upload path must include expires=."""
user_files = [_make_mock_user_file(), _make_mock_user_file()]
mock_create.return_value = MagicMock(
user_files=user_files,
rejected_files=[],
id_to_temp_id={},
)
mock_user = MagicMock()
mock_db_session = MagicMock()
upload_files_to_user_files_with_indexing(
files=[],
project_id=None,
user=mock_user,
temp_id_map=None,
db_session=mock_db_session,
)
assert mock_client_app.send_task.call_count == len(user_files)
for call in mock_client_app.send_task.call_args_list:
assert call.args[0] == OnyxCeleryTask.PROCESS_SINGLE_USER_FILE
assert call.kwargs.get("queue") == OnyxCeleryQueues.USER_FILE_PROCESSING
assert (
call.kwargs.get("expires") == CELERY_USER_FILE_PROCESSING_TASK_EXPIRES
), "send_task must include expires= to prevent phantom task accumulation"

View File

@@ -0,0 +1,208 @@
from unittest.mock import MagicMock
from unittest.mock import patch
from onyx.access.models import DocumentAccess
from onyx.configs.constants import DocumentSource
from onyx.connectors.models import Document
from onyx.connectors.models import TextSection
from onyx.document_index.interfaces_new import IndexingMetadata
from onyx.document_index.interfaces_new import TenantState
from onyx.document_index.opensearch.opensearch_document_index import (
OpenSearchDocumentIndex,
)
from onyx.indexing.models import DocMetadataAwareIndexChunk
def _make_chunk(
doc_id: str,
chunk_id: int,
) -> DocMetadataAwareIndexChunk:
"""Creates a minimal DocMetadataAwareIndexChunk for testing."""
doc = Document(
id=doc_id,
sections=[TextSection(text="test", link="http://test.com")],
source=DocumentSource.FILE,
semantic_identifier="test_doc",
metadata={},
)
access = DocumentAccess.build(
user_emails=[],
user_groups=[],
external_user_emails=[],
external_user_group_ids=[],
is_public=True,
)
return DocMetadataAwareIndexChunk(
chunk_id=chunk_id,
blurb="test",
content="test content",
source_links={0: "http://test.com"},
image_file_id=None,
section_continuation=False,
source_document=doc,
title_prefix="",
metadata_suffix_semantic="",
metadata_suffix_keyword="",
mini_chunk_texts=None,
large_chunk_id=None,
doc_summary="",
chunk_context="",
contextual_rag_reserved_tokens=0,
embeddings={"full_embedding": [0.1] * 10, "mini_chunk_embeddings": []},
title_embedding=[0.1] * 10,
tenant_id="test_tenant",
access=access,
document_sets=set(),
user_project=[],
personas=[],
boost=0,
aggregated_chunk_boost_factor=1.0,
ancestor_hierarchy_node_ids=[],
)
def _make_index() -> OpenSearchDocumentIndex:
"""Creates an OpenSearchDocumentIndex with a mocked client."""
mock_client = MagicMock()
mock_client.bulk_index_documents = MagicMock()
tenant_state = TenantState(tenant_id="test_tenant", multitenant=False)
index = OpenSearchDocumentIndex.__new__(OpenSearchDocumentIndex)
index._index_name = "test_index"
index._client = mock_client
index._tenant_state = tenant_state
return index
def _make_metadata(doc_id: str, chunk_count: int) -> IndexingMetadata:
return IndexingMetadata(
doc_id_to_chunk_cnt_diff={
doc_id: IndexingMetadata.ChunkCounts(
old_chunk_cnt=0,
new_chunk_cnt=chunk_count,
),
},
)
@patch("onyx.document_index.opensearch.opensearch_document_index.CHUNKS_PER_BATCH", 100)
def test_single_doc_under_batch_limit_flushes_once() -> None:
"""A document with fewer chunks than CHUNKS_PER_BATCH should flush once."""
index = _make_index()
doc_id = "doc_1"
num_chunks = 50
chunks = [_make_chunk(doc_id, i) for i in range(num_chunks)]
metadata = _make_metadata(doc_id, num_chunks)
with patch.object(index, "delete", return_value=0):
index.index(chunks, metadata)
assert index._client.bulk_index_documents.call_count == 1
batch_arg = index._client.bulk_index_documents.call_args_list[0]
assert len(batch_arg.kwargs["documents"]) == num_chunks
@patch("onyx.document_index.opensearch.opensearch_document_index.CHUNKS_PER_BATCH", 100)
def test_single_doc_over_batch_limit_flushes_multiple_times() -> None:
"""A document with more chunks than CHUNKS_PER_BATCH should flush multiple times."""
index = _make_index()
doc_id = "doc_1"
num_chunks = 250
chunks = [_make_chunk(doc_id, i) for i in range(num_chunks)]
metadata = _make_metadata(doc_id, num_chunks)
with patch.object(index, "delete", return_value=0):
index.index(chunks, metadata)
# 250 chunks / 100 per batch = 3 flushes (100 + 100 + 50)
assert index._client.bulk_index_documents.call_count == 3
batch_sizes = [
len(call.kwargs["documents"])
for call in index._client.bulk_index_documents.call_args_list
]
assert batch_sizes == [100, 100, 50]
@patch("onyx.document_index.opensearch.opensearch_document_index.CHUNKS_PER_BATCH", 100)
def test_single_doc_exactly_at_batch_limit() -> None:
"""A document with exactly CHUNKS_PER_BATCH chunks should flush once
(the flush happens on the next chunk, not at the boundary)."""
index = _make_index()
doc_id = "doc_1"
num_chunks = 100
chunks = [_make_chunk(doc_id, i) for i in range(num_chunks)]
metadata = _make_metadata(doc_id, num_chunks)
with patch.object(index, "delete", return_value=0):
index.index(chunks, metadata)
# 100 chunks hit the >= check on chunk 101 which doesn't exist,
# so final flush handles all 100
# Actually: the elif fires when len(current_chunks) >= 100, which happens
# when current_chunks has 100 items and the 101st chunk arrives.
# With exactly 100 chunks, the 100th chunk makes len == 99, then appended -> 100.
# No 101st chunk arrives, so the final flush handles all 100.
assert index._client.bulk_index_documents.call_count == 1
@patch("onyx.document_index.opensearch.opensearch_document_index.CHUNKS_PER_BATCH", 100)
def test_single_doc_one_over_batch_limit() -> None:
"""101 chunks for one doc: first 100 flushed when the 101st arrives, then
the 101st is flushed at the end."""
index = _make_index()
doc_id = "doc_1"
num_chunks = 101
chunks = [_make_chunk(doc_id, i) for i in range(num_chunks)]
metadata = _make_metadata(doc_id, num_chunks)
with patch.object(index, "delete", return_value=0):
index.index(chunks, metadata)
assert index._client.bulk_index_documents.call_count == 2
batch_sizes = [
len(call.kwargs["documents"])
for call in index._client.bulk_index_documents.call_args_list
]
assert batch_sizes == [100, 1]
@patch("onyx.document_index.opensearch.opensearch_document_index.CHUNKS_PER_BATCH", 100)
def test_multiple_docs_each_under_limit_flush_per_doc() -> None:
"""Multiple documents each under the batch limit should flush once per document."""
index = _make_index()
chunks = []
for doc_idx in range(3):
doc_id = f"doc_{doc_idx}"
for chunk_idx in range(50):
chunks.append(_make_chunk(doc_id, chunk_idx))
metadata = IndexingMetadata(
doc_id_to_chunk_cnt_diff={
f"doc_{i}": IndexingMetadata.ChunkCounts(old_chunk_cnt=0, new_chunk_cnt=50)
for i in range(3)
},
)
with patch.object(index, "delete", return_value=0):
index.index(chunks, metadata)
# 3 documents = 3 flushes (one per doc boundary + final)
assert index._client.bulk_index_documents.call_count == 3
@patch("onyx.document_index.opensearch.opensearch_document_index.CHUNKS_PER_BATCH", 100)
def test_delete_called_once_per_document() -> None:
"""Even with multiple flushes for a single document, delete should only be
called once per document."""
index = _make_index()
doc_id = "doc_1"
num_chunks = 250
chunks = [_make_chunk(doc_id, i) for i in range(num_chunks)]
metadata = _make_metadata(doc_id, num_chunks)
with patch.object(index, "delete", return_value=0) as mock_delete:
index.index(chunks, metadata)
mock_delete.assert_called_once_with(doc_id, None)

View File

@@ -0,0 +1,272 @@
"""Unit tests for OpenSearchDocumentIndex.index().
These tests mock the OpenSearch client and verify the buffered
flush-by-document logic, DocumentInsertionRecord construction, and
delete-before-insert semantics.
"""
from collections.abc import Iterator
from unittest.mock import MagicMock
from unittest.mock import patch
from onyx.access.models import DocumentAccess
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentSource
from onyx.connectors.models import TextSection
from onyx.document_index.interfaces_new import IndexingMetadata
from onyx.document_index.interfaces_new import TenantState
from onyx.document_index.opensearch.opensearch_document_index import (
OpenSearchDocumentIndex,
)
from onyx.indexing.models import ChunkEmbedding
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.indexing.models import IndexChunk
def _make_chunk(
doc_id: str,
chunk_id: int = 0,
content: str = "test content",
chunk_count: int | None = None,
) -> DocMetadataAwareIndexChunk:
doc = Document(
id=doc_id,
semantic_identifier="test_doc",
sections=[TextSection(text=content, link=None)],
source=DocumentSource.NOT_APPLICABLE,
metadata={},
chunk_count=chunk_count,
)
index_chunk = IndexChunk(
chunk_id=chunk_id,
blurb=content[:50],
content=content,
source_links=None,
image_file_id=None,
section_continuation=False,
source_document=doc,
title_prefix="",
metadata_suffix_semantic="",
metadata_suffix_keyword="",
contextual_rag_reserved_tokens=0,
doc_summary="",
chunk_context="",
mini_chunk_texts=None,
large_chunk_id=None,
embeddings=ChunkEmbedding(
full_embedding=[0.1] * 10,
mini_chunk_embeddings=[],
),
title_embedding=[0.1] * 10,
)
access = DocumentAccess.build(
user_emails=[],
user_groups=[],
external_user_emails=[],
external_user_group_ids=[],
is_public=True,
)
return DocMetadataAwareIndexChunk.from_index_chunk(
index_chunk=index_chunk,
access=access,
document_sets=set(),
user_project=[],
personas=[],
boost=0,
aggregated_chunk_boost_factor=1.0,
tenant_id="test_tenant",
)
def _make_indexing_metadata(
doc_ids: list[str],
old_counts: list[int],
new_counts: list[int],
) -> IndexingMetadata:
return IndexingMetadata(
doc_id_to_chunk_cnt_diff={
doc_id: IndexingMetadata.ChunkCounts(
old_chunk_cnt=old,
new_chunk_cnt=new,
)
for doc_id, old, new in zip(doc_ids, old_counts, new_counts)
}
)
def _make_os_index(mock_client: MagicMock) -> OpenSearchDocumentIndex:
"""Create an OpenSearchDocumentIndex with a mocked client."""
with patch.object(
OpenSearchDocumentIndex,
"__init__",
lambda _self, *_a, **_kw: None,
):
idx = OpenSearchDocumentIndex.__new__(OpenSearchDocumentIndex)
idx._index_name = "test_index"
idx._tenant_state = TenantState(tenant_id="test_tenant", multitenant=False)
idx._client = mock_client
return idx
def test_index_single_new_doc() -> None:
"""Indexing a single new document returns one record with already_existed=False."""
mock_client = MagicMock()
mock_client.bulk_index_documents.return_value = None
idx = _make_os_index(mock_client)
# Patch delete to return 0 (no existing chunks)
with patch.object(idx, "delete", return_value=0) as mock_delete:
chunk = _make_chunk("doc1")
metadata = _make_indexing_metadata(["doc1"], old_counts=[0], new_counts=[1])
results = idx.index(chunks=[chunk], indexing_metadata=metadata)
assert len(results) == 1
assert results[0].document_id == "doc1"
assert results[0].already_existed is False
mock_delete.assert_called_once()
mock_client.bulk_index_documents.assert_called_once()
def test_index_existing_doc_already_existed_true() -> None:
"""Re-indexing a doc with previous chunks returns already_existed=True."""
mock_client = MagicMock()
mock_client.bulk_index_documents.return_value = None
idx = _make_os_index(mock_client)
with patch.object(idx, "delete", return_value=5):
chunk = _make_chunk("doc1")
metadata = _make_indexing_metadata(["doc1"], old_counts=[5], new_counts=[1])
results = idx.index(chunks=[chunk], indexing_metadata=metadata)
assert len(results) == 1
assert results[0].already_existed is True
def test_index_multiple_docs_flushed_separately() -> None:
"""Chunks from different documents are flushed in separate bulk calls."""
mock_client = MagicMock()
mock_client.bulk_index_documents.return_value = None
idx = _make_os_index(mock_client)
with patch.object(idx, "delete", return_value=0):
chunks = [
_make_chunk("doc1", chunk_id=0),
_make_chunk("doc1", chunk_id=1),
_make_chunk("doc2", chunk_id=0),
]
metadata = _make_indexing_metadata(
["doc1", "doc2"], old_counts=[0, 0], new_counts=[2, 1]
)
results = idx.index(chunks=chunks, indexing_metadata=metadata)
result_map = {r.document_id: r.already_existed for r in results}
assert len(result_map) == 2
assert result_map["doc1"] is False
assert result_map["doc2"] is False
# Two separate flushes: one for doc1 (2 chunks), one for doc2 (1 chunk)
assert mock_client.bulk_index_documents.call_count == 2
def test_index_deletes_before_inserting() -> None:
"""For each document, delete is called before bulk_index_documents."""
mock_client = MagicMock()
mock_client.bulk_index_documents.return_value = None
call_order: list[str] = []
idx = _make_os_index(mock_client)
def track_delete(*_args: object, **_kwargs: object) -> int:
call_order.append("delete")
return 3
def track_bulk(*_args: object, **_kwargs: object) -> None:
call_order.append("bulk_index")
mock_client.bulk_index_documents.side_effect = track_bulk
with patch.object(idx, "delete", side_effect=track_delete):
chunk = _make_chunk("doc1")
metadata = _make_indexing_metadata(["doc1"], old_counts=[3], new_counts=[1])
idx.index(chunks=[chunk], indexing_metadata=metadata)
assert call_order == ["delete", "bulk_index"]
def test_index_delete_called_once_per_doc() -> None:
"""Delete is called only once per document, even with multiple chunks."""
mock_client = MagicMock()
mock_client.bulk_index_documents.return_value = None
idx = _make_os_index(mock_client)
with patch.object(idx, "delete", return_value=0) as mock_delete:
# 3 chunks, all same doc — should only delete once
chunks = [_make_chunk("doc1", chunk_id=i) for i in range(3)]
metadata = _make_indexing_metadata(["doc1"], old_counts=[0], new_counts=[3])
idx.index(chunks=chunks, indexing_metadata=metadata)
mock_delete.assert_called_once()
def test_index_flushes_on_doc_boundary() -> None:
"""When doc ID changes in the stream, the previous doc's chunks are flushed."""
mock_client = MagicMock()
mock_client.bulk_index_documents.return_value = None
idx = _make_os_index(mock_client)
bulk_call_chunk_counts: list[int] = []
def track_bulk(documents: list[object], **_kwargs: object) -> None:
bulk_call_chunk_counts.append(len(documents))
mock_client.bulk_index_documents.side_effect = track_bulk
with patch.object(idx, "delete", return_value=0):
chunks = [
_make_chunk("doc1", chunk_id=0),
_make_chunk("doc1", chunk_id=1),
_make_chunk("doc1", chunk_id=2),
_make_chunk("doc2", chunk_id=0),
_make_chunk("doc2", chunk_id=1),
]
metadata = _make_indexing_metadata(
["doc1", "doc2"], old_counts=[0, 0], new_counts=[3, 2]
)
idx.index(chunks=chunks, indexing_metadata=metadata)
# First flush: 3 chunks for doc1, second flush: 2 chunks for doc2
assert bulk_call_chunk_counts == [3, 2]
def test_index_with_generator_input() -> None:
"""The index method works with a generator (iterable) input, not just lists."""
mock_client = MagicMock()
mock_client.bulk_index_documents.return_value = None
idx = _make_os_index(mock_client)
consumed: list[int] = []
def chunk_gen() -> Iterator[DocMetadataAwareIndexChunk]:
for i in range(3):
consumed.append(i)
yield _make_chunk("doc1", chunk_id=i)
with patch.object(idx, "delete", return_value=0):
metadata = _make_indexing_metadata(["doc1"], old_counts=[0], new_counts=[3])
results = idx.index(chunks=chunk_gen(), indexing_metadata=metadata)
assert consumed == [0, 1, 2]
assert len(results) == 1

View File

@@ -0,0 +1,417 @@
"""Unit tests for VespaDocumentIndex.index().
These tests mock all external I/O (HTTP calls, thread pools) and verify
the streaming logic, ID cleaning/mapping, and DocumentInsertionRecord
construction.
"""
from collections.abc import Iterator
from unittest.mock import MagicMock
from unittest.mock import patch
from uuid import uuid4
from onyx.access.models import DocumentAccess
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentSource
from onyx.connectors.models import TextSection
from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo
from onyx.document_index.interfaces_new import IndexingMetadata
from onyx.document_index.interfaces_new import TenantState
from onyx.document_index.vespa.vespa_document_index import VespaDocumentIndex
from onyx.indexing.models import ChunkEmbedding
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.indexing.models import IndexChunk
def _make_chunk(
doc_id: str,
chunk_id: int = 0,
content: str = "test content",
) -> DocMetadataAwareIndexChunk:
doc = Document(
id=doc_id,
semantic_identifier="test_doc",
sections=[TextSection(text=content, link=None)],
source=DocumentSource.NOT_APPLICABLE,
metadata={},
)
index_chunk = IndexChunk(
chunk_id=chunk_id,
blurb=content[:50],
content=content,
source_links=None,
image_file_id=None,
section_continuation=False,
source_document=doc,
title_prefix="",
metadata_suffix_semantic="",
metadata_suffix_keyword="",
contextual_rag_reserved_tokens=0,
doc_summary="",
chunk_context="",
mini_chunk_texts=None,
large_chunk_id=None,
embeddings=ChunkEmbedding(
full_embedding=[0.1] * 10,
mini_chunk_embeddings=[],
),
title_embedding=None,
)
access = DocumentAccess.build(
user_emails=[],
user_groups=[],
external_user_emails=[],
external_user_group_ids=[],
is_public=True,
)
return DocMetadataAwareIndexChunk.from_index_chunk(
index_chunk=index_chunk,
access=access,
document_sets=set(),
user_project=[],
personas=[],
boost=0,
aggregated_chunk_boost_factor=1.0,
tenant_id="test_tenant",
)
def _make_indexing_metadata(
doc_ids: list[str],
old_counts: list[int],
new_counts: list[int],
) -> IndexingMetadata:
return IndexingMetadata(
doc_id_to_chunk_cnt_diff={
doc_id: IndexingMetadata.ChunkCounts(
old_chunk_cnt=old,
new_chunk_cnt=new,
)
for doc_id, old, new in zip(doc_ids, old_counts, new_counts)
}
)
def _stub_enrich(
doc_id: str,
old_chunk_cnt: int,
) -> EnrichedDocumentIndexingInfo:
"""Build an EnrichedDocumentIndexingInfo that says 'no chunks to delete'
when old_chunk_cnt == 0, or 'has existing chunks' otherwise."""
return EnrichedDocumentIndexingInfo(
doc_id=doc_id,
chunk_start_index=0,
old_version=False,
chunk_end_index=old_chunk_cnt,
)
@patch("onyx.document_index.vespa.vespa_document_index.batch_index_vespa_chunks")
@patch("onyx.document_index.vespa.vespa_document_index.delete_vespa_chunks")
@patch(
"onyx.document_index.vespa.vespa_document_index.get_document_chunk_ids",
return_value=[],
)
@patch("onyx.document_index.vespa.vespa_document_index._enrich_basic_chunk_info")
def test_index_single_new_doc(
mock_enrich: MagicMock,
mock_get_chunk_ids: MagicMock, # noqa: ARG001
mock_delete: MagicMock, # noqa: ARG001
mock_batch_index: MagicMock,
) -> None:
"""Indexing a single new document returns one record with already_existed=False."""
mock_enrich.return_value = _stub_enrich("doc1", old_chunk_cnt=0)
index = VespaDocumentIndex(
index_name="test_index",
tenant_state=TenantState(tenant_id="test_tenant", multitenant=False),
large_chunks_enabled=False,
httpx_client=MagicMock(),
)
chunk = _make_chunk("doc1")
metadata = _make_indexing_metadata(["doc1"], old_counts=[0], new_counts=[1])
results = index.index(chunks=[chunk], indexing_metadata=metadata)
assert len(results) == 1
assert results[0].document_id == "doc1"
assert results[0].already_existed is False
# batch_index_vespa_chunks should be called once with a single cleaned chunk
mock_batch_index.assert_called_once()
call_kwargs = mock_batch_index.call_args
indexed_chunks = call_kwargs.kwargs["chunks"]
assert len(indexed_chunks) == 1
assert indexed_chunks[0].source_document.id == "doc1"
assert call_kwargs.kwargs["index_name"] == "test_index"
assert call_kwargs.kwargs["multitenant"] is False
@patch("onyx.document_index.vespa.vespa_document_index.batch_index_vespa_chunks")
@patch("onyx.document_index.vespa.vespa_document_index.delete_vespa_chunks")
@patch(
"onyx.document_index.vespa.vespa_document_index.get_document_chunk_ids",
return_value=[],
)
@patch("onyx.document_index.vespa.vespa_document_index._enrich_basic_chunk_info")
def test_index_existing_doc_already_existed_true(
mock_enrich: MagicMock,
mock_get_chunk_ids: MagicMock,
mock_delete: MagicMock,
mock_batch_index: MagicMock,
) -> None:
"""Re-indexing a doc with previous chunks deletes old chunks, indexes
new ones, and returns already_existed=True."""
fake_chunk_ids = [uuid4(), uuid4()]
mock_enrich.return_value = _stub_enrich("doc1", old_chunk_cnt=5)
mock_get_chunk_ids.return_value = fake_chunk_ids
index = VespaDocumentIndex(
index_name="test_index",
tenant_state=TenantState(tenant_id="test_tenant", multitenant=False),
large_chunks_enabled=False,
httpx_client=MagicMock(),
)
chunk = _make_chunk("doc1")
metadata = _make_indexing_metadata(["doc1"], old_counts=[5], new_counts=[1])
results = index.index(chunks=[chunk], indexing_metadata=metadata)
assert len(results) == 1
assert results[0].already_existed is True
# Old chunks should be deleted
mock_delete.assert_called_once()
delete_kwargs = mock_delete.call_args.kwargs
assert delete_kwargs["doc_chunk_ids"] == fake_chunk_ids
assert delete_kwargs["index_name"] == "test_index"
# New chunk should be indexed
mock_batch_index.assert_called_once()
indexed_chunks = mock_batch_index.call_args.kwargs["chunks"]
assert len(indexed_chunks) == 1
assert indexed_chunks[0].source_document.id == "doc1"
@patch("onyx.document_index.vespa.vespa_document_index.batch_index_vespa_chunks")
@patch("onyx.document_index.vespa.vespa_document_index.delete_vespa_chunks")
@patch(
"onyx.document_index.vespa.vespa_document_index.get_document_chunk_ids",
return_value=[],
)
@patch("onyx.document_index.vespa.vespa_document_index._enrich_basic_chunk_info")
def test_index_multiple_docs(
mock_enrich: MagicMock,
mock_get_chunk_ids: MagicMock, # noqa: ARG001
mock_delete: MagicMock, # noqa: ARG001
mock_batch_index: MagicMock,
) -> None:
"""Indexing multiple documents returns one record per unique document."""
mock_enrich.side_effect = [
_stub_enrich("doc1", old_chunk_cnt=0),
_stub_enrich("doc2", old_chunk_cnt=3),
]
index = VespaDocumentIndex(
index_name="test_index",
tenant_state=TenantState(tenant_id="test_tenant", multitenant=False),
large_chunks_enabled=False,
httpx_client=MagicMock(),
)
chunks = [
_make_chunk("doc1", chunk_id=0),
_make_chunk("doc1", chunk_id=1),
_make_chunk("doc2", chunk_id=0),
]
metadata = _make_indexing_metadata(
["doc1", "doc2"], old_counts=[0, 3], new_counts=[2, 1]
)
results = index.index(chunks=chunks, indexing_metadata=metadata)
result_map = {r.document_id: r.already_existed for r in results}
assert len(result_map) == 2
assert result_map["doc1"] is False
assert result_map["doc2"] is True
# All 3 chunks fit in one batch (BATCH_SIZE=128), so one call
mock_batch_index.assert_called_once()
indexed_chunks = mock_batch_index.call_args.kwargs["chunks"]
assert len(indexed_chunks) == 3
indexed_doc_ids = [c.source_document.id for c in indexed_chunks]
assert indexed_doc_ids == ["doc1", "doc1", "doc2"]
@patch("onyx.document_index.vespa.vespa_document_index.batch_index_vespa_chunks")
@patch("onyx.document_index.vespa.vespa_document_index.delete_vespa_chunks")
@patch(
"onyx.document_index.vespa.vespa_document_index.get_document_chunk_ids",
return_value=[],
)
@patch("onyx.document_index.vespa.vespa_document_index._enrich_basic_chunk_info")
def test_index_cleans_doc_ids(
mock_enrich: MagicMock,
mock_get_chunk_ids: MagicMock, # noqa: ARG001
mock_delete: MagicMock, # noqa: ARG001
mock_batch_index: MagicMock,
) -> None:
"""Documents with invalid Vespa characters get cleaned IDs, but
the returned DocumentInsertionRecord uses the original ID."""
doc_id_with_quote = "doc'1"
mock_enrich.return_value = _stub_enrich(doc_id_with_quote, old_chunk_cnt=0)
index = VespaDocumentIndex(
index_name="test_index",
tenant_state=TenantState(tenant_id="test_tenant", multitenant=False),
large_chunks_enabled=False,
httpx_client=MagicMock(),
)
chunk = _make_chunk(doc_id_with_quote)
metadata = _make_indexing_metadata(
[doc_id_with_quote], old_counts=[0], new_counts=[1]
)
results = index.index(chunks=[chunk], indexing_metadata=metadata)
assert len(results) == 1
# The returned ID should be the original (unclean) ID
assert results[0].document_id == doc_id_with_quote
# The chunk passed to batch_index_vespa_chunks should have the cleaned ID
indexed_chunks = mock_batch_index.call_args.kwargs["chunks"]
assert len(indexed_chunks) == 1
assert indexed_chunks[0].source_document.id == "doc_1" # quote replaced with _
@patch("onyx.document_index.vespa.vespa_document_index.batch_index_vespa_chunks")
@patch("onyx.document_index.vespa.vespa_document_index.delete_vespa_chunks")
@patch(
"onyx.document_index.vespa.vespa_document_index.get_document_chunk_ids",
return_value=[],
)
@patch("onyx.document_index.vespa.vespa_document_index._enrich_basic_chunk_info")
def test_index_deduplicates_doc_ids_in_results(
mock_enrich: MagicMock,
mock_get_chunk_ids: MagicMock, # noqa: ARG001
mock_delete: MagicMock, # noqa: ARG001
mock_batch_index: MagicMock,
) -> None:
"""Multiple chunks from the same document produce only one
DocumentInsertionRecord."""
mock_enrich.return_value = _stub_enrich("doc1", old_chunk_cnt=0)
index = VespaDocumentIndex(
index_name="test_index",
tenant_state=TenantState(tenant_id="test_tenant", multitenant=False),
large_chunks_enabled=False,
httpx_client=MagicMock(),
)
chunks = [_make_chunk("doc1", chunk_id=i) for i in range(5)]
metadata = _make_indexing_metadata(["doc1"], old_counts=[0], new_counts=[5])
results = index.index(chunks=chunks, indexing_metadata=metadata)
assert len(results) == 1
assert results[0].document_id == "doc1"
# All 5 chunks should be passed to batch_index_vespa_chunks
mock_batch_index.assert_called_once()
indexed_chunks = mock_batch_index.call_args.kwargs["chunks"]
assert len(indexed_chunks) == 5
assert all(c.source_document.id == "doc1" for c in indexed_chunks)
@patch("onyx.document_index.vespa.vespa_document_index.batch_index_vespa_chunks")
@patch("onyx.document_index.vespa.vespa_document_index.delete_vespa_chunks")
@patch(
"onyx.document_index.vespa.vespa_document_index.get_document_chunk_ids",
return_value=[],
)
@patch("onyx.document_index.vespa.vespa_document_index._enrich_basic_chunk_info")
@patch(
"onyx.document_index.vespa.vespa_document_index.BATCH_SIZE",
3,
)
def test_index_respects_batch_size(
mock_enrich: MagicMock,
mock_get_chunk_ids: MagicMock, # noqa: ARG001
mock_delete: MagicMock, # noqa: ARG001
mock_batch_index: MagicMock,
) -> None:
"""When chunks exceed BATCH_SIZE, batch_index_vespa_chunks is called
multiple times with correctly sized batches."""
mock_enrich.return_value = _stub_enrich("doc1", old_chunk_cnt=0)
index = VespaDocumentIndex(
index_name="test_index",
tenant_state=TenantState(tenant_id="test_tenant", multitenant=False),
large_chunks_enabled=False,
httpx_client=MagicMock(),
)
chunks = [_make_chunk("doc1", chunk_id=i) for i in range(7)]
metadata = _make_indexing_metadata(["doc1"], old_counts=[0], new_counts=[7])
results = index.index(chunks=chunks, indexing_metadata=metadata)
assert len(results) == 1
# With BATCH_SIZE=3 and 7 chunks: batches of 3, 3, 1
assert mock_batch_index.call_count == 3
batch_sizes = [len(c.kwargs["chunks"]) for c in mock_batch_index.call_args_list]
assert batch_sizes == [3, 3, 1]
# Verify all chunks are accounted for and in order
all_indexed = [
chunk for c in mock_batch_index.call_args_list for chunk in c.kwargs["chunks"]
]
assert len(all_indexed) == 7
assert [c.chunk_id for c in all_indexed] == list(range(7))
@patch("onyx.document_index.vespa.vespa_document_index.batch_index_vespa_chunks")
@patch("onyx.document_index.vespa.vespa_document_index.delete_vespa_chunks")
@patch(
"onyx.document_index.vespa.vespa_document_index.get_document_chunk_ids",
return_value=[],
)
@patch("onyx.document_index.vespa.vespa_document_index._enrich_basic_chunk_info")
def test_index_streams_chunks_lazily(
mock_enrich: MagicMock,
mock_get_chunk_ids: MagicMock, # noqa: ARG001
mock_delete: MagicMock, # noqa: ARG001
mock_batch_index: MagicMock, # noqa: ARG001
) -> None:
"""Chunks are consumed lazily via a generator, not materialized upfront."""
mock_enrich.return_value = _stub_enrich("doc1", old_chunk_cnt=0)
index = VespaDocumentIndex(
index_name="test_index",
tenant_state=TenantState(tenant_id="test_tenant", multitenant=False),
large_chunks_enabled=False,
httpx_client=MagicMock(),
)
consumed: list[int] = []
def chunk_generator() -> Iterator[DocMetadataAwareIndexChunk]:
for i in range(3):
consumed.append(i)
yield _make_chunk("doc1", chunk_id=i)
metadata = _make_indexing_metadata(["doc1"], old_counts=[0], new_counts=[3])
# Before calling index, nothing consumed
gen = chunk_generator()
assert len(consumed) == 0
results = index.index(chunks=gen, indexing_metadata=metadata)
# After calling index, all chunks should have been consumed
assert consumed == [0, 1, 2]
assert len(results) == 1

View File

@@ -1,45 +0,0 @@
%PDF-1.3
%<25><><EFBFBD><EFBFBD>
1 0 obj
<<
/Producer (pypdf)
>>
endobj
2 0 obj
<<
/Type /Pages
/Count 1
/Kids [ 4 0 R ]
>>
endobj
3 0 obj
<<
/Type /Catalog
/Pages 2 0 R
>>
endobj
4 0 obj
<<
/Type /Page
/Resources <<
>>
/MediaBox [ 0.0 0.0 200 200 ]
/Parent 2 0 R
>>
endobj
xref
0 5
0000000000 65535 f
0000000015 00000 n
0000000054 00000 n
0000000113 00000 n
0000000162 00000 n
trailer
<<
/Size 5
/Root 3 0 R
/Info 1 0 R
>>
startxref
256
%%EOF

View File

@@ -1,89 +0,0 @@
%PDF-1.3
%<25><><EFBFBD><EFBFBD>
1 0 obj
<<
/Producer (pypdf)
>>
endobj
2 0 obj
<<
/Type /Pages
/Count 2
/Kids [ 4 0 R 6 0 R ]
>>
endobj
3 0 obj
<<
/Type /Catalog
/Pages 2 0 R
>>
endobj
4 0 obj
<<
/Type /Page
/Resources <<
/Font <<
/F1 <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
>>
>>
/MediaBox [ 0.0 0.0 200 200 ]
/Contents 5 0 R
/Parent 2 0 R
>>
endobj
5 0 obj
<<
/Length 47
>>
stream
BT /F1 12 Tf 50 150 Td (Page one content) Tj ET
endstream
endobj
6 0 obj
<<
/Type /Page
/Resources <<
/Font <<
/F1 <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
>>
>>
/MediaBox [ 0.0 0.0 200 200 ]
/Contents 7 0 R
/Parent 2 0 R
>>
endobj
7 0 obj
<<
/Length 47
>>
stream
BT /F1 12 Tf 50 150 Td (Page two content) Tj ET
endstream
endobj
xref
0 8
0000000000 65535 f
0000000015 00000 n
0000000054 00000 n
0000000119 00000 n
0000000168 00000 n
0000000349 00000 n
0000000446 00000 n
0000000627 00000 n
trailer
<<
/Size 8
/Root 3 0 R
/Info 1 0 R
>>
startxref
724
%%EOF

View File

@@ -1,62 +0,0 @@
%PDF-1.3
%<25><><EFBFBD><EFBFBD>
1 0 obj
<<
/Producer (pypdf)
>>
endobj
2 0 obj
<<
/Type /Pages
/Count 1
/Kids [ 4 0 R ]
>>
endobj
3 0 obj
<<
/Type /Catalog
/Pages 2 0 R
>>
endobj
4 0 obj
<<
/Type /Page
/Resources <<
/Font <<
/F1 <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
>>
>>
/MediaBox [ 0.0 0.0 200 200 ]
/Contents 5 0 R
/Parent 2 0 R
>>
endobj
5 0 obj
<<
/Length 42
>>
stream
BT /F1 12 Tf 50 150 Td (Hello World) Tj ET
endstream
endobj
xref
0 6
0000000000 65535 f
0000000015 00000 n
0000000054 00000 n
0000000113 00000 n
0000000162 00000 n
0000000343 00000 n
trailer
<<
/Size 6
/Root 3 0 R
/Info 1 0 R
>>
startxref
435
%%EOF

View File

@@ -1,64 +0,0 @@
%PDF-1.3
%<25><><EFBFBD><EFBFBD>
1 0 obj
<<
/Producer (pypdf)
/Title (My Title)
/Author (Jane Doe)
>>
endobj
2 0 obj
<<
/Type /Pages
/Count 1
/Kids [ 4 0 R ]
>>
endobj
3 0 obj
<<
/Type /Catalog
/Pages 2 0 R
>>
endobj
4 0 obj
<<
/Type /Page
/Resources <<
/Font <<
/F1 <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
>>
>>
/MediaBox [ 0.0 0.0 200 200 ]
/Contents 5 0 R
/Parent 2 0 R
>>
endobj
5 0 obj
<<
/Length 35
>>
stream
BT /F1 12 Tf 50 150 Td (test) Tj ET
endstream
endobj
xref
0 6
0000000000 65535 f
0000000015 00000 n
0000000091 00000 n
0000000150 00000 n
0000000199 00000 n
0000000380 00000 n
trailer
<<
/Size 6
/Root 3 0 R
/Info 1 0 R
>>
startxref
465
%%EOF

View File

@@ -1,89 +0,0 @@
"""
Unit tests for image summarization error handling.
Verifies that:
1. LLM errors produce actionable error messages (not base64 dumps)
2. Unsupported MIME type logs include the magic bytes and size
3. The ValueError raised on LLM failure preserves the original exception
"""
from unittest.mock import MagicMock
from unittest.mock import patch
import pytest
from onyx.file_processing.image_summarization import _summarize_image
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
class TestSummarizeImageErrorMessage:
"""_summarize_image must not dump base64 image data into error messages."""
def test_error_message_contains_exception_type_not_base64(self) -> None:
"""The ValueError should contain the original exception info, not message payloads."""
mock_llm = MagicMock()
mock_llm.invoke.side_effect = RuntimeError("Connection timeout")
# A fake base64-encoded image string (should NOT appear in the error)
fake_encoded = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUg..."
with pytest.raises(ValueError, match="RuntimeError: Connection timeout"):
_summarize_image(fake_encoded, mock_llm, query="test")
def test_error_message_does_not_contain_base64(self) -> None:
"""Ensure base64 data is never included in the error message."""
mock_llm = MagicMock()
mock_llm.invoke.side_effect = RuntimeError("API error")
fake_encoded = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA"
with pytest.raises(ValueError) as exc_info:
_summarize_image(fake_encoded, mock_llm)
error_str = str(exc_info.value)
assert "base64" not in error_str
assert "iVBOR" not in error_str
def test_original_exception_is_chained(self) -> None:
"""The ValueError should chain the original exception via __cause__."""
mock_llm = MagicMock()
original = RuntimeError("upstream failure")
mock_llm.invoke.side_effect = original
with pytest.raises(ValueError) as exc_info:
_summarize_image("data:image/png;base64,abc", mock_llm)
assert exc_info.value.__cause__ is original
class TestUnsupportedMimeTypeLogging:
"""summarize_image_with_error_handling should log useful info for unsupported formats."""
@patch(
"onyx.file_processing.image_summarization.summarize_image_pipeline",
side_effect=__import__(
"onyx.file_processing.image_summarization",
fromlist=["UnsupportedImageFormatError"],
).UnsupportedImageFormatError("unsupported"),
)
def test_logs_magic_bytes_and_size(
self, mock_pipeline: MagicMock # noqa: ARG002
) -> None:
"""The info log should include magic bytes hex and image size."""
mock_llm = MagicMock()
# TIFF magic bytes (not in the supported list)
image_data = b"\x49\x49\x2a\x00" + b"\x00" * 100
with patch("onyx.file_processing.image_summarization.logger") as mock_logger:
result = summarize_image_with_error_handling(
llm=mock_llm,
image_data=image_data,
context_name="test_image.tiff",
)
assert result is None
mock_logger.info.assert_called_once()
log_args = mock_logger.info.call_args
# Check the format string args contain magic bytes and size
assert "49492a00" in str(log_args)
assert "104" in str(log_args) # 4 + 100 bytes

View File

@@ -1,141 +0,0 @@
"""
Unit tests verifying that LiteLLM error details are extracted and surfaced
in image summarization error messages.
When the LLM call fails, the error handler should include the status_code,
llm_provider, and model from LiteLLM exceptions so operators can diagnose
the root cause (rate limit, content filter, unsupported vision, etc.)
without needing to dig through LiteLLM internals.
"""
from unittest.mock import MagicMock
import pytest
from onyx.file_processing.image_summarization import _summarize_image
def _make_litellm_style_error(
*,
message: str = "API error",
status_code: int | None = None,
llm_provider: str | None = None,
model: str | None = None,
) -> RuntimeError:
"""Create an exception with LiteLLM-style attributes."""
exc = RuntimeError(message)
if status_code is not None:
exc.status_code = status_code # type: ignore[attr-defined]
if llm_provider is not None:
exc.llm_provider = llm_provider # type: ignore[attr-defined]
if model is not None:
exc.model = model # type: ignore[attr-defined]
return exc
class TestLiteLLMErrorExtraction:
"""Verify that LiteLLM error attributes are included in the ValueError."""
def test_status_code_included(self) -> None:
mock_llm = MagicMock()
mock_llm.invoke.side_effect = _make_litellm_style_error(
message="Content filter triggered",
status_code=400,
llm_provider="azure",
model="gpt-4o",
)
with pytest.raises(ValueError, match="status_code=400"):
_summarize_image("data:image/png;base64,abc", mock_llm)
def test_llm_provider_included(self) -> None:
mock_llm = MagicMock()
mock_llm.invoke.side_effect = _make_litellm_style_error(
message="Bad request",
status_code=400,
llm_provider="azure",
)
with pytest.raises(ValueError, match="llm_provider=azure"):
_summarize_image("data:image/png;base64,abc", mock_llm)
def test_model_included(self) -> None:
mock_llm = MagicMock()
mock_llm.invoke.side_effect = _make_litellm_style_error(
message="Bad request",
model="gpt-4o",
)
with pytest.raises(ValueError, match="model=gpt-4o"):
_summarize_image("data:image/png;base64,abc", mock_llm)
def test_all_fields_in_single_message(self) -> None:
mock_llm = MagicMock()
mock_llm.invoke.side_effect = _make_litellm_style_error(
message="Rate limit exceeded",
status_code=429,
llm_provider="azure",
model="gpt-4o",
)
with pytest.raises(ValueError) as exc_info:
_summarize_image("data:image/png;base64,abc", mock_llm)
msg = str(exc_info.value)
assert "status_code=429" in msg
assert "llm_provider=azure" in msg
assert "model=gpt-4o" in msg
assert "Rate limit exceeded" in msg
def test_plain_exception_without_litellm_attrs(self) -> None:
"""Non-LiteLLM exceptions should still produce a useful message."""
mock_llm = MagicMock()
mock_llm.invoke.side_effect = ConnectionError("Connection refused")
with pytest.raises(ValueError) as exc_info:
_summarize_image("data:image/png;base64,abc", mock_llm)
msg = str(exc_info.value)
assert "ConnectionError" in msg
assert "Connection refused" in msg
# Should not contain status_code/llm_provider/model
assert "status_code" not in msg
assert "llm_provider" not in msg
def test_no_base64_in_error(self) -> None:
"""Error messages must not contain the full base64 image payload.
Some LiteLLM exceptions echo the request body (including base64 images)
in their message. The truncation guard ensures the bulk of such a
payload is stripped from the re-raised ValueError.
"""
mock_llm = MagicMock()
# Build a long base64-like payload that exceeds the 512-char truncation
fake_b64_payload = "iVBORw0KGgo" * 100 # ~1100 chars
fake_b64 = f"data:image/png;base64,{fake_b64_payload}"
mock_llm.invoke.side_effect = RuntimeError(
f"Request failed for payload: {fake_b64}"
)
with pytest.raises(ValueError) as exc_info:
_summarize_image(fake_b64, mock_llm)
msg = str(exc_info.value)
# The full payload must not appear (truncation should have kicked in)
assert fake_b64_payload not in msg
assert "truncated" in msg
def test_long_error_message_truncated(self) -> None:
"""Exception messages longer than 512 chars are truncated."""
mock_llm = MagicMock()
long_msg = "x" * 1000
mock_llm.invoke.side_effect = RuntimeError(long_msg)
with pytest.raises(ValueError) as exc_info:
_summarize_image("data:image/png;base64,abc", mock_llm)
msg = str(exc_info.value)
assert "truncated" in msg
# The full 1000-char string should not appear
assert long_msg not in msg

View File

@@ -1,124 +0,0 @@
"""Unit tests for pypdf-dependent PDF processing functions.
Tests cover:
- read_pdf_file: text extraction, metadata, encrypted PDFs, image extraction
- pdf_to_text: convenience wrapper
- is_pdf_protected: password protection detection
Fixture PDFs live in ./fixtures/ and are pre-built so the test layer has no
dependency on pypdf internals (pypdf.generic).
"""
from io import BytesIO
from pathlib import Path
from onyx.file_processing.extract_file_text import pdf_to_text
from onyx.file_processing.extract_file_text import read_pdf_file
from onyx.file_processing.password_validation import is_pdf_protected
FIXTURES = Path(__file__).parent / "fixtures"
def _load(name: str) -> BytesIO:
return BytesIO((FIXTURES / name).read_bytes())
# ── read_pdf_file ────────────────────────────────────────────────────────
class TestReadPdfFile:
def test_basic_text_extraction(self) -> None:
text, _, images = read_pdf_file(_load("simple.pdf"))
assert "Hello World" in text
assert images == []
def test_multi_page_text_extraction(self) -> None:
text, _, _ = read_pdf_file(_load("multipage.pdf"))
assert "Page one content" in text
assert "Page two content" in text
def test_metadata_extraction(self) -> None:
_, pdf_metadata, _ = read_pdf_file(_load("with_metadata.pdf"))
assert pdf_metadata.get("Title") == "My Title"
assert pdf_metadata.get("Author") == "Jane Doe"
def test_encrypted_pdf_with_correct_password(self) -> None:
text, _, _ = read_pdf_file(_load("encrypted.pdf"), pdf_pass="pass123")
assert "Secret Content" in text
def test_encrypted_pdf_without_password(self) -> None:
text, _, _ = read_pdf_file(_load("encrypted.pdf"))
assert text == ""
def test_encrypted_pdf_with_wrong_password(self) -> None:
text, _, _ = read_pdf_file(_load("encrypted.pdf"), pdf_pass="wrong")
assert text == ""
def test_empty_pdf(self) -> None:
text, _, _ = read_pdf_file(_load("empty.pdf"))
assert text.strip() == ""
def test_invalid_pdf_returns_empty(self) -> None:
text, _, images = read_pdf_file(BytesIO(b"this is not a pdf"))
assert text == ""
assert images == []
def test_image_extraction_disabled_by_default(self) -> None:
_, _, images = read_pdf_file(_load("with_image.pdf"))
assert images == []
def test_image_extraction_collects_images(self) -> None:
_, _, images = read_pdf_file(_load("with_image.pdf"), extract_images=True)
assert len(images) == 1
img_bytes, img_name = images[0]
assert len(img_bytes) > 0
assert img_name # non-empty name
def test_image_callback_streams_instead_of_collecting(self) -> None:
"""With image_callback, images are streamed via callback and not accumulated."""
collected: list[tuple[bytes, str]] = []
def callback(data: bytes, name: str) -> None:
collected.append((data, name))
_, _, images = read_pdf_file(
_load("with_image.pdf"), extract_images=True, image_callback=callback
)
# Callback received the image
assert len(collected) == 1
assert len(collected[0][0]) > 0
# Returned list is empty when callback is used
assert images == []
# ── pdf_to_text ──────────────────────────────────────────────────────────
class TestPdfToText:
def test_returns_text(self) -> None:
assert "Hello World" in pdf_to_text(_load("simple.pdf"))
def test_with_password(self) -> None:
assert "Secret Content" in pdf_to_text(
_load("encrypted.pdf"), pdf_pass="pass123"
)
def test_encrypted_without_password_returns_empty(self) -> None:
assert pdf_to_text(_load("encrypted.pdf")) == ""
# ── is_pdf_protected ─────────────────────────────────────────────────────
class TestIsPdfProtected:
def test_unprotected_pdf(self) -> None:
assert is_pdf_protected(_load("simple.pdf")) is False
def test_protected_pdf(self) -> None:
assert is_pdf_protected(_load("encrypted.pdf")) is True
def test_preserves_file_position(self) -> None:
pdf = _load("simple.pdf")
pdf.seek(42)
is_pdf_protected(pdf)
assert pdf.tell() == 42

View File

@@ -1,22 +0,0 @@
from typing import Any
import pytest
from onyx.db.enums import HookPoint
from onyx.hooks.points.base import HookPointSpec
def test_init_subclass_raises_for_missing_attrs() -> None:
with pytest.raises(TypeError, match="must define class attributes"):
class IncompleteSpec(HookPointSpec):
hook_point = HookPoint.QUERY_PROCESSING
# missing display_name, description, etc.
@property
def input_schema(self) -> dict[str, Any]:
return {}
@property
def output_schema(self) -> dict[str, Any]:
return {}

View File

@@ -1,86 +0,0 @@
import pytest
from pydantic import ValidationError
from onyx.db.enums import HookFailStrategy
from onyx.db.enums import HookPoint
from onyx.hooks.models import HookCreateRequest
from onyx.hooks.models import HookUpdateRequest
def test_hook_update_request_rejects_empty() -> None:
# No fields supplied at all
with pytest.raises(ValidationError, match="At least one field must be provided"):
HookUpdateRequest()
def test_hook_update_request_rejects_null_name_when_only_field() -> None:
# Explicitly setting name=None is rejected as name cannot be cleared
with pytest.raises(ValidationError, match="name cannot be cleared"):
HookUpdateRequest(name=None)
def test_hook_update_request_accepts_single_field() -> None:
req = HookUpdateRequest(name="new name")
assert req.name == "new name"
def test_hook_update_request_accepts_partial_fields() -> None:
req = HookUpdateRequest(fail_strategy=HookFailStrategy.SOFT, timeout_seconds=10.0)
assert req.fail_strategy == HookFailStrategy.SOFT
assert req.timeout_seconds == 10.0
assert req.name is None
def test_hook_update_request_rejects_null_name() -> None:
with pytest.raises(ValidationError, match="name cannot be cleared"):
HookUpdateRequest(name=None, fail_strategy=HookFailStrategy.SOFT)
def test_hook_update_request_rejects_empty_name() -> None:
with pytest.raises(ValidationError, match="name cannot be cleared"):
HookUpdateRequest(name="", fail_strategy=HookFailStrategy.SOFT)
def test_hook_update_request_rejects_null_endpoint_url() -> None:
with pytest.raises(ValidationError, match="endpoint_url cannot be cleared"):
HookUpdateRequest(endpoint_url=None, fail_strategy=HookFailStrategy.SOFT)
def test_hook_update_request_rejects_empty_endpoint_url() -> None:
with pytest.raises(ValidationError, match="endpoint_url cannot be cleared"):
HookUpdateRequest(endpoint_url="", fail_strategy=HookFailStrategy.SOFT)
def test_hook_update_request_allows_null_api_key() -> None:
# api_key=null is valid — means "clear the api key"
req = HookUpdateRequest(api_key=None)
assert req.api_key is None
assert "api_key" in req.model_fields_set
def test_hook_update_request_rejects_whitespace_name() -> None:
with pytest.raises(ValidationError, match="name cannot be cleared"):
HookUpdateRequest(name=" ", fail_strategy=HookFailStrategy.SOFT)
def test_hook_update_request_rejects_whitespace_endpoint_url() -> None:
with pytest.raises(ValidationError, match="endpoint_url cannot be cleared"):
HookUpdateRequest(endpoint_url=" ", fail_strategy=HookFailStrategy.SOFT)
def test_hook_create_request_rejects_whitespace_name() -> None:
with pytest.raises(ValidationError, match="whitespace-only"):
HookCreateRequest(
name=" ",
hook_point=HookPoint.QUERY_PROCESSING,
endpoint_url="https://example.com/hook",
)
def test_hook_create_request_rejects_whitespace_endpoint_url() -> None:
with pytest.raises(ValidationError, match="whitespace-only"):
HookCreateRequest(
name="my hook",
hook_point=HookPoint.QUERY_PROCESSING,
endpoint_url=" ",
)

View File

@@ -1,60 +0,0 @@
from onyx.db.enums import HookFailStrategy
from onyx.db.enums import HookPoint
from onyx.hooks.points.query_processing import QueryProcessingSpec
def test_hook_point_is_query_processing() -> None:
assert QueryProcessingSpec().hook_point == HookPoint.QUERY_PROCESSING
def test_default_fail_strategy_is_hard() -> None:
assert QueryProcessingSpec().default_fail_strategy == HookFailStrategy.HARD
def test_default_timeout_seconds() -> None:
# User is actively waiting — 5s is the documented contract for this hook point
assert QueryProcessingSpec().default_timeout_seconds == 5.0
def test_input_schema_required_fields() -> None:
schema = QueryProcessingSpec().input_schema
assert schema["type"] == "object"
required = schema["required"]
assert "query" in required
assert "user_email" in required
assert "chat_session_id" in required
def test_input_schema_chat_session_id_is_string() -> None:
props = QueryProcessingSpec().input_schema["properties"]
assert props["chat_session_id"]["type"] == "string"
def test_input_schema_query_is_string() -> None:
props = QueryProcessingSpec().input_schema["properties"]
assert props["query"]["type"] == "string"
def test_input_schema_user_email_is_nullable() -> None:
props = QueryProcessingSpec().input_schema["properties"]
assert "null" in props["user_email"]["type"]
def test_output_schema_query_is_required() -> None:
schema = QueryProcessingSpec().output_schema
assert "query" in schema["required"]
def test_output_schema_query_is_nullable() -> None:
# null means "reject the query"
props = QueryProcessingSpec().output_schema["properties"]
assert "null" in props["query"]["type"]
def test_output_schema_rejection_message_is_optional() -> None:
schema = QueryProcessingSpec().output_schema
assert "rejection_message" not in schema.get("required", [])
def test_input_schema_no_additional_properties() -> None:
assert QueryProcessingSpec().input_schema.get("additionalProperties") is False

View File

@@ -1,47 +0,0 @@
import pytest
from onyx.db.enums import HookPoint
from onyx.hooks import registry as registry_module
from onyx.hooks.registry import get_all_specs
from onyx.hooks.registry import get_hook_point_spec
from onyx.hooks.registry import validate_registry
def test_registry_covers_all_hook_points() -> None:
"""Every HookPoint enum member must have a registered spec."""
assert {s.hook_point for s in get_all_specs()} == set(
HookPoint
), f"Missing specs for: {set(HookPoint) - {s.hook_point for s in get_all_specs()}}"
def test_get_hook_point_spec_returns_correct_spec() -> None:
for hook_point in HookPoint:
spec = get_hook_point_spec(hook_point)
assert spec.hook_point == hook_point
def test_get_all_specs_returns_all() -> None:
specs = get_all_specs()
assert len(specs) == len(HookPoint)
assert {s.hook_point for s in specs} == set(HookPoint)
def test_get_hook_point_spec_raises_for_unregistered(
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""get_hook_point_spec raises ValueError when a hook point has no spec."""
monkeypatch.setattr(registry_module, "_REGISTRY", {})
with pytest.raises(ValueError, match="No spec registered for hook point"):
get_hook_point_spec(HookPoint.QUERY_PROCESSING)
def test_validate_registry_passes() -> None:
validate_registry() # should not raise with the real registry
def test_validate_registry_raises_for_incomplete(
monkeypatch: pytest.MonkeyPatch,
) -> None:
monkeypatch.setattr(registry_module, "_REGISTRY", {})
with pytest.raises(RuntimeError, match="Hook point\\(s\\) have no registered spec"):
validate_registry()

View File

@@ -256,6 +256,7 @@ def test_multiple_tool_calls(default_multi_llm: LitellmLLM) -> None:
{"role": "user", "content": "What's the weather and time in New York?"}
],
tools=tools,
tool_choice=None,
stream=True,
temperature=0.0, # Default value from GEN_AI_TEMPERATURE
timeout=30,
@@ -411,6 +412,7 @@ def test_multiple_tool_calls_streaming(default_multi_llm: LitellmLLM) -> None:
{"role": "user", "content": "What's the weather and time in New York?"}
],
tools=tools,
tool_choice=None,
stream=True,
temperature=0.0, # Default value from GEN_AI_TEMPERATURE
timeout=30,
@@ -1429,36 +1431,3 @@ def test_strip_tool_content_merges_consecutive_tool_results() -> None:
assert "sunny 72F" in merged
assert "tc_2" in merged
assert "headline news" in merged
def test_no_tool_choice_sent_when_no_tools(default_multi_llm: LitellmLLM) -> None:
"""Regression test for providers (e.g. Fireworks) that reject tool_choice=null.
When no tools are provided, tool_choice must not be forwarded to
litellm.completion() at all — not even as None.
"""
messages: LanguageModelInput = [UserMessage(content="Hello!")]
mock_stream_chunks = [
litellm.ModelResponse(
id="chatcmpl-123",
choices=[
litellm.Choices(
delta=_create_delta(role="assistant", content="Hello!"),
finish_reason="stop",
index=0,
)
],
model="gpt-3.5-turbo",
),
]
with patch("litellm.completion") as mock_completion:
mock_completion.return_value = mock_stream_chunks
default_multi_llm.invoke(messages, tools=None)
_, kwargs = mock_completion.call_args
assert (
"tool_choice" not in kwargs
), "tool_choice must not be sent to providers when no tools are provided"

View File

@@ -1,130 +0,0 @@
"""
Unit tests for vision model selection logging in get_default_llm_with_vision.
Verifies that operators get clear feedback about:
1. Which vision model was selected and why
2. When the default vision model doesn't support image input
3. When no vision-capable model exists at all
"""
from unittest.mock import MagicMock
from unittest.mock import patch
from onyx.llm.factory import get_default_llm_with_vision
_FACTORY = "onyx.llm.factory"
def _make_mock_model(
*,
name: str = "gpt-4o",
provider: str = "openai",
provider_id: int = 1,
flow_types: list[str] | None = None,
) -> MagicMock:
model = MagicMock()
model.name = name
model.llm_provider_id = provider_id
model.llm_provider.provider = provider
model.llm_model_flow_types = flow_types or []
return model
@patch(f"{_FACTORY}.get_session_with_current_tenant")
@patch(f"{_FACTORY}.fetch_default_vision_model")
@patch(f"{_FACTORY}.model_supports_image_input", return_value=True)
@patch(f"{_FACTORY}.llm_from_provider")
@patch(f"{_FACTORY}.LLMProviderView")
@patch(f"{_FACTORY}.logger")
def test_logs_when_using_default_vision_model(
mock_logger: MagicMock,
mock_provider_view: MagicMock, # noqa: ARG001
mock_llm_from: MagicMock, # noqa: ARG001
mock_supports: MagicMock, # noqa: ARG001
mock_fetch_default: MagicMock,
mock_session: MagicMock, # noqa: ARG001
) -> None:
mock_fetch_default.return_value = _make_mock_model(name="gpt-4o", provider="azure")
get_default_llm_with_vision()
mock_logger.info.assert_called_once()
log_msg = mock_logger.info.call_args[0][0]
assert "default vision model" in log_msg.lower()
@patch(f"{_FACTORY}.get_session_with_current_tenant")
@patch(f"{_FACTORY}.fetch_default_vision_model")
@patch(f"{_FACTORY}.model_supports_image_input", return_value=False)
@patch(f"{_FACTORY}.fetch_existing_models", return_value=[])
@patch(f"{_FACTORY}.logger")
def test_warns_when_default_model_lacks_vision(
mock_logger: MagicMock,
mock_fetch_models: MagicMock, # noqa: ARG001
mock_supports: MagicMock, # noqa: ARG001
mock_fetch_default: MagicMock,
mock_session: MagicMock, # noqa: ARG001
) -> None:
mock_fetch_default.return_value = _make_mock_model(
name="text-only-model", provider="azure"
)
result = get_default_llm_with_vision()
assert result is None
# Should have warned about the default model not supporting vision
warning_calls = [
call
for call in mock_logger.warning.call_args_list
if "does not support" in str(call)
]
assert len(warning_calls) >= 1
@patch(f"{_FACTORY}.get_session_with_current_tenant")
@patch(f"{_FACTORY}.fetch_default_vision_model", return_value=None)
@patch(f"{_FACTORY}.fetch_existing_models", return_value=[])
@patch(f"{_FACTORY}.logger")
def test_warns_when_no_models_exist(
mock_logger: MagicMock,
mock_fetch_models: MagicMock, # noqa: ARG001
mock_fetch_default: MagicMock, # noqa: ARG001
mock_session: MagicMock, # noqa: ARG001
) -> None:
result = get_default_llm_with_vision()
assert result is None
mock_logger.warning.assert_called_once()
log_msg = mock_logger.warning.call_args[0][0]
assert "no llm models" in log_msg.lower()
@patch(f"{_FACTORY}.get_session_with_current_tenant")
@patch(f"{_FACTORY}.fetch_default_vision_model", return_value=None)
@patch(f"{_FACTORY}.fetch_existing_models")
@patch(f"{_FACTORY}.model_supports_image_input", return_value=False)
@patch(f"{_FACTORY}.LLMProviderView")
@patch(f"{_FACTORY}.logger")
def test_warns_when_no_model_supports_vision(
mock_logger: MagicMock,
mock_provider_view: MagicMock, # noqa: ARG001
mock_supports: MagicMock, # noqa: ARG001
mock_fetch_models: MagicMock,
mock_fetch_default: MagicMock, # noqa: ARG001
mock_session: MagicMock, # noqa: ARG001
) -> None:
mock_fetch_models.return_value = [
_make_mock_model(name="text-model-1", provider="openai"),
_make_mock_model(name="text-model-2", provider="azure", provider_id=2),
]
result = get_default_llm_with_vision()
assert result is None
warning_calls = [
call
for call in mock_logger.warning.call_args_list
if "no vision-capable model" in str(call).lower()
]
assert len(warning_calls) == 1

View File

@@ -1,109 +0,0 @@
import io
import zipfile
from unittest.mock import MagicMock
from unittest.mock import patch
from zipfile import BadZipFile
import pytest
from fastapi import UploadFile
from starlette.datastructures import Headers
from onyx.configs.constants import FileOrigin
from onyx.server.documents.connector import upload_files
def _create_test_zip() -> bytes:
"""Create a simple in-memory zip file containing two text files."""
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
zf.writestr("file1.txt", "hello")
zf.writestr("file2.txt", "world")
return buf.getvalue()
def _make_upload_file(content: bytes, filename: str, content_type: str) -> UploadFile:
return UploadFile(
file=io.BytesIO(content),
filename=filename,
headers=Headers({"content-type": content_type}),
)
@patch("onyx.server.documents.connector.get_default_file_store")
def test_upload_zip_with_unzip_true_extracts_files(
mock_get_store: MagicMock,
) -> None:
"""When unzip=True (default), a zip upload is extracted into individual files."""
mock_store = MagicMock()
mock_store.save_file.side_effect = lambda **kwargs: f"id-{kwargs['display_name']}"
mock_get_store.return_value = mock_store
zip_bytes = _create_test_zip()
upload = _make_upload_file(zip_bytes, "test.zip", "application/zip")
result = upload_files([upload], FileOrigin.CONNECTOR)
# Should have extracted the two individual files, not stored the zip itself
assert len(result.file_paths) == 2
assert "id-file1.txt" in result.file_paths
assert "id-file2.txt" in result.file_paths
assert "file1.txt" in result.file_names
assert "file2.txt" in result.file_names
@patch("onyx.server.documents.connector.get_default_file_store")
def test_upload_zip_with_unzip_false_stores_zip_as_is(
mock_get_store: MagicMock,
) -> None:
"""When unzip=False, the zip file is stored as-is without extraction."""
mock_store = MagicMock()
mock_store.save_file.return_value = "zip-file-id"
mock_get_store.return_value = mock_store
zip_bytes = _create_test_zip()
upload = _make_upload_file(zip_bytes, "site_export.zip", "application/zip")
result = upload_files([upload], FileOrigin.CONNECTOR, unzip=False)
# Should store exactly one file (the zip itself)
assert len(result.file_paths) == 1
assert result.file_paths[0] == "zip-file-id"
assert result.file_names == ["site_export.zip"]
# No zip metadata should be created
assert result.zip_metadata_file_id is None
# Verify the stored content is a valid zip
saved_content: io.BytesIO = mock_store.save_file.call_args[1]["content"]
saved_content.seek(0)
with zipfile.ZipFile(saved_content, "r") as zf:
assert set(zf.namelist()) == {"file1.txt", "file2.txt"}
@patch("onyx.server.documents.connector.get_default_file_store")
def test_upload_invalid_zip_with_unzip_false_raises(
mock_get_store: MagicMock,
) -> None:
"""An invalid zip is rejected even when unzip=False (validation still runs)."""
mock_get_store.return_value = MagicMock()
bad_zip = _make_upload_file(b"not a zip", "bad.zip", "application/zip")
with pytest.raises(BadZipFile):
upload_files([bad_zip], FileOrigin.CONNECTOR, unzip=False)
@patch("onyx.server.documents.connector.get_default_file_store")
def test_upload_multiple_zips_rejected_when_unzip_false(
mock_get_store: MagicMock,
) -> None:
"""The seen_zip guard rejects a second zip even when unzip=False."""
mock_store = MagicMock()
mock_store.save_file.return_value = "zip-id"
mock_get_store.return_value = mock_store
zip_bytes = _create_test_zip()
zip1 = _make_upload_file(zip_bytes, "a.zip", "application/zip")
zip2 = _make_upload_file(zip_bytes, "b.zip", "application/zip")
with pytest.raises(Exception, match="Only one zip file"):
upload_files([zip1, zip2], FileOrigin.CONNECTOR, unzip=False)

View File

@@ -1,208 +0,0 @@
"""Unit tests for PythonTool file-upload caching.
Verifies that PythonTool reuses code-interpreter file IDs across multiple
run() calls within the same session instead of re-uploading identical content
on every agent loop iteration.
"""
from unittest.mock import MagicMock
from unittest.mock import patch
from onyx.tools.models import ChatFile
from onyx.tools.models import PythonToolOverrideKwargs
from onyx.tools.tool_implementations.python.code_interpreter_client import (
StreamResultEvent,
)
from onyx.tools.tool_implementations.python.python_tool import PythonTool
TOOL_MODULE = "onyx.tools.tool_implementations.python.python_tool"
def _make_stream_result() -> StreamResultEvent:
return StreamResultEvent(
exit_code=0,
timed_out=False,
duration_ms=10,
files=[],
)
def _make_tool() -> PythonTool:
emitter = MagicMock()
return PythonTool(tool_id=1, emitter=emitter)
def _make_override(files: list[ChatFile]) -> PythonToolOverrideKwargs:
return PythonToolOverrideKwargs(chat_files=files)
def _run_tool(tool: PythonTool, mock_client: MagicMock, files: list[ChatFile]) -> None:
"""Call tool.run() with a mocked CodeInterpreterClient context manager."""
from onyx.server.query_and_chat.placement import Placement
mock_client.execute_streaming.return_value = iter([_make_stream_result()])
ctx = MagicMock()
ctx.__enter__ = MagicMock(return_value=mock_client)
ctx.__exit__ = MagicMock(return_value=False)
placement = Placement(turn_index=0, tab_index=0)
override = _make_override(files)
with patch(f"{TOOL_MODULE}.CodeInterpreterClient", return_value=ctx):
tool.run(placement=placement, override_kwargs=override, code="print('hi')")
# ---------------------------------------------------------------------------
# Cache hit: same content uploaded in a second call reuses the file_id
# ---------------------------------------------------------------------------
@patch(f"{TOOL_MODULE}.CODE_INTERPRETER_BASE_URL", "http://fake:8000")
def test_same_file_uploaded_only_once_across_two_runs() -> None:
tool = _make_tool()
client = MagicMock()
client.upload_file.return_value = "file-id-abc"
pptx_content = b"fake pptx bytes"
files = [ChatFile(filename="report.pptx", content=pptx_content)]
_run_tool(tool, client, files)
_run_tool(tool, client, files)
# upload_file should only have been called once across both runs
client.upload_file.assert_called_once_with(pptx_content, "report.pptx")
@patch(f"{TOOL_MODULE}.CODE_INTERPRETER_BASE_URL", "http://fake:8000")
def test_cached_file_id_is_staged_on_second_run() -> None:
tool = _make_tool()
client = MagicMock()
client.upload_file.return_value = "file-id-abc"
files = [ChatFile(filename="data.pptx", content=b"content")]
_run_tool(tool, client, files)
# On the second run, execute_streaming should still receive the file
client.execute_streaming.return_value = iter([_make_stream_result()])
ctx = MagicMock()
ctx.__enter__ = MagicMock(return_value=client)
ctx.__exit__ = MagicMock(return_value=False)
from onyx.server.query_and_chat.placement import Placement
placement = Placement(turn_index=1, tab_index=0)
with patch(f"{TOOL_MODULE}.CodeInterpreterClient", return_value=ctx):
tool.run(
placement=placement,
override_kwargs=_make_override(files),
code="print('hi')",
)
# The second execute_streaming call should include the file
_, kwargs = client.execute_streaming.call_args
staged_files = kwargs.get("files") or []
assert any(f["file_id"] == "file-id-abc" for f in staged_files)
# ---------------------------------------------------------------------------
# Cache miss: different content triggers a new upload
# ---------------------------------------------------------------------------
@patch(f"{TOOL_MODULE}.CODE_INTERPRETER_BASE_URL", "http://fake:8000")
def test_different_file_content_uploaded_separately() -> None:
tool = _make_tool()
client = MagicMock()
client.upload_file.side_effect = ["file-id-v1", "file-id-v2"]
file_v1 = ChatFile(filename="report.pptx", content=b"version 1")
file_v2 = ChatFile(filename="report.pptx", content=b"version 2")
_run_tool(tool, client, [file_v1])
_run_tool(tool, client, [file_v2])
assert client.upload_file.call_count == 2
@patch(f"{TOOL_MODULE}.CODE_INTERPRETER_BASE_URL", "http://fake:8000")
def test_multiple_distinct_files_each_uploaded_once() -> None:
tool = _make_tool()
client = MagicMock()
client.upload_file.side_effect = ["id-a", "id-b"]
files = [
ChatFile(filename="a.pptx", content=b"aaa"),
ChatFile(filename="b.xlsx", content=b"bbb"),
]
_run_tool(tool, client, files)
_run_tool(tool, client, files)
# Two distinct files — each uploaded exactly once
assert client.upload_file.call_count == 2
@patch(f"{TOOL_MODULE}.CODE_INTERPRETER_BASE_URL", "http://fake:8000")
def test_same_content_different_filename_uploaded_separately() -> None:
# Identical bytes but different names must each get their own upload slot
# so both files appear under their respective paths in the workspace.
tool = _make_tool()
client = MagicMock()
client.upload_file.side_effect = ["id-v1", "id-v2"]
same_bytes = b"shared content"
files = [
ChatFile(filename="report_v1.csv", content=same_bytes),
ChatFile(filename="report_v2.csv", content=same_bytes),
]
_run_tool(tool, client, files)
assert client.upload_file.call_count == 2
# ---------------------------------------------------------------------------
# No cross-instance sharing: a fresh PythonTool re-uploads everything
# ---------------------------------------------------------------------------
@patch(f"{TOOL_MODULE}.CODE_INTERPRETER_BASE_URL", "http://fake:8000")
def test_new_tool_instance_re_uploads_file() -> None:
client = MagicMock()
client.upload_file.side_effect = ["id-session-1", "id-session-2"]
files = [ChatFile(filename="deck.pptx", content=b"slide data")]
tool_session_1 = _make_tool()
_run_tool(tool_session_1, client, files)
tool_session_2 = _make_tool()
_run_tool(tool_session_2, client, files)
# Different instances — each uploads independently
assert client.upload_file.call_count == 2
# ---------------------------------------------------------------------------
# Upload failure: failed upload is not cached, retried next run
# ---------------------------------------------------------------------------
@patch(f"{TOOL_MODULE}.CODE_INTERPRETER_BASE_URL", "http://fake:8000")
def test_upload_failure_not_cached() -> None:
tool = _make_tool()
client = MagicMock()
# First call raises, second succeeds
client.upload_file.side_effect = [Exception("network error"), "file-id-ok"]
files = [ChatFile(filename="slides.pptx", content=b"data")]
# First run — upload fails, file is skipped but not cached
_run_tool(tool, client, files)
# Second run — should attempt upload again
_run_tool(tool, client, files)
assert client.upload_file.call_count == 2

File diff suppressed because it is too large Load Diff

View File

@@ -207,16 +207,6 @@ prompt_yn_or_default() {
fi
}
confirm_action() {
local description="$1"
prompt_yn_or_default "Install ${description}? (Y/n) [default: Y] " "Y"
if [[ "$REPLY" =~ ^[Nn] ]]; then
print_warning "Skipping: ${description}"
return 1
fi
return 0
}
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
@@ -405,11 +395,6 @@ fi
if ! command -v docker &> /dev/null; then
if [[ "$OSTYPE" == "linux-gnu"* ]] || [[ -n "${WSL_DISTRO_NAME:-}" ]]; then
print_info "Docker is required but not installed."
if ! confirm_action "Docker Engine"; then
print_error "Docker is required to run Onyx."
exit 1
fi
install_docker_linux
if ! command -v docker &> /dev/null; then
print_error "Docker installation failed."
@@ -426,11 +411,7 @@ if command -v docker &> /dev/null \
&& ! command -v docker-compose &> /dev/null \
&& { [[ "$OSTYPE" == "linux-gnu"* ]] || [[ -n "${WSL_DISTRO_NAME:-}" ]]; }; then
print_info "Docker Compose is required but not installed."
if ! confirm_action "Docker Compose plugin"; then
print_error "Docker Compose is required to run Onyx."
exit 1
fi
print_info "Docker Compose not found — installing plugin..."
COMPOSE_ARCH="$(uname -m)"
COMPOSE_URL="https://github.com/docker/compose/releases/latest/download/docker-compose-linux-${COMPOSE_ARCH}"
COMPOSE_DIR="/usr/local/lib/docker/cli-plugins"
@@ -581,31 +562,10 @@ version_compare() {
# Check Docker daemon
if ! docker info &> /dev/null; then
if [[ "$OSTYPE" == "darwin"* ]]; then
print_info "Docker daemon is not running. Starting Docker Desktop..."
open -a Docker
# Wait up to 120 seconds for Docker to be ready
DOCKER_WAIT=0
DOCKER_MAX_WAIT=120
while ! docker info &> /dev/null; do
if [ $DOCKER_WAIT -ge $DOCKER_MAX_WAIT ]; then
print_error "Docker Desktop did not start within ${DOCKER_MAX_WAIT} seconds."
print_info "Please start Docker Desktop manually and re-run this script."
exit 1
fi
printf "\r\033[KWaiting for Docker Desktop to start... (%ds)" "$DOCKER_WAIT"
sleep 2
DOCKER_WAIT=$((DOCKER_WAIT + 2))
done
echo ""
print_success "Docker Desktop is now running"
else
print_error "Docker daemon is not running. Please start Docker."
exit 1
fi
else
print_success "Docker daemon is running"
print_error "Docker daemon is not running. Please start Docker."
exit 1
fi
print_success "Docker daemon is running"
# Check Docker resources
print_step "Verifying Docker resources"
@@ -785,7 +745,6 @@ print_success "All configuration files ready"
# Set up deployment configuration
print_step "Setting up deployment configs"
ENV_FILE="${INSTALL_ROOT}/deployment/.env"
ENV_TEMPLATE="${INSTALL_ROOT}/deployment/env.template"
# Check if services are already running
if [ -d "${INSTALL_ROOT}/deployment" ] && [ -f "${INSTALL_ROOT}/deployment/docker-compose.yml" ]; then
# Determine compose command
@@ -1125,25 +1084,6 @@ else
USE_LATEST=false
fi
# For pinned version tags, re-download config files from that tag so the
# compose file matches the images being pulled (the initial download used main).
if [[ "$USE_LATEST" = false ]] && [[ "$USE_LOCAL_FILES" = false ]]; then
PINNED_BASE="https://raw.githubusercontent.com/onyx-dot-app/onyx/${CURRENT_IMAGE_TAG}/deployment"
print_info "Fetching config files matching tag ${CURRENT_IMAGE_TAG}..."
if download_file "${PINNED_BASE}/docker_compose/docker-compose.yml" "${INSTALL_ROOT}/deployment/docker-compose.yml" 2>/dev/null; then
download_file "${PINNED_BASE}/data/nginx/app.conf.template" "${INSTALL_ROOT}/data/nginx/app.conf.template" 2>/dev/null || true
download_file "${PINNED_BASE}/data/nginx/run-nginx.sh" "${INSTALL_ROOT}/data/nginx/run-nginx.sh" 2>/dev/null || true
chmod +x "${INSTALL_ROOT}/data/nginx/run-nginx.sh"
if [[ "$LITE_MODE" = true ]]; then
download_file "${PINNED_BASE}/docker_compose/${LITE_COMPOSE_FILE}" \
"${INSTALL_ROOT}/deployment/${LITE_COMPOSE_FILE}" 2>/dev/null || true
fi
print_success "Config files updated to match ${CURRENT_IMAGE_TAG}"
else
print_warning "Tag ${CURRENT_IMAGE_TAG} not found on GitHub — using main branch configs"
fi
fi
# Pull Docker images with reduced output
print_step "Pulling Docker images"
print_info "This may take several minutes depending on your internet connection..."

View File

@@ -8,7 +8,7 @@
"name": "widget",
"version": "0.1.0",
"dependencies": {
"next": "^16.1.7",
"next": "^16.1.5",
"react": "^19",
"react-dom": "^19",
"react-markdown": "^10.1.0"
@@ -1023,9 +1023,9 @@
}
},
"node_modules/@next/env": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/env/-/env-16.1.7.tgz",
"integrity": "sha512-rJJbIdJB/RQr2F1nylZr/PJzamvNNhfr3brdKP6s/GW850jbtR70QlSfFselvIBbcPUOlQwBakexjFzqLzF6pg==",
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/env/-/env-16.1.5.tgz",
"integrity": "sha512-CRSCPJiSZoi4Pn69RYBDI9R7YK2g59vLexPQFXY0eyw+ILevIenCywzg+DqmlBik9zszEnw2HLFOUlLAcJbL7g==",
"license": "MIT"
},
"node_modules/@next/eslint-plugin-next": {
@@ -1039,9 +1039,9 @@
}
},
"node_modules/@next/swc-darwin-arm64": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-16.1.7.tgz",
"integrity": "sha512-b2wWIE8sABdyafc4IM8r5Y/dS6kD80JRtOGrUiKTsACFQfWWgUQ2NwoUX1yjFMXVsAwcQeNpnucF2ZrujsBBPg==",
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-16.1.5.tgz",
"integrity": "sha512-eK7Wdm3Hjy/SCL7TevlH0C9chrpeOYWx2iR7guJDaz4zEQKWcS1IMVfMb9UKBFMg1XgzcPTYPIp1Vcpukkjg6Q==",
"cpu": [
"arm64"
],
@@ -1055,9 +1055,9 @@
}
},
"node_modules/@next/swc-darwin-x64": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-16.1.7.tgz",
"integrity": "sha512-zcnVaaZulS1WL0Ss38R5Q6D2gz7MtBu8GZLPfK+73D/hp4GFMrC2sudLky1QibfV7h6RJBJs/gOFvYP0X7UVlQ==",
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-16.1.5.tgz",
"integrity": "sha512-foQscSHD1dCuxBmGkbIr6ScAUF6pRoDZP6czajyvmXPAOFNnQUJu2Os1SGELODjKp/ULa4fulnBWoHV3XdPLfA==",
"cpu": [
"x64"
],
@@ -1071,9 +1071,9 @@
}
},
"node_modules/@next/swc-linux-arm64-gnu": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-16.1.7.tgz",
"integrity": "sha512-2ant89Lux/Q3VyC8vNVg7uBaFVP9SwoK2jJOOR0L8TQnX8CAYnh4uctAScy2Hwj2dgjVHqHLORQZJ2wH6VxhSQ==",
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-16.1.5.tgz",
"integrity": "sha512-qNIb42o3C02ccIeSeKjacF3HXotGsxh/FMk/rSRmCzOVMtoWH88odn2uZqF8RLsSUWHcAqTgYmPD3pZ03L9ZAA==",
"cpu": [
"arm64"
],
@@ -1087,9 +1087,9 @@
}
},
"node_modules/@next/swc-linux-arm64-musl": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-16.1.7.tgz",
"integrity": "sha512-uufcze7LYv0FQg9GnNeZ3/whYfo+1Q3HnQpm16o6Uyi0OVzLlk2ZWoY7j07KADZFY8qwDbsmFnMQP3p3+Ftprw==",
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-16.1.5.tgz",
"integrity": "sha512-U+kBxGUY1xMAzDTXmuVMfhaWUZQAwzRaHJ/I6ihtR5SbTVUEaDRiEU9YMjy1obBWpdOBuk1bcm+tsmifYSygfw==",
"cpu": [
"arm64"
],
@@ -1103,9 +1103,9 @@
}
},
"node_modules/@next/swc-linux-x64-gnu": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-16.1.7.tgz",
"integrity": "sha512-KWVf2gxYvHtvuT+c4MBOGxuse5TD7DsMFYSxVxRBnOzok/xryNeQSjXgxSv9QpIVlaGzEn/pIuI6Koosx8CGWA==",
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-16.1.5.tgz",
"integrity": "sha512-gq2UtoCpN7Ke/7tKaU7i/1L7eFLfhMbXjNghSv0MVGF1dmuoaPeEVDvkDuO/9LVa44h5gqpWeJ4mRRznjDv7LA==",
"cpu": [
"x64"
],
@@ -1119,9 +1119,9 @@
}
},
"node_modules/@next/swc-linux-x64-musl": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-16.1.7.tgz",
"integrity": "sha512-HguhaGwsGr1YAGs68uRKc4aGWxLET+NevJskOcCAwXbwj0fYX0RgZW2gsOCzr9S11CSQPIkxmoSbuVaBp4Z3dA==",
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-16.1.5.tgz",
"integrity": "sha512-bQWSE729PbXT6mMklWLf8dotislPle2L70E9q6iwETYEOt092GDn0c+TTNj26AjmeceSsC4ndyGsK5nKqHYXjQ==",
"cpu": [
"x64"
],
@@ -1135,9 +1135,9 @@
}
},
"node_modules/@next/swc-win32-arm64-msvc": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-16.1.7.tgz",
"integrity": "sha512-S0n3KrDJokKTeFyM/vGGGR8+pCmXYrjNTk2ZozOL1C/JFdfUIL9O1ATaJOl5r2POe56iRChbsszrjMAdWSv7kQ==",
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-16.1.5.tgz",
"integrity": "sha512-LZli0anutkIllMtTAWZlDqdfvjWX/ch8AFK5WgkNTvaqwlouiD1oHM+WW8RXMiL0+vAkAJyAGEzPPjO+hnrSNQ==",
"cpu": [
"arm64"
],
@@ -1151,9 +1151,9 @@
}
},
"node_modules/@next/swc-win32-x64-msvc": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-16.1.7.tgz",
"integrity": "sha512-mwgtg8CNZGYm06LeEd+bNnOUfwOyNem/rOiP14Lsz+AnUY92Zq/LXwtebtUiaeVkhbroRCQ0c8GlR4UT1U+0yg==",
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-16.1.5.tgz",
"integrity": "sha512-7is37HJTNQGhjPpQbkKjKEboHYQnCgpVt/4rBrrln0D9nderNxZ8ZWs8w1fAtzUx7wEyYjQ+/13myFgFj6K2Ng==",
"cpu": [
"x64"
],
@@ -2564,15 +2564,12 @@
"dev": true
},
"node_modules/baseline-browser-mapping": {
"version": "2.10.8",
"resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.10.8.tgz",
"integrity": "sha512-PCLz/LXGBsNTErbtB6i5u4eLpHeMfi93aUv5duMmj6caNu6IphS4q6UevDnL36sZQv9lrP11dbPKGMaXPwMKfQ==",
"version": "2.9.14",
"resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.14.tgz",
"integrity": "sha512-B0xUquLkiGLgHhpPBqvl7GWegWBUNuujQ6kXd/r1U38ElPT6Ok8KZ8e+FpUGEc2ZoRQUzq/aUnaKFc/svWUGSg==",
"license": "Apache-2.0",
"bin": {
"baseline-browser-mapping": "dist/cli.cjs"
},
"engines": {
"node": ">=6.0.0"
"baseline-browser-mapping": "dist/cli.js"
}
},
"node_modules/brace-expansion": {
@@ -5929,14 +5926,14 @@
"dev": true
},
"node_modules/next": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/next/-/next-16.1.7.tgz",
"integrity": "sha512-WM0L7WrSvKwoLegLYr6V+mz+RIofqQgVAfHhMp9a88ms0cFX8iX9ew+snpWlSBwpkURJOUdvCEt3uLl3NNzvWg==",
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/next/-/next-16.1.5.tgz",
"integrity": "sha512-f+wE+NSbiQgh3DSAlTaw2FwY5yGdVViAtp8TotNQj4kk4Q8Bh1sC/aL9aH+Rg1YAVn18OYXsRDT7U/079jgP7w==",
"license": "MIT",
"dependencies": {
"@next/env": "16.1.7",
"@next/env": "16.1.5",
"@swc/helpers": "0.5.15",
"baseline-browser-mapping": "^2.9.19",
"baseline-browser-mapping": "^2.8.3",
"caniuse-lite": "^1.0.30001579",
"postcss": "8.4.31",
"styled-jsx": "5.1.6"
@@ -5948,14 +5945,14 @@
"node": ">=20.9.0"
},
"optionalDependencies": {
"@next/swc-darwin-arm64": "16.1.7",
"@next/swc-darwin-x64": "16.1.7",
"@next/swc-linux-arm64-gnu": "16.1.7",
"@next/swc-linux-arm64-musl": "16.1.7",
"@next/swc-linux-x64-gnu": "16.1.7",
"@next/swc-linux-x64-musl": "16.1.7",
"@next/swc-win32-arm64-msvc": "16.1.7",
"@next/swc-win32-x64-msvc": "16.1.7",
"@next/swc-darwin-arm64": "16.1.5",
"@next/swc-darwin-x64": "16.1.5",
"@next/swc-linux-arm64-gnu": "16.1.5",
"@next/swc-linux-arm64-musl": "16.1.5",
"@next/swc-linux-x64-gnu": "16.1.5",
"@next/swc-linux-x64-musl": "16.1.5",
"@next/swc-win32-arm64-msvc": "16.1.5",
"@next/swc-win32-x64-msvc": "16.1.5",
"sharp": "^0.34.4"
},
"peerDependencies": {

View File

@@ -9,7 +9,7 @@
"lint": "next lint"
},
"dependencies": {
"next": "^16.1.7",
"next": "^16.1.5",
"react": "^19",
"react-dom": "^19",
"react-markdown": "^10.1.0"

View File

@@ -92,7 +92,7 @@ backend = [
"python-gitlab==5.6.0",
"python-pptx==0.6.23",
"pypandoc_binary==1.16.2",
"pypdf==6.9.1",
"pypdf==6.8.0",
"pytest-mock==3.12.0",
"pytest-playwright==0.7.0",
"python-docx==1.1.2",
@@ -245,7 +245,6 @@ select = [
"ARG",
"E",
"F",
"S324",
"W",
]

View File

@@ -17,7 +17,6 @@ import (
type RunCIOptions struct {
DryRun bool
Yes bool
Rerun bool
}
// NewRunCICommand creates a new run-ci command
@@ -50,7 +49,6 @@ Example usage:
cmd.Flags().BoolVar(&opts.DryRun, "dry-run", false, "Perform all local operations but skip pushing to remote and creating PRs")
cmd.Flags().BoolVar(&opts.Yes, "yes", false, "Skip confirmation prompts and automatically proceed")
cmd.Flags().BoolVar(&opts.Rerun, "rerun", false, "Update an existing CI PR with the latest fork changes to re-trigger CI")
return cmd
}
@@ -109,44 +107,19 @@ func runCI(cmd *cobra.Command, args []string, opts *RunCIOptions) {
log.Fatalf("PR #%s is not from a fork - CI should already run automatically", prNumber)
}
// Create the CI branch
ciBranch := fmt.Sprintf("run-ci/%s", prNumber)
prTitle := fmt.Sprintf("chore: [Running GitHub actions for #%s]", prNumber)
prBody := fmt.Sprintf("This PR runs GitHub Actions CI for #%s.\n\n- [x] Override Linear Check\n\n**This PR should be closed (not merged) after CI completes.**", prNumber)
// Check if a CI PR already exists for this branch
existingPRURL, err := findExistingCIPR(ciBranch)
if err != nil {
log.Fatalf("Failed to check for existing CI PR: %v", err)
}
if existingPRURL != "" && !opts.Rerun {
log.Infof("A CI PR already exists for #%s: %s", prNumber, existingPRURL)
log.Info("Run with --rerun to update it with the latest fork changes and re-trigger CI.")
return
}
if opts.Rerun && existingPRURL == "" {
log.Warn("--rerun was specified but no existing open CI PR was found. A new PR will be created.")
}
if existingPRURL != "" && opts.Rerun {
log.Infof("Existing CI PR found: %s", existingPRURL)
log.Info("Will update the CI branch with the latest fork changes to re-trigger CI.")
}
// Confirm before proceeding
if !opts.Yes {
action := "Create CI branch"
if existingPRURL != "" {
action = "Update existing CI branch"
}
if !prompt.Confirm(fmt.Sprintf("%s for PR #%s? (yes/no): ", action, prNumber)) {
if !prompt.Confirm(fmt.Sprintf("Create CI branch for PR #%s? (yes/no): ", prNumber)) {
log.Info("Exiting...")
return
}
}
// Create the CI branch
ciBranch := fmt.Sprintf("run-ci/%s", prNumber)
prTitle := fmt.Sprintf("chore: [Running GitHub actions for #%s]", prNumber)
prBody := fmt.Sprintf("This PR runs GitHub Actions CI for #%s.\n\n- [x] Override Linear Check\n\n**This PR should be closed (not merged) after CI completes.**", prNumber)
// Fetch the fork's branch
if forkRepo == "" {
log.Fatalf("Could not determine fork repository - headRepositoryOwner or headRepository.name is empty")
@@ -185,11 +158,7 @@ func runCI(cmd *cobra.Command, args []string, opts *RunCIOptions) {
if opts.DryRun {
log.Warnf("[DRY RUN] Would push CI branch: %s", ciBranch)
if existingPRURL == "" {
log.Warnf("[DRY RUN] Would create PR: %s", prTitle)
} else {
log.Warnf("[DRY RUN] Would update existing PR: %s", existingPRURL)
}
log.Warnf("[DRY RUN] Would create PR: %s", prTitle)
// Switch back to original branch
if err := git.RunCommand("switch", "--quiet", originalBranch); err != nil {
log.Warnf("Failed to switch back to original branch: %v", err)
@@ -207,17 +176,6 @@ func runCI(cmd *cobra.Command, args []string, opts *RunCIOptions) {
log.Fatalf("Failed to push CI branch: %v", err)
}
if existingPRURL != "" {
// PR already exists - force push is enough to re-trigger CI
log.Infof("Switching back to original branch: %s", originalBranch)
if err := git.RunCommand("switch", "--quiet", originalBranch); err != nil {
log.Warnf("Failed to switch back to original branch: %v", err)
}
log.Infof("CI PR updated successfully: %s", existingPRURL)
log.Info("The force push will re-trigger CI. Remember to close (not merge) this PR after CI completes!")
return
}
// Create PR using GitHub CLI
log.Info("Creating PR...")
prURL, err := createCIPR(ciBranch, prInfo.BaseRefName, prTitle, prBody)
@@ -259,39 +217,6 @@ func getPRInfo(prNumber string) (*PRInfo, error) {
return &prInfo, nil
}
// findExistingCIPR checks if an open PR already exists for the given CI branch.
// Returns the PR URL if found, or empty string if not.
func findExistingCIPR(headBranch string) (string, error) {
cmd := exec.Command("gh", "pr", "list",
"--head", headBranch,
"--state", "open",
"--json", "url",
)
output, err := cmd.Output()
if err != nil {
if exitErr, ok := err.(*exec.ExitError); ok {
return "", fmt.Errorf("%w: %s", err, string(exitErr.Stderr))
}
return "", err
}
var prs []struct {
URL string `json:"url"`
}
if err := json.Unmarshal(output, &prs); err != nil {
log.Debugf("Failed to parse PR list JSON: %v (raw: %s)", err, string(output))
return "", fmt.Errorf("failed to parse PR list: %w", err)
}
if len(prs) == 0 {
log.Debugf("No existing open PRs found for branch %s", headBranch)
return "", nil
}
log.Debugf("Found existing PR for branch %s: %s", headBranch, prs[0].URL)
return prs[0].URL, nil
}
// createCIPR creates a pull request for CI using the GitHub CLI
func createCIPR(headBranch, baseBranch, title, body string) (string, error) {
cmd := exec.Command("gh", "pr", "create",

8
uv.lock generated
View File

@@ -4481,7 +4481,7 @@ requires-dist = [
{ name = "pygithub", marker = "extra == 'backend'", specifier = "==2.5.0" },
{ name = "pympler", marker = "extra == 'backend'", specifier = "==1.1" },
{ name = "pypandoc-binary", marker = "extra == 'backend'", specifier = "==1.16.2" },
{ name = "pypdf", marker = "extra == 'backend'", specifier = "==6.9.1" },
{ name = "pypdf", marker = "extra == 'backend'", specifier = "==6.8.0" },
{ name = "pytest", marker = "extra == 'dev'", specifier = "==8.3.5" },
{ name = "pytest-alembic", marker = "extra == 'dev'", specifier = "==0.12.1" },
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = "==1.3.0" },
@@ -5727,11 +5727,11 @@ wheels = [
[[package]]
name = "pypdf"
version = "6.9.1"
version = "6.8.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/f9/fb/dc2e8cb006e80b0020ed20d8649106fe4274e82d8e756ad3e24ade19c0df/pypdf-6.9.1.tar.gz", hash = "sha256:ae052407d33d34de0c86c5c729be6d51010bf36e03035a8f23ab449bca52377d", size = 5311551, upload-time = "2026-03-17T10:46:07.876Z" }
sdist = { url = "https://files.pythonhosted.org/packages/b4/a3/e705b0805212b663a4c27b861c8a603dba0f8b4bb281f96f8e746576a50d/pypdf-6.8.0.tar.gz", hash = "sha256:cb7eaeaa4133ce76f762184069a854e03f4d9a08568f0e0623f7ea810407833b", size = 5307831, upload-time = "2026-03-09T13:37:40.591Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f9/f4/75543fa802b86e72f87e9395440fe1a89a6d149887e3e55745715c3352ac/pypdf-6.9.1-py3-none-any.whl", hash = "sha256:f35a6a022348fae47e092a908339a8f3dc993510c026bb39a96718fc7185e89f", size = 333661, upload-time = "2026-03-17T10:46:06.286Z" },
{ url = "https://files.pythonhosted.org/packages/8c/ec/4ccf3bb86b1afe5d7176e1c8abcdbf22b53dd682ec2eda50e1caadcf6846/pypdf-6.8.0-py3-none-any.whl", hash = "sha256:2a025080a8dd73f48123c89c57174a5ff3806c71763ee4e49572dc90454943c7", size = 332177, upload-time = "2026-03-09T13:37:38.774Z" },
]
[[package]]

View File

@@ -1,3 +1,4 @@
import "@opal/components/buttons/button/styles.css";
import "@opal/components/tooltip.css";
import { Interactive, type InteractiveStatelessProps } from "@opal/core";
import type { ContainerSizeVariants, ExtremaSizeVariants } from "@opal/types";
@@ -66,7 +67,7 @@ function Button({
const labelEl = children ? (
<span
className={cn(
"whitespace-nowrap",
"opal-button-label",
isLarge ? "font-main-ui-body " : "font-secondary-body",
responsiveHideText && "hidden md:inline"
)}
@@ -86,7 +87,7 @@ function Button({
isLarge ? "default" : size === "2xs" ? "mini" : "compact"
}
>
<div className="flex flex-row items-center gap-1 interactive-foreground">
<div className={cn("opal-button interactive-foreground")}>
{iconWrapper(Icon, size, !!children)}
{labelEl}

View File

@@ -0,0 +1,9 @@
/* Button — layout only; colors handled by Interactive.Stateless */
.opal-button {
@apply flex flex-row items-center gap-1;
}
.opal-button-label {
@apply whitespace-nowrap;
}

View File

@@ -118,7 +118,7 @@ function OpenButton({
const labelEl = children ? (
<span
className={cn(
"whitespace-nowrap",
"opal-button-label whitespace-nowrap",
isLarge ? "font-main-ui-body" : "font-secondary-body"
)}
>
@@ -143,7 +143,7 @@ function OpenButton({
>
<div
className={cn(
"interactive-foreground flex flex-row items-center",
"opal-button interactive-foreground flex flex-row items-center",
justifyContent === "between" ? "w-full justify-between" : "gap-1",
foldable &&
justifyContent !== "between" &&

View File

@@ -52,8 +52,3 @@ export {
type PaginationProps,
type PaginationSize,
} from "@opal/components/pagination/components";
/* Table */
export { Table } from "@opal/components/table/components";
export { createTableColumns } from "@opal/components/table/columns";
export type { DataTableProps } from "@opal/components/table/components";

View File

@@ -1,82 +0,0 @@
# Table
Config-driven table component with sorting, pagination, column visibility,
row selection, drag-and-drop reordering, and server-side mode.
## Usage
```tsx
import { Table, createTableColumns } from "@opal/components";
interface User {
id: string;
email: string;
name: string | null;
status: "active" | "invited";
}
const tc = createTableColumns<User>();
const columns = [
tc.qualifier({ content: "avatar-user", getInitials: (r) => r.name?.[0] ?? "?" }),
tc.column("email", {
header: "Name",
weight: 22,
minWidth: 140,
cell: (email, row) => <span>{row.name ?? email}</span>,
}),
tc.column("status", {
header: "Status",
weight: 14,
cell: (status) => <span>{status}</span>,
}),
tc.actions(),
];
function UsersTable({ users }: { users: User[] }) {
return (
<Table
data={users}
columns={columns}
getRowId={(r) => r.id}
pageSize={10}
footer={{ mode: "summary" }}
/>
);
}
```
## Props
| Prop | Type | Default | Description |
|------|------|---------|-------------|
| `data` | `TData[]` | required | Row data array |
| `columns` | `OnyxColumnDef<TData>[]` | required | Column definitions from `createTableColumns()` |
| `getRowId` | `(row: TData) => string` | required | Unique row identifier |
| `pageSize` | `number` | `10` | Rows per page (`Infinity` disables pagination) |
| `size` | `"md" \| "lg"` | `"lg"` | Density variant |
| `footer` | `DataTableFooterConfig` | — | Footer mode (`"selection"` or `"summary"`) |
| `initialSorting` | `SortingState` | — | Initial sort state |
| `initialColumnVisibility` | `VisibilityState` | — | Initial column visibility |
| `draggable` | `DataTableDraggableConfig` | — | Enable drag-and-drop reordering |
| `onSelectionChange` | `(ids: string[]) => void` | — | Selection callback |
| `onRowClick` | `(row: TData) => void` | — | Row click handler |
| `searchTerm` | `string` | — | Global text filter |
| `height` | `number \| string` | — | Max scrollable height |
| `headerBackground` | `string` | — | Sticky header background |
| `serverSide` | `ServerSideConfig` | — | Server-side pagination/sorting/filtering |
| `emptyState` | `ReactNode` | — | Empty state content |
## Column Builder
`createTableColumns<TData>()` returns a builder with:
- `tc.qualifier(opts)` — leading avatar/icon/checkbox column
- `tc.column(accessor, opts)` — data column with sorting/resizing
- `tc.displayColumn(opts)` — non-accessor custom column
- `tc.actions(opts)` — trailing actions column with visibility/sorting popovers
## Footer Modes
- **`"selection"`** — shows selection count, optional view/clear buttons, count pagination
- **`"summary"`** — shows "Showing X~Y of Z", list pagination, optional extra element

View File

@@ -1,148 +0,0 @@
import type { Meta, StoryObj } from "@storybook/react";
import { Table, createTableColumns } from "@opal/components";
// ---------------------------------------------------------------------------
// Sample data
// ---------------------------------------------------------------------------
interface User {
id: string;
email: string;
name: string;
role: "admin" | "user" | "viewer";
status: "active" | "invited" | "inactive";
}
const USERS: User[] = [
{
id: "1",
email: "alice@example.com",
name: "Alice Johnson",
role: "admin",
status: "active",
},
{
id: "2",
email: "bob@example.com",
name: "Bob Smith",
role: "user",
status: "active",
},
{
id: "3",
email: "carol@example.com",
name: "Carol White",
role: "viewer",
status: "invited",
},
{
id: "4",
email: "dave@example.com",
name: "Dave Brown",
role: "user",
status: "inactive",
},
{
id: "5",
email: "eve@example.com",
name: "Eve Davis",
role: "admin",
status: "active",
},
{
id: "6",
email: "frank@example.com",
name: "Frank Miller",
role: "viewer",
status: "active",
},
{
id: "7",
email: "grace@example.com",
name: "Grace Lee",
role: "user",
status: "invited",
},
{
id: "8",
email: "hank@example.com",
name: "Hank Wilson",
role: "user",
status: "active",
},
{
id: "9",
email: "iris@example.com",
name: "Iris Taylor",
role: "viewer",
status: "active",
},
{
id: "10",
email: "jack@example.com",
name: "Jack Moore",
role: "admin",
status: "active",
},
{
id: "11",
email: "kate@example.com",
name: "Kate Anderson",
role: "user",
status: "inactive",
},
{
id: "12",
email: "leo@example.com",
name: "Leo Thomas",
role: "viewer",
status: "active",
},
];
// ---------------------------------------------------------------------------
// Columns
// ---------------------------------------------------------------------------
const tc = createTableColumns<User>();
const columns = [
tc.qualifier({
content: "avatar-user",
getInitials: (r) =>
r.name
.split(" ")
.map((n) => n[0])
.join(""),
}),
tc.column("name", { header: "Name", weight: 25, minWidth: 120 }),
tc.column("email", { header: "Email", weight: 30, minWidth: 160 }),
tc.column("role", { header: "Role", weight: 15, minWidth: 80 }),
tc.column("status", { header: "Status", weight: 15, minWidth: 80 }),
tc.actions(),
];
// ---------------------------------------------------------------------------
// Story
// ---------------------------------------------------------------------------
const meta: Meta<typeof Table> = {
title: "opal/components/Table",
component: Table,
tags: ["autodocs"],
};
export default meta;
type Story = StoryObj<typeof Table>;
export const Default: Story = {
render: () => (
<Table
data={USERS}
columns={columns}
getRowId={(r) => r.id}
pageSize={8}
footer={{ mode: "summary" }}
/>
),
};

View File

@@ -1,71 +0,0 @@
import React from "react";
import { cn } from "@opal/utils";
import type { WithoutStyles } from "@/types";
import type { ExtremaSizeVariants, SizeVariants } from "@opal/types";
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
type TableSize = Extract<SizeVariants, "md" | "lg">;
type TableVariant = "rows" | "cards";
type TableQualifier = "simple" | "avatar" | "icon";
type SelectionBehavior = "no-select" | "single-select" | "multi-select";
interface TableProps
extends WithoutStyles<React.TableHTMLAttributes<HTMLTableElement>> {
ref?: React.Ref<HTMLTableElement>;
/** Size preset for the table. @default "lg" */
size?: TableSize;
/** Visual row variant. @default "cards" */
variant?: TableVariant;
/** Row selection behavior. @default "no-select" */
selectionBehavior?: SelectionBehavior;
/** Leading qualifier column type. @default null */
qualifier?: TableQualifier;
/** Height behavior. `"fit"` = shrink to content, `"full"` = fill available space. */
heightVariant?: ExtremaSizeVariants;
/** Explicit pixel width for the table (e.g. from `table.getTotalSize()`).
* When provided the table uses exactly this width instead of stretching
* to fill its container, which prevents `table-layout: fixed` from
* redistributing extra space across columns on resize. */
width?: number;
}
// ---------------------------------------------------------------------------
// Component
// ---------------------------------------------------------------------------
function Table({
ref,
size = "lg",
variant = "cards",
selectionBehavior = "no-select",
qualifier = "simple",
heightVariant,
width,
...props
}: TableProps) {
return (
<table
ref={ref}
className={cn("border-separate border-spacing-0", !width && "min-w-full")}
style={{ tableLayout: "fixed", width }}
data-size={size}
data-variant={variant}
data-selection={selectionBehavior}
data-qualifier={qualifier}
data-height={heightVariant}
{...props}
/>
);
}
export default Table;
export type {
TableProps,
TableSize,
TableVariant,
TableQualifier,
SelectionBehavior,
};

View File

@@ -1,164 +0,0 @@
/* Imports shared timing tokens (--interactive-duration, --interactive-easing) */
@import "@opal/core/interactive/shared.css";
/* ---------------------------------------------------------------------------
* Table primitives — data-attribute driven styling
* Follows the same pattern as card.css / line-item.css.
* ------------------------------------------------------------------------- */
/* ---- TableCell ---- */
.tbl-cell[data-size="lg"] {
@apply px-1 py-0.5;
}
.tbl-cell[data-size="md"] {
@apply pl-0.5 pr-1.5 py-1.5;
}
.tbl-cell-inner[data-size="lg"] {
@apply h-10 px-1;
}
.tbl-cell-inner[data-size="md"] {
@apply h-6 px-0.5;
}
/* ---- TableHead ---- */
.table-head {
@apply relative sticky top-0 z-20;
background: var(--table-header-bg, transparent);
}
.table-head[data-size="lg"] {
@apply px-2 py-1;
}
.table-head[data-size="md"] {
@apply px-2 py-1;
}
.table-head[data-bottom-border] {
@apply border-b border-transparent hover:border-border-03;
}
/* Inner text wrapper */
.table-head[data-size="lg"] .table-head-label {
@apply py-2 px-0.5;
}
.table-head[data-size="md"] .table-head-label {
@apply py-1;
}
/* Sort button wrapper */
.table-head[data-size="lg"] .table-head-sort {
@apply py-1.5;
}
/* ---- TableRow (base) ---- */
.tbl-row > td {
@apply bg-background-tint-00;
transition: background-color var(--interactive-duration)
var(--interactive-easing);
}
.tbl-row[data-selected] > td {
@apply bg-[var(--action-link-01)];
}
.tbl-row[data-disabled] {
@apply pointer-events-none;
}
/* Suppress default focus ring on rows — the row bg is the indicator */
.tbl-row:focus,
.tbl-row:focus-visible {
outline: none;
}
/* ---- variant="rows" — traditional borders, no gaps ---- */
table[data-variant="rows"] .tbl-row > td {
@apply border-b border-border-01;
}
/* Hover/focus only for selectable tables */
table[data-variant="rows"][data-selection="single-select"] .tbl-row,
table[data-variant="rows"][data-selection="multi-select"] .tbl-row {
@apply cursor-pointer;
}
table[data-variant="rows"][data-selection="single-select"] .tbl-row:hover > td,
table[data-variant="rows"][data-selection="multi-select"] .tbl-row:hover > td {
@apply bg-background-tint-02;
}
table[data-variant="rows"] .tbl-row:focus-visible > td,
table[data-variant="rows"] .tbl-row:has(:focus-visible) > td {
@apply bg-action-link-01;
}
/* ---- variant="cards" — rounded cards with gap ---- */
table[data-variant="cards"] .tbl-row > td {
@apply bg-clip-padding border-y-[2px] border-x-0 border-transparent;
}
table[data-variant="cards"] .tbl-row > td:first-child {
@apply rounded-l-12;
}
table[data-variant="cards"] .tbl-row > td:last-child {
@apply rounded-r-12;
}
/* When a drag handle is present the second-to-last td gets the rounding */
table[data-variant="cards"] .tbl-row[data-drag-handle] > td:nth-last-child(2) {
@apply rounded-r-12;
}
table[data-variant="cards"] .tbl-row[data-drag-handle] > td:last-child {
border-radius: 0;
}
/* Hover/focus only for selectable tables */
table[data-variant="cards"][data-selection="single-select"] .tbl-row,
table[data-variant="cards"][data-selection="multi-select"] .tbl-row {
@apply cursor-pointer;
}
table[data-variant="cards"][data-selection="single-select"] .tbl-row:hover > td,
table[data-variant="cards"][data-selection="multi-select"] .tbl-row:hover > td {
@apply bg-background-tint-02;
}
table[data-variant="cards"] .tbl-row:focus-visible > td,
table[data-variant="cards"] .tbl-row:has(:focus-visible) > td {
@apply bg-action-link-01;
}
/* ---- QualifierContainer ---- */
.tbl-qualifier[data-type="head"] {
@apply w-px whitespace-nowrap py-1 sticky top-0 z-20;
background: var(--table-header-bg, transparent);
}
.tbl-qualifier[data-type="head"][data-size="md"] {
@apply py-0.5;
}
.tbl-qualifier[data-type="cell"] {
@apply w-px whitespace-nowrap py-1;
}
.tbl-qualifier[data-type="cell"][data-size="md"] {
@apply py-0.5;
}
/* ---- ActionsContainer ---- */
.tbl-actions {
@apply sticky right-0 w-px whitespace-nowrap px-1;
}
.tbl-actions[data-type="head"] {
@apply z-30 sticky top-0 px-2 py-1;
background: var(--table-header-bg, transparent);
}
/* ---- Footer ---- */
.table-footer[data-size="lg"] {
@apply min-h-[2.75rem];
}
.table-footer[data-size="md"] {
@apply min-h-[2.25rem];
}

View File

@@ -1,162 +0,0 @@
import type { Meta, StoryObj } from "@storybook/react";
import { Hoverable } from "@opal/core";
// ---------------------------------------------------------------------------
// Meta
// ---------------------------------------------------------------------------
const meta: Meta = {
title: "Core/Hoverable",
tags: ["autodocs"],
parameters: {
layout: "centered",
},
};
export default meta;
// ---------------------------------------------------------------------------
// Stories
// ---------------------------------------------------------------------------
/** Group mode — hovering the root reveals hidden items. */
export const GroupMode: StoryObj = {
render: () => (
<Hoverable.Root group="demo">
<div
style={{
display: "flex",
alignItems: "center",
gap: "0.75rem",
padding: "1rem",
border: "1px solid var(--border-02)",
borderRadius: "0.5rem",
minWidth: 260,
}}
>
<span style={{ color: "var(--text-01)" }}>Hover this card</span>
<Hoverable.Item group="demo" variant="opacity-on-hover">
<span style={{ color: "var(--text-03)" }}> Revealed</span>
</Hoverable.Item>
</div>
</Hoverable.Root>
),
};
/** Local mode — hovering the item itself reveals it (no Root needed). */
export const LocalMode: StoryObj = {
render: () => (
<div
style={{
display: "flex",
alignItems: "center",
gap: "0.75rem",
padding: "1rem",
}}
>
<span style={{ color: "var(--text-01)" }}>Hover the icon </span>
<Hoverable.Item variant="opacity-on-hover">
<span style={{ fontSize: "1.25rem" }}>🗑</span>
</Hoverable.Item>
</div>
),
};
/** Multiple independent groups on the same page. */
export const MultipleGroups: StoryObj = {
render: () => (
<div style={{ display: "flex", flexDirection: "column", gap: "0.75rem" }}>
{(["alpha", "beta"] as const).map((group) => (
<Hoverable.Root key={group} group={group}>
<div
style={{
display: "flex",
alignItems: "center",
gap: "0.75rem",
padding: "1rem",
border: "1px solid var(--border-02)",
borderRadius: "0.5rem",
}}
>
<span style={{ color: "var(--text-01)" }}>Group: {group}</span>
<Hoverable.Item group={group} variant="opacity-on-hover">
<span style={{ color: "var(--text-03)" }}> Revealed</span>
</Hoverable.Item>
</div>
</Hoverable.Root>
))}
</div>
),
};
/** Multiple items revealed by a single root. */
export const MultipleItems: StoryObj = {
render: () => (
<Hoverable.Root group="multi">
<div
style={{
display: "flex",
alignItems: "center",
gap: "0.75rem",
padding: "1rem",
border: "1px solid var(--border-02)",
borderRadius: "0.5rem",
}}
>
<span style={{ color: "var(--text-01)" }}>Hover to reveal all</span>
<Hoverable.Item group="multi" variant="opacity-on-hover">
<span>Edit</span>
</Hoverable.Item>
<Hoverable.Item group="multi" variant="opacity-on-hover">
<span>Delete</span>
</Hoverable.Item>
<Hoverable.Item group="multi" variant="opacity-on-hover">
<span>Share</span>
</Hoverable.Item>
</div>
</Hoverable.Root>
),
};
/** Nested groups — inner and outer hover independently. */
export const NestedGroups: StoryObj = {
render: () => (
<Hoverable.Root group="outer">
<div
style={{
padding: "1rem",
border: "1px solid var(--border-02)",
borderRadius: "0.5rem",
display: "flex",
flexDirection: "column",
gap: "0.75rem",
}}
>
<div style={{ display: "flex", alignItems: "center", gap: "0.75rem" }}>
<span style={{ color: "var(--text-01)" }}>Outer card</span>
<Hoverable.Item group="outer" variant="opacity-on-hover">
<span style={{ color: "var(--text-03)" }}>Outer action</span>
</Hoverable.Item>
</div>
<Hoverable.Root group="inner">
<div
style={{
display: "flex",
alignItems: "center",
gap: "0.75rem",
padding: "0.75rem",
border: "1px solid var(--border-03)",
borderRadius: "0.375rem",
}}
>
<span style={{ color: "var(--text-02)" }}>Inner card</span>
<Hoverable.Item group="inner" variant="opacity-on-hover">
<span style={{ color: "var(--text-03)" }}>Inner action</span>
</Hoverable.Item>
</div>
</Hoverable.Root>
</div>
</Hoverable.Root>
),
};

View File

@@ -0,0 +1,127 @@
import type { Meta, StoryObj } from "@storybook/react";
import { Hoverable } from "@opal/core";
import SvgX from "@opal/icons/x";
// ---------------------------------------------------------------------------
// Shared styles
// ---------------------------------------------------------------------------
const cardStyle: React.CSSProperties = {
display: "flex",
alignItems: "center",
justifyContent: "space-between",
gap: "1rem",
padding: "0.75rem 1rem",
borderRadius: "0.5rem",
border: "1px solid var(--border-02)",
background: "var(--background-neutral-01)",
minWidth: 220,
};
const labelStyle: React.CSSProperties = {
fontSize: "0.875rem",
fontWeight: 500,
};
// ---------------------------------------------------------------------------
// Meta
// ---------------------------------------------------------------------------
const meta: Meta = {
title: "Core/Hoverable",
tags: ["autodocs"],
parameters: {
layout: "centered",
},
};
export default meta;
// ---------------------------------------------------------------------------
// Stories
// ---------------------------------------------------------------------------
/**
* Local hover mode -- no `group` prop on the Item.
* The icon only appears when you hover directly over the Item element itself.
*/
export const LocalHover: StoryObj = {
render: () => (
<div style={cardStyle}>
<span style={labelStyle}>Hover this card area</span>
<Hoverable.Item variant="opacity-on-hover">
<SvgX width={16} height={16} />
</Hoverable.Item>
</div>
),
};
/**
* Group hover mode -- hovering anywhere inside the Root reveals the Item.
*/
export const GroupHover: StoryObj = {
render: () => (
<Hoverable.Root group="card">
<div style={cardStyle}>
<span style={labelStyle}>Hover anywhere on this card</span>
<Hoverable.Item group="card" variant="opacity-on-hover">
<SvgX width={16} height={16} />
</Hoverable.Item>
</div>
</Hoverable.Root>
),
};
/**
* Nested groups demonstrating isolation.
*
* - Hovering the outer card reveals only the outer icon.
* - Hovering the inner card reveals only the inner icon.
*/
export const NestedGroups: StoryObj = {
render: () => (
<Hoverable.Root group="outer">
<div
style={{
...cardStyle,
flexDirection: "column",
alignItems: "stretch",
gap: "0.75rem",
padding: "1rem",
minWidth: 300,
}}
>
<div
style={{
display: "flex",
alignItems: "center",
justifyContent: "space-between",
}}
>
<span style={labelStyle}>Outer card</span>
<Hoverable.Item group="outer" variant="opacity-on-hover">
<SvgX width={16} height={16} />
</Hoverable.Item>
</div>
<Hoverable.Root group="inner">
<div
style={{
...cardStyle,
background: "var(--background-neutral-02)",
}}
>
<span style={labelStyle}>Inner card</span>
<Hoverable.Item group="inner" variant="opacity-on-hover">
<SvgX width={16} height={16} />
</Hoverable.Item>
</div>
</Hoverable.Root>
</div>
</Hoverable.Root>
),
};

View File

@@ -15,21 +15,13 @@
initial-value: transparent;
}
/* Shared timing tokens — used by .interactive and other surfaces (e.g. table rows) */
:root {
--interactive-duration: 150ms;
--interactive-easing: ease-in-out;
}
/* Base interactive surface */
.interactive {
@apply cursor-pointer select-none;
transition:
background-color var(--interactive-duration) var(--interactive-easing),
--interactive-foreground var(--interactive-duration)
var(--interactive-easing),
--interactive-foreground-icon var(--interactive-duration)
var(--interactive-easing);
background-color 150ms ease-in-out,
--interactive-foreground 150ms ease-in-out,
--interactive-foreground-icon 150ms ease-in-out;
}
.interactive[data-disabled] {
@apply cursor-not-allowed;

View File

@@ -174,7 +174,6 @@ function ContentLg({
)}
onClick={editable ? startEditing : undefined}
style={{ height: config.lineHeight }}
title={title}
>
{title}
</span>

View File

@@ -218,7 +218,6 @@ function ContentMd({
"text-text-04",
editable && "cursor-pointer"
)}
title={title}
onClick={editable ? startEditing : undefined}
style={{ height: config.lineHeight }}
>

View File

@@ -118,7 +118,6 @@ function ContentSm({
<span
className={cn("opal-content-sm-title", config.titleFont)}
style={{ height: config.lineHeight }}
title={title}
>
{title}
</span>

View File

@@ -231,7 +231,6 @@ function ContentXl({
)}
onClick={editable ? startEditing : undefined}
style={{ height: config.lineHeight }}
title={title}
>
{title}
</span>

View File

@@ -2,14 +2,7 @@
"extends": "../../tsconfig.json",
"compilerOptions": {
"paths": {
"@opal/*": ["./src/*"],
// TODO (@raunakab): Remove this once the table component migration is
// complete. The table internals still import app-layer modules (e.g.
// @/refresh-components/texts/Text, @/refresh-components/Popover) via the
// @/ alias. Without this entry the IDE cannot resolve those paths since
// opal's tsconfig only defines @opal/*. Once all @/ deps are replaced
// with opal-internal equivalents, this line should be deleted.
"@/*": ["../../src/*"]
"@opal/*": ["./src/*"]
}
},
"include": ["src/**/*"],

82
web/package-lock.json generated
View File

@@ -61,7 +61,7 @@
"mdast-util-find-and-replace": "^3.0.1",
"mime": "^4.1.0",
"motion": "^12.29.0",
"next": "16.1.7",
"next": "16.1.6",
"next-themes": "^0.4.4",
"postcss": "^8.5.6",
"posthog-js": "^1.176.0",
@@ -2896,9 +2896,9 @@
}
},
"node_modules/@next/env": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/env/-/env-16.1.7.tgz",
"integrity": "sha512-rJJbIdJB/RQr2F1nylZr/PJzamvNNhfr3brdKP6s/GW850jbtR70QlSfFselvIBbcPUOlQwBakexjFzqLzF6pg==",
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/env/-/env-16.1.6.tgz",
"integrity": "sha512-N1ySLuZjnAtN3kFnwhAwPvZah8RJxKasD7x1f8shFqhncnWZn4JMfg37diLNuoHsLAlrDfM3g4mawVdtAG8XLQ==",
"license": "MIT"
},
"node_modules/@next/eslint-plugin-next": {
@@ -2942,9 +2942,9 @@
}
},
"node_modules/@next/swc-darwin-arm64": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-16.1.7.tgz",
"integrity": "sha512-b2wWIE8sABdyafc4IM8r5Y/dS6kD80JRtOGrUiKTsACFQfWWgUQ2NwoUX1yjFMXVsAwcQeNpnucF2ZrujsBBPg==",
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-16.1.6.tgz",
"integrity": "sha512-wTzYulosJr/6nFnqGW7FrG3jfUUlEf8UjGA0/pyypJl42ExdVgC6xJgcXQ+V8QFn6niSG2Pb8+MIG1mZr2vczw==",
"cpu": [
"arm64"
],
@@ -2958,9 +2958,9 @@
}
},
"node_modules/@next/swc-darwin-x64": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-16.1.7.tgz",
"integrity": "sha512-zcnVaaZulS1WL0Ss38R5Q6D2gz7MtBu8GZLPfK+73D/hp4GFMrC2sudLky1QibfV7h6RJBJs/gOFvYP0X7UVlQ==",
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-16.1.6.tgz",
"integrity": "sha512-BLFPYPDO+MNJsiDWbeVzqvYd4NyuRrEYVB5k2N3JfWncuHAy2IVwMAOlVQDFjj+krkWzhY2apvmekMkfQR0CUQ==",
"cpu": [
"x64"
],
@@ -2974,9 +2974,9 @@
}
},
"node_modules/@next/swc-linux-arm64-gnu": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-16.1.7.tgz",
"integrity": "sha512-2ant89Lux/Q3VyC8vNVg7uBaFVP9SwoK2jJOOR0L8TQnX8CAYnh4uctAScy2Hwj2dgjVHqHLORQZJ2wH6VxhSQ==",
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-16.1.6.tgz",
"integrity": "sha512-OJYkCd5pj/QloBvoEcJ2XiMnlJkRv9idWA/j0ugSuA34gMT6f5b7vOiCQHVRpvStoZUknhl6/UxOXL4OwtdaBw==",
"cpu": [
"arm64"
],
@@ -2990,9 +2990,9 @@
}
},
"node_modules/@next/swc-linux-arm64-musl": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-16.1.7.tgz",
"integrity": "sha512-uufcze7LYv0FQg9GnNeZ3/whYfo+1Q3HnQpm16o6Uyi0OVzLlk2ZWoY7j07KADZFY8qwDbsmFnMQP3p3+Ftprw==",
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-16.1.6.tgz",
"integrity": "sha512-S4J2v+8tT3NIO9u2q+S0G5KdvNDjXfAv06OhfOzNDaBn5rw84DGXWndOEB7d5/x852A20sW1M56vhC/tRVbccQ==",
"cpu": [
"arm64"
],
@@ -3006,9 +3006,9 @@
}
},
"node_modules/@next/swc-linux-x64-gnu": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-16.1.7.tgz",
"integrity": "sha512-KWVf2gxYvHtvuT+c4MBOGxuse5TD7DsMFYSxVxRBnOzok/xryNeQSjXgxSv9QpIVlaGzEn/pIuI6Koosx8CGWA==",
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-16.1.6.tgz",
"integrity": "sha512-2eEBDkFlMMNQnkTyPBhQOAyn2qMxyG2eE7GPH2WIDGEpEILcBPI/jdSv4t6xupSP+ot/jkfrCShLAa7+ZUPcJQ==",
"cpu": [
"x64"
],
@@ -3022,9 +3022,9 @@
}
},
"node_modules/@next/swc-linux-x64-musl": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-16.1.7.tgz",
"integrity": "sha512-HguhaGwsGr1YAGs68uRKc4aGWxLET+NevJskOcCAwXbwj0fYX0RgZW2gsOCzr9S11CSQPIkxmoSbuVaBp4Z3dA==",
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-16.1.6.tgz",
"integrity": "sha512-oicJwRlyOoZXVlxmIMaTq7f8pN9QNbdes0q2FXfRsPhfCi8n8JmOZJm5oo1pwDaFbnnD421rVU409M3evFbIqg==",
"cpu": [
"x64"
],
@@ -3038,9 +3038,9 @@
}
},
"node_modules/@next/swc-win32-arm64-msvc": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-16.1.7.tgz",
"integrity": "sha512-S0n3KrDJokKTeFyM/vGGGR8+pCmXYrjNTk2ZozOL1C/JFdfUIL9O1ATaJOl5r2POe56iRChbsszrjMAdWSv7kQ==",
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-16.1.6.tgz",
"integrity": "sha512-gQmm8izDTPgs+DCWH22kcDmuUp7NyiJgEl18bcr8irXA5N2m2O+JQIr6f3ct42GOs9c0h8QF3L5SzIxcYAAXXw==",
"cpu": [
"arm64"
],
@@ -3054,9 +3054,9 @@
}
},
"node_modules/@next/swc-win32-x64-msvc": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-16.1.7.tgz",
"integrity": "sha512-mwgtg8CNZGYm06LeEd+bNnOUfwOyNem/rOiP14Lsz+AnUY92Zq/LXwtebtUiaeVkhbroRCQ0c8GlR4UT1U+0yg==",
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-16.1.6.tgz",
"integrity": "sha512-NRfO39AIrzBnixKbjuo2YiYhB6o9d8v/ymU9m/Xk8cyVk+k7XylniXkHwjs4s70wedVffc6bQNbufk5v0xEm0A==",
"cpu": [
"x64"
],
@@ -14068,14 +14068,14 @@
"license": "MIT"
},
"node_modules/next": {
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/next/-/next-16.1.7.tgz",
"integrity": "sha512-WM0L7WrSvKwoLegLYr6V+mz+RIofqQgVAfHhMp9a88ms0cFX8iX9ew+snpWlSBwpkURJOUdvCEt3uLl3NNzvWg==",
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/next/-/next-16.1.6.tgz",
"integrity": "sha512-hkyRkcu5x/41KoqnROkfTm2pZVbKxvbZRuNvKXLRXxs3VfyO0WhY50TQS40EuKO9SW3rBj/sF3WbVwDACeMZyw==",
"license": "MIT",
"dependencies": {
"@next/env": "16.1.7",
"@next/env": "16.1.6",
"@swc/helpers": "0.5.15",
"baseline-browser-mapping": "^2.9.19",
"baseline-browser-mapping": "^2.8.3",
"caniuse-lite": "^1.0.30001579",
"postcss": "8.4.31",
"styled-jsx": "5.1.6"
@@ -14087,14 +14087,14 @@
"node": ">=20.9.0"
},
"optionalDependencies": {
"@next/swc-darwin-arm64": "16.1.7",
"@next/swc-darwin-x64": "16.1.7",
"@next/swc-linux-arm64-gnu": "16.1.7",
"@next/swc-linux-arm64-musl": "16.1.7",
"@next/swc-linux-x64-gnu": "16.1.7",
"@next/swc-linux-x64-musl": "16.1.7",
"@next/swc-win32-arm64-msvc": "16.1.7",
"@next/swc-win32-x64-msvc": "16.1.7",
"@next/swc-darwin-arm64": "16.1.6",
"@next/swc-darwin-x64": "16.1.6",
"@next/swc-linux-arm64-gnu": "16.1.6",
"@next/swc-linux-arm64-musl": "16.1.6",
"@next/swc-linux-x64-gnu": "16.1.6",
"@next/swc-linux-x64-musl": "16.1.6",
"@next/swc-win32-arm64-msvc": "16.1.6",
"@next/swc-win32-x64-msvc": "16.1.6",
"sharp": "^0.34.4"
},
"peerDependencies": {

View File

@@ -79,7 +79,7 @@
"mdast-util-find-and-replace": "^3.0.1",
"mime": "^4.1.0",
"motion": "^12.29.0",
"next": "16.1.7",
"next": "16.1.6",
"next-themes": "^0.4.4",
"postcss": "^8.5.6",
"posthog-js": "^1.176.0",

View File

@@ -12,7 +12,7 @@ import { localizeAndPrettify } from "@/lib/time";
import Button from "@/refresh-components/buttons/Button";
import Text from "@/refresh-components/texts/Text";
import { PageSelector } from "@/components/PageSelector";
import { useCallback, useEffect, useRef, useState, useMemo } from "react";
import { useEffect, useState, useMemo } from "react";
import { SvgAlertTriangle } from "@opal/icons";
export interface IndexAttemptErrorsModalProps {
errors: {
@@ -22,66 +22,93 @@ export interface IndexAttemptErrorsModalProps {
onClose: () => void;
onResolveAll: () => void;
isResolvingErrors?: boolean;
onPageChange?: (page: number) => void;
currentPage?: number;
pageSize?: number;
}
const ROW_HEIGHT = 65; // 4rem + 1px for border
export default function IndexAttemptErrorsModal({
errors,
onClose,
onResolveAll,
isResolvingErrors = false,
pageSize: propPageSize,
}: IndexAttemptErrorsModalProps) {
const observerRef = useRef<ResizeObserver | null>(null);
const [pageSize, setPageSize] = useState(10);
const [calculatedPageSize, setCalculatedPageSize] = useState(10);
const [currentPage, setCurrentPage] = useState(1);
const tableContainerRef = useCallback((container: HTMLDivElement | null) => {
if (observerRef.current) {
observerRef.current.disconnect();
observerRef.current = null;
}
// Reset to page 1 when the error list actually changes
useEffect(() => {
setCurrentPage(1);
}, [errors.items.length, errors.total_items]);
if (!container) return;
useEffect(() => {
const calculatePageSize = () => {
// Modal height is 75% of viewport height
const modalHeight = window.innerHeight * 0.6;
const observer = new ResizeObserver(() => {
const thead = container.querySelector("thead");
const theadHeight = thead?.getBoundingClientRect().height ?? 0;
const availableHeight = container.clientHeight - theadHeight;
const newPageSize = Math.max(3, Math.floor(availableHeight / ROW_HEIGHT));
setPageSize(newPageSize);
});
// Estimate heights (in pixels):
// - Modal header (title + description): ~120px
// - Table header: ~40px
// - Pagination section: ~80px
// - Modal padding: ~64px (32px top + 32px bottom)
const fixedHeight = 120 + 40 + 80 + 64;
observer.observe(container);
observerRef.current = observer;
// Available height for table rows
const availableHeight = modalHeight - fixedHeight;
// Each table row is approximately 60px (including borders and padding)
const rowHeight = 60;
// Calculate how many rows can fit, with a minimum of 3
const rowsPerPage = Math.max(3, Math.floor(availableHeight / rowHeight));
setCalculatedPageSize((prev) => {
// Only update if the new size is significantly different to prevent flickering
if (Math.abs(prev - rowsPerPage) > 0) {
return rowsPerPage;
}
return prev;
});
};
// Initial calculation
calculatePageSize();
// Debounced resize handler to prevent excessive recalculation
let resizeTimeout: NodeJS.Timeout;
const debouncedCalculatePageSize = () => {
clearTimeout(resizeTimeout);
resizeTimeout = setTimeout(calculatePageSize, 100);
};
window.addEventListener("resize", debouncedCalculatePageSize);
return () => {
window.removeEventListener("resize", debouncedCalculatePageSize);
clearTimeout(resizeTimeout);
};
}, []);
// When data changes, reset to page 1.
// When page size changes (resize), preserve the user's position by
// finding which new page contains the first item they were looking at.
const prevPageSizeRef = useRef(pageSize);
// Separate effect to reset current page when page size changes
useEffect(() => {
if (pageSize !== prevPageSizeRef.current) {
setCurrentPage((prev) => {
const firstVisibleIndex = (prev - 1) * prevPageSizeRef.current;
const newPage = Math.floor(firstVisibleIndex / pageSize) + 1;
const totalPages = Math.ceil(errors.items.length / pageSize);
return Math.min(newPage, totalPages);
});
prevPageSizeRef.current = pageSize;
} else {
setCurrentPage(1);
}
}, [errors.items.length, pageSize]);
setCurrentPage(1);
}, [calculatedPageSize]);
const pageSize = propPageSize || calculatedPageSize;
// Memoize pagination calculations to prevent unnecessary recalculations
const paginationData = useMemo(() => {
const totalPages = Math.ceil(errors.items.length / pageSize);
const startIndex = (currentPage - 1) * pageSize;
const currentPageItems = errors.items.slice(
const endIndex = startIndex + pageSize;
const currentPageItems = errors.items.slice(startIndex, endIndex);
return {
totalPages,
currentPageItems,
startIndex,
startIndex + pageSize
);
return { totalPages, currentPageItems };
endIndex,
};
}, [errors.items, pageSize, currentPage]);
const hasUnresolvedErrors = useMemo(
@@ -110,7 +137,7 @@ export default function IndexAttemptErrorsModal({
onClose={onClose}
height="fit"
/>
<Modal.Body height="full">
<Modal.Body>
{!isResolvingErrors && (
<div className="flex flex-col gap-2 flex-shrink-0">
<Text as="p">
@@ -125,10 +152,7 @@ export default function IndexAttemptErrorsModal({
</div>
)}
<div
ref={tableContainerRef}
className="flex-1 w-full overflow-hidden min-h-0"
>
<div className="flex-1 overflow-hidden min-h-0">
<Table>
<TableHeader>
<TableRow>
@@ -141,11 +165,11 @@ export default function IndexAttemptErrorsModal({
<TableBody>
{paginationData.currentPageItems.length > 0 ? (
paginationData.currentPageItems.map((error) => (
<TableRow key={error.id} className="h-[4rem]">
<TableCell>
<TableRow key={error.id} className="h-[60px] max-h-[60px]">
<TableCell className="h-[60px] align-top">
{localizeAndPrettify(error.time_created)}
</TableCell>
<TableCell>
<TableCell className="h-[60px] align-top">
{error.document_link ? (
<a
href={error.document_link}
@@ -159,12 +183,12 @@ export default function IndexAttemptErrorsModal({
error.document_id || error.entity_id || "Unknown"
)}
</TableCell>
<TableCell>
<div className="flex items-center h-[2rem] overflow-y-auto whitespace-normal">
<TableCell className="h-[60px] align-top p-0">
<div className="h-[60px] overflow-y-auto p-4 whitespace-normal">
{error.failure_message}
</div>
</TableCell>
<TableCell>
<TableCell className="h-[60px] align-top">
<span
className={`px-2 py-1 rounded text-xs ${
error.is_resolved
@@ -178,7 +202,7 @@ export default function IndexAttemptErrorsModal({
</TableRow>
))
) : (
<TableRow className="h-[4rem]">
<TableRow>
<TableCell
colSpan={4}
className="text-center py-8 text-gray-500"
@@ -191,24 +215,32 @@ export default function IndexAttemptErrorsModal({
</Table>
</div>
{paginationData.totalPages > 1 && (
<div className="flex w-full justify-center">
<PageSelector
totalPages={paginationData.totalPages}
currentPage={currentPage}
onPageChange={handlePageChange}
/>
<div className="flex-shrink-0">
{paginationData.totalPages > 1 && (
<div className="flex-1 flex justify-center mb-2">
<PageSelector
totalPages={paginationData.totalPages}
currentPage={currentPage}
onPageChange={handlePageChange}
/>
</div>
)}
<div className="flex w-full">
<div className="flex gap-2 ml-auto">
{hasUnresolvedErrors && !isResolvingErrors && (
// TODO(@raunakab): migrate to opal Button once className/iconClassName is resolved
<Button
onClick={onResolveAll}
className="ml-4 whitespace-nowrap"
>
Resolve All
</Button>
)}
</div>
</div>
)}
</div>
</Modal.Body>
<Modal.Footer>
{hasUnresolvedErrors && !isResolvingErrors && (
// TODO(@raunakab): migrate to opal Button once className/iconClassName is resolved
<Button onClick={onResolveAll} className="ml-4 whitespace-nowrap">
Resolve All
</Button>
)}
</Modal.Footer>
</Modal.Content>
</Modal>
);

View File

@@ -18,7 +18,7 @@ import { PageSelector } from "@/components/PageSelector";
import { localizeAndPrettify } from "@/lib/time";
import { getDocsProcessedPerMinute } from "@/lib/indexAttempt";
import { InfoIcon } from "@/components/icons/icons";
import ExceptionTraceModal from "@/sections/modals/PreviewModal/ExceptionTraceModal";
import ExceptionTraceModal from "@/components/modals/ExceptionTraceModal";
import SimpleTooltip from "@/refresh-components/SimpleTooltip";
import { SvgClock } from "@opal/icons";
export interface IndexingAttemptsTableProps {
@@ -98,14 +98,7 @@ export function IndexAttemptsTable({
isReindexInProgress ? "are being" : "were"
} synced into the system.`;
return (
<TableRow
key={indexAttempt.id}
className={
indexAttempt.full_exception_trace
? "hover:bg-accent-background cursor-pointer relative select-none"
: undefined
}
>
<TableRow key={indexAttempt.id}>
<TableCell>
{indexAttempt.time_started
? localizeAndPrettify(indexAttempt.time_started)
@@ -153,43 +146,46 @@ export function IndexAttemptsTable({
</div>
</TableCell>
<TableCell>
{indexAttempt.status === "success" && (
<Text className="flex flex-wrap whitespace-normal">
{"-"}
</Text>
)}
{indexAttempt.status === "failed" &&
indexAttempt.error_msg && (
<div>
{indexAttempt.status === "success" && (
<Text className="flex flex-wrap whitespace-normal">
{indexAttempt.error_msg}
{"-"}
</Text>
)}
{indexAttempt.status === "failed" &&
indexAttempt.error_msg && (
<Text className="flex flex-wrap whitespace-normal">
{indexAttempt.error_msg}
</Text>
)}
{indexAttempt.full_exception_trace && (
<div
onClick={() => {
setIndexAttemptTracePopupId(indexAttempt.id);
}}
className="mt-2 text-link cursor-pointer select-none"
>
View Full Trace
</div>
)}
</div>
</TableCell>
<td className="w-0 p-0">
{indexAttempt.full_exception_trace && (
<button
type="button"
aria-label="View full trace"
onClick={() =>
setIndexAttemptTracePopupId(indexAttempt.id)
}
className="absolute w-full h-full left-0 top-0"
/>
)}
</td>
</TableRow>
);
})}
</TableBody>
</Table>
{totalPages > 1 && (
<div className="flex flex-1 justify-center pt-3">
<PageSelector
totalPages={totalPages}
currentPage={currentPage}
onPageChange={onPageChange}
/>
<div className="mt-3 flex">
<div className="mx-auto">
<PageSelector
totalPages={totalPages}
currentPage={currentPage}
onPageChange={onPageChange}
/>
</div>
</div>
)}
</>

View File

@@ -21,13 +21,10 @@ export const submitGoogleSite = async (
formData.append("files", file);
});
const response = await fetch(
"/api/manage/admin/connector/file/upload?unzip=false",
{
method: "POST",
body: formData,
}
);
const response = await fetch("/api/manage/admin/connector/file/upload", {
method: "POST",
body: formData,
});
const responseJson = await response.json();
if (!response.ok) {
toast.error(`Unable to upload files - ${responseJson.detail}`);

View File

@@ -1 +0,0 @@
export { default } from "@/refresh-pages/admin/GroupsPage";

View File

@@ -19,7 +19,6 @@ import {
} from "@/lib/types";
import type { Route } from "next";
import { useRouter } from "next/navigation";
import Truncated from "@/refresh-components/texts/Truncated";
import {
FiChevronDown,
FiChevronRight,
@@ -166,7 +165,9 @@ function ConnectorRow({
onClick={handleRowClick}
>
<TableCell className="">
<Truncated>{ccPairsIndexingStatus.name}</Truncated>
<p className="max-w-[200px] xl:max-w-[400px] inline-block ellipsis truncate">
{ccPairsIndexingStatus.name}
</p>
</TableCell>
<TableCell>
{timeAgo(ccPairsIndexingStatus?.last_success) || "-"}
@@ -245,7 +246,9 @@ function FederatedConnectorRow({
onClick={handleRowClick}
>
<TableCell className="">
<Truncated>{federatedConnector.name}</Truncated>
<p className="max-w-[200px] xl:max-w-[400px] inline-block ellipsis truncate">
{federatedConnector.name}
</p>
</TableCell>
<TableCell>N/A</TableCell>
<TableCell>

143
web/src/app/css/table.css Normal file
View File

@@ -0,0 +1,143 @@
/* ---------------------------------------------------------------------------
* Table primitives — data-attribute driven styling
* Follows the same pattern as card.css / line-item.css.
* ------------------------------------------------------------------------- */
/* ---- TableCell ---- */
.tbl-cell[data-size="regular"] {
@apply px-1 py-0.5;
}
.tbl-cell[data-size="small"] {
@apply pl-0.5 pr-1.5 py-1.5;
}
.tbl-cell-inner[data-size="regular"] {
@apply h-10 px-1;
}
.tbl-cell-inner[data-size="small"] {
@apply h-6 px-0.5;
}
/* ---- TableHead ---- */
.table-head {
@apply relative sticky top-0 z-20;
background: var(--table-header-bg, transparent);
}
.table-head[data-size="regular"] {
@apply px-2 py-1;
}
.table-head[data-size="small"] {
@apply p-1.5;
}
.table-head[data-bottom-border] {
@apply border-b border-transparent hover:border-border-03;
}
/* Inner text wrapper */
.table-head[data-size="regular"] .table-head-label {
@apply py-2 px-0.5;
}
.table-head[data-size="small"] .table-head-label {
@apply py-1;
}
/* Sort button wrapper */
.table-head[data-size="regular"] .table-head-sort {
@apply py-1.5;
}
/* ---- TableRow ---- */
.tbl-row > td {
@apply bg-background-tint-00;
}
.tbl-row[data-variant="table"] > td {
@apply border-b border-border-01;
}
.tbl-row[data-variant="list"] > td {
@apply bg-clip-padding border-y-[4px] border-x-0 border-transparent;
}
.tbl-row[data-variant="list"] > td:first-child {
@apply rounded-l-12;
}
.tbl-row[data-variant="list"] > td:last-child {
@apply rounded-r-12;
}
/* When a drag handle is present the second-to-last td gets the rounding */
.tbl-row[data-variant="list"][data-drag-handle] > td:nth-last-child(2) {
@apply rounded-r-12;
}
.tbl-row[data-variant="list"][data-drag-handle] > td:last-child {
border-radius: 0;
}
/* ---- Row states (list variant) ---- */
.tbl-row[data-variant="list"]:hover > td {
@apply bg-background-tint-02;
}
.tbl-row[data-variant="list"]:focus-visible > td,
.tbl-row[data-variant="list"]:has(:focus-visible) > td {
@apply bg-action-link-01;
}
.tbl-row[data-disabled] {
@apply pointer-events-none;
}
/* ---- Row states (table variant) ---- */
.tbl-row[data-variant="table"]:hover > td {
@apply bg-background-tint-02;
}
.tbl-row[data-variant="table"]:focus-visible > td,
.tbl-row[data-variant="table"]:has(:focus-visible) > td {
@apply bg-action-link-01;
}
/* Suppress default focus ring on rows — the row bg is the indicator */
.tbl-row:focus,
.tbl-row:focus-visible {
outline: none;
}
/* ---- QualifierContainer ---- */
.tbl-qualifier[data-type="head"] {
@apply w-px whitespace-nowrap py-1 sticky top-0 z-20;
background: var(--table-header-bg, transparent);
}
.tbl-qualifier[data-type="head"][data-size="small"] {
@apply py-0.5;
}
.tbl-qualifier[data-type="cell"] {
@apply w-px whitespace-nowrap py-1 pl-1;
}
.tbl-qualifier[data-type="cell"][data-size="small"] {
@apply py-0.5 pl-0.5;
}
/* ---- ActionsContainer ---- */
.tbl-actions {
@apply sticky right-0 w-px whitespace-nowrap;
}
.tbl-actions[data-type="head"] {
@apply z-30 sticky top-0;
background: var(--table-header-bg, transparent);
}
/* ---- Footer ---- */
.table-footer[data-size="regular"] {
@apply min-h-[2.75rem];
}
.table-footer[data-size="small"] {
@apply min-h-[2.25rem];
}

View File

@@ -1 +0,0 @@
export { default } from "@/refresh-pages/admin/GroupsPage";

View File

@@ -16,6 +16,7 @@
@import "css/sizes.css";
@import "css/square-button.css";
@import "css/switch.css";
@import "css/table.css";
@import "css/z-index.css";
/* KH Teka Font */

View File

@@ -0,0 +1,53 @@
import { useState } from "react";
import Modal from "@/refresh-components/Modal";
import Text from "@/refresh-components/texts/Text";
import { SvgAlertTriangle, SvgCheck, SvgCopy } from "@opal/icons";
interface ExceptionTraceModalProps {
onOutsideClick: () => void;
exceptionTrace: string;
}
export default function ExceptionTraceModal({
onOutsideClick,
exceptionTrace,
}: ExceptionTraceModalProps) {
const [copyClicked, setCopyClicked] = useState(false);
return (
<Modal open onOpenChange={onOutsideClick}>
<Modal.Content width="lg" height="full">
<Modal.Header
icon={SvgAlertTriangle}
title="Full Exception Trace"
onClose={onOutsideClick}
height="fit"
/>
<Modal.Body>
<div className="mb-6">
{!copyClicked ? (
<button
type="button"
onClick={() => {
navigator.clipboard.writeText(exceptionTrace!);
setCopyClicked(true);
setTimeout(() => setCopyClicked(false), 2000);
}}
className="flex w-fit items-center hover:bg-accent-background p-2 border-border border rounded"
>
<Text>Copy full trace</Text>
<SvgCopy className="stroke-text-04 ml-2 h-4 w-4 flex flex-shrink-0" />
</button>
) : (
<div className="flex w-fit items-center hover:bg-accent-background p-2 border-border border rounded cursor-default">
<Text>Copied to clipboard</Text>
<SvgCheck className="stroke-text-04 my-auto ml-2 h-4 w-4 flex flex-shrink-0" />
</div>
)}
</div>
<div className="whitespace-pre-wrap">{exceptionTrace}</div>
</Modal.Body>
</Modal.Content>
</Modal>
);
}

View File

@@ -451,19 +451,13 @@ const ModalHeader = React.forwardRef<HTMLDivElement, ModalHeaderProps>(
);
return (
<Section
ref={ref}
padding={0.5}
alignItems="start"
height="fit"
{...props}
>
<Section ref={ref} padding={1} alignItems="start" height="fit" {...props}>
<Section
flexDirection="row"
justifyContent="between"
alignItems="start"
gap={0}
padding={0.5}
padding={0}
>
<div className="relative w-full">
{/* Close button is absolutely positioned because:
@@ -491,6 +485,7 @@ const ModalHeader = React.forwardRef<HTMLDivElement, ModalHeaderProps>(
</DialogPrimitive.Title>
</div>
</Section>
{children}
</Section>
);

View File

@@ -9,8 +9,6 @@ export interface SeparatorProps
noPadding?: boolean;
/** Custom horizontal padding in rem. Overrides the default padding. */
paddingXRem?: number;
/** Custom vertical padding in rem. Overrides the default padding. */
paddingYRem?: number;
}
/**
@@ -39,7 +37,7 @@ const Separator = React.forwardRef(
{
noPadding,
paddingXRem,
paddingYRem,
className,
orientation = "horizontal",
decorative = true,
@@ -58,12 +56,6 @@ const Separator = React.forwardRef(
paddingRight: `${paddingXRem}rem`,
}
: {}),
...(paddingYRem != null
? {
paddingTop: `${paddingYRem}rem`,
paddingBottom: `${paddingYRem}rem`,
}
: {}),
}}
className={cn(
isHorizontal ? "w-full" : "h-full",

View File

@@ -112,11 +112,9 @@ function MemoryItem({
/>
</Disabled>
</Section>
<div
className={isFocused ? "visible" : "invisible h-0 overflow-hidden"}
>
{isFocused && (
<CharacterCount value={memory.content} limit={MAX_MEMORY_LENGTH} />
</div>
)}
</Section>
</div>
);

Some files were not shown because too many files have changed in this diff Show More