Compare commits

..

22 Commits

Author SHA1 Message Date
Bo-Onyx
164d1baece address commetns 2026-03-18 19:14:52 -07:00
Bo-Onyx
aa69b799bc address comments 2026-03-18 16:51:28 -07:00
Bo-Onyx
9b4bf6bef3 chore(hook): Hook executor. 2026-03-18 16:41:09 -07:00
Jamison Lahman
e8bf45cfd2 feat(fe): "Full Exception Trace" modal uses CodePreview rendering (#9464)
Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
2026-03-18 21:04:55 +00:00
Bo-Onyx
13ff648fcd chore(hooks): Add Celery task to remove hook running records older than 30 days (#9433) 2026-03-18 21:03:01 +00:00
Jamison Lahman
ae8268afb1 fix(fe): truncate connector names in table (#9459) 2026-03-18 20:59:49 +00:00
acaprau
b338bd9e97 feat(opensearch): Can override number of shards and replicas via env var (#9431) 2026-03-18 20:16:05 +00:00
acaprau
0dcc90a042 fix(opensearch): Exclude retrieving vectors during hybrid and random search (#9430) 2026-03-18 20:13:12 +00:00
Jamison Lahman
0f6a6693d3 fix(fe): truncate project name in sidebar button (#9462) 2026-03-18 20:06:09 +00:00
Jamison Lahman
e32cc450b2 fix(fe): update connector indexing error modal (#9426)
Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
2026-03-18 11:57:28 -07:00
Jamison Lahman
732fb71edf chore(tests): unit tests for pdf processing (#9452)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-18 18:31:37 +00:00
dependabot[bot]
ca3320c0e0 chore(deps): bump pypdf from 6.8.0 to 6.9.1 (#9450)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jamison Lahman <jamison@lahman.dev>
2026-03-18 17:52:50 +00:00
Jamison Lahman
d7c554aca7 chore(ruff): fix and enable S324 (#9451) 2026-03-18 17:26:29 +00:00
dependabot[bot]
69e5c19695 chore(deps): bump next from 16.1.5 to 16.1.7 in /examples/widget (#9425)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-03-18 09:25:27 -07:00
Nikolas Garza
b4ce1c7a97 chore: bump next to 16.1.7 (#9423) 2026-03-18 09:22:40 -07:00
Jamison Lahman
cd64a91154 fix(fe): display name on attachment file card hover (#9446) 2026-03-18 16:13:21 +00:00
Danelegend
c282cdc096 fix(file upload): Allow zip file upload via query param (#9432) 2026-03-18 07:32:07 +00:00
Jamison Lahman
b1de1c59b6 chore(playwright): projects screenshot is main container only (#9440) 2026-03-18 05:35:30 +00:00
acaprau
64d484039f chore(opensearch): Disable test_update_single_can_clear_user_projects_and_personas (#9434) 2026-03-18 00:40:29 +00:00
Jamison Lahman
0530095b71 fix(fe): replace users table buttons with LineItems (#9435) 2026-03-17 23:45:15 +00:00
acaprau
23280d5b91 fix(opensearch): Fix env var mismatch issue with configuring subquery results; set default to 100 (#9428) 2026-03-17 16:01:45 -07:00
Bo-Onyx
229442679c chore(hooks): Add db CRUD (#9411) 2026-03-17 22:36:50 +00:00
55 changed files with 2137 additions and 1145 deletions

View File

@@ -317,6 +317,7 @@ celery_app.autodiscover_tasks(
"onyx.background.celery.tasks.docprocessing",
"onyx.background.celery.tasks.evals",
"onyx.background.celery.tasks.hierarchyfetching",
"onyx.background.celery.tasks.hooks",
"onyx.background.celery.tasks.periodic",
"onyx.background.celery.tasks.pruning",
"onyx.background.celery.tasks.shared",

View File

@@ -14,6 +14,7 @@ from onyx.configs.constants import ONYX_CLOUD_CELERY_TASK_PREFIX
from onyx.configs.constants import OnyxCeleryPriority
from onyx.configs.constants import OnyxCeleryQueues
from onyx.configs.constants import OnyxCeleryTask
from onyx.hooks.utils import HOOKS_AVAILABLE
from shared_configs.configs import MULTI_TENANT
# choosing 15 minutes because it roughly gives us enough time to process many tasks
@@ -361,6 +362,19 @@ if not MULTI_TENANT:
tasks_to_schedule.extend(beat_task_templates)
if HOOKS_AVAILABLE:
tasks_to_schedule.append(
{
"name": "hook-execution-log-cleanup",
"task": OnyxCeleryTask.HOOK_EXECUTION_LOG_CLEANUP_TASK,
"schedule": timedelta(days=1),
"options": {
"priority": OnyxCeleryPriority.LOW,
"expires": BEAT_EXPIRES_DEFAULT,
},
}
)
def generate_cloud_tasks(
beat_tasks: list[dict], beat_templates: list[dict], beat_multiplier: float

View File

@@ -0,0 +1,35 @@
from celery import shared_task
from onyx.configs.app_configs import JOB_TIMEOUT
from onyx.configs.constants import OnyxCeleryTask
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.hook import cleanup_old_execution_logs__no_commit
from onyx.utils.logger import setup_logger
logger = setup_logger()
_HOOK_EXECUTION_LOG_RETENTION_DAYS: int = 30
@shared_task(
name=OnyxCeleryTask.HOOK_EXECUTION_LOG_CLEANUP_TASK,
ignore_result=True,
soft_time_limit=JOB_TIMEOUT,
trail=False,
)
def hook_execution_log_cleanup_task(*, tenant_id: str) -> None: # noqa: ARG001
try:
with get_session_with_current_tenant() as db_session:
deleted: int = cleanup_old_execution_logs__no_commit(
db_session=db_session,
max_age_days=_HOOK_EXECUTION_LOG_RETENTION_DAYS,
)
db_session.commit()
if deleted:
logger.info(
f"Deleted {deleted} hook execution log(s) older than "
f"{_HOOK_EXECUTION_LOG_RETENTION_DAYS} days."
)
except Exception:
logger.exception("Failed to clean up hook execution logs")
raise

View File

@@ -297,7 +297,9 @@ class PostgresCacheBackend(CacheBackend):
def _lock_id_for(self, name: str) -> int:
"""Map *name* to a 64-bit signed int for ``pg_advisory_lock``."""
h = hashlib.md5(f"{self._tenant_id}:{name}".encode()).digest()
h = hashlib.md5(
f"{self._tenant_id}:{name}".encode(), usedforsecurity=False
).digest()
return struct.unpack("q", h[:8])[0]

View File

@@ -318,8 +318,16 @@ VERIFY_CREATE_OPENSEARCH_INDEX_ON_INIT_MT = (
OPENSEARCH_MIGRATION_GET_VESPA_CHUNKS_PAGE_SIZE = int(
os.environ.get("OPENSEARCH_MIGRATION_GET_VESPA_CHUNKS_PAGE_SIZE") or 500
)
OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES = int(
os.environ.get("OPENSEARCH_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES") or 0
# If set, will override the default number of shards and replicas for the index.
OPENSEARCH_INDEX_NUM_SHARDS: int | None = (
int(os.environ["OPENSEARCH_INDEX_NUM_SHARDS"])
if os.environ.get("OPENSEARCH_INDEX_NUM_SHARDS", None) is not None
else None
)
OPENSEARCH_INDEX_NUM_REPLICAS: int | None = (
int(os.environ["OPENSEARCH_INDEX_NUM_REPLICAS"])
if os.environ.get("OPENSEARCH_INDEX_NUM_REPLICAS", None) is not None
else None
)
VESPA_HOST = os.environ.get("VESPA_HOST") or "localhost"

View File

@@ -597,6 +597,9 @@ class OnyxCeleryTask:
EXPORT_QUERY_HISTORY_TASK = "export_query_history_task"
EXPORT_QUERY_HISTORY_CLEANUP_TASK = "export_query_history_cleanup_task"
# Hook execution log retention
HOOK_EXECUTION_LOG_CLEANUP_TASK = "hook_execution_log_cleanup_task"
# Sandbox cleanup
CLEANUP_IDLE_SANDBOXES = "cleanup_idle_sandboxes"
CLEANUP_OLD_SNAPSHOTS = "cleanup_old_snapshots"

233
backend/onyx/db/hook.py Normal file
View File

@@ -0,0 +1,233 @@
import datetime
from uuid import UUID
from sqlalchemy import delete
from sqlalchemy import select
from sqlalchemy.engine import CursorResult
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import selectinload
from sqlalchemy.orm import Session
from onyx.db.constants import UNSET
from onyx.db.constants import UnsetType
from onyx.db.enums import HookFailStrategy
from onyx.db.enums import HookPoint
from onyx.db.models import Hook
from onyx.db.models import HookExecutionLog
from onyx.error_handling.error_codes import OnyxErrorCode
from onyx.error_handling.exceptions import OnyxError
# ── Hook CRUD ────────────────────────────────────────────────────────────
def get_hook_by_id(
*,
db_session: Session,
hook_id: int,
include_deleted: bool = False,
include_creator: bool = False,
) -> Hook | None:
stmt = select(Hook).where(Hook.id == hook_id)
if not include_deleted:
stmt = stmt.where(Hook.deleted.is_(False))
if include_creator:
stmt = stmt.options(selectinload(Hook.creator))
return db_session.scalar(stmt)
def get_non_deleted_hook_by_hook_point(
*,
db_session: Session,
hook_point: HookPoint,
include_creator: bool = False,
) -> Hook | None:
stmt = (
select(Hook).where(Hook.hook_point == hook_point).where(Hook.deleted.is_(False))
)
if include_creator:
stmt = stmt.options(selectinload(Hook.creator))
return db_session.scalar(stmt)
def get_hooks(
*,
db_session: Session,
include_deleted: bool = False,
include_creator: bool = False,
) -> list[Hook]:
stmt = select(Hook)
if not include_deleted:
stmt = stmt.where(Hook.deleted.is_(False))
if include_creator:
stmt = stmt.options(selectinload(Hook.creator))
stmt = stmt.order_by(Hook.hook_point, Hook.created_at.desc())
return list(db_session.scalars(stmt).all())
def create_hook__no_commit(
*,
db_session: Session,
name: str,
hook_point: HookPoint,
endpoint_url: str | None = None,
api_key: str | None = None,
fail_strategy: HookFailStrategy,
timeout_seconds: float,
is_active: bool = False,
creator_id: UUID | None = None,
) -> Hook:
"""Create a new hook for the given hook point.
At most one non-deleted hook per hook point is allowed. Raises
OnyxError(CONFLICT) if a hook already exists, including under concurrent
duplicate creates where the partial unique index fires an IntegrityError.
"""
existing = get_non_deleted_hook_by_hook_point(
db_session=db_session, hook_point=hook_point
)
if existing:
raise OnyxError(
OnyxErrorCode.CONFLICT,
f"A hook for '{hook_point.value}' already exists (id={existing.id}).",
)
hook = Hook(
name=name,
hook_point=hook_point,
endpoint_url=endpoint_url,
api_key=api_key,
fail_strategy=fail_strategy,
timeout_seconds=timeout_seconds,
is_active=is_active,
creator_id=creator_id,
)
# Use a savepoint so that a failed insert only rolls back this operation,
# not the entire outer transaction.
savepoint = db_session.begin_nested()
try:
db_session.add(hook)
savepoint.commit()
except IntegrityError as exc:
savepoint.rollback()
if "ix_hook_one_non_deleted_per_point" in str(exc.orig):
raise OnyxError(
OnyxErrorCode.CONFLICT,
f"A hook for '{hook_point.value}' already exists.",
)
raise # re-raise unrelated integrity errors (FK violations, etc.)
return hook
def update_hook__no_commit(
*,
db_session: Session,
hook_id: int,
name: str | None = None,
endpoint_url: str | None | UnsetType = UNSET,
api_key: str | None | UnsetType = UNSET,
fail_strategy: HookFailStrategy | None = None,
timeout_seconds: float | None = None,
is_active: bool | None = None,
is_reachable: bool | None = None,
include_creator: bool = False,
) -> Hook:
"""Update hook fields.
Sentinel conventions:
- endpoint_url, api_key: pass UNSET to leave unchanged; pass None to clear.
- name, fail_strategy, timeout_seconds, is_active, is_reachable: pass None to leave unchanged.
"""
hook = get_hook_by_id(
db_session=db_session, hook_id=hook_id, include_creator=include_creator
)
if hook is None:
raise OnyxError(OnyxErrorCode.NOT_FOUND, f"Hook with id {hook_id} not found.")
if name is not None:
hook.name = name
if not isinstance(endpoint_url, UnsetType):
hook.endpoint_url = endpoint_url
if not isinstance(api_key, UnsetType):
hook.api_key = api_key # type: ignore[assignment] # EncryptedString coerces str → SensitiveValue at the ORM level
if fail_strategy is not None:
hook.fail_strategy = fail_strategy
if timeout_seconds is not None:
hook.timeout_seconds = timeout_seconds
if is_active is not None:
hook.is_active = is_active
if is_reachable is not None:
hook.is_reachable = is_reachable
db_session.flush()
return hook
def delete_hook__no_commit(
*,
db_session: Session,
hook_id: int,
) -> None:
hook = get_hook_by_id(db_session=db_session, hook_id=hook_id)
if hook is None:
raise OnyxError(OnyxErrorCode.NOT_FOUND, f"Hook with id {hook_id} not found.")
hook.deleted = True
hook.is_active = False
db_session.flush()
# ── HookExecutionLog CRUD ────────────────────────────────────────────────
def create_hook_execution_log__no_commit(
*,
db_session: Session,
hook_id: int,
is_success: bool,
error_message: str | None = None,
status_code: int | None = None,
duration_ms: int | None = None,
) -> HookExecutionLog:
log = HookExecutionLog(
hook_id=hook_id,
is_success=is_success,
error_message=error_message,
status_code=status_code,
duration_ms=duration_ms,
)
db_session.add(log)
db_session.flush()
return log
def get_hook_execution_logs(
*,
db_session: Session,
hook_id: int,
limit: int,
) -> list[HookExecutionLog]:
stmt = (
select(HookExecutionLog)
.where(HookExecutionLog.hook_id == hook_id)
.order_by(HookExecutionLog.created_at.desc())
.limit(limit)
)
return list(db_session.scalars(stmt).all())
def cleanup_old_execution_logs__no_commit(
*,
db_session: Session,
max_age_days: int,
) -> int:
"""Delete execution logs older than max_age_days. Returns the number of rows deleted."""
cutoff = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(
days=max_age_days
)
result: CursorResult = db_session.execute( # type: ignore[assignment]
delete(HookExecutionLog)
.where(HookExecutionLog.created_at < cutoff)
.execution_options(synchronize_session=False)
)
return result.rowcount

View File

@@ -18,6 +18,7 @@ from onyx.configs.app_configs import OPENSEARCH_HOST
from onyx.configs.app_configs import OPENSEARCH_REST_API_PORT
from onyx.document_index.interfaces_new import TenantState
from onyx.document_index.opensearch.schema import DocumentChunk
from onyx.document_index.opensearch.schema import DocumentChunkWithoutVectors
from onyx.document_index.opensearch.schema import get_opensearch_doc_chunk_id
from onyx.document_index.opensearch.search import DEFAULT_OPENSEARCH_MAX_RESULT_WINDOW
from onyx.utils.logger import setup_logger
@@ -56,8 +57,8 @@ class SearchHit(BaseModel, Generic[SchemaDocumentModel]):
# Maps schema property name to a list of highlighted snippets with match
# terms wrapped in tags (e.g. "something <hi>keyword</hi> other thing").
match_highlights: dict[str, list[str]] = {}
# Score explanation from OpenSearch when "explain": true is set in the query.
# Contains detailed breakdown of how the score was calculated.
# Score explanation from OpenSearch when "explain": true is set in the
# query. Contains detailed breakdown of how the score was calculated.
explanation: dict[str, Any] | None = None
@@ -833,9 +834,13 @@ class OpenSearchIndexClient(OpenSearchClient):
@log_function_time(print_only=True, debug_only=True)
def search(
self, body: dict[str, Any], search_pipeline_id: str | None
) -> list[SearchHit[DocumentChunk]]:
) -> list[SearchHit[DocumentChunkWithoutVectors]]:
"""Searches the index.
NOTE: Does not return vector fields. In order to take advantage of
performance benefits, the search body should exclude the schema's vector
fields.
TODO(andrei): Ideally we could check that every field in the body is
present in the index, to avoid a class of runtime bugs that could easily
be caught during development. Or change the function signature to accept
@@ -883,7 +888,7 @@ class OpenSearchIndexClient(OpenSearchClient):
raise_on_timeout=True,
)
search_hits: list[SearchHit[DocumentChunk]] = []
search_hits: list[SearchHit[DocumentChunkWithoutVectors]] = []
for hit in hits:
document_chunk_source: dict[str, Any] | None = hit.get("_source")
if not document_chunk_source:
@@ -893,8 +898,10 @@ class OpenSearchIndexClient(OpenSearchClient):
document_chunk_score = hit.get("_score", None)
match_highlights: dict[str, list[str]] = hit.get("highlight", {})
explanation: dict[str, Any] | None = hit.get("_explanation", None)
search_hit = SearchHit[DocumentChunk](
document_chunk=DocumentChunk.model_validate(document_chunk_source),
search_hit = SearchHit[DocumentChunkWithoutVectors](
document_chunk=DocumentChunkWithoutVectors.model_validate(
document_chunk_source
),
score=document_chunk_score,
match_highlights=match_highlights,
explanation=explanation,

View File

@@ -3,10 +3,6 @@
import os
from enum import Enum
from onyx.configs.app_configs import (
OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
)
DEFAULT_MAX_CHUNK_SIZE = 512
@@ -41,10 +37,10 @@ M = 32 # Set relatively high for better accuracy.
# we have a much higher chance of all 10 of the final desired docs showing up
# and getting scored. In worse situations, the final 10 docs don't even show up
# as the final 10 (worse than just a miss at the reranking step).
DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES = (
OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES
if OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES > 0
else 750
# Defaults to 100 for now. Initially this defaulted to 750 but we were seeing
# poor search performance.
DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES = int(
os.environ.get("DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES", 100)
)
# Number of vectors to examine to decide the top k neighbors for the HNSW
@@ -54,7 +50,7 @@ DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES = (
# larger than k, you can provide the size parameter to limit the final number of
# results to k." from
# https://docs.opensearch.org/latest/query-dsl/specialized/k-nn/index/#ef_search
EF_SEARCH = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES
EF_SEARCH = DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES
class HybridSearchSubqueryConfiguration(Enum):

View File

@@ -47,6 +47,7 @@ from onyx.document_index.opensearch.schema import ACCESS_CONTROL_LIST_FIELD_NAME
from onyx.document_index.opensearch.schema import CONTENT_FIELD_NAME
from onyx.document_index.opensearch.schema import DOCUMENT_SETS_FIELD_NAME
from onyx.document_index.opensearch.schema import DocumentChunk
from onyx.document_index.opensearch.schema import DocumentChunkWithoutVectors
from onyx.document_index.opensearch.schema import DocumentSchema
from onyx.document_index.opensearch.schema import get_opensearch_doc_chunk_id
from onyx.document_index.opensearch.schema import GLOBAL_BOOST_FIELD_NAME
@@ -117,7 +118,7 @@ def set_cluster_state(client: OpenSearchClient) -> None:
def _convert_retrieved_opensearch_chunk_to_inference_chunk_uncleaned(
chunk: DocumentChunk,
chunk: DocumentChunkWithoutVectors,
score: float | None,
highlights: dict[str, list[str]],
) -> InferenceChunkUncleaned:
@@ -880,7 +881,7 @@ class OpenSearchDocumentIndex(DocumentIndex):
)
results: list[InferenceChunk] = []
for chunk_request in chunk_requests:
search_hits: list[SearchHit[DocumentChunk]] = []
search_hits: list[SearchHit[DocumentChunkWithoutVectors]] = []
query_body = DocumentQuery.get_from_document_id_query(
document_id=chunk_request.document_id,
tenant_state=self._tenant_state,
@@ -944,7 +945,7 @@ class OpenSearchDocumentIndex(DocumentIndex):
include_hidden=False,
)
normalization_pipeline_name, _ = get_normalization_pipeline_name_and_config()
search_hits: list[SearchHit[DocumentChunk]] = self._client.search(
search_hits: list[SearchHit[DocumentChunkWithoutVectors]] = self._client.search(
body=query_body,
search_pipeline_id=normalization_pipeline_name,
)
@@ -976,7 +977,7 @@ class OpenSearchDocumentIndex(DocumentIndex):
index_filters=filters,
num_to_retrieve=num_to_retrieve,
)
search_hits: list[SearchHit[DocumentChunk]] = self._client.search(
search_hits: list[SearchHit[DocumentChunkWithoutVectors]] = self._client.search(
body=query_body,
search_pipeline_id=None,
)

View File

@@ -11,6 +11,8 @@ from pydantic import model_serializer
from pydantic import model_validator
from pydantic import SerializerFunctionWrapHandler
from onyx.configs.app_configs import OPENSEARCH_INDEX_NUM_REPLICAS
from onyx.configs.app_configs import OPENSEARCH_INDEX_NUM_SHARDS
from onyx.configs.app_configs import OPENSEARCH_TEXT_ANALYZER
from onyx.configs.app_configs import USING_AWS_MANAGED_OPENSEARCH
from onyx.document_index.interfaces_new import TenantState
@@ -100,9 +102,9 @@ def set_or_convert_timezone_to_utc(value: datetime) -> datetime:
return value
class DocumentChunk(BaseModel):
class DocumentChunkWithoutVectors(BaseModel):
"""
Represents a chunk of a document in the OpenSearch index.
Represents a chunk of a document in the OpenSearch index without vectors.
The names of these fields are based on the OpenSearch schema. Changes to the
schema require changes here. See get_document_schema.
@@ -124,9 +126,7 @@ class DocumentChunk(BaseModel):
# Either both should be None or both should be non-None.
title: str | None = None
title_vector: list[float] | None = None
content: str
content_vector: list[float]
source_type: str
# A list of key-value pairs separated by INDEX_SEPARATOR. See
@@ -176,19 +176,9 @@ class DocumentChunk(BaseModel):
def __str__(self) -> str:
return (
f"DocumentChunk(document_id={self.document_id}, chunk_index={self.chunk_index}, "
f"content length={len(self.content)}, content vector length={len(self.content_vector)}, "
f"tenant_id={self.tenant_id.tenant_id})"
f"content length={len(self.content)}, tenant_id={self.tenant_id.tenant_id})."
)
@model_validator(mode="after")
def check_title_and_title_vector_are_consistent(self) -> Self:
# title and title_vector should both either be None or not.
if self.title is not None and self.title_vector is None:
raise ValueError("Bug: Title vector must not be None if title is not None.")
if self.title_vector is not None and self.title is None:
raise ValueError("Bug: Title must not be None if title vector is not None.")
return self
@model_serializer(mode="wrap")
def serialize_model(
self, handler: SerializerFunctionWrapHandler
@@ -305,6 +295,35 @@ class DocumentChunk(BaseModel):
return TenantState(tenant_id=value, multitenant=MULTI_TENANT)
class DocumentChunk(DocumentChunkWithoutVectors):
"""Represents a chunk of a document in the OpenSearch index.
The names of these fields are based on the OpenSearch schema. Changes to the
schema require changes here. See get_document_schema.
"""
model_config = {"frozen": True}
title_vector: list[float] | None = None
content_vector: list[float]
def __str__(self) -> str:
return (
f"DocumentChunk(document_id={self.document_id}, chunk_index={self.chunk_index}, "
f"content length={len(self.content)}, content vector length={len(self.content_vector)}, "
f"tenant_id={self.tenant_id.tenant_id})"
)
@model_validator(mode="after")
def check_title_and_title_vector_are_consistent(self) -> Self:
# title and title_vector should both either be None or not.
if self.title is not None and self.title_vector is None:
raise ValueError("Bug: Title vector must not be None if title is not None.")
if self.title_vector is not None and self.title is None:
raise ValueError("Bug: Title must not be None if title vector is not None.")
return self
class DocumentSchema:
"""
Represents the schema and indexing strategies of the OpenSearch index.
@@ -516,78 +535,35 @@ class DocumentSchema:
return schema
@staticmethod
def get_index_settings() -> dict[str, Any]:
"""
Standard settings for reasonable local index and search performance.
"""
return {
"index": {
"number_of_shards": 1,
"number_of_replicas": 1,
# Required for vector search.
"knn": True,
"knn.algo_param.ef_search": EF_SEARCH,
}
}
@staticmethod
def get_index_settings_for_aws_managed_opensearch_st_dev() -> dict[str, Any]:
"""
Settings for AWS-managed OpenSearch.
Our AWS-managed OpenSearch cluster has 3 data nodes in 3 availability
zones.
- We use 3 shards to distribute load across all data nodes.
- We use 2 replicas to ensure each shard has a copy in each
availability zone. This is a hard requirement from AWS. The number
of data copies, including the primary (not a replica) copy, must be
divisible by the number of AZs.
"""
return {
"index": {
"number_of_shards": 3,
"number_of_replicas": 2,
# Required for vector search.
"knn": True,
"knn.algo_param.ef_search": EF_SEARCH,
}
}
@staticmethod
def get_index_settings_for_aws_managed_opensearch_mt_cloud() -> dict[str, Any]:
"""
Settings for AWS-managed OpenSearch in multi-tenant cloud.
324 shards very roughly targets a storage load of ~30Gb per shard, which
according to AWS OpenSearch documentation is within a good target range.
As documented above we need 2 replicas for a total of 3 copies of the
data because the cluster is configured with 3-AZ awareness.
"""
return {
"index": {
"number_of_shards": 324,
"number_of_replicas": 2,
# Required for vector search.
"knn": True,
"knn.algo_param.ef_search": EF_SEARCH,
}
}
@staticmethod
def get_index_settings_based_on_environment() -> dict[str, Any]:
"""
Returns the index settings based on the environment.
"""
if USING_AWS_MANAGED_OPENSEARCH:
# NOTE: The number of data copies, including the primary (not a
# replica) copy, must be divisible by the number of AZs.
if MULTI_TENANT:
return (
DocumentSchema.get_index_settings_for_aws_managed_opensearch_mt_cloud()
)
number_of_shards = 324
number_of_replicas = 2
else:
return (
DocumentSchema.get_index_settings_for_aws_managed_opensearch_st_dev()
)
number_of_shards = 3
number_of_replicas = 2
else:
return DocumentSchema.get_index_settings()
number_of_shards = 1
number_of_replicas = 1
if OPENSEARCH_INDEX_NUM_SHARDS is not None:
number_of_shards = OPENSEARCH_INDEX_NUM_SHARDS
if OPENSEARCH_INDEX_NUM_REPLICAS is not None:
number_of_replicas = OPENSEARCH_INDEX_NUM_REPLICAS
return {
"index": {
"number_of_shards": number_of_shards,
"number_of_replicas": number_of_replicas,
# Required for vector search.
"knn": True,
"knn.algo_param.ef_search": EF_SEARCH,
}
}

View File

@@ -15,7 +15,7 @@ from onyx.context.search.models import Tag
from onyx.document_index.interfaces_new import TenantState
from onyx.document_index.opensearch.constants import ASSUMED_DOCUMENT_AGE_DAYS
from onyx.document_index.opensearch.constants import (
DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES,
)
from onyx.document_index.opensearch.constants import (
DEFAULT_OPENSEARCH_MAX_RESULT_WINDOW,
@@ -235,9 +235,17 @@ class DocumentQuery:
# returning some number of results less than the index max allowed
# return size.
"size": DEFAULT_OPENSEARCH_MAX_RESULT_WINDOW,
"_source": get_full_document,
# By default exclude retrieving the vector fields in order to save
# on retrieval cost as we don't need them upstream.
"_source": {
"excludes": [TITLE_VECTOR_FIELD_NAME, CONTENT_VECTOR_FIELD_NAME]
},
"timeout": f"{DEFAULT_OPENSEARCH_QUERY_TIMEOUT_S}s",
}
if not get_full_document:
# If we explicitly do not want the underlying document, we will only
# retrieve IDs.
final_get_ids_query["_source"] = False
if not OPENSEARCH_PROFILING_DISABLED:
final_get_ids_query["profile"] = True
@@ -332,7 +340,7 @@ class DocumentQuery:
# TODO(andrei, yuhong): We can tune this more dynamically based on
# num_hits.
max_results_per_subquery = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES
max_results_per_subquery = DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES
hybrid_search_subqueries = DocumentQuery._get_hybrid_search_subqueries(
query_text, query_vector, vector_candidates=max_results_per_subquery
@@ -387,6 +395,11 @@ class DocumentQuery:
"size": num_hits,
"highlight": match_highlights_configuration,
"timeout": f"{DEFAULT_OPENSEARCH_QUERY_TIMEOUT_S}s",
# Exclude retrieving the vector fields in order to save on
# retrieval cost as we don't need them upstream.
"_source": {
"excludes": [TITLE_VECTOR_FIELD_NAME, CONTENT_VECTOR_FIELD_NAME]
},
}
# Explain is for scoring breakdowns.
@@ -446,6 +459,11 @@ class DocumentQuery:
},
"size": num_to_retrieve,
"timeout": f"{DEFAULT_OPENSEARCH_QUERY_TIMEOUT_S}s",
# Exclude retrieving the vector fields in order to save on
# retrieval cost as we don't need them upstream.
"_source": {
"excludes": [TITLE_VECTOR_FIELD_NAME, CONTENT_VECTOR_FIELD_NAME]
},
}
if not OPENSEARCH_PROFILING_DISABLED:
final_random_search_query["profile"] = True
@@ -460,7 +478,7 @@ class DocumentQuery:
# search. This is higher than the number of results because the scoring
# is hybrid. For a detailed breakdown, see where the default value is
# set.
vector_candidates: int = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
vector_candidates: int = DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES,
) -> list[dict[str, Any]]:
"""Returns subqueries for hybrid search.
@@ -546,7 +564,7 @@ class DocumentQuery:
@staticmethod
def _get_title_vector_similarity_search_query(
query_vector: list[float],
vector_candidates: int = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
vector_candidates: int = DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES,
) -> dict[str, Any]:
return {
"knn": {
@@ -560,7 +578,7 @@ class DocumentQuery:
@staticmethod
def _get_content_vector_similarity_search_query(
query_vector: list[float],
vector_candidates: int = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
vector_candidates: int = DEFAULT_NUM_HYBRID_SUBQUERY_CANDIDATES,
) -> dict[str, Any]:
return {
"knn": {

View File

@@ -88,6 +88,7 @@ class OnyxErrorCode(Enum):
SERVICE_UNAVAILABLE = ("SERVICE_UNAVAILABLE", 503)
BAD_GATEWAY = ("BAD_GATEWAY", 502)
LLM_PROVIDER_ERROR = ("LLM_PROVIDER_ERROR", 502)
HOOK_EXECUTION_FAILED = ("HOOK_EXECUTION_FAILED", 502)
GATEWAY_TIMEOUT = ("GATEWAY_TIMEOUT", 504)
def __init__(self, code: str, status_code: int) -> None:

View File

@@ -0,0 +1,311 @@
"""Hook executor — calls a customer's external HTTP endpoint for a given hook point.
Usage (Celery tasks and FastAPI handlers):
result = execute_hook(
db_session=db_session,
hook_point=HookPoint.QUERY_PROCESSING,
payload={"query": "...", "user_email": "...", "chat_session_id": "..."},
)
if isinstance(result, HookSkipped):
# no active hook configured — continue with original behavior
...
elif isinstance(result, HookSoftFailed):
# hook failed but fail strategy is SOFT — continue with original behavior
...
else:
# result is the response payload dict from the customer's endpoint
...
DB session design
-----------------
The executor uses three sessions:
1. Caller's session (db_session) — used only for the hook lookup read. All
needed fields are extracted from the Hook object before the HTTP call, so
the caller's session is not held open during the external HTTP request.
2. Log session — a separate short-lived session opened after the HTTP call
completes to write the HookExecutionLog row on failure. Success runs are
not recorded. Committed independently of everything else.
3. Reachable session — a second short-lived session to update is_reachable on
the Hook. Kept separate from the log session so a concurrent hook deletion
(which causes update_hook__no_commit to raise OnyxError(NOT_FOUND)) cannot
prevent the execution log from being written. This update is best-effort.
"""
import json
import time
from typing import Any
import httpx
from pydantic import BaseModel
from sqlalchemy.orm import Session
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.enums import HookFailStrategy
from onyx.db.enums import HookPoint
from onyx.db.hook import create_hook_execution_log__no_commit
from onyx.db.hook import get_non_deleted_hook_by_hook_point
from onyx.db.hook import update_hook__no_commit
from onyx.db.models import Hook
from onyx.error_handling.error_codes import OnyxErrorCode
from onyx.error_handling.exceptions import OnyxError
from onyx.hooks.utils import HOOKS_AVAILABLE
from onyx.utils.logger import setup_logger
logger = setup_logger()
class HookSkipped:
"""No active hook configured for this hook point."""
class HookSoftFailed:
"""Hook was called but failed with SOFT fail strategy — continuing."""
# ---------------------------------------------------------------------------
# Private helpers
# ---------------------------------------------------------------------------
class _HttpOutcome(BaseModel):
"""Structured result of an HTTP hook call, returned by _process_response."""
is_success: bool
updated_is_reachable: (
bool | None
) # True/False = write to DB, None = unchanged (skip write)
status_code: int | None
error_message: str | None
response_payload: dict[str, Any] | None
def _lookup_hook(
db_session: Session,
hook_point: HookPoint,
) -> Hook | HookSkipped:
"""Return the active Hook or HookSkipped if hooks are unavailable/unconfigured.
No HTTP call is made and no DB writes are performed for any HookSkipped path.
There is nothing to log and no reachability information to update.
"""
if not HOOKS_AVAILABLE:
return HookSkipped()
hook = get_non_deleted_hook_by_hook_point(
db_session=db_session, hook_point=hook_point
)
if hook is None or not hook.is_active:
return HookSkipped()
if not hook.endpoint_url:
return HookSkipped()
return hook
def _process_response(
*,
response: httpx.Response | None,
exc: Exception | None,
timeout: float,
) -> _HttpOutcome:
"""Process the result of an HTTP call and return a structured outcome.
Called after the client.post() try/except. If post() raised, exc is set and
response is None. Otherwise response is set and exc is None. Handles
raise_for_status(), JSON decoding, and the dict shape check.
"""
if exc is not None:
if isinstance(exc, httpx.ConnectError):
msg = f"Hook endpoint unreachable: {exc}"
logger.warning(msg, exc_info=exc)
return _HttpOutcome(
is_success=False,
updated_is_reachable=False,
status_code=None,
error_message=msg,
response_payload=None,
)
if isinstance(exc, httpx.TimeoutException):
msg = f"Hook timed out after {timeout}s: {exc}"
logger.warning(msg, exc_info=exc)
return _HttpOutcome(
is_success=False,
updated_is_reachable=True,
status_code=None,
error_message=msg,
response_payload=None,
)
msg = f"Hook call failed: {exc}"
logger.exception(msg, exc_info=exc)
return _HttpOutcome(
is_success=False,
updated_is_reachable=True,
status_code=None,
error_message=msg,
response_payload=None,
)
if response is None:
raise ValueError(
"exactly one of response or exc must be non-None; both are None"
)
status_code = response.status_code
try:
response.raise_for_status()
except httpx.HTTPStatusError as e:
msg = f"Hook returned HTTP {e.response.status_code}: {e.response.text}"
logger.warning(msg, exc_info=e)
return _HttpOutcome(
is_success=False,
updated_is_reachable=True,
status_code=status_code,
error_message=msg,
response_payload=None,
)
try:
response_payload = response.json()
except (json.JSONDecodeError, httpx.DecodingError) as e:
msg = f"Hook returned non-JSON response: {e}"
logger.warning(msg, exc_info=e)
return _HttpOutcome(
is_success=False,
updated_is_reachable=True,
status_code=status_code,
error_message=msg,
response_payload=None,
)
if not isinstance(response_payload, dict):
msg = f"Hook returned non-dict JSON (got {type(response_payload).__name__})"
logger.warning(msg)
return _HttpOutcome(
is_success=False,
updated_is_reachable=True,
status_code=status_code,
error_message=msg,
response_payload=None,
)
return _HttpOutcome(
is_success=True,
updated_is_reachable=True,
status_code=status_code,
error_message=None,
response_payload=response_payload,
)
def _persist_result(
*,
hook_id: int,
outcome: _HttpOutcome,
duration_ms: int,
) -> None:
"""Write the execution log on failure and optionally update is_reachable, each
in its own session so a failure in one does not affect the other."""
# Only write the execution log on failure — success runs are not recorded.
# Must not be skipped if the is_reachable update fails (e.g. hook concurrently
# deleted between the initial lookup and here).
if not outcome.is_success:
try:
with get_session_with_current_tenant() as log_session:
create_hook_execution_log__no_commit(
db_session=log_session,
hook_id=hook_id,
is_success=False,
error_message=outcome.error_message,
status_code=outcome.status_code,
duration_ms=duration_ms,
)
log_session.commit()
except Exception:
logger.exception(
f"Failed to persist hook execution log for hook_id={hook_id}"
)
# Update is_reachable separately — best-effort, non-critical.
# None means the value is unchanged (set by the caller to skip the no-op write).
# update_hook__no_commit can raise OnyxError(NOT_FOUND) if the hook was
# concurrently deleted, so keep this isolated from the log write above.
if outcome.updated_is_reachable is not None:
try:
with get_session_with_current_tenant() as reachable_session:
update_hook__no_commit(
db_session=reachable_session,
hook_id=hook_id,
is_reachable=outcome.updated_is_reachable,
)
reachable_session.commit()
except Exception:
logger.warning(f"Failed to update is_reachable for hook_id={hook_id}")
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def execute_hook(
*,
db_session: Session,
hook_point: HookPoint,
payload: dict[str, Any],
) -> dict[str, Any] | HookSkipped | HookSoftFailed:
"""Execute the hook for the given hook point synchronously."""
hook = _lookup_hook(db_session, hook_point)
if isinstance(hook, HookSkipped):
return hook
timeout = hook.timeout_seconds
hook_id = hook.id
fail_strategy = hook.fail_strategy
endpoint_url = hook.endpoint_url
current_is_reachable: bool | None = hook.is_reachable
if not endpoint_url:
raise ValueError(
f"hook_id={hook_id} is active but has no endpoint_url — "
"active hooks without an endpoint_url must be rejected by _lookup_hook"
)
start = time.monotonic()
response: httpx.Response | None = None
exc: Exception | None = None
try:
api_key: str | None = (
hook.api_key.get_value(apply_mask=False) if hook.api_key else None
)
headers: dict[str, str] = {"Content-Type": "application/json"}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
with httpx.Client(timeout=timeout) as client:
response = client.post(endpoint_url, json=payload, headers=headers)
except Exception as e:
exc = e
duration_ms = int((time.monotonic() - start) * 1000)
outcome = _process_response(response=response, exc=exc, timeout=timeout)
# Skip the is_reachable write when the value would not change — avoids a
# no-op DB round-trip on every call when the hook is already in the expected state.
if outcome.updated_is_reachable == current_is_reachable:
outcome = outcome.model_copy(update={"updated_is_reachable": None})
_persist_result(hook_id=hook_id, outcome=outcome, duration_ms=duration_ms)
if not outcome.is_success:
if fail_strategy == HookFailStrategy.HARD:
raise OnyxError(
OnyxErrorCode.HOOK_EXECUTION_FAILED,
outcome.error_message or "Hook execution failed.",
)
logger.warning(
f"Hook execution failed (soft fail) for hook_id={hook_id}: {outcome.error_message}"
)
return HookSoftFailed()
if outcome.response_payload is None:
raise ValueError(
f"response_payload is None for successful hook call (hook_id={hook_id})"
)
return outcome.response_payload

View File

@@ -0,0 +1,5 @@
from onyx.configs.app_configs import HOOK_ENABLED
from shared_configs.configs import MULTI_TENANT
# True only when hooks are available: single-tenant deployment with HOOK_ENABLED=true.
HOOKS_AVAILABLE: bool = HOOK_ENABLED and not MULTI_TENANT

View File

@@ -479,7 +479,9 @@ def is_zip_file(file: UploadFile) -> bool:
def upload_files(
files: list[UploadFile], file_origin: FileOrigin = FileOrigin.CONNECTOR
files: list[UploadFile],
file_origin: FileOrigin = FileOrigin.CONNECTOR,
unzip: bool = True,
) -> FileUploadResponse:
# Skip directories and known macOS metadata entries
@@ -502,31 +504,46 @@ def upload_files(
if seen_zip:
raise HTTPException(status_code=400, detail=SEEN_ZIP_DETAIL)
seen_zip = True
# Validate the zip by opening it (catches corrupt/non-zip files)
with zipfile.ZipFile(file.file, "r") as zf:
zip_metadata_file_id = save_zip_metadata_to_file_store(
zf, file_store
)
for file_info in zf.namelist():
if zf.getinfo(file_info).is_dir():
continue
if not should_process_file(file_info):
continue
sub_file_bytes = zf.read(file_info)
mime_type, __ = mimetypes.guess_type(file_info)
if mime_type is None:
mime_type = "application/octet-stream"
file_id = file_store.save_file(
content=BytesIO(sub_file_bytes),
display_name=os.path.basename(file_info),
file_origin=file_origin,
file_type=mime_type,
if unzip:
zip_metadata_file_id = save_zip_metadata_to_file_store(
zf, file_store
)
deduped_file_paths.append(file_id)
deduped_file_names.append(os.path.basename(file_info))
for file_info in zf.namelist():
if zf.getinfo(file_info).is_dir():
continue
if not should_process_file(file_info):
continue
sub_file_bytes = zf.read(file_info)
mime_type, __ = mimetypes.guess_type(file_info)
if mime_type is None:
mime_type = "application/octet-stream"
file_id = file_store.save_file(
content=BytesIO(sub_file_bytes),
display_name=os.path.basename(file_info),
file_origin=file_origin,
file_type=mime_type,
)
deduped_file_paths.append(file_id)
deduped_file_names.append(os.path.basename(file_info))
continue
# Store the zip as-is (unzip=False)
file.file.seek(0)
file_id = file_store.save_file(
content=file.file,
display_name=file.filename,
file_origin=file_origin,
file_type=file.content_type or "application/zip",
)
deduped_file_paths.append(file_id)
deduped_file_names.append(file.filename)
continue
# Since we can't render docx files in the UI,
@@ -613,9 +630,10 @@ def _fetch_and_check_file_connector_cc_pair_permissions(
@router.post("/admin/connector/file/upload", tags=PUBLIC_API_TAGS)
def upload_files_api(
files: list[UploadFile],
unzip: bool = True,
_: User = Depends(current_curator_or_admin_user),
) -> FileUploadResponse:
return upload_files(files, FileOrigin.OTHER)
return upload_files(files, FileOrigin.OTHER, unzip=unzip)
@router.get("/admin/connector/{connector_id}/files", tags=PUBLIC_API_TAGS)

View File

@@ -74,7 +74,7 @@ def make_structured_onyx_request_id(prefix: str, request_url: str) -> str:
def _make_onyx_request_id(prefix: str, hash_input: str) -> str:
"""helper function to return an id given a string input"""
hash_obj = hashlib.md5(hash_input.encode("utf-8"))
hash_obj = hashlib.md5(hash_input.encode("utf-8"), usedforsecurity=False)
hash_bytes = hash_obj.digest()[:6] # Truncate to 6 bytes
# 6 bytes becomes 8 bytes. we shouldn't need to strip but just in case

View File

@@ -752,7 +752,7 @@ pypandoc-binary==1.16.2
# via onyx
pyparsing==3.2.5
# via httplib2
pypdf==6.8.0
pypdf==6.9.1
# via
# onyx
# unstructured-client

View File

@@ -297,6 +297,10 @@ def index_batch_params(
class TestDocumentIndexOld:
"""Tests the old DocumentIndex interface."""
# TODO(ENG-3864)(andrei): Re-enable this test.
@pytest.mark.xfail(
reason="Flaky test: Retrieved chunks vary non-deterministically before and after changing user projects and personas. Likely a timing issue with the index being updated."
)
def test_update_single_can_clear_user_projects_and_personas(
self,
document_indices: list[DocumentIndex],

View File

@@ -29,6 +29,7 @@ from onyx.document_index.opensearch.opensearch_document_index import (
)
from onyx.document_index.opensearch.schema import CONTENT_FIELD_NAME
from onyx.document_index.opensearch.schema import DocumentChunk
from onyx.document_index.opensearch.schema import DocumentChunkWithoutVectors
from onyx.document_index.opensearch.schema import DocumentSchema
from onyx.document_index.opensearch.schema import get_opensearch_doc_chunk_id
from onyx.document_index.opensearch.search import DocumentQuery
@@ -226,7 +227,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=True
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
# Under test.
# Should not raise.
@@ -242,7 +243,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=True
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Under test.
@@ -271,7 +272,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=True
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
@@ -285,7 +286,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=True
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
# Under test and postcondition.
# Should return False before creation.
@@ -305,7 +306,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=True
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Under test.
@@ -340,7 +341,7 @@ class TestOpenSearchClient:
},
},
}
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=initial_mappings, settings=settings)
# Under test.
@@ -383,7 +384,7 @@ class TestOpenSearchClient:
"test_field": {"type": "keyword"},
},
}
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=initial_mappings, settings=settings)
# Under test and postcondition.
@@ -418,7 +419,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=True
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
# Create once - should succeed.
test_client.create_index(mappings=mappings, settings=settings)
@@ -461,7 +462,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
doc = _create_test_document_chunk(
@@ -489,7 +490,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
docs = [
@@ -520,7 +521,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
doc = _create_test_document_chunk(
@@ -548,7 +549,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
original_doc = _create_test_document_chunk(
@@ -583,7 +584,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=False
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Under test and postcondition.
@@ -602,7 +603,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
doc = _create_test_document_chunk(
@@ -638,7 +639,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Under test.
@@ -659,7 +660,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Index multiple documents.
@@ -735,7 +736,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Create a document to update.
@@ -784,7 +785,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Under test and postcondition.
@@ -808,7 +809,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Index documents.
docs = {
@@ -881,8 +882,12 @@ class TestOpenSearchClient:
)
# Make sure the chunk contents are preserved.
for i, chunk in enumerate(results):
assert (
chunk.document_chunk == docs[chunk.document_chunk.document_id]
expected = docs[chunk.document_chunk.document_id]
assert chunk.document_chunk == DocumentChunkWithoutVectors(
**{
k: getattr(expected, k)
for k in DocumentChunkWithoutVectors.model_fields
}
)
# Make sure score reporting seems reasonable (it should not be None
# or 0).
@@ -906,7 +911,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Note no documents were indexed.
@@ -947,7 +952,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_x.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Index documents with different public/hidden and tenant states.
@@ -1038,7 +1043,12 @@ class TestOpenSearchClient:
# ordered; we're just assuming which doc will be the first result here.
assert results[0].document_chunk.document_id == "public-doc"
# Make sure the chunk contents are preserved.
assert results[0].document_chunk == docs["public-doc"]
assert results[0].document_chunk == DocumentChunkWithoutVectors(
**{
k: getattr(docs["public-doc"], k)
for k in DocumentChunkWithoutVectors.model_fields
}
)
# Make sure score reporting seems reasonable (it should not be None
# or 0).
assert results[0].score
@@ -1046,7 +1056,12 @@ class TestOpenSearchClient:
assert results[0].match_highlights.get(CONTENT_FIELD_NAME, [])
# Same for the second result.
assert results[1].document_chunk.document_id == "private-doc-user-a"
assert results[1].document_chunk == docs["private-doc-user-a"]
assert results[1].document_chunk == DocumentChunkWithoutVectors(
**{
k: getattr(docs["private-doc-user-a"], k)
for k in DocumentChunkWithoutVectors.model_fields
}
)
assert results[1].score
assert results[1].match_highlights.get(CONTENT_FIELD_NAME, [])
@@ -1066,7 +1081,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_x.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Index documents with varying relevance to the query.
@@ -1193,7 +1208,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_x.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Although very unlikely in practice, let's use the same doc ID just to
@@ -1286,7 +1301,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Don't index any documents.
@@ -1313,7 +1328,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Index chunks for two different documents.
@@ -1381,7 +1396,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Index documents with different public/hidden and tenant states.
@@ -1458,7 +1473,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Index docs with various ages.
@@ -1550,7 +1565,7 @@ class TestOpenSearchClient:
mappings = DocumentSchema.get_document_schema(
vector_dimension=128, multitenant=tenant_state.multitenant
)
settings = DocumentSchema.get_index_settings()
settings = DocumentSchema.get_index_settings_based_on_environment()
test_client.create_index(mappings=mappings, settings=settings)
# Index chunks for two different documents, one hidden one not.
@@ -1599,4 +1614,9 @@ class TestOpenSearchClient:
for result in results:
# Note each result must be from doc 1, which is not hidden.
expected_result = doc1_chunks[result.document_chunk.chunk_index]
assert result.document_chunk == expected_result
assert result.document_chunk == DocumentChunkWithoutVectors(
**{
k: getattr(expected_result, k)
for k in DocumentChunkWithoutVectors.model_fields
}
)

View File

@@ -31,7 +31,6 @@ from onyx.background.celery.tasks.opensearch_migration.transformer import (
)
from onyx.configs.constants import PUBLIC_DOC_PAT
from onyx.configs.constants import SOURCE_TYPE
from onyx.context.search.models import IndexFilters
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.models import Document
from onyx.db.models import OpenSearchDocumentMigrationRecord
@@ -44,6 +43,7 @@ from onyx.document_index.opensearch.client import OpenSearchIndexClient
from onyx.document_index.opensearch.client import wait_for_opensearch_with_timeout
from onyx.document_index.opensearch.constants import DEFAULT_MAX_CHUNK_SIZE
from onyx.document_index.opensearch.schema import DocumentChunk
from onyx.document_index.opensearch.schema import get_opensearch_doc_chunk_id
from onyx.document_index.opensearch.search import DocumentQuery
from onyx.document_index.vespa.shared_utils.utils import wait_for_vespa_with_timeout
from onyx.document_index.vespa.vespa_document_index import VespaDocumentIndex
@@ -70,6 +70,7 @@ from onyx.document_index.vespa_constants import SOURCE_LINKS
from onyx.document_index.vespa_constants import TITLE
from onyx.document_index.vespa_constants import TITLE_EMBEDDING
from onyx.document_index.vespa_constants import USER_PROJECT
from shared_configs.configs import MULTI_TENANT
from shared_configs.contextvars import get_current_tenant_id
from tests.external_dependency_unit.full_setup import ensure_full_deployment_setup
@@ -78,24 +79,22 @@ CHUNK_COUNT = 5
def _get_document_chunks_from_opensearch(
opensearch_client: OpenSearchIndexClient, document_id: str, current_tenant_id: str
opensearch_client: OpenSearchIndexClient,
document_id: str,
tenant_state: TenantState,
) -> list[DocumentChunk]:
opensearch_client.refresh_index()
filters = IndexFilters(access_control_list=None, tenant_id=current_tenant_id)
query_body = DocumentQuery.get_from_document_id_query(
document_id=document_id,
tenant_state=TenantState(tenant_id=current_tenant_id, multitenant=False),
index_filters=filters,
include_hidden=False,
max_chunk_size=DEFAULT_MAX_CHUNK_SIZE,
min_chunk_index=None,
max_chunk_index=None,
)
search_hits = opensearch_client.search(
body=query_body,
search_pipeline_id=None,
)
return [search_hit.document_chunk for search_hit in search_hits]
results: list[DocumentChunk] = []
for i in range(CHUNK_COUNT):
document_chunk_id: str = get_opensearch_doc_chunk_id(
tenant_state=tenant_state,
document_id=document_id,
chunk_index=i,
max_chunk_size=DEFAULT_MAX_CHUNK_SIZE,
)
result = opensearch_client.get_document(document_chunk_id)
results.append(result)
return results
def _delete_document_chunks_from_opensearch(
@@ -452,10 +451,13 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
for chunks in document_chunks.values():
all_chunks.extend(chunks)
vespa_document_index.index_raw_chunks(all_chunks)
tenant_state = TenantState(
tenant_id=get_current_tenant_id(), multitenant=MULTI_TENANT
)
# Under test.
result = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=get_current_tenant_id()
tenant_id=tenant_state.tenant_id
)
# Postcondition.
@@ -477,7 +479,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Verify chunks were indexed in OpenSearch.
for document in test_documents:
opensearch_chunks = _get_document_chunks_from_opensearch(
opensearch_client, document.id, get_current_tenant_id()
opensearch_client, document.id, tenant_state
)
assert len(opensearch_chunks) == CHUNK_COUNT
opensearch_chunks.sort(key=lambda x: x.chunk_index)
@@ -522,6 +524,9 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
for chunks in document_chunks.values():
all_chunks.extend(chunks)
vespa_document_index.index_raw_chunks(all_chunks)
tenant_state = TenantState(
tenant_id=get_current_tenant_id(), multitenant=MULTI_TENANT
)
# Run the initial batch. To simulate partial progress we will mock the
# redis lock to return True for the first invocation of .owned() and
@@ -536,7 +541,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
return_value=mock_redis_client,
):
result_1 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=get_current_tenant_id()
tenant_id=tenant_state.tenant_id
)
assert result_1 is True
@@ -559,7 +564,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Under test.
# Run the remainder of the migration.
result_2 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=get_current_tenant_id()
tenant_id=tenant_state.tenant_id
)
# Postcondition.
@@ -583,7 +588,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Verify chunks were indexed in OpenSearch.
for document in test_documents:
opensearch_chunks = _get_document_chunks_from_opensearch(
opensearch_client, document.id, get_current_tenant_id()
opensearch_client, document.id, tenant_state
)
assert len(opensearch_chunks) == CHUNK_COUNT
opensearch_chunks.sort(key=lambda x: x.chunk_index)
@@ -630,6 +635,9 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
for chunks in document_chunks.values():
all_chunks.extend(chunks)
vespa_document_index.index_raw_chunks(all_chunks)
tenant_state = TenantState(
tenant_id=get_current_tenant_id(), multitenant=MULTI_TENANT
)
# Run the initial batch. To simulate partial progress we will mock the
# redis lock to return True for the first invocation of .owned() and
@@ -646,7 +654,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
return_value=mock_redis_client,
):
result_1 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=get_current_tenant_id()
tenant_id=tenant_state.tenant_id
)
assert result_1 is True
@@ -691,7 +699,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
),
):
result_2 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=get_current_tenant_id()
tenant_id=tenant_state.tenant_id
)
# Postcondition.
@@ -728,7 +736,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
),
):
result_3 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=get_current_tenant_id()
tenant_id=tenant_state.tenant_id
)
# Postcondition.
@@ -752,7 +760,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Verify chunks were indexed in OpenSearch.
for document in test_documents:
opensearch_chunks = _get_document_chunks_from_opensearch(
opensearch_client, document.id, get_current_tenant_id()
opensearch_client, document.id, tenant_state
)
assert len(opensearch_chunks) == CHUNK_COUNT
opensearch_chunks.sort(key=lambda x: x.chunk_index)
@@ -840,24 +848,25 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
chunk["content"] = (
f"Different content {chunk[CHUNK_ID]} for {test_documents[0].id}"
)
tenant_state = TenantState(
tenant_id=get_current_tenant_id(), multitenant=MULTI_TENANT
)
chunks_for_document_in_opensearch, _ = (
transform_vespa_chunks_to_opensearch_chunks(
document_in_opensearch,
TenantState(tenant_id=get_current_tenant_id(), multitenant=False),
tenant_state,
{},
)
)
opensearch_client.bulk_index_documents(
documents=chunks_for_document_in_opensearch,
tenant_state=TenantState(
tenant_id=get_current_tenant_id(), multitenant=False
),
tenant_state=tenant_state,
update_if_exists=True,
)
# Under test.
result = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=get_current_tenant_id()
tenant_id=tenant_state.tenant_id
)
# Postcondition.
@@ -878,7 +887,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Verify chunks were indexed in OpenSearch.
for document in test_documents:
opensearch_chunks = _get_document_chunks_from_opensearch(
opensearch_client, document.id, get_current_tenant_id()
opensearch_client, document.id, tenant_state
)
assert len(opensearch_chunks) == CHUNK_COUNT
opensearch_chunks.sort(key=lambda x: x.chunk_index)
@@ -922,11 +931,14 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
for chunks in document_chunks.values():
all_chunks.extend(chunks)
vespa_document_index.index_raw_chunks(all_chunks)
tenant_state = TenantState(
tenant_id=get_current_tenant_id(), multitenant=MULTI_TENANT
)
# Under test.
# First run.
result_1 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=get_current_tenant_id()
tenant_id=tenant_state.tenant_id
)
# Postcondition.
@@ -947,7 +959,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Verify chunks were indexed in OpenSearch.
for document in test_documents:
opensearch_chunks = _get_document_chunks_from_opensearch(
opensearch_client, document.id, get_current_tenant_id()
opensearch_client, document.id, tenant_state
)
assert len(opensearch_chunks) == CHUNK_COUNT
opensearch_chunks.sort(key=lambda x: x.chunk_index)
@@ -960,7 +972,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Under test.
# Second run.
result_2 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=get_current_tenant_id()
tenant_id=tenant_state.tenant_id
)
# Postcondition.
@@ -982,7 +994,7 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
# Verify chunks were indexed in OpenSearch.
for document in test_documents:
opensearch_chunks = _get_document_chunks_from_opensearch(
opensearch_client, document.id, get_current_tenant_id()
opensearch_client, document.id, tenant_state
)
assert len(opensearch_chunks) == CHUNK_COUNT
opensearch_chunks.sort(key=lambda x: x.chunk_index)

View File

@@ -1,249 +0,0 @@
"""Unit tests for OpenSearchDocumentIndex.index().
These tests mock the OpenSearch client and verify the buffered
flush-by-document logic, DocumentInsertionRecord construction, and
delete-before-insert semantics.
"""
from unittest.mock import MagicMock
from unittest.mock import patch
from onyx.access.models import DocumentAccess
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentSource
from onyx.connectors.models import TextSection
from onyx.document_index.interfaces_new import IndexingMetadata
from onyx.document_index.interfaces_new import TenantState
from onyx.document_index.opensearch.opensearch_document_index import (
OpenSearchDocumentIndex,
)
from onyx.indexing.models import ChunkEmbedding
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.indexing.models import IndexChunk
def _make_chunk(
doc_id: str,
chunk_id: int = 0,
content: str = "test content",
chunk_count: int | None = None,
) -> DocMetadataAwareIndexChunk:
doc = Document(
id=doc_id,
semantic_identifier="test_doc",
sections=[TextSection(text=content, link=None)],
source=DocumentSource.NOT_APPLICABLE,
metadata={},
chunk_count=chunk_count,
)
index_chunk = IndexChunk(
chunk_id=chunk_id,
blurb=content[:50],
content=content,
source_links=None,
image_file_id=None,
section_continuation=False,
source_document=doc,
title_prefix="",
metadata_suffix_semantic="",
metadata_suffix_keyword="",
contextual_rag_reserved_tokens=0,
doc_summary="",
chunk_context="",
mini_chunk_texts=None,
large_chunk_id=None,
embeddings=ChunkEmbedding(
full_embedding=[0.1] * 10,
mini_chunk_embeddings=[],
),
title_embedding=[0.1] * 10,
)
access = DocumentAccess.build(
user_emails=[],
user_groups=[],
external_user_emails=[],
external_user_group_ids=[],
is_public=True,
)
return DocMetadataAwareIndexChunk.from_index_chunk(
index_chunk=index_chunk,
access=access,
document_sets=set(),
user_project=[],
personas=[],
boost=0,
aggregated_chunk_boost_factor=1.0,
tenant_id="test_tenant",
)
def _make_indexing_metadata(
doc_ids: list[str],
old_counts: list[int],
new_counts: list[int],
) -> IndexingMetadata:
return IndexingMetadata(
doc_id_to_chunk_cnt_diff={
doc_id: IndexingMetadata.ChunkCounts(
old_chunk_cnt=old,
new_chunk_cnt=new,
)
for doc_id, old, new in zip(doc_ids, old_counts, new_counts)
}
)
def _make_os_index(mock_client: MagicMock) -> OpenSearchDocumentIndex:
"""Create an OpenSearchDocumentIndex with a mocked client."""
with patch.object(
OpenSearchDocumentIndex,
"__init__",
lambda _self, *_a, **_kw: None,
):
idx = OpenSearchDocumentIndex.__new__(OpenSearchDocumentIndex)
idx._index_name = "test_index"
idx._tenant_state = TenantState(tenant_id="test_tenant", multitenant=False)
idx._client = mock_client
return idx
def test_index_single_new_doc() -> None:
"""Indexing a single new document returns one record with already_existed=False."""
mock_client = MagicMock()
mock_client.bulk_index_documents.return_value = None
idx = _make_os_index(mock_client)
# Patch delete to return 0 (no existing chunks)
with patch.object(idx, "delete", return_value=0) as mock_delete:
chunk = _make_chunk("doc1")
metadata = _make_indexing_metadata(["doc1"], old_counts=[0], new_counts=[1])
results = idx.index(chunks=[chunk], indexing_metadata=metadata)
assert len(results) == 1
assert results[0].document_id == "doc1"
assert results[0].already_existed is False
mock_delete.assert_called_once()
mock_client.bulk_index_documents.assert_called_once()
def test_index_existing_doc_already_existed_true() -> None:
"""Re-indexing a doc with previous chunks returns already_existed=True."""
mock_client = MagicMock()
mock_client.bulk_index_documents.return_value = None
idx = _make_os_index(mock_client)
with patch.object(idx, "delete", return_value=5):
chunk = _make_chunk("doc1")
metadata = _make_indexing_metadata(["doc1"], old_counts=[5], new_counts=[1])
results = idx.index(chunks=[chunk], indexing_metadata=metadata)
assert len(results) == 1
assert results[0].already_existed is True
def test_index_multiple_docs_flushed_separately() -> None:
"""Chunks from different documents are flushed in separate bulk calls."""
mock_client = MagicMock()
mock_client.bulk_index_documents.return_value = None
idx = _make_os_index(mock_client)
with patch.object(idx, "delete", return_value=0):
chunks = [
_make_chunk("doc1", chunk_id=0),
_make_chunk("doc1", chunk_id=1),
_make_chunk("doc2", chunk_id=0),
]
metadata = _make_indexing_metadata(
["doc1", "doc2"], old_counts=[0, 0], new_counts=[2, 1]
)
results = idx.index(chunks=chunks, indexing_metadata=metadata)
result_map = {r.document_id: r.already_existed for r in results}
assert len(result_map) == 2
assert result_map["doc1"] is False
assert result_map["doc2"] is False
# Two separate flushes: one for doc1 (2 chunks), one for doc2 (1 chunk)
assert mock_client.bulk_index_documents.call_count == 2
def test_index_deletes_before_inserting() -> None:
"""For each document, delete is called before bulk_index_documents."""
mock_client = MagicMock()
mock_client.bulk_index_documents.return_value = None
call_order: list[str] = []
idx = _make_os_index(mock_client)
def track_delete(*_args: object, **_kwargs: object) -> int:
call_order.append("delete")
return 3
def track_bulk(*_args: object, **_kwargs: object) -> None:
call_order.append("bulk_index")
mock_client.bulk_index_documents.side_effect = track_bulk
with patch.object(idx, "delete", side_effect=track_delete):
chunk = _make_chunk("doc1")
metadata = _make_indexing_metadata(["doc1"], old_counts=[3], new_counts=[1])
idx.index(chunks=[chunk], indexing_metadata=metadata)
assert call_order == ["delete", "bulk_index"]
def test_index_delete_called_once_per_doc() -> None:
"""Delete is called only once per document, even with multiple chunks."""
mock_client = MagicMock()
mock_client.bulk_index_documents.return_value = None
idx = _make_os_index(mock_client)
with patch.object(idx, "delete", return_value=0) as mock_delete:
# 3 chunks, all same doc — should only delete once
chunks = [_make_chunk("doc1", chunk_id=i) for i in range(3)]
metadata = _make_indexing_metadata(["doc1"], old_counts=[0], new_counts=[3])
idx.index(chunks=chunks, indexing_metadata=metadata)
mock_delete.assert_called_once()
def test_index_flushes_on_doc_boundary() -> None:
"""When doc ID changes in the stream, the previous doc's chunks are flushed."""
mock_client = MagicMock()
mock_client.bulk_index_documents.return_value = None
idx = _make_os_index(mock_client)
bulk_call_chunk_counts: list[int] = []
def track_bulk(documents: list[object], **_kwargs: object) -> None:
bulk_call_chunk_counts.append(len(documents))
mock_client.bulk_index_documents.side_effect = track_bulk
with patch.object(idx, "delete", return_value=0):
chunks = [
_make_chunk("doc1", chunk_id=0),
_make_chunk("doc1", chunk_id=1),
_make_chunk("doc1", chunk_id=2),
_make_chunk("doc2", chunk_id=0),
_make_chunk("doc2", chunk_id=1),
]
metadata = _make_indexing_metadata(
["doc1", "doc2"], old_counts=[0, 0], new_counts=[3, 2]
)
idx.index(chunks=chunks, indexing_metadata=metadata)
# First flush: 3 chunks for doc1, second flush: 2 chunks for doc2
assert bulk_call_chunk_counts == [3, 2]

View File

@@ -1,373 +0,0 @@
"""Unit tests for VespaDocumentIndex.index().
These tests mock all external I/O (HTTP calls, thread pools) and verify
the streaming logic, ID cleaning/mapping, and DocumentInsertionRecord
construction.
"""
from unittest.mock import MagicMock
from unittest.mock import patch
from uuid import uuid4
from onyx.access.models import DocumentAccess
from onyx.connectors.models import Document
from onyx.connectors.models import DocumentSource
from onyx.connectors.models import TextSection
from onyx.document_index.interfaces import EnrichedDocumentIndexingInfo
from onyx.document_index.interfaces_new import IndexingMetadata
from onyx.document_index.interfaces_new import TenantState
from onyx.document_index.vespa.vespa_document_index import VespaDocumentIndex
from onyx.indexing.models import ChunkEmbedding
from onyx.indexing.models import DocMetadataAwareIndexChunk
from onyx.indexing.models import IndexChunk
def _make_chunk(
doc_id: str,
chunk_id: int = 0,
content: str = "test content",
) -> DocMetadataAwareIndexChunk:
doc = Document(
id=doc_id,
semantic_identifier="test_doc",
sections=[TextSection(text=content, link=None)],
source=DocumentSource.NOT_APPLICABLE,
metadata={},
)
index_chunk = IndexChunk(
chunk_id=chunk_id,
blurb=content[:50],
content=content,
source_links=None,
image_file_id=None,
section_continuation=False,
source_document=doc,
title_prefix="",
metadata_suffix_semantic="",
metadata_suffix_keyword="",
contextual_rag_reserved_tokens=0,
doc_summary="",
chunk_context="",
mini_chunk_texts=None,
large_chunk_id=None,
embeddings=ChunkEmbedding(
full_embedding=[0.1] * 10,
mini_chunk_embeddings=[],
),
title_embedding=None,
)
access = DocumentAccess.build(
user_emails=[],
user_groups=[],
external_user_emails=[],
external_user_group_ids=[],
is_public=True,
)
return DocMetadataAwareIndexChunk.from_index_chunk(
index_chunk=index_chunk,
access=access,
document_sets=set(),
user_project=[],
personas=[],
boost=0,
aggregated_chunk_boost_factor=1.0,
tenant_id="test_tenant",
)
def _make_indexing_metadata(
doc_ids: list[str],
old_counts: list[int],
new_counts: list[int],
) -> IndexingMetadata:
return IndexingMetadata(
doc_id_to_chunk_cnt_diff={
doc_id: IndexingMetadata.ChunkCounts(
old_chunk_cnt=old,
new_chunk_cnt=new,
)
for doc_id, old, new in zip(doc_ids, old_counts, new_counts)
}
)
def _stub_enrich(
doc_id: str,
old_chunk_cnt: int,
) -> EnrichedDocumentIndexingInfo:
"""Build an EnrichedDocumentIndexingInfo that says 'no chunks to delete'
when old_chunk_cnt == 0, or 'has existing chunks' otherwise."""
return EnrichedDocumentIndexingInfo(
doc_id=doc_id,
chunk_start_index=0,
old_version=False,
chunk_end_index=old_chunk_cnt,
)
@patch("onyx.document_index.vespa.vespa_document_index.batch_index_vespa_chunks")
@patch("onyx.document_index.vespa.vespa_document_index.delete_vespa_chunks")
@patch(
"onyx.document_index.vespa.vespa_document_index.get_document_chunk_ids",
return_value=[],
)
@patch("onyx.document_index.vespa.vespa_document_index._enrich_basic_chunk_info")
def test_index_single_new_doc(
mock_enrich: MagicMock,
mock_get_chunk_ids: MagicMock, # noqa: ARG001
mock_delete: MagicMock, # noqa: ARG001
mock_batch_index: MagicMock,
) -> None:
"""Indexing a single new document returns one record with already_existed=False."""
mock_enrich.return_value = _stub_enrich("doc1", old_chunk_cnt=0)
index = VespaDocumentIndex(
index_name="test_index",
tenant_state=TenantState(tenant_id="test_tenant", multitenant=False),
large_chunks_enabled=False,
httpx_client=MagicMock(),
)
chunk = _make_chunk("doc1")
metadata = _make_indexing_metadata(["doc1"], old_counts=[0], new_counts=[1])
results = index.index(chunks=[chunk], indexing_metadata=metadata)
assert len(results) == 1
assert results[0].document_id == "doc1"
assert results[0].already_existed is False
# batch_index_vespa_chunks should be called once with a single cleaned chunk
mock_batch_index.assert_called_once()
call_kwargs = mock_batch_index.call_args
indexed_chunks = call_kwargs.kwargs["chunks"]
assert len(indexed_chunks) == 1
assert indexed_chunks[0].source_document.id == "doc1"
assert call_kwargs.kwargs["index_name"] == "test_index"
assert call_kwargs.kwargs["multitenant"] is False
@patch("onyx.document_index.vespa.vespa_document_index.batch_index_vespa_chunks")
@patch("onyx.document_index.vespa.vespa_document_index.delete_vespa_chunks")
@patch(
"onyx.document_index.vespa.vespa_document_index.get_document_chunk_ids",
return_value=[],
)
@patch("onyx.document_index.vespa.vespa_document_index._enrich_basic_chunk_info")
def test_index_existing_doc_already_existed_true(
mock_enrich: MagicMock,
mock_get_chunk_ids: MagicMock,
mock_delete: MagicMock,
mock_batch_index: MagicMock,
) -> None:
"""Re-indexing a doc with previous chunks deletes old chunks, indexes
new ones, and returns already_existed=True."""
fake_chunk_ids = [uuid4(), uuid4()]
mock_enrich.return_value = _stub_enrich("doc1", old_chunk_cnt=5)
mock_get_chunk_ids.return_value = fake_chunk_ids
index = VespaDocumentIndex(
index_name="test_index",
tenant_state=TenantState(tenant_id="test_tenant", multitenant=False),
large_chunks_enabled=False,
httpx_client=MagicMock(),
)
chunk = _make_chunk("doc1")
metadata = _make_indexing_metadata(["doc1"], old_counts=[5], new_counts=[1])
results = index.index(chunks=[chunk], indexing_metadata=metadata)
assert len(results) == 1
assert results[0].already_existed is True
# Old chunks should be deleted
mock_delete.assert_called_once()
delete_kwargs = mock_delete.call_args.kwargs
assert delete_kwargs["doc_chunk_ids"] == fake_chunk_ids
assert delete_kwargs["index_name"] == "test_index"
# New chunk should be indexed
mock_batch_index.assert_called_once()
indexed_chunks = mock_batch_index.call_args.kwargs["chunks"]
assert len(indexed_chunks) == 1
assert indexed_chunks[0].source_document.id == "doc1"
@patch("onyx.document_index.vespa.vespa_document_index.batch_index_vespa_chunks")
@patch("onyx.document_index.vespa.vespa_document_index.delete_vespa_chunks")
@patch(
"onyx.document_index.vespa.vespa_document_index.get_document_chunk_ids",
return_value=[],
)
@patch("onyx.document_index.vespa.vespa_document_index._enrich_basic_chunk_info")
def test_index_multiple_docs(
mock_enrich: MagicMock,
mock_get_chunk_ids: MagicMock, # noqa: ARG001
mock_delete: MagicMock, # noqa: ARG001
mock_batch_index: MagicMock,
) -> None:
"""Indexing multiple documents returns one record per unique document."""
mock_enrich.side_effect = [
_stub_enrich("doc1", old_chunk_cnt=0),
_stub_enrich("doc2", old_chunk_cnt=3),
]
index = VespaDocumentIndex(
index_name="test_index",
tenant_state=TenantState(tenant_id="test_tenant", multitenant=False),
large_chunks_enabled=False,
httpx_client=MagicMock(),
)
chunks = [
_make_chunk("doc1", chunk_id=0),
_make_chunk("doc1", chunk_id=1),
_make_chunk("doc2", chunk_id=0),
]
metadata = _make_indexing_metadata(
["doc1", "doc2"], old_counts=[0, 3], new_counts=[2, 1]
)
results = index.index(chunks=chunks, indexing_metadata=metadata)
result_map = {r.document_id: r.already_existed for r in results}
assert len(result_map) == 2
assert result_map["doc1"] is False
assert result_map["doc2"] is True
# All 3 chunks fit in one batch (BATCH_SIZE=128), so one call
mock_batch_index.assert_called_once()
indexed_chunks = mock_batch_index.call_args.kwargs["chunks"]
assert len(indexed_chunks) == 3
indexed_doc_ids = [c.source_document.id for c in indexed_chunks]
assert indexed_doc_ids == ["doc1", "doc1", "doc2"]
@patch("onyx.document_index.vespa.vespa_document_index.batch_index_vespa_chunks")
@patch("onyx.document_index.vespa.vespa_document_index.delete_vespa_chunks")
@patch(
"onyx.document_index.vespa.vespa_document_index.get_document_chunk_ids",
return_value=[],
)
@patch("onyx.document_index.vespa.vespa_document_index._enrich_basic_chunk_info")
def test_index_cleans_doc_ids(
mock_enrich: MagicMock,
mock_get_chunk_ids: MagicMock, # noqa: ARG001
mock_delete: MagicMock, # noqa: ARG001
mock_batch_index: MagicMock,
) -> None:
"""Documents with invalid Vespa characters get cleaned IDs, but
the returned DocumentInsertionRecord uses the original ID."""
doc_id_with_quote = "doc'1"
mock_enrich.return_value = _stub_enrich(doc_id_with_quote, old_chunk_cnt=0)
index = VespaDocumentIndex(
index_name="test_index",
tenant_state=TenantState(tenant_id="test_tenant", multitenant=False),
large_chunks_enabled=False,
httpx_client=MagicMock(),
)
chunk = _make_chunk(doc_id_with_quote)
metadata = _make_indexing_metadata(
[doc_id_with_quote], old_counts=[0], new_counts=[1]
)
results = index.index(chunks=[chunk], indexing_metadata=metadata)
assert len(results) == 1
# The returned ID should be the original (unclean) ID
assert results[0].document_id == doc_id_with_quote
# The chunk passed to batch_index_vespa_chunks should have the cleaned ID
indexed_chunks = mock_batch_index.call_args.kwargs["chunks"]
assert len(indexed_chunks) == 1
assert indexed_chunks[0].source_document.id == "doc_1" # quote replaced with _
@patch("onyx.document_index.vespa.vespa_document_index.batch_index_vespa_chunks")
@patch("onyx.document_index.vespa.vespa_document_index.delete_vespa_chunks")
@patch(
"onyx.document_index.vespa.vespa_document_index.get_document_chunk_ids",
return_value=[],
)
@patch("onyx.document_index.vespa.vespa_document_index._enrich_basic_chunk_info")
def test_index_deduplicates_doc_ids_in_results(
mock_enrich: MagicMock,
mock_get_chunk_ids: MagicMock, # noqa: ARG001
mock_delete: MagicMock, # noqa: ARG001
mock_batch_index: MagicMock,
) -> None:
"""Multiple chunks from the same document produce only one
DocumentInsertionRecord."""
mock_enrich.return_value = _stub_enrich("doc1", old_chunk_cnt=0)
index = VespaDocumentIndex(
index_name="test_index",
tenant_state=TenantState(tenant_id="test_tenant", multitenant=False),
large_chunks_enabled=False,
httpx_client=MagicMock(),
)
chunks = [_make_chunk("doc1", chunk_id=i) for i in range(5)]
metadata = _make_indexing_metadata(["doc1"], old_counts=[0], new_counts=[5])
results = index.index(chunks=chunks, indexing_metadata=metadata)
assert len(results) == 1
assert results[0].document_id == "doc1"
# All 5 chunks should be passed to batch_index_vespa_chunks
mock_batch_index.assert_called_once()
indexed_chunks = mock_batch_index.call_args.kwargs["chunks"]
assert len(indexed_chunks) == 5
assert all(c.source_document.id == "doc1" for c in indexed_chunks)
@patch("onyx.document_index.vespa.vespa_document_index.batch_index_vespa_chunks")
@patch("onyx.document_index.vespa.vespa_document_index.delete_vespa_chunks")
@patch(
"onyx.document_index.vespa.vespa_document_index.get_document_chunk_ids",
return_value=[],
)
@patch("onyx.document_index.vespa.vespa_document_index._enrich_basic_chunk_info")
@patch(
"onyx.document_index.vespa.vespa_document_index.BATCH_SIZE",
3,
)
def test_index_respects_batch_size(
mock_enrich: MagicMock,
mock_get_chunk_ids: MagicMock, # noqa: ARG001
mock_delete: MagicMock, # noqa: ARG001
mock_batch_index: MagicMock,
) -> None:
"""When chunks exceed BATCH_SIZE, batch_index_vespa_chunks is called
multiple times with correctly sized batches."""
mock_enrich.return_value = _stub_enrich("doc1", old_chunk_cnt=0)
index = VespaDocumentIndex(
index_name="test_index",
tenant_state=TenantState(tenant_id="test_tenant", multitenant=False),
large_chunks_enabled=False,
httpx_client=MagicMock(),
)
chunks = [_make_chunk("doc1", chunk_id=i) for i in range(7)]
metadata = _make_indexing_metadata(["doc1"], old_counts=[0], new_counts=[7])
results = index.index(chunks=chunks, indexing_metadata=metadata)
assert len(results) == 1
# With BATCH_SIZE=3 and 7 chunks: batches of 3, 3, 1
assert mock_batch_index.call_count == 3
batch_sizes = [len(c.kwargs["chunks"]) for c in mock_batch_index.call_args_list]
assert batch_sizes == [3, 3, 1]
# Verify all chunks are accounted for and in order
all_indexed = [
chunk for c in mock_batch_index.call_args_list for chunk in c.kwargs["chunks"]
]
assert len(all_indexed) == 7
assert [c.chunk_id for c in all_indexed] == list(range(7))

View File

@@ -0,0 +1,45 @@
%PDF-1.3
%<25><><EFBFBD><EFBFBD>
1 0 obj
<<
/Producer (pypdf)
>>
endobj
2 0 obj
<<
/Type /Pages
/Count 1
/Kids [ 4 0 R ]
>>
endobj
3 0 obj
<<
/Type /Catalog
/Pages 2 0 R
>>
endobj
4 0 obj
<<
/Type /Page
/Resources <<
>>
/MediaBox [ 0.0 0.0 200 200 ]
/Parent 2 0 R
>>
endobj
xref
0 5
0000000000 65535 f
0000000015 00000 n
0000000054 00000 n
0000000113 00000 n
0000000162 00000 n
trailer
<<
/Size 5
/Root 3 0 R
/Info 1 0 R
>>
startxref
256
%%EOF

View File

@@ -0,0 +1,89 @@
%PDF-1.3
%<25><><EFBFBD><EFBFBD>
1 0 obj
<<
/Producer (pypdf)
>>
endobj
2 0 obj
<<
/Type /Pages
/Count 2
/Kids [ 4 0 R 6 0 R ]
>>
endobj
3 0 obj
<<
/Type /Catalog
/Pages 2 0 R
>>
endobj
4 0 obj
<<
/Type /Page
/Resources <<
/Font <<
/F1 <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
>>
>>
/MediaBox [ 0.0 0.0 200 200 ]
/Contents 5 0 R
/Parent 2 0 R
>>
endobj
5 0 obj
<<
/Length 47
>>
stream
BT /F1 12 Tf 50 150 Td (Page one content) Tj ET
endstream
endobj
6 0 obj
<<
/Type /Page
/Resources <<
/Font <<
/F1 <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
>>
>>
/MediaBox [ 0.0 0.0 200 200 ]
/Contents 7 0 R
/Parent 2 0 R
>>
endobj
7 0 obj
<<
/Length 47
>>
stream
BT /F1 12 Tf 50 150 Td (Page two content) Tj ET
endstream
endobj
xref
0 8
0000000000 65535 f
0000000015 00000 n
0000000054 00000 n
0000000119 00000 n
0000000168 00000 n
0000000349 00000 n
0000000446 00000 n
0000000627 00000 n
trailer
<<
/Size 8
/Root 3 0 R
/Info 1 0 R
>>
startxref
724
%%EOF

View File

@@ -0,0 +1,62 @@
%PDF-1.3
%<25><><EFBFBD><EFBFBD>
1 0 obj
<<
/Producer (pypdf)
>>
endobj
2 0 obj
<<
/Type /Pages
/Count 1
/Kids [ 4 0 R ]
>>
endobj
3 0 obj
<<
/Type /Catalog
/Pages 2 0 R
>>
endobj
4 0 obj
<<
/Type /Page
/Resources <<
/Font <<
/F1 <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
>>
>>
/MediaBox [ 0.0 0.0 200 200 ]
/Contents 5 0 R
/Parent 2 0 R
>>
endobj
5 0 obj
<<
/Length 42
>>
stream
BT /F1 12 Tf 50 150 Td (Hello World) Tj ET
endstream
endobj
xref
0 6
0000000000 65535 f
0000000015 00000 n
0000000054 00000 n
0000000113 00000 n
0000000162 00000 n
0000000343 00000 n
trailer
<<
/Size 6
/Root 3 0 R
/Info 1 0 R
>>
startxref
435
%%EOF

View File

@@ -0,0 +1,64 @@
%PDF-1.3
%<25><><EFBFBD><EFBFBD>
1 0 obj
<<
/Producer (pypdf)
/Title (My Title)
/Author (Jane Doe)
>>
endobj
2 0 obj
<<
/Type /Pages
/Count 1
/Kids [ 4 0 R ]
>>
endobj
3 0 obj
<<
/Type /Catalog
/Pages 2 0 R
>>
endobj
4 0 obj
<<
/Type /Page
/Resources <<
/Font <<
/F1 <<
/Type /Font
/Subtype /Type1
/BaseFont /Helvetica
>>
>>
>>
/MediaBox [ 0.0 0.0 200 200 ]
/Contents 5 0 R
/Parent 2 0 R
>>
endobj
5 0 obj
<<
/Length 35
>>
stream
BT /F1 12 Tf 50 150 Td (test) Tj ET
endstream
endobj
xref
0 6
0000000000 65535 f
0000000015 00000 n
0000000091 00000 n
0000000150 00000 n
0000000199 00000 n
0000000380 00000 n
trailer
<<
/Size 6
/Root 3 0 R
/Info 1 0 R
>>
startxref
465
%%EOF

View File

@@ -0,0 +1,124 @@
"""Unit tests for pypdf-dependent PDF processing functions.
Tests cover:
- read_pdf_file: text extraction, metadata, encrypted PDFs, image extraction
- pdf_to_text: convenience wrapper
- is_pdf_protected: password protection detection
Fixture PDFs live in ./fixtures/ and are pre-built so the test layer has no
dependency on pypdf internals (pypdf.generic).
"""
from io import BytesIO
from pathlib import Path
from onyx.file_processing.extract_file_text import pdf_to_text
from onyx.file_processing.extract_file_text import read_pdf_file
from onyx.file_processing.password_validation import is_pdf_protected
FIXTURES = Path(__file__).parent / "fixtures"
def _load(name: str) -> BytesIO:
return BytesIO((FIXTURES / name).read_bytes())
# ── read_pdf_file ────────────────────────────────────────────────────────
class TestReadPdfFile:
def test_basic_text_extraction(self) -> None:
text, _, images = read_pdf_file(_load("simple.pdf"))
assert "Hello World" in text
assert images == []
def test_multi_page_text_extraction(self) -> None:
text, _, _ = read_pdf_file(_load("multipage.pdf"))
assert "Page one content" in text
assert "Page two content" in text
def test_metadata_extraction(self) -> None:
_, pdf_metadata, _ = read_pdf_file(_load("with_metadata.pdf"))
assert pdf_metadata.get("Title") == "My Title"
assert pdf_metadata.get("Author") == "Jane Doe"
def test_encrypted_pdf_with_correct_password(self) -> None:
text, _, _ = read_pdf_file(_load("encrypted.pdf"), pdf_pass="pass123")
assert "Secret Content" in text
def test_encrypted_pdf_without_password(self) -> None:
text, _, _ = read_pdf_file(_load("encrypted.pdf"))
assert text == ""
def test_encrypted_pdf_with_wrong_password(self) -> None:
text, _, _ = read_pdf_file(_load("encrypted.pdf"), pdf_pass="wrong")
assert text == ""
def test_empty_pdf(self) -> None:
text, _, _ = read_pdf_file(_load("empty.pdf"))
assert text.strip() == ""
def test_invalid_pdf_returns_empty(self) -> None:
text, _, images = read_pdf_file(BytesIO(b"this is not a pdf"))
assert text == ""
assert images == []
def test_image_extraction_disabled_by_default(self) -> None:
_, _, images = read_pdf_file(_load("with_image.pdf"))
assert images == []
def test_image_extraction_collects_images(self) -> None:
_, _, images = read_pdf_file(_load("with_image.pdf"), extract_images=True)
assert len(images) == 1
img_bytes, img_name = images[0]
assert len(img_bytes) > 0
assert img_name # non-empty name
def test_image_callback_streams_instead_of_collecting(self) -> None:
"""With image_callback, images are streamed via callback and not accumulated."""
collected: list[tuple[bytes, str]] = []
def callback(data: bytes, name: str) -> None:
collected.append((data, name))
_, _, images = read_pdf_file(
_load("with_image.pdf"), extract_images=True, image_callback=callback
)
# Callback received the image
assert len(collected) == 1
assert len(collected[0][0]) > 0
# Returned list is empty when callback is used
assert images == []
# ── pdf_to_text ──────────────────────────────────────────────────────────
class TestPdfToText:
def test_returns_text(self) -> None:
assert "Hello World" in pdf_to_text(_load("simple.pdf"))
def test_with_password(self) -> None:
assert "Secret Content" in pdf_to_text(
_load("encrypted.pdf"), pdf_pass="pass123"
)
def test_encrypted_without_password_returns_empty(self) -> None:
assert pdf_to_text(_load("encrypted.pdf")) == ""
# ── is_pdf_protected ─────────────────────────────────────────────────────
class TestIsPdfProtected:
def test_unprotected_pdf(self) -> None:
assert is_pdf_protected(_load("simple.pdf")) is False
def test_protected_pdf(self) -> None:
assert is_pdf_protected(_load("encrypted.pdf")) is True
def test_preserves_file_position(self) -> None:
pdf = _load("simple.pdf")
pdf.seek(42)
is_pdf_protected(pdf)
assert pdf.tell() == 42

View File

@@ -0,0 +1,479 @@
"""Unit tests for the hook executor."""
import json
from typing import Any
from unittest.mock import MagicMock
from unittest.mock import patch
import httpx
import pytest
from onyx.db.enums import HookFailStrategy
from onyx.db.enums import HookPoint
from onyx.error_handling.error_codes import OnyxErrorCode
from onyx.error_handling.exceptions import OnyxError
from onyx.hooks.executor import execute_hook
from onyx.hooks.executor import HookSkipped
from onyx.hooks.executor import HookSoftFailed
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
_PAYLOAD: dict[str, Any] = {"query": "test", "user_email": "u@example.com"}
_RESPONSE_PAYLOAD: dict[str, Any] = {"rewritten_query": "better test"}
def _make_hook(
*,
is_active: bool = True,
endpoint_url: str | None = "https://hook.example.com/query",
api_key: MagicMock | None = None,
timeout_seconds: float = 5.0,
fail_strategy: HookFailStrategy = HookFailStrategy.SOFT,
hook_id: int = 1,
) -> MagicMock:
hook = MagicMock()
hook.is_active = is_active
hook.endpoint_url = endpoint_url
hook.api_key = api_key
hook.timeout_seconds = timeout_seconds
hook.id = hook_id
hook.fail_strategy = fail_strategy
return hook
def _make_api_key(value: str) -> MagicMock:
api_key = MagicMock()
api_key.get_value.return_value = value
return api_key
def _make_response(
*,
status_code: int = 200,
json_return: Any = _RESPONSE_PAYLOAD,
json_side_effect: Exception | None = None,
) -> MagicMock:
"""Build a response mock with controllable json() behaviour."""
response = MagicMock()
response.status_code = status_code
if json_side_effect is not None:
response.json.side_effect = json_side_effect
else:
response.json.return_value = json_return
return response
def _setup_client(
mock_client_cls: MagicMock,
*,
response: MagicMock | None = None,
side_effect: Exception | None = None,
) -> MagicMock:
"""Wire up the httpx.Client mock and return the inner client.
If side_effect is an httpx.HTTPStatusError, it is raised from
raise_for_status() (matching real httpx behaviour) and post() returns a
response mock with the matching status_code set. All other exceptions are
raised directly from post().
"""
mock_client = MagicMock()
if isinstance(side_effect, httpx.HTTPStatusError):
error_response = MagicMock()
error_response.status_code = side_effect.response.status_code
error_response.raise_for_status.side_effect = side_effect
mock_client.post = MagicMock(return_value=error_response)
else:
mock_client.post = MagicMock(
side_effect=side_effect, return_value=response if not side_effect else None
)
mock_client_cls.return_value.__enter__ = MagicMock(return_value=mock_client)
mock_client_cls.return_value.__exit__ = MagicMock(return_value=False)
return mock_client
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture()
def db_session() -> MagicMock:
return MagicMock()
# ---------------------------------------------------------------------------
# Early-exit guards (no HTTP call, no DB writes)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
"hooks_available,hook",
[
# HOOKS_AVAILABLE=False exits before the DB lookup — hook is irrelevant.
pytest.param(False, None, id="hooks_not_available"),
pytest.param(True, None, id="hook_not_found"),
pytest.param(True, _make_hook(is_active=False), id="hook_inactive"),
pytest.param(True, _make_hook(endpoint_url=None), id="no_endpoint_url"),
],
)
def test_early_exit_returns_skipped_with_no_db_writes(
db_session: MagicMock,
hooks_available: bool,
hook: MagicMock | None,
) -> None:
with (
patch("onyx.hooks.executor.HOOKS_AVAILABLE", hooks_available),
patch(
"onyx.hooks.executor.get_non_deleted_hook_by_hook_point",
return_value=hook,
),
patch("onyx.hooks.executor.update_hook__no_commit") as mock_update,
patch("onyx.hooks.executor.create_hook_execution_log__no_commit") as mock_log,
):
result = execute_hook(
db_session=db_session,
hook_point=HookPoint.QUERY_PROCESSING,
payload=_PAYLOAD,
)
assert isinstance(result, HookSkipped)
mock_update.assert_not_called()
mock_log.assert_not_called()
# ---------------------------------------------------------------------------
# Successful HTTP call
# ---------------------------------------------------------------------------
def test_success_returns_payload_and_sets_reachable(db_session: MagicMock) -> None:
hook = _make_hook()
with (
patch("onyx.hooks.executor.HOOKS_AVAILABLE", True),
patch(
"onyx.hooks.executor.get_non_deleted_hook_by_hook_point",
return_value=hook,
),
patch("onyx.hooks.executor.get_session_with_current_tenant"),
patch("onyx.hooks.executor.update_hook__no_commit") as mock_update,
patch("onyx.hooks.executor.create_hook_execution_log__no_commit") as mock_log,
patch("httpx.Client") as mock_client_cls,
):
_setup_client(mock_client_cls, response=_make_response())
result = execute_hook(
db_session=db_session,
hook_point=HookPoint.QUERY_PROCESSING,
payload=_PAYLOAD,
)
assert result == _RESPONSE_PAYLOAD
_, update_kwargs = mock_update.call_args
assert update_kwargs["is_reachable"] is True
mock_log.assert_not_called()
def test_non_dict_json_response_is_a_failure(db_session: MagicMock) -> None:
"""response.json() returning a non-dict (e.g. list) must be treated as failure."""
hook = _make_hook(fail_strategy=HookFailStrategy.SOFT)
with (
patch("onyx.hooks.executor.HOOKS_AVAILABLE", True),
patch(
"onyx.hooks.executor.get_non_deleted_hook_by_hook_point",
return_value=hook,
),
patch("onyx.hooks.executor.get_session_with_current_tenant"),
patch("onyx.hooks.executor.update_hook__no_commit"),
patch("onyx.hooks.executor.create_hook_execution_log__no_commit") as mock_log,
patch("httpx.Client") as mock_client_cls,
):
_setup_client(
mock_client_cls,
response=_make_response(json_return=["unexpected", "list"]),
)
result = execute_hook(
db_session=db_session,
hook_point=HookPoint.QUERY_PROCESSING,
payload=_PAYLOAD,
)
assert isinstance(result, HookSoftFailed)
_, log_kwargs = mock_log.call_args
assert log_kwargs["is_success"] is False
assert "non-dict" in (log_kwargs["error_message"] or "")
def test_json_decode_failure_is_a_failure(db_session: MagicMock) -> None:
"""response.json() raising must be treated as failure with SOFT strategy."""
hook = _make_hook(fail_strategy=HookFailStrategy.SOFT)
with (
patch("onyx.hooks.executor.HOOKS_AVAILABLE", True),
patch(
"onyx.hooks.executor.get_non_deleted_hook_by_hook_point",
return_value=hook,
),
patch("onyx.hooks.executor.get_session_with_current_tenant"),
patch("onyx.hooks.executor.update_hook__no_commit"),
patch("onyx.hooks.executor.create_hook_execution_log__no_commit") as mock_log,
patch("httpx.Client") as mock_client_cls,
):
_setup_client(
mock_client_cls,
response=_make_response(
json_side_effect=json.JSONDecodeError("not JSON", "", 0)
),
)
result = execute_hook(
db_session=db_session,
hook_point=HookPoint.QUERY_PROCESSING,
payload=_PAYLOAD,
)
assert isinstance(result, HookSoftFailed)
_, log_kwargs = mock_log.call_args
assert log_kwargs["is_success"] is False
assert "non-JSON" in (log_kwargs["error_message"] or "")
# ---------------------------------------------------------------------------
# HTTP failure paths
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
"exception,fail_strategy,expected_type,expected_is_reachable",
[
pytest.param(
httpx.ConnectError("refused"),
HookFailStrategy.SOFT,
HookSoftFailed,
False,
id="connect_error_soft",
),
pytest.param(
httpx.ConnectError("refused"),
HookFailStrategy.HARD,
OnyxError,
False,
id="connect_error_hard",
),
pytest.param(
httpx.TimeoutException("timeout"),
HookFailStrategy.SOFT,
HookSoftFailed,
True,
id="timeout_soft",
),
pytest.param(
httpx.TimeoutException("timeout"),
HookFailStrategy.HARD,
OnyxError,
True,
id="timeout_hard",
),
pytest.param(
httpx.HTTPStatusError(
"500",
request=MagicMock(),
response=MagicMock(status_code=500, text="error"),
),
HookFailStrategy.SOFT,
HookSoftFailed,
True,
id="http_status_error_soft",
),
pytest.param(
httpx.HTTPStatusError(
"500",
request=MagicMock(),
response=MagicMock(status_code=500, text="error"),
),
HookFailStrategy.HARD,
OnyxError,
True,
id="http_status_error_hard",
),
],
)
def test_http_failure_paths(
db_session: MagicMock,
exception: Exception,
fail_strategy: HookFailStrategy,
expected_type: type,
expected_is_reachable: bool,
) -> None:
hook = _make_hook(fail_strategy=fail_strategy)
with (
patch("onyx.hooks.executor.HOOKS_AVAILABLE", True),
patch(
"onyx.hooks.executor.get_non_deleted_hook_by_hook_point",
return_value=hook,
),
patch("onyx.hooks.executor.get_session_with_current_tenant"),
patch("onyx.hooks.executor.update_hook__no_commit") as mock_update,
patch("onyx.hooks.executor.create_hook_execution_log__no_commit"),
patch("httpx.Client") as mock_client_cls,
):
_setup_client(mock_client_cls, side_effect=exception)
if expected_type is OnyxError:
with pytest.raises(OnyxError) as exc_info:
execute_hook(
db_session=db_session,
hook_point=HookPoint.QUERY_PROCESSING,
payload=_PAYLOAD,
)
assert exc_info.value.error_code is OnyxErrorCode.HOOK_EXECUTION_FAILED
else:
result = execute_hook(
db_session=db_session,
hook_point=HookPoint.QUERY_PROCESSING,
payload=_PAYLOAD,
)
assert isinstance(result, expected_type)
mock_update.assert_called_once()
_, kwargs = mock_update.call_args
assert kwargs["is_reachable"] is expected_is_reachable
# ---------------------------------------------------------------------------
# Authorization header
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
"api_key_value,expect_auth_header",
[
pytest.param("secret-token", True, id="api_key_present"),
pytest.param(None, False, id="api_key_absent"),
],
)
def test_authorization_header(
db_session: MagicMock,
api_key_value: str | None,
expect_auth_header: bool,
) -> None:
api_key = _make_api_key(api_key_value) if api_key_value else None
hook = _make_hook(api_key=api_key)
with (
patch("onyx.hooks.executor.HOOKS_AVAILABLE", True),
patch(
"onyx.hooks.executor.get_non_deleted_hook_by_hook_point",
return_value=hook,
),
patch("onyx.hooks.executor.get_session_with_current_tenant"),
patch("onyx.hooks.executor.update_hook__no_commit"),
patch("onyx.hooks.executor.create_hook_execution_log__no_commit"),
patch("httpx.Client") as mock_client_cls,
):
mock_client = _setup_client(mock_client_cls, response=_make_response())
execute_hook(
db_session=db_session,
hook_point=HookPoint.QUERY_PROCESSING,
payload=_PAYLOAD,
)
_, call_kwargs = mock_client.post.call_args
if expect_auth_header:
assert call_kwargs["headers"]["Authorization"] == f"Bearer {api_key_value}"
else:
assert "Authorization" not in call_kwargs["headers"]
# ---------------------------------------------------------------------------
# Persist session failure
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
"http_exception,expected_result",
[
pytest.param(None, _RESPONSE_PAYLOAD, id="success_path"),
pytest.param(httpx.ConnectError("refused"), OnyxError, id="hard_fail_path"),
],
)
def test_persist_session_failure_is_swallowed(
db_session: MagicMock,
http_exception: Exception | None,
expected_result: Any,
) -> None:
"""DB session failure in _persist_result must not mask the real return value or OnyxError."""
hook = _make_hook(fail_strategy=HookFailStrategy.HARD)
with (
patch("onyx.hooks.executor.HOOKS_AVAILABLE", True),
patch(
"onyx.hooks.executor.get_non_deleted_hook_by_hook_point",
return_value=hook,
),
patch(
"onyx.hooks.executor.get_session_with_current_tenant",
side_effect=RuntimeError("DB unavailable"),
),
patch("httpx.Client") as mock_client_cls,
):
_setup_client(
mock_client_cls,
response=_make_response() if not http_exception else None,
side_effect=http_exception,
)
if expected_result is OnyxError:
with pytest.raises(OnyxError) as exc_info:
execute_hook(
db_session=db_session,
hook_point=HookPoint.QUERY_PROCESSING,
payload=_PAYLOAD,
)
assert exc_info.value.error_code is OnyxErrorCode.HOOK_EXECUTION_FAILED
else:
result = execute_hook(
db_session=db_session,
hook_point=HookPoint.QUERY_PROCESSING,
payload=_PAYLOAD,
)
assert result == expected_result
def test_is_reachable_failure_does_not_prevent_log(db_session: MagicMock) -> None:
"""is_reachable update failing (e.g. concurrent hook deletion) must not
prevent the execution log from being written.
Simulates the production failure path: update_hook__no_commit raises
OnyxError(NOT_FOUND) as it would if the hook was concurrently deleted
between the initial lookup and the reachable update.
"""
hook = _make_hook(fail_strategy=HookFailStrategy.SOFT)
with (
patch("onyx.hooks.executor.HOOKS_AVAILABLE", True),
patch(
"onyx.hooks.executor.get_non_deleted_hook_by_hook_point",
return_value=hook,
),
patch("onyx.hooks.executor.get_session_with_current_tenant"),
patch(
"onyx.hooks.executor.update_hook__no_commit",
side_effect=OnyxError(OnyxErrorCode.NOT_FOUND, "hook deleted"),
),
patch("onyx.hooks.executor.create_hook_execution_log__no_commit") as mock_log,
patch("httpx.Client") as mock_client_cls,
):
_setup_client(mock_client_cls, side_effect=httpx.ConnectError("refused"))
result = execute_hook(
db_session=db_session,
hook_point=HookPoint.QUERY_PROCESSING,
payload=_PAYLOAD,
)
assert isinstance(result, HookSoftFailed)
mock_log.assert_called_once()

View File

@@ -0,0 +1,109 @@
import io
import zipfile
from unittest.mock import MagicMock
from unittest.mock import patch
from zipfile import BadZipFile
import pytest
from fastapi import UploadFile
from starlette.datastructures import Headers
from onyx.configs.constants import FileOrigin
from onyx.server.documents.connector import upload_files
def _create_test_zip() -> bytes:
"""Create a simple in-memory zip file containing two text files."""
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
zf.writestr("file1.txt", "hello")
zf.writestr("file2.txt", "world")
return buf.getvalue()
def _make_upload_file(content: bytes, filename: str, content_type: str) -> UploadFile:
return UploadFile(
file=io.BytesIO(content),
filename=filename,
headers=Headers({"content-type": content_type}),
)
@patch("onyx.server.documents.connector.get_default_file_store")
def test_upload_zip_with_unzip_true_extracts_files(
mock_get_store: MagicMock,
) -> None:
"""When unzip=True (default), a zip upload is extracted into individual files."""
mock_store = MagicMock()
mock_store.save_file.side_effect = lambda **kwargs: f"id-{kwargs['display_name']}"
mock_get_store.return_value = mock_store
zip_bytes = _create_test_zip()
upload = _make_upload_file(zip_bytes, "test.zip", "application/zip")
result = upload_files([upload], FileOrigin.CONNECTOR)
# Should have extracted the two individual files, not stored the zip itself
assert len(result.file_paths) == 2
assert "id-file1.txt" in result.file_paths
assert "id-file2.txt" in result.file_paths
assert "file1.txt" in result.file_names
assert "file2.txt" in result.file_names
@patch("onyx.server.documents.connector.get_default_file_store")
def test_upload_zip_with_unzip_false_stores_zip_as_is(
mock_get_store: MagicMock,
) -> None:
"""When unzip=False, the zip file is stored as-is without extraction."""
mock_store = MagicMock()
mock_store.save_file.return_value = "zip-file-id"
mock_get_store.return_value = mock_store
zip_bytes = _create_test_zip()
upload = _make_upload_file(zip_bytes, "site_export.zip", "application/zip")
result = upload_files([upload], FileOrigin.CONNECTOR, unzip=False)
# Should store exactly one file (the zip itself)
assert len(result.file_paths) == 1
assert result.file_paths[0] == "zip-file-id"
assert result.file_names == ["site_export.zip"]
# No zip metadata should be created
assert result.zip_metadata_file_id is None
# Verify the stored content is a valid zip
saved_content: io.BytesIO = mock_store.save_file.call_args[1]["content"]
saved_content.seek(0)
with zipfile.ZipFile(saved_content, "r") as zf:
assert set(zf.namelist()) == {"file1.txt", "file2.txt"}
@patch("onyx.server.documents.connector.get_default_file_store")
def test_upload_invalid_zip_with_unzip_false_raises(
mock_get_store: MagicMock,
) -> None:
"""An invalid zip is rejected even when unzip=False (validation still runs)."""
mock_get_store.return_value = MagicMock()
bad_zip = _make_upload_file(b"not a zip", "bad.zip", "application/zip")
with pytest.raises(BadZipFile):
upload_files([bad_zip], FileOrigin.CONNECTOR, unzip=False)
@patch("onyx.server.documents.connector.get_default_file_store")
def test_upload_multiple_zips_rejected_when_unzip_false(
mock_get_store: MagicMock,
) -> None:
"""The seen_zip guard rejects a second zip even when unzip=False."""
mock_store = MagicMock()
mock_store.save_file.return_value = "zip-id"
mock_get_store.return_value = mock_store
zip_bytes = _create_test_zip()
zip1 = _make_upload_file(zip_bytes, "a.zip", "application/zip")
zip2 = _make_upload_file(zip_bytes, "b.zip", "application/zip")
with pytest.raises(Exception, match="Only one zip file"):
upload_files([zip1, zip2], FileOrigin.CONNECTOR, unzip=False)

View File

@@ -8,7 +8,7 @@
"name": "widget",
"version": "0.1.0",
"dependencies": {
"next": "^16.1.5",
"next": "^16.1.7",
"react": "^19",
"react-dom": "^19",
"react-markdown": "^10.1.0"
@@ -1023,9 +1023,9 @@
}
},
"node_modules/@next/env": {
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/env/-/env-16.1.5.tgz",
"integrity": "sha512-CRSCPJiSZoi4Pn69RYBDI9R7YK2g59vLexPQFXY0eyw+ILevIenCywzg+DqmlBik9zszEnw2HLFOUlLAcJbL7g==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/env/-/env-16.1.7.tgz",
"integrity": "sha512-rJJbIdJB/RQr2F1nylZr/PJzamvNNhfr3brdKP6s/GW850jbtR70QlSfFselvIBbcPUOlQwBakexjFzqLzF6pg==",
"license": "MIT"
},
"node_modules/@next/eslint-plugin-next": {
@@ -1039,9 +1039,9 @@
}
},
"node_modules/@next/swc-darwin-arm64": {
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-16.1.5.tgz",
"integrity": "sha512-eK7Wdm3Hjy/SCL7TevlH0C9chrpeOYWx2iR7guJDaz4zEQKWcS1IMVfMb9UKBFMg1XgzcPTYPIp1Vcpukkjg6Q==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-16.1.7.tgz",
"integrity": "sha512-b2wWIE8sABdyafc4IM8r5Y/dS6kD80JRtOGrUiKTsACFQfWWgUQ2NwoUX1yjFMXVsAwcQeNpnucF2ZrujsBBPg==",
"cpu": [
"arm64"
],
@@ -1055,9 +1055,9 @@
}
},
"node_modules/@next/swc-darwin-x64": {
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-16.1.5.tgz",
"integrity": "sha512-foQscSHD1dCuxBmGkbIr6ScAUF6pRoDZP6czajyvmXPAOFNnQUJu2Os1SGELODjKp/ULa4fulnBWoHV3XdPLfA==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-16.1.7.tgz",
"integrity": "sha512-zcnVaaZulS1WL0Ss38R5Q6D2gz7MtBu8GZLPfK+73D/hp4GFMrC2sudLky1QibfV7h6RJBJs/gOFvYP0X7UVlQ==",
"cpu": [
"x64"
],
@@ -1071,9 +1071,9 @@
}
},
"node_modules/@next/swc-linux-arm64-gnu": {
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-16.1.5.tgz",
"integrity": "sha512-qNIb42o3C02ccIeSeKjacF3HXotGsxh/FMk/rSRmCzOVMtoWH88odn2uZqF8RLsSUWHcAqTgYmPD3pZ03L9ZAA==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-16.1.7.tgz",
"integrity": "sha512-2ant89Lux/Q3VyC8vNVg7uBaFVP9SwoK2jJOOR0L8TQnX8CAYnh4uctAScy2Hwj2dgjVHqHLORQZJ2wH6VxhSQ==",
"cpu": [
"arm64"
],
@@ -1087,9 +1087,9 @@
}
},
"node_modules/@next/swc-linux-arm64-musl": {
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-16.1.5.tgz",
"integrity": "sha512-U+kBxGUY1xMAzDTXmuVMfhaWUZQAwzRaHJ/I6ihtR5SbTVUEaDRiEU9YMjy1obBWpdOBuk1bcm+tsmifYSygfw==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-16.1.7.tgz",
"integrity": "sha512-uufcze7LYv0FQg9GnNeZ3/whYfo+1Q3HnQpm16o6Uyi0OVzLlk2ZWoY7j07KADZFY8qwDbsmFnMQP3p3+Ftprw==",
"cpu": [
"arm64"
],
@@ -1103,9 +1103,9 @@
}
},
"node_modules/@next/swc-linux-x64-gnu": {
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-16.1.5.tgz",
"integrity": "sha512-gq2UtoCpN7Ke/7tKaU7i/1L7eFLfhMbXjNghSv0MVGF1dmuoaPeEVDvkDuO/9LVa44h5gqpWeJ4mRRznjDv7LA==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-16.1.7.tgz",
"integrity": "sha512-KWVf2gxYvHtvuT+c4MBOGxuse5TD7DsMFYSxVxRBnOzok/xryNeQSjXgxSv9QpIVlaGzEn/pIuI6Koosx8CGWA==",
"cpu": [
"x64"
],
@@ -1119,9 +1119,9 @@
}
},
"node_modules/@next/swc-linux-x64-musl": {
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-16.1.5.tgz",
"integrity": "sha512-bQWSE729PbXT6mMklWLf8dotislPle2L70E9q6iwETYEOt092GDn0c+TTNj26AjmeceSsC4ndyGsK5nKqHYXjQ==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-16.1.7.tgz",
"integrity": "sha512-HguhaGwsGr1YAGs68uRKc4aGWxLET+NevJskOcCAwXbwj0fYX0RgZW2gsOCzr9S11CSQPIkxmoSbuVaBp4Z3dA==",
"cpu": [
"x64"
],
@@ -1135,9 +1135,9 @@
}
},
"node_modules/@next/swc-win32-arm64-msvc": {
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-16.1.5.tgz",
"integrity": "sha512-LZli0anutkIllMtTAWZlDqdfvjWX/ch8AFK5WgkNTvaqwlouiD1oHM+WW8RXMiL0+vAkAJyAGEzPPjO+hnrSNQ==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-16.1.7.tgz",
"integrity": "sha512-S0n3KrDJokKTeFyM/vGGGR8+pCmXYrjNTk2ZozOL1C/JFdfUIL9O1ATaJOl5r2POe56iRChbsszrjMAdWSv7kQ==",
"cpu": [
"arm64"
],
@@ -1151,9 +1151,9 @@
}
},
"node_modules/@next/swc-win32-x64-msvc": {
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-16.1.5.tgz",
"integrity": "sha512-7is37HJTNQGhjPpQbkKjKEboHYQnCgpVt/4rBrrln0D9nderNxZ8ZWs8w1fAtzUx7wEyYjQ+/13myFgFj6K2Ng==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-16.1.7.tgz",
"integrity": "sha512-mwgtg8CNZGYm06LeEd+bNnOUfwOyNem/rOiP14Lsz+AnUY92Zq/LXwtebtUiaeVkhbroRCQ0c8GlR4UT1U+0yg==",
"cpu": [
"x64"
],
@@ -2564,12 +2564,15 @@
"dev": true
},
"node_modules/baseline-browser-mapping": {
"version": "2.9.14",
"resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.14.tgz",
"integrity": "sha512-B0xUquLkiGLgHhpPBqvl7GWegWBUNuujQ6kXd/r1U38ElPT6Ok8KZ8e+FpUGEc2ZoRQUzq/aUnaKFc/svWUGSg==",
"version": "2.10.8",
"resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.10.8.tgz",
"integrity": "sha512-PCLz/LXGBsNTErbtB6i5u4eLpHeMfi93aUv5duMmj6caNu6IphS4q6UevDnL36sZQv9lrP11dbPKGMaXPwMKfQ==",
"license": "Apache-2.0",
"bin": {
"baseline-browser-mapping": "dist/cli.js"
"baseline-browser-mapping": "dist/cli.cjs"
},
"engines": {
"node": ">=6.0.0"
}
},
"node_modules/brace-expansion": {
@@ -5926,14 +5929,14 @@
"dev": true
},
"node_modules/next": {
"version": "16.1.5",
"resolved": "https://registry.npmjs.org/next/-/next-16.1.5.tgz",
"integrity": "sha512-f+wE+NSbiQgh3DSAlTaw2FwY5yGdVViAtp8TotNQj4kk4Q8Bh1sC/aL9aH+Rg1YAVn18OYXsRDT7U/079jgP7w==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/next/-/next-16.1.7.tgz",
"integrity": "sha512-WM0L7WrSvKwoLegLYr6V+mz+RIofqQgVAfHhMp9a88ms0cFX8iX9ew+snpWlSBwpkURJOUdvCEt3uLl3NNzvWg==",
"license": "MIT",
"dependencies": {
"@next/env": "16.1.5",
"@next/env": "16.1.7",
"@swc/helpers": "0.5.15",
"baseline-browser-mapping": "^2.8.3",
"baseline-browser-mapping": "^2.9.19",
"caniuse-lite": "^1.0.30001579",
"postcss": "8.4.31",
"styled-jsx": "5.1.6"
@@ -5945,14 +5948,14 @@
"node": ">=20.9.0"
},
"optionalDependencies": {
"@next/swc-darwin-arm64": "16.1.5",
"@next/swc-darwin-x64": "16.1.5",
"@next/swc-linux-arm64-gnu": "16.1.5",
"@next/swc-linux-arm64-musl": "16.1.5",
"@next/swc-linux-x64-gnu": "16.1.5",
"@next/swc-linux-x64-musl": "16.1.5",
"@next/swc-win32-arm64-msvc": "16.1.5",
"@next/swc-win32-x64-msvc": "16.1.5",
"@next/swc-darwin-arm64": "16.1.7",
"@next/swc-darwin-x64": "16.1.7",
"@next/swc-linux-arm64-gnu": "16.1.7",
"@next/swc-linux-arm64-musl": "16.1.7",
"@next/swc-linux-x64-gnu": "16.1.7",
"@next/swc-linux-x64-musl": "16.1.7",
"@next/swc-win32-arm64-msvc": "16.1.7",
"@next/swc-win32-x64-msvc": "16.1.7",
"sharp": "^0.34.4"
},
"peerDependencies": {

View File

@@ -9,7 +9,7 @@
"lint": "next lint"
},
"dependencies": {
"next": "^16.1.5",
"next": "^16.1.7",
"react": "^19",
"react-dom": "^19",
"react-markdown": "^10.1.0"

View File

@@ -92,7 +92,7 @@ backend = [
"python-gitlab==5.6.0",
"python-pptx==0.6.23",
"pypandoc_binary==1.16.2",
"pypdf==6.8.0",
"pypdf==6.9.1",
"pytest-mock==3.12.0",
"pytest-playwright==0.7.0",
"python-docx==1.1.2",
@@ -245,6 +245,7 @@ select = [
"ARG",
"E",
"F",
"S324",
"W",
]

8
uv.lock generated
View File

@@ -4481,7 +4481,7 @@ requires-dist = [
{ name = "pygithub", marker = "extra == 'backend'", specifier = "==2.5.0" },
{ name = "pympler", marker = "extra == 'backend'", specifier = "==1.1" },
{ name = "pypandoc-binary", marker = "extra == 'backend'", specifier = "==1.16.2" },
{ name = "pypdf", marker = "extra == 'backend'", specifier = "==6.8.0" },
{ name = "pypdf", marker = "extra == 'backend'", specifier = "==6.9.1" },
{ name = "pytest", marker = "extra == 'dev'", specifier = "==8.3.5" },
{ name = "pytest-alembic", marker = "extra == 'dev'", specifier = "==0.12.1" },
{ name = "pytest-asyncio", marker = "extra == 'dev'", specifier = "==1.3.0" },
@@ -5727,11 +5727,11 @@ wheels = [
[[package]]
name = "pypdf"
version = "6.8.0"
version = "6.9.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/b4/a3/e705b0805212b663a4c27b861c8a603dba0f8b4bb281f96f8e746576a50d/pypdf-6.8.0.tar.gz", hash = "sha256:cb7eaeaa4133ce76f762184069a854e03f4d9a08568f0e0623f7ea810407833b", size = 5307831, upload-time = "2026-03-09T13:37:40.591Z" }
sdist = { url = "https://files.pythonhosted.org/packages/f9/fb/dc2e8cb006e80b0020ed20d8649106fe4274e82d8e756ad3e24ade19c0df/pypdf-6.9.1.tar.gz", hash = "sha256:ae052407d33d34de0c86c5c729be6d51010bf36e03035a8f23ab449bca52377d", size = 5311551, upload-time = "2026-03-17T10:46:07.876Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/8c/ec/4ccf3bb86b1afe5d7176e1c8abcdbf22b53dd682ec2eda50e1caadcf6846/pypdf-6.8.0-py3-none-any.whl", hash = "sha256:2a025080a8dd73f48123c89c57174a5ff3806c71763ee4e49572dc90454943c7", size = 332177, upload-time = "2026-03-09T13:37:38.774Z" },
{ url = "https://files.pythonhosted.org/packages/f9/f4/75543fa802b86e72f87e9395440fe1a89a6d149887e3e55745715c3352ac/pypdf-6.9.1-py3-none-any.whl", hash = "sha256:f35a6a022348fae47e092a908339a8f3dc993510c026bb39a96718fc7185e89f", size = 333661, upload-time = "2026-03-17T10:46:06.286Z" },
]
[[package]]

View File

@@ -174,6 +174,7 @@ function ContentLg({
)}
onClick={editable ? startEditing : undefined}
style={{ height: config.lineHeight }}
title={title}
>
{title}
</span>

View File

@@ -218,6 +218,7 @@ function ContentMd({
"text-text-04",
editable && "cursor-pointer"
)}
title={title}
onClick={editable ? startEditing : undefined}
style={{ height: config.lineHeight }}
>

View File

@@ -118,6 +118,7 @@ function ContentSm({
<span
className={cn("opal-content-sm-title", config.titleFont)}
style={{ height: config.lineHeight }}
title={title}
>
{title}
</span>

View File

@@ -231,6 +231,7 @@ function ContentXl({
)}
onClick={editable ? startEditing : undefined}
style={{ height: config.lineHeight }}
title={title}
>
{title}
</span>

82
web/package-lock.json generated
View File

@@ -61,7 +61,7 @@
"mdast-util-find-and-replace": "^3.0.1",
"mime": "^4.1.0",
"motion": "^12.29.0",
"next": "16.1.6",
"next": "16.1.7",
"next-themes": "^0.4.4",
"postcss": "^8.5.6",
"posthog-js": "^1.176.0",
@@ -2896,9 +2896,9 @@
}
},
"node_modules/@next/env": {
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/env/-/env-16.1.6.tgz",
"integrity": "sha512-N1ySLuZjnAtN3kFnwhAwPvZah8RJxKasD7x1f8shFqhncnWZn4JMfg37diLNuoHsLAlrDfM3g4mawVdtAG8XLQ==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/env/-/env-16.1.7.tgz",
"integrity": "sha512-rJJbIdJB/RQr2F1nylZr/PJzamvNNhfr3brdKP6s/GW850jbtR70QlSfFselvIBbcPUOlQwBakexjFzqLzF6pg==",
"license": "MIT"
},
"node_modules/@next/eslint-plugin-next": {
@@ -2942,9 +2942,9 @@
}
},
"node_modules/@next/swc-darwin-arm64": {
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-16.1.6.tgz",
"integrity": "sha512-wTzYulosJr/6nFnqGW7FrG3jfUUlEf8UjGA0/pyypJl42ExdVgC6xJgcXQ+V8QFn6niSG2Pb8+MIG1mZr2vczw==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-16.1.7.tgz",
"integrity": "sha512-b2wWIE8sABdyafc4IM8r5Y/dS6kD80JRtOGrUiKTsACFQfWWgUQ2NwoUX1yjFMXVsAwcQeNpnucF2ZrujsBBPg==",
"cpu": [
"arm64"
],
@@ -2958,9 +2958,9 @@
}
},
"node_modules/@next/swc-darwin-x64": {
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-16.1.6.tgz",
"integrity": "sha512-BLFPYPDO+MNJsiDWbeVzqvYd4NyuRrEYVB5k2N3JfWncuHAy2IVwMAOlVQDFjj+krkWzhY2apvmekMkfQR0CUQ==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-16.1.7.tgz",
"integrity": "sha512-zcnVaaZulS1WL0Ss38R5Q6D2gz7MtBu8GZLPfK+73D/hp4GFMrC2sudLky1QibfV7h6RJBJs/gOFvYP0X7UVlQ==",
"cpu": [
"x64"
],
@@ -2974,9 +2974,9 @@
}
},
"node_modules/@next/swc-linux-arm64-gnu": {
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-16.1.6.tgz",
"integrity": "sha512-OJYkCd5pj/QloBvoEcJ2XiMnlJkRv9idWA/j0ugSuA34gMT6f5b7vOiCQHVRpvStoZUknhl6/UxOXL4OwtdaBw==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-16.1.7.tgz",
"integrity": "sha512-2ant89Lux/Q3VyC8vNVg7uBaFVP9SwoK2jJOOR0L8TQnX8CAYnh4uctAScy2Hwj2dgjVHqHLORQZJ2wH6VxhSQ==",
"cpu": [
"arm64"
],
@@ -2990,9 +2990,9 @@
}
},
"node_modules/@next/swc-linux-arm64-musl": {
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-16.1.6.tgz",
"integrity": "sha512-S4J2v+8tT3NIO9u2q+S0G5KdvNDjXfAv06OhfOzNDaBn5rw84DGXWndOEB7d5/x852A20sW1M56vhC/tRVbccQ==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-16.1.7.tgz",
"integrity": "sha512-uufcze7LYv0FQg9GnNeZ3/whYfo+1Q3HnQpm16o6Uyi0OVzLlk2ZWoY7j07KADZFY8qwDbsmFnMQP3p3+Ftprw==",
"cpu": [
"arm64"
],
@@ -3006,9 +3006,9 @@
}
},
"node_modules/@next/swc-linux-x64-gnu": {
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-16.1.6.tgz",
"integrity": "sha512-2eEBDkFlMMNQnkTyPBhQOAyn2qMxyG2eE7GPH2WIDGEpEILcBPI/jdSv4t6xupSP+ot/jkfrCShLAa7+ZUPcJQ==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-16.1.7.tgz",
"integrity": "sha512-KWVf2gxYvHtvuT+c4MBOGxuse5TD7DsMFYSxVxRBnOzok/xryNeQSjXgxSv9QpIVlaGzEn/pIuI6Koosx8CGWA==",
"cpu": [
"x64"
],
@@ -3022,9 +3022,9 @@
}
},
"node_modules/@next/swc-linux-x64-musl": {
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-16.1.6.tgz",
"integrity": "sha512-oicJwRlyOoZXVlxmIMaTq7f8pN9QNbdes0q2FXfRsPhfCi8n8JmOZJm5oo1pwDaFbnnD421rVU409M3evFbIqg==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-16.1.7.tgz",
"integrity": "sha512-HguhaGwsGr1YAGs68uRKc4aGWxLET+NevJskOcCAwXbwj0fYX0RgZW2gsOCzr9S11CSQPIkxmoSbuVaBp4Z3dA==",
"cpu": [
"x64"
],
@@ -3038,9 +3038,9 @@
}
},
"node_modules/@next/swc-win32-arm64-msvc": {
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-16.1.6.tgz",
"integrity": "sha512-gQmm8izDTPgs+DCWH22kcDmuUp7NyiJgEl18bcr8irXA5N2m2O+JQIr6f3ct42GOs9c0h8QF3L5SzIxcYAAXXw==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-16.1.7.tgz",
"integrity": "sha512-S0n3KrDJokKTeFyM/vGGGR8+pCmXYrjNTk2ZozOL1C/JFdfUIL9O1ATaJOl5r2POe56iRChbsszrjMAdWSv7kQ==",
"cpu": [
"arm64"
],
@@ -3054,9 +3054,9 @@
}
},
"node_modules/@next/swc-win32-x64-msvc": {
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-16.1.6.tgz",
"integrity": "sha512-NRfO39AIrzBnixKbjuo2YiYhB6o9d8v/ymU9m/Xk8cyVk+k7XylniXkHwjs4s70wedVffc6bQNbufk5v0xEm0A==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-16.1.7.tgz",
"integrity": "sha512-mwgtg8CNZGYm06LeEd+bNnOUfwOyNem/rOiP14Lsz+AnUY92Zq/LXwtebtUiaeVkhbroRCQ0c8GlR4UT1U+0yg==",
"cpu": [
"x64"
],
@@ -14068,14 +14068,14 @@
"license": "MIT"
},
"node_modules/next": {
"version": "16.1.6",
"resolved": "https://registry.npmjs.org/next/-/next-16.1.6.tgz",
"integrity": "sha512-hkyRkcu5x/41KoqnROkfTm2pZVbKxvbZRuNvKXLRXxs3VfyO0WhY50TQS40EuKO9SW3rBj/sF3WbVwDACeMZyw==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/next/-/next-16.1.7.tgz",
"integrity": "sha512-WM0L7WrSvKwoLegLYr6V+mz+RIofqQgVAfHhMp9a88ms0cFX8iX9ew+snpWlSBwpkURJOUdvCEt3uLl3NNzvWg==",
"license": "MIT",
"dependencies": {
"@next/env": "16.1.6",
"@next/env": "16.1.7",
"@swc/helpers": "0.5.15",
"baseline-browser-mapping": "^2.8.3",
"baseline-browser-mapping": "^2.9.19",
"caniuse-lite": "^1.0.30001579",
"postcss": "8.4.31",
"styled-jsx": "5.1.6"
@@ -14087,14 +14087,14 @@
"node": ">=20.9.0"
},
"optionalDependencies": {
"@next/swc-darwin-arm64": "16.1.6",
"@next/swc-darwin-x64": "16.1.6",
"@next/swc-linux-arm64-gnu": "16.1.6",
"@next/swc-linux-arm64-musl": "16.1.6",
"@next/swc-linux-x64-gnu": "16.1.6",
"@next/swc-linux-x64-musl": "16.1.6",
"@next/swc-win32-arm64-msvc": "16.1.6",
"@next/swc-win32-x64-msvc": "16.1.6",
"@next/swc-darwin-arm64": "16.1.7",
"@next/swc-darwin-x64": "16.1.7",
"@next/swc-linux-arm64-gnu": "16.1.7",
"@next/swc-linux-arm64-musl": "16.1.7",
"@next/swc-linux-x64-gnu": "16.1.7",
"@next/swc-linux-x64-musl": "16.1.7",
"@next/swc-win32-arm64-msvc": "16.1.7",
"@next/swc-win32-x64-msvc": "16.1.7",
"sharp": "^0.34.4"
},
"peerDependencies": {

View File

@@ -79,7 +79,7 @@
"mdast-util-find-and-replace": "^3.0.1",
"mime": "^4.1.0",
"motion": "^12.29.0",
"next": "16.1.6",
"next": "16.1.7",
"next-themes": "^0.4.4",
"postcss": "^8.5.6",
"posthog-js": "^1.176.0",

View File

@@ -12,7 +12,7 @@ import { localizeAndPrettify } from "@/lib/time";
import Button from "@/refresh-components/buttons/Button";
import Text from "@/refresh-components/texts/Text";
import { PageSelector } from "@/components/PageSelector";
import { useEffect, useState, useMemo } from "react";
import { useCallback, useEffect, useRef, useState, useMemo } from "react";
import { SvgAlertTriangle } from "@opal/icons";
export interface IndexAttemptErrorsModalProps {
errors: {
@@ -22,93 +22,66 @@ export interface IndexAttemptErrorsModalProps {
onClose: () => void;
onResolveAll: () => void;
isResolvingErrors?: boolean;
onPageChange?: (page: number) => void;
currentPage?: number;
pageSize?: number;
}
const ROW_HEIGHT = 65; // 4rem + 1px for border
export default function IndexAttemptErrorsModal({
errors,
onClose,
onResolveAll,
isResolvingErrors = false,
pageSize: propPageSize,
}: IndexAttemptErrorsModalProps) {
const [calculatedPageSize, setCalculatedPageSize] = useState(10);
const observerRef = useRef<ResizeObserver | null>(null);
const [pageSize, setPageSize] = useState(10);
const [currentPage, setCurrentPage] = useState(1);
// Reset to page 1 when the error list actually changes
useEffect(() => {
setCurrentPage(1);
}, [errors.items.length, errors.total_items]);
const tableContainerRef = useCallback((container: HTMLDivElement | null) => {
if (observerRef.current) {
observerRef.current.disconnect();
observerRef.current = null;
}
useEffect(() => {
const calculatePageSize = () => {
// Modal height is 75% of viewport height
const modalHeight = window.innerHeight * 0.6;
if (!container) return;
// Estimate heights (in pixels):
// - Modal header (title + description): ~120px
// - Table header: ~40px
// - Pagination section: ~80px
// - Modal padding: ~64px (32px top + 32px bottom)
const fixedHeight = 120 + 40 + 80 + 64;
const observer = new ResizeObserver(() => {
const thead = container.querySelector("thead");
const theadHeight = thead?.getBoundingClientRect().height ?? 0;
const availableHeight = container.clientHeight - theadHeight;
const newPageSize = Math.max(3, Math.floor(availableHeight / ROW_HEIGHT));
setPageSize(newPageSize);
});
// Available height for table rows
const availableHeight = modalHeight - fixedHeight;
// Each table row is approximately 60px (including borders and padding)
const rowHeight = 60;
// Calculate how many rows can fit, with a minimum of 3
const rowsPerPage = Math.max(3, Math.floor(availableHeight / rowHeight));
setCalculatedPageSize((prev) => {
// Only update if the new size is significantly different to prevent flickering
if (Math.abs(prev - rowsPerPage) > 0) {
return rowsPerPage;
}
return prev;
});
};
// Initial calculation
calculatePageSize();
// Debounced resize handler to prevent excessive recalculation
let resizeTimeout: NodeJS.Timeout;
const debouncedCalculatePageSize = () => {
clearTimeout(resizeTimeout);
resizeTimeout = setTimeout(calculatePageSize, 100);
};
window.addEventListener("resize", debouncedCalculatePageSize);
return () => {
window.removeEventListener("resize", debouncedCalculatePageSize);
clearTimeout(resizeTimeout);
};
observer.observe(container);
observerRef.current = observer;
}, []);
// Separate effect to reset current page when page size changes
// When data changes, reset to page 1.
// When page size changes (resize), preserve the user's position by
// finding which new page contains the first item they were looking at.
const prevPageSizeRef = useRef(pageSize);
useEffect(() => {
setCurrentPage(1);
}, [calculatedPageSize]);
if (pageSize !== prevPageSizeRef.current) {
setCurrentPage((prev) => {
const firstVisibleIndex = (prev - 1) * prevPageSizeRef.current;
const newPage = Math.floor(firstVisibleIndex / pageSize) + 1;
const totalPages = Math.ceil(errors.items.length / pageSize);
return Math.min(newPage, totalPages);
});
prevPageSizeRef.current = pageSize;
} else {
setCurrentPage(1);
}
}, [errors.items.length, pageSize]);
const pageSize = propPageSize || calculatedPageSize;
// Memoize pagination calculations to prevent unnecessary recalculations
const paginationData = useMemo(() => {
const totalPages = Math.ceil(errors.items.length / pageSize);
const startIndex = (currentPage - 1) * pageSize;
const endIndex = startIndex + pageSize;
const currentPageItems = errors.items.slice(startIndex, endIndex);
return {
totalPages,
currentPageItems,
const currentPageItems = errors.items.slice(
startIndex,
endIndex,
};
startIndex + pageSize
);
return { totalPages, currentPageItems };
}, [errors.items, pageSize, currentPage]);
const hasUnresolvedErrors = useMemo(
@@ -137,7 +110,7 @@ export default function IndexAttemptErrorsModal({
onClose={onClose}
height="fit"
/>
<Modal.Body>
<Modal.Body height="full">
{!isResolvingErrors && (
<div className="flex flex-col gap-2 flex-shrink-0">
<Text as="p">
@@ -152,7 +125,10 @@ export default function IndexAttemptErrorsModal({
</div>
)}
<div className="flex-1 overflow-hidden min-h-0">
<div
ref={tableContainerRef}
className="flex-1 w-full overflow-hidden min-h-0"
>
<Table>
<TableHeader>
<TableRow>
@@ -165,11 +141,11 @@ export default function IndexAttemptErrorsModal({
<TableBody>
{paginationData.currentPageItems.length > 0 ? (
paginationData.currentPageItems.map((error) => (
<TableRow key={error.id} className="h-[60px] max-h-[60px]">
<TableCell className="h-[60px] align-top">
<TableRow key={error.id} className="h-[4rem]">
<TableCell>
{localizeAndPrettify(error.time_created)}
</TableCell>
<TableCell className="h-[60px] align-top">
<TableCell>
{error.document_link ? (
<a
href={error.document_link}
@@ -183,12 +159,12 @@ export default function IndexAttemptErrorsModal({
error.document_id || error.entity_id || "Unknown"
)}
</TableCell>
<TableCell className="h-[60px] align-top p-0">
<div className="h-[60px] overflow-y-auto p-4 whitespace-normal">
<TableCell>
<div className="flex items-center h-[2rem] overflow-y-auto whitespace-normal">
{error.failure_message}
</div>
</TableCell>
<TableCell className="h-[60px] align-top">
<TableCell>
<span
className={`px-2 py-1 rounded text-xs ${
error.is_resolved
@@ -202,7 +178,7 @@ export default function IndexAttemptErrorsModal({
</TableRow>
))
) : (
<TableRow>
<TableRow className="h-[4rem]">
<TableCell
colSpan={4}
className="text-center py-8 text-gray-500"
@@ -215,32 +191,24 @@ export default function IndexAttemptErrorsModal({
</Table>
</div>
<div className="flex-shrink-0">
{paginationData.totalPages > 1 && (
<div className="flex-1 flex justify-center mb-2">
<PageSelector
totalPages={paginationData.totalPages}
currentPage={currentPage}
onPageChange={handlePageChange}
/>
</div>
)}
<div className="flex w-full">
<div className="flex gap-2 ml-auto">
{hasUnresolvedErrors && !isResolvingErrors && (
// TODO(@raunakab): migrate to opal Button once className/iconClassName is resolved
<Button
onClick={onResolveAll}
className="ml-4 whitespace-nowrap"
>
Resolve All
</Button>
)}
</div>
{paginationData.totalPages > 1 && (
<div className="flex w-full justify-center">
<PageSelector
totalPages={paginationData.totalPages}
currentPage={currentPage}
onPageChange={handlePageChange}
/>
</div>
</div>
)}
</Modal.Body>
<Modal.Footer>
{hasUnresolvedErrors && !isResolvingErrors && (
// TODO(@raunakab): migrate to opal Button once className/iconClassName is resolved
<Button onClick={onResolveAll} className="ml-4 whitespace-nowrap">
Resolve All
</Button>
)}
</Modal.Footer>
</Modal.Content>
</Modal>
);

View File

@@ -18,7 +18,7 @@ import { PageSelector } from "@/components/PageSelector";
import { localizeAndPrettify } from "@/lib/time";
import { getDocsProcessedPerMinute } from "@/lib/indexAttempt";
import { InfoIcon } from "@/components/icons/icons";
import ExceptionTraceModal from "@/components/modals/ExceptionTraceModal";
import ExceptionTraceModal from "@/sections/modals/PreviewModal/ExceptionTraceModal";
import SimpleTooltip from "@/refresh-components/SimpleTooltip";
import { SvgClock } from "@opal/icons";
export interface IndexingAttemptsTableProps {

View File

@@ -21,10 +21,13 @@ export const submitGoogleSite = async (
formData.append("files", file);
});
const response = await fetch("/api/manage/admin/connector/file/upload", {
method: "POST",
body: formData,
});
const response = await fetch(
"/api/manage/admin/connector/file/upload?unzip=false",
{
method: "POST",
body: formData,
}
);
const responseJson = await response.json();
if (!response.ok) {
toast.error(`Unable to upload files - ${responseJson.detail}`);

View File

@@ -19,6 +19,7 @@ import {
} from "@/lib/types";
import type { Route } from "next";
import { useRouter } from "next/navigation";
import Truncated from "@/refresh-components/texts/Truncated";
import {
FiChevronDown,
FiChevronRight,
@@ -165,9 +166,7 @@ function ConnectorRow({
onClick={handleRowClick}
>
<TableCell className="">
<p className="max-w-[200px] xl:max-w-[400px] inline-block ellipsis truncate">
{ccPairsIndexingStatus.name}
</p>
<Truncated>{ccPairsIndexingStatus.name}</Truncated>
</TableCell>
<TableCell>
{timeAgo(ccPairsIndexingStatus?.last_success) || "-"}
@@ -246,9 +245,7 @@ function FederatedConnectorRow({
onClick={handleRowClick}
>
<TableCell className="">
<p className="max-w-[200px] xl:max-w-[400px] inline-block ellipsis truncate">
{federatedConnector.name}
</p>
<Truncated>{federatedConnector.name}</Truncated>
</TableCell>
<TableCell>N/A</TableCell>
<TableCell>

View File

@@ -1,53 +0,0 @@
import { useState } from "react";
import Modal from "@/refresh-components/Modal";
import Text from "@/refresh-components/texts/Text";
import { SvgAlertTriangle, SvgCheck, SvgCopy } from "@opal/icons";
interface ExceptionTraceModalProps {
onOutsideClick: () => void;
exceptionTrace: string;
}
export default function ExceptionTraceModal({
onOutsideClick,
exceptionTrace,
}: ExceptionTraceModalProps) {
const [copyClicked, setCopyClicked] = useState(false);
return (
<Modal open onOpenChange={onOutsideClick}>
<Modal.Content width="lg" height="full">
<Modal.Header
icon={SvgAlertTriangle}
title="Full Exception Trace"
onClose={onOutsideClick}
height="fit"
/>
<Modal.Body>
<div className="mb-6">
{!copyClicked ? (
<button
type="button"
onClick={() => {
navigator.clipboard.writeText(exceptionTrace!);
setCopyClicked(true);
setTimeout(() => setCopyClicked(false), 2000);
}}
className="flex w-fit items-center hover:bg-accent-background p-2 border-border border rounded"
>
<Text>Copy full trace</Text>
<SvgCopy className="stroke-text-04 ml-2 h-4 w-4 flex flex-shrink-0" />
</button>
) : (
<div className="flex w-fit items-center hover:bg-accent-background p-2 border-border border rounded cursor-default">
<Text>Copied to clipboard</Text>
<SvgCheck className="stroke-text-04 my-auto ml-2 h-4 w-4 flex flex-shrink-0" />
</div>
)}
</div>
<div className="whitespace-pre-wrap">{exceptionTrace}</div>
</Modal.Body>
</Modal.Content>
</Modal>
);
}

View File

@@ -12,6 +12,7 @@ import {
SvgKey,
} from "@opal/icons";
import { Disabled } from "@opal/core";
import LineItem from "@/refresh-components/buttons/LineItem";
import Popover from "@/refresh-components/Popover";
import Separator from "@/refresh-components/Separator";
import { Section } from "@/layouts/general-layouts";
@@ -78,18 +79,17 @@ export default function UserRowActions({
return (
<>
{user.id && (
<Button
prominence="tertiary"
<LineItem
icon={SvgUsers}
onClick={() => openModal(Modal.EDIT_GROUPS)}
>
Groups &amp; Roles
</Button>
</LineItem>
)}
<Disabled disabled>
<Button prominence="tertiary" variant="danger" icon={SvgUserX}>
<LineItem danger icon={SvgUserX}>
Deactivate User
</Button>
</LineItem>
</Disabled>
<Separator paddingXRem={0.5} />
<Text as="p" secondaryBody text03 className="px-3 py-1">
@@ -102,20 +102,18 @@ export default function UserRowActions({
switch (user.status) {
case UserStatus.INVITED:
return (
<Button
prominence="tertiary"
variant="danger"
<LineItem
danger
icon={SvgXCircle}
onClick={() => openModal(Modal.CANCEL_INVITE)}
>
Cancel Invite
</Button>
</LineItem>
);
case UserStatus.REQUESTED:
return (
<Button
prominence="tertiary"
<LineItem
icon={SvgUserCheck}
onClick={() => {
setPopoverOpen(false);
@@ -133,37 +131,34 @@ export default function UserRowActions({
}}
>
Approve
</Button>
</LineItem>
);
case UserStatus.ACTIVE:
return (
<>
{user.id && (
<Button
prominence="tertiary"
<LineItem
icon={SvgUsers}
onClick={() => openModal(Modal.EDIT_GROUPS)}
>
Groups &amp; Roles
</Button>
</LineItem>
)}
<Button
prominence="tertiary"
<LineItem
icon={SvgKey}
onClick={() => openModal(Modal.RESET_PASSWORD)}
>
Reset Password
</Button>
</LineItem>
<Separator paddingXRem={0.5} />
<Button
prominence="tertiary"
variant="danger"
<LineItem
danger
icon={SvgUserX}
onClick={() => openModal(Modal.DEACTIVATE)}
>
Deactivate User
</Button>
</LineItem>
</>
);
@@ -171,38 +166,34 @@ export default function UserRowActions({
return (
<>
{user.id && (
<Button
prominence="tertiary"
<LineItem
icon={SvgUsers}
onClick={() => openModal(Modal.EDIT_GROUPS)}
>
Groups &amp; Roles
</Button>
</LineItem>
)}
<Button
prominence="tertiary"
<LineItem
icon={SvgKey}
onClick={() => openModal(Modal.RESET_PASSWORD)}
>
Reset Password
</Button>
</LineItem>
<Separator paddingXRem={0.5} />
<Button
prominence="tertiary"
<LineItem
icon={SvgUserPlus}
onClick={() => openModal(Modal.ACTIVATE)}
>
Activate User
</Button>
</LineItem>
<Separator paddingXRem={0.5} />
<Button
prominence="tertiary"
variant="danger"
<LineItem
danger
icon={SvgUserX}
onClick={() => openModal(Modal.DELETE)}
>
Delete User
</Button>
</LineItem>
</>
);

View File

@@ -0,0 +1,39 @@
import Modal from "@/refresh-components/Modal";
import { SvgAlertTriangle } from "@opal/icons";
import { CodePreview } from "@/sections/modals/PreviewModal/variants/CodePreview";
import { CopyButton } from "@/sections/modals/PreviewModal/variants/shared";
import FloatingFooter from "@/sections/modals/PreviewModal/FloatingFooter";
interface ExceptionTraceModalProps {
onOutsideClick: () => void;
exceptionTrace: string;
language?: string;
}
export default function ExceptionTraceModal({
onOutsideClick,
exceptionTrace,
language = "python",
}: ExceptionTraceModalProps) {
return (
<Modal open onOpenChange={onOutsideClick}>
<Modal.Content width="lg" height="full">
<Modal.Header
icon={SvgAlertTriangle}
title="Full Exception Trace"
onClose={onOutsideClick}
height="fit"
/>
<div className="flex flex-col flex-1 min-h-0 overflow-hidden w-full bg-background-tint-01">
<CodePreview content={exceptionTrace} language={language} normalize />
</div>
<FloatingFooter
right={<CopyButton getText={() => exceptionTrace} />}
codeBackground
/>
</Modal.Content>
</Modal>
);
}

View File

@@ -0,0 +1,39 @@
import { cn } from "@/lib/utils";
import { ReactNode } from "react";
interface FloatingFooterProps {
left?: ReactNode;
right?: ReactNode;
codeBackground?: boolean;
}
export default function FloatingFooter({
left,
right,
codeBackground,
}: FloatingFooterProps) {
return (
<div
className={cn(
"absolute bottom-0 left-0 right-0",
"flex items-center justify-between",
"p-4 pointer-events-none w-full"
)}
style={{
background: `linear-gradient(to top, var(--background-${
codeBackground ? "code-01" : "tint-01"
}) 40%, transparent)`,
}}
>
{/* Left slot */}
<div className="pointer-events-auto">{left}</div>
{/* Right slot */}
{right ? (
<div className="pointer-events-auto rounded-12 bg-background-tint-00 p-1 shadow-lg">
{right}
</div>
) : null}
</div>
);
}

View File

@@ -5,8 +5,8 @@ import { MinimalOnyxDocument } from "@/lib/search/interfaces";
import Modal from "@/refresh-components/Modal";
import Text from "@/refresh-components/texts/Text";
import SimpleLoader from "@/refresh-components/loaders/SimpleLoader";
import { cn } from "@/lib/utils";
import { Section } from "@/layouts/general-layouts";
import FloatingFooter from "@/sections/modals/PreviewModal/FloatingFooter";
import mime from "mime";
import {
getCodeLanguage,
@@ -189,30 +189,12 @@ export default function PreviewModal({
)}
</div>
{/* Floating footer */}
{!isLoading && !loadError && (
<div
className={cn(
"absolute bottom-0 left-0 right-0",
"flex items-center justify-between",
"p-4 pointer-events-none w-full"
)}
style={{
background: `linear-gradient(to top, var(--background-${
variant.codeBackground ? "code-01" : "tint-01"
}) 40%, transparent)`,
}}
>
{/* Left slot */}
<div className="pointer-events-auto">
{variant.renderFooterLeft(ctx)}
</div>
{/* Right slot */}
<div className="pointer-events-auto rounded-12 bg-background-tint-00 p-1 shadow-lg">
{variant.renderFooterRight(ctx)}
</div>
</div>
<FloatingFooter
left={variant.renderFooterLeft(ctx)}
right={variant.renderFooterRight(ctx)}
codeBackground={variant.codeBackground}
/>
)}
</Modal.Content>
</Modal>

View File

@@ -12,6 +12,7 @@ import { cn, noProp } from "@/lib/utils";
import { DRAG_TYPES } from "./constants";
import SidebarTab from "@/refresh-components/buttons/SidebarTab";
import IconButton from "@/refresh-components/buttons/IconButton";
import Truncated from "@/refresh-components/texts/Truncated";
import { Button } from "@opal/components";
import ButtonRenaming from "@/refresh-components/buttons/ButtonRenaming";
import type { IconProps } from "@opal/types";
@@ -181,7 +182,7 @@ const ProjectFolderButton = memo(({ project }: ProjectFolderButtonProps) => {
onClose={() => setIsEditing(false)}
/>
) : (
project.name
<Truncated>{project.name}</Truncated>
)}
</SidebarTab>
</Popover.Anchor>

View File

@@ -169,7 +169,9 @@ test.describe("Project Files visual regression", () => {
.first();
await expect(iconWrapper).toBeVisible();
await expectElementScreenshot(filesSection, {
const container = page.locator("[data-main-container]");
await expect(container).toBeVisible();
await expectElementScreenshot(container, {
name: "project-files-long-underscore-filename",
});