Compare commits

..

7 Commits

Author SHA1 Message Date
Dane Urban
190b0475e5 Connector changes 2026-04-17 17:55:06 -07:00
Dane Urban
8d1660c627 Delete documents 2026-04-17 16:27:00 -07:00
Dane Urban
4f6826ca0c . 2026-04-17 13:50:33 -07:00
Dane Urban
895c329c00 file saving 2026-04-17 11:53:55 -07:00
Dane Urban
5303a2bd00 . 2026-04-17 09:58:55 -07:00
Dane Urban
ada1d4f9bc New callback 2026-04-16 18:38:46 -07:00
Dane Urban
0708832290 Add file id col to Document db model 2026-04-16 15:50:29 -07:00
205 changed files with 3304 additions and 4889 deletions

View File

@@ -45,7 +45,7 @@ if [ "$ACTIVE_HOME" != "$MOUNT_HOME" ]; then
[ -d "$MOUNT_HOME/$item" ] || continue
if [ -e "$ACTIVE_HOME/$item" ] && [ ! -L "$ACTIVE_HOME/$item" ]; then
echo "warning: replacing $ACTIVE_HOME/$item with symlink to $MOUNT_HOME/$item" >&2
rm -rf "${ACTIVE_HOME:?}/$item"
rm -rf "$ACTIVE_HOME/$item"
fi
ln -sfn "$MOUNT_HOME/$item" "$ACTIVE_HOME/$item"
done

View File

@@ -39,8 +39,6 @@ jobs:
working-directory: ./web
run: npm ci
- uses: j178/prek-action@cbc2f23eb5539cf20d82d1aabd0d0ecbcc56f4e3
env:
SKIP: ty
with:
prek-version: '0.3.4'
extra-args: ${{ github.event_name == 'pull_request' && format('--from-ref {0} --to-ref {1}', github.event.pull_request.base.sha, github.event.pull_request.head.sha) || github.event_name == 'merge_group' && format('--from-ref {0} --to-ref {1}', github.event.merge_group.base_sha, github.event.merge_group.head_sha) || github.ref_name == 'main' && '--all-files' || '' }}

View File

@@ -68,7 +68,6 @@ repos:
pass_filenames: true
files: ^backend/(?!\.venv/|scripts/).*\.py$
- id: uv-run
alias: ty
name: ty
args: ["ty", "check"]
pass_filenames: true
@@ -86,17 +85,6 @@ repos:
hooks:
- id: actionlint
- repo: https://github.com/shellcheck-py/shellcheck-py
rev: 745eface02aef23e168a8afb6b5737818efbea95 # frozen: v0.11.0.1
hooks:
- id: shellcheck
exclude: >-
(?x)^(
backend/scripts/setup_craft_templates\.sh|
deployment/docker_compose/init-letsencrypt\.sh|
deployment/docker_compose/install\.sh
)$
- repo: https://github.com/psf/black
rev: 8a737e727ac5ab2f1d4cf5876720ed276dc8dc4b # frozen: 25.1.0
hooks:

View File

@@ -0,0 +1,27 @@
"""Add file_id to documents
Revision ID: 91d150c361f6
Revises: d129f37b3d87
Create Date: 2026-04-16 15:43:30.314823
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "91d150c361f6"
down_revision = "d129f37b3d87"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Add a nullable string column named ``file_id`` to the ``document`` table."""
    file_id_column = sa.Column("file_id", sa.String(), nullable=True)
    op.add_column("document", file_id_column)
def downgrade() -> None:
    """Drop the ``file_id`` column from the ``document`` table."""
    op.drop_column(table_name="document", column_name="file_id")

View File

@@ -1,10 +1,8 @@
import time
from typing import cast
from celery import shared_task
from celery import Task
from celery.exceptions import SoftTimeLimitExceeded
from redis.client import Redis
from redis.lock import Lock as RedisLock
from ee.onyx.server.tenants.product_gating import get_gated_tenants
@@ -18,56 +16,9 @@ from onyx.configs.constants import OnyxRedisLocks
from onyx.db.engine.tenant_utils import get_all_tenant_ids
from onyx.redis.redis_pool import get_redis_client
from onyx.redis.redis_pool import redis_lock_dump
from onyx.redis.redis_tenant_work_gating import cleanup_expired
from onyx.redis.redis_tenant_work_gating import get_active_tenants
from onyx.redis.redis_tenant_work_gating import observe_active_set_size
from onyx.redis.redis_tenant_work_gating import record_full_fanout_cycle
from onyx.redis.redis_tenant_work_gating import record_gate_decision
from onyx.server.runtime.onyx_runtime import OnyxRuntime
from shared_configs.configs import IGNORED_SYNCING_TENANT_LIST
_FULL_FANOUT_TIMESTAMP_KEY_PREFIX = "tenant_work_gating_last_full_fanout_ms"
def _should_bypass_gate_for_full_fanout(
redis_client: Redis, task_name: str, interval_seconds: int
) -> bool:
"""True if at least `interval_seconds` have elapsed since the last
full-fanout bypass for this task. On True, updates the stored timestamp
atomically-enough (it's a best-effort counter, not a lock)."""
key = f"{_FULL_FANOUT_TIMESTAMP_KEY_PREFIX}:{task_name}"
now_ms = int(time.time() * 1000)
threshold_ms = now_ms - (interval_seconds * 1000)
try:
raw = cast(bytes | None, redis_client.get(key))
except Exception:
task_logger.exception(f"full-fanout timestamp read failed: task={task_name}")
# Fail open: treat as "interval elapsed" so we don't skip every
# tenant during a Redis hiccup.
return True
if raw is None:
# First invocation — bypass so the set seeds cleanly.
elapsed = True
else:
try:
last_ms = int(raw.decode())
elapsed = last_ms <= threshold_ms
except ValueError:
elapsed = True
if elapsed:
try:
redis_client.set(key, str(now_ms))
except Exception:
task_logger.exception(
f"full-fanout timestamp write failed: task={task_name}"
)
return elapsed
@shared_task(
name=OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
ignore_result=True,
@@ -81,7 +32,6 @@ def cloud_beat_task_generator(
priority: int = OnyxCeleryPriority.MEDIUM,
expires: int = BEAT_EXPIRES_DEFAULT,
skip_gated: bool = True,
work_gated: bool = False,
) -> bool | None:
"""a lightweight task used to kick off individual beat tasks per tenant."""
time_start = time.monotonic()
@@ -101,56 +51,8 @@ def cloud_beat_task_generator(
tenant_ids: list[str] = []
num_processed_tenants = 0
num_skipped_gated = 0
num_would_skip_work_gate = 0
num_skipped_work_gate = 0
# Tenant-work-gating read path. Resolve once per invocation.
gate_enabled = False
gate_enforce = False
full_fanout_cycle = False
active_tenants: set[str] | None = None
try:
# Gating setup is inside the try block so any exception still
# reaches the finally that releases the beat lock.
if work_gated:
try:
gate_enabled = OnyxRuntime.get_tenant_work_gating_enabled()
gate_enforce = OnyxRuntime.get_tenant_work_gating_enforce()
except Exception:
task_logger.exception("tenant work gating: runtime flag read failed")
gate_enabled = False
if gate_enabled:
redis_failed = False
interval_s = (
OnyxRuntime.get_tenant_work_gating_full_fanout_interval_seconds()
)
full_fanout_cycle = _should_bypass_gate_for_full_fanout(
redis_client, task_name, interval_s
)
if full_fanout_cycle:
record_full_fanout_cycle(task_name)
try:
ttl_s = OnyxRuntime.get_tenant_work_gating_ttl_seconds()
cleanup_expired(ttl_s)
except Exception:
task_logger.exception(
"tenant work gating: cleanup_expired failed"
)
else:
ttl_s = OnyxRuntime.get_tenant_work_gating_ttl_seconds()
active_tenants = get_active_tenants(ttl_s)
if active_tenants is None:
full_fanout_cycle = True
record_full_fanout_cycle(task_name)
redis_failed = True
# Only refresh the gauge when Redis is known-reachable —
# skip the ZCARD if we just failed open due to a Redis error.
if not redis_failed:
observe_active_set_size()
tenant_ids = get_all_tenant_ids()
# Per-task control over whether gated tenants are included. Most periodic tasks
@@ -174,21 +76,6 @@ def cloud_beat_task_generator(
if IGNORED_SYNCING_TENANT_LIST and tenant_id in IGNORED_SYNCING_TENANT_LIST:
continue
# Tenant work gate: if the feature is on, check membership. Skip
# unmarked tenants when enforce=True AND we're not in a full-
# fanout cycle. Always log/emit the shadow counter.
if work_gated and gate_enabled and not full_fanout_cycle:
would_skip = (
active_tenants is not None and tenant_id not in active_tenants
)
if would_skip:
num_would_skip_work_gate += 1
if gate_enforce:
num_skipped_work_gate += 1
record_gate_decision(task_name, skipped=True)
continue
record_gate_decision(task_name, skipped=False)
self.app.send_task(
task_name,
kwargs=dict(
@@ -222,12 +109,6 @@ def cloud_beat_task_generator(
f"task={task_name} "
f"num_processed_tenants={num_processed_tenants} "
f"num_skipped_gated={num_skipped_gated} "
f"num_would_skip_work_gate={num_would_skip_work_gate} "
f"num_skipped_work_gate={num_skipped_work_gate} "
f"full_fanout_cycle={full_fanout_cycle} "
f"work_gated={work_gated} "
f"gate_enabled={gate_enabled} "
f"gate_enforce={gate_enforce} "
f"num_tenants={len(tenant_ids)} "
f"elapsed={time_elapsed:.2f}"
)

View File

@@ -212,7 +212,7 @@ def check_for_doc_permissions_sync(self: Task, *, tenant_id: str) -> bool | None
# Tenant-work-gating hook: refresh this tenant's active-set membership
# whenever doc-permission sync has any due cc_pairs to dispatch.
if cc_pair_ids_to_sync:
maybe_mark_tenant_active(tenant_id, caller="doc_permission_sync")
maybe_mark_tenant_active(tenant_id)
lock_beat.reacquire()
for cc_pair_id in cc_pair_ids_to_sync:

View File

@@ -206,7 +206,7 @@ def check_for_external_group_sync(self: Task, *, tenant_id: str) -> bool | None:
# Tenant-work-gating hook: refresh this tenant's active-set membership
# whenever external-group sync has any due cc_pairs to dispatch.
if cc_pair_ids_to_sync:
maybe_mark_tenant_active(tenant_id, caller="external_group_sync")
maybe_mark_tenant_active(tenant_id)
lock_beat.reacquire()
for cc_pair_id in cc_pair_ids_to_sync:

View File

@@ -30,7 +30,6 @@ from onyx.background.celery.tasks.vespa.document_sync import DOCUMENT_SYNC_PREFI
from onyx.background.celery.tasks.vespa.document_sync import DOCUMENT_SYNC_TASKSET_KEY
from onyx.configs.app_configs import DISABLE_VECTOR_DB
from onyx.configs.app_configs import ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
from onyx.configs.app_configs import ONYX_DISABLE_VESPA
from onyx.configs.constants import ONYX_CLOUD_CELERY_TASK_PREFIX
from onyx.configs.constants import OnyxRedisLocks
from onyx.db.engine.sql_engine import get_sqlalchemy_engine
@@ -532,26 +531,23 @@ def reset_tenant_id(
CURRENT_TENANT_ID_CONTEXTVAR.set(POSTGRES_DEFAULT_SCHEMA)
def wait_for_document_index_or_shutdown() -> None:
"""
Waits for all configured document indices to become ready subject to a
timeout.
def wait_for_vespa_or_shutdown(
sender: Any, # noqa: ARG001
**kwargs: Any, # noqa: ARG001
) -> None: # noqa: ARG001
"""Waits for Vespa to become ready subject to a timeout.
Raises WorkerShutdown if the timeout is reached."""
Raises WorkerShutdown if the timeout is reached.
"""
if DISABLE_VECTOR_DB:
logger.info(
"DISABLE_VECTOR_DB is set — skipping Vespa/OpenSearch readiness check."
)
return
if not ONYX_DISABLE_VESPA:
if not wait_for_vespa_with_timeout():
msg = (
"[Vespa] Readiness probe did not succeed within the timeout. Exiting..."
)
logger.error(msg)
raise WorkerShutdown(msg)
if not wait_for_vespa_with_timeout():
msg = "[Vespa] Readiness probe did not succeed within the timeout. Exiting..."
logger.error(msg)
raise WorkerShutdown(msg)
if ENABLE_OPENSEARCH_INDEXING_FOR_ONYX:
if not wait_for_opensearch_with_timeout():

View File

@@ -105,7 +105,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
app_base.wait_for_redis(sender, **kwargs)
app_base.wait_for_db(sender, **kwargs)
app_base.wait_for_document_index_or_shutdown()
app_base.wait_for_vespa_or_shutdown(sender, **kwargs)
# Less startup checks in multi-tenant case
if MULTI_TENANT:

View File

@@ -111,7 +111,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
app_base.wait_for_redis(sender, **kwargs)
app_base.wait_for_db(sender, **kwargs)
app_base.wait_for_document_index_or_shutdown()
app_base.wait_for_vespa_or_shutdown(sender, **kwargs)
# Less startup checks in multi-tenant case
if MULTI_TENANT:

View File

@@ -97,7 +97,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
app_base.wait_for_redis(sender, **kwargs)
app_base.wait_for_db(sender, **kwargs)
app_base.wait_for_document_index_or_shutdown()
app_base.wait_for_vespa_or_shutdown(sender, **kwargs)
# Less startup checks in multi-tenant case
if MULTI_TENANT:

View File

@@ -118,7 +118,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
app_base.wait_for_redis(sender, **kwargs)
app_base.wait_for_db(sender, **kwargs)
app_base.wait_for_document_index_or_shutdown()
app_base.wait_for_vespa_or_shutdown(sender, **kwargs)
# Less startup checks in multi-tenant case
if MULTI_TENANT:

View File

@@ -124,7 +124,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
app_base.wait_for_redis(sender, **kwargs)
app_base.wait_for_db(sender, **kwargs)
app_base.wait_for_document_index_or_shutdown()
app_base.wait_for_vespa_or_shutdown(sender, **kwargs)
logger.info(f"Running as the primary celery worker: pid={os.getpid()}")

View File

@@ -71,7 +71,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
app_base.wait_for_redis(sender, **kwargs)
app_base.wait_for_db(sender, **kwargs)
app_base.wait_for_document_index_or_shutdown()
app_base.wait_for_vespa_or_shutdown(sender, **kwargs)
# Less startup checks in multi-tenant case
if MULTI_TENANT:

View File

@@ -10,7 +10,6 @@ from onyx.configs.app_configs import DISABLE_OPENSEARCH_MIGRATION_TASK
from onyx.configs.app_configs import DISABLE_VECTOR_DB
from onyx.configs.app_configs import ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
from onyx.configs.app_configs import ENTERPRISE_EDITION_ENABLED
from onyx.configs.app_configs import ONYX_DISABLE_VESPA
from onyx.configs.app_configs import SCHEDULED_EVAL_DATASET_NAMES
from onyx.configs.constants import ONYX_CLOUD_CELERY_TASK_PREFIX
from onyx.configs.constants import OnyxCeleryPriority
@@ -68,7 +67,6 @@ beat_task_templates: list[dict] = [
"options": {
"priority": OnyxCeleryPriority.MEDIUM,
"expires": BEAT_EXPIRES_DEFAULT,
"work_gated": True,
},
},
{
@@ -102,7 +100,6 @@ beat_task_templates: list[dict] = [
"expires": BEAT_EXPIRES_DEFAULT,
# Gated tenants may still have connectors awaiting deletion.
"skip_gated": False,
"work_gated": True,
},
},
{
@@ -112,7 +109,6 @@ beat_task_templates: list[dict] = [
"options": {
"priority": OnyxCeleryPriority.MEDIUM,
"expires": BEAT_EXPIRES_DEFAULT,
"work_gated": True,
},
},
{
@@ -122,7 +118,6 @@ beat_task_templates: list[dict] = [
"options": {
"priority": OnyxCeleryPriority.MEDIUM,
"expires": BEAT_EXPIRES_DEFAULT,
"work_gated": True,
},
},
{
@@ -160,7 +155,6 @@ beat_task_templates: list[dict] = [
"priority": OnyxCeleryPriority.LOW,
"expires": BEAT_EXPIRES_DEFAULT,
"queue": OnyxCeleryQueues.SANDBOX,
"work_gated": True,
},
},
{
@@ -185,7 +179,6 @@ if ENTERPRISE_EDITION_ENABLED:
"options": {
"priority": OnyxCeleryPriority.MEDIUM,
"expires": BEAT_EXPIRES_DEFAULT,
"work_gated": True,
},
},
{
@@ -195,7 +188,6 @@ if ENTERPRISE_EDITION_ENABLED:
"options": {
"priority": OnyxCeleryPriority.MEDIUM,
"expires": BEAT_EXPIRES_DEFAULT,
"work_gated": True,
},
},
]
@@ -235,11 +227,7 @@ if SCHEDULED_EVAL_DATASET_NAMES:
)
# Add OpenSearch migration task if enabled.
if (
ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
and not DISABLE_OPENSEARCH_MIGRATION_TASK
and not ONYX_DISABLE_VESPA
):
if ENABLE_OPENSEARCH_INDEXING_FOR_ONYX and not DISABLE_OPENSEARCH_MIGRATION_TASK:
beat_task_templates.append(
{
"name": "migrate-chunks-from-vespa-to-opensearch",
@@ -292,7 +280,7 @@ def make_cloud_generator_task(task: dict[str, Any]) -> dict[str, Any]:
cloud_task["kwargs"] = {}
cloud_task["kwargs"]["task_name"] = task["task"]
optional_fields = ["queue", "priority", "expires", "skip_gated", "work_gated"]
optional_fields = ["queue", "priority", "expires", "skip_gated"]
for field in optional_fields:
if field in task["options"]:
cloud_task["kwargs"][field] = task["options"][field]
@@ -385,14 +373,12 @@ if not MULTI_TENANT:
]
)
# `skip_gated` and `work_gated` are cloud-only hints consumed by
# `cloud_beat_task_generator`. Strip them before extending the self-hosted
# schedule so they don't leak into apply_async as unrecognised options on
# every fired task message.
# `skip_gated` is a cloud-only hint consumed by `cloud_beat_task_generator`. Strip
# it before extending the self-hosted schedule so it doesn't leak into apply_async
# as an unrecognised option on every fired task message.
for _template in beat_task_templates:
_self_hosted_template = copy.deepcopy(_template)
_self_hosted_template["options"].pop("skip_gated", None)
_self_hosted_template["options"].pop("work_gated", None)
tasks_to_schedule.append(_self_hosted_template)

View File

@@ -181,7 +181,7 @@ def check_for_connector_deletion_task(self: Task, *, tenant_id: str) -> bool | N
# nearly every tenant in the active set since most have cc_pairs
# but almost none are actively being deleted on any given cycle.
if has_deleting_cc_pair:
maybe_mark_tenant_active(tenant_id, caller="connector_deletion")
maybe_mark_tenant_active(tenant_id)
# try running cleanup on the cc_pair_ids
for cc_pair_id in cc_pair_ids:

View File

@@ -1020,7 +1020,7 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
# `tasks_created > 0` here gives us a "real work was done" signal
# rather than just "tenant has a cc_pair somewhere."
if tasks_created > 0:
maybe_mark_tenant_active(tenant_id, caller="check_for_indexing")
maybe_mark_tenant_active(tenant_id)
# 2/3: VALIDATE
# Check for inconsistent index attempts - active attempts without task IDs

View File

@@ -263,7 +263,7 @@ def check_for_pruning(self: Task, *, tenant_id: str) -> bool | None:
# since most tenants have cc_pairs but almost none are due on
# any given cycle.
if prune_dispatched:
maybe_mark_tenant_active(tenant_id, caller="check_for_pruning")
maybe_mark_tenant_active(tenant_id)
r.set(OnyxRedisSignals.BLOCK_PRUNING, 1, ex=_get_pruning_block_expiration())
# we want to run this less frequently than the overall task

View File

@@ -15,7 +15,7 @@ from onyx.background.celery.tasks.shared.RetryDocumentIndex import RetryDocument
from onyx.configs.constants import ONYX_CELERY_BEAT_HEARTBEAT_KEY
from onyx.configs.constants import OnyxCeleryTask
from onyx.db.document import delete_document_by_connector_credential_pair__no_commit
from onyx.db.document import delete_documents_complete__no_commit
from onyx.db.document import delete_documents_complete
from onyx.db.document import fetch_chunk_count_for_document
from onyx.db.document import get_document
from onyx.db.document import get_document_connector_count
@@ -129,11 +129,10 @@ def document_by_cc_pair_cleanup_task(
document_id=document_id,
)
delete_documents_complete__no_commit(
delete_documents_complete(
db_session=db_session,
document_ids=[document_id],
)
db_session.commit()
completion_status = OnyxCeleryTaskCompletionStatus.SUCCEEDED
elif count > 1:

View File

@@ -153,7 +153,7 @@ def try_generate_stale_document_sync_tasks(
# Tenant-work-gating hook: refresh this tenant's active-set membership
# whenever vespa sync actually has stale docs to dispatch.
maybe_mark_tenant_active(tenant_id, caller="vespa_sync")
maybe_mark_tenant_active(tenant_id)
logger.info(
f"Stale documents found (at least {stale_doc_count}). Generating sync tasks in one batch."

View File

@@ -58,6 +58,8 @@ from onyx.db.indexing_coordination import IndexingCoordination
from onyx.db.models import IndexAttempt
from onyx.file_store.document_batch_storage import DocumentBatchStorage
from onyx.file_store.document_batch_storage import get_document_batch_storage
from onyx.file_store.staging import build_raw_file_callback
from onyx.file_store.staging import RawFileCallback
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
from onyx.redis.redis_hierarchy import cache_hierarchy_nodes_batch
@@ -90,6 +92,7 @@ def _get_connector_runner(
end_time: datetime,
include_permissions: bool,
leave_connector_active: bool = LEAVE_CONNECTOR_ACTIVE_ON_INITIALIZATION_FAILURE,
raw_file_callback: RawFileCallback | None = None,
) -> ConnectorRunner:
"""
NOTE: `start_time` and `end_time` are only used for poll connectors
@@ -108,6 +111,7 @@ def _get_connector_runner(
input_type=task,
connector_specific_config=attempt.connector_credential_pair.connector.connector_specific_config,
credential=attempt.connector_credential_pair.credential,
raw_file_callback=raw_file_callback,
)
# validate the connector settings
@@ -275,6 +279,12 @@ def run_docfetching_entrypoint(
f"credentials='{credential_id}'"
)
raw_file_callback = build_raw_file_callback(
index_attempt_id=index_attempt_id,
cc_pair_id=connector_credential_pair_id,
tenant_id=tenant_id,
)
connector_document_extraction(
app,
index_attempt_id,
@@ -282,6 +292,7 @@ def run_docfetching_entrypoint(
attempt.search_settings_id,
tenant_id,
callback,
raw_file_callback=raw_file_callback,
)
logger.info(
@@ -301,6 +312,7 @@ def connector_document_extraction(
search_settings_id: int,
tenant_id: str,
callback: IndexingHeartbeatInterface | None = None,
raw_file_callback: RawFileCallback | None = None,
) -> None:
"""Extract documents from connector and queue them for indexing pipeline processing.
@@ -451,6 +463,7 @@ def connector_document_extraction(
start_time=window_start,
end_time=window_end,
include_permissions=should_fetch_permissions_during_indexing,
raw_file_callback=raw_file_callback,
)
# don't use a checkpoint if we're explicitly indexing from

View File

@@ -282,7 +282,6 @@ OPENSEARCH_ADMIN_USERNAME = os.environ.get("OPENSEARCH_ADMIN_USERNAME", "admin")
OPENSEARCH_ADMIN_PASSWORD = os.environ.get(
"OPENSEARCH_ADMIN_PASSWORD", "StrongPassword123!"
)
OPENSEARCH_USE_SSL = os.environ.get("OPENSEARCH_USE_SSL", "true").lower() == "true"
USING_AWS_MANAGED_OPENSEARCH = (
os.environ.get("USING_AWS_MANAGED_OPENSEARCH", "").lower() == "true"
)
@@ -328,7 +327,6 @@ ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX = (
DISABLE_OPENSEARCH_MIGRATION_TASK = (
os.environ.get("DISABLE_OPENSEARCH_MIGRATION_TASK", "").lower() == "true"
)
ONYX_DISABLE_VESPA = os.environ.get("ONYX_DISABLE_VESPA", "").lower() == "true"
# Whether we should check for and create an index if necessary every time we
# instantiate an OpenSearchDocumentIndex on multitenant cloud. Defaults to True.
VERIFY_CREATE_OPENSEARCH_INDEX_ON_INIT_MT = (
@@ -845,29 +843,6 @@ MAX_FILE_SIZE_BYTES = int(
os.environ.get("MAX_FILE_SIZE_BYTES") or 2 * 1024 * 1024 * 1024
) # 2GB in bytes
# Maximum embedded images allowed in a single file. PDFs (and other formats)
# with thousands of embedded images can OOM the user-file-processing worker
# because every image is decoded with PIL and then sent to the vision LLM.
# Enforced both at upload time (rejects the file) and during extraction
# (defense-in-depth: caps the number of images materialized).
#
# Clamped to >= 0; a negative env value would turn upload validation into
# always-fail and extraction into always-stop, which is never desired. 0
# disables image extraction entirely, which is a valid (if aggressive) setting.
MAX_EMBEDDED_IMAGES_PER_FILE = max(
0, int(os.environ.get("MAX_EMBEDDED_IMAGES_PER_FILE") or 500)
)
# Maximum embedded images allowed across all files in a single upload batch.
# Protects against the scenario where a user uploads many files that each
# fall under MAX_EMBEDDED_IMAGES_PER_FILE but aggregate to enough work
# (serial-ish celery fan-out plus per-image vision-LLM calls) to OOM the
# worker under concurrency or run up surprise latency/cost. Also clamped
# to >= 0.
MAX_EMBEDDED_IMAGES_PER_UPLOAD = max(
0, int(os.environ.get("MAX_EMBEDDED_IMAGES_PER_UPLOAD") or 1000)
)
# Use document summary for contextual rag
USE_DOCUMENT_SUMMARY = os.environ.get("USE_DOCUMENT_SUMMARY", "true").lower() == "true"
# Use chunk summary for contextual rag

View File

@@ -372,6 +372,7 @@ class FileOrigin(str, Enum):
CONNECTOR_METADATA = "connector_metadata"
GENERATED_REPORT = "generated_report"
INDEXING_CHECKPOINT = "indexing_checkpoint"
INDEXING_STAGING = "indexing_staging"
PLAINTEXT_CACHE = "plaintext_cache"
OTHER = "other"
QUERY_HISTORY_CSV = "query_history_csv"

View File

@@ -3,7 +3,6 @@ from collections.abc import Callable
from collections.abc import Iterator
from datetime import datetime
from datetime import timezone
from email.utils import parsedate_to_datetime
from typing import Any
from typing import TypeVar
from urllib.parse import urljoin
@@ -11,6 +10,7 @@ from urllib.parse import urlparse
import requests
from dateutil.parser import parse
from dateutil.parser import ParserError
from onyx.configs.app_configs import CONNECTOR_LOCALHOST_OVERRIDE
from onyx.configs.constants import DocumentSource
@@ -56,16 +56,18 @@ def time_str_to_utc(datetime_str: str) -> datetime:
if fixed not in candidates:
candidates.append(fixed)
# dateutil is the primary; the stdlib RFC 2822 parser is a fallback for
# inputs dateutil rejects (e.g. headers concatenated without a CRLF —
# TZ may be dropped, datetime_to_utc then assumes UTC).
for parser in (parse, parsedate_to_datetime):
for candidate in candidates:
try:
return datetime_to_utc(parser(candidate))
except (TypeError, ValueError, OverflowError):
continue
last_exception: Exception | None = None
for candidate in candidates:
try:
dt = parse(candidate)
return datetime_to_utc(dt)
except (ValueError, ParserError) as exc:
last_exception = exc
if last_exception is not None:
raise last_exception
# Fallback in case parsing failed without raising (should not happen)
raise ValueError(f"Unable to parse datetime string: {datetime_str}")

View File

@@ -2,15 +2,23 @@ import csv
import io
from typing import IO
from pydantic import BaseModel
from onyx.connectors.models import TabularSection
from onyx.file_processing.extract_file_text import file_io_to_text
from onyx.file_processing.extract_file_text import xlsx_sheet_extraction
from onyx.file_processing.file_types import OnyxFileExtensions
from onyx.file_store.staging import RawFileCallback
from onyx.utils.logger import setup_logger
logger = setup_logger()
# Return value of extract_and_stage_tabular_file: the extracted sections
# bundled with the identifier returned by the raw-file callback.
class TabularExtractionResult(BaseModel):
# Sections produced by tabular_file_to_sections for the file.
sections: list[TabularSection]
# Value returned by the raw_file_callback that was invoked on the file.
staged_file_id: str
def is_tabular_file(file_name: str) -> bool:
    """Report whether ``file_name`` ends with a tabular extension.

    The comparison is done on the lower-cased name against every entry in
    ``OnyxFileExtensions.TABULAR_EXTENSIONS``.
    """
    name_lower = file_name.lower()
    for extension in OnyxFileExtensions.TABULAR_EXTENSIONS:
        if name_lower.endswith(extension):
            return True
    return False
@@ -41,6 +49,9 @@ def tabular_file_to_sections(
"""
lowered = file_name.lower()
if not lowered.endswith(tuple(OnyxFileExtensions.TABULAR_EXTENSIONS)):
raise ValueError(f"{file_name!r} is not a tabular file")
if lowered.endswith(tuple(OnyxFileExtensions.SPREADSHEET_EXTENSIONS)):
return [
TabularSection(
@@ -53,9 +64,6 @@ def tabular_file_to_sections(
)
]
if not lowered.endswith((".csv", ".tsv")):
raise ValueError(f"{file_name!r} is not a tabular file")
try:
text = file_io_to_text(file).strip()
except Exception:
@@ -67,3 +75,26 @@ def tabular_file_to_sections(
if lowered.endswith(".tsv"):
text = _tsv_to_csv(text)
return [TabularSection(link=link or file_name, text=text)]
def extract_and_stage_tabular_file(
    file: IO[bytes],
    file_name: str,
    content_type: str,
    raw_file_callback: RawFileCallback,
    link: str = "",
) -> TabularExtractionResult:
    """Parse a tabular file into sections, then pass its raw bytes to the callback.

    Args:
        file: Seekable binary stream holding the file contents.
        file_name: Name used by ``tabular_file_to_sections`` to pick the parser.
        content_type: Forwarded unchanged to ``raw_file_callback``.
        raw_file_callback: Called with ``(file, content_type)`` after
            extraction; its return value becomes ``staged_file_id``.
        link: Optional link forwarded to ``tabular_file_to_sections``.

    Returns:
        A ``TabularExtractionResult`` with the extracted sections and the
        identifier returned by the callback.

    Raises:
        ValueError: Propagated from ``tabular_file_to_sections`` when
            ``file_name`` is not a tabular file.
    """
    extracted_sections = tabular_file_to_sections(
        file=file, file_name=file_name, link=link
    )

    # Extraction has consumed the stream; reset it to the beginning so the
    # callback reads the file from the start.
    file.seek(0)

    return TabularExtractionResult(
        sections=extracted_sections,
        staged_file_id=raw_file_callback(file, content_type),
    )

View File

@@ -22,6 +22,7 @@ from onyx.db.credentials import backend_update_credential_json
from onyx.db.credentials import fetch_credential_by_id
from onyx.db.enums import AccessType
from onyx.db.models import Credential
from onyx.file_store.staging import RawFileCallback
from shared_configs.contextvars import get_current_tenant_id
@@ -107,6 +108,7 @@ def instantiate_connector(
input_type: InputType,
connector_specific_config: dict[str, Any],
credential: Credential,
raw_file_callback: RawFileCallback | None = None,
) -> BaseConnector:
connector_class = identify_connector_class(source, input_type)
@@ -130,6 +132,9 @@ def instantiate_connector(
connector.set_allow_images(get_image_extraction_and_analysis_enabled())
if raw_file_callback is not None:
connector.set_raw_file_callback(raw_file_callback)
return connector

View File

@@ -253,17 +253,7 @@ def thread_to_document(
updated_at_datetime = None
if updated_at:
try:
updated_at_datetime = time_str_to_utc(updated_at)
except (ValueError, OverflowError) as e:
# Old mailboxes contain RFC-violating Date headers. Drop the
# timestamp instead of aborting the indexing run.
logger.warning(
"Skipping unparseable Gmail Date header on thread %s: %r (%s)",
full_thread.get("id"),
updated_at,
e,
)
updated_at_datetime = time_str_to_utc(updated_at)
id = full_thread.get("id")
if not id:

View File

@@ -502,9 +502,6 @@ class GoogleDriveConnector(
files: list[RetrievedDriveFile],
seen_hierarchy_node_raw_ids: ThreadSafeSet[str],
fully_walked_hierarchy_node_raw_ids: ThreadSafeSet[str],
failed_folder_ids_by_email: (
ThreadSafeDict[str, ThreadSafeSet[str]] | None
) = None,
permission_sync_context: PermissionSyncContext | None = None,
add_prefix: bool = False,
) -> list[HierarchyNode]:
@@ -528,9 +525,6 @@ class GoogleDriveConnector(
seen_hierarchy_node_raw_ids: Set of already-yielded node IDs (modified in place)
fully_walked_hierarchy_node_raw_ids: Set of node IDs where the walk to root
succeeded (modified in place)
failed_folder_ids_by_email: Map of email → folder IDs where that email
previously confirmed no accessible parent. Skips the API call if the same
(folder, email) is encountered again (modified in place).
permission_sync_context: If provided, permissions will be fetched for hierarchy nodes.
Contains google_domain and primary_admin_email needed for permission syncing.
add_prefix: When True, prefix group IDs with source type (for indexing path).
@@ -575,7 +569,7 @@ class GoogleDriveConnector(
# Fetch folder metadata
folder = self._get_folder_metadata(
current_id, file.user_email, field_type, failed_folder_ids_by_email
current_id, file.user_email, field_type
)
if not folder:
# Can't access this folder - stop climbing
@@ -659,13 +653,7 @@ class GoogleDriveConnector(
return new_nodes
def _get_folder_metadata(
self,
folder_id: str,
retriever_email: str,
field_type: DriveFileFieldType,
failed_folder_ids_by_email: (
ThreadSafeDict[str, ThreadSafeSet[str]] | None
) = None,
self, folder_id: str, retriever_email: str, field_type: DriveFileFieldType
) -> GoogleDriveFileType | None:
"""
Fetch metadata for a folder by ID.
@@ -679,17 +667,6 @@ class GoogleDriveConnector(
# Use a set to deduplicate if retriever_email == primary_admin_email
for email in {retriever_email, self.primary_admin_email}:
failed_ids = (
failed_folder_ids_by_email.get(email)
if failed_folder_ids_by_email
else None
)
if failed_ids and folder_id in failed_ids:
logger.debug(
f"Skipping folder {folder_id} using {email} (previously confirmed no parents)"
)
continue
service = get_drive_service(self.creds, email)
folder = get_folder_metadata(service, folder_id, field_type)
@@ -705,10 +682,6 @@ class GoogleDriveConnector(
# Folder has no parents - could be a root OR user lacks access to parent
# Keep this as a fallback but try admin to see if they can see parents
if failed_folder_ids_by_email is not None:
failed_folder_ids_by_email.setdefault(email, ThreadSafeSet()).add(
folder_id
)
if best_folder is None:
best_folder = folder
logger.debug(
@@ -1117,13 +1090,6 @@ class GoogleDriveConnector(
]
yield from parallel_yield(user_retrieval_gens, max_workers=MAX_DRIVE_WORKERS)
# Free per-user cache entries now that this batch is done.
# Skip the admin email — it is shared across all user batches and must
# persist for the duration of the run.
for email in non_completed_org_emails:
if email != self.primary_admin_email:
checkpoint.failed_folder_ids_by_email.pop(email, None)
# if there are more emails to process, don't mark as complete
if not email_batch_takes_us_to_completion:
return
@@ -1580,7 +1546,6 @@ class GoogleDriveConnector(
files=files_batch,
seen_hierarchy_node_raw_ids=checkpoint.seen_hierarchy_node_raw_ids,
fully_walked_hierarchy_node_raw_ids=checkpoint.fully_walked_hierarchy_node_raw_ids,
failed_folder_ids_by_email=checkpoint.failed_folder_ids_by_email,
permission_sync_context=permission_sync_context,
add_prefix=True,
)
@@ -1625,6 +1590,7 @@ class GoogleDriveConnector(
[retrieved_file.user_email, self.primary_admin_email]
+ get_file_owners(retrieved_file.drive_file, self.primary_admin_email),
retrieved_file.drive_file,
self.raw_file_callback,
)
except Exception as e:
logger.exception(
@@ -1817,7 +1783,6 @@ class GoogleDriveConnector(
files=files_batch,
seen_hierarchy_node_raw_ids=checkpoint.seen_hierarchy_node_raw_ids,
fully_walked_hierarchy_node_raw_ids=checkpoint.fully_walked_hierarchy_node_raw_ids,
failed_folder_ids_by_email=checkpoint.failed_folder_ids_by_email,
permission_sync_context=permission_sync_context,
)

View File

@@ -13,6 +13,9 @@ from pydantic import BaseModel
from onyx.access.models import ExternalAccess
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import FileOrigin
from onyx.connectors.cross_connector_utils.tabular_section_utils import (
extract_and_stage_tabular_file,
)
from onyx.connectors.cross_connector_utils.tabular_section_utils import is_tabular_file
from onyx.connectors.cross_connector_utils.tabular_section_utils import (
tabular_file_to_sections,
@@ -43,6 +46,7 @@ from onyx.file_processing.file_types import OnyxFileExtensions
from onyx.file_processing.file_types import OnyxMimeTypes
from onyx.file_processing.file_types import SPREADSHEET_MIME_TYPE
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.file_store.staging import RawFileCallback
from onyx.utils.logger import setup_logger
from onyx.utils.variable_functionality import (
fetch_versioned_implementation_with_fallback,
@@ -51,6 +55,12 @@ from onyx.utils.variable_functionality import noop_fallback
logger = setup_logger()
class BasicExtractionResult(BaseModel):
    """Outcome of basic section extraction for a single Google Drive file."""

    # Sections pulled out of the file; an empty list means nothing was extracted.
    sections: list[TextSection | ImageSection | TabularSection]
    # ID of the staged raw file when one was staged during extraction, else None.
    staged_file_id: str | None = None
# Cache for folder path lookups to avoid redundant API calls
# Maps folder_id -> (folder_name, parent_id)
_folder_cache: dict[str, tuple[str, str | None]] = {}
@@ -300,7 +310,8 @@ def _download_and_extract_sections_basic(
service: GoogleDriveService,
allow_images: bool,
size_threshold: int,
) -> list[TextSection | ImageSection | TabularSection]:
raw_file_callback: RawFileCallback | None = None,
) -> BasicExtractionResult:
"""Extract text and images from a Google Drive file."""
file_id = file["id"]
file_name = file["name"]
@@ -313,10 +324,35 @@ def _download_and_extract_sections_basic(
def response_call() -> bytes:
return download_request(service, file_id, size_threshold)
def _extract_tabular(
    raw_bytes: bytes, name: str, content_type: str
) -> BasicExtractionResult:
    """Convert raw tabular bytes into sections, staging the raw file if enabled.

    Args:
        raw_bytes: Downloaded (or exported) contents of the file.
        name: File name handed to the tabular helpers.
        content_type: Mime type of `raw_bytes`.

    Returns:
        The extracted sections. `staged_file_id` is populated only when the
        enclosing call was given a `raw_file_callback`.
    """
    # Google Drive doesn't enforce file extensions, so the name may not end in
    # .xlsx even when the mime type says it's one. The tabular helpers only
    # receive the name, so synthesize the extension here for every caller.
    if (
        content_type
        == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        and not is_tabular_file(name)
    ):
        name = f"{name}.xlsx"

    # No callback: extract sections only, nothing is persisted.
    if raw_file_callback is None:
        return BasicExtractionResult(
            sections=list(
                tabular_file_to_sections(
                    io.BytesIO(raw_bytes),
                    file_name=name,
                    link=link,
                )
            )
        )

    result = extract_and_stage_tabular_file(
        file=io.BytesIO(raw_bytes),
        file_name=name,
        content_type=content_type,
        raw_file_callback=raw_file_callback,
        link=link,
    )
    return BasicExtractionResult(
        sections=list(result.sections),
        staged_file_id=result.staged_file_id,
    )
if mime_type in OnyxMimeTypes.IMAGE_MIME_TYPES:
# Skip images if not explicitly enabled
if not allow_images:
return []
return BasicExtractionResult(sections=[])
# Store images for later processing
sections: list[TextSection | ImageSection | TabularSection] = []
@@ -332,7 +368,7 @@ def _download_and_extract_sections_basic(
sections.append(section)
except Exception as e:
logger.error(f"Failed to process image {file_name}: {e}")
return sections
return BasicExtractionResult(sections=sections)
# For Google Docs, Sheets, and Slides, export via the Drive API
if mime_type in GOOGLE_MIME_TYPES_TO_EXPORT:
@@ -343,58 +379,44 @@ def _download_and_extract_sections_basic(
response = _download_request(request, file_id, size_threshold)
if not response:
logger.warning(f"Failed to export {file_name} as {export_mime_type}")
return []
return BasicExtractionResult(sections=[])
if export_mime_type in OnyxMimeTypes.TABULAR_MIME_TYPES:
# Synthesize an extension on the filename
ext = ".xlsx" if export_mime_type == SPREADSHEET_MIME_TYPE else ".csv"
return list(
tabular_file_to_sections(
io.BytesIO(response),
file_name=f"{file_name}{ext}",
link=link,
)
return _extract_tabular(
raw_bytes=response,
name=f"{file_name}{ext}",
content_type=export_mime_type,
)
text = response.decode("utf-8")
return [TextSection(link=link, text=text)]
return BasicExtractionResult(sections=[TextSection(link=link, text=text)])
# Process based on mime type
if mime_type == "text/plain":
try:
text = response_call().decode("utf-8")
return [TextSection(link=link, text=text)]
return BasicExtractionResult(sections=[TextSection(link=link, text=text)])
except UnicodeDecodeError as e:
logger.warning(f"Failed to extract text from {file_name}: {e}")
return []
return BasicExtractionResult(sections=[])
elif (
mime_type
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
text, _ = read_docx_file(io.BytesIO(response_call()))
return [TextSection(link=link, text=text)]
return BasicExtractionResult(sections=[TextSection(link=link, text=text)])
elif (
mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
or is_tabular_file(file_name)
):
# Google Drive doesn't enforce file extensions, so the filename may not
# end in .xlsx even when the mime type says it's one. Synthesize the
# extension so tabular_file_to_sections dispatches correctly.
tabular_file_name = file_name
if (
mime_type
== "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
and not is_tabular_file(file_name)
):
tabular_file_name = f"{file_name}.xlsx"
return list(
tabular_file_to_sections(
io.BytesIO(response_call()),
file_name=tabular_file_name,
link=link,
)
return _extract_tabular(
raw_bytes=response_call(),
name=file_name,
content_type=mime_type,
)
elif (
@@ -402,7 +424,9 @@ def _download_and_extract_sections_basic(
== "application/vnd.openxmlformats-officedocument.presentationml.presentation"
):
text = pptx_to_text(io.BytesIO(response_call()), file_name=file_name)
return [TextSection(link=link, text=text)] if text else []
return BasicExtractionResult(
sections=[TextSection(link=link, text=text)] if text else []
)
elif mime_type == "application/pdf":
text, _pdf_meta, images = read_pdf_file(io.BytesIO(response_call()))
@@ -422,20 +446,20 @@ def _download_and_extract_sections_basic(
pdf_sections.append(section)
except Exception as e:
logger.error(f"Failed to process PDF images in {file_name}: {e}")
return pdf_sections
return BasicExtractionResult(sections=pdf_sections)
# Final attempt at extracting text
file_ext = get_file_ext(file.get("name", ""))
if file_ext not in OnyxFileExtensions.ALL_ALLOWED_EXTENSIONS:
logger.warning(f"Skipping file {file.get('name')} due to extension.")
return []
return BasicExtractionResult(sections=[])
try:
text = extract_file_text(io.BytesIO(response_call()), file_name)
return [TextSection(link=link, text=text)]
return BasicExtractionResult(sections=[TextSection(link=link, text=text)])
except Exception as e:
logger.warning(f"Failed to extract text from {file_name}: {e}")
return []
return BasicExtractionResult(sections=[])
def _find_nth(haystack: str, needle: str, n: int, start: int = 0) -> int:
@@ -564,6 +588,7 @@ def convert_drive_item_to_document(
permission_sync_context: PermissionSyncContext | None,
retriever_emails: list[str],
file: GoogleDriveFileType,
raw_file_callback: RawFileCallback | None = None,
) -> Document | ConnectorFailure | None:
"""
Attempt to convert a drive item to a document with each retriever email
@@ -590,6 +615,7 @@ def convert_drive_item_to_document(
retriever_email,
file,
permission_sync_context,
raw_file_callback,
)
# There are a variety of permissions-based errors that occasionally occur
@@ -635,11 +661,13 @@ def _convert_drive_item_to_document(
# if not specified, we will not sync permissions
# will also be a no-op if EE is not enabled
permission_sync_context: PermissionSyncContext | None,
raw_file_callback: RawFileCallback | None = None,
) -> Document | ConnectorFailure | None:
"""
Main entry point for converting a Google Drive file => Document object.
"""
sections: list[TextSection | ImageSection | TabularSection] = []
staged_file_id: str | None = None
# Only construct these services when needed
def _get_drive_service() -> GoogleDriveService:
@@ -686,10 +714,17 @@ def _convert_drive_item_to_document(
logger.debug(
f"found smart chips in {file.get('name')}, aligning with basic sections"
)
basic_sections = _download_and_extract_sections_basic(
file, _get_drive_service(), allow_images, size_threshold
basic_extraction = _download_and_extract_sections_basic(
file,
_get_drive_service(),
allow_images,
size_threshold,
raw_file_callback,
)
sections = align_basic_advanced(basic_sections, doc_sections)
sections = align_basic_advanced(
basic_extraction.sections, doc_sections
)
staged_file_id = basic_extraction.staged_file_id
except Exception as e:
logger.warning(
@@ -697,9 +732,15 @@ def _convert_drive_item_to_document(
)
# Not Google Doc, attempt basic extraction
else:
sections = _download_and_extract_sections_basic(
file, _get_drive_service(), allow_images, size_threshold
basic_extraction = _download_and_extract_sections_basic(
file,
_get_drive_service(),
allow_images,
size_threshold,
raw_file_callback,
)
sections = basic_extraction.sections
staged_file_id = basic_extraction.staged_file_id
# If we still don't have any sections, skip this file
if not sections:
@@ -760,6 +801,7 @@ def _convert_drive_item_to_document(
),
external_access=external_access,
parent_hierarchy_raw_node_id=(file.get("parents") or [None])[0],
file_id=staged_file_id,
)
except Exception as e:
doc_id = "unknown"

View File

@@ -167,13 +167,6 @@ class GoogleDriveCheckpoint(ConnectorCheckpoint):
default_factory=ThreadSafeSet
)
# Maps email → set of IDs of folders where that email confirmed no accessible parent.
# Avoids redundant API calls when the same (folder, email) pair is
# encountered again within the same retrieval run.
failed_folder_ids_by_email: ThreadSafeDict[str, ThreadSafeSet[str]] = Field(
default_factory=ThreadSafeDict
)
@field_serializer("completion_map")
def serialize_completion_map(
self, completion_map: ThreadSafeDict[str, StageCompletion], _info: Any
@@ -218,25 +211,3 @@ class GoogleDriveCheckpoint(ConnectorCheckpoint):
if isinstance(v, list):
return ThreadSafeSet(set(v)) # ty: ignore[invalid-return-type]
return ThreadSafeSet()
@field_serializer("failed_folder_ids_by_email")
def serialize_failed_folder_ids_by_email(
    self,
    failed_folder_ids_by_email: ThreadSafeDict[str, ThreadSafeSet[str]],
    _info: Any,
) -> dict[str, set[str]]:
    """Serialize the per-email failed-folder map from copies of its contents.

    A copy of the outer mapping is taken first, then each inner set is
    copied, so the result does not alias the live checkpoint containers.
    """
    snapshot = failed_folder_ids_by_email.copy()
    serialized: dict[str, set[str]] = {}
    for email, folder_ids in snapshot.items():
        serialized[email] = folder_ids.copy()
    return serialized
@field_validator("failed_folder_ids_by_email", mode="before")
def validate_failed_folder_ids_by_email(
    cls, v: Any
) -> ThreadSafeDict[str, ThreadSafeSet[str]]:
    """Coerce incoming data into a ThreadSafeDict of ThreadSafeSets.

    An existing ThreadSafeDict is passed through untouched, a plain dict has
    each value rebuilt as a ThreadSafeSet, and anything else yields an empty
    ThreadSafeDict.
    """
    if isinstance(v, ThreadSafeDict):
        return v
    if not isinstance(v, dict):
        return ThreadSafeDict()
    converted = {}
    for email, folder_ids in v.items():
        converted[email] = ThreadSafeSet(set(folder_ids))
    return ThreadSafeDict(converted)

View File

@@ -15,6 +15,7 @@ from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import HierarchyNode
from onyx.connectors.models import SlimDocument
from onyx.file_store.staging import RawFileCallback
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.variable_functionality import fetch_ee_implementation_or_noop
@@ -42,6 +43,9 @@ class NormalizationResult(BaseModel):
class BaseConnector(abc.ABC, Generic[CT]):
REDIS_KEY_PREFIX = "da_connector_data:"
# Optional raw-file persistence hook to save original file
raw_file_callback: RawFileCallback | None = None
@abc.abstractmethod
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
raise NotImplementedError
@@ -88,6 +92,15 @@ class BaseConnector(abc.ABC, Generic[CT]):
"""Implement if the underlying connector wants to skip/allow image downloading
based on the application level image analysis setting."""
def set_raw_file_callback(self, callback: RawFileCallback) -> None:
    """Attach the per-attempt raw-file persistence callback to this connector.

    The docfetching entrypoint wires this up via `instantiate_connector`.
    Connectors that have no interest in persisting raw bytes never need to
    look at it; for them `raw_file_callback` just remains `None`.
    """
    self.raw_file_callback = callback
@classmethod
def normalize_url(cls, url: str) -> "NormalizationResult": # noqa: ARG003
"""Normalize a URL to match the canonical Document.id format used during ingestion.

View File

@@ -62,19 +62,17 @@ def best_effort_get_field_from_issue(jira_issue: Issue, field: str) -> Any:
def extract_text_from_adf(adf: dict | None) -> str:
"""Extracts plain text from Atlassian Document Format:
https://developer.atlassian.com/cloud/jira/platform/apis/document/structure/
WARNING: This function is incomplete and will e.g. skip lists!
"""
texts: list[str] = []
def _extract(node: dict) -> None:
if node.get("type") == "text":
text = node.get("text", "")
if text:
texts.append(text)
for child in node.get("content", []):
_extract(child)
if adf is not None:
_extract(adf)
# TODO: complete this function
texts = []
if adf is not None and "content" in adf:
for block in adf["content"]:
if "content" in block:
for item in block["content"]:
if item["type"] == "text":
texts.append(item["text"])
return " ".join(texts)

View File

@@ -231,6 +231,8 @@ class DocumentBase(BaseModel):
# Set during docfetching after hierarchy nodes are cached
parent_hierarchy_node_id: int | None = None
file_id: str | None = None
def get_title_for_document_index(
self,
) -> str | None:
@@ -370,6 +372,7 @@ class Document(DocumentBase):
secondary_owners=base.secondary_owners,
title=base.title,
from_ingestion_api=base.from_ingestion_api,
file_id=base.file_id,
)
def __sizeof__(self) -> int:

View File

@@ -1958,7 +1958,8 @@ class SharepointConnector(
self._graph_client = GraphClient(
_acquire_token_for_graph, environment=self._azure_environment
)
self.sp_tenant_domain = self._resolve_tenant_domain()
if auth_method == SharepointAuthMethod.CERTIFICATE.value:
self.sp_tenant_domain = self._resolve_tenant_domain()
return None
def _get_drive_names_for_site(self, site_url: str) -> list[str]:

View File

@@ -81,7 +81,9 @@ class ZulipConnector(LoadConnector, PollConnector):
# zuliprc file. This reverts them back to newlines.
contents_spaces_to_newlines = contents.replace(" ", "\n")
# create a temporary zuliprc file
tempdir = tempfile.gettempdir()
tempdir = tempfile.tempdir
if tempdir is None:
raise Exception("Could not determine tempfile directory")
config_file = os.path.join(tempdir, f"zuliprc-{self.realm_name}")
with open(config_file, "w") as f:
f.write(contents_spaces_to_newlines)

View File

@@ -244,21 +244,13 @@ def fetch_latest_index_attempts_by_status(
return query.all()
_INTERNAL_ONLY_SOURCES = {
# Used by the ingestion API, not a user-created connector.
DocumentSource.INGESTION_API,
# Backs the user library / build feature, not a connector users filter by.
DocumentSource.CRAFT_FILE,
}
def fetch_unique_document_sources(db_session: Session) -> list[DocumentSource]:
    """Return the distinct connector sources, omitting internal-only ones."""
    rows = db_session.query(Connector.source).distinct().all()
    visible_sources = []
    for row in rows:
        source = row[0]
        # Internal sources are not connectors a user would filter by.
        if source in _INTERNAL_ONLY_SOURCES:
            continue
        if source == DocumentSource.INGESTION_API:
            continue
        visible_sources.append(source)
    return visible_sources

View File

@@ -52,6 +52,7 @@ from onyx.db.utils import DocumentRow
from onyx.db.utils import model_to_dict
from onyx.db.utils import SortOrder
from onyx.document_index.interfaces import DocumentMetadata
from onyx.file_store.staging import delete_files_best_effort
from onyx.kg.models import KGStage
from onyx.server.documents.models import ConnectorCredentialPairIdentifier
from onyx.utils.logger import setup_logger
@@ -696,6 +697,7 @@ def upsert_documents(
else {}
),
doc_metadata=doc.doc_metadata,
file_id=doc.file_id,
)
)
for doc in seen_documents.values()
@@ -712,6 +714,7 @@ def upsert_documents(
"secondary_owners": insert_stmt.excluded.secondary_owners,
"doc_metadata": insert_stmt.excluded.doc_metadata,
"parent_hierarchy_node_id": insert_stmt.excluded.parent_hierarchy_node_id,
"file_id": insert_stmt.excluded.file_id,
}
if includes_permissions:
# Use COALESCE to preserve existing permissions when new values are NULL.
@@ -925,6 +928,26 @@ def delete_documents__no_commit(db_session: Session, document_ids: list[str]) ->
db_session.execute(delete(DbDocument).where(DbDocument.id.in_(document_ids)))
def get_file_ids_for_document_ids(
    db_session: Session,
    document_ids: list[str],
) -> list[str]:
    """Look up the non-null `file_id` values of the given documents.

    Called at deletion time to list the raw files that must be removed from
    the file store once the document rows that own them are gone. An empty
    `document_ids` short-circuits to an empty list without querying.
    """
    if not document_ids:
        return []

    file_id_query = db_session.query(DbDocument.file_id).filter(
        DbDocument.id.in_(document_ids),
        DbDocument.file_id.isnot(None),
    )
    return [row.file_id for row in file_id_query.all()]
def delete_documents_complete__no_commit(
db_session: Session, document_ids: list[str]
) -> None:
@@ -968,6 +991,32 @@ def delete_documents_complete__no_commit(
delete_documents__no_commit(db_session, document_ids)
def delete_documents_complete(
    db_session: Session,
    document_ids: list[str],
) -> None:
    """Remove documents entirely, then best-effort delete their attached files.

    Steps, in order: collect the documents' file_ids, delete the rows and
    everything that references them, commit, and only then delete the files.
    Deleting files after the commit means a rolled-back transaction can never
    leave a `document` row pointing at a file that no longer exists.

    Prefer this over `delete_documents_complete__no_commit` unless the
    deletion has to be composed with other work in a single transaction.
    """
    # Must be read before the rows disappear.
    orphaned_file_ids = get_file_ids_for_document_ids(
        db_session=db_session,
        document_ids=document_ids,
    )

    delete_documents_complete__no_commit(
        db_session=db_session,
        document_ids=document_ids,
    )
    db_session.commit()

    delete_files_best_effort(orphaned_file_ids)
def delete_all_documents_for_connector_credential_pair(
db_session: Session,
connector_id: int,
@@ -999,10 +1048,9 @@ def delete_all_documents_for_connector_credential_pair(
if not document_ids:
break
delete_documents_complete__no_commit(
delete_documents_complete(
db_session=db_session, document_ids=list(document_ids)
)
db_session.commit()
if time.monotonic() - start_time > timeout:
raise RuntimeError("Timeout reached while deleting documents")

View File

@@ -62,6 +62,21 @@ def delete_filerecord_by_file_id(
db_session.query(FileRecord).filter_by(file_id=file_id).delete()
def update_filerecord_origin(
    file_id: str,
    from_origin: FileOrigin,
    to_origin: FileOrigin,
    db_session: Session,
) -> None:
    """Switch a file_record's `file_origin` from `from_origin` to `to_origin`.

    Only rows that currently carry `from_origin` are matched, which makes
    repeated calls harmless. Nothing is committed here; the caller does that.
    """
    matching_records = db_session.query(FileRecord).filter(
        FileRecord.file_id == file_id,
        FileRecord.file_origin == from_origin,
    )
    matching_records.update({FileRecord.file_origin: to_origin})
def upsert_filerecord(
file_id: str,
display_name: str,

View File

@@ -952,6 +952,7 @@ class Document(Base):
semantic_id: Mapped[str] = mapped_column(NullFilteredString)
# First Section's link
link: Mapped[str | None] = mapped_column(NullFilteredString, nullable=True)
file_id: Mapped[str | None] = mapped_column(String, nullable=True)
# The updated time is also used as a measure of the last successful state of the doc
# pulled from the source (to help skip reindexing already updated docs in case of

View File

@@ -20,7 +20,6 @@ from onyx.background.celery.tasks.opensearch_migration.constants import (
TOTAL_ALLOWABLE_DOC_MIGRATION_ATTEMPTS_BEFORE_PERMANENT_FAILURE,
)
from onyx.configs.app_configs import ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX
from onyx.configs.app_configs import ONYX_DISABLE_VESPA
from onyx.db.enums import OpenSearchDocumentMigrationStatus
from onyx.db.models import Document
from onyx.db.models import OpenSearchDocumentMigrationRecord
@@ -413,11 +412,7 @@ def get_opensearch_retrieval_state(
If the tenant migration record is not found, defaults to
ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX.
If ONYX_DISABLE_VESPA is True, always returns True.
"""
if ONYX_DISABLE_VESPA:
return True
record = db_session.query(OpenSearchTenantMigrationRecord).first()
if record is None:
return ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX

View File

@@ -3,7 +3,6 @@ from sqlalchemy.orm import Session
from onyx.configs.app_configs import DISABLE_VECTOR_DB
from onyx.configs.app_configs import ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
from onyx.configs.app_configs import ONYX_DISABLE_VESPA
from onyx.db.models import SearchSettings
from onyx.db.opensearch_migration import get_opensearch_retrieval_state
from onyx.document_index.disabled import DisabledDocumentIndex
@@ -49,11 +48,6 @@ def get_default_document_index(
secondary_large_chunks_enabled = secondary_search_settings.large_chunks_enabled
opensearch_retrieval_enabled = get_opensearch_retrieval_state(db_session)
if ONYX_DISABLE_VESPA:
if not opensearch_retrieval_enabled:
raise ValueError(
"BUG: ONYX_DISABLE_VESPA is set but opensearch_retrieval_enabled is not set."
)
if opensearch_retrieval_enabled:
indexing_setting = IndexingSetting.from_db_model(search_settings)
secondary_indexing_setting = (
@@ -125,32 +119,21 @@ def get_all_document_indices(
)
]
result: list[DocumentIndex] = []
if ONYX_DISABLE_VESPA:
if not ENABLE_OPENSEARCH_INDEXING_FOR_ONYX:
raise ValueError(
"ONYX_DISABLE_VESPA is set but ENABLE_OPENSEARCH_INDEXING_FOR_ONYX is not set."
)
else:
vespa_document_index = VespaIndex(
index_name=search_settings.index_name,
secondary_index_name=(
secondary_search_settings.index_name
if secondary_search_settings
else None
),
large_chunks_enabled=search_settings.large_chunks_enabled,
secondary_large_chunks_enabled=(
secondary_search_settings.large_chunks_enabled
if secondary_search_settings
else None
),
multitenant=MULTI_TENANT,
httpx_client=httpx_client,
)
result.append(vespa_document_index)
vespa_document_index = VespaIndex(
index_name=search_settings.index_name,
secondary_index_name=(
secondary_search_settings.index_name if secondary_search_settings else None
),
large_chunks_enabled=search_settings.large_chunks_enabled,
secondary_large_chunks_enabled=(
secondary_search_settings.large_chunks_enabled
if secondary_search_settings
else None
),
multitenant=MULTI_TENANT,
httpx_client=httpx_client,
)
opensearch_document_index: OpenSearchOldDocumentIndex | None = None
if ENABLE_OPENSEARCH_INDEXING_FOR_ONYX:
indexing_setting = IndexingSetting.from_db_model(search_settings)
secondary_indexing_setting = (
@@ -186,6 +169,7 @@ def get_all_document_indices(
multitenant=MULTI_TENANT,
httpx_client=httpx_client,
)
result: list[DocumentIndex] = [vespa_document_index]
if opensearch_document_index:
result.append(opensearch_document_index)
return result

View File

@@ -98,6 +98,9 @@ class DocumentMetadata:
# The resolved database ID of the parent hierarchy node (folder/container)
parent_hierarchy_node_id: int | None = None
# Opt-in pointer to the persisted raw file for this document (file_store id).
file_id: str | None = None
@dataclass
class VespaDocumentFields:

View File

@@ -17,7 +17,6 @@ from onyx.configs.app_configs import OPENSEARCH_ADMIN_PASSWORD
from onyx.configs.app_configs import OPENSEARCH_ADMIN_USERNAME
from onyx.configs.app_configs import OPENSEARCH_HOST
from onyx.configs.app_configs import OPENSEARCH_REST_API_PORT
from onyx.configs.app_configs import OPENSEARCH_USE_SSL
from onyx.document_index.interfaces_new import TenantState
from onyx.document_index.opensearch.constants import OpenSearchSearchType
from onyx.document_index.opensearch.schema import DocumentChunk
@@ -133,7 +132,7 @@ class OpenSearchClient(AbstractContextManager):
host: str = OPENSEARCH_HOST,
port: int = OPENSEARCH_REST_API_PORT,
auth: tuple[str, str] = (OPENSEARCH_ADMIN_USERNAME, OPENSEARCH_ADMIN_PASSWORD),
use_ssl: bool = OPENSEARCH_USE_SSL,
use_ssl: bool = True,
verify_certs: bool = False,
ssl_show_warn: bool = False,
timeout: int = DEFAULT_OPENSEARCH_CLIENT_TIMEOUT_S,
@@ -303,7 +302,7 @@ class OpenSearchIndexClient(OpenSearchClient):
host: str = OPENSEARCH_HOST,
port: int = OPENSEARCH_REST_API_PORT,
auth: tuple[str, str] = (OPENSEARCH_ADMIN_USERNAME, OPENSEARCH_ADMIN_PASSWORD),
use_ssl: bool = OPENSEARCH_USE_SSL,
use_ssl: bool = True,
verify_certs: bool = False,
ssl_show_warn: bool = False,
timeout: int = DEFAULT_OPENSEARCH_CLIENT_TIMEOUT_S,
@@ -508,55 +507,8 @@ class OpenSearchIndexClient(OpenSearchClient):
Raises:
Exception: There was an error updating the settings of the index.
"""
logger.debug(f"Updating settings of index {self._index_name} with {settings}.")
response = self._client.indices.put_settings(
index=self._index_name, body=settings
)
if not response.get("acknowledged", False):
raise RuntimeError(
f"Failed to update settings of index {self._index_name}."
)
logger.debug(f"Settings of index {self._index_name} updated successfully.")
@log_function_time(print_only=True, debug_only=True)
def get_settings(self) -> dict[str, Any]:
    """Fetch the settings of this client's index.

    Returns:
        The "settings" entry of the response for this index.

    Raises:
        Exception: The settings could not be retrieved.
    """
    logger.debug(f"Getting settings of index {self._index_name}.")
    settings_by_index = self._client.indices.get_settings(index=self._index_name)
    index_entry = settings_by_index[self._index_name]
    return index_entry["settings"]
@log_function_time(print_only=True, debug_only=True)
def open_index(self) -> None:
    """Open this client's index.

    Raises:
        RuntimeError: The open request was not acknowledged.
        Exception: The open request itself failed.
    """
    logger.debug(f"Opening index {self._index_name}.")
    response = self._client.indices.open(index=self._index_name)
    acknowledged = response.get("acknowledged", False)
    if acknowledged:
        logger.debug(f"Index {self._index_name} opened successfully.")
        return
    raise RuntimeError(f"Failed to open index {self._index_name}.")
@log_function_time(print_only=True, debug_only=True)
def close_index(self) -> None:
    """Close this client's index.

    Raises:
        RuntimeError: The close request was not acknowledged.
        Exception: The close request itself failed.
    """
    logger.debug(f"Closing index {self._index_name}.")
    response = self._client.indices.close(index=self._index_name)
    acknowledged = response.get("acknowledged", False)
    if acknowledged:
        logger.debug(f"Index {self._index_name} closed successfully.")
        return
    raise RuntimeError(f"Failed to close index {self._index_name}.")
# TODO(andrei): Implement this.
raise NotImplementedError
@log_function_time(
print_only=True,

View File

@@ -23,7 +23,6 @@ import openpyxl
from openpyxl.worksheet.worksheet import Worksheet
from PIL import Image
from onyx.configs.app_configs import MAX_EMBEDDED_IMAGES_PER_FILE
from onyx.configs.constants import ONYX_METADATA_FILENAME
from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
from onyx.file_processing.file_types import OnyxFileExtensions
@@ -48,7 +47,6 @@ KNOWN_OPENPYXL_BUGS = [
"File contains no valid workbook part",
"Unable to read workbook: could not read stylesheet from None",
"Colors must be aRGB hex values",
"Max value is",
]
@@ -193,56 +191,6 @@ def read_text_file(
return file_content_raw, metadata
def count_pdf_embedded_images(file: IO[Any], cap: int) -> int:
    """Count the embedded images in a PDF, giving up once the count passes `cap`.

    Meant for rejecting PDFs with so many images that indexing them would
    exhaust worker memory. Counting stops as soon as the total exceeds `cap`,
    and that over-the-cap value is returned as a sentinel so callers never
    pay for walking thousands of image objects.

    Encrypted PDFs are tried with an empty password (enough for files that
    only carry an owner password). If that does not unlock the file, 0 is
    returned; callers should run their password-protection check first.
    Any other failure while parsing is logged and also returns 0.

    If the stream position could be read on entry, the stream is rewound to
    the start for parsing and put back (best-effort) before returning.
    """
    from pypdf import PdfReader

    try:
        original_offset = file.tell()
    except Exception:
        original_offset = None
    can_seek = original_offset is not None

    try:
        if can_seek:
            file.seek(0)
        reader = PdfReader(file)

        if reader.is_encrypted:
            # Empty password first (owner-password-only PDFs); otherwise give up.
            try:
                unlocked = reader.decrypt("") != 0
            except Exception:
                unlocked = False
            if not unlocked:
                return 0

        total = 0
        for page in reader.pages:
            for _image in page.images:
                total += 1
                if total > cap:
                    return total
        return total
    except Exception:
        logger.warning("Failed to count embedded images in PDF", exc_info=True)
        return 0
    finally:
        if can_seek:
            try:
                file.seek(original_offset)
            except Exception:
                pass
def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
"""
Extract text from a PDF. For embedded images, a more complex approach is needed.
@@ -306,27 +254,8 @@ def read_pdf_file(
)
if extract_images:
image_cap = MAX_EMBEDDED_IMAGES_PER_FILE
images_processed = 0
cap_reached = False
for page_num, page in enumerate(pdf_reader.pages):
if cap_reached:
break
for image_file_object in page.images:
if images_processed >= image_cap:
# Defense-in-depth backstop. Upload-time validation
# should have rejected files exceeding the cap, but
# we also break here so a single oversized file can
# never pin a worker.
logger.warning(
"PDF embedded image cap reached (%d). "
"Skipping remaining images on page %d and beyond.",
image_cap,
page_num + 1,
)
cap_reached = True
break
image = Image.open(io.BytesIO(image_file_object.data))
img_byte_arr = io.BytesIO()
image.save(img_byte_arr, format=image.format)
@@ -339,7 +268,6 @@ def read_pdf_file(
image_callback(img_bytes, image_name)
else:
extracted_images.append((img_bytes, image_name))
images_processed += 1
return text, metadata, extracted_images

View File

@@ -0,0 +1,103 @@
from collections.abc import Callable
from typing import Any
from typing import IO
from sqlalchemy.orm import Session
from onyx.configs.constants import FileOrigin
from onyx.db.file_record import update_filerecord_origin
from onyx.file_store.file_store import get_default_file_store
from onyx.utils.logger import setup_logger
logger = setup_logger()
# (content, content_type) -> file_id
RawFileCallback = Callable[[IO[bytes], str], str]
def stage_raw_file(
    content: IO,
    content_type: str,
    *,
    metadata: dict[str, Any],
) -> str:
    """Save raw bytes to the file store under FileOrigin.INDEXING_STAGING.

    `metadata` is stored on the file_record so that later promotion (in
    docprocessing) and orphan reaping (TTL janitor) can find the file by the
    context it came from.

    Returns:
        The file_id returned by the file store.
    """
    store = get_default_file_store()
    return store.save_file(
        content=content,
        display_name=None,
        file_origin=FileOrigin.INDEXING_STAGING,
        file_type=content_type,
        file_metadata=metadata,
    )
def build_raw_file_callback(
    *,
    index_attempt_id: int,
    cc_pair_id: int,
    tenant_id: str,
) -> RawFileCallback:
    """Create the per-attempt callback connectors use to persist raw files.

    The attempt-level context is captured once and used as the staging
    metadata on every call, so a connector only supplies the per-file pieces
    (bytes and content type) and receives a file_id to put on its Document.
    """
    attempt_context: dict[str, Any] = dict(
        index_attempt_id=index_attempt_id,
        cc_pair_id=cc_pair_id,
        tenant_id=tenant_id,
    )

    def _stage(content: IO[bytes], content_type: str) -> str:
        return stage_raw_file(content, content_type, metadata=attempt_context)

    return _stage
def delete_files_best_effort(file_ids: list[str]) -> None:
    """Delete each given file from the file store without ever raising.

    A failure on one file is logged and the remaining files are still
    attempted. This runs at document-deletion time to reap raw files attached
    through `Document.file_id`; the caller has already removed the document
    rows, so a failed delete leaves a recoverable orphan, not a dangling
    pointer.
    """
    if file_ids:
        file_store = get_default_file_store()
        for file_id in file_ids:
            try:
                file_store.delete_file(file_id, error_on_missing=False)
            except Exception:
                logger.exception(
                    f"Failed to delete file_id={file_id} during document cleanup"
                )
def promote_staged_file(db_session: Session, file_id: str) -> None:
    """Flip a staged file's origin to `FileOrigin.CONNECTOR`.

    Safe to call repeatedly: the update only matches records still marked
    INDEXING_STAGING, so it does nothing once the file has been promoted or
    removed. No commit happens here, letting the caller keep promotion in the
    same transaction as its own document bookkeeping.
    """
    staged_origin = FileOrigin.INDEXING_STAGING
    promoted_origin = FileOrigin.CONNECTOR
    update_filerecord_origin(
        file_id=file_id,
        from_origin=staged_origin,
        to_origin=promoted_origin,
        db_session=db_session,
    )

View File

@@ -49,6 +49,7 @@ from onyx.document_index.interfaces import DocumentMetadata
from onyx.document_index.interfaces import IndexBatchParams
from onyx.file_processing.image_summarization import summarize_image_with_error_handling
from onyx.file_store.file_store import get_default_file_store
from onyx.file_store.staging import promote_staged_file
from onyx.hooks.executor import execute_hook
from onyx.hooks.executor import HookSkipped
from onyx.hooks.executor import HookSoftFailed
@@ -154,6 +155,7 @@ def _upsert_documents_in_db(
doc_metadata=doc.doc_metadata,
# parent_hierarchy_node_id is resolved in docfetching using Redis cache
parent_hierarchy_node_id=doc.parent_hierarchy_node_id,
file_id=doc.file_id,
)
document_metadata_list.append(db_doc_metadata)
@@ -364,6 +366,39 @@ def index_doc_batch_with_handler(
return index_pipeline_result
def _apply_file_id_transitions(
    documents: list[Document],
    previous_file_ids: dict[str, str],
    db_session: Session,
) -> None:
    """Settle the raw-file lifecycle for every document in the batch.

    For each document, the file_id it carries now is compared with the one
    recorded in `previous_file_ids` (keyed by document id; a missing key means
    no previous file). When they differ:

    * a non-None new file_id is promoted via `promote_staged_file`;
    * a non-None previous file_id is deleted from the file store.

    Promotion always runs before the delete. Delete failures are logged and
    swallowed so one bad file cannot abort the batch.
    """
    file_store = get_default_file_store()
    for doc in documents:
        current_id = doc.file_id
        replaced_id = previous_file_ids.get(doc.id)

        if current_id != replaced_id:
            if current_id is not None:
                promote_staged_file(db_session=db_session, file_id=current_id)

            if replaced_id is None:
                # Nothing was attached before, so there is nothing to reap.
                continue

            try:
                file_store.delete_file(replaced_id, error_on_missing=False)
            except Exception:
                logger.exception(
                    f"Failed to delete replaced file_id={replaced_id}; "
                    "will be reaped by janitor."
                )
def index_doc_batch_prepare(
documents: list[Document],
index_attempt_metadata: IndexAttemptMetadata,
@@ -382,6 +417,11 @@ def index_doc_batch_prepare(
document_ids=document_ids,
)
# Capture previous file_ids BEFORE any writes so we know what to reap.
previous_file_ids: dict[str, str] = {
db_doc.id: db_doc.file_id for db_doc in db_docs if db_doc.file_id is not None
}
updatable_docs = (
get_doc_ids_to_update(documents=documents, db_docs=db_docs)
if not ignore_time_skip
@@ -404,6 +444,11 @@ def index_doc_batch_prepare(
index_attempt_metadata=index_attempt_metadata,
db_session=db_session,
)
_apply_file_id_transitions(
documents=updatable_docs,
previous_file_ids=previous_file_ids,
db_session=db_session,
)
logger.info(
f"Upserted {len(updatable_docs)} changed docs out of {len(documents)} total docs into the DB"

View File

@@ -19,14 +19,9 @@ from onyx.configs.app_configs import MCP_SERVER_CORS_ORIGINS
from onyx.mcp_server.auth import OnyxTokenVerifier
from onyx.mcp_server.utils import shutdown_http_client
from onyx.utils.logger import setup_logger
from onyx.utils.variable_functionality import set_is_ee_based_on_env_variable
logger = setup_logger()
# Initialize EE flag at module import so it's set regardless of the entry point
# (python -m onyx.mcp_server_main, uvicorn onyx.mcp_server.api:mcp_app, etc.).
set_is_ee_based_on_env_variable()
logger.info("Creating Onyx MCP Server...")
mcp_server = FastMCP(

View File

@@ -1,5 +1,4 @@
"""Resource registrations for the Onyx MCP server."""
# Import resource modules so decorators execute when the package loads.
from onyx.mcp_server.resources import document_sets # noqa: F401
from onyx.mcp_server.resources import indexed_sources # noqa: F401

View File

@@ -1,41 +0,0 @@
"""Resource exposing document sets available to the current user."""
from __future__ import annotations
import json
from onyx.mcp_server.api import mcp_server
from onyx.mcp_server.utils import get_accessible_document_sets
from onyx.mcp_server.utils import require_access_token
from onyx.utils.logger import setup_logger
logger = setup_logger()
@mcp_server.resource(
    "resource://document_sets",
    name="document_sets",
    description=(
        "Enumerate the Document Sets accessible to the current user. Use the "
        "returned `name` values with the `document_set_names` filter of the "
        "`search_indexed_documents` tool to scope searches to a specific set."
    ),
    mime_type="application/json",
)
async def document_sets_resource() -> str:
    """Serve the caller's accessible document sets as a JSON array string.

    Entries are fetched with the current access token, ordered by `name`,
    and each one is dumped in JSON mode before the list is serialized.
    """
    token = require_access_token()
    accessible = await get_accessible_document_sets(token)
    by_name = sorted(accessible, key=lambda entry: entry.name)

    logger.info(
        "Onyx MCP Server: document_sets resource returning %s entries",
        len(by_name),
    )

    # FastMCP 3.2+ requires str/bytes/list[ResourceContent] — it no longer
    # auto-serializes, so the JSON encoding is done here.
    serializable = [entry.model_dump(mode="json") for entry in by_name]
    return json.dumps(serializable)

View File

@@ -2,7 +2,7 @@
from __future__ import annotations
import json
from typing import Any
from onyx.mcp_server.api import mcp_server
from onyx.mcp_server.utils import get_indexed_sources
@@ -21,7 +21,7 @@ logger = setup_logger()
),
mime_type="application/json",
)
async def indexed_sources_resource() -> str:
async def indexed_sources_resource() -> dict[str, Any]:
"""Return the list of indexed source types for search filtering."""
access_token = require_access_token()
@@ -33,6 +33,6 @@ async def indexed_sources_resource() -> str:
len(sources),
)
# FastMCP 3.2+ requires str/bytes/list[ResourceContent] — it no longer
# auto-serializes; serialize to JSON ourselves.
return json.dumps(sorted(sources))
return {
"indexed_sources": sorted(sources),
}

View File

@@ -4,23 +4,12 @@ from datetime import datetime
from typing import Any
import httpx
from fastmcp.server.auth.auth import AccessToken
from pydantic import BaseModel
from onyx.chat.models import ChatFullResponse
from onyx.configs.constants import DocumentSource
from onyx.context.search.models import BaseFilters
from onyx.context.search.models import SearchDoc
from onyx.mcp_server.api import mcp_server
from onyx.mcp_server.utils import get_http_client
from onyx.mcp_server.utils import get_indexed_sources
from onyx.mcp_server.utils import require_access_token
from onyx.server.features.web_search.models import OpenUrlsToolRequest
from onyx.server.features.web_search.models import OpenUrlsToolResponse
from onyx.server.features.web_search.models import WebSearchToolRequest
from onyx.server.features.web_search.models import WebSearchToolResponse
from onyx.server.query_and_chat.models import ChatSessionCreationRequest
from onyx.server.query_and_chat.models import SendMessageRequest
from onyx.utils.logger import setup_logger
from onyx.utils.variable_functionality import build_api_server_url_for_http_requests
from onyx.utils.variable_functionality import global_version
@@ -28,43 +17,6 @@ from onyx.utils.variable_functionality import global_version
logger = setup_logger()
# CE search falls through to the chat endpoint, which invokes an LLM — the
# default 60s client timeout is not enough for a real RAG-backed response.
_CE_SEARCH_TIMEOUT_SECONDS = 300.0
async def _post_model(
    url: str,
    body: BaseModel,
    access_token: AccessToken,
    timeout: float | None = None,
) -> httpx.Response:
    """Send `body` to `url` as a JSON POST using the shared HTTP client.

    The model is serialized with `exclude_unset=True`, so only fields that
    were explicitly set are sent. The request carries a bearer token taken
    from `access_token`. Passing `timeout=None` defers to the client's own
    default timeout; any other value overrides it for this request only.
    """
    client = get_http_client()
    serialized_body = body.model_dump_json(exclude_unset=True)
    request_headers = {
        "Authorization": f"Bearer {access_token.token}",
        "Content-Type": "application/json",
    }

    if timeout is None:
        effective_timeout = httpx.USE_CLIENT_DEFAULT
    else:
        effective_timeout = timeout

    return await client.post(
        url,
        content=serialized_body,
        headers=request_headers,
        timeout=effective_timeout,
    )
def _project_doc(doc: SearchDoc, content: str | None) -> dict[str, Any]:
    """Reduce a backend search doc to the dict shape returned over MCP.

    `content` is supplied by the caller rather than read from `doc`. The
    original note says SearchDocWithContent (EE) is accepted as well because
    it extends SearchDoc.
    """
    projected: dict[str, Any] = {}
    projected["semantic_identifier"] = doc.semantic_identifier
    projected["content"] = content
    projected["source_type"] = doc.source_type.value
    projected["link"] = doc.link
    projected["score"] = doc.score
    return projected
def _extract_error_detail(response: httpx.Response) -> str:
"""Extract a human-readable error message from a failed backend response.
@@ -84,7 +36,6 @@ def _extract_error_detail(response: httpx.Response) -> str:
async def search_indexed_documents(
query: str,
source_types: list[str] | None = None,
document_set_names: list[str] | None = None,
time_cutoff: str | None = None,
limit: int = 10,
) -> dict[str, Any]:
@@ -102,10 +53,6 @@ async def search_indexed_documents(
In EE mode, the dedicated search endpoint is used instead.
To find a list of available sources, use the `indexed_sources` resource.
`document_set_names` restricts results to documents belonging to the named
Document Sets — useful for scoping queries to a curated subset of the
knowledge base (e.g. to isolate knowledge between agents). Use the
`document_sets` resource to discover accessible set names.
Returns chunks of text as search results with snippets, scores, and metadata.
Example usage:
@@ -113,23 +60,15 @@ async def search_indexed_documents(
{
"query": "What is the latest status of PROJ-1234 and what is the next development item?",
"source_types": ["jira", "google_drive", "github"],
"document_set_names": ["Engineering Wiki"],
"time_cutoff": "2025-11-24T00:00:00Z",
"limit": 10,
}
```
"""
logger.info(
f"Onyx MCP Server: document search: query='{query}', sources={source_types}, "
f"document_sets={document_set_names}, limit={limit}"
f"Onyx MCP Server: document search: query='{query}', sources={source_types}, limit={limit}"
)
# Normalize empty list inputs to None so downstream filter construction is
# consistent — BaseFilters treats [] as "match zero" which differs from
# "no filter" (None).
source_types = source_types or None
document_set_names = document_set_names or None
# Parse time_cutoff string to datetime if provided
time_cutoff_dt: datetime | None = None
if time_cutoff:
@@ -142,6 +81,9 @@ async def search_indexed_documents(
# Continue with no time_cutoff instead of returning an error
time_cutoff_dt = None
# Initialize source_type_enums early to avoid UnboundLocalError
source_type_enums: list[DocumentSource] | None = None
# Get authenticated user from FastMCP's access token
access_token = require_access_token()
@@ -175,7 +117,6 @@ async def search_indexed_documents(
# Convert source_types strings to DocumentSource enums if provided
# Invalid values will be handled by the API server
source_type_enums: list[DocumentSource] | None = None
if source_types is not None:
source_type_enums = []
for src in source_types:
@@ -186,83 +127,83 @@ async def search_indexed_documents(
f"Onyx MCP Server: Invalid source type '{src}' - will be ignored by server"
)
filters: BaseFilters | None = None
if source_type_enums or document_set_names or time_cutoff_dt:
filters = BaseFilters(
source_type=source_type_enums,
document_set=document_set_names,
time_cutoff=time_cutoff_dt,
)
# Build filters dict only with non-None values
filters: dict[str, Any] | None = None
if source_type_enums or time_cutoff_dt:
filters = {}
if source_type_enums:
filters["source_type"] = [src.value for src in source_type_enums]
if time_cutoff_dt:
filters["time_cutoff"] = time_cutoff_dt.isoformat()
base_url = build_api_server_url_for_http_requests(respect_env_override_if_set=True)
is_ee = global_version.is_ee_version()
base_url = build_api_server_url_for_http_requests(respect_env_override_if_set=True)
auth_headers = {"Authorization": f"Bearer {access_token.token}"}
request: BaseModel
search_request: dict[str, Any]
if is_ee:
# EE: use the dedicated search endpoint (no LLM invocation).
# Lazy import so CE deployments that strip ee/ never load this module.
from ee.onyx.server.query_and_chat.models import SendSearchQueryRequest
request = SendSearchQueryRequest(
search_query=query,
filters=filters,
num_docs_fed_to_llm_selection=limit,
run_query_expansion=False,
include_content=True,
stream=False,
)
# EE: use the dedicated search endpoint (no LLM invocation)
search_request = {
"search_query": query,
"filters": filters,
"num_docs_fed_to_llm_selection": limit,
"run_query_expansion": False,
"include_content": True,
"stream": False,
}
endpoint = f"{base_url}/search/send-search-message"
error_key = "error"
docs_key = "search_docs"
content_field = "content"
else:
# CE: fall back to the chat endpoint (invokes LLM, consumes tokens)
request = SendMessageRequest(
message=query,
stream=False,
chat_session_info=ChatSessionCreationRequest(),
internal_search_filters=filters,
)
search_request = {
"message": query,
"stream": False,
"chat_session_info": {},
}
if filters:
search_request["internal_search_filters"] = filters
endpoint = f"{base_url}/chat/send-chat-message"
error_key = "error_msg"
docs_key = "top_documents"
content_field = "blurb"
try:
response = await _post_model(
response = await get_http_client().post(
endpoint,
request,
access_token,
timeout=None if is_ee else _CE_SEARCH_TIMEOUT_SECONDS,
json=search_request,
headers=auth_headers,
)
if not response.is_success:
error_detail = _extract_error_detail(response)
return {
"documents": [],
"total_results": 0,
"query": query,
"error": _extract_error_detail(response),
"error": error_detail,
}
result = response.json()
# Check for error in response
if result.get(error_key):
return {
"documents": [],
"total_results": 0,
"query": query,
"error": result.get(error_key),
}
if is_ee:
from ee.onyx.server.query_and_chat.models import SearchFullResponse
ee_payload = SearchFullResponse.model_validate_json(response.content)
if ee_payload.error:
return {
"documents": [],
"total_results": 0,
"query": query,
"error": ee_payload.error,
}
documents = [
_project_doc(doc, doc.content) for doc in ee_payload.search_docs
]
else:
ce_payload = ChatFullResponse.model_validate_json(response.content)
if ce_payload.error_msg:
return {
"documents": [],
"total_results": 0,
"query": query,
"error": ce_payload.error_msg,
}
documents = [
_project_doc(doc, doc.blurb) for doc in ce_payload.top_documents
]
documents = [
{
"semantic_identifier": doc.get("semantic_identifier"),
"content": doc.get(content_field),
"source_type": doc.get("source_type"),
"link": doc.get("link"),
"score": doc.get("score"),
}
for doc in result.get(docs_key, [])
]
# NOTE: search depth is controlled by the backend persona defaults, not `limit`.
# `limit` only caps the returned list; fewer results may be returned if the
@@ -311,20 +252,23 @@ async def search_web(
access_token = require_access_token()
try:
response = await _post_model(
request_payload = {"queries": [query], "max_results": limit}
response = await get_http_client().post(
f"{build_api_server_url_for_http_requests(respect_env_override_if_set=True)}/web-search/search-lite",
WebSearchToolRequest(queries=[query], max_results=limit),
access_token,
json=request_payload,
headers={"Authorization": f"Bearer {access_token.token}"},
)
if not response.is_success:
error_detail = _extract_error_detail(response)
return {
"error": _extract_error_detail(response),
"error": error_detail,
"results": [],
"query": query,
}
payload = WebSearchToolResponse.model_validate_json(response.content)
response_payload = response.json()
results = response_payload.get("results", [])
return {
"results": [result.model_dump(mode="json") for result in payload.results],
"results": results,
"query": query,
}
except Exception as e:
@@ -361,19 +305,21 @@ async def open_urls(
access_token = require_access_token()
try:
response = await _post_model(
response = await get_http_client().post(
f"{build_api_server_url_for_http_requests(respect_env_override_if_set=True)}/web-search/open-urls",
OpenUrlsToolRequest(urls=urls),
access_token,
json={"urls": urls},
headers={"Authorization": f"Bearer {access_token.token}"},
)
if not response.is_success:
error_detail = _extract_error_detail(response)
return {
"error": _extract_error_detail(response),
"error": error_detail,
"results": [],
}
payload = OpenUrlsToolResponse.model_validate_json(response.content)
response_payload = response.json()
results = response_payload.get("results", [])
return {
"results": [result.model_dump(mode="json") for result in payload.results],
"results": results,
}
except Exception as e:
logger.error(f"Onyx MCP Server: URL fetch error: {e}", exc_info=True)

View File

@@ -5,24 +5,10 @@ from __future__ import annotations
import httpx
from fastmcp.server.auth.auth import AccessToken
from fastmcp.server.dependencies import get_access_token
from pydantic import BaseModel
from pydantic import TypeAdapter
from onyx.utils.logger import setup_logger
from onyx.utils.variable_functionality import build_api_server_url_for_http_requests
class DocumentSetEntry(BaseModel):
    """Minimal document-set shape surfaced to MCP clients.

    Projected from the backend's DocumentSetSummary to avoid coupling MCP to
    admin-only fields (cc-pair summaries, federated connectors, etc.).
    """

    # Name of the document set (required).
    name: str
    # Optional description; defaults to None when not provided.
    description: str | None = None
logger = setup_logger()
# Shared HTTP client reused across requests
@@ -98,32 +84,3 @@ async def get_indexed_sources(
exc_info=True,
)
raise RuntimeError(f"Failed to fetch indexed sources: {exc}") from exc
_DOCUMENT_SET_ENTRIES_ADAPTER = TypeAdapter(list[DocumentSetEntry])
async def get_accessible_document_sets(
    access_token: AccessToken,
) -> list[DocumentSetEntry]:
    """Retrieve the document sets visible to the holder of `access_token`.

    Issues a GET against the API server's `/manage/document-set` route and
    validates the response body into `DocumentSetEntry` objects.

    Raises:
        httpx.HTTPStatusError, httpx.RequestError, ValueError: logged and
            re-raised unchanged.
        RuntimeError: wraps any other exception, with the original attached
            as its cause.
    """
    auth_headers = {"Authorization": f"Bearer {access_token.token}"}
    try:
        client = get_http_client()
        endpoint = f"{build_api_server_url_for_http_requests(respect_env_override_if_set=True)}/manage/document-set"
        response = await client.get(endpoint, headers=auth_headers)
        response.raise_for_status()
        entries = _DOCUMENT_SET_ENTRIES_ADAPTER.validate_json(response.content)
        return entries
    except (httpx.HTTPStatusError, httpx.RequestError, ValueError):
        # Known failure modes: surface them to the caller as-is.
        logger.error(
            "Onyx MCP Server: Failed to fetch document sets",
            exc_info=True,
        )
        raise
    except Exception as exc:
        logger.error(
            "Onyx MCP Server: Unexpected error fetching document sets",
            exc_info=True,
        )
        raise RuntimeError(f"Failed to fetch document sets: {exc}") from exc

View File

@@ -5,7 +5,6 @@ import uvicorn
from onyx.configs.app_configs import MCP_SERVER_ENABLED
from onyx.configs.app_configs import MCP_SERVER_HOST
from onyx.configs.app_configs import MCP_SERVER_PORT
from onyx.tracing.setup import setup_tracing
from onyx.utils.logger import setup_logger
from onyx.utils.variable_functionality import set_is_ee_based_on_env_variable
@@ -19,7 +18,6 @@ def main() -> None:
return
set_is_ee_based_on_env_variable()
setup_tracing()
logger.info(f"Starting MCP server on {MCP_SERVER_HOST}:{MCP_SERVER_PORT}")
from onyx.mcp_server.api import mcp_app

View File

@@ -11,8 +11,6 @@ All public functions no-op in single-tenant mode (`MULTI_TENANT=False`).
import time
from typing import cast
from prometheus_client import Counter
from prometheus_client import Gauge
from redis.client import Redis
from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
@@ -28,40 +26,6 @@ logger = setup_logger()
_SET_KEY = "active_tenants"
# --- Prometheus metrics ---
# Gauge: last observed size of the `active_tenants` sorted set.
_active_set_size = Gauge(
    "onyx_tenant_work_gating_active_set_size",
    "Current cardinality of the active_tenants sorted set (updated once per "
    "generator invocation when the gate reads it).",
)

# Counter: writes into the active set, labelled by the calling site.
_marked_total = Counter(
    "onyx_tenant_work_gating_marked_total",
    "Writes into active_tenants, labelled by caller.",
    ["caller"],
)

# Counter: fanouts actually skipped (enforce mode), labelled by task.
_skipped_total = Counter(
    "onyx_tenant_work_gating_skipped_total",
    "Per-tenant fanouts skipped by the gate (enforce mode only), by task.",
    ["task"],
)

# Counter: shadow count of fanouts that would be skipped under enforcement.
_would_skip_total = Counter(
    "onyx_tenant_work_gating_would_skip_total",
    "Per-tenant fanouts that would have been skipped if enforce were on "
    "(shadow counter), by task.",
    ["task"],
)

# Counter: generator invocations that bypassed the gate entirely.
_full_fanout_total = Counter(
    "onyx_tenant_work_gating_full_fanout_total",
    "Generator invocations that bypassed the gate for a full fanout cycle, by task.",
    ["task"],
)
def _now_ms() -> int:
return int(time.time() * 1000)
@@ -90,14 +54,10 @@ def mark_tenant_active(tenant_id: str) -> None:
logger.exception(f"mark_tenant_active failed: tenant_id={tenant_id}")
def maybe_mark_tenant_active(tenant_id: str, caller: str = "unknown") -> None:
def maybe_mark_tenant_active(tenant_id: str) -> None:
"""Convenience wrapper for writer call sites: records the tenant only
when the feature flag is on. Fully defensive — never raises, so a Redis
outage or flag-read failure can't abort the calling task.
`caller` labels the Prometheus counter so a dashboard can show which
consumer is firing the hook most.
"""
outage or flag-read failure can't abort the calling task."""
try:
# Local import to avoid a module-load cycle: OnyxRuntime imports
# onyx.redis.redis_pool, so a top-level import here would wedge on
@@ -107,44 +67,10 @@ def maybe_mark_tenant_active(tenant_id: str, caller: str = "unknown") -> None:
if not OnyxRuntime.get_tenant_work_gating_enabled():
return
mark_tenant_active(tenant_id)
_marked_total.labels(caller=caller).inc()
except Exception:
logger.exception(f"maybe_mark_tenant_active failed: tenant_id={tenant_id}")
def observe_active_set_size() -> int | None:
    """Read `ZCARD active_tenants`, publish it to the gauge, and return it.

    Intended to be called by the gate generator once per invocation so the
    dashboard has a live reading.

    Returns:
        The set cardinality, or `None` in single-tenant mode or when the
        read or gauge update raises (the failure is logged and the gauge is
        left as it was).
    """
    if MULTI_TENANT:
        try:
            cardinality = cast(int, _client().zcard(_SET_KEY))
            _active_set_size.set(cardinality)
        except Exception:
            logger.exception("observe_active_set_size failed")
        else:
            return cardinality
    return None
def record_gate_decision(task_name: str, skipped: bool) -> None:
    """Count one gate decision for `task_name`.

    The shadow ("would skip") counter is bumped on every call. The enforced
    skip counter is bumped in addition only when `skipped` is true.
    """
    _would_skip_total.labels(task=task_name).inc()
    if not skipped:
        return
    _skipped_total.labels(task=task_name).inc()
def record_full_fanout_cycle(task_name: str) -> None:
    """Count one full-fanout cycle for `task_name`.

    Meant to be called once per generator invocation in which the gate was
    bypassed (interval elapsed or fail-open).
    """
    fanout_counter = _full_fanout_total.labels(task=task_name)
    fanout_counter.inc()
def get_active_tenants(ttl_seconds: int) -> set[str] | None:
"""Return tenants whose last-seen timestamp is within `ttl_seconds` of
now.

View File

@@ -584,7 +584,7 @@ def associate_credential_to_connector(
# Tenant-work-gating lifecycle hook: keep new-tenant latency to
# seconds instead of one full-fanout interval.
maybe_mark_tenant_active(tenant_id, caller="cc_pair_lifecycle")
maybe_mark_tenant_active(tenant_id)
# trigger indexing immediately
client_app.send_task(

View File

@@ -1643,7 +1643,7 @@ def create_connector_with_mock_credential(
# Tenant-work-gating lifecycle hook: keep new-tenant latency to
# seconds instead of one full-fanout interval.
maybe_mark_tenant_active(tenant_id, caller="cc_pair_lifecycle")
maybe_mark_tenant_active(tenant_id)
# trigger indexing immediately
client_app.send_task(

View File

@@ -40,8 +40,6 @@ from sqlalchemy.orm import Session
from onyx.auth.permissions import require_permission
from onyx.background.celery.versioned_apps.client import app as celery_app
from onyx.configs.app_configs import MAX_EMBEDDED_IMAGES_PER_FILE
from onyx.configs.app_configs import MAX_EMBEDDED_IMAGES_PER_UPLOAD
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import OnyxCeleryQueues
from onyx.configs.constants import OnyxCeleryTask
@@ -53,9 +51,6 @@ from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.enums import Permission
from onyx.db.models import User
from onyx.document_index.interfaces import DocumentMetadata
from onyx.error_handling.error_codes import OnyxErrorCode
from onyx.error_handling.exceptions import OnyxError
from onyx.file_processing.extract_file_text import count_pdf_embedded_images
from onyx.server.features.build.configs import USER_LIBRARY_MAX_FILE_SIZE_BYTES
from onyx.server.features.build.configs import USER_LIBRARY_MAX_FILES_PER_UPLOAD
from onyx.server.features.build.configs import USER_LIBRARY_MAX_TOTAL_SIZE_BYTES
@@ -133,49 +128,6 @@ class DeleteFileResponse(BaseModel):
# =============================================================================
def _looks_like_pdf(filename: str, content_type: str | None) -> bool:
"""True if either the filename or the content-type indicates a PDF.
Client-supplied ``content_type`` can be spoofed (e.g. a PDF uploaded with
``Content-Type: application/octet-stream``), so we also fall back to
extension-based detection via ``mimetypes.guess_type`` on the filename.
"""
if content_type == "application/pdf":
return True
guessed, _ = mimetypes.guess_type(filename)
return guessed == "application/pdf"
def _check_pdf_image_caps(
    filename: str, content: bytes, content_type: str | None, batch_total: int
) -> int:
    """Validate a file against the per-file and per-batch embedded-image caps.

    Files that do not look like PDFs are not inspected and count as 0.

    Args:
        filename: Upload name, used for PDF detection and in error messages.
        content: Raw file bytes.
        content_type: Declared MIME type, if any.
        batch_total: Images already counted for earlier files in this batch.

    Returns:
        The embedded-image count for this file, for the caller to add to its
        running batch total.

    Raises:
        OnyxError: with `INVALID_INPUT` when this file alone exceeds the
            per-file cap, or when adding it would exceed the per-batch cap.
    """
    if not _looks_like_pdf(filename, content_type):
        return 0

    per_file_limit = MAX_EMBEDDED_IMAGES_PER_FILE
    per_batch_limit = MAX_EMBEDDED_IMAGES_PER_UPLOAD

    # Pass the larger limit to the counter so the result is meaningful for
    # both comparisons below.
    image_count = count_pdf_embedded_images(
        BytesIO(content), max(per_file_limit, per_batch_limit)
    )

    if image_count > per_file_limit:
        message = (
            f"PDF '{filename}' contains too many embedded images "
            f"(more than {per_file_limit}). Try splitting the document into smaller files."
        )
        raise OnyxError(OnyxErrorCode.INVALID_INPUT, message)

    if batch_total + image_count > per_batch_limit:
        message = (
            f"Upload would exceed the {per_batch_limit}-image limit across all "
            f"files in this batch. Try uploading fewer image-heavy files at once."
        )
        raise OnyxError(OnyxErrorCode.INVALID_INPUT, message)

    return image_count
def _sanitize_path(path: str) -> str:
"""Sanitize a file path, removing traversal attempts and normalizing.
@@ -404,7 +356,6 @@ async def upload_files(
uploaded_entries: list[LibraryEntryResponse] = []
total_size = 0
batch_image_total = 0
now = datetime.now(timezone.utc)
# Sanitize the base path
@@ -424,14 +375,6 @@ async def upload_files(
detail=f"File '{file.filename}' exceeds maximum size of {USER_LIBRARY_MAX_FILE_SIZE_BYTES // (1024 * 1024)}MB",
)
# Reject PDFs with an unreasonable per-file or per-batch image count
batch_image_total += _check_pdf_image_caps(
filename=file.filename or "unnamed",
content=content,
content_type=file.content_type,
batch_total=batch_image_total,
)
# Validate cumulative storage (existing + this upload batch)
total_size += file_size
if existing_usage + total_size > USER_LIBRARY_MAX_TOTAL_SIZE_BYTES:
@@ -530,7 +473,6 @@ async def upload_zip(
uploaded_entries: list[LibraryEntryResponse] = []
total_size = 0
batch_image_total = 0
# Extract zip contents into a subfolder named after the zip file
zip_name = api_sanitize_filename(file.filename or "upload")
@@ -569,36 +511,6 @@ async def upload_zip(
logger.warning(f"Skipping '{zip_info.filename}' - exceeds max size")
continue
# Skip PDFs that would trip the per-file or per-batch image
# cap (would OOM the user-file-processing worker). Matches
# /upload behavior but uses skip-and-warn to stay consistent
# with the zip path's handling of oversized files.
zip_file_name = zip_info.filename.split("/")[-1]
zip_content_type, _ = mimetypes.guess_type(zip_file_name)
if zip_content_type == "application/pdf":
image_count = count_pdf_embedded_images(
BytesIO(file_content),
max(
MAX_EMBEDDED_IMAGES_PER_FILE,
MAX_EMBEDDED_IMAGES_PER_UPLOAD,
),
)
if image_count > MAX_EMBEDDED_IMAGES_PER_FILE:
logger.warning(
"Skipping '%s' - exceeds %d per-file embedded-image cap",
zip_info.filename,
MAX_EMBEDDED_IMAGES_PER_FILE,
)
continue
if batch_image_total + image_count > MAX_EMBEDDED_IMAGES_PER_UPLOAD:
logger.warning(
"Skipping '%s' - would exceed %d per-batch embedded-image cap",
zip_info.filename,
MAX_EMBEDDED_IMAGES_PER_UPLOAD,
)
continue
batch_image_total += image_count
total_size += file_size
# Validate cumulative storage

View File

@@ -113,7 +113,7 @@ def cleanup_idle_sandboxes_task(self: Task, *, tenant_id: str) -> None: # noqa:
# Tenant-work-gating hook: refresh this tenant's active-set
# membership whenever sandbox cleanup has work to do.
maybe_mark_tenant_active(tenant_id, caller="sandbox_cleanup")
maybe_mark_tenant_active(tenant_id)
task_logger.info(
f"Found {len(idle_sandboxes)} idle sandboxes to put to sleep"

View File

@@ -9,10 +9,7 @@ from pydantic import ConfigDict
from pydantic import Field
from sqlalchemy.orm import Session
from onyx.configs.app_configs import MAX_EMBEDDED_IMAGES_PER_FILE
from onyx.configs.app_configs import MAX_EMBEDDED_IMAGES_PER_UPLOAD
from onyx.db.llm import fetch_default_llm_model
from onyx.file_processing.extract_file_text import count_pdf_embedded_images
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.extract_file_text import get_file_ext
from onyx.file_processing.file_types import OnyxFileExtensions
@@ -193,11 +190,6 @@ def categorize_uploaded_files(
token_threshold_k * 1000 if token_threshold_k else None
) # 0 → None = no limit
# Running total of embedded images across PDFs in this batch. Once the
# aggregate cap is reached, subsequent PDFs in the same upload are
# rejected even if they'd individually fit under MAX_EMBEDDED_IMAGES_PER_FILE.
batch_image_total = 0
for upload in files:
try:
filename = get_safe_filename(upload)
@@ -260,47 +252,6 @@ def categorize_uploaded_files(
)
continue
# Reject PDFs with an unreasonable number of embedded images
# (either per-file or accumulated across this upload batch).
# A PDF with thousands of embedded images can OOM the
# user-file-processing celery worker because every image is
# decoded with PIL and then sent to the vision LLM.
if extension == ".pdf":
file_cap = MAX_EMBEDDED_IMAGES_PER_FILE
batch_cap = MAX_EMBEDDED_IMAGES_PER_UPLOAD
# Use the larger of the two caps as the short-circuit
# threshold so we get a useful count for both checks.
# count_pdf_embedded_images restores the stream position.
count = count_pdf_embedded_images(
upload.file, max(file_cap, batch_cap)
)
if count > file_cap:
results.rejected.append(
RejectedFile(
filename=filename,
reason=(
f"PDF contains too many embedded images "
f"(more than {file_cap}). Try splitting "
f"the document into smaller files."
),
)
)
continue
if batch_image_total + count > batch_cap:
results.rejected.append(
RejectedFile(
filename=filename,
reason=(
f"Upload would exceed the "
f"{batch_cap}-image limit across all "
f"files in this batch. Try uploading "
f"fewer image-heavy files at once."
),
)
)
continue
batch_image_total += count
text_content = extract_file_text(
file=upload.file,
file_name=filename,

View File

@@ -3,7 +3,6 @@ from fastapi import Depends
from sqlalchemy.orm import Session
from onyx.auth.permissions import require_permission
from onyx.configs.app_configs import ONYX_DISABLE_VESPA
from onyx.db.engine.sql_engine import get_session
from onyx.db.enums import Permission
from onyx.db.models import User
@@ -50,7 +49,6 @@ def get_opensearch_retrieval_status(
enable_opensearch_retrieval = get_opensearch_retrieval_state(db_session)
return OpenSearchRetrievalStatusResponse(
enable_opensearch_retrieval=enable_opensearch_retrieval,
toggling_retrieval_is_disabled=ONYX_DISABLE_VESPA,
)
@@ -65,5 +63,4 @@ def set_opensearch_retrieval_status(
)
return OpenSearchRetrievalStatusResponse(
enable_opensearch_retrieval=request.enable_opensearch_retrieval,
toggling_retrieval_is_disabled=ONYX_DISABLE_VESPA,
)

View File

@@ -19,4 +19,3 @@ class OpenSearchRetrievalStatusRequest(BaseModel):
class OpenSearchRetrievalStatusResponse(BaseModel):
model_config = {"frozen": True}
enable_opensearch_retrieval: bool
toggling_retrieval_is_disabled: bool = False

View File

@@ -1,151 +0,0 @@
"""Prometheus metrics for embedding generation latency and throughput.
Tracks client-side round-trip latency (as seen by callers of
``EmbeddingModel.encode``) and server-side execution time (as measured inside
the model server for the local-model path). Both API-provider and local-model
paths flow through the client-side metric; only the local path populates the
server-side metric.
"""
import logging
from collections.abc import Generator
from contextlib import contextmanager
from prometheus_client import Counter
from prometheus_client import Gauge
from prometheus_client import Histogram
from shared_configs.enums import EmbeddingProvider
from shared_configs.enums import EmbedTextType
# Module-level logger; used below to report (rather than raise) metric failures.
logger = logging.getLogger(__name__)
# Provider label value used when no EmbeddingProvider is supplied.
LOCAL_PROVIDER_LABEL = "local"
# Histogram bucket upper bounds for the client-side duration metric. The
# metric name ends in ``_seconds``, so these range from 5 ms to 25 s.
_EMBEDDING_LATENCY_BUCKETS = (
0.005,
0.01,
0.025,
0.05,
0.1,
0.25,
0.5,
1.0,
2.5,
5.0,
10.0,
25.0,
)
# Label names shared by the metrics declared below.
PROVIDER_LABEL_NAME = "provider"
TEXT_TYPE_LABEL_NAME = "text_type"
STATUS_LABEL_NAME = "status"
# Latency histogram for one embedding batch, labeled by provider and text type.
_client_duration = Histogram(
"onyx_embedding_client_duration_seconds",
"Client-side end-to-end latency of an embedding batch as seen by the caller.",
[PROVIDER_LABEL_NAME, TEXT_TYPE_LABEL_NAME],
buckets=_EMBEDDING_LATENCY_BUCKETS,
)
# Batch request counter; the extra ``status`` label separates outcomes.
_embedding_requests_total = Counter(
"onyx_embedding_requests_total",
"Total embedding batch requests, labeled by outcome.",
[PROVIDER_LABEL_NAME, TEXT_TYPE_LABEL_NAME, STATUS_LABEL_NAME],
)
# Counter of individual texts submitted for embedding.
_embedding_texts_total = Counter(
"onyx_embedding_texts_total",
"Total number of individual texts submitted for embedding.",
[PROVIDER_LABEL_NAME, TEXT_TYPE_LABEL_NAME],
)
# Counter of input characters submitted for embedding.
_embedding_input_chars_total = Counter(
"onyx_embedding_input_chars_total",
"Total number of input characters submitted for embedding.",
[PROVIDER_LABEL_NAME, TEXT_TYPE_LABEL_NAME],
)
# Gauge of embedding batches currently in flight.
_embeddings_in_progress = Gauge(
"onyx_embeddings_in_progress",
"Number of embedding batches currently in-flight.",
[PROVIDER_LABEL_NAME, TEXT_TYPE_LABEL_NAME],
)
def provider_label(provider: EmbeddingProvider | None) -> str:
    """Return the metric label value for an embedding provider.

    Args:
        provider: The embedding provider, or ``None`` when no provider is set.

    Returns:
        ``LOCAL_PROVIDER_LABEL`` when ``provider`` is ``None``, otherwise the
        provider enum's ``value``.
    """
    return LOCAL_PROVIDER_LABEL if provider is None else provider.value
def observe_embedding_client(
    provider: EmbeddingProvider | None,
    text_type: EmbedTextType,
    duration_s: float,
    num_texts: int,
    num_chars: int,
    success: bool,
) -> None:
    """Record the metrics for one finished embedding batch.

    The request counter and the latency histogram are updated for every
    batch. The text and character counters are only updated when the batch
    succeeded. Any exception raised while recording is logged as a warning
    and not propagated.

    Args:
        provider: The embedding provider, or ``None`` for the local model path.
        text_type: Whether this was a query- or passage-style embedding.
        duration_s: Wall-clock duration measured on the client side, in seconds.
        num_texts: Number of texts in the batch.
        num_chars: Total number of input characters in the batch.
        success: Whether the embedding call succeeded.
    """
    try:
        base_labels = {
            "provider": provider_label(provider),
            "text_type": text_type.value,
        }
        outcome = "success" if success else "failure"

        _embedding_requests_total.labels(**base_labels, status=outcome).inc()
        _client_duration.labels(**base_labels).observe(duration_s)

        # Volume counters only reflect batches that were actually embedded.
        if not success:
            return
        _embedding_texts_total.labels(**base_labels).inc(num_texts)
        _embedding_input_chars_total.labels(**base_labels).inc(num_chars)
    except Exception:
        logger.warning("Failed to record embedding client metrics.", exc_info=True)
@contextmanager
def track_embedding_in_progress(
    provider: EmbeddingProvider | None,
    text_type: EmbedTextType,
) -> Generator[None, None, None]:
    """Count an embedding batch as in-flight for the duration of the block.

    The in-progress gauge is incremented on entry and decremented on exit.
    Failures to touch the gauge are logged as warnings and never raised. The
    decrement is skipped when the increment did not succeed.

    Args:
        provider: The embedding provider, or ``None`` for the local model path.
        text_type: The embedding text type used as the ``text_type`` label.
    """
    gauge_labels = {
        "provider": provider_label(provider),
        "text_type": text_type.value,
    }

    try:
        _embeddings_in_progress.labels(**gauge_labels).inc()
    except Exception:
        logger.warning(
            "Failed to increment in-progress embedding gauge.", exc_info=True
        )
        gauge_bumped = False
    else:
        gauge_bumped = True

    try:
        yield
    finally:
        # Only undo an increment that actually happened.
        if gauge_bumped:
            try:
                _embeddings_in_progress.labels(**gauge_labels).dec()
            except Exception:
                logger.warning(
                    "Failed to decrement in-progress embedding gauge.", exc_info=True
                )

View File

@@ -395,15 +395,6 @@ class WorkerHealthCollector(_CachedCollector):
Reads worker status from ``WorkerHeartbeatMonitor`` which listens
to the Celery event stream via a single persistent connection.
TODO: every monitoring pod subscribes to the cluster-wide Celery event
stream, so each replica reports health for *all* workers in the cluster,
not just itself. Prometheus distinguishes the replicas via the ``instance``
label, so this doesn't break scraping, but it means N monitoring replicas
do N× the work and may emit slightly inconsistent snapshots of the same
cluster. The proper fix is to have each worker expose its own health (or
to elect a single monitoring replica as the reporter) rather than
broadcasting the full cluster view from every monitoring pod.
"""
def __init__(self, cache_ttl: float = 30.0) -> None:
@@ -422,16 +413,10 @@ class WorkerHealthCollector(_CachedCollector):
"onyx_celery_active_worker_count",
"Number of active Celery workers with recent heartbeats",
)
# Celery hostnames are ``{worker_type}@{nodename}`` (see supervisord.conf).
# Emitting only the worker_type as a label causes N replicas of the same
# type to collapse into identical timeseries within a single scrape,
# which Prometheus rejects as "duplicate sample for timestamp". Split
# the pieces into separate labels so each replica is distinct; callers
# can still ``sum by (worker_type)`` to recover the old aggregated view.
worker_up = GaugeMetricFamily(
"onyx_celery_worker_up",
"Whether a specific Celery worker is alive (1=up, 0=down)",
labels=["worker_type", "hostname"],
labels=["worker"],
)
try:
@@ -439,15 +424,11 @@ class WorkerHealthCollector(_CachedCollector):
alive_count = sum(1 for alive in status.values() if alive)
active_workers.add_metric([], alive_count)
for full_hostname in sorted(status):
worker_type, sep, host = full_hostname.partition("@")
if not sep:
# Hostname didn't contain "@" — fall back to using the
# whole string as the hostname with an empty type.
worker_type, host = "", full_hostname
worker_up.add_metric(
[worker_type, host], 1 if status[full_hostname] else 0
)
for hostname in sorted(status):
# Use short name (before @) for single-host deployments,
# full hostname when multiple hosts share a worker type.
label = hostname.split("@")[0]
worker_up.add_metric([label], 1 if status[hostname] else 0)
except Exception:
logger.debug("Failed to collect worker health metrics", exc_info=True)

View File

@@ -13,7 +13,7 @@ from onyx.configs.constants import PUBLIC_API_TAGS
from onyx.connectors.models import Document
from onyx.connectors.models import IndexAttemptMetadata
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
from onyx.db.document import delete_documents_complete__no_commit
from onyx.db.document import delete_documents_complete
from onyx.db.document import get_document
from onyx.db.document import get_documents_by_cc_pair
from onyx.db.document import get_ingestion_documents
@@ -210,5 +210,4 @@ def delete_ingestion_doc(
)
# Delete from database
delete_documents_complete__no_commit(db_session, [document_id])
db_session.commit()
delete_documents_complete(db_session, [document_id])

View File

@@ -7,7 +7,6 @@ from onyx.configs.app_configs import DISABLE_VECTOR_DB
from onyx.configs.app_configs import ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
from onyx.configs.app_configs import INTEGRATION_TESTS_MODE
from onyx.configs.app_configs import MANAGED_VESPA
from onyx.configs.app_configs import ONYX_DISABLE_VESPA
from onyx.configs.app_configs import VESPA_NUM_ATTEMPTS_ON_STARTUP
from onyx.configs.constants import KV_REINDEX_KEY
from onyx.configs.embedding_configs import SUPPORTED_EMBEDDING_MODELS
@@ -127,11 +126,10 @@ def setup_onyx(
"DISABLE_VECTOR_DB is set — skipping document index setup and embedding model warm-up."
)
else:
# Ensure the document indices are setup correctly. This step is
# relatively near the end because Vespa takes a bit of time to start up.
# Ensure Vespa is setup correctly, this step is relatively near the end
# because Vespa takes a bit of time to start up
logger.notice("Verifying Document Index(s) is/are available.")
# This flow is for setting up the document index so we get all indices
# here.
# This flow is for setting up the document index so we get all indices here.
document_indices = get_all_document_indices(
search_settings,
secondary_search_settings,
@@ -337,7 +335,7 @@ def setup_multitenant_onyx() -> None:
# For Managed Vespa, the schema is sent over via the Vespa Console manually.
# NOTE: Pretty sure this code is never hit in any production environment.
if not MANAGED_VESPA and not ONYX_DISABLE_VESPA:
if not MANAGED_VESPA:
setup_vespa_multitenant(SUPPORTED_EMBEDDING_MODELS)

View File

@@ -34,7 +34,6 @@ R = TypeVar("R")
KT = TypeVar("KT") # Key type
VT = TypeVar("VT") # Value type
_T = TypeVar("_T") # Default type
_MISSING: object = object()
class ThreadSafeDict(MutableMapping[KT, VT]):
@@ -118,10 +117,10 @@ class ThreadSafeDict(MutableMapping[KT, VT]):
with self.lock:
return self._dict.get(key, default)
def pop(self, key: KT, default: Any = _MISSING) -> Any:
def pop(self, key: KT, default: Any = None) -> Any:
"""Remove and return a value with optional default, atomically."""
with self.lock:
if default is _MISSING:
if default is None:
return self._dict.pop(key)
return self._dict.pop(key, default)

View File

@@ -60,7 +60,7 @@ attrs==25.4.0
# jsonschema
# referencing
# zeep
authlib==1.6.11
authlib==1.6.9
# via fastmcp
azure-cognitiveservices-speech==1.38.0
babel==2.17.0
@@ -214,9 +214,7 @@ distro==1.9.0
dnspython==2.8.0
# via email-validator
docstring-parser==0.17.0
# via
# cyclopts
# google-cloud-aiplatform
# via cyclopts
docutils==0.22.3
# via rich-rst
dropbox==12.0.2
@@ -272,13 +270,7 @@ gitdb==4.0.12
gitpython==3.1.45
# via braintrust
google-api-core==2.28.1
# via
# google-api-python-client
# google-cloud-aiplatform
# google-cloud-bigquery
# google-cloud-core
# google-cloud-resource-manager
# google-cloud-storage
# via google-api-python-client
google-api-python-client==2.86.0
google-auth==2.48.0
# via
@@ -286,61 +278,21 @@ google-auth==2.48.0
# google-api-python-client
# google-auth-httplib2
# google-auth-oauthlib
# google-cloud-aiplatform
# google-cloud-bigquery
# google-cloud-core
# google-cloud-resource-manager
# google-cloud-storage
# google-genai
# kubernetes
google-auth-httplib2==0.1.0
# via google-api-python-client
google-auth-oauthlib==1.0.0
google-cloud-aiplatform==1.133.0
# via litellm
google-cloud-bigquery==3.41.0
# via google-cloud-aiplatform
google-cloud-core==2.5.1
# via
# google-cloud-bigquery
# google-cloud-storage
google-cloud-resource-manager==1.17.0
# via google-cloud-aiplatform
google-cloud-storage==3.10.1
# via google-cloud-aiplatform
google-crc32c==1.8.0
# via
# google-cloud-storage
# google-resumable-media
google-genai==1.52.0
# via
# google-cloud-aiplatform
# onyx
google-resumable-media==2.8.2
# via
# google-cloud-bigquery
# google-cloud-storage
# via onyx
googleapis-common-protos==1.72.0
# via
# google-api-core
# grpc-google-iam-v1
# grpcio-status
# opentelemetry-exporter-otlp-proto-http
greenlet==3.2.4
# via
# playwright
# sqlalchemy
grpc-google-iam-v1==0.14.4
# via google-cloud-resource-manager
grpcio==1.80.0
# via
# google-api-core
# google-cloud-resource-manager
# googleapis-common-protos
# grpc-google-iam-v1
# grpcio-status
grpcio-status==1.80.0
# via google-api-core
h11==0.16.0
# via
# httpcore
@@ -491,7 +443,7 @@ magika==0.6.3
# via markitdown
makefun==1.16.0
# via fastapi-users
mako==1.3.11
mako==1.2.4
# via alembic
mammoth==1.11.0
# via markitdown
@@ -607,8 +559,6 @@ packaging==24.2
# dask
# distributed
# fastmcp
# google-cloud-aiplatform
# google-cloud-bigquery
# huggingface-hub
# jira
# kombu
@@ -655,19 +605,12 @@ propcache==0.4.1
# aiohttp
# yarl
proto-plus==1.26.1
# via
# google-api-core
# google-cloud-aiplatform
# google-cloud-resource-manager
# via google-api-core
protobuf==6.33.5
# via
# ddtrace
# google-api-core
# google-cloud-aiplatform
# google-cloud-resource-manager
# googleapis-common-protos
# grpc-google-iam-v1
# grpcio-status
# onnxruntime
# opentelemetry-proto
# proto-plus
@@ -700,7 +643,6 @@ pydantic==2.11.7
# exa-py
# fastapi
# fastmcp
# google-cloud-aiplatform
# google-genai
# langchain-core
# langfuse
@@ -737,7 +679,7 @@ pynacl==1.6.2
pypandoc-binary==1.16.2
pyparsing==3.2.5
# via httplib2
pypdf==6.10.2
pypdf==6.10.0
# via unstructured-client
pyperclip==1.11.0
# via fastmcp
@@ -759,7 +701,6 @@ python-dateutil==2.8.2
# botocore
# celery
# dateparser
# google-cloud-bigquery
# htmldate
# hubspot-api-client
# kubernetes
@@ -838,8 +779,6 @@ requests==2.33.0
# dropbox
# exa-py
# google-api-core
# google-cloud-bigquery
# google-cloud-storage
# google-genai
# hubspot-api-client
# jira
@@ -1012,9 +951,7 @@ typing-extensions==4.15.0
# exa-py
# exceptiongroup
# fastapi
# google-cloud-aiplatform
# google-genai
# grpcio
# huggingface-hub
# jira
# langchain-core

View File

@@ -114,8 +114,6 @@ distlib==0.4.0
# via virtualenv
distro==1.9.0
# via openai
docstring-parser==0.17.0
# via google-cloud-aiplatform
durationpy==0.10
# via kubernetes
execnet==2.1.2
@@ -143,65 +141,14 @@ frozenlist==1.8.0
# aiosignal
fsspec==2025.10.0
# via huggingface-hub
google-api-core==2.28.1
# via
# google-cloud-aiplatform
# google-cloud-bigquery
# google-cloud-core
# google-cloud-resource-manager
# google-cloud-storage
google-auth==2.48.0
# via
# google-api-core
# google-cloud-aiplatform
# google-cloud-bigquery
# google-cloud-core
# google-cloud-resource-manager
# google-cloud-storage
# google-genai
# kubernetes
google-cloud-aiplatform==1.133.0
# via litellm
google-cloud-bigquery==3.41.0
# via google-cloud-aiplatform
google-cloud-core==2.5.1
# via
# google-cloud-bigquery
# google-cloud-storage
google-cloud-resource-manager==1.17.0
# via google-cloud-aiplatform
google-cloud-storage==3.10.1
# via google-cloud-aiplatform
google-crc32c==1.8.0
# via
# google-cloud-storage
# google-resumable-media
google-genai==1.52.0
# via
# google-cloud-aiplatform
# onyx
google-resumable-media==2.8.2
# via
# google-cloud-bigquery
# google-cloud-storage
googleapis-common-protos==1.72.0
# via
# google-api-core
# grpc-google-iam-v1
# grpcio-status
# via onyx
greenlet==3.2.4 ; platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'
# via sqlalchemy
grpc-google-iam-v1==0.14.4
# via google-cloud-resource-manager
grpcio==1.80.0
# via
# google-api-core
# google-cloud-resource-manager
# googleapis-common-protos
# grpc-google-iam-v1
# grpcio-status
grpcio-status==1.80.0
# via google-api-core
h11==0.16.0
# via
# httpcore
@@ -271,7 +218,7 @@ kubernetes==31.0.0
# via onyx
litellm==1.81.6
# via onyx
mako==1.3.11
mako==1.2.4
# via alembic
manygo==0.2.0
markdown-it-py==4.0.0
@@ -320,8 +267,6 @@ openapi-generator-cli==7.17.0
packaging==24.2
# via
# black
# google-cloud-aiplatform
# google-cloud-bigquery
# hatchling
# huggingface-hub
# ipykernel
@@ -362,20 +307,6 @@ propcache==0.4.1
# via
# aiohttp
# yarl
proto-plus==1.26.1
# via
# google-api-core
# google-cloud-aiplatform
# google-cloud-resource-manager
protobuf==6.33.5
# via
# google-api-core
# google-cloud-aiplatform
# google-cloud-resource-manager
# googleapis-common-protos
# grpc-google-iam-v1
# grpcio-status
# proto-plus
psutil==7.1.3
# via ipykernel
ptyprocess==0.7.0 ; sys_platform != 'emscripten' and sys_platform != 'win32'
@@ -397,7 +328,6 @@ pydantic==2.11.7
# agent-client-protocol
# cohere
# fastapi
# google-cloud-aiplatform
# google-genai
# litellm
# mcp
@@ -434,7 +364,6 @@ python-dateutil==2.8.2
# via
# aiobotocore
# botocore
# google-cloud-bigquery
# jupyter-client
# kubernetes
# matplotlib
@@ -469,9 +398,6 @@ reorder-python-imports-black==3.14.0
requests==2.33.0
# via
# cohere
# google-api-core
# google-cloud-bigquery
# google-cloud-storage
# google-genai
# kubernetes
# requests-oauthlib
@@ -572,9 +498,7 @@ typing-extensions==4.15.0
# celery-types
# cohere
# fastapi
# google-cloud-aiplatform
# google-genai
# grpcio
# huggingface-hub
# ipython
# mcp

View File

@@ -87,8 +87,6 @@ discord-py==2.4.0
# via onyx
distro==1.9.0
# via openai
docstring-parser==0.17.0
# via google-cloud-aiplatform
durationpy==0.10
# via kubernetes
fastapi==0.133.1
@@ -105,63 +103,12 @@ frozenlist==1.8.0
# aiosignal
fsspec==2025.10.0
# via huggingface-hub
google-api-core==2.28.1
# via
# google-cloud-aiplatform
# google-cloud-bigquery
# google-cloud-core
# google-cloud-resource-manager
# google-cloud-storage
google-auth==2.48.0
# via
# google-api-core
# google-cloud-aiplatform
# google-cloud-bigquery
# google-cloud-core
# google-cloud-resource-manager
# google-cloud-storage
# google-genai
# kubernetes
google-cloud-aiplatform==1.133.0
# via litellm
google-cloud-bigquery==3.41.0
# via google-cloud-aiplatform
google-cloud-core==2.5.1
# via
# google-cloud-bigquery
# google-cloud-storage
google-cloud-resource-manager==1.17.0
# via google-cloud-aiplatform
google-cloud-storage==3.10.1
# via google-cloud-aiplatform
google-crc32c==1.8.0
# via
# google-cloud-storage
# google-resumable-media
google-genai==1.52.0
# via
# google-cloud-aiplatform
# onyx
google-resumable-media==2.8.2
# via
# google-cloud-bigquery
# google-cloud-storage
googleapis-common-protos==1.72.0
# via
# google-api-core
# grpc-google-iam-v1
# grpcio-status
grpc-google-iam-v1==0.14.4
# via google-cloud-resource-manager
grpcio==1.80.0
# via
# google-api-core
# google-cloud-resource-manager
# googleapis-common-protos
# grpc-google-iam-v1
# grpcio-status
grpcio-status==1.80.0
# via google-api-core
# via onyx
h11==0.16.0
# via
# httpcore
@@ -237,10 +184,7 @@ openai==2.14.0
# litellm
# onyx
packaging==24.2
# via
# google-cloud-aiplatform
# google-cloud-bigquery
# huggingface-hub
# via huggingface-hub
parameterized==0.9.0
# via cohere
posthog==3.7.4
@@ -254,20 +198,6 @@ propcache==0.4.1
# via
# aiohttp
# yarl
proto-plus==1.26.1
# via
# google-api-core
# google-cloud-aiplatform
# google-cloud-resource-manager
protobuf==6.33.5
# via
# google-api-core
# google-cloud-aiplatform
# google-cloud-resource-manager
# googleapis-common-protos
# grpc-google-iam-v1
# grpcio-status
# proto-plus
py==1.11.0
# via retry
pyasn1==0.6.3
@@ -283,7 +213,6 @@ pydantic==2.11.7
# agent-client-protocol
# cohere
# fastapi
# google-cloud-aiplatform
# google-genai
# litellm
# mcp
@@ -302,7 +231,6 @@ python-dateutil==2.8.2
# via
# aiobotocore
# botocore
# google-cloud-bigquery
# kubernetes
# posthog
python-dotenv==1.1.1
@@ -326,9 +254,6 @@ regex==2025.11.3
requests==2.33.0
# via
# cohere
# google-api-core
# google-cloud-bigquery
# google-cloud-storage
# google-genai
# kubernetes
# posthog
@@ -393,9 +318,7 @@ typing-extensions==4.15.0
# anyio
# cohere
# fastapi
# google-cloud-aiplatform
# google-genai
# grpcio
# huggingface-hub
# mcp
# openai

View File

@@ -102,8 +102,6 @@ discord-py==2.4.0
# via onyx
distro==1.9.0
# via openai
docstring-parser==0.17.0
# via google-cloud-aiplatform
durationpy==0.10
# via kubernetes
einops==0.8.1
@@ -127,63 +125,12 @@ fsspec==2025.10.0
# via
# huggingface-hub
# torch
google-api-core==2.28.1
# via
# google-cloud-aiplatform
# google-cloud-bigquery
# google-cloud-core
# google-cloud-resource-manager
# google-cloud-storage
google-auth==2.48.0
# via
# google-api-core
# google-cloud-aiplatform
# google-cloud-bigquery
# google-cloud-core
# google-cloud-resource-manager
# google-cloud-storage
# google-genai
# kubernetes
google-cloud-aiplatform==1.133.0
# via litellm
google-cloud-bigquery==3.41.0
# via google-cloud-aiplatform
google-cloud-core==2.5.1
# via
# google-cloud-bigquery
# google-cloud-storage
google-cloud-resource-manager==1.17.0
# via google-cloud-aiplatform
google-cloud-storage==3.10.1
# via google-cloud-aiplatform
google-crc32c==1.8.0
# via
# google-cloud-storage
# google-resumable-media
google-genai==1.52.0
# via
# google-cloud-aiplatform
# onyx
google-resumable-media==2.8.2
# via
# google-cloud-bigquery
# google-cloud-storage
googleapis-common-protos==1.72.0
# via
# google-api-core
# grpc-google-iam-v1
# grpcio-status
grpc-google-iam-v1==0.14.4
# via google-cloud-resource-manager
grpcio==1.80.0
# via
# google-api-core
# google-cloud-resource-manager
# googleapis-common-protos
# grpc-google-iam-v1
# grpcio-status
grpcio-status==1.80.0
# via google-api-core
# via onyx
h11==0.16.0
# via
# httpcore
@@ -318,8 +265,6 @@ openai==2.14.0
packaging==24.2
# via
# accelerate
# google-cloud-aiplatform
# google-cloud-bigquery
# huggingface-hub
# kombu
# transformers
@@ -337,20 +282,6 @@ propcache==0.4.1
# via
# aiohttp
# yarl
proto-plus==1.26.1
# via
# google-api-core
# google-cloud-aiplatform
# google-cloud-resource-manager
protobuf==6.33.5
# via
# google-api-core
# google-cloud-aiplatform
# google-cloud-resource-manager
# googleapis-common-protos
# grpc-google-iam-v1
# grpcio-status
# proto-plus
psutil==7.1.3
# via accelerate
py==1.11.0
@@ -368,7 +299,6 @@ pydantic==2.11.7
# agent-client-protocol
# cohere
# fastapi
# google-cloud-aiplatform
# google-genai
# litellm
# mcp
@@ -388,7 +318,6 @@ python-dateutil==2.8.2
# aiobotocore
# botocore
# celery
# google-cloud-bigquery
# kubernetes
python-dotenv==1.1.1
# via
@@ -415,9 +344,6 @@ regex==2025.11.3
requests==2.33.0
# via
# cohere
# google-api-core
# google-cloud-bigquery
# google-cloud-storage
# google-genai
# kubernetes
# requests-oauthlib
@@ -511,9 +437,7 @@ typing-extensions==4.15.0
# anyio
# cohere
# fastapi
# google-cloud-aiplatform
# google-genai
# grpcio
# huggingface-hub
# mcp
# openai

View File

@@ -46,7 +46,7 @@ stop_and_remove_containers
# Start the PostgreSQL container with optional volume
echo "Starting PostgreSQL container..."
if [[ -n "$POSTGRES_VOLUME" ]]; then
docker run -p 5432:5432 --name onyx_postgres -e POSTGRES_PASSWORD=password -d -v "$POSTGRES_VOLUME":/var/lib/postgresql/data postgres -c max_connections=250
docker run -p 5432:5432 --name onyx_postgres -e POSTGRES_PASSWORD=password -d -v $POSTGRES_VOLUME:/var/lib/postgresql/data postgres -c max_connections=250
else
docker run -p 5432:5432 --name onyx_postgres -e POSTGRES_PASSWORD=password -d postgres -c max_connections=250
fi
@@ -54,7 +54,7 @@ fi
# Start the Vespa container with optional volume
echo "Starting Vespa container..."
if [[ -n "$VESPA_VOLUME" ]]; then
docker run --detach --name onyx_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 -v "$VESPA_VOLUME":/opt/vespa/var vespaengine/vespa:8
docker run --detach --name onyx_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 -v $VESPA_VOLUME:/opt/vespa/var vespaengine/vespa:8
else
docker run --detach --name onyx_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 vespaengine/vespa:8
fi
@@ -85,7 +85,7 @@ docker compose -f "$COMPOSE_FILE" -f "$COMPOSE_DEV_FILE" --profile opensearch-en
# Start the Redis container with optional volume
echo "Starting Redis container..."
if [[ -n "$REDIS_VOLUME" ]]; then
docker run --detach --name onyx_redis --publish 6379:6379 -v "$REDIS_VOLUME":/data redis
docker run --detach --name onyx_redis --publish 6379:6379 -v $REDIS_VOLUME:/data redis
else
docker run --detach --name onyx_redis --publish 6379:6379 redis
fi
@@ -93,7 +93,7 @@ fi
# Start the MinIO container with optional volume
echo "Starting MinIO container..."
if [[ -n "$MINIO_VOLUME" ]]; then
docker run --detach --name onyx_minio --publish 9004:9000 --publish 9005:9001 -e MINIO_ROOT_USER=minioadmin -e MINIO_ROOT_PASSWORD=minioadmin -v "$MINIO_VOLUME":/data minio/minio server /data --console-address ":9001"
docker run --detach --name onyx_minio --publish 9004:9000 --publish 9005:9001 -e MINIO_ROOT_USER=minioadmin -e MINIO_ROOT_PASSWORD=minioadmin -v $MINIO_VOLUME:/data minio/minio server /data --console-address ":9001"
else
docker run --detach --name onyx_minio --publish 9004:9000 --publish 9005:9001 -e MINIO_ROOT_USER=minioadmin -e MINIO_ROOT_PASSWORD=minioadmin minio/minio server /data --console-address ":9001"
fi
@@ -111,7 +111,6 @@ sleep 1
# Alembic should be configured in the virtualenv for this repo
if [[ -f "../.venv/bin/activate" ]]; then
# shellcheck source=/dev/null
source ../.venv/bin/activate
else
echo "Warning: Python virtual environment not found at .venv/bin/activate; alembic may not work."

View File

@@ -9,10 +9,8 @@ import pytest
from onyx.configs.constants import BlobType
from onyx.connectors.blob.connector import BlobStorageConnector
from onyx.connectors.cross_connector_utils.tabular_section_utils import is_tabular_file
from onyx.connectors.models import Document
from onyx.connectors.models import HierarchyNode
from onyx.connectors.models import TabularSection
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import get_file_ext
from onyx.file_processing.file_types import OnyxFileExtensions
@@ -113,18 +111,15 @@ def test_blob_s3_connector(
for doc in all_docs:
section = doc.sections[0]
if is_tabular_file(doc.semantic_identifier):
assert isinstance(section, TabularSection)
assert len(section.text) > 0
continue
assert isinstance(section, TextSection)
file_extension = get_file_ext(doc.semantic_identifier)
if file_extension in OnyxFileExtensions.TEXT_AND_DOCUMENT_EXTENSIONS:
assert len(section.text) > 0
else:
assert len(section.text) == 0
continue
# unknown extension
assert len(section.text) == 0
@patch(

View File

@@ -0,0 +1,348 @@
"""External dependency unit tests for the file_id cleanup that runs alongside
document deletion across the three deletion paths:
1. `document_by_cc_pair_cleanup_task` (pruning + connector deletion)
2. `delete_ingestion_doc` (public ingestion API DELETE)
3. `delete_all_documents_for_connector_credential_pair` (index swap)
Each path captures attached `Document.file_id`s before the row is removed and
best-effort deletes the underlying files after the DB commit.
"""
from collections.abc import Generator
from io import BytesIO
from unittest.mock import MagicMock
from unittest.mock import patch
from uuid import uuid4
import pytest
from sqlalchemy.orm import Session
from onyx.background.celery.tasks.shared.tasks import (
document_by_cc_pair_cleanup_task,
)
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import FileOrigin
from onyx.connectors.models import Document
from onyx.connectors.models import IndexAttemptMetadata
from onyx.connectors.models import InputType
from onyx.connectors.models import TextSection
from onyx.db.document import delete_all_documents_for_connector_credential_pair
from onyx.db.document import upsert_document_by_connector_credential_pair
from onyx.db.enums import AccessType
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.file_record import get_filerecord_by_file_id_optional
from onyx.db.models import Connector
from onyx.db.models import ConnectorCredentialPair
from onyx.db.models import Credential
from onyx.db.models import Document as DBDocument
from onyx.db.models import FileRecord
from onyx.file_store.file_store import get_default_file_store
from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
from onyx.server.onyx_api.ingestion import delete_ingestion_doc
from tests.external_dependency_unit.constants import TEST_TENANT_ID
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_doc(
    doc_id: str,
    file_id: str | None = None,
    from_ingestion_api: bool = False,
) -> Document:
    """Build a mock-connector `Document` with one text section.

    Args:
        doc_id: Document id; also embedded in the semantic identifier.
        file_id: Optional id of the file attached to the document.
        from_ingestion_api: Value for the document's `from_ingestion_api` flag.
    """
    body = TextSection(text="content", link=None)
    return Document(
        id=doc_id,
        source=DocumentSource.MOCK_CONNECTOR,
        semantic_identifier=f"semantic-{doc_id}",
        sections=[body],
        metadata={},
        file_id=file_id,
        from_ingestion_api=from_ingestion_api,
    )
def _stage_file(content: bytes = b"raw bytes") -> str:
    """Save `content` to the default file store as an indexing-staging file.

    Returns whatever `save_file` returns; the tests use it as the file id.
    """
    store = get_default_file_store()
    payload = BytesIO(content)
    return store.save_file(
        content=payload,
        display_name=None,
        file_origin=FileOrigin.INDEXING_STAGING,
        file_type="application/octet-stream",
        file_metadata={"test": True},
    )
def _get_doc_row(db_session: Session, doc_id: str) -> DBDocument | None:
    """Return the `DBDocument` row for `doc_id`, or `None` if it is gone.

    The session is expired first so the lookup does not reuse stale state.
    """
    db_session.expire_all()
    matching_rows = db_session.query(DBDocument).filter(DBDocument.id == doc_id)
    return matching_rows.one_or_none()
def _get_filerecord(db_session: Session, file_id: str) -> FileRecord | None:
    """Look up the `FileRecord` for `file_id` after expiring the session."""
    db_session.expire_all()
    record = get_filerecord_by_file_id_optional(
        file_id=file_id,
        db_session=db_session,
    )
    return record
def _index_doc(
    db_session: Session,
    doc: Document,
    attempt_metadata: IndexAttemptMetadata,
) -> None:
    """Push a single doc through the upsert pipeline and commit.

    This creates the document row and its cc_pair mapping, so the deletion
    paths under test have something to find.
    """
    batch = [doc]
    index_doc_batch_prepare(
        documents=batch,
        index_attempt_metadata=attempt_metadata,
        db_session=db_session,
        ignore_time_skip=True,
    )
    db_session.commit()
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _make_cc_pair(db_session: Session) -> ConnectorCredentialPair:
    """Create and commit a fresh connector, credential and cc_pair.

    The connector and credential are flushed before the pair is built,
    because the pair is constructed from their ids. Names carry a random
    suffix.

    Returns:
        The committed and refreshed `ConnectorCredentialPair`.
    """
    new_connector = Connector(
        name=f"test-connector-{uuid4().hex[:8]}",
        source=DocumentSource.MOCK_CONNECTOR,
        input_type=InputType.LOAD_STATE,
        connector_specific_config={},
        refresh_freq=None,
        prune_freq=None,
        indexing_start=None,
    )
    db_session.add(new_connector)
    db_session.flush()

    new_credential = Credential(
        source=DocumentSource.MOCK_CONNECTOR,
        credential_json={},
    )
    db_session.add(new_credential)
    db_session.flush()

    new_pair = ConnectorCredentialPair(
        connector_id=new_connector.id,
        credential_id=new_credential.id,
        name=f"test-cc-pair-{uuid4().hex[:8]}",
        status=ConnectorCredentialPairStatus.ACTIVE,
        access_type=AccessType.PUBLIC,
        auto_sync_options=None,
    )
    db_session.add(new_pair)
    db_session.commit()
    db_session.refresh(new_pair)
    return new_pair
@pytest.fixture
def cc_pair(
    db_session: Session,
    tenant_context: None,  # noqa: ARG001
    initialize_file_store: None,  # noqa: ARG001
) -> Generator[ConnectorCredentialPair, None, None]:
    """Yield a newly created cc_pair.

    `tenant_context` and `initialize_file_store` are requested but not used
    in the body.
    """
    created_pair = _make_cc_pair(db_session)
    yield created_pair
@pytest.fixture
def second_cc_pair(
    db_session: Session,
    tenant_context: None,  # noqa: ARG001
    initialize_file_store: None,  # noqa: ARG001
) -> Generator[ConnectorCredentialPair, None, None]:
    """Yield an additional cc_pair, used to test the count > 1 branch."""
    extra_pair = _make_cc_pair(db_session)
    yield extra_pair
@pytest.fixture
def attempt_metadata(cc_pair: ConnectorCredentialPair) -> IndexAttemptMetadata:
    """Build index-attempt metadata for the `cc_pair` fixture.

    No attempt id is set, and the request id is a fixed test value.
    """
    metadata = IndexAttemptMetadata(
        connector_id=cc_pair.connector_id,
        credential_id=cc_pair.credential_id,
        attempt_id=None,
        request_id="test-request",
    )
    return metadata
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
class TestDeleteAllDocumentsForCcPair:
    """Path 3: bulk delete during index swap (`INSTANT` switchover)."""

    def test_cleans_up_files_for_all_docs(
        self,
        db_session: Session,
        cc_pair: ConnectorCredentialPair,
        attempt_metadata: IndexAttemptMetadata,
    ) -> None:
        """Deleting every doc of a cc_pair also removes each attached file."""
        first_file_id = _stage_file(content=b"a")
        second_file_id = _stage_file(content=b"b")
        first_doc = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=first_file_id)
        second_doc = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=second_file_id)
        for staged_doc in (first_doc, second_doc):
            _index_doc(db_session, staged_doc, attempt_metadata)

        # Precondition: both staged files are present before the delete.
        for staged_file_id in (first_file_id, second_file_id):
            assert _get_filerecord(db_session, staged_file_id) is not None

        delete_all_documents_for_connector_credential_pair(
            db_session=db_session,
            connector_id=cc_pair.connector_id,
            credential_id=cc_pair.credential_id,
        )

        for removed_doc in (first_doc, second_doc):
            assert _get_doc_row(db_session, removed_doc.id) is None
        for removed_file_id in (first_file_id, second_file_id):
            assert _get_filerecord(db_session, removed_file_id) is None

    def test_handles_mixed_docs_with_and_without_file_ids(
        self,
        db_session: Session,
        cc_pair: ConnectorCredentialPair,
        attempt_metadata: IndexAttemptMetadata,
    ) -> None:
        """Docs without file_id should be cleanly removed — no errors,
        no spurious file_store calls."""
        attached_file_id = _stage_file()
        doc_with_file = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=attached_file_id)
        doc_without_file = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=None)
        for staged_doc in (doc_with_file, doc_without_file):
            _index_doc(db_session, staged_doc, attempt_metadata)

        delete_all_documents_for_connector_credential_pair(
            db_session=db_session,
            connector_id=cc_pair.connector_id,
            credential_id=cc_pair.credential_id,
        )

        for removed_doc in (doc_with_file, doc_without_file):
            assert _get_doc_row(db_session, removed_doc.id) is None
        assert _get_filerecord(db_session, attached_file_id) is None
class TestDeleteIngestionDoc:
    """Path 2: public ingestion API DELETE endpoint."""

    def test_cleans_up_file_for_ingestion_api_doc(
        self,
        db_session: Session,
        attempt_metadata: IndexAttemptMetadata,
        tenant_context: None,  # noqa: ARG002
        initialize_file_store: None,  # noqa: ARG002
    ) -> None:
        """Deleting an ingestion-API doc removes its row and its file."""
        staged_file_id = _stage_file()
        ingestion_doc = _make_doc(
            f"doc-{uuid4().hex[:8]}",
            file_id=staged_file_id,
            from_ingestion_api=True,
        )
        _index_doc(db_session, ingestion_doc, attempt_metadata)
        assert _get_filerecord(db_session, staged_file_id) is not None

        # Patch out Vespa — we're testing the file cleanup, not the document
        # index integration.
        no_document_indices = patch(
            "onyx.server.onyx_api.ingestion.get_all_document_indices",
            return_value=[],
        )
        with no_document_indices:
            delete_ingestion_doc(
                document_id=ingestion_doc.id,
                _=MagicMock(),  # auth dep — not used by the function body
                db_session=db_session,
            )

        assert _get_doc_row(db_session, ingestion_doc.id) is None
        assert _get_filerecord(db_session, staged_file_id) is None
class TestDocumentByCcPairCleanupTask:
    """Path 1: per-doc cleanup task fired by pruning / connector deletion."""

    @staticmethod
    def _run_cleanup_task(doc_id: str, cc_pair: ConnectorCredentialPair) -> None:
        """Apply the cleanup task for `doc_id` / `cc_pair` and assert success.

        The document-index lookup is patched to return an empty list; the
        document index itself is not under test here.
        """
        task_args = (
            doc_id,
            cc_pair.connector_id,
            cc_pair.credential_id,
            TEST_TENANT_ID,
        )
        with patch(
            "onyx.background.celery.tasks.shared.tasks.get_all_document_indices",
            return_value=[],
        ):
            outcome = document_by_cc_pair_cleanup_task.apply(args=task_args)
        assert outcome.successful(), outcome.traceback

    def test_count_1_branch_cleans_up_file(
        self,
        db_session: Session,
        cc_pair: ConnectorCredentialPair,
        attempt_metadata: IndexAttemptMetadata,
        full_deployment_setup: None,  # noqa: ARG002
    ) -> None:
        """With exactly one cc_pair reference, the full delete path runs and
        the attached file is reaped."""
        staged_id = _stage_file()
        sole_owner_doc = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=staged_id)
        _index_doc(db_session, sole_owner_doc, attempt_metadata)
        assert _get_filerecord(db_session, staged_id) is not None

        self._run_cleanup_task(sole_owner_doc.id, cc_pair)

        assert _get_doc_row(db_session, sole_owner_doc.id) is None
        assert _get_filerecord(db_session, staged_id) is None

    def test_count_gt_1_branch_preserves_file(
        self,
        db_session: Session,
        cc_pair: ConnectorCredentialPair,
        second_cc_pair: ConnectorCredentialPair,
        attempt_metadata: IndexAttemptMetadata,
        full_deployment_setup: None,  # noqa: ARG002
    ) -> None:
        """When another cc_pair still references the doc, only the mapping for
        the detaching cc_pair is removed. The file MUST stay, because the doc
        and its file are still owned by the remaining cc_pair."""
        staged_id = _stage_file()
        shared_doc = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=staged_id)
        _index_doc(db_session, shared_doc, attempt_metadata)

        # Attach the same doc to a second cc_pair so its refcount becomes 2.
        upsert_document_by_connector_credential_pair(
            db_session,
            second_cc_pair.connector_id,
            second_cc_pair.credential_id,
            [shared_doc.id],
        )
        db_session.commit()

        self._run_cleanup_task(shared_doc.id, cc_pair)

        # The document row survives (the other cc_pair owns it) ...
        assert _get_doc_row(db_session, shared_doc.id) is not None
        # ... and so does its file record.
        surviving_record = _get_filerecord(db_session, staged_id)
        assert surviving_record is not None

View File

@@ -0,0 +1,346 @@
"""External dependency unit tests for `index_doc_batch_prepare`.
Validates the file_id lifecycle that runs alongside the document upsert:
* `document.file_id` is written on insert AND on conflict (upsert path)
* Newly-staged files get promoted from INDEXING_STAGING -> CONNECTOR
* Replaced files are deleted from both `file_record` and S3
* No-op when the file_id is unchanged
Uses real PostgreSQL + real S3/MinIO via the file store.
"""
from collections.abc import Generator
from io import BytesIO
from uuid import uuid4
import pytest
from sqlalchemy.orm import Session
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import FileOrigin
from onyx.connectors.models import Document
from onyx.connectors.models import IndexAttemptMetadata
from onyx.connectors.models import InputType
from onyx.connectors.models import TextSection
from onyx.db.enums import AccessType
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.file_record import get_filerecord_by_file_id_optional
from onyx.db.models import Connector
from onyx.db.models import ConnectorCredentialPair
from onyx.db.models import Credential
from onyx.db.models import Document as DBDocument
from onyx.db.models import FileRecord
from onyx.file_store.file_store import get_default_file_store
from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_doc(doc_id: str, file_id: str | None = None) -> Document:
    """Build a minimal Document for the indexing-pipeline tests.

    The source is MOCK_CONNECTOR, which avoids triggering the hierarchy-node
    linking branch (NOTION/CONFLUENCE only).
    """
    single_section = [TextSection(text="content", link=None)]
    minimal_doc = Document(
        id=doc_id,
        source=DocumentSource.MOCK_CONNECTOR,
        semantic_identifier=f"semantic-{doc_id}",
        sections=single_section,
        metadata={},
        file_id=file_id,
    )
    return minimal_doc
def _stage_file(content: bytes = b"raw bytes") -> str:
    """Save `content` to the file store as INDEXING_STAGING; return the file_id.

    Mirrors what the connector raw_file_callback would do during fetch.
    """
    file_store = get_default_file_store()
    staged_file_id = file_store.save_file(
        content=BytesIO(content),
        display_name=None,
        file_origin=FileOrigin.INDEXING_STAGING,
        file_type="application/octet-stream",
        file_metadata={"test": True},
    )
    return staged_file_id
def _get_doc_row(db_session: Session, doc_id: str) -> DBDocument | None:
    """Return the `document` row for `doc_id`, or None if there is none.

    The session is expired first so the row is reloaded fresh from the DB and
    reflects post-upsert state.
    """
    db_session.expire_all()
    by_id = db_session.query(DBDocument).filter(DBDocument.id == doc_id)
    return by_id.one_or_none()
def _get_filerecord(db_session: Session, file_id: str) -> FileRecord | None:
    """Look up the file record for `file_id` after expiring the session."""
    db_session.expire_all()
    record = get_filerecord_by_file_id_optional(
        file_id=file_id,
        db_session=db_session,
    )
    return record
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def cc_pair(
    db_session: Session,
    tenant_context: None,  # noqa: ARG001
    initialize_file_store: None,  # noqa: ARG001
) -> Generator[ConnectorCredentialPair, None, None]:
    """Yield a committed connector + credential + cc_pair for the index attempt."""
    mock_source = DocumentSource.MOCK_CONNECTOR

    connector_row = Connector(
        name=f"test-connector-{uuid4().hex[:8]}",
        source=mock_source,
        input_type=InputType.LOAD_STATE,
        connector_specific_config={},
        refresh_freq=None,
        prune_freq=None,
        indexing_start=None,
    )
    db_session.add(connector_row)
    # Flushed here; `connector_row.id` is read when the cc_pair is built below.
    db_session.flush()

    credential_row = Credential(source=mock_source, credential_json={})
    db_session.add(credential_row)
    # Flushed here; `credential_row.id` is read when the cc_pair is built below.
    db_session.flush()

    cc_pair_row = ConnectorCredentialPair(
        connector_id=connector_row.id,
        credential_id=credential_row.id,
        name=f"test-cc-pair-{uuid4().hex[:8]}",
        status=ConnectorCredentialPairStatus.ACTIVE,
        access_type=AccessType.PUBLIC,
        auto_sync_options=None,
    )
    db_session.add(cc_pair_row)
    db_session.commit()
    db_session.refresh(cc_pair_row)

    yield cc_pair_row
@pytest.fixture
def attempt_metadata(cc_pair: ConnectorCredentialPair) -> IndexAttemptMetadata:
    """Index-attempt metadata bound to the `cc_pair` fixture (no attempt id)."""
    metadata = IndexAttemptMetadata(
        connector_id=cc_pair.connector_id,
        credential_id=cc_pair.credential_id,
        attempt_id=None,
        request_id="test-request",
    )
    return metadata
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
class TestNewDocuments:
    """First-time inserts — no previous file_id to reconcile against."""

    def test_new_doc_without_file_id(
        self,
        db_session: Session,
        attempt_metadata: IndexAttemptMetadata,
    ) -> None:
        """A new doc with no file_id gets a row whose file_id is NULL."""
        fresh_doc = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=None)

        index_doc_batch_prepare(
            documents=[fresh_doc],
            index_attempt_metadata=attempt_metadata,
            db_session=db_session,
            ignore_time_skip=True,
        )
        db_session.commit()

        stored = _get_doc_row(db_session, fresh_doc.id)
        assert stored is not None
        assert stored.file_id is None

    def test_new_doc_with_staged_file_id_promotes_to_connector(
        self,
        db_session: Session,
        attempt_metadata: IndexAttemptMetadata,
    ) -> None:
        """A new doc's staged file is recorded on the row and its origin
        becomes CONNECTOR."""
        staged_id = _stage_file()
        fresh_doc = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=staged_id)

        index_doc_batch_prepare(
            documents=[fresh_doc],
            index_attempt_metadata=attempt_metadata,
            db_session=db_session,
            ignore_time_skip=True,
        )
        db_session.commit()

        stored = _get_doc_row(db_session, fresh_doc.id)
        assert stored is not None
        assert stored.file_id == staged_id

        file_record = _get_filerecord(db_session, staged_id)
        assert file_record is not None
        assert file_record.file_origin == FileOrigin.CONNECTOR
class TestExistingDocuments:
    """Re-index path — a `document` row already exists with some file_id."""

    @staticmethod
    def _prepare_and_commit(
        db_session: Session,
        attempt_metadata: IndexAttemptMetadata,
        doc: Document,
    ) -> None:
        """Run `index_doc_batch_prepare` for a single doc, then commit."""
        index_doc_batch_prepare(
            documents=[doc],
            index_attempt_metadata=attempt_metadata,
            db_session=db_session,
            ignore_time_skip=True,
        )
        db_session.commit()

    def test_unchanged_file_id_is_noop(
        self,
        db_session: Session,
        attempt_metadata: IndexAttemptMetadata,
    ) -> None:
        """Re-indexing with the same file_id keeps the file and the column."""
        staged_id = _stage_file()
        doc = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=staged_id)

        # Pass 1 inserts the row and promotes the file; pass 2 repeats the call
        # with the same file_id and should neither delete nor re-promote.
        for _ in range(2):
            self._prepare_and_commit(db_session, attempt_metadata, doc)

        file_record = _get_filerecord(db_session, staged_id)
        assert file_record is not None
        assert file_record.file_origin == FileOrigin.CONNECTOR

        stored = _get_doc_row(db_session, doc.id)
        assert stored is not None
        assert stored.file_id == staged_id

    def test_swapping_file_id_promotes_new_and_deletes_old(
        self,
        db_session: Session,
        attempt_metadata: IndexAttemptMetadata,
    ) -> None:
        """A new file_id for an existing doc replaces the old one: the new
        file is promoted and the old file record disappears."""
        previous_id = _stage_file(content=b"old bytes")
        original = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=previous_id)
        self._prepare_and_commit(db_session, attempt_metadata, original)

        # A re-fetch produces a new staged file_id for the same doc.
        replacement_id = _stage_file(content=b"new bytes")
        refetched = _make_doc(original.id, file_id=replacement_id)
        self._prepare_and_commit(db_session, attempt_metadata, refetched)

        stored = _get_doc_row(db_session, original.id)
        assert stored is not None
        assert stored.file_id == replacement_id

        replacement_record = _get_filerecord(db_session, replacement_id)
        assert replacement_record is not None
        assert replacement_record.file_origin == FileOrigin.CONNECTOR

        # The old file record is gone.
        assert _get_filerecord(db_session, previous_id) is None

    def test_clearing_file_id_deletes_old_and_nulls_column(
        self,
        db_session: Session,
        attempt_metadata: IndexAttemptMetadata,
    ) -> None:
        """Re-indexing without a file_id nulls the column and removes the
        previously attached file record."""
        previous_id = _stage_file()
        original = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=previous_id)
        self._prepare_and_commit(db_session, attempt_metadata, original)

        # The connector opts out on the next run: same doc, no file_id.
        without_file = _make_doc(original.id, file_id=None)
        self._prepare_and_commit(db_session, attempt_metadata, without_file)

        stored = _get_doc_row(db_session, original.id)
        assert stored is not None
        assert stored.file_id is None
        assert _get_filerecord(db_session, previous_id) is None
class TestBatchHandling:
    """Mixed batches — multiple docs at different lifecycle states in one call."""

    def test_mixed_batch_each_doc_handled_independently(
        self,
        db_session: Session,
        attempt_metadata: IndexAttemptMetadata,
    ) -> None:
        """One batch holding a swapped doc, a new doc with a file and a new
        doc without a file: each ends up in its own expected state."""
        # Pre-seed an existing doc whose file_id will be swapped.
        seeded_old_id = _stage_file(content=b"existing-old")
        seeded_doc = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=seeded_old_id)
        index_doc_batch_prepare(
            documents=[seeded_doc],
            index_attempt_metadata=attempt_metadata,
            db_session=db_session,
            ignore_time_skip=True,
        )
        db_session.commit()

        # Build the mixed batch: the swap, a brand-new doc with a file_id and
        # a brand-new doc without one.
        seeded_new_id = _stage_file(content=b"existing-new")
        fresh_file_id = _stage_file(content=b"new-with-file")
        swapped = _make_doc(seeded_doc.id, file_id=seeded_new_id)
        fresh_with_file = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=fresh_file_id)
        fresh_without_file = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=None)
        mixed_batch = [swapped, fresh_with_file, fresh_without_file]

        index_doc_batch_prepare(
            documents=mixed_batch,
            index_attempt_metadata=attempt_metadata,
            db_session=db_session,
            ignore_time_skip=True,
        )
        db_session.commit()

        # Swapped doc: old file gone, new file promoted.
        swapped_row = _get_doc_row(db_session, seeded_doc.id)
        assert swapped_row is not None
        assert swapped_row.file_id == seeded_new_id
        assert _get_filerecord(db_session, seeded_old_id) is None
        promoted_swap = _get_filerecord(db_session, seeded_new_id)
        assert promoted_swap is not None
        assert promoted_swap.file_origin == FileOrigin.CONNECTOR

        # New doc with a file_id: row exists, file promoted.
        with_file_row = _get_doc_row(db_session, fresh_with_file.id)
        assert with_file_row is not None
        assert with_file_row.file_id == fresh_file_id
        promoted_fresh = _get_filerecord(db_session, fresh_file_id)
        assert promoted_fresh is not None
        assert promoted_fresh.file_origin == FileOrigin.CONNECTOR

        # New doc without a file_id: row exists, no file_record involvement.
        without_file_row = _get_doc_row(db_session, fresh_without_file.id)
        assert without_file_row is not None
        assert without_file_row.file_id is None

View File

@@ -446,107 +446,10 @@ class TestOpenSearchClient:
test_client.create_index(mappings=mappings, settings=settings)
def test_update_settings(self, test_client: OpenSearchIndexClient) -> None:
    """Tests updating index settings on an existing index."""
    # Precondition: the index exists.
    test_client.create_index(
        mappings=DocumentSchema.get_document_schema(
            vector_dimension=128, multitenant=True
        ),
        settings=DocumentSchema.get_index_settings_based_on_environment(),
    )

    # The replica count we update to; the settings payload is compared
    # against its string form.
    target_replicas = 0
    target_as_text = f"{target_replicas}"

    # Precondition: the index does not already have the target replica count.
    settings_before = test_client.get_settings()
    assert settings_before["index"]["number_of_replicas"] != target_as_text

    # Under test.
    # Should not raise. number_of_replicas is a dynamic setting that can be
    # changed without closing the index.
    test_client.update_settings(
        settings={"index": {"number_of_replicas": target_replicas}}
    )

    # Postcondition.
    settings_after = test_client.get_settings()
    assert settings_after["index"]["number_of_replicas"] == target_as_text
def test_update_settings_on_nonexistent_index(
self, test_client: OpenSearchIndexClient
) -> None:
"""Tests updating settings on a nonexistent index raises an error."""
"""Tests that update_settings raises NotImplementedError."""
# Under test and postcondition.
with pytest.raises(Exception, match="index_not_found_exception|404"):
test_client.update_settings(settings={"index": {"number_of_replicas": 0}})
def test_get_settings(self, test_client: OpenSearchIndexClient) -> None:
    """Tests getting index settings."""
    # Precondition: the index exists.
    test_client.create_index(
        mappings=DocumentSchema.get_document_schema(
            vector_dimension=128, multitenant=True
        ),
        settings=DocumentSchema.get_index_settings_based_on_environment(),
    )

    # Under test.
    fetched = test_client.get_settings()

    # Postcondition.
    assert "index" in fetched
    index_settings = fetched["index"]
    # These are always present for any index.
    for required_key in ("number_of_shards", "number_of_replicas"):
        assert required_key in index_settings
    assert index_settings["provided_name"] == test_client._index_name
def test_get_settings_on_nonexistent_index(
    self, test_client: OpenSearchIndexClient
) -> None:
    """Tests getting settings on a nonexistent index raises an error."""
    missing_index_pattern = "index_not_found_exception|404"

    # Under test and postcondition.
    with pytest.raises(Exception, match=missing_index_pattern):
        test_client.get_settings()
def test_close_and_open_index(self, test_client: OpenSearchIndexClient) -> None:
    """Tests closing and reopening an index."""

    def _match_all_body() -> dict:
        # A fresh body per search, so the two searches never share a dict.
        return {"_source": False, "query": {"match_all": {}}}

    # Precondition: the index exists.
    test_client.create_index(
        mappings=DocumentSchema.get_document_schema(
            vector_dimension=128, multitenant=True
        ),
        settings=DocumentSchema.get_index_settings_based_on_environment(),
    )

    # Under test: closing should not raise.
    test_client.close_index()

    # Postcondition: searches on a closed index should fail.
    with pytest.raises(Exception, match="index_closed_exception|closed"):
        test_client.search_for_document_ids(body=_match_all_body())

    # Under test: reopening should not raise.
    test_client.open_index()

    # Postcondition: searches should work again after reopening.
    ids_after_reopen = test_client.search_for_document_ids(body=_match_all_body())
    assert ids_after_reopen == []
def test_close_nonexistent_index(self, test_client: OpenSearchIndexClient) -> None:
    """Tests closing a nonexistent index raises an error."""
    missing_index_pattern = "index_not_found_exception|404"

    # Under test and postcondition.
    with pytest.raises(Exception, match=missing_index_pattern):
        test_client.close_index()
def test_open_nonexistent_index(self, test_client: OpenSearchIndexClient) -> None:
    """Tests opening a nonexistent index raises an error."""
    missing_index_pattern = "index_not_found_exception|404"

    # Under test and postcondition.
    with pytest.raises(Exception, match=missing_index_pattern):
        test_client.open_index()
with pytest.raises(NotImplementedError):
test_client.update_settings(settings={})
def test_create_and_delete_search_pipeline(
self, test_client: OpenSearchIndexClient

View File

@@ -0,0 +1,262 @@
"""Workflow-level test for the INSTANT index swap.
When `check_and_perform_index_swap` runs against an `INSTANT` switchover, it
calls `delete_all_documents_for_connector_credential_pair` for each cc_pair.
This test exercises that full workflow end-to-end and asserts that the
attached `Document.file_id`s are also reaped — not just the document rows.
Mocks Vespa (`get_all_document_indices`) since this is testing the postgres +
file_store side effects of the swap, not the document index integration.
"""
from collections.abc import Generator
from io import BytesIO
from unittest.mock import patch
from uuid import uuid4
import pytest
from sqlalchemy.orm import Session
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import FileOrigin
from onyx.connectors.models import Document
from onyx.connectors.models import IndexAttemptMetadata
from onyx.connectors.models import InputType
from onyx.connectors.models import TextSection
from onyx.context.search.models import SavedSearchSettings
from onyx.db.enums import AccessType
from onyx.db.enums import ConnectorCredentialPairStatus
from onyx.db.enums import EmbeddingPrecision
from onyx.db.enums import SwitchoverType
from onyx.db.file_record import get_filerecord_by_file_id_optional
from onyx.db.models import Connector
from onyx.db.models import ConnectorCredentialPair
from onyx.db.models import Credential
from onyx.db.models import Document as DBDocument
from onyx.db.models import FileRecord
from onyx.db.models import IndexModelStatus
from onyx.db.search_settings import create_search_settings
from onyx.db.swap_index import check_and_perform_index_swap
from onyx.file_store.file_store import get_default_file_store
from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
# ---------------------------------------------------------------------------
# Helpers (kept inline; extract to a shared conftest if a 4th test file shows up)
# ---------------------------------------------------------------------------
def _make_doc(doc_id: str, file_id: str | None = None) -> Document:
    """Build a minimal MOCK_CONNECTOR Document with one text section."""
    single_section = [TextSection(text="content", link=None)]
    minimal_doc = Document(
        id=doc_id,
        source=DocumentSource.MOCK_CONNECTOR,
        semantic_identifier=f"semantic-{doc_id}",
        sections=single_section,
        metadata={},
        file_id=file_id,
    )
    return minimal_doc
def _stage_file(content: bytes = b"raw bytes") -> str:
    """Save `content` to the default file store with origin INDEXING_STAGING
    and return the value `save_file` hands back."""
    file_store = get_default_file_store()
    staged_file_id = file_store.save_file(
        content=BytesIO(content),
        display_name=None,
        file_origin=FileOrigin.INDEXING_STAGING,
        file_type="application/octet-stream",
        file_metadata={"test": True},
    )
    return staged_file_id
def _get_doc_row(db_session: Session, doc_id: str) -> DBDocument | None:
    """Expire the session, then return the `document` row for `doc_id` or None."""
    db_session.expire_all()
    by_id = db_session.query(DBDocument).filter(DBDocument.id == doc_id)
    return by_id.one_or_none()
def _get_filerecord(db_session: Session, file_id: str) -> FileRecord | None:
    """Expire the session, then look up the file record for `file_id`."""
    db_session.expire_all()
    record = get_filerecord_by_file_id_optional(
        file_id=file_id,
        db_session=db_session,
    )
    return record
def _make_cc_pair(db_session: Session) -> ConnectorCredentialPair:
    """Create, commit and return a MOCK_CONNECTOR connector/credential pair."""
    mock_source = DocumentSource.MOCK_CONNECTOR

    connector_row = Connector(
        name=f"test-connector-{uuid4().hex[:8]}",
        source=mock_source,
        input_type=InputType.LOAD_STATE,
        connector_specific_config={},
        refresh_freq=None,
        prune_freq=None,
        indexing_start=None,
    )
    db_session.add(connector_row)
    # Flushed here; `connector_row.id` is read when the pair is built below.
    db_session.flush()

    credential_row = Credential(source=mock_source, credential_json={})
    db_session.add(credential_row)
    # Flushed here; `credential_row.id` is read when the pair is built below.
    db_session.flush()

    cc_pair_row = ConnectorCredentialPair(
        connector_id=connector_row.id,
        credential_id=credential_row.id,
        name=f"test-cc-pair-{uuid4().hex[:8]}",
        status=ConnectorCredentialPairStatus.ACTIVE,
        access_type=AccessType.PUBLIC,
        auto_sync_options=None,
    )
    db_session.add(cc_pair_row)
    db_session.commit()
    db_session.refresh(cc_pair_row)
    return cc_pair_row
def _make_saved_search_settings(
    *,
    switchover_type: SwitchoverType = SwitchoverType.REINDEX,
) -> SavedSearchSettings:
    """Build SavedSearchSettings with a random model name and index name.

    Only `switchover_type` is configurable; every other field is fixed.
    """
    # Generated in this order: model name first, index name second.
    unique_model_name = f"test-embedding-model-{uuid4().hex[:8]}"
    unique_index_name = f"test_index_{uuid4().hex[:8]}"

    saved_settings = SavedSearchSettings(
        model_name=unique_model_name,
        model_dim=768,
        normalize=True,
        query_prefix="",
        passage_prefix="",
        provider_type=None,
        index_name=unique_index_name,
        multipass_indexing=False,
        embedding_precision=EmbeddingPrecision.FLOAT,
        reduced_dimension=None,
        enable_contextual_rag=False,
        contextual_rag_llm_name=None,
        contextual_rag_llm_provider=None,
        switchover_type=switchover_type,
    )
    return saved_settings
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def cc_pair(
    db_session: Session,
    tenant_context: None,  # noqa: ARG001
    initialize_file_store: None,  # noqa: ARG001
    full_deployment_setup: None,  # noqa: ARG001
) -> Generator[ConnectorCredentialPair, None, None]:
    """Yield a freshly created cc_pair built by `_make_cc_pair`."""
    created_pair = _make_cc_pair(db_session)
    yield created_pair
@pytest.fixture
def attempt_metadata(cc_pair: ConnectorCredentialPair) -> IndexAttemptMetadata:
    """Index-attempt metadata bound to the `cc_pair` fixture (no attempt id)."""
    metadata = IndexAttemptMetadata(
        connector_id=cc_pair.connector_id,
        credential_id=cc_pair.credential_id,
        attempt_id=None,
        request_id="test-request",
    )
    return metadata
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
class TestInstantIndexSwap:
    """`SwitchoverType.INSTANT` wipes all docs for every cc_pair as part of
    the swap. The associated raw files must be reaped too."""

    @staticmethod
    def _stage_instant_settings_and_swap(db_session: Session):
        """Stage FUTURE search settings with INSTANT switchover, then run
        `check_and_perform_index_swap` and return its result.

        The document-index lookup is patched to an empty list for the swap
        call only: the tests target the postgres + file_store side effects,
        not the document-index integration.
        """
        instant_settings = _make_saved_search_settings(
            switchover_type=SwitchoverType.INSTANT
        )
        create_search_settings(
            search_settings=instant_settings,
            db_session=db_session,
            status=IndexModelStatus.FUTURE,
        )
        with patch(
            "onyx.db.swap_index.get_all_document_indices",
            return_value=[],
        ):
            return check_and_perform_index_swap(db_session)

    def test_instant_swap_deletes_docs_and_files(
        self,
        db_session: Session,
        attempt_metadata: IndexAttemptMetadata,
    ) -> None:
        """After an INSTANT swap, both docs and both file records are gone."""
        # Index two docs with attached files via the normal pipeline. Both
        # files are staged before either doc is built.
        staged_ids = [_stage_file(content=payload) for payload in (b"alpha", b"beta")]
        docs = [
            _make_doc(f"doc-{uuid4().hex[:8]}", file_id=staged_id)
            for staged_id in staged_ids
        ]
        index_doc_batch_prepare(
            documents=docs,
            index_attempt_metadata=attempt_metadata,
            db_session=db_session,
            ignore_time_skip=True,
        )
        db_session.commit()

        # Sanity: docs and files exist before the swap.
        for doc in docs:
            assert _get_doc_row(db_session, doc.id) is not None
        for staged_id in staged_ids:
            assert _get_filerecord(db_session, staged_id) is not None

        previous_settings = self._stage_instant_settings_and_swap(db_session)
        assert previous_settings is not None, "INSTANT swap should have executed"

        # Documents are gone.
        for doc in docs:
            assert _get_doc_row(db_session, doc.id) is None
        # Files are gone as well.
        for staged_id in staged_ids:
            assert _get_filerecord(db_session, staged_id) is None

    def test_instant_swap_with_mixed_docs_does_not_break(
        self,
        db_session: Session,
        attempt_metadata: IndexAttemptMetadata,
    ) -> None:
        """Docs with and without file_ids are all swept up by the swap
        without errors."""
        staged_id = _stage_file()
        attached = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=staged_id)
        bare = _make_doc(f"doc-{uuid4().hex[:8]}", file_id=None)
        index_doc_batch_prepare(
            documents=[attached, bare],
            index_attempt_metadata=attempt_metadata,
            db_session=db_session,
            ignore_time_skip=True,
        )
        db_session.commit()

        previous_settings = self._stage_instant_settings_and_swap(db_session)
        assert previous_settings is not None

        for doc in (attached, bare):
            assert _get_doc_row(db_session, doc.id) is None
        assert _get_filerecord(db_session, staged_id) is None

View File

@@ -1,210 +0,0 @@
"""Tests for `cloud_beat_task_generator`'s tenant work-gating logic.
Exercises the gate-read path end-to-end against real Redis. The Celery
`.app.send_task` is mocked so we can count dispatches without actually
sending messages.
Requires a running Redis instance. Run with::
python -m dotenv -f .vscode/.env run -- pytest \
backend/tests/external_dependency_unit/tenant_work_gating/test_gate_generator.py
"""
from collections.abc import Generator
from unittest.mock import MagicMock
from unittest.mock import patch
import pytest
from ee.onyx.background.celery.tasks.cloud import tasks as cloud_tasks
from onyx.configs.constants import ONYX_CLOUD_TENANT_ID
from onyx.redis import redis_tenant_work_gating as twg
from onyx.redis.redis_pool import get_redis_client
from onyx.redis.redis_tenant_work_gating import _SET_KEY
from onyx.redis.redis_tenant_work_gating import mark_tenant_active
# Fixed tenant ids used by the tests in this module.
_TENANT_A = "tenant_aaaa0000-0000-0000-0000-000000000001"
_TENANT_B = "tenant_bbbb0000-0000-0000-0000-000000000002"
_TENANT_C = "tenant_cccc0000-0000-0000-0000-000000000003"
# The full candidate list the tests pass as `tenant_ids` to `_invoke_generator`.
_ALL_TEST_TENANTS = [_TENANT_A, _TENANT_B, _TENANT_C]
# Alias of the cloud tasks module's full-fanout timestamp key prefix; this
# module combines it with ":test_task" to build the Redis key it clears/seeds.
_FANOUT_KEY_PREFIX = cloud_tasks._FULL_FANOUT_TIMESTAMP_KEY_PREFIX
@pytest.fixture(autouse=True)
def _multi_tenant_true() -> Generator[None, None, None]:
    """Patch `MULTI_TENANT` to True on the work-gating module for each test."""
    multi_tenant_on = patch.object(twg, "MULTI_TENANT", True)
    with multi_tenant_on:
        yield
@pytest.fixture(autouse=True)
def _clean_redis() -> Generator[None, None, None]:
    """Delete the active set, the per-task full-fanout timestamp and the two
    runtime gating keys both before and after each test."""
    redis_client = get_redis_client(tenant_id=ONYX_CLOUD_TENANT_ID)

    def _purge() -> None:
        # Same keys, same order, one DELETE per key.
        redis_client.delete(_SET_KEY)
        redis_client.delete(f"{_FANOUT_KEY_PREFIX}:test_task")
        redis_client.delete("runtime:tenant_work_gating:enabled")
        redis_client.delete("runtime:tenant_work_gating:enforce")

    _purge()
    yield
    _purge()
def _invoke_generator(
    *,
    work_gated: bool,
    enabled: bool,
    enforce: bool,
    tenant_ids: list[str],
    full_fanout_interval_seconds: int = 1200,
    ttl_seconds: int = 1800,
) -> MagicMock:
    """Run `cloud_beat_task_generator` once with the runtime flags pinned and
    the Celery app mocked.

    Returns the mock app so callers can assert on its `send_task` calls.
    """
    mock_app = MagicMock()
    generator_task = cloud_tasks.cloud_beat_task_generator

    # `.run()` binds `self` to the task itself, so patching the task's `.app`
    # routes `self.app.send_task` to the mock.
    app_patch = patch.object(generator_task, "app", mock_app)
    tenant_list_patch = patch.object(
        cloud_tasks, "get_all_tenant_ids", return_value=list(tenant_ids)
    )
    gated_tenants_patch = patch.object(
        cloud_tasks, "get_gated_tenants", return_value=set()
    )
    enabled_patch = patch(
        "onyx.server.runtime.onyx_runtime.OnyxRuntime.get_tenant_work_gating_enabled",
        return_value=enabled,
    )
    enforce_patch = patch(
        "onyx.server.runtime.onyx_runtime.OnyxRuntime.get_tenant_work_gating_enforce",
        return_value=enforce,
    )
    interval_patch = patch(
        "onyx.server.runtime.onyx_runtime.OnyxRuntime.get_tenant_work_gating_full_fanout_interval_seconds",
        return_value=full_fanout_interval_seconds,
    )
    ttl_patch = patch(
        "onyx.server.runtime.onyx_runtime.OnyxRuntime.get_tenant_work_gating_ttl_seconds",
        return_value=ttl_seconds,
    )

    # The patches are entered in the order they are listed.
    with (
        app_patch,
        tenant_list_patch,
        gated_tenants_patch,
        enabled_patch,
        enforce_patch,
        interval_patch,
        ttl_patch,
    ):
        generator_task.run(
            task_name="test_task",
            work_gated=work_gated,
        )
    return mock_app
def _dispatched_tenants(mock_app: MagicMock) -> list[str]:
"""Pull tenant_ids out of each send_task call for assertion."""
return [c.kwargs["kwargs"]["tenant_id"] for c in mock_app.send_task.call_args_list]
def _seed_recent_full_fanout_timestamp() -> None:
    """Write the current time (in ms) as the per-task full-fanout timestamp.

    With a recent timestamp in place the interval-elapsed branch reports
    False, so the gate enforces normally instead of going into full-fanout on
    the first invocation.
    """
    import time as _t

    now_ms = int(_t.time() * 1000)
    redis_client = get_redis_client(tenant_id=ONYX_CLOUD_TENANT_ID)
    redis_client.set(f"{_FANOUT_KEY_PREFIX}:test_task", str(now_ms))
def test_enforce_skips_unmarked_tenants() -> None:
    """Enabled + enforce with the interval NOT elapsed: only tenants in the
    active set get dispatched."""
    mark_tenant_active(_TENANT_A)
    _seed_recent_full_fanout_timestamp()

    celery_app = _invoke_generator(
        tenant_ids=_ALL_TEST_TENANTS,
        work_gated=True,
        enabled=True,
        enforce=True,
        full_fanout_interval_seconds=3600,
    )

    assert _dispatched_tenants(celery_app) == [_TENANT_A]
def test_shadow_mode_dispatches_all_tenants() -> None:
    """enabled=True, enforce=False: the gate computes a skip but every tenant
    is still dispatched."""
    mark_tenant_active(_TENANT_A)
    _seed_recent_full_fanout_timestamp()

    celery_app = _invoke_generator(
        tenant_ids=_ALL_TEST_TENANTS,
        work_gated=True,
        enabled=True,
        enforce=False,
        full_fanout_interval_seconds=3600,
    )

    everyone = set(_ALL_TEST_TENANTS)
    assert set(_dispatched_tenants(celery_app)) == everyone
def test_full_fanout_cycle_dispatches_all_tenants() -> None:
    """With no prior timestamp the interval counts as elapsed, so the first
    invocation is a full fanout: every tenant is dispatched even under
    enforce."""
    mark_tenant_active(_TENANT_A)

    celery_app = _invoke_generator(
        tenant_ids=_ALL_TEST_TENANTS,
        work_gated=True,
        enabled=True,
        enforce=True,
    )

    everyone = set(_ALL_TEST_TENANTS)
    assert set(_dispatched_tenants(celery_app)) == everyone
def test_redis_unavailable_fails_open() -> None:
    """`get_active_tenants` returning None (a simulated Redis outage) makes
    the gate treat the run as a full fanout: everyone is dispatched, even
    with the interval not elapsed and enforce on."""
    mark_tenant_active(_TENANT_A)
    _seed_recent_full_fanout_timestamp()

    simulated_outage = patch.object(
        cloud_tasks, "get_active_tenants", return_value=None
    )
    with simulated_outage:
        celery_app = _invoke_generator(
            tenant_ids=_ALL_TEST_TENANTS,
            work_gated=True,
            enabled=True,
            enforce=True,
            full_fanout_interval_seconds=3600,
        )

    everyone = set(_ALL_TEST_TENANTS)
    assert set(_dispatched_tenants(celery_app)) == everyone
def test_work_gated_false_bypasses_gate_entirely() -> None:
    """Beat templates that do not opt in (`work_gated=False`) never consult
    the set, whatever the flag state."""
    # Enforce is on and the active set is empty, yet all tenants dispatch.
    celery_app = _invoke_generator(
        tenant_ids=_ALL_TEST_TENANTS,
        work_gated=False,
        enabled=True,
        enforce=True,
    )

    everyone = set(_ALL_TEST_TENANTS)
    assert set(_dispatched_tenants(celery_app)) == everyone
def test_gate_disabled_dispatches_everyone_regardless_of_enforce() -> None:
    """With ``enabled=False`` the gate is not computed and dispatch is unchanged."""
    # The active set is deliberately left empty for this scenario.
    app = _invoke_generator(
        work_gated=True,
        enabled=False,
        enforce=True,
        tenant_ids=_ALL_TEST_TENANTS,
    )

    seen = set(_dispatched_tenants(app))
    assert seen == set(_ALL_TEST_TENANTS)

View File

@@ -16,14 +16,12 @@ from mcp import ClientSession
from mcp.client.streamable_http import streamablehttp_client
from mcp.types import CallToolResult
from mcp.types import TextContent
from pydantic import AnyUrl
from onyx.db.enums import AccessType
from tests.integration.common_utils.constants import MCP_SERVER_URL
from tests.integration.common_utils.managers.api_key import APIKeyManager
from tests.integration.common_utils.managers.cc_pair import CCPairManager
from tests.integration.common_utils.managers.document import DocumentManager
from tests.integration.common_utils.managers.document_set import DocumentSetManager
from tests.integration.common_utils.managers.llm_provider import LLMProviderManager
from tests.integration.common_utils.managers.pat import PATManager
from tests.integration.common_utils.managers.user import UserManager
@@ -36,7 +34,6 @@ from tests.integration.common_utils.test_models import DATestUser
# Constants
MCP_SEARCH_TOOL = "search_indexed_documents"
INDEXED_SOURCES_RESOURCE_URI = "resource://indexed_sources"
DOCUMENT_SETS_RESOURCE_URI = "resource://document_sets"
DEFAULT_SEARCH_LIMIT = 5
STREAMABLE_HTTP_URL = f"{MCP_SERVER_URL.rstrip('/')}/?transportType=streamable-http"
@@ -76,22 +73,19 @@ def _extract_tool_payload(result: CallToolResult) -> dict[str, Any]:
def _call_search_tool(
headers: dict[str, str],
query: str,
limit: int = DEFAULT_SEARCH_LIMIT,
document_set_names: list[str] | None = None,
headers: dict[str, str], query: str, limit: int = DEFAULT_SEARCH_LIMIT
) -> CallToolResult:
"""Call the search_indexed_documents tool via MCP."""
async def _action(session: ClientSession) -> CallToolResult:
await session.initialize()
arguments: dict[str, Any] = {
"query": query,
"limit": limit,
}
if document_set_names is not None:
arguments["document_set_names"] = document_set_names
return await session.call_tool(MCP_SEARCH_TOOL, arguments)
return await session.call_tool(
MCP_SEARCH_TOOL,
{
"query": query,
"limit": limit,
},
)
return _run_with_mcp_session(headers, _action)
@@ -244,106 +238,3 @@ def test_mcp_search_respects_acl_filters(
blocked_payload = _extract_tool_payload(blocked_result)
assert blocked_payload["total_results"] == 0
assert blocked_payload["documents"] == []
def test_mcp_search_filters_by_document_set(
    reset: None,  # noqa: ARG001
    admin_user: DATestUser,
) -> None:
    """Passing document_set_names should scope results to the named set.

    One document is indexed per cc-pair, both containing a shared phrase, and
    only the first cc-pair is placed in a document set. The test then checks:

    1. the document_sets resource lists the new set,
    2. an unfiltered search returns both documents,
    3. a search filtered by the set name returns only the in-set document,
    4. an empty ``document_set_names`` list behaves like no filter.
    """
    LLMProviderManager.create(user_performing_action=admin_user)
    api_key = APIKeyManager.create(user_performing_action=admin_user)

    cc_pair_in_set = CCPairManager.create_from_scratch(
        user_performing_action=admin_user,
    )
    cc_pair_out_of_set = CCPairManager.create_from_scratch(
        user_performing_action=admin_user,
    )

    shared_phrase = "document-set-filter-shared-phrase"
    in_set_content = f"{shared_phrase} inside curated set"
    out_of_set_content = f"{shared_phrase} outside curated set"

    # Seed in the same order as the pairs were created: in-set first.
    for cc_pair, content in (
        (cc_pair_in_set, in_set_content),
        (cc_pair_out_of_set, out_of_set_content),
    ):
        _seed_document_and_wait_for_indexing(
            cc_pair=cc_pair,
            content=content,
            api_key=api_key,
            user_performing_action=admin_user,
        )

    curated_set = DocumentSetManager.create(
        cc_pair_ids=[cc_pair_in_set.id],
        user_performing_action=admin_user,
    )
    DocumentSetManager.wait_for_sync(
        user_performing_action=admin_user,
        document_sets_to_check=[curated_set],
    )

    headers = _auth_headers(admin_user, name="mcp-doc-set-filter")

    # MCP clients discover valid document_set_names values through the
    # document_sets resource, so the new set has to show up there.
    async def _list_resources(session: ClientSession) -> Any:
        await session.initialize()
        listing = await session.list_resources()
        doc_sets = await session.read_resource(AnyUrl(DOCUMENT_SETS_RESOURCE_URI))
        return listing, doc_sets

    resources_result, doc_sets_contents = _run_with_mcp_session(
        headers, _list_resources
    )
    advertised_uris = {str(item.uri) for item in resources_result.resources}
    assert DOCUMENT_SETS_RESOURCE_URI in advertised_uris

    exposed_names = {
        entry["name"] for entry in json.loads(doc_sets_contents.contents[0].text)
    }
    assert curated_set.name in exposed_names

    def _search(**filters: Any) -> tuple[dict[str, Any], list[str]]:
        """Run the shared-phrase search and return (payload, document contents)."""
        payload = _extract_tool_payload(
            _call_search_tool(headers, shared_phrase, limit=10, **filters)
        )
        return payload, [doc.get("content") or "" for doc in payload["documents"]]

    # No filter: both documents are visible.
    _, unfiltered = _search()
    assert any(in_set_content in text for text in unfiltered)
    assert any(out_of_set_content in text for text in unfiltered)

    # Filter by the set name: only the in-set document comes back.
    filtered_payload, filtered = _search(document_set_names=[curated_set.name])
    assert filtered_payload["total_results"] >= 1
    assert any(in_set_content in text for text in filtered)
    assert all(out_of_set_content not in text for text in filtered)

    # An empty list is expected to be normalized to "no filter" rather than
    # "match zero sets".
    _, empty_filter = _search(document_set_names=[])
    assert any(in_set_content in text for text in empty_filter)
    assert any(out_of_set_content in text for text in empty_filter)

View File

@@ -1,53 +0,0 @@
import datetime
import pytest
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
def test_time_str_to_utc() -> None:
    """Each sample date string parses to the expected UTC datetime."""
    utc = datetime.timezone.utc
    cases = [
        (
            "Tue, 5 Oct 2021 09:38:25 GMT",
            datetime.datetime(2021, 10, 5, 9, 38, 25, tzinfo=utc),
        ),
        (
            "Sat, 24 Jul 2021 09:21:20 +0000 (UTC)",
            datetime.datetime(2021, 7, 24, 9, 21, 20, tzinfo=utc),
        ),
        (
            "Thu, 29 Jul 2021 04:20:37 -0400 (EDT)",
            datetime.datetime(2021, 7, 29, 8, 20, 37, tzinfo=utc),
        ),
        (
            "30 Jun 2023 18:45:01 +0300",
            datetime.datetime(2023, 6, 30, 15, 45, 1, tzinfo=utc),
        ),
        (
            "22 Mar 2020 20:12:18 +0000 (GMT)",
            datetime.datetime(2020, 3, 22, 20, 12, 18, tzinfo=utc),
        ),
        (
            "Date: Wed, 27 Aug 2025 11:40:00 +0200",
            datetime.datetime(2025, 8, 27, 9, 40, 0, tzinfo=utc),
        ),
    ]
    for raw, expected in cases:
        assert time_str_to_utc(raw) == expected
def test_time_str_to_utc_recovers_from_concatenated_headers() -> None:
    """A date with another header glued onto it is still parsed.

    The timezone offset is dropped during recovery, so the expected value
    carries the original wall-clock time tagged as UTC, not the original
    offset.
    """
    utc = datetime.timezone.utc
    cases = (
        (
            'Sat, 3 Nov 2007 14:33:28 -0200To: "jason" <jason@example.net>',
            datetime.datetime(2007, 11, 3, 14, 33, 28, tzinfo=utc),
        ),
        (
            "Fri, 20 Feb 2015 10:30:00 +0500Cc: someone@example.com",
            datetime.datetime(2015, 2, 20, 10, 30, 0, tzinfo=utc),
        ),
    )
    for raw, expected in cases:
        assert time_str_to_utc(raw) == expected
def test_time_str_to_utc_raises_on_impossible_dates() -> None:
    """Out-of-range, non-date and empty inputs all raise ``ValueError``."""
    unparseable = (
        "Wed, 33 Sep 2007 13:42:59 +0100",  # day 33
        "Thu, 11 Oct 2007 31:50:55 +0900",  # hour 31
        "not a date at all",
        "",
    )
    for candidate in unparseable:
        with pytest.raises(ValueError):
            time_str_to_utc(candidate)

View File

@@ -1,4 +1,3 @@
import copy
import datetime
import json
import os
@@ -9,6 +8,7 @@ from unittest.mock import patch
from onyx.access.models import ExternalAccess
from onyx.configs.constants import DocumentSource
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from onyx.connectors.gmail.connector import _build_time_range_query
from onyx.connectors.gmail.connector import GmailCheckpoint
from onyx.connectors.gmail.connector import GmailConnector
@@ -51,43 +51,29 @@ def test_build_time_range_query() -> None:
assert query is None
def _thread_with_date(date_header: str | None) -> dict[str, Any]:
"""Load the fixture thread and replace (or strip, if None) its Date header."""
json_path = os.path.join(os.path.dirname(__file__), "thread.json")
with open(json_path, "r") as f:
thread = cast(dict[str, Any], json.load(f))
thread = copy.deepcopy(thread)
for message in thread["messages"]:
headers: list[dict[str, str]] = message["payload"]["headers"]
if date_header is None:
message["payload"]["headers"] = [
h for h in headers if h.get("name") != "Date"
]
continue
replaced = False
for header in headers:
if header.get("name") == "Date":
header["value"] = date_header
replaced = True
break
if not replaced:
headers.append({"name": "Date", "value": date_header})
return thread
def test_thread_to_document_skips_unparseable_dates() -> None:
    """An unparseable Date header yields a Document without ``doc_updated_at``."""
    unparseable_dates = (
        "Wed, 33 Sep 2007 13:42:59 +0100",
        "Thu, 11 Oct 2007 31:50:55 +0900",
        "total garbage not even close to a date",
    )
    for bad_date in unparseable_dates:
        thread = _thread_with_date(bad_date)
        doc = thread_to_document(thread, "admin@example.com")

        # The thread must still convert; only the timestamp is dropped.
        assert isinstance(doc, Document), f"failed for {bad_date!r}"
        assert doc.doc_updated_at is None
        assert doc.id == "192edefb315737c3"
def test_time_str_to_utc() -> None:
    """Sample date strings are converted to the matching UTC datetimes."""
    utc = datetime.timezone.utc
    expected_by_input = {
        "Tue, 5 Oct 2021 09:38:25 GMT": datetime.datetime(
            2021, 10, 5, 9, 38, 25, tzinfo=utc
        ),
        "Sat, 24 Jul 2021 09:21:20 +0000 (UTC)": datetime.datetime(
            2021, 7, 24, 9, 21, 20, tzinfo=utc
        ),
        "Thu, 29 Jul 2021 04:20:37 -0400 (EDT)": datetime.datetime(
            2021, 7, 29, 8, 20, 37, tzinfo=utc
        ),
        "30 Jun 2023 18:45:01 +0300": datetime.datetime(
            2023, 6, 30, 15, 45, 1, tzinfo=utc
        ),
        "22 Mar 2020 20:12:18 +0000 (GMT)": datetime.datetime(
            2020, 3, 22, 20, 12, 18, tzinfo=utc
        ),
        "Date: Wed, 27 Aug 2025 11:40:00 +0200": datetime.datetime(
            2025, 8, 27, 9, 40, 0, tzinfo=utc
        ),
    }
    for time_str, expected in expected_by_input.items():
        assert time_str_to_utc(time_str) == expected
def test_gmail_checkpoint_progression() -> None:

View File

@@ -12,14 +12,12 @@ from unittest.mock import patch
from onyx.background.celery.celery_utils import extract_ids_from_runnable_connector
from onyx.connectors.google_drive.connector import GoogleDriveConnector
from onyx.connectors.google_drive.file_retrieval import DriveFileFieldType
from onyx.connectors.google_drive.models import DriveRetrievalStage
from onyx.connectors.google_drive.models import GoogleDriveCheckpoint
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.interfaces import SlimConnectorWithPermSync
from onyx.connectors.models import SlimDocument
from onyx.utils.threadpool_concurrency import ThreadSafeDict
from onyx.utils.threadpool_concurrency import ThreadSafeSet
def _make_done_checkpoint() -> GoogleDriveCheckpoint:
@@ -200,90 +198,3 @@ class TestCeleryUtilsRouting:
mock_slim.assert_called_once()
mock_perm_sync.assert_not_called()
class TestFailedFolderIdsByEmail:
    """How ``_get_folder_metadata`` reads and updates the failed-folder map."""

    def _make_failed_map(
        self, entries: dict[str, set[str]]
    ) -> ThreadSafeDict[str, ThreadSafeSet[str]]:
        """Wrap a plain ``email -> folder ids`` mapping in thread-safe containers."""
        wrapped = {
            email: ThreadSafeSet(folder_ids) for email, folder_ids in entries.items()
        }
        return ThreadSafeDict(wrapped)

    def test_skips_api_call_for_known_failed_pair(self) -> None:
        """_get_folder_metadata must skip the API call for a (folder, email) pair
        that previously confirmed no accessible parent."""
        connector = _make_connector()
        known_failures = self._make_failed_map(
            {
                "retriever@example.com": {"folder1"},
                "admin@example.com": {"folder1"},
            }
        )

        api_patch = patch("onyx.connectors.google_drive.connector.get_folder_metadata")
        with api_patch as mock_api:
            outcome = connector._get_folder_metadata(
                folder_id="folder1",
                retriever_email="retriever@example.com",
                field_type=DriveFileFieldType.SLIM,
                failed_folder_ids_by_email=known_failures,
            )

        mock_api.assert_not_called()
        assert outcome is None

    def test_records_failed_pair_when_no_parents(self) -> None:
        """_get_folder_metadata must record (email → folder_id) in the map
        when the API returns a folder with no parents."""
        connector = _make_connector()
        failures: ThreadSafeDict[str, ThreadSafeSet[str]] = ThreadSafeDict()
        orphan_folder: dict = {"id": "folder1", "name": "Orphaned"}

        service_patch = patch(
            "onyx.connectors.google_drive.connector.get_drive_service",
            return_value=MagicMock(),
        )
        metadata_patch = patch(
            "onyx.connectors.google_drive.connector.get_folder_metadata",
            return_value=orphan_folder,
        )
        with service_patch, metadata_patch:
            connector._get_folder_metadata(
                folder_id="folder1",
                retriever_email="retriever@example.com",
                field_type=DriveFileFieldType.SLIM,
                failed_folder_ids_by_email=failures,
            )

        # The failure is expected under both the retriever and the admin email.
        for email in ("retriever@example.com", "admin@example.com"):
            assert "folder1" in failures.get(email, ThreadSafeSet())

    def test_does_not_record_when_parents_found(self) -> None:
        """_get_folder_metadata must NOT record a pair when parents are found."""
        connector = _make_connector()
        failures: ThreadSafeDict[str, ThreadSafeSet[str]] = ThreadSafeDict()
        parented_folder: dict = {
            "id": "folder1",
            "name": "Normal",
            "parents": ["root"],
        }

        service_patch = patch(
            "onyx.connectors.google_drive.connector.get_drive_service",
            return_value=MagicMock(),
        )
        metadata_patch = patch(
            "onyx.connectors.google_drive.connector.get_folder_metadata",
            return_value=parented_folder,
        )
        with service_patch, metadata_patch:
            connector._get_folder_metadata(
                folder_id="folder1",
                retriever_email="retriever@example.com",
                field_type=DriveFileFieldType.SLIM,
                failed_folder_ids_by_email=failures,
            )

        assert len(failures) == 0

View File

@@ -1,101 +0,0 @@
"""Unit tests for SharepointConnector.load_credentials sp_tenant_domain resolution."""
from __future__ import annotations
import base64
from unittest.mock import MagicMock
from unittest.mock import patch
from onyx.connectors.sharepoint.connector import SharepointConnector
# Site URL handed to every SharepointConnector built in this module.
SITE_URL = "https://mytenant.sharepoint.com/sites/MySite"
# Value the tests expect in ``connector.sp_tenant_domain`` after
# ``load_credentials``; it equals the first label of SITE_URL's hostname.
EXPECTED_TENANT_DOMAIN = "mytenant"
# Fake credential payload for the "client_secret" authentication method.
CLIENT_SECRET_CREDS = {
    "authentication_method": "client_secret",
    "sp_client_id": "fake-client-id",
    "sp_client_secret": "fake-client-secret",
    "sp_directory_id": "fake-directory-id",
}
# Fake credential payload for the "certificate" authentication method. The
# private key is base64 text wrapping placeholder bytes, not a real PFX.
CERTIFICATE_CREDS = {
    "authentication_method": "certificate",
    "sp_client_id": "fake-client-id",
    "sp_directory_id": "fake-directory-id",
    "sp_private_key": base64.b64encode(b"fake-pfx-data").decode(),
    "sp_certificate_password": "fake-password",
}
def _make_mock_msal() -> MagicMock:
    """Return a mock MSAL app whose ``acquire_token_for_client`` yields a fake token."""
    token_response = {"access_token": "fake-token"}
    msal_app = MagicMock()
    msal_app.acquire_token_for_client.return_value = token_response
    return msal_app
@patch("onyx.connectors.sharepoint.connector.msal.ConfidentialClientApplication")
@patch("onyx.connectors.sharepoint.connector.GraphClient")
def test_client_secret_with_site_pages_sets_tenant_domain(
    _mock_graph_client: MagicMock,
    mock_msal_cls: MagicMock,
) -> None:
    """client_secret auth + include_site_pages=True must resolve sp_tenant_domain."""
    mock_msal_cls.return_value = _make_mock_msal()

    sharepoint = SharepointConnector(sites=[SITE_URL], include_site_pages=True)
    sharepoint.load_credentials(CLIENT_SECRET_CREDS)

    resolved_domain = sharepoint.sp_tenant_domain
    assert resolved_domain == EXPECTED_TENANT_DOMAIN
@patch("onyx.connectors.sharepoint.connector.msal.ConfidentialClientApplication")
@patch("onyx.connectors.sharepoint.connector.GraphClient")
def test_client_secret_without_site_pages_still_sets_tenant_domain(
    _mock_graph_client: MagicMock,
    mock_msal_cls: MagicMock,
) -> None:
    """client_secret auth + include_site_pages=False must still resolve sp_tenant_domain
    because _create_rest_client_context is also called for drive items."""
    mock_msal_cls.return_value = _make_mock_msal()

    sharepoint = SharepointConnector(sites=[SITE_URL], include_site_pages=False)
    sharepoint.load_credentials(CLIENT_SECRET_CREDS)

    resolved_domain = sharepoint.sp_tenant_domain
    assert resolved_domain == EXPECTED_TENANT_DOMAIN
@patch("onyx.connectors.sharepoint.connector.load_certificate_from_pfx")
@patch("onyx.connectors.sharepoint.connector.msal.ConfidentialClientApplication")
@patch("onyx.connectors.sharepoint.connector.GraphClient")
def test_certificate_with_site_pages_sets_tenant_domain(
    _mock_graph_client: MagicMock,
    mock_msal_cls: MagicMock,
    mock_load_cert: MagicMock,
) -> None:
    """certificate auth + include_site_pages=True must resolve sp_tenant_domain."""
    # Stub both the MSAL application and the PFX loader.
    mock_msal_cls.return_value = _make_mock_msal()
    mock_load_cert.return_value = MagicMock()

    sharepoint = SharepointConnector(sites=[SITE_URL], include_site_pages=True)
    sharepoint.load_credentials(CERTIFICATE_CREDS)

    resolved_domain = sharepoint.sp_tenant_domain
    assert resolved_domain == EXPECTED_TENANT_DOMAIN
@patch("onyx.connectors.sharepoint.connector.load_certificate_from_pfx")
@patch("onyx.connectors.sharepoint.connector.msal.ConfidentialClientApplication")
@patch("onyx.connectors.sharepoint.connector.GraphClient")
def test_certificate_without_site_pages_sets_tenant_domain(
    _mock_graph_client: MagicMock,
    mock_msal_cls: MagicMock,
    mock_load_cert: MagicMock,
) -> None:
    """certificate auth + include_site_pages=False must still resolve sp_tenant_domain
    because _create_rest_client_context is also called for drive items."""
    # Stub both the MSAL application and the PFX loader.
    mock_msal_cls.return_value = _make_mock_msal()
    mock_load_cert.return_value = MagicMock()

    sharepoint = SharepointConnector(sites=[SITE_URL], include_site_pages=False)
    sharepoint.load_credentials(CERTIFICATE_CREDS)

    resolved_domain = sharepoint.sp_tenant_domain
    assert resolved_domain == EXPECTED_TENANT_DOMAIN

View File

@@ -12,10 +12,6 @@ dependency on pypdf internals (pypdf.generic).
from io import BytesIO
from pathlib import Path
import pytest
from onyx.file_processing import extract_file_text
from onyx.file_processing.extract_file_text import count_pdf_embedded_images
from onyx.file_processing.extract_file_text import pdf_to_text
from onyx.file_processing.extract_file_text import read_pdf_file
from onyx.file_processing.password_validation import is_pdf_protected
@@ -100,80 +96,6 @@ class TestReadPdfFile:
# Returned list is empty when callback is used
assert images == []
def test_image_cap_skips_images_above_limit(
    self, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Images beyond the embedded-image cap are not extracted.

    The cap exists to keep the user-file-processing worker from running out
    of memory on PDFs with thousands of embedded images. With the cap forced
    to 0, the fixture's single image must not be returned.
    """
    monkeypatch.setattr(extract_file_text, "MAX_EMBEDDED_IMAGES_PER_FILE", 0)

    pdf_stream = _load("with_image.pdf")
    _, _, extracted_images = read_pdf_file(pdf_stream, extract_images=True)

    assert extracted_images == []
def test_image_cap_at_limit_extracts_up_to_cap(
    self, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A cap at or above the image count leaves extraction unaffected."""
    monkeypatch.setattr(extract_file_text, "MAX_EMBEDDED_IMAGES_PER_FILE", 100)

    pdf_stream = _load("with_image.pdf")
    _, _, extracted_images = read_pdf_file(pdf_stream, extract_images=True)

    assert len(extracted_images) == 1
def test_image_cap_with_callback_stops_streaming_at_limit(
    self, monkeypatch: pytest.MonkeyPatch
) -> None:
    """The cap also applies when images are streamed to a callback."""
    monkeypatch.setattr(extract_file_text, "MAX_EMBEDDED_IMAGES_PER_FILE", 0)
    received: list[tuple[bytes, str]] = []

    def callback(data: bytes, name: str) -> None:
        received.append((data, name))

    pdf_stream = _load("with_image.pdf")
    read_pdf_file(pdf_stream, extract_images=True, image_callback=callback)

    # With a cap of 0 the callback must never have been invoked.
    assert received == []
# ── count_pdf_embedded_images ────────────────────────────────────────────
class TestCountPdfEmbeddedImages:
    """Behaviour of ``count_pdf_embedded_images`` across the PDF fixtures."""

    def test_returns_count_for_normal_pdf(self) -> None:
        image_count = count_pdf_embedded_images(_load("with_image.pdf"), cap=10)
        assert image_count == 1

    def test_short_circuits_above_cap(self) -> None:
        # The fixture holds a single image. With cap=0 any image is already
        # over the cap, so the function returns at its first increment and
        # that value (1) acts as the over-cap sentinel.
        image_count = count_pdf_embedded_images(_load("with_image.pdf"), cap=0)
        assert image_count == 1

    def test_returns_zero_for_pdf_without_images(self) -> None:
        image_count = count_pdf_embedded_images(_load("simple.pdf"), cap=10)
        assert image_count == 0

    def test_returns_zero_for_invalid_pdf(self) -> None:
        not_a_pdf = BytesIO(b"not a pdf")
        assert count_pdf_embedded_images(not_a_pdf, cap=10) == 0

    def test_returns_zero_for_password_locked_pdf(self) -> None:
        # encrypted.pdf needs an open password, so it cannot be inspected and
        # the helper reports 0. Callers depend on the password-protected
        # check that runs earlier in the upload pipeline.
        image_count = count_pdf_embedded_images(_load("encrypted.pdf"), cap=10)
        assert image_count == 0

    def test_inspects_owner_password_only_pdf(self) -> None:
        # owner_protected.pdf is encrypted without an open password, so it
        # should decrypt with an empty string and be counted normally. The
        # fixture contains no images, which makes 0 a genuine count here
        # rather than the "bail on encrypted" result.
        image_count = count_pdf_embedded_images(_load("owner_protected.pdf"), cap=10)
        assert image_count == 0

    def test_preserves_file_position(self) -> None:
        stream = _load("with_image.pdf")
        stream.seek(42)
        count_pdf_embedded_images(stream, cap=10)
        assert stream.tell() == 42
# ── pdf_to_text ──────────────────────────────────────────────────────────

View File

@@ -1,7 +1,6 @@
import io
from typing import cast
from unittest.mock import MagicMock
from unittest.mock import patch
import openpyxl
from openpyxl.worksheet.worksheet import Worksheet
@@ -322,17 +321,6 @@ class TestXlsxSheetExtraction:
sheets = xlsx_sheet_extraction(bad_file, file_name="~$temp.xlsx")
assert sheets == []
def test_known_openpyxl_bug_max_value_returns_empty(self) -> None:
    """A "Max value is 14" ValueError from openpyxl yields no sheets.

    openpyxl's strict descriptor validation rejects font family values above
    14 with that message. It is treated as a known openpyxl bug: the file is
    skipped instead of failing the whole connector batch.
    """
    failing_loader = patch(
        "onyx.file_processing.extract_file_text.openpyxl.load_workbook",
        side_effect=ValueError("Max value is 14"),
    )
    with failing_loader:
        extracted = xlsx_sheet_extraction(io.BytesIO(b""), file_name="bad_font.xlsx")

    assert extracted == []
def test_csv_content_matches_xlsx_to_text_per_sheet(self) -> None:
"""For a single-sheet workbook, xlsx_to_text output should equal
the csv_text from xlsx_sheet_extraction — they share the same

View File

@@ -1,257 +0,0 @@
"""Tests for embedding Prometheus metrics."""
from unittest.mock import patch
from onyx.server.metrics.embedding import _client_duration
from onyx.server.metrics.embedding import _embedding_input_chars_total
from onyx.server.metrics.embedding import _embedding_requests_total
from onyx.server.metrics.embedding import _embedding_texts_total
from onyx.server.metrics.embedding import _embeddings_in_progress
from onyx.server.metrics.embedding import LOCAL_PROVIDER_LABEL
from onyx.server.metrics.embedding import observe_embedding_client
from onyx.server.metrics.embedding import provider_label
from onyx.server.metrics.embedding import PROVIDER_LABEL_NAME
from onyx.server.metrics.embedding import TEXT_TYPE_LABEL_NAME
from onyx.server.metrics.embedding import track_embedding_in_progress
from shared_configs.enums import EmbeddingProvider
from shared_configs.enums import EmbedTextType
class TestProviderLabel:
    """Mapping from an optional embedding provider to its metric label."""

    def test_none_maps_to_local(self) -> None:
        label = provider_label(None)
        assert label == LOCAL_PROVIDER_LABEL

    def test_enum_maps_to_value(self) -> None:
        expectations = (
            (EmbeddingProvider.OPENAI, "openai"),
            (EmbeddingProvider.COHERE, "cohere"),
        )
        for provider, expected_label in expectations:
            assert provider_label(provider) == expected_label
class TestObserveEmbeddingClient:
    """Which metric series ``observe_embedding_client`` touches per outcome."""

    def test_success_records_all_counters(self) -> None:
        # Arrange: snapshot every series for this label set.
        provider = EmbeddingProvider.OPENAI
        text_type = EmbedTextType.QUERY
        labels = {
            PROVIDER_LABEL_NAME: provider.value,
            TEXT_TYPE_LABEL_NAME: text_type.value,
        }

        requests_before = _embedding_requests_total.labels(
            **labels, status="success"
        )._value.get()
        texts_before = _embedding_texts_total.labels(**labels)._value.get()
        chars_before = _embedding_input_chars_total.labels(**labels)._value.get()
        duration_sum_before = _client_duration.labels(**labels)._sum.get()

        duration_s = 0.123
        num_texts = 4
        num_chars = 200

        # Act.
        observe_embedding_client(
            provider=provider,
            text_type=text_type,
            duration_s=duration_s,
            num_texts=num_texts,
            num_chars=num_chars,
            success=True,
        )

        # Assert: every series moved by exactly the observed amount.
        requests_after = _embedding_requests_total.labels(
            **labels, status="success"
        )._value.get()
        assert requests_after == requests_before + 1

        texts_after = _embedding_texts_total.labels(**labels)._value.get()
        assert texts_after == texts_before + num_texts

        chars_after = _embedding_input_chars_total.labels(**labels)._value.get()
        assert chars_after == chars_before + num_chars

        duration_sum_after = _client_duration.labels(**labels)._sum.get()
        assert duration_sum_after == duration_sum_before + duration_s

    def test_failure_records_duration_and_failure_counter_only(self) -> None:
        # Arrange: snapshot every series for this label set.
        provider = EmbeddingProvider.COHERE
        text_type = EmbedTextType.PASSAGE
        labels = {
            PROVIDER_LABEL_NAME: provider.value,
            TEXT_TYPE_LABEL_NAME: text_type.value,
        }

        failures_before = _embedding_requests_total.labels(
            **labels, status="failure"
        )._value.get()
        texts_before = _embedding_texts_total.labels(**labels)._value.get()
        chars_before = _embedding_input_chars_total.labels(**labels)._value.get()
        duration_sum_before = _client_duration.labels(**labels)._sum.get()

        duration_s = 0.5
        num_texts = 3
        num_chars = 150

        # Act.
        observe_embedding_client(
            provider=provider,
            text_type=text_type,
            duration_s=duration_s,
            num_texts=num_texts,
            num_chars=num_chars,
            success=False,
        )

        # Assert: the failure counter goes up by one...
        failures_after = _embedding_requests_total.labels(
            **labels, status="failure"
        )._value.get()
        assert failures_after == failures_before + 1

        # ...the duration is still observed...
        duration_sum_after = _client_duration.labels(**labels)._sum.get()
        assert duration_sum_after == duration_sum_before + duration_s

        # ...and the throughput counters stay where they were.
        assert _embedding_texts_total.labels(**labels)._value.get() == texts_before
        chars_after = _embedding_input_chars_total.labels(**labels)._value.get()
        assert chars_after == chars_before

    def test_local_provider_uses_local_label(self) -> None:
        # Arrange.
        text_type = EmbedTextType.QUERY
        labels = {
            PROVIDER_LABEL_NAME: LOCAL_PROVIDER_LABEL,
            TEXT_TYPE_LABEL_NAME: text_type.value,
        }
        requests_before = _embedding_requests_total.labels(
            **labels, status="success"
        )._value.get()

        duration_s = 0.05
        num_texts = 1
        num_chars = 10

        # Act: provider=None stands for the local provider.
        observe_embedding_client(
            provider=None,
            text_type=text_type,
            duration_s=duration_s,
            num_texts=num_texts,
            num_chars=num_chars,
            success=True,
        )

        # Assert.
        requests_after = _embedding_requests_total.labels(
            **labels, status="success"
        )._value.get()
        assert requests_after == requests_before + 1

    def test_exceptions_do_not_propagate(self) -> None:
        broken_labels = patch.object(
            _embedding_requests_total,
            "labels",
            side_effect=RuntimeError("boom"),
        )
        with broken_labels:
            # The call below must complete without raising.
            observe_embedding_client(
                provider=EmbeddingProvider.OPENAI,
                text_type=EmbedTextType.QUERY,
                duration_s=0.1,
                num_texts=1,
                num_chars=10,
                success=True,
            )
class TestTrackEmbeddingInProgress:
    """In-progress gauge behaviour of ``track_embedding_in_progress``."""

    def test_gauge_increments_and_decrements(self) -> None:
        # Arrange.
        provider = EmbeddingProvider.OPENAI
        text_type = EmbedTextType.QUERY
        labels = {
            PROVIDER_LABEL_NAME: provider.value,
            TEXT_TYPE_LABEL_NAME: text_type.value,
        }
        baseline = _embeddings_in_progress.labels(**labels)._value.get()

        # Act: the gauge is one higher while inside the context.
        with track_embedding_in_progress(provider, text_type):
            inside = _embeddings_in_progress.labels(**labels)._value.get()
            assert inside == baseline + 1

        # Assert: back to the baseline after leaving it.
        afterwards = _embeddings_in_progress.labels(**labels)._value.get()
        assert afterwards == baseline

    def test_gauge_decrements_on_exception(self) -> None:
        # Arrange.
        provider = EmbeddingProvider.COHERE
        text_type = EmbedTextType.PASSAGE
        labels = {
            PROVIDER_LABEL_NAME: provider.value,
            TEXT_TYPE_LABEL_NAME: text_type.value,
        }
        baseline = _embeddings_in_progress.labels(**labels)._value.get()

        # Act: the error raised inside the context must reach the caller.
        try:
            with track_embedding_in_progress(provider, text_type):
                raise ValueError("simulated embedding failure")
        except ValueError:
            propagated = True
        else:
            propagated = False
        assert propagated

        # Assert: the gauge was still restored.
        afterwards = _embeddings_in_progress.labels(**labels)._value.get()
        assert afterwards == baseline

    def test_local_provider_uses_local_label(self) -> None:
        # Arrange.
        text_type = EmbedTextType.QUERY
        labels = {
            PROVIDER_LABEL_NAME: LOCAL_PROVIDER_LABEL,
            TEXT_TYPE_LABEL_NAME: text_type.value,
        }
        baseline = _embeddings_in_progress.labels(**labels)._value.get()

        # Act: provider=None is tracked under the local provider label.
        with track_embedding_in_progress(None, text_type):
            inside = _embeddings_in_progress.labels(**labels)._value.get()
            assert inside == baseline + 1

        # Assert.
        afterwards = _embeddings_in_progress.labels(**labels)._value.get()
        assert afterwards == baseline

    def test_inc_exception_does_not_break_call(self) -> None:
        # Arrange.
        provider = EmbeddingProvider.VOYAGE
        text_type = EmbedTextType.QUERY
        labels = {
            PROVIDER_LABEL_NAME: provider.value,
            TEXT_TYPE_LABEL_NAME: text_type.value,
        }
        baseline = _embeddings_in_progress.labels(**labels)._value.get()

        # Act: make the gauge's inc() blow up for this label set.
        failing_inc = patch.object(
            _embeddings_in_progress.labels(**labels),
            "inc",
            side_effect=RuntimeError("boom"),
        )
        with failing_inc:
            # The context manager should still yield, without decrementing.
            with track_embedding_in_progress(provider, text_type):
                inside = _embeddings_in_progress.labels(**labels)._value.get()
                assert inside == baseline

        # Assert.
        afterwards = _embeddings_in_progress.labels(**labels)._value.get()
        assert afterwards == baseline

View File

@@ -129,36 +129,12 @@ class TestWorkerHealthCollector:
up = families[1]
assert up.name == "onyx_celery_worker_up"
assert len(up.samples) == 3
label_pairs = {
(s.labels["worker_type"], s.labels["hostname"]) for s in up.samples
}
assert label_pairs == {
("primary", "host1"),
("docfetching", "host1"),
("monitoring", "host1"),
}
# Labels use short names (before @)
labels = {s.labels["worker"] for s in up.samples}
assert labels == {"primary", "docfetching", "monitoring"}
for sample in up.samples:
assert sample.value == 1
def test_replicas_of_same_worker_type_are_distinct(self) -> None:
    """Regression: replicas of one worker type must stay separate samples.

    ``docprocessing@pod-1``, ``@pod-2`` and ``@pod-3`` each need their own
    sample instead of collapsing into a single duplicate-timestamp series.
    """
    monitor = WorkerHeartbeatMonitor(MagicMock())
    for pod in ("pod-1", "pod-2", "pod-3"):
        monitor._on_heartbeat({"hostname": f"docprocessing@{pod}"})

    collector = WorkerHealthCollector(cache_ttl=0)
    collector.set_monitor(monitor)

    up_family = collector.collect()[1]
    samples = up_family.samples
    assert len(samples) == 3
    assert {sample.labels["hostname"] for sample in samples} == {
        "pod-1",
        "pod-2",
        "pod-3",
    }
    assert all(
        sample.labels["worker_type"] == "docprocessing" for sample in samples
    )
def test_reports_dead_worker(self) -> None:
monitor = WorkerHeartbeatMonitor(MagicMock())
monitor._on_heartbeat({"hostname": "primary@host1"})
@@ -175,9 +151,9 @@ class TestWorkerHealthCollector:
assert active.samples[0].value == 1
up = families[1]
samples_by_type = {s.labels["worker_type"]: s.value for s in up.samples}
assert samples_by_type["primary"] == 1
assert samples_by_type["monitoring"] == 0
samples_by_name = {s.labels["worker"]: s.value for s in up.samples}
assert samples_by_name["primary"] == 1
assert samples_by_name["monitoring"] == 0
def test_empty_monitor_returns_zero(self) -> None:
monitor = WorkerHeartbeatMonitor(MagicMock())

View File

@@ -58,7 +58,8 @@ SERVICE_ORDER=(
validate_template() {
local template_file=$1
echo "Validating template: $template_file..."
if ! aws cloudformation validate-template --template-body file://"$template_file" --region "$AWS_REGION" > /dev/null; then
aws cloudformation validate-template --template-body file://"$template_file" --region "$AWS_REGION" > /dev/null
if [ $? -ne 0 ]; then
echo "Error: Validation failed for $template_file. Exiting."
exit 1
fi
@@ -107,15 +108,13 @@ deploy_stack() {
fi
# Create temporary parameters file for this template
local temp_params_file
temp_params_file=$(create_parameters_from_json "$template_file")
local temp_params_file=$(create_parameters_from_json "$template_file")
# Special handling for SubnetIDs parameter if needed
if grep -q "SubnetIDs" "$template_file"; then
echo "Template uses SubnetIDs parameter, ensuring it's properly formatted..."
# Make sure we're passing SubnetIDs as a comma-separated list
local subnet_ids
subnet_ids=$(remove_comments "$CONFIG_FILE" | jq -r '.SubnetIDs // empty')
local subnet_ids=$(remove_comments "$CONFIG_FILE" | jq -r '.SubnetIDs // empty')
if [ -n "$subnet_ids" ]; then
echo "Using SubnetIDs from config: $subnet_ids"
else
@@ -124,13 +123,15 @@ deploy_stack() {
fi
echo "Deploying stack: $stack_name with template: $template_file and generated config from: $CONFIG_FILE..."
if ! aws cloudformation deploy \
aws cloudformation deploy \
--stack-name "$stack_name" \
--template-file "$template_file" \
--parameter-overrides file://"$temp_params_file" \
--capabilities CAPABILITY_IAM CAPABILITY_NAMED_IAM CAPABILITY_AUTO_EXPAND \
--region "$AWS_REGION" \
--no-cli-auto-prompt > /dev/null; then
--no-cli-auto-prompt > /dev/null
if [ $? -ne 0 ]; then
echo "Error: Deployment failed for $stack_name. Exiting."
exit 1
fi

View File

@@ -52,9 +52,11 @@ delete_stack() {
--region "$AWS_REGION"
echo "Waiting for stack $stack_name to be deleted..."
if aws cloudformation wait stack-delete-complete \
aws cloudformation wait stack-delete-complete \
--stack-name "$stack_name" \
--region "$AWS_REGION"; then
--region "$AWS_REGION"
if [ $? -eq 0 ]; then
echo "Stack $stack_name deleted successfully."
sleep 10
else

View File

@@ -1,4 +1,3 @@
#!/bin/sh
# fill in the template
export ONYX_BACKEND_API_HOST="${ONYX_BACKEND_API_HOST:-api_server}"
export ONYX_WEB_SERVER_HOST="${ONYX_WEB_SERVER_HOST:-web_server}"
@@ -17,15 +16,12 @@ echo "Using web server host: $ONYX_WEB_SERVER_HOST"
echo "Using MCP server host: $ONYX_MCP_SERVER_HOST"
echo "Using nginx proxy timeouts - connect: ${NGINX_PROXY_CONNECT_TIMEOUT}s, send: ${NGINX_PROXY_SEND_TIMEOUT}s, read: ${NGINX_PROXY_READ_TIMEOUT}s"
# shellcheck disable=SC2016
envsubst '$DOMAIN $SSL_CERT_FILE_NAME $SSL_CERT_KEY_FILE_NAME $ONYX_BACKEND_API_HOST $ONYX_WEB_SERVER_HOST $ONYX_MCP_SERVER_HOST $NGINX_PROXY_CONNECT_TIMEOUT $NGINX_PROXY_SEND_TIMEOUT $NGINX_PROXY_READ_TIMEOUT' < "/etc/nginx/conf.d/$1" > /etc/nginx/conf.d/app.conf
# Conditionally create MCP server configuration
if [ "${MCP_SERVER_ENABLED}" = "True" ] || [ "${MCP_SERVER_ENABLED}" = "true" ]; then
echo "MCP server is enabled, creating MCP configuration..."
# shellcheck disable=SC2016
envsubst '$ONYX_MCP_SERVER_HOST' < "/etc/nginx/conf.d/mcp_upstream.conf.inc.template" > /etc/nginx/conf.d/mcp_upstream.conf.inc
# shellcheck disable=SC2016
envsubst '$ONYX_MCP_SERVER_HOST' < "/etc/nginx/conf.d/mcp.conf.inc.template" > /etc/nginx/conf.d/mcp.conf.inc
else
echo "MCP server is disabled, removing MCP configuration..."

View File

@@ -12,7 +12,7 @@ dependencies = [
"cohere==5.6.1",
"fastapi==0.133.1",
"google-genai==1.52.0",
"litellm[google]==1.81.6",
"litellm==1.81.6",
"openai==2.14.0",
"pydantic==2.11.7",
"prometheus_client>=0.21.1",
@@ -69,7 +69,7 @@ backend = [
"langchain-core==1.2.28",
"lazy_imports==1.0.1",
"lxml==5.3.0",
"Mako==1.3.11",
"Mako==1.2.4",
# NOTE: Do not update without understanding the patching behavior in
# get_markitdown_converter in
# backend/onyx/file_processing/extract_file_text.py and what impacts
@@ -96,7 +96,7 @@ backend = [
"python-gitlab==5.6.0",
"python-pptx==0.6.23",
"pypandoc_binary==1.16.2",
"pypdf==6.10.2",
"pypdf==6.10.0",
"pytest-mock==3.12.0",
"pytest-playwright==0.7.2",
"python-docx==1.1.2",

View File

@@ -48,19 +48,6 @@ func runWebScript(args []string) {
log.Fatalf("Failed to find web directory: %v", err)
}
nodeModules := filepath.Join(webDir, "node_modules")
if _, err := os.Stat(nodeModules); os.IsNotExist(err) {
log.Info("node_modules not found, running npm install --no-save...")
installCmd := exec.Command("npm", "install", "--no-save")
installCmd.Dir = webDir
installCmd.Stdout = os.Stdout
installCmd.Stderr = os.Stderr
installCmd.Stdin = os.Stdin
if err := installCmd.Run(); err != nil {
log.Fatalf("Failed to run npm install: %v", err)
}
}
scriptName := args[0]
scriptArgs := args[1:]
if len(scriptArgs) > 0 && scriptArgs[0] == "--" {

251
uv.lock generated
View File

@@ -447,14 +447,14 @@ wheels = [
[[package]]
name = "authlib"
version = "1.6.11"
version = "1.6.9"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "cryptography" },
]
sdist = { url = "https://files.pythonhosted.org/packages/28/10/b325d58ffe86815b399334a101e63bc6fa4e1953921cb23703b48a0a0220/authlib-1.6.11.tar.gz", hash = "sha256:64db35b9b01aeccb4715a6c9a6613a06f2bd7be2ab9d2eb89edd1dfc7580a38f", size = 165359, upload-time = "2026-04-16T07:22:50.279Z" }
sdist = { url = "https://files.pythonhosted.org/packages/af/98/00d3dd826d46959ad8e32af2dbb2398868fd9fd0683c26e56d0789bd0e68/authlib-1.6.9.tar.gz", hash = "sha256:d8f2421e7e5980cc1ddb4e32d3f5fa659cfaf60d8eaf3281ebed192e4ab74f04", size = 165134, upload-time = "2026-03-02T07:44:01.998Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/57/2f/55fca558f925a51db046e5b929deb317ddb05afed74b22d89f4eca578980/authlib-1.6.11-py2.py3-none-any.whl", hash = "sha256:c8687a9a26451c51a34a06fa17bb97cb15bba46a6a626755e2d7f50da8bff3e3", size = 244469, upload-time = "2026-04-16T07:22:48.413Z" },
{ url = "https://files.pythonhosted.org/packages/53/23/b65f568ed0c22f1efacb744d2db1a33c8068f384b8c9b482b52ebdbc3ef6/authlib-1.6.9-py2.py3-none-any.whl", hash = "sha256:f08b4c14e08f0861dc18a32357b33fbcfd2ea86cfe3fe149484b4d764c4a0ac3", size = 244197, upload-time = "2026-03-02T07:44:00.307Z" },
]
[[package]]
@@ -2124,12 +2124,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ed/d4/90197b416cb61cefd316964fd9e7bd8324bcbafabf40eef14a9f20b81974/google_api_core-2.28.1-py3-none-any.whl", hash = "sha256:4021b0f8ceb77a6fb4de6fde4502cecab45062e66ff4f2895169e0b35bc9466c", size = 173706, upload-time = "2025-10-28T21:34:50.151Z" },
]
[package.optional-dependencies]
grpc = [
{ name = "grpcio" },
{ name = "grpcio-status" },
]
[[package]]
name = "google-api-python-client"
version = "2.86.0"
@@ -2187,124 +2181,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/4a/07/8d9a8186e6768b55dfffeb57c719bc03770cf8a970a074616ae6f9e26a57/google_auth_oauthlib-1.0.0-py2.py3-none-any.whl", hash = "sha256:95880ca704928c300f48194d1770cf5b1462835b6e49db61445a520f793fd5fb", size = 18926, upload-time = "2023-02-07T20:53:18.837Z" },
]
[[package]]
name = "google-cloud-aiplatform"
version = "1.133.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "docstring-parser" },
{ name = "google-api-core", extra = ["grpc"] },
{ name = "google-auth" },
{ name = "google-cloud-bigquery" },
{ name = "google-cloud-resource-manager" },
{ name = "google-cloud-storage" },
{ name = "google-genai" },
{ name = "packaging" },
{ name = "proto-plus" },
{ name = "protobuf" },
{ name = "pydantic" },
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/d4/be/31ce7fd658ddebafbe5583977ddee536b2bacc491ad10b5a067388aec66f/google_cloud_aiplatform-1.133.0.tar.gz", hash = "sha256:3a6540711956dd178daaab3c2c05db476e46d94ac25912b8cf4f59b00b058ae0", size = 9921309, upload-time = "2026-01-08T22:11:25.079Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/01/5b/ef74ff65aebb74eaba51078e33ddd897247ba0d1197fd5a7953126205519/google_cloud_aiplatform-1.133.0-py2.py3-none-any.whl", hash = "sha256:dfc81228e987ca10d1c32c7204e2131b3c8d6b7c8e0b4e23bf7c56816bc4c566", size = 8184595, upload-time = "2026-01-08T22:11:22.067Z" },
]
[[package]]
name = "google-cloud-bigquery"
version = "3.41.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-api-core", extra = ["grpc"] },
{ name = "google-auth" },
{ name = "google-cloud-core" },
{ name = "google-resumable-media" },
{ name = "packaging" },
{ name = "python-dateutil" },
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/ce/13/6515c7aab55a4a0cf708ffd309fb9af5bab54c13e32dc22c5acd6497193c/google_cloud_bigquery-3.41.0.tar.gz", hash = "sha256:2217e488b47ed576360c9b2cc07d59d883a54b83167c0ef37f915c26b01a06fe", size = 513434, upload-time = "2026-03-30T22:50:55.347Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/40/33/1d3902efadef9194566d499d61507e1f038454e0b55499d2d7f8ab2a4fee/google_cloud_bigquery-3.41.0-py3-none-any.whl", hash = "sha256:2a5b5a737b401cbd824a6e5eac7554100b878668d908e6548836b5d8aaa4dcaa", size = 262343, upload-time = "2026-03-30T22:48:45.444Z" },
]
[[package]]
name = "google-cloud-core"
version = "2.5.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-api-core" },
{ name = "google-auth" },
]
sdist = { url = "https://files.pythonhosted.org/packages/dc/24/6ca08b0a03c7b0c620427503ab00353a4ae806b848b93bcea18b6b76fde6/google_cloud_core-2.5.1.tar.gz", hash = "sha256:3dc94bdec9d05a31d9f355045ed0f369fbc0d8c665076c734f065d729800f811", size = 36078, upload-time = "2026-03-30T22:50:08.057Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/73/d9/5bb050cb32826466aa9b25f79e2ca2879fe66cb76782d4ed798dd7506151/google_cloud_core-2.5.1-py3-none-any.whl", hash = "sha256:ea62cdf502c20e3e14be8a32c05ed02113d7bef454e40ff3fab6fe1ec9f1f4e7", size = 29452, upload-time = "2026-03-30T22:48:31.567Z" },
]
[[package]]
name = "google-cloud-resource-manager"
version = "1.17.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-api-core", extra = ["grpc"] },
{ name = "google-auth" },
{ name = "grpc-google-iam-v1" },
{ name = "grpcio" },
{ name = "proto-plus" },
{ name = "protobuf" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b2/1a/13060cabf553d52d151d2afc26b39561e82853380d499dd525a0d422d9f0/google_cloud_resource_manager-1.17.0.tar.gz", hash = "sha256:0f486b62e2c58ff992a3a50fa0f4a96eef7750aa6c971bb373398ccb91828660", size = 464971, upload-time = "2026-03-26T22:17:29.204Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3d/f7/661d7a9023e877a226b5683429c3662f75a29ef45cb1464cf39adb689218/google_cloud_resource_manager-1.17.0-py3-none-any.whl", hash = "sha256:e479baf4b014a57f298e01b8279e3290b032e3476d69c8e5e1427af8f82739a5", size = 404403, upload-time = "2026-03-26T22:15:26.57Z" },
]
[[package]]
name = "google-cloud-storage"
version = "3.10.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-api-core" },
{ name = "google-auth" },
{ name = "google-cloud-core" },
{ name = "google-crc32c" },
{ name = "google-resumable-media" },
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/4c/47/205eb8e9a1739b5345843e5a425775cbdc472cc38e7eda082ba5b8d02450/google_cloud_storage-3.10.1.tar.gz", hash = "sha256:97db9aa4460727982040edd2bd13ff3d5e2260b5331ad22895802da1fc2a5286", size = 17309950, upload-time = "2026-03-23T09:35:23.409Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ad/ff/ca9ab2417fa913d75aae38bf40bf856bb2749a604b2e0f701b37cfcd23cc/google_cloud_storage-3.10.1-py3-none-any.whl", hash = "sha256:a72f656759b7b99bda700f901adcb3425a828d4a29f911bc26b3ea79c5b1217f", size = 324453, upload-time = "2026-03-23T09:35:21.368Z" },
]
[[package]]
name = "google-crc32c"
version = "1.8.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/03/41/4b9c02f99e4c5fb477122cd5437403b552873f014616ac1d19ac8221a58d/google_crc32c-1.8.0.tar.gz", hash = "sha256:a428e25fb7691024de47fecfbff7ff957214da51eddded0da0ae0e0f03a2cf79", size = 14192, upload-time = "2025-12-16T00:35:25.142Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5d/ef/21ccfaab3d5078d41efe8612e0ed0bfc9ce22475de074162a91a25f7980d/google_crc32c-1.8.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:014a7e68d623e9a4222d663931febc3033c5c7c9730785727de2a81f87d5bab8", size = 31298, upload-time = "2025-12-16T00:20:32.241Z" },
{ url = "https://files.pythonhosted.org/packages/c5/b8/f8413d3f4b676136e965e764ceedec904fe38ae8de0cdc52a12d8eb1096e/google_crc32c-1.8.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:86cfc00fe45a0ac7359e5214a1704e51a99e757d0272554874f419f79838c5f7", size = 30872, upload-time = "2025-12-16T00:33:58.785Z" },
{ url = "https://files.pythonhosted.org/packages/f6/fd/33aa4ec62b290477181c55bb1c9302c9698c58c0ce9a6ab4874abc8b0d60/google_crc32c-1.8.0-cp311-cp311-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:19b40d637a54cb71e0829179f6cb41835f0fbd9e8eb60552152a8b52c36cbe15", size = 33243, upload-time = "2025-12-16T00:40:21.46Z" },
{ url = "https://files.pythonhosted.org/packages/71/03/4820b3bd99c9653d1a5210cb32f9ba4da9681619b4d35b6a052432df4773/google_crc32c-1.8.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:17446feb05abddc187e5441a45971b8394ea4c1b6efd88ab0af393fd9e0a156a", size = 33608, upload-time = "2025-12-16T00:40:22.204Z" },
{ url = "https://files.pythonhosted.org/packages/7c/43/acf61476a11437bf9733fb2f70599b1ced11ec7ed9ea760fdd9a77d0c619/google_crc32c-1.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:71734788a88f551fbd6a97be9668a0020698e07b2bf5b3aa26a36c10cdfb27b2", size = 34439, upload-time = "2025-12-16T00:35:20.458Z" },
{ url = "https://files.pythonhosted.org/packages/e9/5f/7307325b1198b59324c0fa9807cafb551afb65e831699f2ce211ad5c8240/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:4b8286b659c1335172e39563ab0a768b8015e88e08329fa5321f774275fc3113", size = 31300, upload-time = "2025-12-16T00:21:56.723Z" },
{ url = "https://files.pythonhosted.org/packages/21/8e/58c0d5d86e2220e6a37befe7e6a94dd2f6006044b1a33edf1ff6d9f7e319/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:2a3dc3318507de089c5384cc74d54318401410f82aa65b2d9cdde9d297aca7cb", size = 30867, upload-time = "2025-12-16T00:38:31.302Z" },
{ url = "https://files.pythonhosted.org/packages/ce/a9/a780cc66f86335a6019f557a8aaca8fbb970728f0efd2430d15ff1beae0e/google_crc32c-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:14f87e04d613dfa218d6135e81b78272c3b904e2a7053b841481b38a7d901411", size = 33364, upload-time = "2025-12-16T00:40:22.96Z" },
{ url = "https://files.pythonhosted.org/packages/21/3f/3457ea803db0198c9aaca2dd373750972ce28a26f00544b6b85088811939/google_crc32c-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cb5c869c2923d56cb0c8e6bcdd73c009c36ae39b652dbe46a05eb4ef0ad01454", size = 33740, upload-time = "2025-12-16T00:40:23.96Z" },
{ url = "https://files.pythonhosted.org/packages/df/c0/87c2073e0c72515bb8733d4eef7b21548e8d189f094b5dad20b0ecaf64f6/google_crc32c-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:3cc0c8912038065eafa603b238abf252e204accab2a704c63b9e14837a854962", size = 34437, upload-time = "2025-12-16T00:35:21.395Z" },
{ url = "https://files.pythonhosted.org/packages/d1/db/000f15b41724589b0e7bc24bc7a8967898d8d3bc8caf64c513d91ef1f6c0/google_crc32c-1.8.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:3ebb04528e83b2634857f43f9bb8ef5b2bbe7f10f140daeb01b58f972d04736b", size = 31297, upload-time = "2025-12-16T00:23:20.709Z" },
{ url = "https://files.pythonhosted.org/packages/d7/0d/8ebed0c39c53a7e838e2a486da8abb0e52de135f1b376ae2f0b160eb4c1a/google_crc32c-1.8.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:450dc98429d3e33ed2926fc99ee81001928d63460f8538f21a5d6060912a8e27", size = 30867, upload-time = "2025-12-16T00:43:14.628Z" },
{ url = "https://files.pythonhosted.org/packages/ce/42/b468aec74a0354b34c8cbf748db20d6e350a68a2b0912e128cabee49806c/google_crc32c-1.8.0-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3b9776774b24ba76831609ffbabce8cdf6fa2bd5e9df37b594221c7e333a81fa", size = 33344, upload-time = "2025-12-16T00:40:24.742Z" },
{ url = "https://files.pythonhosted.org/packages/1c/e8/b33784d6fc77fb5062a8a7854e43e1e618b87d5ddf610a88025e4de6226e/google_crc32c-1.8.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:89c17d53d75562edfff86679244830599ee0a48efc216200691de8b02ab6b2b8", size = 33694, upload-time = "2025-12-16T00:40:25.505Z" },
{ url = "https://files.pythonhosted.org/packages/92/b1/d3cbd4d988afb3d8e4db94ca953df429ed6db7282ed0e700d25e6c7bfc8d/google_crc32c-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:57a50a9035b75643996fbf224d6661e386c7162d1dfdab9bc4ca790947d1007f", size = 34435, upload-time = "2025-12-16T00:35:22.107Z" },
{ url = "https://files.pythonhosted.org/packages/21/88/8ecf3c2b864a490b9e7010c84fd203ec8cf3b280651106a3a74dd1b0ca72/google_crc32c-1.8.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:e6584b12cb06796d285d09e33f63309a09368b9d806a551d8036a4207ea43697", size = 31301, upload-time = "2025-12-16T00:24:48.527Z" },
{ url = "https://files.pythonhosted.org/packages/36/c6/f7ff6c11f5ca215d9f43d3629163727a272eabc356e5c9b2853df2bfe965/google_crc32c-1.8.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:f4b51844ef67d6cf2e9425983274da75f18b1597bb2c998e1c0a0e8d46f8f651", size = 30868, upload-time = "2025-12-16T00:48:12.163Z" },
{ url = "https://files.pythonhosted.org/packages/56/15/c25671c7aad70f8179d858c55a6ae8404902abe0cdcf32a29d581792b491/google_crc32c-1.8.0-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b0d1a7afc6e8e4635564ba8aa5c0548e3173e41b6384d7711a9123165f582de2", size = 33381, upload-time = "2025-12-16T00:40:26.268Z" },
{ url = "https://files.pythonhosted.org/packages/42/fa/f50f51260d7b0ef5d4898af122d8a7ec5a84e2984f676f746445f783705f/google_crc32c-1.8.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8b3f68782f3cbd1bce027e48768293072813469af6a61a86f6bb4977a4380f21", size = 33734, upload-time = "2025-12-16T00:40:27.028Z" },
{ url = "https://files.pythonhosted.org/packages/08/a5/7b059810934a09fb3ccb657e0843813c1fee1183d3bc2c8041800374aa2c/google_crc32c-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:d511b3153e7011a27ab6ee6bb3a5404a55b994dc1a7322c0b87b29606d9790e2", size = 34878, upload-time = "2025-12-16T00:35:23.142Z" },
{ url = "https://files.pythonhosted.org/packages/52/c5/c171e4d8c44fec1422d801a6d2e5d7ddabd733eeda505c79730ee9607f07/google_crc32c-1.8.0-pp311-pypy311_pp73-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:87fa445064e7db928226b2e6f0d5304ab4cd0339e664a4e9a25029f384d9bb93", size = 28615, upload-time = "2025-12-16T00:40:29.298Z" },
{ url = "https://files.pythonhosted.org/packages/9c/97/7d75fe37a7a6ed171a2cf17117177e7aab7e6e0d115858741b41e9dd4254/google_crc32c-1.8.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f639065ea2042d5c034bf258a9f085eaa7af0cd250667c0635a3118e8f92c69c", size = 28800, upload-time = "2025-12-16T00:40:30.322Z" },
]
[[package]]
name = "google-genai"
version = "1.52.0"
@@ -2324,18 +2200,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/ec/66/03f663e7bca7abe9ccfebe6cb3fe7da9a118fd723a5abb278d6117e7990e/google_genai-1.52.0-py3-none-any.whl", hash = "sha256:c8352b9f065ae14b9322b949c7debab8562982f03bf71d44130cd2b798c20743", size = 261219, upload-time = "2025-11-21T02:18:54.515Z" },
]
[[package]]
name = "google-resumable-media"
version = "2.8.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-crc32c" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3f/d1/b1ea14b93b6b78f57fc580125de44e9f593ab88dd2460f1a8a8d18f74754/google_resumable_media-2.8.2.tar.gz", hash = "sha256:f3354a182ebd193ae3f42e3ef95e6c9b10f128320de23ac7637236713b1acd70", size = 2164510, upload-time = "2026-03-30T23:34:25.369Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5e/f8/50bfaf4658431ff9de45c5c3935af7ab01157a4903c603cd0eee6e78e087/google_resumable_media-2.8.2-py3-none-any.whl", hash = "sha256:82b6d8ccd11765268cdd2a2123f417ec806b8eef3000a9a38dfe3033da5fb220", size = 81511, upload-time = "2026-03-30T23:34:09.671Z" },
]
[[package]]
name = "googleapis-common-protos"
version = "1.72.0"
@@ -2348,11 +2212,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c4/ab/09169d5a4612a5f92490806649ac8d41e3ec9129c636754575b3553f4ea4/googleapis_common_protos-1.72.0-py3-none-any.whl", hash = "sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038", size = 297515, upload-time = "2025-11-06T18:29:13.14Z" },
]
[package.optional-dependencies]
grpc = [
{ name = "grpcio" },
]
[[package]]
name = "greenlet"
version = "3.2.4"
@@ -2403,85 +2262,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" },
]
[[package]]
name = "grpc-google-iam-v1"
version = "0.14.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "googleapis-common-protos", extra = ["grpc"] },
{ name = "grpcio" },
{ name = "protobuf" },
]
sdist = { url = "https://files.pythonhosted.org/packages/44/4f/d098419ad0bfc06c9ce440575f05aa22d8973b6c276e86ac7890093d3c37/grpc_google_iam_v1-0.14.4.tar.gz", hash = "sha256:392b3796947ed6334e61171d9ab06bf7eb357f554e5fc7556ad7aab6d0e17038", size = 23706, upload-time = "2026-04-01T01:57:49.813Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/89/22/c2dd50c09bf679bd38173656cd4402d2511e563b33bc88f90009cf50613c/grpc_google_iam_v1-0.14.4-py3-none-any.whl", hash = "sha256:412facc320fcbd94034b4df3d557662051d4d8adfa86e0ddb4dca70a3f739964", size = 32675, upload-time = "2026-04-01T01:57:47.69Z" },
]
[[package]]
name = "grpcio"
version = "1.80.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "typing-extensions" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b7/48/af6173dbca4454f4637a4678b67f52ca7e0c1ed7d5894d89d434fecede05/grpcio-1.80.0.tar.gz", hash = "sha256:29aca15edd0688c22ba01d7cc01cb000d72b2033f4a3c72a81a19b56fd143257", size = 12978905, upload-time = "2026-03-30T08:49:10.502Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5d/db/1d56e5f5823257b291962d6c0ce106146c6447f405b60b234c4f222a7cde/grpcio-1.80.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:dfab85db094068ff42e2a3563f60ab3dddcc9d6488a35abf0132daec13209c8a", size = 6055009, upload-time = "2026-03-30T08:46:46.265Z" },
{ url = "https://files.pythonhosted.org/packages/6e/18/c83f3cad64c5ca63bca7e91e5e46b0d026afc5af9d0a9972472ceba294b3/grpcio-1.80.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:5c07e82e822e1161354e32da2662f741a4944ea955f9f580ec8fb409dd6f6060", size = 12035295, upload-time = "2026-03-30T08:46:49.099Z" },
{ url = "https://files.pythonhosted.org/packages/0f/8e/e14966b435be2dda99fbe89db9525ea436edc79780431a1c2875a3582644/grpcio-1.80.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba0915d51fd4ced2db5ff719f84e270afe0e2d4c45a7bdb1e8d036e4502928c2", size = 6610297, upload-time = "2026-03-30T08:46:52.123Z" },
{ url = "https://files.pythonhosted.org/packages/cc/26/d5eb38f42ce0e3fdc8174ea4d52036ef8d58cc4426cb800f2610f625dd75/grpcio-1.80.0-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:3cb8130ba457d2aa09fa6b7c3ed6b6e4e6a2685fce63cb803d479576c4d80e21", size = 7300208, upload-time = "2026-03-30T08:46:54.859Z" },
{ url = "https://files.pythonhosted.org/packages/25/51/bd267c989f85a17a5b3eea65a6feb4ff672af41ca614e5a0279cc0ea381c/grpcio-1.80.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:09e5e478b3d14afd23f12e49e8b44c8684ac3c5f08561c43a5b9691c54d136ab", size = 6813442, upload-time = "2026-03-30T08:46:57.056Z" },
{ url = "https://files.pythonhosted.org/packages/9e/d9/d80eef735b19e9169e30164bbf889b46f9df9127598a83d174eb13a48b26/grpcio-1.80.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:00168469238b022500e486c1c33916acf2f2a9b2c022202cf8a1885d2e3073c1", size = 7414743, upload-time = "2026-03-30T08:46:59.682Z" },
{ url = "https://files.pythonhosted.org/packages/de/f2/567f5bd5054398ed6b0509b9a30900376dcf2786bd936812098808b49d8d/grpcio-1.80.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8502122a3cc1714038e39a0b071acb1207ca7844208d5ea0d091317555ee7106", size = 8426046, upload-time = "2026-03-30T08:47:02.474Z" },
{ url = "https://files.pythonhosted.org/packages/62/29/73ef0141b4732ff5eacd68430ff2512a65c004696997f70476a83e548e7e/grpcio-1.80.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ce1794f4ea6cc3ca29463f42d665c32ba1b964b48958a66497917fe9069f26e6", size = 7851641, upload-time = "2026-03-30T08:47:05.462Z" },
{ url = "https://files.pythonhosted.org/packages/46/69/abbfa360eb229a8623bab5f5a4f8105e445bd38ce81a89514ba55d281ad0/grpcio-1.80.0-cp311-cp311-win32.whl", hash = "sha256:51b4a7189b0bef2aa30adce3c78f09c83526cf3dddb24c6a96555e3b97340440", size = 4154368, upload-time = "2026-03-30T08:47:08.027Z" },
{ url = "https://files.pythonhosted.org/packages/6f/d4/ae92206d01183b08613e846076115f5ac5991bae358d2a749fa864da5699/grpcio-1.80.0-cp311-cp311-win_amd64.whl", hash = "sha256:02e64bb0bb2da14d947a49e6f120a75e947250aebe65f9629b62bb1f5c14e6e9", size = 4894235, upload-time = "2026-03-30T08:47:10.839Z" },
{ url = "https://files.pythonhosted.org/packages/5c/e8/a2b749265eb3415abc94f2e619bbd9e9707bebdda787e61c593004ec927a/grpcio-1.80.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:c624cc9f1008361014378c9d776de7182b11fe8b2e5a81bc69f23a295f2a1ad0", size = 6015616, upload-time = "2026-03-30T08:47:13.428Z" },
{ url = "https://files.pythonhosted.org/packages/3e/97/b1282161a15d699d1e90c360df18d19165a045ce1c343c7f313f5e8a0b77/grpcio-1.80.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:f49eddcac43c3bf350c0385366a58f36bed8cc2c0ec35ef7b74b49e56552c0c2", size = 12014204, upload-time = "2026-03-30T08:47:15.873Z" },
{ url = "https://files.pythonhosted.org/packages/6e/5e/d319c6e997b50c155ac5a8cb12f5173d5b42677510e886d250d50264949d/grpcio-1.80.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d334591df610ab94714048e0d5b4f3dd5ad1bee74dfec11eee344220077a79de", size = 6563866, upload-time = "2026-03-30T08:47:18.588Z" },
{ url = "https://files.pythonhosted.org/packages/ae/f6/fdd975a2cb4d78eb67769a7b3b3830970bfa2e919f1decf724ae4445f42c/grpcio-1.80.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:0cb517eb1d0d0aaf1d87af7cc5b801d686557c1d88b2619f5e31fab3c2315921", size = 7273060, upload-time = "2026-03-30T08:47:21.113Z" },
{ url = "https://files.pythonhosted.org/packages/db/f0/a3deb5feba60d9538a962913e37bd2e69a195f1c3376a3dd44fe0427e996/grpcio-1.80.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4e78c4ac0d97dc2e569b2f4bcbbb447491167cb358d1a389fc4af71ab6f70411", size = 6782121, upload-time = "2026-03-30T08:47:23.827Z" },
{ url = "https://files.pythonhosted.org/packages/ca/84/36c6dcfddc093e108141f757c407902a05085e0c328007cb090d56646cdf/grpcio-1.80.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2ed770b4c06984f3b47eb0517b1c69ad0b84ef3f40128f51448433be904634cd", size = 7383811, upload-time = "2026-03-30T08:47:26.517Z" },
{ url = "https://files.pythonhosted.org/packages/7c/ef/f3a77e3dc5b471a0ec86c564c98d6adfa3510d38f8ee99010410858d591e/grpcio-1.80.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:256507e2f524092f1473071a05e65a5b10d84b82e3ff24c5b571513cfaa61e2f", size = 8393860, upload-time = "2026-03-30T08:47:29.439Z" },
{ url = "https://files.pythonhosted.org/packages/9b/8d/9d4d27ed7f33d109c50d6b5ce578a9914aa68edab75d65869a17e630a8d1/grpcio-1.80.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9a6284a5d907c37db53350645567c522be314bac859a64a7a5ca63b77bb7958f", size = 7830132, upload-time = "2026-03-30T08:47:33.254Z" },
{ url = "https://files.pythonhosted.org/packages/14/e4/9990b41c6d7a44e1e9dee8ac11d7a9802ba1378b40d77468a7761d1ad288/grpcio-1.80.0-cp312-cp312-win32.whl", hash = "sha256:c71309cfce2f22be26aa4a847357c502db6c621f1a49825ae98aa0907595b193", size = 4140904, upload-time = "2026-03-30T08:47:35.319Z" },
{ url = "https://files.pythonhosted.org/packages/2f/2c/296f6138caca1f4b92a31ace4ae1b87dab692fc16a7a3417af3bb3c805bf/grpcio-1.80.0-cp312-cp312-win_amd64.whl", hash = "sha256:9fe648599c0e37594c4809d81a9e77bd138cc82eb8baa71b6a86af65426723ff", size = 4880944, upload-time = "2026-03-30T08:47:37.831Z" },
{ url = "https://files.pythonhosted.org/packages/2f/3a/7c3c25789e3f069e581dc342e03613c5b1cb012c4e8c7d9d5cf960a75856/grpcio-1.80.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:e9e408fc016dffd20661f0126c53d8a31c2821b5c13c5d67a0f5ed5de93319ad", size = 6017243, upload-time = "2026-03-30T08:47:40.075Z" },
{ url = "https://files.pythonhosted.org/packages/04/19/21a9806eb8240e174fd1ab0cd5b9aa948bb0e05c2f2f55f9d5d7405e6d08/grpcio-1.80.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:92d787312e613754d4d8b9ca6d3297e69994a7912a32fa38c4c4e01c272974b0", size = 12010840, upload-time = "2026-03-30T08:47:43.11Z" },
{ url = "https://files.pythonhosted.org/packages/18/3a/23347d35f76f639e807fb7a36fad3068aed100996849a33809591f26eca6/grpcio-1.80.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8ac393b58aa16991a2f1144ec578084d544038c12242da3a215966b512904d0f", size = 6567644, upload-time = "2026-03-30T08:47:46.806Z" },
{ url = "https://files.pythonhosted.org/packages/ff/40/96e07ecb604a6a67ae6ab151e3e35b132875d98bc68ec65f3e5ab3e781d7/grpcio-1.80.0-cp313-cp313-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:68e5851ac4b9afe07e7f84483803ad167852570d65326b34d54ca560bfa53fb6", size = 7277830, upload-time = "2026-03-30T08:47:49.643Z" },
{ url = "https://files.pythonhosted.org/packages/9b/e2/da1506ecea1f34a5e365964644b35edef53803052b763ca214ba3870c856/grpcio-1.80.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:873ff5d17d68992ef6605330127425d2fc4e77e612fa3c3e0ed4e668685e3140", size = 6783216, upload-time = "2026-03-30T08:47:52.817Z" },
{ url = "https://files.pythonhosted.org/packages/44/83/3b20ff58d0c3b7f6caaa3af9a4174d4023701df40a3f39f7f1c8e7c48f9d/grpcio-1.80.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2bea16af2750fd0a899bf1abd9022244418b55d1f37da2202249ba4ba673838d", size = 7385866, upload-time = "2026-03-30T08:47:55.687Z" },
{ url = "https://files.pythonhosted.org/packages/47/45/55c507599c5520416de5eefecc927d6a0d7af55e91cfffb2e410607e5744/grpcio-1.80.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ba0db34f7e1d803a878284cd70e4c63cb6ae2510ba51937bf8f45ba997cefcf7", size = 8391602, upload-time = "2026-03-30T08:47:58.303Z" },
{ url = "https://files.pythonhosted.org/packages/10/bb/dd06f4c24c01db9cf11341b547d0a016b2c90ed7dbbb086a5710df7dd1d7/grpcio-1.80.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8eb613f02d34721f1acf3626dfdb3545bd3c8505b0e52bf8b5710a28d02e8aa7", size = 7826752, upload-time = "2026-03-30T08:48:01.311Z" },
{ url = "https://files.pythonhosted.org/packages/f9/1e/9d67992ba23371fd63d4527096eb8c6b76d74d52b500df992a3343fd7251/grpcio-1.80.0-cp313-cp313-win32.whl", hash = "sha256:93b6f823810720912fd131f561f91f5fed0fda372b6b7028a2681b8194d5d294", size = 4142310, upload-time = "2026-03-30T08:48:04.594Z" },
{ url = "https://files.pythonhosted.org/packages/cf/e6/283326a27da9e2c3038bc93eeea36fb118ce0b2d03922a9cda6688f53c5b/grpcio-1.80.0-cp313-cp313-win_amd64.whl", hash = "sha256:e172cf795a3ba5246d3529e4d34c53db70e888fa582a8ffebd2e6e48bc0cba50", size = 4882833, upload-time = "2026-03-30T08:48:07.363Z" },
{ url = "https://files.pythonhosted.org/packages/c5/6d/e65307ce20f5a09244ba9e9d8476e99fb039de7154f37fb85f26978b59c3/grpcio-1.80.0-cp314-cp314-linux_armv7l.whl", hash = "sha256:3d4147a97c8344d065d01bbf8b6acec2cf86fb0400d40696c8bdad34a64ffc0e", size = 6017376, upload-time = "2026-03-30T08:48:10.005Z" },
{ url = "https://files.pythonhosted.org/packages/69/10/9cef5d9650c72625a699c549940f0abb3c4bfdb5ed45a5ce431f92f31806/grpcio-1.80.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:d8e11f167935b3eb089ac9038e1a063e6d7dbe995c0bb4a661e614583352e76f", size = 12018133, upload-time = "2026-03-30T08:48:12.927Z" },
{ url = "https://files.pythonhosted.org/packages/04/82/983aabaad82ba26113caceeb9091706a0696b25da004fe3defb5b346e15b/grpcio-1.80.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f14b618fc30de822681ee986cfdcc2d9327229dc4c98aed16896761cacd468b9", size = 6574748, upload-time = "2026-03-30T08:48:16.386Z" },
{ url = "https://files.pythonhosted.org/packages/07/d7/031666ef155aa0bf399ed7e19439656c38bbd143779ae0861b038ce82abd/grpcio-1.80.0-cp314-cp314-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:4ed39fbdcf9b87370f6e8df4e39ca7b38b3e5e9d1b0013c7b6be9639d6578d14", size = 7277711, upload-time = "2026-03-30T08:48:19.627Z" },
{ url = "https://files.pythonhosted.org/packages/e8/43/f437a78f7f4f1d311804189e8f11fb311a01049b2e08557c1068d470cb2e/grpcio-1.80.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2dcc70e9f0ba987526e8e8603a610fb4f460e42899e74e7a518bf3c68fe1bf05", size = 6785372, upload-time = "2026-03-30T08:48:22.373Z" },
{ url = "https://files.pythonhosted.org/packages/93/3d/f6558e9c6296cb4227faa5c43c54a34c68d32654b829f53288313d16a86e/grpcio-1.80.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:448c884b668b868562b1bda833c5fce6272d26e1926ec46747cda05741d302c1", size = 7395268, upload-time = "2026-03-30T08:48:25.638Z" },
{ url = "https://files.pythonhosted.org/packages/06/21/0fdd77e84720b08843c371a2efa6f2e19dbebf56adc72df73d891f5506f0/grpcio-1.80.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:a1dc80fe55685b4a543555e6eef975303b36c8db1023b1599b094b92aa77965f", size = 8392000, upload-time = "2026-03-30T08:48:28.974Z" },
{ url = "https://files.pythonhosted.org/packages/f5/68/67f4947ed55d2e69f2cc199ab9fd85e0a0034d813bbeef84df6d2ba4d4b7/grpcio-1.80.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:31b9ac4ad1aa28ffee5503821fafd09e4da0a261ce1c1281c6c8da0423c83b6e", size = 7828477, upload-time = "2026-03-30T08:48:32.054Z" },
{ url = "https://files.pythonhosted.org/packages/44/b6/8d4096691b2e385e8271911a0de4f35f0a6c7d05aff7098e296c3de86939/grpcio-1.80.0-cp314-cp314-win32.whl", hash = "sha256:367ce30ba67d05e0592470428f0ec1c31714cab9ef19b8f2e37be1f4c7d32fae", size = 4218563, upload-time = "2026-03-30T08:48:34.538Z" },
{ url = "https://files.pythonhosted.org/packages/e5/8c/bbe6baf2557262834f2070cf668515fa308b2d38a4bbf771f8f7872a7036/grpcio-1.80.0-cp314-cp314-win_amd64.whl", hash = "sha256:3b01e1f5464c583d2f567b2e46ff0d516ef979978f72091fd81f5ab7fa6e2e7f", size = 5019457, upload-time = "2026-03-30T08:48:37.308Z" },
]
[[package]]
name = "grpcio-status"
version = "1.80.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "googleapis-common-protos" },
{ name = "grpcio" },
{ name = "protobuf" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b1/ed/105f619bdd00cb47a49aa2feea6232ea2bbb04199d52a22cc6a7d603b5cb/grpcio_status-1.80.0.tar.gz", hash = "sha256:df73802a4c89a3ea88aa2aff971e886fccce162bc2e6511408b3d67a144381cd", size = 13901, upload-time = "2026-03-30T08:54:34.784Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/76/80/58cd2dfc19a07d022abe44bde7c365627f6c7cb6f692ada6c65ca437d09a/grpcio_status-1.80.0-py3-none-any.whl", hash = "sha256:4b56990363af50dbf2c2ebb80f1967185c07d87aa25aa2bea45ddb75fc181dbe", size = 14638, upload-time = "2026-03-30T08:54:01.569Z" },
]
[[package]]
name = "h11"
version = "0.16.0"
@@ -3384,11 +3164,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/e6/05/3516cc7386b220d388aa0bd833308c677e94eceb82b2756dd95e06f6a13f/litellm-1.81.6-py3-none-any.whl", hash = "sha256:573206ba194d49a1691370ba33f781671609ac77c35347f8a0411d852cf6341a", size = 12224343, upload-time = "2026-02-01T04:02:23.704Z" },
]
[package.optional-dependencies]
google = [
{ name = "google-cloud-aiplatform" },
]
[[package]]
name = "locket"
version = "1.0.0"
@@ -3503,14 +3278,14 @@ wheels = [
[[package]]
name = "mako"
version = "1.3.11"
version = "1.2.4"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "markupsafe" },
]
sdist = { url = "https://files.pythonhosted.org/packages/59/8a/805404d0c0b9f3d7a326475ca008db57aea9c5c9f2e1e39ed0faa335571c/mako-1.3.11.tar.gz", hash = "sha256:071eb4ab4c5010443152255d77db7faa6ce5916f35226eb02dc34479b6858069", size = 399811, upload-time = "2026-04-14T20:19:51.493Z" }
sdist = { url = "https://files.pythonhosted.org/packages/05/5f/2ba6e026d33a0e6ddc1dddf9958677f76f5f80c236bd65309d280b166d3e/Mako-1.2.4.tar.gz", hash = "sha256:d60a3903dc3bb01a18ad6a89cdbe2e4eadc69c0bc8ef1e3773ba53d44c3f7a34", size = 497021, upload-time = "2022-11-15T14:37:51.327Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/68/a5/19d7aaa7e433713ffe881df33705925a196afb9532efc8475d26593921a6/mako-1.3.11-py3-none-any.whl", hash = "sha256:e372c6e333cf004aa736a15f425087ec977e1fcbd2966aae7f17c8dc1da27a77", size = 78503, upload-time = "2026-04-14T20:19:53.233Z" },
{ url = "https://files.pythonhosted.org/packages/03/3b/68690a035ba7347860f1b8c0cde853230ba69ff41df5884ea7d89fe68cd3/Mako-1.2.4-py3-none-any.whl", hash = "sha256:c97c79c018b9165ac9922ae4f32da095ffd3c4e6872b45eded42926deea46818", size = 78672, upload-time = "2022-11-15T14:37:53.675Z" },
]
[[package]]
@@ -4429,7 +4204,7 @@ dependencies = [
{ name = "fastapi" },
{ name = "google-genai" },
{ name = "kubernetes" },
{ name = "litellm", extra = ["google"] },
{ name = "litellm" },
{ name = "openai" },
{ name = "prometheus-client" },
{ name = "prometheus-fastapi-instrumentator" },
@@ -4602,7 +4377,7 @@ requires-dist = [
{ name = "fastapi", specifier = "==0.133.1" },
{ name = "google-genai", specifier = "==1.52.0" },
{ name = "kubernetes", specifier = ">=31.0.0" },
{ name = "litellm", extras = ["google"], specifier = "==1.81.6" },
{ name = "litellm", specifier = "==1.81.6" },
{ name = "openai", specifier = "==2.14.0" },
{ name = "prometheus-client", specifier = ">=0.21.1" },
{ name = "prometheus-fastapi-instrumentator", specifier = "==7.1.0" },
@@ -4655,7 +4430,7 @@ backend = [
{ name = "langfuse", specifier = "==3.10.0" },
{ name = "lazy-imports", specifier = "==1.0.1" },
{ name = "lxml", specifier = "==5.3.0" },
{ name = "mako", specifier = "==1.3.11" },
{ name = "mako", specifier = "==1.2.4" },
{ name = "markitdown", extras = ["pdf", "docx", "pptx", "xlsx", "xls"], specifier = "==0.1.2" },
{ name = "mcp", extras = ["cli"], specifier = "==1.26.0" },
{ name = "mistune", specifier = "==3.2.0" },
@@ -4678,7 +4453,7 @@ backend = [
{ name = "pygithub", specifier = "==2.5.0" },
{ name = "pympler", specifier = "==1.1" },
{ name = "pypandoc-binary", specifier = "==1.16.2" },
{ name = "pypdf", specifier = "==6.10.2" },
{ name = "pypdf", specifier = "==6.10.0" },
{ name = "pytest-mock", specifier = "==3.12.0" },
{ name = "pytest-playwright", specifier = "==0.7.2" },
{ name = "python-dateutil", specifier = "==2.8.2" },
@@ -5928,11 +5703,11 @@ wheels = [
[[package]]
name = "pypdf"
version = "6.10.2"
version = "6.10.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/7b/3f/9f2167401c2e94833ca3b69535bad89e533b5de75fefe4197a2c224baec2/pypdf-6.10.2.tar.gz", hash = "sha256:7d09ce108eff6bf67465d461b6ef352dcb8d84f7a91befc02f904455c6eea11d", size = 5315679, upload-time = "2026-04-15T16:37:36.978Z" }
sdist = { url = "https://files.pythonhosted.org/packages/b8/9f/ca96abf18683ca12602065e4ed2bec9050b672c87d317f1079abc7b6d993/pypdf-6.10.0.tar.gz", hash = "sha256:4c5a48ba258c37024ec2505f7e8fd858525f5502784a2e1c8d415604af29f6ef", size = 5314833, upload-time = "2026-04-10T09:34:57.102Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/0c/d6/1d5c60cc17bbdf37c1552d9c03862fc6d32c5836732a0415b2d637edc2d0/pypdf-6.10.2-py3-none-any.whl", hash = "sha256:aa53be9826655b51c96741e5d7983ca224d898ac0a77896e64636810517624aa", size = 336308, upload-time = "2026-04-15T16:37:34.851Z" },
{ url = "https://files.pythonhosted.org/packages/55/f2/7ebe366f633f30a6ad105f650f44f24f98cb1335c4157d21ae47138b3482/pypdf-6.10.0-py3-none-any.whl", hash = "sha256:90005e959e1596c6e6c84c8b0ad383285b3e17011751cedd17f2ce8fcdfc86de", size = 334459, upload-time = "2026-04-10T09:34:54.966Z" },
]
[[package]]

View File

@@ -55,7 +55,7 @@ A two-axis layout component that automatically routes to the correct internal la
Wraps `Content` and adds a `rightChildren` slot. Accepts all `Content` props plus:
- `rightChildren`: `ReactNode` — actions rendered on the right
- `padding`: `SizeVariant` — controls outer padding
- `paddingVariant`: `SizeVariant` — controls outer padding
```typescript
<ContentAction
@@ -544,7 +544,7 @@ function UserCard({
## 4. Spacing Guidelines
**Prefer padding over margins for spacing. When a library component exposes a padding prop
(e.g., `padding`), use that prop instead of wrapping it in a `<div>` with padding classes.
(e.g., `paddingVariant`), use that prop instead of wrapping it in a `<div>` with padding classes.
If a library component does not expose a padding override and you find yourself adding a wrapper
div for spacing, consider updating the library component to accept one.**
@@ -553,7 +553,7 @@ divs that exist solely for spacing.
```typescript
// ✅ Good — use the component's padding prop
<ContentAction padding="md" ... />
<ContentAction paddingVariant="md" ... />
// ✅ Good — padding utilities when no component prop exists
<div className="p-4 space-y-2">

View File

@@ -82,10 +82,7 @@ ARG NODE_OPTIONS
# SENTRY_AUTH_TOKEN is injected via BuildKit secret mount so it is never written
# to any image layer, build cache, or registry manifest.
# Use NODE_OPTIONS in the build command
RUN --mount=type=secret,id=sentry_auth_token \
if [ -f /run/secrets/sentry_auth_token ]; then \
export SENTRY_AUTH_TOKEN="$(cat /run/secrets/sentry_auth_token)"; \
fi && \
RUN --mount=type=secret,id=sentry_auth_token,env=SENTRY_AUTH_TOKEN \
NODE_OPTIONS="${NODE_OPTIONS}" npx next build
# Step 2. Production image, copy all the files and run next

View File

@@ -68,7 +68,9 @@ SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
# Run the conversion into a temp file so a failed run doesn't destroy an existing .tsx
TMPFILE="${BASE_NAME}.tsx.tmp"
if bunx @svgr/cli "$SVG_FILE" --typescript --svgo-config "$SVGO_CONFIG" --template "${SCRIPT_DIR}/icon-template.js" > "$TMPFILE"; then
bunx @svgr/cli "$SVG_FILE" --typescript --svgo-config "$SVGO_CONFIG" --template "${SCRIPT_DIR}/icon-template.js" > "$TMPFILE"
if [ $? -eq 0 ]; then
# Verify the temp file has content before replacing the destination
if [ ! -s "$TMPFILE" ]; then
rm -f "$TMPFILE"
@@ -82,14 +84,16 @@ if bunx @svgr/cli "$SVG_FILE" --typescript --svgo-config "$SVGO_CONFIG" --templa
# Using perl for cross-platform compatibility (works on macOS, Linux, Windows with WSL)
# Note: perl -i returns 0 even on some failures, so we validate the output
if ! perl -i -pe 's/<svg/<svg width={size} height={size}/g' "${BASE_NAME}.tsx"; then
perl -i -pe 's/<svg/<svg width={size} height={size}/g' "${BASE_NAME}.tsx"
if [ $? -ne 0 ]; then
echo "Error: Failed to add width/height attributes" >&2
exit 1
fi
# Icons additionally get stroke="currentColor"
if [ "$MODE" = "icon" ]; then
if ! perl -i -pe 's/\{\.\.\.props\}/stroke="currentColor" {...props}/g' "${BASE_NAME}.tsx"; then
perl -i -pe 's/\{\.\.\.props\}/stroke="currentColor" {...props}/g' "${BASE_NAME}.tsx"
if [ $? -ne 0 ]; then
echo "Error: Failed to add stroke attribute" >&2
exit 1
fi

View File

@@ -95,9 +95,9 @@ function Button({
<Interactive.Container
type={type}
border={interactiveProps.prominence === "secondary"}
size={size}
width={width}
rounding={isLarge ? "md" : size === "2xs" ? "xs" : "sm"}
heightVariant={size}
widthVariant={width}
roundingVariant={isLarge ? "md" : size === "2xs" ? "xs" : "sm"}
>
<div className="flex flex-row items-center gap-1">
{iconWrapper(Icon, size, !!children)}

View File

@@ -8,13 +8,13 @@ A composite component that wraps `Interactive.Stateful > Interactive.Container >
```
Interactive.Stateful <- selectVariant, state, interaction, onClick, href, ref
└─ Interactive.Container <- type, width, rounding
└─ ContentAction <- withInteractive, padding="lg"
└─ Interactive.Container <- type, width, roundingVariant
└─ ContentAction <- withInteractive, paddingVariant="lg"
├─ Content <- icon, title, description, sizePreset, variant, ...
└─ rightChildren
```
`padding` is hardcoded to `"lg"` and `withInteractive` is always `true`. These are not exposed as props.
`paddingVariant` is hardcoded to `"lg"` and `withInteractive` is always `true`. These are not exposed as props.
## Props
@@ -35,7 +35,7 @@ Interactive.Stateful <- selectVariant, state, interaction, onClick, href
| Prop | Type | Default | Description |
|------|------|---------|-------------|
| `rounding` | `InteractiveContainerRoundingVariant` | `"md"` | Corner rounding preset (height is content-driven) |
| `roundingVariant` | `InteractiveContainerRoundingVariant` | `"md"` | Corner rounding preset (height is content-driven) |
| `width` | `WidthVariant` | `"full"` | Container width |
| `type` | `"submit" \| "button" \| "reset"` | `"button"` | HTML button type |
| `tooltip` | `string` | — | Tooltip text shown on hover |
@@ -63,7 +63,7 @@ import { LineItemButton } from "@opal/components";
<LineItemButton
selectVariant="select-heavy"
state={isSelected ? "selected" : "empty"}
rounding="sm"
roundingVariant="sm"
onClick={handleClick}
title="gpt-4o"
sizePreset="main-ui"

View File

@@ -5,7 +5,8 @@ import {
} from "@opal/core";
import type { ExtremaSizeVariants, DistributiveOmit } from "@opal/types";
import { Tooltip, type TooltipSide } from "@opal/components";
import { type ContentActionProps, ContentAction } from "@opal/layouts";
import type { ContentActionProps } from "@opal/layouts/content-action/components";
import { ContentAction } from "@opal/layouts";
// ---------------------------------------------------------------------------
// Types
@@ -13,7 +14,7 @@ import { type ContentActionProps, ContentAction } from "@opal/layouts";
type ContentPassthroughProps = DistributiveOmit<
ContentActionProps,
"padding" | "width" | "ref"
"paddingVariant" | "widthVariant" | "ref"
>;
type LineItemButtonOwnProps = Pick<
@@ -31,7 +32,7 @@ type LineItemButtonOwnProps = Pick<
selectVariant?: "select-light" | "select-heavy";
/** Corner rounding preset (height is always content-driven). @default "md" */
rounding?: InteractiveContainerRoundingVariant;
roundingVariant?: InteractiveContainerRoundingVariant;
/** Container width. @default "full" */
width?: ExtremaSizeVariants;
@@ -62,7 +63,7 @@ function LineItemButton({
type = "button",
// Sizing
rounding = "md",
roundingVariant = "md",
width = "full",
tooltip,
tooltipSide = "top",
@@ -83,16 +84,14 @@ function LineItemButton({
>
<Interactive.Container
type={type}
width={width}
size="fit"
rounding={rounding}
widthVariant={width}
heightVariant="lg"
roundingVariant={roundingVariant}
>
<div className="w-full p-2">
<ContentAction
{...(contentActionProps as ContentActionProps)}
padding="fit"
/>
</div>
<ContentAction
{...(contentActionProps as ContentActionProps)}
paddingVariant="fit"
/>
</Interactive.Container>
</Interactive.Stateful>
);

View File

@@ -1,112 +0,0 @@
import React from "react";
import type { Meta, StoryObj } from "@storybook/react";
import { LinkButton } from "@opal/components";
import * as TooltipPrimitive from "@radix-ui/react-tooltip";
const meta: Meta<typeof LinkButton> = {
title: "opal/components/LinkButton",
component: LinkButton,
tags: ["autodocs"],
decorators: [
(Story) => (
<TooltipPrimitive.Provider>
<Story />
</TooltipPrimitive.Provider>
),
],
};
export default meta;
type Story = StoryObj<typeof LinkButton>;
// ─── Anchor mode ────────────────────────────────────────────────────────────
export const Default: Story = {
render: () => <LinkButton href="/">Home</LinkButton>,
};
export const ExternalLink: Story = {
render: () => (
<LinkButton href="https://onyx.app" target="_blank">
Onyx
</LinkButton>
),
};
export const LongLabel: Story = {
render: () => (
<LinkButton href="https://docs.onyx.app" target="_blank">
Go read the full Onyx documentation site
</LinkButton>
),
};
// ─── Button mode ────────────────────────────────────────────────────────────
export const AsButton: Story = {
render: () => (
<LinkButton onClick={() => alert("clicked")}>Click me</LinkButton>
),
};
// ─── Disabled ───────────────────────────────────────────────────────────────
export const DisabledLink: Story = {
render: () => (
<LinkButton href="/" disabled>
Disabled link
</LinkButton>
),
};
export const DisabledButton: Story = {
render: () => (
<LinkButton onClick={() => alert("should not fire")} disabled>
Disabled button
</LinkButton>
),
};
// ─── Tooltip ────────────────────────────────────────────────────────────────
export const Tooltip: Story = {
render: () => (
<LinkButton href="/" tooltip="This is a tooltip">
Hover me
</LinkButton>
),
};
export const TooltipSides: Story = {
render: () => (
<div className="flex flex-col gap-8 p-16">
<LinkButton href="/" tooltip="Tooltip on top" tooltipSide="top">
top
</LinkButton>
<LinkButton href="/" tooltip="Tooltip on right" tooltipSide="right">
right
</LinkButton>
<LinkButton href="/" tooltip="Tooltip on bottom" tooltipSide="bottom">
bottom
</LinkButton>
<LinkButton href="/" tooltip="Tooltip on left" tooltipSide="left">
left
</LinkButton>
</div>
),
};
// ─── Inline in prose ────────────────────────────────────────────────────────
export const InlineInProse: Story = {
render: () => (
<p style={{ maxWidth: "36rem", lineHeight: 1.7 }}>
Modifying embedding settings requires a full re-index of all documents and
may take hours or days depending on corpus size.{" "}
<LinkButton href="https://docs.onyx.app" target="_blank">
Learn more
</LinkButton>
.
</p>
),
};

Some files were not shown because too many files have changed in this diff Show More