Compare commits

..

1 Commit

Author SHA1 Message Date
Martin Bergamasco
a7b95c42d3 fix: consolidate Button/ and button/ case-split for Linux builds 2026-04-18 16:06:50 +02:00
47 changed files with 435 additions and 2204 deletions

View File

@@ -16,13 +16,9 @@
"source=onyx-devcontainer-local,target=/home/dev/.local,type=volume"
],
"containerEnv": {
"MODEL_SERVER_HOST": "inference_model_server",
"OPENSEARCH_HOST": "opensearch",
"POSTGRES_HOST": "relational_db",
"REDIS_HOST": "cache",
"S3_ENDPOINT_URL": "http://minio:9000",
"SSH_AUTH_SOCK": "/tmp/ssh-agent.sock",
"VESPA_HOST": "index"
"POSTGRES_HOST": "relational_db",
"REDIS_HOST": "cache"
},
"remoteUser": "${localEnv:DEVCONTAINER_REMOTE_USER:dev}",
"updateRemoteUserUID": false,

View File

@@ -40,7 +40,6 @@ ALLOWED_DOMAINS=(
"api.anthropic.com"
"api-staging.anthropic.com"
"files.anthropic.com"
"huggingface.co"
"sentry.io"
"update.code.visualstudio.com"
"pypi.org"

View File

@@ -403,7 +403,7 @@ jobs:
echo "CERT_ID=$CERT_ID" >> $GITHUB_ENV
echo "Certificate imported."
- uses: tauri-apps/tauri-action@84b9d35b5fc46c1e45415bdb6144030364f7ebc5 # ratchet:tauri-apps/tauri-action@action-v0.6.2
- uses: tauri-apps/tauri-action@73fb865345c54760d875b94642314f8c0c894afa # ratchet:tauri-apps/tauri-action@action-v0.6.1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
APPLE_ID: ${{ env.APPLE_ID }}

View File

@@ -42,7 +42,7 @@ jobs:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # ratchet:actions/checkout@v6
with:
persist-credentials: false
- uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # zizmor: ignore[cache-poisoning]
- uses: actions/setup-go@4dc6199c7b1a012772edbd06daecab0f50c9053c # zizmor: ignore[cache-poisoning]
with:
go-version: ${{ env.GO_VERSION }}
cache-dependency-path: "**/go.sum"

View File

@@ -1,27 +0,0 @@
"""Add file_id to documents
Revision ID: 91d150c361f6
Revises: a6fcd3d631f9
Create Date: 2026-04-16 15:43:30.314823
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "91d150c361f6"
down_revision = "a6fcd3d631f9"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Add the nullable ``file_id`` column to the ``document`` table."""
    file_id_column = sa.Column("file_id", sa.String(), nullable=True)
    op.add_column("document", file_id_column)
def downgrade() -> None:
    """Drop the ``file_id`` column from the ``document`` table."""
    op.drop_column("document", "file_id")

View File

@@ -1,48 +0,0 @@
"""replace document sync index with partial index
Replaces the composite index ix_document_sync_status (last_modified, last_synced)
with a partial index ix_document_needs_sync that only indexes rows where
last_modified > last_synced OR last_synced IS NULL.
The old index was never used by the query planner (0 scans in pg_stat_user_indexes)
because Postgres cannot use a B-tree composite index to evaluate a comparison
between two columns in the same row combined with an OR/IS NULL condition.
The partial index makes count_documents_by_needs_sync ~4000x faster for tenants
with no stale documents (161ms -> 0.04ms on a 929K row table) and ~17x faster
for tenants with large backlogs (846ms -> 50ms on a 164K row table).
Revision ID: a6fcd3d631f9
Revises: d129f37b3d87
Create Date: 2026-04-17 16:00:00.000000
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "a6fcd3d631f9"
down_revision = "d129f37b3d87"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Create the partial needs-sync index, then drop the unused composite one.

    The partial index only covers rows that actually need syncing
    (last_modified > last_synced OR last_synced IS NULL), which the old
    composite B-tree index could never serve.
    """
    needs_sync_predicate = sa.text(
        "last_modified > last_synced OR last_synced IS NULL"
    )
    op.create_index(
        "ix_document_needs_sync",
        "document",
        ["id"],
        postgresql_where=needs_sync_predicate,
    )
    op.drop_index("ix_document_sync_status", table_name="document")
def downgrade() -> None:
    """Restore the old composite sync-status index, then drop the partial one."""
    composite_columns = ["last_modified", "last_synced"]
    op.create_index(
        "ix_document_sync_status",
        "document",
        composite_columns,
    )
    op.drop_index("ix_document_needs_sync", table_name="document")

View File

@@ -506,18 +506,6 @@ def _perform_external_group_sync(
ext_group_sync_func = sync_config.group_sync_config.group_sync_func
# Clean up stale rows from previous cycle BEFORE marking new ones.
# This ensures cleanup always runs regardless of whether the current
# sync succeeds — previously, cleanup only ran at the END of the sync,
# so if the sync failed (e.g. DB connection killed by
# idle_in_transaction_session_timeout during long API calls), stale
# rows would accumulate indefinitely.
logger.info(
f"Removing stale external groups from prior cycle for {source_type} "
f"for cc_pair: {cc_pair_id}"
)
remove_stale_external_groups(db_session, cc_pair_id)
logger.info(
f"Marking old external groups as stale for {source_type} for cc_pair: {cc_pair_id}"
)

View File

@@ -5,7 +5,6 @@ from pydantic import BaseModel
from sqlalchemy import delete
from sqlalchemy import select
from sqlalchemy import update
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.orm import Session
from onyx.access.utils import build_ext_group_name_for_onyx
@@ -78,15 +77,6 @@ def mark_old_external_groups_as_stale(
.where(PublicExternalUserGroup.cc_pair_id == cc_pair_id)
.values(stale=True)
)
# Commit immediately so the transaction closes before potentially long
# external API calls (e.g. Google Drive folder iteration). Without this,
# the DB connection sits idle-in-transaction during API calls and gets
# killed by idle_in_transaction_session_timeout, causing the entire sync
# to fail and stale cleanup to never run.
db_session.commit()
_UPSERT_BATCH_SIZE = 5000
def upsert_external_groups(
@@ -96,102 +86,91 @@ def upsert_external_groups(
source: DocumentSource,
) -> None:
"""
Batch upsert external user groups using INSERT ... ON CONFLICT DO UPDATE.
- For existing rows (same user_id, external_user_group_id, cc_pair_id),
sets stale=False
- For new rows, inserts with stale=False
- Same logic for PublicExternalUserGroup
Performs a true upsert operation for external user groups:
- For existing groups (same user_id, external_user_group_id, cc_pair_id), updates the stale flag to False
- For new groups, inserts them with stale=False
- For public groups, uses upsert logic as well
"""
# If there are no groups to add, return early
if not external_groups:
return
# Collect all emails from all groups to batch-add users at once
all_group_member_emails: set[str] = set()
# collect all emails from all groups to batch add all users at once for efficiency
all_group_member_emails = set()
for external_group in external_groups:
all_group_member_emails.update(external_group.user_emails)
for user_email in external_group.user_emails:
all_group_member_emails.add(user_email)
# Batch add users if they don't exist and get their ids
# batch add users if they don't exist and get their ids
all_group_members: list[User] = batch_add_ext_perm_user_if_not_exists(
db_session=db_session,
# NOTE: this function handles case sensitivity for emails
emails=list(all_group_member_emails),
)
# map emails to ids
email_id_map = {user.email.lower(): user.id for user in all_group_members}
# Build all user-group mappings and public-group mappings
user_group_mappings: list[dict] = []
public_group_mappings: list[dict] = []
# Process each external group
for external_group in external_groups:
external_group_id = build_ext_group_name_for_onyx(
ext_group_name=external_group.id,
source=source,
)
# Handle user-group mappings
for user_email in external_group.user_emails:
user_id = email_id_map.get(user_email.lower())
if user_id is None:
logger.warning(
f"User in group {external_group.id}"
f" with email {user_email} not found"
f"User in group {external_group.id} with email {user_email} not found"
)
continue
user_group_mappings.append(
{
"user_id": user_id,
"external_user_group_id": external_group_id,
"cc_pair_id": cc_pair_id,
"stale": False,
}
# Check if the user-group mapping already exists
existing_user_group = db_session.scalar(
select(User__ExternalUserGroupId).where(
User__ExternalUserGroupId.user_id == user_id,
User__ExternalUserGroupId.external_user_group_id
== external_group_id,
User__ExternalUserGroupId.cc_pair_id == cc_pair_id,
)
)
if existing_user_group:
# Update existing record
existing_user_group.stale = False
else:
# Insert new record
new_user_group = User__ExternalUserGroupId(
user_id=user_id,
external_user_group_id=external_group_id,
cc_pair_id=cc_pair_id,
stale=False,
)
db_session.add(new_user_group)
# Handle public group if needed
if external_group.gives_anyone_access:
public_group_mappings.append(
{
"external_user_group_id": external_group_id,
"cc_pair_id": cc_pair_id,
"stale": False,
}
# Check if the public group already exists
existing_public_group = db_session.scalar(
select(PublicExternalUserGroup).where(
PublicExternalUserGroup.external_user_group_id == external_group_id,
PublicExternalUserGroup.cc_pair_id == cc_pair_id,
)
)
# Deduplicate to avoid "ON CONFLICT DO UPDATE command cannot affect row
# a second time" when duplicate emails or overlapping groups produce
# identical (user_id, external_user_group_id, cc_pair_id) tuples.
user_group_mappings_deduped = list(
{
(m["user_id"], m["external_user_group_id"], m["cc_pair_id"]): m
for m in user_group_mappings
}.values()
)
# Batch upsert user-group mappings
for i in range(0, len(user_group_mappings_deduped), _UPSERT_BATCH_SIZE):
chunk = user_group_mappings_deduped[i : i + _UPSERT_BATCH_SIZE]
stmt = pg_insert(User__ExternalUserGroupId).values(chunk)
stmt = stmt.on_conflict_do_update(
index_elements=["user_id", "external_user_group_id", "cc_pair_id"],
set_={"stale": False},
)
db_session.execute(stmt)
# Deduplicate public group mappings as well
public_group_mappings_deduped = list(
{
(m["external_user_group_id"], m["cc_pair_id"]): m
for m in public_group_mappings
}.values()
)
# Batch upsert public group mappings
for i in range(0, len(public_group_mappings_deduped), _UPSERT_BATCH_SIZE):
chunk = public_group_mappings_deduped[i : i + _UPSERT_BATCH_SIZE]
stmt = pg_insert(PublicExternalUserGroup).values(chunk)
stmt = stmt.on_conflict_do_update(
index_elements=["external_user_group_id", "cc_pair_id"],
set_={"stale": False},
)
db_session.execute(stmt)
if existing_public_group:
# Update existing record
existing_public_group.stale = False
else:
# Insert new record
new_public_group = PublicExternalUserGroup(
external_user_group_id=external_group_id,
cc_pair_id=cc_pair_id,
stale=False,
)
db_session.add(new_public_group)
db_session.commit()

View File

@@ -27,7 +27,6 @@ from shared_configs.configs import MIN_THREADS_ML_MODELS
from shared_configs.configs import MODEL_SERVER_ALLOWED_HOST
from shared_configs.configs import MODEL_SERVER_PORT
from shared_configs.configs import SENTRY_DSN
from shared_configs.configs import SENTRY_TRACES_SAMPLE_RATE
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
@@ -102,7 +101,7 @@ def get_model_app() -> FastAPI:
sentry_sdk.init(
dsn=SENTRY_DSN,
integrations=[StarletteIntegration(), FastApiIntegration()],
traces_sample_rate=SENTRY_TRACES_SAMPLE_RATE,
traces_sample_rate=0.1,
release=__version__,
before_send=_add_instance_tags,
)

View File

@@ -55,7 +55,6 @@ from onyx.utils.logger import setup_logger
from shared_configs.configs import DEV_LOGGING_ENABLED
from shared_configs.configs import MULTI_TENANT
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
from shared_configs.configs import SENTRY_CELERY_TRACES_SAMPLE_RATE
from shared_configs.configs import SENTRY_DSN
from shared_configs.configs import TENANT_ID_PREFIX
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
@@ -70,7 +69,7 @@ if SENTRY_DSN:
sentry_sdk.init(
dsn=SENTRY_DSN,
integrations=[CeleryIntegration()],
traces_sample_rate=SENTRY_CELERY_TRACES_SAMPLE_RATE,
traces_sample_rate=0.1,
release=__version__,
before_send=_add_instance_tags,
)

View File

@@ -37,7 +37,6 @@ from onyx.redis.redis_connector import RedisConnector
from onyx.server.metrics.connector_health_metrics import on_index_attempt_status_change
from onyx.utils.logger import setup_logger
from onyx.utils.variable_functionality import global_version
from shared_configs.configs import SENTRY_CELERY_TRACES_SAMPLE_RATE
from shared_configs.configs import SENTRY_DSN
logger = setup_logger()
@@ -141,7 +140,7 @@ def _docfetching_task(
sentry_sdk.init(
dsn=SENTRY_DSN,
traces_sample_rate=SENTRY_CELERY_TRACES_SAMPLE_RATE,
traces_sample_rate=0.1,
release=__version__,
before_send=_add_instance_tags,
)

View File

@@ -58,8 +58,6 @@ from onyx.db.indexing_coordination import IndexingCoordination
from onyx.db.models import IndexAttempt
from onyx.file_store.document_batch_storage import DocumentBatchStorage
from onyx.file_store.document_batch_storage import get_document_batch_storage
from onyx.file_store.staging import build_raw_file_callback
from onyx.file_store.staging import RawFileCallback
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
from onyx.redis.redis_hierarchy import cache_hierarchy_nodes_batch
@@ -92,7 +90,6 @@ def _get_connector_runner(
end_time: datetime,
include_permissions: bool,
leave_connector_active: bool = LEAVE_CONNECTOR_ACTIVE_ON_INITIALIZATION_FAILURE,
raw_file_callback: RawFileCallback | None = None,
) -> ConnectorRunner:
"""
NOTE: `start_time` and `end_time` are only used for poll connectors
@@ -111,7 +108,6 @@ def _get_connector_runner(
input_type=task,
connector_specific_config=attempt.connector_credential_pair.connector.connector_specific_config,
credential=attempt.connector_credential_pair.credential,
raw_file_callback=raw_file_callback,
)
# validate the connector settings
@@ -279,12 +275,6 @@ def run_docfetching_entrypoint(
f"credentials='{credential_id}'"
)
raw_file_callback = build_raw_file_callback(
index_attempt_id=index_attempt_id,
cc_pair_id=connector_credential_pair_id,
tenant_id=tenant_id,
)
connector_document_extraction(
app,
index_attempt_id,
@@ -292,7 +282,6 @@ def run_docfetching_entrypoint(
attempt.search_settings_id,
tenant_id,
callback,
raw_file_callback=raw_file_callback,
)
logger.info(
@@ -312,7 +301,6 @@ def connector_document_extraction(
search_settings_id: int,
tenant_id: str,
callback: IndexingHeartbeatInterface | None = None,
raw_file_callback: RawFileCallback | None = None,
) -> None:
"""Extract documents from connector and queue them for indexing pipeline processing.
@@ -463,7 +451,6 @@ def connector_document_extraction(
start_time=window_start,
end_time=window_end,
include_permissions=should_fetch_permissions_during_indexing,
raw_file_callback=raw_file_callback,
)
# don't use a checkpoint if we're explicitly indexing from

View File

@@ -868,15 +868,6 @@ MAX_EMBEDDED_IMAGES_PER_UPLOAD = max(
0, int(os.environ.get("MAX_EMBEDDED_IMAGES_PER_UPLOAD") or 1000)
)
# Maximum non-empty cells to extract from a single xlsx worksheet. Protects
# from OOM on honestly-huge spreadsheets: memory cost in the extractor is
# roughly proportional to this count. Once exceeded, the scan stops and a
# truncation marker row is appended to the sheet's CSV.
# Peak Memory ~= 100 B * MAX_CELLS
MAX_XLSX_CELLS_PER_SHEET = max(
0, int(os.environ.get("MAX_XLSX_CELLS_PER_SHEET") or 10_000_000)
)
# Use document summary for contextual rag
USE_DOCUMENT_SUMMARY = os.environ.get("USE_DOCUMENT_SUMMARY", "true").lower() == "true"
# Use chunk summary for contextual rag

View File

@@ -372,7 +372,6 @@ class FileOrigin(str, Enum):
CONNECTOR_METADATA = "connector_metadata"
GENERATED_REPORT = "generated_report"
INDEXING_CHECKPOINT = "indexing_checkpoint"
INDEXING_STAGING = "indexing_staging"
PLAINTEXT_CACHE = "plaintext_cache"
OTHER = "other"
QUERY_HISTORY_CSV = "query_history_csv"

View File

@@ -22,7 +22,6 @@ from onyx.db.credentials import backend_update_credential_json
from onyx.db.credentials import fetch_credential_by_id
from onyx.db.enums import AccessType
from onyx.db.models import Credential
from onyx.file_store.staging import RawFileCallback
from shared_configs.contextvars import get_current_tenant_id
@@ -108,7 +107,6 @@ def instantiate_connector(
input_type: InputType,
connector_specific_config: dict[str, Any],
credential: Credential,
raw_file_callback: RawFileCallback | None = None,
) -> BaseConnector:
connector_class = identify_connector_class(source, input_type)
@@ -132,9 +130,6 @@ def instantiate_connector(
connector.set_allow_images(get_image_extraction_and_analysis_enabled())
if raw_file_callback is not None:
connector.set_raw_file_callback(raw_file_callback)
return connector

View File

@@ -40,22 +40,6 @@ class GongConnectorCheckpoint(ConnectorCheckpoint):
cursor: str | None = None
# Cached time range — computed once, reused across checkpoint calls
time_range: tuple[str, str] | None = None
# Transcripts whose call details were not yet available from /v2/calls/extensive
# (Gong has a known race where transcript call IDs take time to propagate).
# Keyed by call_id. Retried on subsequent checkpoint invocations.
#
# Invariant: all entries share one resolution session — they're stashed
# together from a single page and share the attempt counter and retry
# deadline. load_from_checkpoint only fetches a new page when this dict
# is empty, so entries from different pages can't mix.
pending_transcripts: dict[str, dict[str, Any]] = {}
# Number of resolution attempts made for pending_transcripts so far.
pending_call_details_attempts: int = 0
# Unix timestamp before which we should not retry pending_transcripts.
# Enforces exponential backoff independent of worker cadence — Gong's
# transcript-ID propagation race can take tens of seconds to minutes,
# longer than typical worker reinvocation intervals.
pending_retry_after: float | None = None
class _TranscriptPage(BaseModel):
@@ -78,15 +62,8 @@ class _CursorExpiredError(Exception):
class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
BASE_URL = "https://api.gong.io"
# Max number of attempts to resolve missing call details across checkpoint
# invocations before giving up and emitting ConnectorFailure.
MAX_CALL_DETAILS_ATTEMPTS = 6
# Base delay for exponential backoff between pending-transcript retry
# attempts. Delay before attempt N (N >= 2) is CALL_DETAILS_DELAY * 2^(N-2)
# seconds (30, 60, 120, 240, 480 = ~15.5min total) — matching the original
# blocking-retry schedule, but enforced via checkpoint deadline rather
# than in-call time.sleep.
CALL_DETAILS_DELAY = 30
CALL_DETAILS_DELAY = 30 # in seconds
# Gong API limit is 3 calls/sec — stay safely under it
MIN_REQUEST_INTERVAL = 0.5 # seconds between requests
@@ -210,6 +187,50 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
return call_to_metadata
def _fetch_call_details_with_retry(self, call_ids: list[str]) -> dict[str, Any]:
"""Fetch call details with retry for the Gong API race condition.
The Gong API has a known race where transcript call IDs don't immediately
appear in /v2/calls/extensive. Retries with exponential backoff, only
re-requesting the missing IDs on each attempt.
"""
call_details_map = self._get_call_details_by_ids(call_ids)
if set(call_ids) == set(call_details_map.keys()):
return call_details_map
for attempt in range(2, self.MAX_CALL_DETAILS_ATTEMPTS + 1):
missing_ids = list(set(call_ids) - set(call_details_map.keys()))
logger.warning(
f"_get_call_details_by_ids is missing call id's: current_attempt={attempt - 1} missing_call_ids={missing_ids}"
)
wait_seconds = self.CALL_DETAILS_DELAY * pow(2, attempt - 2)
logger.warning(
f"_get_call_details_by_ids waiting to retry: "
f"wait={wait_seconds}s "
f"current_attempt={attempt - 1} "
f"next_attempt={attempt} "
f"max_attempts={self.MAX_CALL_DETAILS_ATTEMPTS}"
)
time.sleep(wait_seconds)
# Only re-fetch the missing IDs, merge into existing results
new_details = self._get_call_details_by_ids(missing_ids)
call_details_map.update(new_details)
if set(call_ids) == set(call_details_map.keys()):
return call_details_map
missing_ids = list(set(call_ids) - set(call_details_map.keys()))
logger.error(
f"Giving up on missing call id's after "
f"{self.MAX_CALL_DETAILS_ATTEMPTS} attempts: "
f"missing_call_ids={missing_ids}"
f"proceeding with {len(call_details_map)} of "
f"{len(call_ids)} calls"
)
return call_details_map
@staticmethod
def _parse_parties(parties: list[dict]) -> dict[str, str]:
id_mapping = {}
@@ -292,119 +313,87 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
return start_time, end_time
def _build_document(
    self,
    transcript: dict[str, Any],
    call_details: dict[str, Any],
) -> Document:
    """Build a single Document from a transcript and its resolved call details.

    Raises KeyError if `transcript` lacks "callId"/"transcript" or
    `call_details["metaData"]` lacks "started"/"title"/"purpose"/"url"
    — callers are expected to pass details resolved for this transcript.
    """
    call_id = transcript["callId"]
    call_metadata = call_details["metaData"]
    # "started" is parsed below with datetime.fromisoformat, so it is
    # expected to be an ISO-8601 timestamp string.
    call_time_str = call_metadata["started"]
    call_title = call_metadata["title"]
    logger.info(
        f"Indexing Gong call id {call_id} from {call_time_str.split('T', 1)[0]}: {call_title}"
    )
    call_parties = cast(list[dict] | None, call_details.get("parties"))
    if call_parties is None:
        # Missing parties is tolerated: speakers fall back to "Unknown".
        logger.error(f"Couldn't get parties for Call ID: {call_id}")
        call_parties = []
    id_to_name_map = self._parse_parties(call_parties)
    speaker_to_name: dict[str, str] = {}
    transcript_text = ""
    call_purpose = call_metadata["purpose"]
    if call_purpose:
        transcript_text += f"Call Description: {call_purpose}\n\n"
    contents = transcript["transcript"]
    for segment in contents:
        speaker_id = segment.get("speakerId", "")
        if speaker_id not in speaker_to_name:
            if self.hide_user_info:
                # Anonymize speakers as "User 1", "User 2", ... in first-seen
                # order instead of exposing real names.
                speaker_to_name[speaker_id] = f"User {len(speaker_to_name) + 1}"
            else:
                speaker_to_name[speaker_id] = id_to_name_map.get(
                    speaker_id, "Unknown"
                )
        speaker_name = speaker_to_name[speaker_id]
        # NOTE(review): default of {} assumes "sentences" iterates like the
        # usual list of {"text": ...} dicts when absent — confirm with Gong API.
        sentences = segment.get("sentences", {})
        monolog = " ".join([sentence.get("text", "") for sentence in sentences])
        transcript_text += f"{speaker_name}: {monolog}\n\n"
    return Document(
        id=call_id,
        sections=[TextSection(link=call_metadata["url"], text=transcript_text)],
        source=DocumentSource.GONG,
        semantic_identifier=call_title or "Untitled",
        # Normalize the call start time to UTC for doc_updated_at.
        doc_updated_at=datetime.fromisoformat(call_time_str).astimezone(
            timezone.utc
        ),
        metadata={"client": call_metadata.get("system")},
    )
def _process_transcripts(
self,
transcripts: list[dict[str, Any]],
checkpoint: GongConnectorCheckpoint,
) -> Generator[Document | ConnectorFailure, None, None]:
"""Fetch call details for a page of transcripts and yield resulting
Documents. Transcripts whose call details are missing (Gong race
condition) are stashed into `checkpoint.pending_transcripts` for retry
on a future checkpoint invocation rather than blocking here.
"""
"""Process a batch of transcripts into Documents or ConnectorFailures."""
transcript_call_ids = cast(
list[str],
[t.get("callId") for t in transcripts if t.get("callId")],
)
call_details_map = (
self._get_call_details_by_ids(transcript_call_ids)
if transcript_call_ids
else {}
)
newly_stashed: list[str] = []
call_details_map = self._fetch_call_details_with_retry(transcript_call_ids)
for transcript in transcripts:
call_id = transcript.get("callId")
if not call_id:
logger.error(
"Couldn't get call information for transcript missing callId"
)
if not call_id or call_id not in call_details_map:
logger.error(f"Couldn't get call information for Call ID: {call_id}")
if call_id:
logger.error(
f"Call debug info: call_id={call_id} "
f"call_ids={transcript_call_ids} "
f"call_details_map={call_details_map.keys()}"
)
yield ConnectorFailure(
failed_document=DocumentFailure(document_id="unknown"),
failure_message="Transcript missing callId",
failed_document=DocumentFailure(
document_id=call_id or "unknown",
),
failure_message=f"Couldn't get call information for Call ID: {call_id}",
)
continue
if call_id in call_details_map:
yield self._build_document(transcript, call_details_map[call_id])
continue
call_details = call_details_map[call_id]
call_metadata = call_details["metaData"]
# Details not available yet — stash for retry on next invocation.
checkpoint.pending_transcripts[call_id] = transcript
newly_stashed.append(call_id)
if newly_stashed:
logger.warning(
f"Gong call details not yet available (race condition); "
f"deferring to next checkpoint invocation: "
f"call_ids={newly_stashed}"
call_time_str = call_metadata["started"]
call_title = call_metadata["title"]
logger.info(
f"Indexing Gong call id {call_id} from {call_time_str.split('T', 1)[0]}: {call_title}"
)
call_parties = cast(list[dict] | None, call_details.get("parties"))
if call_parties is None:
logger.error(f"Couldn't get parties for Call ID: {call_id}")
call_parties = []
id_to_name_map = self._parse_parties(call_parties)
speaker_to_name: dict[str, str] = {}
transcript_text = ""
call_purpose = call_metadata["purpose"]
if call_purpose:
transcript_text += f"Call Description: {call_purpose}\n\n"
contents = transcript["transcript"]
for segment in contents:
speaker_id = segment.get("speakerId", "")
if speaker_id not in speaker_to_name:
if self.hide_user_info:
speaker_to_name[speaker_id] = f"User {len(speaker_to_name) + 1}"
else:
speaker_to_name[speaker_id] = id_to_name_map.get(
speaker_id, "Unknown"
)
speaker_name = speaker_to_name[speaker_id]
sentences = segment.get("sentences", {})
monolog = " ".join([sentence.get("text", "") for sentence in sentences])
transcript_text += f"{speaker_name}: {monolog}\n\n"
yield Document(
id=call_id,
sections=[TextSection(link=call_metadata["url"], text=transcript_text)],
source=DocumentSource.GONG,
semantic_identifier=call_title or "Untitled",
doc_updated_at=datetime.fromisoformat(call_time_str).astimezone(
timezone.utc
),
metadata={"client": call_metadata.get("system")},
)
# First attempt on any newly-stashed transcripts counts as attempt #1.
# pending_call_details_attempts is guaranteed 0 here because
# load_from_checkpoint only reaches _process_transcripts when
# pending_transcripts was empty at entry (see early-return above).
checkpoint.pending_call_details_attempts = 1
checkpoint.pending_retry_after = time.time() + self._next_retry_delay(1)
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
combined = (
@@ -443,18 +432,6 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
checkpoint.has_more = True
return checkpoint
# Step 2: Resolve any transcripts stashed by a prior invocation whose
# call details were missing due to Gong's propagation race. Worker
# cadence between checkpoint calls provides the spacing between retry
# attempts — no in-call sleep needed.
if checkpoint.pending_transcripts:
yield from self._resolve_pending_transcripts(checkpoint)
# If pending still exists and we haven't exhausted attempts, defer
# the rest of this invocation — _resolve_pending_transcripts set
# has_more=True for us.
if checkpoint.pending_transcripts:
return checkpoint
workspace_ids = checkpoint.workspace_ids
# If we've exhausted all workspaces, we're done
@@ -473,7 +450,7 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
workspace_id = workspace_ids[checkpoint.workspace_index]
# Step 3: Fetch one page of transcripts
# Step 2: Fetch one page of transcripts
try:
page = self._fetch_transcript_page(
start_datetime=start_time,
@@ -496,102 +473,23 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
checkpoint.has_more = True
return checkpoint
# Step 4: Process transcripts into documents. Missing-details
# transcripts get stashed into checkpoint.pending_transcripts.
# Step 3: Process transcripts into documents
if page.transcripts:
yield from self._process_transcripts(page.transcripts, checkpoint)
yield from self._process_transcripts(page.transcripts)
# Step 5: Update cursor/workspace state
# Step 4: Update checkpoint state
if page.next_cursor:
# More pages in this workspace
checkpoint.cursor = page.next_cursor
checkpoint.has_more = True
else:
# This workspace is exhausted — advance to next
checkpoint.workspace_index += 1
checkpoint.cursor = None
checkpoint.has_more = checkpoint.workspace_index < len(workspace_ids)
# If pending transcripts were stashed this invocation, we still have
# work to do on a future invocation even if pagination is exhausted.
if checkpoint.pending_transcripts:
checkpoint.has_more = True
return checkpoint
def _next_retry_delay(self, attempts_done: int) -> float:
"""Seconds to wait before attempt #(attempts_done + 1).
Matches the original exponential backoff: 30, 60, 120, 240, 480.
"""
return self.CALL_DETAILS_DELAY * pow(2, attempts_done - 1)
def _resolve_pending_transcripts(
    self,
    checkpoint: GongConnectorCheckpoint,
) -> Generator[Document | ConnectorFailure, None, None]:
    """Attempt to resolve transcripts whose call details were unavailable
    in a prior invocation. Mutates checkpoint in place: resolved transcripts
    are removed from pending_transcripts; on attempt exhaustion, emits
    ConnectorFailure for each unresolved call_id and clears pending state.
    If the backoff deadline hasn't elapsed yet, returns without issuing
    any API call so the next invocation can try again later.
    """
    if (
        checkpoint.pending_retry_after is not None
        and time.time() < checkpoint.pending_retry_after
    ):
        # Backoff still in effect — defer to a later invocation without
        # burning an attempt or an API call.
        checkpoint.has_more = True
        return
    pending_call_ids = list(checkpoint.pending_transcripts.keys())
    resolved = self._get_call_details_by_ids(pending_call_ids)
    for call_id, details in resolved.items():
        # pop() removes the transcript from pending state as it resolves;
        # None guards against an ID we weren't actually waiting on.
        transcript = checkpoint.pending_transcripts.pop(call_id, None)
        if transcript is None:
            continue
        yield self._build_document(transcript, details)
    if not checkpoint.pending_transcripts:
        # Everything resolved — clear the retry bookkeeping entirely.
        checkpoint.pending_call_details_attempts = 0
        checkpoint.pending_retry_after = None
        return
    checkpoint.pending_call_details_attempts += 1
    logger.warning(
        f"Gong call details still missing after "
        f"{checkpoint.pending_call_details_attempts}/"
        f"{self.MAX_CALL_DETAILS_ATTEMPTS} attempts: "
        f"missing_call_ids={list(checkpoint.pending_transcripts.keys())}"
    )
    if checkpoint.pending_call_details_attempts >= self.MAX_CALL_DETAILS_ATTEMPTS:
        # Out of attempts: surface one ConnectorFailure per unresolved call
        # and reset pending state so the run can move on.
        logger.error(
            f"Giving up on missing Gong call details after "
            f"{self.MAX_CALL_DETAILS_ATTEMPTS} attempts: "
            f"missing_call_ids={list(checkpoint.pending_transcripts.keys())}"
        )
        for call_id in list(checkpoint.pending_transcripts.keys()):
            yield ConnectorFailure(
                failed_document=DocumentFailure(document_id=call_id),
                failure_message=(
                    f"Couldn't get call details after {self.MAX_CALL_DETAILS_ATTEMPTS} attempts for Call ID: {call_id}"
                ),
            )
        checkpoint.pending_transcripts = {}
        checkpoint.pending_call_details_attempts = 0
        checkpoint.pending_retry_after = None
        # has_more is recomputed by the workspace iteration that follows;
        # reset to False here so a stale True from a prior invocation
        # can't leak out via any future early-return path.
        checkpoint.has_more = False
    else:
        # Still missing and attempts remain: schedule the next retry with
        # exponential backoff and signal more work.
        checkpoint.pending_retry_after = time.time() + self._next_retry_delay(
            checkpoint.pending_call_details_attempts
        )
        checkpoint.has_more = True
if __name__ == "__main__":
import os

View File

@@ -578,18 +578,8 @@ class GoogleDriveConnector(
current_id, file.user_email, field_type, failed_folder_ids_by_email
)
if not folder:
# Can't access this folder - stop climbing.
# If the terminal node is a confirmed orphan, backfill all
# intermediate folders into failed_folder_ids_by_email so
# future files short-circuit via _get_folder_metadata's
# cache check instead of re-climbing the whole chain.
if failed_folder_ids_by_email is not None:
for email in {file.user_email, self.primary_admin_email}:
email_failed_ids = failed_folder_ids_by_email.get(email)
if email_failed_ids and current_id in email_failed_ids:
failed_folder_ids_by_email.setdefault(
email, ThreadSafeSet()
).update(set(node_ids_in_walk))
# Can't access this folder - stop climbing
# Don't mark as fully walked since we didn't reach root
break
folder_parent_id = _get_parent_id_from_file(folder)

View File

@@ -167,12 +167,9 @@ class GoogleDriveCheckpoint(ConnectorCheckpoint):
default_factory=ThreadSafeSet
)
# Maps email → set of folder IDs that email should skip when walking the
# parent chain. Covers two cases:
# 1. Folders where that email confirmed no accessible parent (true orphans).
# 2. Intermediate folders on a path that dead-ended at a confirmed orphan —
# backfilled so future walks short-circuit earlier in the chain.
# In both cases _get_folder_metadata skips the API call and returns None.
# Maps email → set of IDs of folders where that email confirmed no accessible parent.
# Avoids redundant API calls when the same (folder, email) pair is
# encountered again within the same retrieval run.
failed_folder_ids_by_email: ThreadSafeDict[str, ThreadSafeSet[str]] = Field(
default_factory=ThreadSafeDict
)

View File

@@ -15,7 +15,6 @@ from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import HierarchyNode
from onyx.connectors.models import SlimDocument
from onyx.file_store.staging import RawFileCallback
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.variable_functionality import fetch_ee_implementation_or_noop
@@ -43,9 +42,6 @@ class NormalizationResult(BaseModel):
class BaseConnector(abc.ABC, Generic[CT]):
REDIS_KEY_PREFIX = "da_connector_data:"
# Optional raw-file persistence hook to save original file
raw_file_callback: RawFileCallback | None = None
@abc.abstractmethod
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
raise NotImplementedError
@@ -92,15 +88,6 @@ class BaseConnector(abc.ABC, Generic[CT]):
"""Implement if the underlying connector wants to skip/allow image downloading
based on the application level image analysis setting."""
def set_raw_file_callback(self, callback: RawFileCallback) -> None:
"""Inject the per-attempt raw-file persistence callback.
Wired up by the docfetching entrypoint via `instantiate_connector`.
Connectors that don't care about persisting raw bytes can ignore this
— `raw_file_callback` simply stays `None`.
"""
self.raw_file_callback = callback
@classmethod
def normalize_url(cls, url: str) -> "NormalizationResult": # noqa: ARG003
"""Normalize a URL to match the canonical Document.id format used during ingestion.

View File

@@ -19,7 +19,6 @@ from playwright.sync_api import Playwright
from playwright.sync_api import sync_playwright
from playwright.sync_api import TimeoutError
from requests_oauthlib import OAuth2Session
from typing_extensions import override
from urllib3.exceptions import MaxRetryError
from onyx.configs.app_configs import INDEX_BATCH_SIZE
@@ -33,16 +32,11 @@ from onyx.connectors.exceptions import CredentialExpiredError
from onyx.connectors.exceptions import InsufficientPermissionsError
from onyx.connectors.exceptions import UnexpectedValidationError
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import Document
from onyx.connectors.models import HierarchyNode
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import web_html_cleanup
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
from onyx.utils.sitemap import list_pages_for_site
from onyx.utils.web_content import extract_pdf_text
@@ -61,6 +55,8 @@ class ScrapeSessionContext:
self.visited_links: set[str] = set()
self.content_hashes: set[int] = set()
self.doc_batch: list[Document | HierarchyNode] = []
self.at_least_one_doc: bool = False
self.last_error: str | None = None
self.needs_retry: bool = False
@@ -442,7 +438,7 @@ def _handle_cookies(context: BrowserContext, url: str) -> None:
)
class WebConnector(LoadConnector, SlimConnector):
class WebConnector(LoadConnector):
MAX_RETRIES = 3
def __init__(
@@ -497,14 +493,8 @@ class WebConnector(LoadConnector, SlimConnector):
index: int,
initial_url: str,
session_ctx: ScrapeSessionContext,
slim: bool = False,
) -> ScrapeResult:
"""Returns a ScrapeResult object with a doc and retry flag.
When slim=True, skips scroll, PDF content download, and content extraction.
The bot-detection render wait (5s) fires on CF/403 responses regardless of slim.
networkidle is always awaited so JS-rendered links are discovered correctly.
"""
"""Returns a ScrapeResult object with a doc and retry flag."""
if session_ctx.playwright is None:
raise RuntimeError("scrape_context.playwright is None")
@@ -525,16 +515,7 @@ class WebConnector(LoadConnector, SlimConnector):
is_pdf = is_pdf_resource(initial_url, content_type)
if is_pdf:
if slim:
result.doc = Document(
id=initial_url,
sections=[],
source=DocumentSource.WEB,
semantic_identifier=initial_url,
metadata={},
)
return result
# PDF files are not checked for links
response = requests.get(initial_url, headers=DEFAULT_HEADERS)
page_text, metadata = extract_pdf_text(response.content)
last_modified = response.headers.get("Last-Modified")
@@ -565,20 +546,14 @@ class WebConnector(LoadConnector, SlimConnector):
timeout=30000, # 30 seconds
wait_until="commit", # Wait for navigation to commit
)
# Give the page a moment to start rendering after navigation commits.
# Allows CloudFlare and other bot-detection challenges to complete.
page.wait_for_timeout(PAGE_RENDER_TIMEOUT_MS)
# Bot-detection JS challenges (CloudFlare, Imperva, etc.) need a moment
# to start network activity after commit before networkidle is meaningful.
# We detect this via the cf-ray header (CloudFlare) or a 403 response,
# which is the common entry point for JS-challenge-based bot detection.
is_bot_challenge = page_response is not None and (
page_response.header_value("cf-ray") is not None
or page_response.status == 403
)
if is_bot_challenge:
page.wait_for_timeout(PAGE_RENDER_TIMEOUT_MS)
# Wait for network activity to settle (handles SPAs, CF challenges, etc.)
# Wait for network activity to settle so SPAs that fetch content
# asynchronously after the initial JS bundle have time to render.
try:
# A bit of extra time to account for long-polling, websockets, etc.
page.wait_for_load_state("networkidle", timeout=PAGE_RENDER_TIMEOUT_MS)
except TimeoutError:
pass
@@ -601,7 +576,7 @@ class WebConnector(LoadConnector, SlimConnector):
session_ctx.visited_links.add(initial_url)
# If we got here, the request was successful
if not slim and self.scroll_before_scraping:
if self.scroll_before_scraping:
scroll_attempts = 0
previous_height = page.evaluate("document.body.scrollHeight")
while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
@@ -640,16 +615,6 @@ class WebConnector(LoadConnector, SlimConnector):
result.retry = True
return result
if slim:
result.doc = Document(
id=initial_url,
sections=[],
source=DocumentSource.WEB,
semantic_identifier=initial_url,
metadata={},
)
return result
# after this point, we don't need the caller to retry
parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)
@@ -701,13 +666,9 @@ class WebConnector(LoadConnector, SlimConnector):
return result
def load_from_state(self, slim: bool = False) -> GenerateDocumentsOutput:
"""Traverses through all pages found on the website and converts them into
documents.
When slim=True, yields SlimDocument objects (URL id only, no content).
Playwright is used in all modes — slim skips content extraction only.
"""
def load_from_state(self) -> GenerateDocumentsOutput:
"""Traverses through all pages found on the website
and converts them into documents"""
if not self.to_visit_list:
raise ValueError("No URLs to visit")
@@ -718,8 +679,6 @@ class WebConnector(LoadConnector, SlimConnector):
session_ctx = ScrapeSessionContext(base_url, self.to_visit_list)
session_ctx.initialize()
batch: list[Document | SlimDocument | HierarchyNode] = []
while session_ctx.to_visit:
initial_url = session_ctx.to_visit.pop()
if initial_url in session_ctx.visited_links:
@@ -734,9 +693,7 @@ class WebConnector(LoadConnector, SlimConnector):
continue
index = len(session_ctx.visited_links)
logger.info(
f"{index}: {'Slim-visiting' if slim else 'Visiting'} {initial_url}"
)
logger.info(f"{index}: Visiting {initial_url}")
# Add retry mechanism with exponential backoff
retry_count = 0
@@ -751,14 +708,12 @@ class WebConnector(LoadConnector, SlimConnector):
time.sleep(delay)
try:
result = self._do_scrape(index, initial_url, session_ctx, slim=slim)
result = self._do_scrape(index, initial_url, session_ctx)
if result.retry:
continue
if result.doc:
batch.append(
SlimDocument(id=result.doc.id) if slim else result.doc
)
session_ctx.doc_batch.append(result.doc)
except Exception as e:
session_ctx.last_error = f"Failed to fetch '{initial_url}': {e}"
logger.exception(session_ctx.last_error)
@@ -769,16 +724,16 @@ class WebConnector(LoadConnector, SlimConnector):
break # success / don't retry
if len(batch) >= self.batch_size:
if len(session_ctx.doc_batch) >= self.batch_size:
session_ctx.initialize()
session_ctx.at_least_one_doc = True
yield batch # ty: ignore[invalid-yield]
batch = []
yield session_ctx.doc_batch
session_ctx.doc_batch = []
if batch:
if session_ctx.doc_batch:
session_ctx.stop()
session_ctx.at_least_one_doc = True
yield batch # ty: ignore[invalid-yield]
yield session_ctx.doc_batch
if not session_ctx.at_least_one_doc:
if session_ctx.last_error:
@@ -787,22 +742,6 @@ class WebConnector(LoadConnector, SlimConnector):
session_ctx.stop()
@override
def retrieve_all_slim_docs(
self,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
callback: IndexingHeartbeatInterface | None = None,
) -> GenerateSlimDocumentOutput:
"""Yields SlimDocuments for all pages reachable from the configured URLs.
Uses the same Playwright crawl as full indexing but skips content extraction,
scroll, and PDF downloads. The 5s render wait fires only on bot-detection
responses (CloudFlare cf-ray header or HTTP 403).
The start/end parameters are ignored — WEB connector has no incremental path.
"""
yield from self.load_from_state(slim=True) # ty: ignore[invalid-yield]
def validate_connector_settings(self) -> None:
# Make sure we have at least one valid URL to check
if not self.to_visit_list:

View File

@@ -952,7 +952,6 @@ class Document(Base):
semantic_id: Mapped[str] = mapped_column(NullFilteredString)
# First Section's link
link: Mapped[str | None] = mapped_column(NullFilteredString, nullable=True)
file_id: Mapped[str | None] = mapped_column(String, nullable=True)
# The updated time is also used as a measure of the last successful state of the doc
# pulled from the source (to help skip reindexing already updated docs in case of
@@ -1055,9 +1054,9 @@ class Document(Base):
__table_args__ = (
Index(
"ix_document_needs_sync",
"id",
postgresql_where=text("last_modified > last_synced OR last_synced IS NULL"),
"ix_document_sync_status",
last_modified,
last_synced,
),
)

View File

@@ -129,13 +129,9 @@ def update_sync_record_status(
"""
sync_record = fetch_latest_sync_record(db_session, entity_id, sync_type)
if sync_record is None:
logger.warning(
f"No sync record found for entity_id={entity_id} "
f"sync_type={sync_type} — skipping status update. "
f"This typically means the record was never created "
f"(insert_sync_record failed silently) or was cleaned up."
raise ValueError(
f"No sync record found for entity_id={entity_id} sync_type={sync_type}"
)
return
sync_record.sync_status = sync_status
if num_docs_synced is not None:

View File

@@ -2,17 +2,11 @@ import datetime
from uuid import UUID
from sqlalchemy import func
from sqlalchemy import or_
from sqlalchemy import select
from sqlalchemy.orm import joinedload
from sqlalchemy.orm import selectinload
from sqlalchemy.orm import Session
from onyx.configs.constants import FileOrigin
from onyx.db.models import ChatMessage
from onyx.db.models import ChatSession
from onyx.db.models import ChatSessionSharedStatus
from onyx.db.models import FileRecord
from onyx.db.models import Persona
from onyx.db.models import Project__UserFile
from onyx.db.models import UserFile
@@ -114,73 +108,13 @@ def update_last_accessed_at_for_user_files(
db_session.commit()
def get_file_id_by_user_file_id(
user_file_id: str, user_id: UUID, db_session: Session
) -> str | None:
user_file = (
db_session.query(UserFile)
.filter(UserFile.id == user_file_id, UserFile.user_id == user_id)
.first()
)
def get_file_id_by_user_file_id(user_file_id: str, db_session: Session) -> str | None:
user_file = db_session.query(UserFile).filter(UserFile.id == user_file_id).first()
if user_file:
return user_file.file_id
return None
def user_can_access_chat_file(file_id: str, user_id: UUID, db_session: Session) -> bool:
"""Return True if `user_id` is allowed to read the raw `file_id` served by
`GET /chat/file/{file_id}`. Access is granted when any of:
- The `file_id` is the storage id of a `UserFile` owned by the user.
- The `file_id` is a persona avatar (`Persona.uploaded_image_id`); avatars
are visible to any authenticated user.
- The `file_id` appears in a `ChatMessage.files` descriptor of a chat
session the user owns or a session publicly shared via
`ChatSessionSharedStatus.PUBLIC`.
"""
owns_user_file = db_session.query(
select(UserFile.id)
.where(UserFile.file_id == file_id, UserFile.user_id == user_id)
.exists()
).scalar()
if owns_user_file:
return True
# TODO: move persona avatars to a dedicated endpoint (e.g.
# /assistants/{id}/avatar) so this branch can be removed. /chat/file is
# currently overloaded with multiple asset classes (user files, chat
# attachments, tool outputs, avatars), forcing this access-check fan-out.
#
# Restrict the avatar path to CHAT_UPLOAD-origin files so an attacker
# cannot bind another user's USER_FILE (or any other origin) to their
# own persona and read it through this check.
is_persona_avatar = db_session.query(
select(Persona.id)
.join(FileRecord, FileRecord.file_id == Persona.uploaded_image_id)
.where(
Persona.uploaded_image_id == file_id,
FileRecord.file_origin == FileOrigin.CHAT_UPLOAD,
)
.exists()
).scalar()
if is_persona_avatar:
return True
chat_file_stmt = (
select(ChatMessage.id)
.join(ChatSession, ChatMessage.chat_session_id == ChatSession.id)
.where(ChatMessage.files.op("@>")([{"id": file_id}]))
.where(
or_(
ChatSession.user_id == user_id,
ChatSession.shared_status == ChatSessionSharedStatus.PUBLIC,
)
)
.limit(1)
)
return db_session.execute(chat_file_stmt).first() is not None
def get_file_ids_by_user_file_ids(
user_file_ids: list[UUID], db_session: Session
) -> list[str]:

View File

@@ -12,7 +12,6 @@ from email.parser import Parser as EmailParser
from io import BytesIO
from pathlib import Path
from typing import Any
from typing import cast
from typing import IO
from typing import NamedTuple
from typing import Optional
@@ -21,11 +20,10 @@ from zipfile import BadZipFile
import chardet
import openpyxl
from openpyxl.worksheet._read_only import ReadOnlyWorksheet
from openpyxl.worksheet.worksheet import Worksheet
from PIL import Image
from onyx.configs.app_configs import MAX_EMBEDDED_IMAGES_PER_FILE
from onyx.configs.app_configs import MAX_XLSX_CELLS_PER_SHEET
from onyx.configs.constants import ONYX_METADATA_FILENAME
from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
from onyx.file_processing.file_types import OnyxFileExtensions
@@ -51,7 +49,6 @@ KNOWN_OPENPYXL_BUGS = [
"Unable to read workbook: could not read stylesheet from None",
"Colors must be aRGB hex values",
"Max value is",
"There is no item named",
]
@@ -450,83 +447,104 @@ def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
return presentation.markdown
def _columns_to_keep(col_has_data: bytearray, max_empty: int) -> list[int]:
"""Keep non-empty columns, plus runs of up to ``max_empty`` empty columns
between them. Trailing empty columns are dropped."""
def _worksheet_to_matrix(
worksheet: Worksheet,
) -> list[list[str]]:
"""
Converts a singular worksheet to a matrix of values.
Rows are padded to a uniform width. In openpyxl's read_only mode,
iter_rows can yield rows of differing lengths (trailing empty cells
are sometimes omitted), and downstream column cleanup assumes a
rectangular matrix.
"""
rows: list[list[str]] = []
max_len = 0
for worksheet_row in worksheet.iter_rows(min_row=1, values_only=True):
row = ["" if cell is None else str(cell) for cell in worksheet_row]
if len(row) > max_len:
max_len = len(row)
rows.append(row)
for row in rows:
if len(row) < max_len:
row.extend([""] * (max_len - len(row)))
return rows
def _clean_worksheet_matrix(matrix: list[list[str]]) -> list[list[str]]:
"""
Cleans a worksheet matrix by removing rows if there are N consecutive empty
rows and removing cols if there are M consecutive empty columns
"""
MAX_EMPTY_ROWS = 2 # Runs longer than this are capped to max_empty; shorter runs are preserved as-is
MAX_EMPTY_COLS = 2
# Row cleanup
matrix = _remove_empty_runs(matrix, max_empty=MAX_EMPTY_ROWS)
if not matrix:
return matrix
# Column cleanup — determine which columns to keep without transposing.
num_cols = len(matrix[0])
keep_cols = _columns_to_keep(matrix, num_cols, max_empty=MAX_EMPTY_COLS)
if len(keep_cols) < num_cols:
matrix = [[row[c] for c in keep_cols] for row in matrix]
return matrix
def _columns_to_keep(
matrix: list[list[str]], num_cols: int, max_empty: int
) -> list[int]:
"""Return the indices of columns to keep after removing empty-column runs.
Uses the same logic as ``_remove_empty_runs`` but operates on column
indices so no transpose is needed.
"""
kept: list[int] = []
empty_buffer: list[int] = []
for c, has in enumerate(col_has_data):
if has:
kept.extend(empty_buffer[:max_empty])
kept.append(c)
empty_buffer = []
for col_idx in range(num_cols):
col_is_empty = all(not row[col_idx] for row in matrix)
if col_is_empty:
empty_buffer.append(col_idx)
else:
empty_buffer.append(c)
kept.extend(empty_buffer[:max_empty])
kept.append(col_idx)
empty_buffer = []
return kept
def _sheet_to_csv(rows: Iterator[tuple[Any, ...]]) -> str:
"""Stream worksheet rows into CSV text without materializing a dense matrix.
def _remove_empty_runs(
rows: list[list[str]],
max_empty: int,
) -> list[list[str]]:
"""Removes entire runs of empty rows when the run length exceeds max_empty.
Empty rows are never stored. Column occupancy is tracked as a ``bytearray``
bitmap so column trimming needs no transpose or copy. Runs of empty
rows/columns longer than 2 are collapsed; shorter runs are preserved.
Scanning stops once ``MAX_XLSX_CELLS_PER_SHEET`` non-empty cells have been
seen; the output gets a truncation marker row appended so downstream
indexing sees that the sheet was cut off.
Leading empty runs are capped to max_empty, just like interior runs.
Trailing empty rows are always dropped since there is no subsequent
non-empty row to flush them.
"""
MAX_EMPTY_ROWS_IN_OUTPUT = 2
MAX_EMPTY_COLS_IN_OUTPUT = 2
TRUNCATION_MARKER = "[truncated: sheet exceeded cell limit]"
result: list[list[str]] = []
empty_buffer: list[list[str]] = []
non_empty_rows: list[tuple[int, list[str]]] = []
col_has_data = bytearray()
total_non_empty = 0
truncated = False
for row in rows:
# Check if empty
if not any(row):
if len(empty_buffer) < max_empty:
empty_buffer.append(row)
else:
# Add upto max empty rows onto the result - that's what we allow
result.extend(empty_buffer[:max_empty])
# Add the new non-empty row
result.append(row)
empty_buffer = []
for row_idx, row_vals in enumerate(rows):
# Fast-reject empty rows before allocating a list of "".
if not any(v is not None and v != "" for v in row_vals):
continue
cells = ["" if v is None else str(v) for v in row_vals]
non_empty_rows.append((row_idx, cells))
if len(cells) > len(col_has_data):
col_has_data.extend(b"\x00" * (len(cells) - len(col_has_data)))
for i, v in enumerate(cells):
if v:
col_has_data[i] = 1
total_non_empty += 1
if total_non_empty > MAX_XLSX_CELLS_PER_SHEET:
truncated = True
break
if not non_empty_rows:
return ""
keep_cols = _columns_to_keep(col_has_data, MAX_EMPTY_COLS_IN_OUTPUT)
if not keep_cols:
return ""
buf = io.StringIO()
writer = csv.writer(buf, lineterminator="\n")
blank_row = [""] * len(keep_cols)
last_idx = -1
for row_idx, cells in non_empty_rows:
gap = row_idx - last_idx - 1
if gap > 0:
for _ in range(min(gap, MAX_EMPTY_ROWS_IN_OUTPUT)):
writer.writerow(blank_row)
writer.writerow([cells[c] if c < len(cells) else "" for c in keep_cols])
last_idx = row_idx
if truncated:
writer.writerow([TRUNCATION_MARKER])
return buf.getvalue().rstrip("\n")
return result
def xlsx_sheet_extraction(file: IO[Any], file_name: str = "") -> list[tuple[str, str]]:
@@ -554,24 +572,20 @@ def xlsx_sheet_extraction(file: IO[Any], file_name: str = "") -> list[tuple[str,
raise
sheets: list[tuple[str, str]] = []
try:
for sheet in workbook.worksheets:
# Declared dimensions can be different to what is actually there
ro_sheet = cast(ReadOnlyWorksheet, sheet)
ro_sheet.reset_dimensions()
csv_text = _sheet_to_csv(ro_sheet.iter_rows(values_only=True))
sheets.append((csv_text.strip(), ro_sheet.title))
finally:
workbook.close()
for sheet in workbook.worksheets:
sheet_matrix = _clean_worksheet_matrix(_worksheet_to_matrix(sheet))
buf = io.StringIO()
writer = csv.writer(buf, lineterminator="\n")
writer.writerows(sheet_matrix)
csv_text = buf.getvalue().rstrip("\n")
if csv_text.strip():
sheets.append((csv_text, sheet.title))
return sheets
def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
sheets = xlsx_sheet_extraction(file, file_name)
return TEXT_SECTION_SEPARATOR.join(
csv_text for csv_text, _title in sheets if csv_text
)
return TEXT_SECTION_SEPARATOR.join(csv_text for csv_text, _title in sheets)
def eml_to_text(file: IO[Any]) -> str:

View File

@@ -1,63 +0,0 @@
from collections.abc import Callable
from typing import Any
from typing import IO
from onyx.configs.constants import FileOrigin
from onyx.file_store.file_store import get_default_file_store
from onyx.utils.logger import setup_logger
logger = setup_logger()
# (content, content_type) -> file_id
RawFileCallback = Callable[[IO[bytes], str], str]
def stage_raw_file(
content: IO,
content_type: str,
*,
metadata: dict[str, Any],
) -> str:
"""Persist raw bytes to the file store with FileOrigin.INDEXING_STAGING.
`metadata` is attached to the file_record so that downstream promotion
(in docprocessing) and orphan reaping (TTL janitor) can locate the file
by its originating context.
"""
file_store = get_default_file_store()
file_id = file_store.save_file(
content=content,
display_name=None,
file_origin=FileOrigin.INDEXING_STAGING,
file_type=content_type,
file_metadata=metadata,
)
return file_id
def build_raw_file_callback(
*,
index_attempt_id: int,
cc_pair_id: int,
tenant_id: str,
) -> RawFileCallback:
"""Build a per-attempt callback that connectors can invoke to opt in to
raw-file persistence. The closure binds the attempt-level context as the
staging metadata so the connector only needs to pass per-call info
(bytes, content_type) and gets back a file_id to attach to its Document.
"""
metadata: dict[str, Any] = {
"index_attempt_id": index_attempt_id,
"cc_pair_id": cc_pair_id,
"tenant_id": tenant_id,
}
def _callback(content: IO[bytes], content_type: str) -> str:
return stage_raw_file(
content=content,
content_type=content_type,
metadata=metadata,
)
return _callback

View File

@@ -166,7 +166,6 @@ from shared_configs.configs import CORS_ALLOWED_ORIGIN
from shared_configs.configs import MULTI_TENANT
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
from shared_configs.configs import SENTRY_DSN
from shared_configs.configs import SENTRY_TRACES_SAMPLE_RATE
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
warnings.filterwarnings(
@@ -440,7 +439,7 @@ def get_application(lifespan_override: Lifespan | None = None) -> FastAPI:
sentry_sdk.init(
dsn=SENTRY_DSN,
integrations=[StarletteIntegration(), FastApiIntegration()],
traces_sample_rate=SENTRY_TRACES_SAMPLE_RATE,
traces_sample_rate=0.1,
release=__version__,
before_send=_add_instance_tags,
)

View File

@@ -63,7 +63,6 @@ from onyx.db.persona import get_persona_by_id
from onyx.db.usage import increment_usage
from onyx.db.usage import UsageType
from onyx.db.user_file import get_file_id_by_user_file_id
from onyx.db.user_file import user_can_access_chat_file
from onyx.error_handling.error_codes import OnyxErrorCode
from onyx.error_handling.exceptions import OnyxError
from onyx.file_store.file_store import get_default_file_store
@@ -867,18 +866,14 @@ def seed_chat_from_slack(
def fetch_chat_file(
file_id: str,
request: Request,
user: User = Depends(require_permission(Permission.BASIC_ACCESS)),
_: User = Depends(require_permission(Permission.BASIC_ACCESS)),
db_session: Session = Depends(get_session),
) -> Response:
# For user files, we need to get the file id from the user file id
file_id_from_user_file = get_file_id_by_user_file_id(file_id, user.id, db_session)
file_id_from_user_file = get_file_id_by_user_file_id(file_id, db_session)
if file_id_from_user_file:
file_id = file_id_from_user_file
elif not user_can_access_chat_file(file_id, user.id, db_session):
# Return 404 (rather than 403) so callers cannot probe for file
# existence across ownership boundaries.
raise OnyxError(OnyxErrorCode.NOT_FOUND, "File not found")
file_store = get_default_file_store()
file_record = file_store.read_file_record(file_id)

View File

@@ -99,14 +99,6 @@ STRICT_CHUNK_TOKEN_LIMIT = (
# Set up Sentry integration (for error logging)
SENTRY_DSN = os.environ.get("SENTRY_DSN")
# Celery task spans dominate ingestion volume (~94%), so default celery
# tracing to 0. Web/API traces stay at a small non-zero rate so http.server
# traces remain available. Both are env-tunable without a code change.
SENTRY_TRACES_SAMPLE_RATE = float(os.environ.get("SENTRY_TRACES_SAMPLE_RATE", "0.01"))
SENTRY_CELERY_TRACES_SAMPLE_RATE = float(
os.environ.get("SENTRY_CELERY_TRACES_SAMPLE_RATE", "0.0")
)
# Fields which should only be set on new search setting
PRESERVED_SEARCH_FIELDS = [

View File

@@ -1,130 +0,0 @@
"""External dependency tests for onyx.file_store.staging.
Exercises the raw-file persistence hook used by the docfetching pipeline
against a real file store (Postgres + MinIO/S3), since mocking the store
would defeat the point of verifying that metadata round-trips through
FileRecord.
"""
from collections.abc import Generator
from io import BytesIO
from typing import Any
from uuid import uuid4
import pytest
from sqlalchemy.orm import Session
from onyx.configs.constants import FileOrigin
from onyx.connectors.interfaces import BaseConnector
from onyx.db.file_record import delete_filerecord_by_file_id
from onyx.db.file_record import get_filerecord_by_file_id
from onyx.file_store.file_store import get_default_file_store
from onyx.file_store.staging import build_raw_file_callback
from onyx.file_store.staging import stage_raw_file
@pytest.fixture(scope="function")
def cleanup_file_ids(
db_session: Session,
) -> Generator[list[str], None, None]:
created: list[str] = []
yield created
file_store = get_default_file_store()
for fid in created:
try:
file_store.delete_file(fid)
except Exception:
delete_filerecord_by_file_id(file_id=fid, db_session=db_session)
db_session.commit()
def test_stage_raw_file_persists_with_origin_and_metadata(
db_session: Session,
tenant_context: None, # noqa: ARG001
initialize_file_store: None, # noqa: ARG001
cleanup_file_ids: list[str],
) -> None:
"""stage_raw_file writes a FileRecord with INDEXING_STAGING origin and
round-trips the provided metadata verbatim."""
metadata: dict[str, Any] = {
"index_attempt_id": 42,
"cc_pair_id": 7,
"tenant_id": "tenant-abc",
"extra": "payload",
}
content_bytes = b"hello raw file"
content_type = "application/pdf"
file_id = stage_raw_file(
content=BytesIO(content_bytes),
content_type=content_type,
metadata=metadata,
)
cleanup_file_ids.append(file_id)
db_session.commit()
record = get_filerecord_by_file_id(file_id=file_id, db_session=db_session)
assert record.file_origin == FileOrigin.INDEXING_STAGING
assert record.file_type == content_type
assert record.file_metadata == metadata
def test_build_raw_file_callback_binds_attempt_context_per_call(
db_session: Session,
tenant_context: None, # noqa: ARG001
initialize_file_store: None, # noqa: ARG001
cleanup_file_ids: list[str],
) -> None:
"""The callback returned by build_raw_file_callback must bind the
attempt-level context into every FileRecord it produces, without
leaking state across invocations."""
callback = build_raw_file_callback(
index_attempt_id=1001,
cc_pair_id=202,
tenant_id="tenant-xyz",
)
file_id_a = callback(BytesIO(b"alpha"), "text/plain")
file_id_b = callback(BytesIO(b"beta"), "application/octet-stream")
cleanup_file_ids.extend([file_id_a, file_id_b])
db_session.commit()
assert file_id_a != file_id_b
for fid, expected_content_type in (
(file_id_a, "text/plain"),
(file_id_b, "application/octet-stream"),
):
record = get_filerecord_by_file_id(file_id=fid, db_session=db_session)
assert record.file_origin == FileOrigin.INDEXING_STAGING
assert record.file_type == expected_content_type
assert record.file_metadata == {
"index_attempt_id": 1001,
"cc_pair_id": 202,
"tenant_id": "tenant-xyz",
}
def test_set_raw_file_callback_on_base_connector() -> None:
"""set_raw_file_callback must install the callback as an instance
attribute usable by the connector."""
class _MinimalConnector(BaseConnector):
def load_credentials(
self,
credentials: dict[str, Any], # noqa: ARG002
) -> dict[str, Any] | None:
return None
connector = _MinimalConnector()
assert connector.raw_file_callback is None
sentinel_file_id = f"sentinel-{uuid4().hex[:8]}"
def _fake_callback(_content: Any, _content_type: str) -> str:
return sentinel_file_id
connector.set_raw_file_callback(_fake_callback)
assert connector.raw_file_callback is _fake_callback
assert connector.raw_file_callback(BytesIO(b""), "text/plain") == sentinel_file_id

View File

@@ -8,10 +8,8 @@ import io
from typing import NamedTuple
import pytest
import requests
from onyx.file_store.models import FileDescriptor
from tests.integration.common_utils.constants import API_SERVER_URL
from tests.integration.common_utils.managers.chat import ChatSessionManager
from tests.integration.common_utils.managers.file import FileManager
from tests.integration.common_utils.managers.llm_provider import LLMProviderManager
@@ -121,31 +119,3 @@ def test_public_assistant_with_user_files(
assert (
len(chat_history) >= 2
), "Expected at least 2 messages (user message and assistant response)"
def test_cannot_download_other_users_file_via_chat_file_endpoint(
user_file_setup: UserFileTestSetup,
) -> None:
storage_file_id = user_file_setup.user1_file_descriptor["id"]
user_file_id = user_file_setup.user1_file_id
owner_response = requests.get(
f"{API_SERVER_URL}/chat/file/{storage_file_id}",
headers=user_file_setup.user1_file_owner.headers,
)
assert owner_response.status_code == 200
assert owner_response.content, "Owner should receive the file contents"
for file_id in (storage_file_id, user_file_id):
user2_response = requests.get(
f"{API_SERVER_URL}/chat/file/{file_id}",
headers=user_file_setup.user2_non_owner.headers,
)
assert user2_response.status_code in (
403,
404,
), (
f"Expected access denied for non-owner, got {user2_response.status_code} "
f"when fetching file_id={file_id}"
)
assert user2_response.content != owner_response.content

View File

@@ -69,9 +69,6 @@ class TestGongConnectorCheckpoint:
workspace_ids=["ws1", None],
workspace_index=1,
cursor="abc123",
pending_transcripts={"call1": _make_transcript("call1")},
pending_call_details_attempts=2,
pending_retry_after=1234567890.5,
)
json_str = original.model_dump_json()
restored = connector.validate_checkpoint_json(json_str)
@@ -234,11 +231,7 @@ class TestGongConnectorCheckpoint:
mock_request: MagicMock,
connector: GongConnector,
) -> None:
"""Missing call details persist across checkpoint invocations and
eventually yield ConnectorFailure once MAX_CALL_DETAILS_ATTEMPTS is hit.
No in-call sleep — retries happen on subsequent invocations, gated by
the wall-clock retry-after deadline on the checkpoint.
"""
"""When call details are missing after retries, yield ConnectorFailure."""
transcript_response = MagicMock()
transcript_response.status_code = 200
transcript_response.json.return_value = {
@@ -264,42 +257,23 @@ class TestGongConnectorCheckpoint:
failures: list[ConnectorFailure] = []
docs: list[Document] = []
# Jump the clock past any retry deadline on each invocation so we
# exercise the retry path without real sleeping. The test for the
# backoff-gate itself lives in test_backoff_gate_prevents_retry_too_soon.
fake_now = [1_000_000.0]
def _advance_clock() -> float:
fake_now[0] += 10_000.0
return fake_now[0]
invocation_cap = GongConnector.MAX_CALL_DETAILS_ATTEMPTS + 5
with patch(
"onyx.connectors.gong.connector.time.time", side_effect=_advance_clock
):
for _ in range(invocation_cap):
if not checkpoint.has_more:
break
generator = connector.load_from_checkpoint(0, fake_now[0], checkpoint)
try:
while True:
item = next(generator)
if isinstance(item, ConnectorFailure):
failures.append(item)
elif isinstance(item, Document):
docs.append(item)
except StopIteration as e:
checkpoint = e.value
with patch("onyx.connectors.gong.connector.time.sleep"):
generator = connector.load_from_checkpoint(0, time.time(), checkpoint)
try:
while True:
item = next(generator)
if isinstance(item, ConnectorFailure):
failures.append(item)
elif isinstance(item, Document):
docs.append(item)
except StopIteration as e:
checkpoint = e.value
assert len(docs) == 0
assert len(failures) == 1
assert failures[0].failed_document is not None
assert failures[0].failed_document.document_id == "call1"
assert checkpoint.has_more is False
assert checkpoint.pending_transcripts == {}
assert checkpoint.pending_call_details_attempts == 0
assert checkpoint.pending_retry_after is None
assert mock_request.call_count == 1 + GongConnector.MAX_CALL_DETAILS_ATTEMPTS
@patch.object(GongConnector, "_throttled_request")
def test_multi_workspace_iteration(
@@ -407,14 +381,12 @@ class TestGongConnectorCheckpoint:
assert checkpoint.workspace_index == 1
@patch.object(GongConnector, "_throttled_request")
def test_partial_details_defers_and_resolves_next_invocation(
def test_retry_only_fetches_missing_ids(
self,
mock_request: MagicMock,
connector: GongConnector,
) -> None:
"""A transcript whose call details are missing gets stashed into
pending_transcripts and resolves on a later checkpoint invocation.
Resolved docs are yielded in the order they become available."""
"""Retry for missing call details should only re-request the missing IDs."""
transcript_response = MagicMock()
transcript_response.status_code = 200
transcript_response.json.return_value = {
@@ -432,7 +404,7 @@ class TestGongConnectorCheckpoint:
"calls": [_make_call_detail("call1", "Call One")]
}
# Second fetch (next invocation): returns call2
# Second fetch (retry): returns call2
missing_details = MagicMock()
missing_details.status_code = 200
missing_details.json.return_value = {
@@ -452,48 +424,19 @@ class TestGongConnectorCheckpoint:
)
docs: list[Document] = []
fake_now = [1_000_000.0]
def _advance_clock() -> float:
fake_now[0] += 10_000.0
return fake_now[0]
with patch(
"onyx.connectors.gong.connector.time.time", side_effect=_advance_clock
):
# Invocation 1: fetches page + details, yields call1, stashes call2
generator = connector.load_from_checkpoint(0, fake_now[0], checkpoint)
with patch("onyx.connectors.gong.connector.time.sleep"):
generator = connector.load_from_checkpoint(0, time.time(), checkpoint)
try:
while True:
item = next(generator)
if isinstance(item, Document):
docs.append(item)
except StopIteration as e:
checkpoint = e.value
assert len(docs) == 1
assert docs[0].semantic_identifier == "Call One"
assert "call2" in checkpoint.pending_transcripts
assert checkpoint.pending_call_details_attempts == 1
assert checkpoint.pending_retry_after is not None
assert checkpoint.has_more is True
# Invocation 2: retries missing (only call2), yields it, clears pending
generator = connector.load_from_checkpoint(0, fake_now[0], checkpoint)
try:
while True:
item = next(generator)
if isinstance(item, Document):
docs.append(item)
except StopIteration as e:
checkpoint = e.value
except StopIteration:
pass
assert len(docs) == 2
assert docs[0].semantic_identifier == "Call One"
assert docs[1].semantic_identifier == "Call Two"
assert checkpoint.pending_transcripts == {}
assert checkpoint.pending_call_details_attempts == 0
assert checkpoint.pending_retry_after is None
# Verify: 3 API calls total (1 transcript + 1 full details + 1 retry for missing only)
assert mock_request.call_count == 3
@@ -501,107 +444,6 @@ class TestGongConnectorCheckpoint:
retry_call_body = mock_request.call_args_list[2][1]["json"]
assert retry_call_body["filter"]["callIds"] == ["call2"]
@patch.object(GongConnector, "_throttled_request")
def test_backoff_gate_prevents_retry_too_soon(
self,
mock_request: MagicMock,
connector: GongConnector,
) -> None:
"""If the retry-after deadline hasn't elapsed, _resolve_pending must
NOT issue a /v2/calls/extensive request. Prevents burning through
MAX_CALL_DETAILS_ATTEMPTS when workers re-invoke tightly.
"""
pending_transcript = _make_transcript("call1")
fixed_now = 1_000_000.0
# Deadline is 30s in the future from fixed_now
retry_after = fixed_now + 30
checkpoint = GongConnectorCheckpoint(
has_more=True,
workspace_ids=[None],
workspace_index=0,
pending_transcripts={"call1": pending_transcript},
pending_call_details_attempts=1,
pending_retry_after=retry_after,
)
with patch("onyx.connectors.gong.connector.time.time", return_value=fixed_now):
generator = connector.load_from_checkpoint(0, fixed_now, checkpoint)
try:
while True:
next(generator)
except StopIteration as e:
checkpoint = e.value
# No API calls should have been made — we were inside the backoff window
mock_request.assert_not_called()
# Pending state preserved for later retry
assert "call1" in checkpoint.pending_transcripts
assert checkpoint.pending_call_details_attempts == 1
assert checkpoint.pending_retry_after == retry_after
assert checkpoint.has_more is True
@patch.object(GongConnector, "_throttled_request")
def test_pending_retry_does_not_block_on_time_sleep(
self,
mock_request: MagicMock,
connector: GongConnector,
) -> None:
"""Pending-transcript retry must never call time.sleep() with a
non-trivial delay — spacing between retries is enforced via the
wall-clock retry-after deadline stored on the checkpoint, not by
blocking inside load_from_checkpoint.
"""
transcript_response = MagicMock()
transcript_response.status_code = 200
transcript_response.json.return_value = {
"callTranscripts": [_make_transcript("call1")],
"records": {},
}
empty_details = MagicMock()
empty_details.status_code = 200
empty_details.json.return_value = {"calls": []}
mock_request.side_effect = [transcript_response] + [
empty_details
] * GongConnector.MAX_CALL_DETAILS_ATTEMPTS
checkpoint = GongConnectorCheckpoint(
has_more=True,
workspace_ids=[None],
workspace_index=0,
)
fake_now = [1_000_000.0]
def _advance_clock() -> float:
fake_now[0] += 10_000.0
return fake_now[0]
with (
patch("onyx.connectors.gong.connector.time.sleep") as mock_sleep,
patch(
"onyx.connectors.gong.connector.time.time", side_effect=_advance_clock
),
):
invocation_cap = GongConnector.MAX_CALL_DETAILS_ATTEMPTS + 5
for _ in range(invocation_cap):
if not checkpoint.has_more:
break
generator = connector.load_from_checkpoint(0, fake_now[0], checkpoint)
try:
while True:
next(generator)
except StopIteration as e:
checkpoint = e.value
# The only legitimate sleep is the sub-second throttle in
# _throttled_request (<= MIN_REQUEST_INTERVAL). Assert we never
# sleep for anything close to the per-retry backoff delays.
for call in mock_sleep.call_args_list:
delay_arg = call.args[0] if call.args else 0
assert delay_arg <= GongConnector.MIN_REQUEST_INTERVAL
@patch.object(GongConnector, "_throttled_request")
def test_expired_cursor_restarts_workspace(
self,

View File

@@ -287,140 +287,3 @@ class TestFailedFolderIdsByEmail:
)
assert len(failed_map) == 0
class TestOrphanedPathBackfill:
def _make_failed_map(
self, entries: dict[str, set[str]]
) -> ThreadSafeDict[str, ThreadSafeSet[str]]:
return ThreadSafeDict({k: ThreadSafeSet(v) for k, v in entries.items()})
def _make_file(self, parent_id: str) -> MagicMock:
file = MagicMock()
file.user_email = "retriever@example.com"
file.drive_file = {"parents": [parent_id]}
return file
def test_backfills_intermediate_folders_into_failed_map(self) -> None:
"""When a walk dead-ends at a confirmed orphan, all intermediate folder
IDs must be added to failed_folder_ids_by_email for both emails so
future files short-circuit via _get_folder_metadata's cache check."""
connector = _make_connector()
# Chain: folderA -> folderB -> folderC (confirmed orphan)
failed_map = self._make_failed_map(
{
"retriever@example.com": {"folderC"},
"admin@example.com": {"folderC"},
}
)
folder_a = {"id": "folderA", "name": "A", "parents": ["folderB"]}
folder_b = {"id": "folderB", "name": "B", "parents": ["folderC"]}
def mock_get_folder(
_service: MagicMock, folder_id: str, _field_type: DriveFileFieldType
) -> dict | None:
if folder_id == "folderA":
return folder_a
if folder_id == "folderB":
return folder_b
return None
with (
patch(
"onyx.connectors.google_drive.connector.get_drive_service",
return_value=MagicMock(),
),
patch(
"onyx.connectors.google_drive.connector.get_folder_metadata",
side_effect=mock_get_folder,
),
):
connector._get_new_ancestors_for_files(
files=[self._make_file("folderA")],
seen_hierarchy_node_raw_ids=ThreadSafeSet(),
fully_walked_hierarchy_node_raw_ids=ThreadSafeSet(),
failed_folder_ids_by_email=failed_map,
)
# Both emails confirmed folderC as orphan, so both get the backfill
for email in ("retriever@example.com", "admin@example.com"):
cached = failed_map.get(email, ThreadSafeSet())
assert "folderA" in cached
assert "folderB" in cached
assert "folderC" in cached
def test_backfills_only_for_confirming_email(self) -> None:
"""Only the email that confirmed the orphan gets the path backfilled."""
connector = _make_connector()
# Only retriever confirmed folderC as orphan; admin has no entry
failed_map = self._make_failed_map({"retriever@example.com": {"folderC"}})
folder_a = {"id": "folderA", "name": "A", "parents": ["folderB"]}
folder_b = {"id": "folderB", "name": "B", "parents": ["folderC"]}
def mock_get_folder(
_service: MagicMock, folder_id: str, _field_type: DriveFileFieldType
) -> dict | None:
if folder_id == "folderA":
return folder_a
if folder_id == "folderB":
return folder_b
return None
with (
patch(
"onyx.connectors.google_drive.connector.get_drive_service",
return_value=MagicMock(),
),
patch(
"onyx.connectors.google_drive.connector.get_folder_metadata",
side_effect=mock_get_folder,
),
):
connector._get_new_ancestors_for_files(
files=[self._make_file("folderA")],
seen_hierarchy_node_raw_ids=ThreadSafeSet(),
fully_walked_hierarchy_node_raw_ids=ThreadSafeSet(),
failed_folder_ids_by_email=failed_map,
)
retriever_cached = failed_map.get("retriever@example.com", ThreadSafeSet())
assert "folderA" in retriever_cached
assert "folderB" in retriever_cached
# admin did not confirm the orphan — must not get the backfill
assert failed_map.get("admin@example.com") is None
def test_short_circuits_on_backfilled_intermediate(self) -> None:
"""A second file whose parent is already in failed_folder_ids_by_email
must not trigger any folder metadata API calls."""
connector = _make_connector()
# folderA already in the failed map from a previous walk
failed_map = self._make_failed_map(
{
"retriever@example.com": {"folderA"},
"admin@example.com": {"folderA"},
}
)
with (
patch(
"onyx.connectors.google_drive.connector.get_drive_service",
return_value=MagicMock(),
),
patch(
"onyx.connectors.google_drive.connector.get_folder_metadata"
) as mock_api,
):
connector._get_new_ancestors_for_files(
files=[self._make_file("folderA")],
seen_hierarchy_node_raw_ids=ThreadSafeSet(),
fully_walked_hierarchy_node_raw_ids=ThreadSafeSet(),
failed_folder_ids_by_email=failed_map,
)
mock_api.assert_not_called()

View File

@@ -1,243 +0,0 @@
"""Unit tests for WebConnector.retrieve_all_slim_docs (slim pruning path)."""
from __future__ import annotations
from typing import Any
from unittest.mock import MagicMock
from unittest.mock import patch
from onyx.connectors.models import SlimDocument
from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS
from onyx.connectors.web.connector import WebConnector
BASE_URL = "http://example.com"
SINGLE_PAGE_HTML = (
"<html><body><p>Content that should not appear in slim output</p></body></html>"
)
RECURSIVE_ROOT_HTML = """
<html><body>
<a href="/page2">Page 2</a>
<a href="/page3">Page 3</a>
</body></html>
"""
PAGE2_HTML = "<html><body><p>page 2</p></body></html>"
PAGE3_HTML = "<html><body><p>page 3</p></body></html>"
def _make_playwright_context_mock(url_to_html: dict[str, str]) -> MagicMock:
"""Return a BrowserContext mock whose pages respond based on goto URL."""
context = MagicMock()
def _new_page() -> MagicMock:
page = MagicMock()
visited: list[str] = []
def _goto(url: str, **kwargs: Any) -> MagicMock: # noqa: ARG001
visited.append(url)
page.url = url
response = MagicMock()
response.status = 200
response.header_value.return_value = None # no cf-ray
return response
def _content() -> str:
return url_to_html.get(
visited[-1] if visited else "", "<html><body></body></html>"
)
page.goto.side_effect = _goto
page.content.side_effect = _content
return page
context.new_page.side_effect = _new_page
return context
def _make_playwright_mock() -> MagicMock:
playwright = MagicMock()
playwright.stop = MagicMock()
return playwright
def _make_page_mock(
html: str, cf_ray: str | None = None, status: int = 200
) -> MagicMock:
"""Return a Playwright page mock with configurable status and CF header."""
page = MagicMock()
page.url = BASE_URL + "/"
response = MagicMock()
response.status = status
response.header_value.side_effect = lambda h: cf_ray if h == "cf-ray" else None
page.goto.return_value = response
page.content.return_value = html
return page
@patch("onyx.connectors.web.connector.check_internet_connection")
@patch("onyx.connectors.web.connector.requests.head")
@patch("onyx.connectors.web.connector.start_playwright")
def test_slim_yields_slim_documents(
mock_start_playwright: MagicMock,
mock_head: MagicMock,
_mock_check: MagicMock,
) -> None:
"""retrieve_all_slim_docs yields SlimDocuments with the correct URL as id."""
context = _make_playwright_context_mock({BASE_URL + "/": SINGLE_PAGE_HTML})
mock_start_playwright.return_value = (_make_playwright_mock(), context)
mock_head.return_value.headers = {"content-type": "text/html"}
connector = WebConnector(
base_url=BASE_URL + "/",
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
)
docs = [doc for batch in connector.retrieve_all_slim_docs() for doc in batch]
assert len(docs) == 1
assert isinstance(docs[0], SlimDocument)
assert docs[0].id == BASE_URL + "/"
@patch("onyx.connectors.web.connector.check_internet_connection")
@patch("onyx.connectors.web.connector.requests.head")
@patch("onyx.connectors.web.connector.start_playwright")
def test_slim_skips_content_extraction(
mock_start_playwright: MagicMock,
mock_head: MagicMock,
_mock_check: MagicMock,
) -> None:
"""web_html_cleanup is never called in slim mode."""
context = _make_playwright_context_mock({BASE_URL + "/": SINGLE_PAGE_HTML})
mock_start_playwright.return_value = (_make_playwright_mock(), context)
mock_head.return_value.headers = {"content-type": "text/html"}
connector = WebConnector(
base_url=BASE_URL + "/",
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
)
with patch("onyx.connectors.web.connector.web_html_cleanup") as mock_cleanup:
list(connector.retrieve_all_slim_docs())
mock_cleanup.assert_not_called()
@patch("onyx.connectors.web.connector.check_internet_connection")
@patch("onyx.connectors.web.connector.requests.head")
@patch("onyx.connectors.web.connector.start_playwright")
def test_slim_discovers_links_recursively(
mock_start_playwright: MagicMock,
mock_head: MagicMock,
_mock_check: MagicMock,
) -> None:
"""In RECURSIVE mode, internal <a href> links are followed and all URLs yielded."""
url_to_html = {
BASE_URL + "/": RECURSIVE_ROOT_HTML,
BASE_URL + "/page2": PAGE2_HTML,
BASE_URL + "/page3": PAGE3_HTML,
}
context = _make_playwright_context_mock(url_to_html)
mock_start_playwright.return_value = (_make_playwright_mock(), context)
mock_head.return_value.headers = {"content-type": "text/html"}
connector = WebConnector(
base_url=BASE_URL + "/",
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value,
)
ids = {
doc.id
for batch in connector.retrieve_all_slim_docs()
for doc in batch
if isinstance(doc, SlimDocument)
}
assert ids == {
BASE_URL + "/",
BASE_URL + "/page2",
BASE_URL + "/page3",
}
@patch("onyx.connectors.web.connector.check_internet_connection")
@patch("onyx.connectors.web.connector.requests.head")
@patch("onyx.connectors.web.connector.start_playwright")
def test_normal_200_skips_5s_wait(
mock_start_playwright: MagicMock,
mock_head: MagicMock,
_mock_check: MagicMock,
) -> None:
"""Normal 200 responses without bot-detection signals skip the 5s render wait."""
page = _make_page_mock(SINGLE_PAGE_HTML, cf_ray=None, status=200)
context = MagicMock()
context.new_page.return_value = page
mock_start_playwright.return_value = (_make_playwright_mock(), context)
mock_head.return_value.headers = {"content-type": "text/html"}
connector = WebConnector(
base_url=BASE_URL + "/",
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
)
list(connector.retrieve_all_slim_docs())
page.wait_for_timeout.assert_not_called()
@patch("onyx.connectors.web.connector.check_internet_connection")
@patch("onyx.connectors.web.connector.requests.head")
@patch("onyx.connectors.web.connector.start_playwright")
def test_cloudflare_applies_5s_wait(
mock_start_playwright: MagicMock,
mock_head: MagicMock,
_mock_check: MagicMock,
) -> None:
"""Pages with a cf-ray header trigger the 5s wait before networkidle."""
page = _make_page_mock(SINGLE_PAGE_HTML, cf_ray="abc123-LAX")
context = MagicMock()
context.new_page.return_value = page
mock_start_playwright.return_value = (_make_playwright_mock(), context)
mock_head.return_value.headers = {"content-type": "text/html"}
connector = WebConnector(
base_url=BASE_URL + "/",
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
)
list(connector.retrieve_all_slim_docs())
page.wait_for_timeout.assert_called_once_with(5000)
@patch("onyx.connectors.web.connector.time")
@patch("onyx.connectors.web.connector.check_internet_connection")
@patch("onyx.connectors.web.connector.requests.head")
@patch("onyx.connectors.web.connector.start_playwright")
def test_403_applies_5s_wait(
mock_start_playwright: MagicMock,
mock_head: MagicMock,
_mock_check: MagicMock,
_mock_time: MagicMock,
) -> None:
"""A 403 response triggers the 5s wait (common bot-detection challenge entry point)."""
page = _make_page_mock(SINGLE_PAGE_HTML, cf_ray=None, status=403)
context = MagicMock()
context.new_page.return_value = page
mock_start_playwright.return_value = (_make_playwright_mock(), context)
mock_head.return_value.headers = {"content-type": "text/html"}
connector = WebConnector(
base_url=BASE_URL + "/",
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
)
# All retries return 403 so no docs are found — that's expected here.
# We only care that the 5s wait fired.
try:
list(connector.retrieve_all_slim_docs())
except RuntimeError:
pass
page.wait_for_timeout.assert_called_with(5000)

View File

@@ -1,11 +1,13 @@
import io
from typing import cast
from unittest.mock import MagicMock
from unittest.mock import patch
import openpyxl
from openpyxl.worksheet.worksheet import Worksheet
from onyx.file_processing.extract_file_text import _sheet_to_csv
from onyx.file_processing.extract_file_text import _clean_worksheet_matrix
from onyx.file_processing.extract_file_text import _worksheet_to_matrix
from onyx.file_processing.extract_file_text import xlsx_sheet_extraction
from onyx.file_processing.extract_file_text import xlsx_to_text
@@ -201,179 +203,50 @@ class TestXlsxToText:
assert "r3c1" in lines[2] and "r3c2" in lines[2]
class TestSheetToCsvJaggedRows:
"""openpyxl's read-only mode yields rows of differing widths when
trailing cells are empty. These tests exercise ``_sheet_to_csv``
directly because ``_make_xlsx`` (via ``ws.append``) normalizes row
widths, so jagged input can only be produced in-memory."""
class TestWorksheetToMatrixJaggedRows:
"""openpyxl read_only mode can yield rows of differing widths when
trailing cells are empty. The matrix must be padded to a rectangle
so downstream column cleanup can index safely."""
def test_shorter_trailing_rows_padded_in_output(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
("A", "B", "C"),
("X", "Y"),
("P",),
]
)
def test_pads_shorter_trailing_rows(self) -> None:
ws = MagicMock()
ws.iter_rows.return_value = iter(
[
("A", "B", "C"),
("X", "Y"),
("P",),
]
)
assert csv_text.split("\n") == ["A,B,C", "X,Y,", "P,,"]
matrix = _worksheet_to_matrix(ws)
assert matrix == [["A", "B", "C"], ["X", "Y", ""], ["P", "", ""]]
def test_shorter_leading_row_padded_in_output(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
("A",),
("X", "Y", "Z"),
]
)
def test_pads_when_first_row_is_shorter(self) -> None:
ws = MagicMock()
ws.iter_rows.return_value = iter(
[
("A",),
("X", "Y", "Z"),
]
)
assert csv_text.split("\n") == ["A,,", "X,Y,Z"]
matrix = _worksheet_to_matrix(ws)
assert matrix == [["A", "", ""], ["X", "Y", "Z"]]
def test_no_index_error_on_jagged_rows(self) -> None:
"""Regression: the original dense-matrix version raised IndexError
when a later row was shorter than an earlier row whose out-of-range
columns happened to be empty."""
csv_text = _sheet_to_csv(
iter(
[
("A", "", "", "B"),
("X", "Y"),
]
)
def test_clean_worksheet_matrix_no_index_error_on_jagged_rows(self) -> None:
"""Regression: previously raised IndexError when a later row was
shorter than the first row and the out-of-range column on the
first row was empty (so the short-circuit in `all()` did not
save us)."""
ws = MagicMock()
ws.iter_rows.return_value = iter(
[
("A", "", "", "B"),
("X", "Y"),
]
)
assert csv_text.split("\n") == ["A,,,B", "X,Y,,"]
class TestSheetToCsvStreaming:
"""Pin the memory-safe streaming contract: empty rows are skipped
cheaply, empty-row/column runs are collapsed to at most 2, and sheets
with no data return the empty string."""
def test_empty_rows_between_data_capped_at_two(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
("A", "B"),
(None, None),
(None, None),
(None, None),
(None, None),
(None, None),
("C", "D"),
]
)
)
# 5 empty rows collapsed to 2
assert csv_text.split("\n") == ["A,B", ",", ",", "C,D"]
def test_empty_rows_at_or_below_cap_preserved(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
("A", "B"),
(None, None),
(None, None),
("C", "D"),
]
)
)
assert csv_text.split("\n") == ["A,B", ",", ",", "C,D"]
def test_empty_column_run_capped_at_two(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
("A", None, None, None, None, "B"),
("C", None, None, None, None, "D"),
]
)
)
# 4 empty cols between A and B collapsed to 2
assert csv_text.split("\n") == ["A,,,B", "C,,,D"]
def test_completely_empty_stream_returns_empty_string(self) -> None:
assert _sheet_to_csv(iter([])) == ""
def test_all_rows_empty_returns_empty_string(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
(None, None),
("", ""),
(None,),
]
)
)
assert csv_text == ""
def test_trailing_empty_rows_dropped(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
("A",),
("B",),
(None,),
(None,),
(None,),
]
)
)
# Trailing empties are never emitted (no subsequent non-empty row
# to flush them against).
assert csv_text.split("\n") == ["A", "B"]
def test_leading_empty_rows_capped_at_two(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
(None, None),
(None, None),
(None, None),
(None, None),
(None, None),
("A", "B"),
]
)
)
# 5 leading empty rows collapsed to 2
assert csv_text.split("\n") == [",", ",", "A,B"]
def test_cell_cap_truncates_and_appends_marker(self) -> None:
"""When total non-empty cells exceeds the cap, scanning stops and
a truncation marker row is appended so downstream indexing sees
the sheet was cut off."""
with patch(
"onyx.file_processing.extract_file_text.MAX_XLSX_CELLS_PER_SHEET", 5
):
csv_text = _sheet_to_csv(
iter(
[
("A", "B", "C"),
("D", "E", "F"),
("G", "H", "I"),
("J", "K", "L"),
]
)
)
lines = csv_text.split("\n")
assert lines[-1] == "[truncated: sheet exceeded cell limit]"
# First two rows (6 cells) trip the cap=5 check after row 2; the
# third and fourth rows are never scanned.
assert "G" not in csv_text
assert "J" not in csv_text
def test_cell_cap_not_hit_no_marker(self) -> None:
"""Under the cap, no truncation marker is appended."""
csv_text = _sheet_to_csv(
iter(
[
("A", "B"),
("C", "D"),
]
)
)
assert "[truncated" not in csv_text
matrix = _worksheet_to_matrix(ws)
# Must not raise.
cleaned = _clean_worksheet_matrix(matrix)
assert cleaned == [["A", "", "", "B"], ["X", "Y", "", ""]]
class TestXlsxSheetExtraction:
@@ -408,9 +281,10 @@ class TestXlsxSheetExtraction:
assert "a" in csv_text
assert "b" in csv_text
def test_empty_sheet_included_with_empty_csv(self) -> None:
"""Every sheet in the workbook appears in the result; an empty
sheet contributes an empty csv_text alongside its title."""
def test_empty_sheet_is_skipped(self) -> None:
"""A sheet whose CSV output is empty/whitespace-only should NOT
appear in the result — the `if csv_text.strip():` guard filters
it out."""
xlsx = _make_xlsx(
{
"Data": [["a", "b"]],
@@ -418,17 +292,14 @@ class TestXlsxSheetExtraction:
}
)
sheets = xlsx_sheet_extraction(xlsx)
assert len(sheets) == 2
titles = [title for _csv, title in sheets]
assert titles == ["Data", "Empty"]
empty_csv = next(csv_text for csv_text, title in sheets if title == "Empty")
assert empty_csv == ""
assert len(sheets) == 1
assert sheets[0][1] == "Data"
def test_empty_workbook_returns_one_tuple_per_sheet(self) -> None:
"""All sheets empty → one empty-csv tuple per sheet."""
def test_empty_workbook_returns_empty_list(self) -> None:
"""All sheets empty → empty list (not a list of empty tuples)."""
xlsx = _make_xlsx({"Sheet1": [], "Sheet2": []})
sheets = xlsx_sheet_extraction(xlsx)
assert sheets == [("", "Sheet1"), ("", "Sheet2")]
assert sheets == []
def test_single_sheet(self) -> None:
xlsx = _make_xlsx({"Only": [["x", "y"], ["1", "2"]]})

View File

@@ -5,7 +5,7 @@ home: https://www.onyx.app/
sources:
- "https://github.com/onyx-dot-app/onyx"
type: application
version: 0.4.47
version: 0.4.46
appVersion: latest
annotations:
category: Productivity

View File

@@ -1,8 +0,0 @@
{{- range .Values.extraManifests }}
---
{{- if kindIs "string" . }}
{{ tpl . $ }}
{{- else }}
{{ tpl (toYaml .) $ }}
{{- end }}
{{- end }}

View File

@@ -1316,26 +1316,3 @@ configMap:
HARD_DELETE_CHATS: ""
MAX_ALLOWED_UPLOAD_SIZE_MB: ""
DEFAULT_USER_FILE_MAX_UPLOAD_SIZE_MB: ""
# -- Additional arbitrary manifests to render as part of the release. Each entry
# may be either a YAML mapping (a single Kubernetes object) or a multi-line
# string that will be passed through `tpl` so it can reference release values.
# Useful for injecting resources not covered by the chart (e.g. NetworkPolicies,
# ExternalSecrets, custom CRs) without forking the chart.
extraManifests: []
# extraManifests:
# - apiVersion: v1
# kind: ConfigMap
# metadata:
# name: my-extra-config
# data:
# key: value
# - |
# apiVersion: networking.k8s.io/v1
# kind: NetworkPolicy
# metadata:
# name: {{ include "onyx.fullname" . }}-extra
# spec:
# podSelector: {}
# policyTypes:
# - Ingress

View File

@@ -82,28 +82,6 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]]
name = "aws-lc-rs"
version = "1.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc"
dependencies = [
"aws-lc-sys",
"zeroize",
]
[[package]]
name = "aws-lc-sys"
version = "0.39.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83a25cf98105baa966497416dbd42565ce3a8cf8dbfd59803ec9ad46f3126399"
dependencies = [
"cc",
"cmake",
"dunce",
"fs_extra",
]
[[package]]
name = "base64"
version = "0.21.7"
@@ -277,8 +255,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2"
dependencies = [
"find-msvc-tools",
"jobserver",
"libc",
"shlex",
]
@@ -315,12 +291,6 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "cfg_aliases"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
[[package]]
name = "chrono"
version = "0.4.44"
@@ -333,15 +303,6 @@ dependencies = [
"windows-link 0.2.1",
]
[[package]]
name = "cmake"
version = "0.1.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678"
dependencies = [
"cc",
]
[[package]]
name = "combine"
version = "4.6.7"
@@ -819,12 +780,6 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "fs_extra"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "futf"
version = "0.1.5"
@@ -1042,10 +997,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
dependencies = [
"cfg-if",
"js-sys",
"libc",
"wasi 0.11.1+wasi-snapshot-preview1",
"wasm-bindgen",
]
[[package]]
@@ -1055,11 +1008,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
dependencies = [
"cfg-if",
"js-sys",
"libc",
"r-efi 5.3.0",
"wasip2",
"wasm-bindgen",
]
[[package]]
@@ -1334,22 +1285,6 @@ dependencies = [
"want",
]
[[package]]
name = "hyper-rustls"
version = "0.27.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
dependencies = [
"http",
"hyper",
"hyper-util",
"rustls",
"rustls-pki-types",
"tokio",
"tokio-rustls",
"tower-service",
]
[[package]]
name = "hyper-util"
version = "0.1.20"
@@ -1652,16 +1587,6 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130"
[[package]]
name = "jobserver"
version = "0.1.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
dependencies = [
"getrandom 0.3.4",
"libc",
]
[[package]]
name = "js-sys"
version = "0.3.91"
@@ -1793,12 +1718,6 @@ version = "0.4.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
[[package]]
name = "lru-slab"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
[[package]]
name = "mac"
version = "0.1.1"
@@ -2122,7 +2041,6 @@ name = "onyx"
version = "0.0.0-dev"
dependencies = [
"directories",
"reqwest",
"serde",
"serde_json",
"tauri",
@@ -2147,12 +2065,6 @@ dependencies = [
"pathdiff",
]
[[package]]
name = "openssl-probe"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
[[package]]
name = "option-ext"
version = "0.2.0"
@@ -2543,62 +2455,6 @@ dependencies = [
"memchr",
]
[[package]]
name = "quinn"
version = "0.11.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
dependencies = [
"bytes",
"cfg_aliases",
"pin-project-lite",
"quinn-proto",
"quinn-udp",
"rustc-hash",
"rustls",
"socket2",
"thiserror 2.0.18",
"tokio",
"tracing",
"web-time",
]
[[package]]
name = "quinn-proto"
version = "0.11.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
dependencies = [
"aws-lc-rs",
"bytes",
"getrandom 0.3.4",
"lru-slab",
"rand 0.9.2",
"ring",
"rustc-hash",
"rustls",
"rustls-pki-types",
"slab",
"thiserror 2.0.18",
"tinyvec",
"tracing",
"web-time",
]
[[package]]
name = "quinn-udp"
version = "0.5.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
dependencies = [
"cfg_aliases",
"libc",
"once_cell",
"socket2",
"tracing",
"windows-sys 0.59.0",
]
[[package]]
name = "quote"
version = "1.0.45"
@@ -2645,16 +2501,6 @@ dependencies = [
"rand_core 0.6.4",
]
[[package]]
name = "rand"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
dependencies = [
"rand_chacha 0.9.0",
"rand_core 0.9.5",
]
[[package]]
name = "rand_chacha"
version = "0.2.2"
@@ -2675,16 +2521,6 @@ dependencies = [
"rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
"rand_core 0.9.5",
]
[[package]]
name = "rand_core"
version = "0.5.1"
@@ -2703,15 +2539,6 @@ dependencies = [
"getrandom 0.2.17",
]
[[package]]
name = "rand_core"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
dependencies = [
"getrandom 0.3.4",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
@@ -2830,21 +2657,15 @@ dependencies = [
"http-body",
"http-body-util",
"hyper",
"hyper-rustls",
"hyper-util",
"js-sys",
"log",
"percent-encoding",
"pin-project-lite",
"quinn",
"rustls",
"rustls-pki-types",
"rustls-platform-verifier",
"serde",
"serde_json",
"sync_wrapper",
"tokio",
"tokio-rustls",
"tokio-util",
"tower",
"tower-http",
@@ -2856,26 +2677,6 @@ dependencies = [
"web-sys",
]
[[package]]
name = "ring"
version = "0.17.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
dependencies = [
"cc",
"cfg-if",
"getrandom 0.2.17",
"libc",
"untrusted",
"windows-sys 0.52.0",
]
[[package]]
name = "rustc-hash"
version = "2.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
[[package]]
name = "rustc_version"
version = "0.4.1"
@@ -2885,81 +2686,6 @@ dependencies = [
"semver",
]
[[package]]
name = "rustls"
version = "0.23.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
dependencies = [
"aws-lc-rs",
"once_cell",
"rustls-pki-types",
"rustls-webpki",
"subtle",
"zeroize",
]
[[package]]
name = "rustls-native-certs"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63"
dependencies = [
"openssl-probe",
"rustls-pki-types",
"schannel",
"security-framework",
]
[[package]]
name = "rustls-pki-types"
version = "1.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd"
dependencies = [
"web-time",
"zeroize",
]
[[package]]
name = "rustls-platform-verifier"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d99feebc72bae7ab76ba994bb5e121b8d83d910ca40b36e0921f53becc41784"
dependencies = [
"core-foundation",
"core-foundation-sys",
"jni",
"log",
"once_cell",
"rustls",
"rustls-native-certs",
"rustls-platform-verifier-android",
"rustls-webpki",
"security-framework",
"security-framework-sys",
"webpki-root-certs",
"windows-sys 0.59.0",
]
[[package]]
name = "rustls-platform-verifier-android"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f"
[[package]]
name = "rustls-webpki"
version = "0.103.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef"
dependencies = [
"aws-lc-rs",
"ring",
"rustls-pki-types",
"untrusted",
]
[[package]]
name = "rustversion"
version = "1.0.22"
@@ -2975,15 +2701,6 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "schannel"
version = "0.1.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939"
dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "schemars"
version = "0.8.22"
@@ -3041,29 +2758,6 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "security-framework"
version = "3.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
dependencies = [
"bitflags 2.11.0",
"core-foundation",
"core-foundation-sys",
"libc",
"security-framework-sys",
]
[[package]]
name = "security-framework-sys"
version = "2.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "selectors"
version = "0.24.0"
@@ -3434,12 +3128,6 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "subtle"
version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]]
name = "swift-rs"
version = "1.0.7"
@@ -3921,21 +3609,6 @@ dependencies = [
"zerovec",
]
[[package]]
name = "tinyvec"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.50.0"
@@ -3950,16 +3623,6 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "tokio-rustls"
version = "0.26.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61"
dependencies = [
"rustls",
"tokio",
]
[[package]]
name = "tokio-util"
version = "0.7.18"
@@ -4241,12 +3904,6 @@ version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
[[package]]
name = "untrusted"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
[[package]]
name = "url"
version = "2.5.8"
@@ -4493,16 +4150,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "web-time"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "webkit2gtk"
version = "2.0.2"
@@ -4547,15 +4194,6 @@ dependencies = [
"system-deps",
]
[[package]]
name = "webpki-root-certs"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca"
dependencies = [
"rustls-pki-types",
]
[[package]]
name = "webview2-com"
version = "0.38.2"
@@ -4810,15 +4448,6 @@ dependencies = [
"windows-targets 0.48.5",
]
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-sys"
version = "0.59.0"
@@ -5359,12 +4988,6 @@ dependencies = [
"synstructure",
]
[[package]]
name = "zeroize"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
[[package]]
name = "zerotrie"
version = "0.2.3"

View File

@@ -19,7 +19,6 @@ directories = "5.0"
tokio = { version = "1", features = ["time"] }
window-vibrancy = "0.7.1"
url = "2.5"
reqwest = { version = "0.13", default-features = false, features = ["rustls"] }
[features]
default = ["custom-protocol"]

View File

@@ -583,31 +583,6 @@ fn open_in_default_browser(url: &str) -> bool {
false
}
#[tauri::command]
async fn check_server_reachable(state: tauri::State<'_, ConfigState>) -> Result<(), String> {
let url = state.config.read().unwrap().server_url.clone();
let parsed = Url::parse(&url).map_err(|e| format!("Invalid URL: {}", e))?;
match parsed.scheme() {
"http" | "https" => {}
_ => return Err("URL must use http or https".to_string()),
}
let client = reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(5))
.build()
.map_err(|e| format!("Failed to build HTTP client: {}", e))?;
match client.head(parsed).send().await {
Ok(_) => Ok(()),
// Only definitive "server didn't answer" errors count as unreachable.
// TLS / decode / redirect errors imply the server is listening — the
// webview, which has its own trust store, is likely to succeed even
// when rustls rejects a self-signed cert.
Err(e) if e.is_connect() || e.is_timeout() => Err(e.to_string()),
Err(_) => Ok(()),
}
}
#[tauri::command]
fn open_in_browser(url: String) -> Result<(), String> {
let parsed_url = Url::parse(&url).map_err(|_| "Invalid URL".to_string())?;
@@ -1319,7 +1294,6 @@ fn main() {
get_bootstrap_state,
set_server_url,
get_config_path_cmd,
check_server_reachable,
open_in_browser,
open_config_file,
open_config_directory,

View File

@@ -459,19 +459,8 @@
return;
}
// Not first launch and not explicit settings — confirm the server
// is reachable before handing the webview over. Otherwise the user
// lands on a native "connection refused" page with no way back.
try {
await invoke("check_server_reachable");
} catch (reachErr) {
showSettings();
showError(
`Could not connect to ${currentServerUrl}. Check the URL and your network connection.`,
);
return;
}
// Not first launch and not explicit settings
// Auto-redirect to configured domain
window.location.href = currentServerUrl;
} catch (error) {
// On error, default to cloud

View File

@@ -6,6 +6,7 @@ import { LlmDescriptor, LlmManager } from "@/lib/hooks";
import { structureValue } from "@/lib/llmConfig/utils";
import { getModelIcon } from "@/lib/llmConfig";
import { AGGREGATOR_PROVIDERS } from "@/lib/llmConfig/svc";
import { Slider } from "@/components/ui/slider";
import { useUser } from "@/providers/UserProvider";
import Text from "@/refresh-components/texts/Text";

View File

@@ -3,20 +3,18 @@
import { useState, useMemo, useRef, useEffect } from "react";
import { PopoverMenu } from "@/refresh-components/Popover";
import InputTypeIn from "@/refresh-components/inputs/InputTypeIn";
import { Button, LineItemButton, Text } from "@opal/components";
import { SvgCheck, SvgChevronRight } from "@opal/icons";
import { Text } from "@opal/components";
import { SvgCheck, SvgChevronDown, SvgChevronRight } from "@opal/icons";
import { Section } from "@/layouts/general-layouts";
import { LLMOption } from "./interfaces";
import { buildLlmOptions, groupLlmOptions } from "./LLMPopover";
import LineItem from "@/refresh-components/buttons/LineItem";
import { LLMProviderDescriptor } from "@/interfaces/llm";
import {
Collapsible,
CollapsibleContent,
CollapsibleTrigger,
} from "@/refresh-components/Collapsible";
import { cn } from "@opal/utils";
import { Interactive } from "@opal/core";
import { ContentAction } from "@opal/layouts";
export interface ModelListContentProps {
llmProviders: LLMProviderDescriptor[] | undefined;
@@ -116,22 +114,20 @@ export default function ModelListContent({
capabilities.length > 0 ? capabilities.join(", ") : undefined;
return (
<LineItemButton
<LineItem
key={`${option.provider}:${option.modelName}`}
selectVariant="select-heavy"
state={selected ? "selected" : "empty"}
icon={(props) => <div {...(props as any)} />}
title={option.displayName}
selected={selected}
disabled={disabled}
description={description}
onClick={() => onSelect(option)}
rightChildren={
selected ? (
<SvgCheck className="text-action-link-05" size={16} />
<SvgCheck className="h-4 w-4 stroke-action-link-05 shrink-0" />
) : null
}
sizePreset="main-ui"
rounding="sm"
/>
>
{option.displayName}
</LineItem>
);
};
@@ -160,7 +156,7 @@ export default function ModelListContent({
]
: groupedOptions.length === 1
? [
<Section key="single-provider" gap={1}>
<Section key="single-provider" gap={0.25}>
{groupedOptions[0]!.options.map(renderModelItem)}
</Section>,
]
@@ -171,45 +167,22 @@ export default function ModelListContent({
key={group.key}
open={open}
onOpenChange={() => toggleGroup(group.key)}
className="flex flex-col gap-1"
>
<CollapsibleTrigger asChild>
<Interactive.Stateless prominence="tertiary">
<Interactive.Container
size="fit"
rounding="sm"
width="full"
>
<div className="pl-2 pr-1 py-1 w-full">
<ContentAction
sizePreset="secondary"
variant="body"
prominence="muted"
icon={group.Icon}
title={group.displayName}
padding="fit"
rightChildren={
<Section>
<Button
icon={(props) => (
<SvgChevronRight
{...props}
className={cn(
"transition-all",
open && "rotate-90",
props.className
)}
/>
)}
prominence="tertiary"
size="sm"
/>
</Section>
}
/>
</div>
</Interactive.Container>
</Interactive.Stateless>
<LineItem
muted
icon={group.Icon}
strokeIcon={false}
rightChildren={
open ? (
<SvgChevronDown className="h-4 w-4 stroke-text-04 shrink-0" />
) : (
<SvgChevronRight className="h-4 w-4 stroke-text-04 shrink-0" />
)
}
>
{group.displayName}
</LineItem>
</CollapsibleTrigger>
<CollapsibleContent>

View File

@@ -51,16 +51,14 @@ test.describe("LLM Ordering", () => {
await page.waitForSelector('[role="dialog"]', { timeout: 5000 });
const dialog = page.locator('[role="dialog"]');
const allModelItems = dialog.locator("[data-interactive-state]");
const allModelItems = dialog.locator("[data-selected]");
await expect(allModelItems.first()).toBeVisible({ timeout: 5000 });
const count = await allModelItems.count();
expect(count).toBeGreaterThan(0);
// Pick the first non-selected model so the trigger text changes after click
const nonSelectedItem = dialog
.locator('[data-interactive-state="empty"]')
.first();
const nonSelectedItem = dialog.locator('[data-selected="false"]').first();
const hasNonSelected = (await nonSelectedItem.count()) > 0;
const targetItem = hasNonSelected ? nonSelectedItem : allModelItems.first();

View File

@@ -300,13 +300,11 @@ test.describe("LLM Runtime Selection", () => {
const regenerateDialog = page.locator('[role="dialog"]');
const alternateModelOption = regenerateDialog
.locator('[data-interactive-state="empty"]')
.locator('[data-selected="false"]')
.first();
test.skip(
(await regenerateDialog
.locator('[data-interactive-state="empty"]')
.count()) === 0,
(await regenerateDialog.locator('[data-selected="false"]').count()) === 0,
"Regenerate model picker requires at least two runtime model options"
);
@@ -410,11 +408,13 @@ test.describe("LLM Runtime Selection", () => {
const dialog = page.locator('[role="dialog"]');
await dialog.getByPlaceholder("Search models...").fill(sharedModelName);
const sharedModelOptions = dialog.locator("[data-interactive-state]");
const sharedModelOptions = dialog.locator("[data-selected]");
await expect(sharedModelOptions).toHaveCount(2);
// Two results with the same model name under different providers.
// Groups are sorted alphabetically, so OpenAI (index 1) comes after Anthropic (index 0).
const openAiModelOption = sharedModelOptions.nth(1);
const openAiModelOption = dialog
.getByRole("button", { name: /openai/i })
.locator("..")
.locator("[data-selected]")
.first();
await expect(openAiModelOption).toBeVisible();
await openAiModelOption.click();
await page.waitForSelector('[role="dialog"]', { state: "hidden" });
@@ -434,12 +434,13 @@ test.describe("LLM Runtime Selection", () => {
.getByPlaceholder("Search models...")
.fill(sharedModelName);
const secondSharedModelOptions = secondDialog.locator(
"[data-interactive-state]"
);
const secondSharedModelOptions = secondDialog.locator("[data-selected]");
await expect(secondSharedModelOptions).toHaveCount(2);
// Anthropic is index 0 (alphabetically first).
const anthropicModelOption = secondSharedModelOptions.nth(0);
const anthropicModelOption = secondDialog
.getByRole("button", { name: /anthropic/i })
.locator("..")
.locator("[data-selected]")
.first();
await expect(anthropicModelOption).toBeVisible();
await anthropicModelOption.click();
await page.waitForSelector('[role="dialog"]', { state: "hidden" });
@@ -447,11 +448,11 @@ test.describe("LLM Runtime Selection", () => {
await page.getByTestId("model-selector").locator("button").last().click();
await page.waitForSelector('[role="dialog"]', { state: "visible" });
const verifyDialog = page.locator('[role="dialog"]');
// Verify the Anthropic option (index 0) is selected.
const selectedOption = verifyDialog.locator(
'[data-interactive-state="selected"]'
);
await expect(selectedOption).toHaveCount(1);
const selectedAnthropicOption = verifyDialog
.getByRole("button", { name: /anthropic/i })
.locator("..")
.locator('[data-selected="true"]');
await expect(selectedAnthropicOption).toHaveCount(1);
await page.keyboard.press("Escape");
await page.waitForSelector('[role="dialog"]', { state: "hidden" });
@@ -517,7 +518,7 @@ test.describe("LLM Runtime Selection", () => {
await dialog.getByPlaceholder("Search models...").fill(restrictedModelName);
const restrictedModelOption = dialog
.locator("[data-interactive-state]")
.locator("[data-selected]")
.filter({ hasText: restrictedModelName });
await expect(restrictedModelOption).toHaveCount(0);

View File

@@ -85,10 +85,8 @@ export async function selectModelFromInputPopover(
for (const modelName of preferredModels) {
await searchInput.fill(modelName);
const modelOptions = dialog.locator("[data-interactive-state]");
const nonSelectedOptions = dialog.locator(
'[data-interactive-state="empty"]'
);
const modelOptions = dialog.locator("[data-selected]");
const nonSelectedOptions = dialog.locator('[data-selected="false"]');
if ((await modelOptions.count()) > 0) {
const candidate =
@@ -112,7 +110,7 @@ export async function selectModelFromInputPopover(
// Reset search so fallback sees all available models.
await searchInput.fill("");
const nonSelectedOptions = dialog.locator('[data-interactive-state="empty"]');
const nonSelectedOptions = dialog.locator('[data-selected="false"]');
if ((await nonSelectedOptions.count()) > 0) {
const fallback = nonSelectedOptions.first();
await expect(fallback).toBeVisible();
@@ -147,7 +145,7 @@ export async function switchModel(page: Page, modelName: string) {
const modelButton = page
.locator('[role="dialog"]')
.getByRole("button")
.locator('[role="button"]')
.filter({ hasText: modelName })
.first();