mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-04-21 01:16:45 +00:00
Compare commits
1 Commits
edge
...
run-ci/103
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a7b95c42d3 |
@@ -16,13 +16,9 @@
|
||||
"source=onyx-devcontainer-local,target=/home/dev/.local,type=volume"
|
||||
],
|
||||
"containerEnv": {
|
||||
"MODEL_SERVER_HOST": "inference_model_server",
|
||||
"OPENSEARCH_HOST": "opensearch",
|
||||
"POSTGRES_HOST": "relational_db",
|
||||
"REDIS_HOST": "cache",
|
||||
"S3_ENDPOINT_URL": "http://minio:9000",
|
||||
"SSH_AUTH_SOCK": "/tmp/ssh-agent.sock",
|
||||
"VESPA_HOST": "index"
|
||||
"POSTGRES_HOST": "relational_db",
|
||||
"REDIS_HOST": "cache"
|
||||
},
|
||||
"remoteUser": "${localEnv:DEVCONTAINER_REMOTE_USER:dev}",
|
||||
"updateRemoteUserUID": false,
|
||||
|
||||
@@ -40,7 +40,6 @@ ALLOWED_DOMAINS=(
|
||||
"api.anthropic.com"
|
||||
"api-staging.anthropic.com"
|
||||
"files.anthropic.com"
|
||||
"huggingface.co"
|
||||
"sentry.io"
|
||||
"update.code.visualstudio.com"
|
||||
"pypi.org"
|
||||
|
||||
2
.github/workflows/deployment.yml
vendored
2
.github/workflows/deployment.yml
vendored
@@ -403,7 +403,7 @@ jobs:
|
||||
echo "CERT_ID=$CERT_ID" >> $GITHUB_ENV
|
||||
echo "Certificate imported."
|
||||
|
||||
- uses: tauri-apps/tauri-action@84b9d35b5fc46c1e45415bdb6144030364f7ebc5 # ratchet:tauri-apps/tauri-action@action-v0.6.2
|
||||
- uses: tauri-apps/tauri-action@73fb865345c54760d875b94642314f8c0c894afa # ratchet:tauri-apps/tauri-action@action-v0.6.1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
APPLE_ID: ${{ env.APPLE_ID }}
|
||||
|
||||
2
.github/workflows/pr-golang-tests.yml
vendored
2
.github/workflows/pr-golang-tests.yml
vendored
@@ -42,7 +42,7 @@ jobs:
|
||||
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # ratchet:actions/checkout@v6
|
||||
with:
|
||||
persist-credentials: false
|
||||
- uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # zizmor: ignore[cache-poisoning]
|
||||
- uses: actions/setup-go@4dc6199c7b1a012772edbd06daecab0f50c9053c # zizmor: ignore[cache-poisoning]
|
||||
with:
|
||||
go-version: ${{ env.GO_VERSION }}
|
||||
cache-dependency-path: "**/go.sum"
|
||||
|
||||
@@ -1,27 +0,0 @@
|
||||
"""Add file_id to documents
|
||||
|
||||
Revision ID: 91d150c361f6
|
||||
Revises: a6fcd3d631f9
|
||||
Create Date: 2026-04-16 15:43:30.314823
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "91d150c361f6"
|
||||
down_revision = "a6fcd3d631f9"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"document",
|
||||
sa.Column("file_id", sa.String(), nullable=True),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("document", "file_id")
|
||||
@@ -1,48 +0,0 @@
|
||||
"""replace document sync index with partial index
|
||||
|
||||
Replaces the composite index ix_document_sync_status (last_modified, last_synced)
|
||||
with a partial index ix_document_needs_sync that only indexes rows where
|
||||
last_modified > last_synced OR last_synced IS NULL.
|
||||
|
||||
The old index was never used by the query planner (0 scans in pg_stat_user_indexes)
|
||||
because Postgres cannot use a B-tree composite index to evaluate a comparison
|
||||
between two columns in the same row combined with an OR/IS NULL condition.
|
||||
|
||||
The partial index makes count_documents_by_needs_sync ~4000x faster for tenants
|
||||
with no stale documents (161ms -> 0.04ms on a 929K row table) and ~17x faster
|
||||
for tenants with large backlogs (846ms -> 50ms on a 164K row table).
|
||||
|
||||
Revision ID: a6fcd3d631f9
|
||||
Revises: d129f37b3d87
|
||||
Create Date: 2026-04-17 16:00:00.000000
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "a6fcd3d631f9"
|
||||
down_revision = "d129f37b3d87"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.create_index(
|
||||
"ix_document_needs_sync",
|
||||
"document",
|
||||
["id"],
|
||||
postgresql_where=sa.text("last_modified > last_synced OR last_synced IS NULL"),
|
||||
)
|
||||
op.drop_index("ix_document_sync_status", table_name="document")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.create_index(
|
||||
"ix_document_sync_status",
|
||||
"document",
|
||||
["last_modified", "last_synced"],
|
||||
)
|
||||
op.drop_index("ix_document_needs_sync", table_name="document")
|
||||
@@ -506,18 +506,6 @@ def _perform_external_group_sync(
|
||||
|
||||
ext_group_sync_func = sync_config.group_sync_config.group_sync_func
|
||||
|
||||
# Clean up stale rows from previous cycle BEFORE marking new ones.
|
||||
# This ensures cleanup always runs regardless of whether the current
|
||||
# sync succeeds — previously, cleanup only ran at the END of the sync,
|
||||
# so if the sync failed (e.g. DB connection killed by
|
||||
# idle_in_transaction_session_timeout during long API calls), stale
|
||||
# rows would accumulate indefinitely.
|
||||
logger.info(
|
||||
f"Removing stale external groups from prior cycle for {source_type} "
|
||||
f"for cc_pair: {cc_pair_id}"
|
||||
)
|
||||
remove_stale_external_groups(db_session, cc_pair_id)
|
||||
|
||||
logger.info(
|
||||
f"Marking old external groups as stale for {source_type} for cc_pair: {cc_pair_id}"
|
||||
)
|
||||
|
||||
@@ -5,7 +5,6 @@ from pydantic import BaseModel
|
||||
from sqlalchemy import delete
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import update
|
||||
from sqlalchemy.dialects.postgresql import insert as pg_insert
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.access.utils import build_ext_group_name_for_onyx
|
||||
@@ -78,15 +77,6 @@ def mark_old_external_groups_as_stale(
|
||||
.where(PublicExternalUserGroup.cc_pair_id == cc_pair_id)
|
||||
.values(stale=True)
|
||||
)
|
||||
# Commit immediately so the transaction closes before potentially long
|
||||
# external API calls (e.g. Google Drive folder iteration). Without this,
|
||||
# the DB connection sits idle-in-transaction during API calls and gets
|
||||
# killed by idle_in_transaction_session_timeout, causing the entire sync
|
||||
# to fail and stale cleanup to never run.
|
||||
db_session.commit()
|
||||
|
||||
|
||||
_UPSERT_BATCH_SIZE = 5000
|
||||
|
||||
|
||||
def upsert_external_groups(
|
||||
@@ -96,102 +86,91 @@ def upsert_external_groups(
|
||||
source: DocumentSource,
|
||||
) -> None:
|
||||
"""
|
||||
Batch upsert external user groups using INSERT ... ON CONFLICT DO UPDATE.
|
||||
- For existing rows (same user_id, external_user_group_id, cc_pair_id),
|
||||
sets stale=False
|
||||
- For new rows, inserts with stale=False
|
||||
- Same logic for PublicExternalUserGroup
|
||||
Performs a true upsert operation for external user groups:
|
||||
- For existing groups (same user_id, external_user_group_id, cc_pair_id), updates the stale flag to False
|
||||
- For new groups, inserts them with stale=False
|
||||
- For public groups, uses upsert logic as well
|
||||
"""
|
||||
# If there are no groups to add, return early
|
||||
if not external_groups:
|
||||
return
|
||||
|
||||
# Collect all emails from all groups to batch-add users at once
|
||||
all_group_member_emails: set[str] = set()
|
||||
# collect all emails from all groups to batch add all users at once for efficiency
|
||||
all_group_member_emails = set()
|
||||
for external_group in external_groups:
|
||||
all_group_member_emails.update(external_group.user_emails)
|
||||
for user_email in external_group.user_emails:
|
||||
all_group_member_emails.add(user_email)
|
||||
|
||||
# Batch add users if they don't exist and get their ids
|
||||
# batch add users if they don't exist and get their ids
|
||||
all_group_members: list[User] = batch_add_ext_perm_user_if_not_exists(
|
||||
db_session=db_session,
|
||||
# NOTE: this function handles case sensitivity for emails
|
||||
emails=list(all_group_member_emails),
|
||||
)
|
||||
|
||||
# map emails to ids
|
||||
email_id_map = {user.email.lower(): user.id for user in all_group_members}
|
||||
|
||||
# Build all user-group mappings and public-group mappings
|
||||
user_group_mappings: list[dict] = []
|
||||
public_group_mappings: list[dict] = []
|
||||
|
||||
# Process each external group
|
||||
for external_group in external_groups:
|
||||
external_group_id = build_ext_group_name_for_onyx(
|
||||
ext_group_name=external_group.id,
|
||||
source=source,
|
||||
)
|
||||
|
||||
# Handle user-group mappings
|
||||
for user_email in external_group.user_emails:
|
||||
user_id = email_id_map.get(user_email.lower())
|
||||
if user_id is None:
|
||||
logger.warning(
|
||||
f"User in group {external_group.id}"
|
||||
f" with email {user_email} not found"
|
||||
f"User in group {external_group.id} with email {user_email} not found"
|
||||
)
|
||||
continue
|
||||
|
||||
user_group_mappings.append(
|
||||
{
|
||||
"user_id": user_id,
|
||||
"external_user_group_id": external_group_id,
|
||||
"cc_pair_id": cc_pair_id,
|
||||
"stale": False,
|
||||
}
|
||||
# Check if the user-group mapping already exists
|
||||
existing_user_group = db_session.scalar(
|
||||
select(User__ExternalUserGroupId).where(
|
||||
User__ExternalUserGroupId.user_id == user_id,
|
||||
User__ExternalUserGroupId.external_user_group_id
|
||||
== external_group_id,
|
||||
User__ExternalUserGroupId.cc_pair_id == cc_pair_id,
|
||||
)
|
||||
)
|
||||
|
||||
if existing_user_group:
|
||||
# Update existing record
|
||||
existing_user_group.stale = False
|
||||
else:
|
||||
# Insert new record
|
||||
new_user_group = User__ExternalUserGroupId(
|
||||
user_id=user_id,
|
||||
external_user_group_id=external_group_id,
|
||||
cc_pair_id=cc_pair_id,
|
||||
stale=False,
|
||||
)
|
||||
db_session.add(new_user_group)
|
||||
|
||||
# Handle public group if needed
|
||||
if external_group.gives_anyone_access:
|
||||
public_group_mappings.append(
|
||||
{
|
||||
"external_user_group_id": external_group_id,
|
||||
"cc_pair_id": cc_pair_id,
|
||||
"stale": False,
|
||||
}
|
||||
# Check if the public group already exists
|
||||
existing_public_group = db_session.scalar(
|
||||
select(PublicExternalUserGroup).where(
|
||||
PublicExternalUserGroup.external_user_group_id == external_group_id,
|
||||
PublicExternalUserGroup.cc_pair_id == cc_pair_id,
|
||||
)
|
||||
)
|
||||
|
||||
# Deduplicate to avoid "ON CONFLICT DO UPDATE command cannot affect row
|
||||
# a second time" when duplicate emails or overlapping groups produce
|
||||
# identical (user_id, external_user_group_id, cc_pair_id) tuples.
|
||||
user_group_mappings_deduped = list(
|
||||
{
|
||||
(m["user_id"], m["external_user_group_id"], m["cc_pair_id"]): m
|
||||
for m in user_group_mappings
|
||||
}.values()
|
||||
)
|
||||
|
||||
# Batch upsert user-group mappings
|
||||
for i in range(0, len(user_group_mappings_deduped), _UPSERT_BATCH_SIZE):
|
||||
chunk = user_group_mappings_deduped[i : i + _UPSERT_BATCH_SIZE]
|
||||
stmt = pg_insert(User__ExternalUserGroupId).values(chunk)
|
||||
stmt = stmt.on_conflict_do_update(
|
||||
index_elements=["user_id", "external_user_group_id", "cc_pair_id"],
|
||||
set_={"stale": False},
|
||||
)
|
||||
db_session.execute(stmt)
|
||||
|
||||
# Deduplicate public group mappings as well
|
||||
public_group_mappings_deduped = list(
|
||||
{
|
||||
(m["external_user_group_id"], m["cc_pair_id"]): m
|
||||
for m in public_group_mappings
|
||||
}.values()
|
||||
)
|
||||
|
||||
# Batch upsert public group mappings
|
||||
for i in range(0, len(public_group_mappings_deduped), _UPSERT_BATCH_SIZE):
|
||||
chunk = public_group_mappings_deduped[i : i + _UPSERT_BATCH_SIZE]
|
||||
stmt = pg_insert(PublicExternalUserGroup).values(chunk)
|
||||
stmt = stmt.on_conflict_do_update(
|
||||
index_elements=["external_user_group_id", "cc_pair_id"],
|
||||
set_={"stale": False},
|
||||
)
|
||||
db_session.execute(stmt)
|
||||
if existing_public_group:
|
||||
# Update existing record
|
||||
existing_public_group.stale = False
|
||||
else:
|
||||
# Insert new record
|
||||
new_public_group = PublicExternalUserGroup(
|
||||
external_user_group_id=external_group_id,
|
||||
cc_pair_id=cc_pair_id,
|
||||
stale=False,
|
||||
)
|
||||
db_session.add(new_public_group)
|
||||
|
||||
db_session.commit()
|
||||
|
||||
|
||||
@@ -27,7 +27,6 @@ from shared_configs.configs import MIN_THREADS_ML_MODELS
|
||||
from shared_configs.configs import MODEL_SERVER_ALLOWED_HOST
|
||||
from shared_configs.configs import MODEL_SERVER_PORT
|
||||
from shared_configs.configs import SENTRY_DSN
|
||||
from shared_configs.configs import SENTRY_TRACES_SAMPLE_RATE
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
|
||||
@@ -102,7 +101,7 @@ def get_model_app() -> FastAPI:
|
||||
sentry_sdk.init(
|
||||
dsn=SENTRY_DSN,
|
||||
integrations=[StarletteIntegration(), FastApiIntegration()],
|
||||
traces_sample_rate=SENTRY_TRACES_SAMPLE_RATE,
|
||||
traces_sample_rate=0.1,
|
||||
release=__version__,
|
||||
before_send=_add_instance_tags,
|
||||
)
|
||||
|
||||
@@ -55,7 +55,6 @@ from onyx.utils.logger import setup_logger
|
||||
from shared_configs.configs import DEV_LOGGING_ENABLED
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
|
||||
from shared_configs.configs import SENTRY_CELERY_TRACES_SAMPLE_RATE
|
||||
from shared_configs.configs import SENTRY_DSN
|
||||
from shared_configs.configs import TENANT_ID_PREFIX
|
||||
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
|
||||
@@ -70,7 +69,7 @@ if SENTRY_DSN:
|
||||
sentry_sdk.init(
|
||||
dsn=SENTRY_DSN,
|
||||
integrations=[CeleryIntegration()],
|
||||
traces_sample_rate=SENTRY_CELERY_TRACES_SAMPLE_RATE,
|
||||
traces_sample_rate=0.1,
|
||||
release=__version__,
|
||||
before_send=_add_instance_tags,
|
||||
)
|
||||
|
||||
@@ -37,7 +37,6 @@ from onyx.redis.redis_connector import RedisConnector
|
||||
from onyx.server.metrics.connector_health_metrics import on_index_attempt_status_change
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.variable_functionality import global_version
|
||||
from shared_configs.configs import SENTRY_CELERY_TRACES_SAMPLE_RATE
|
||||
from shared_configs.configs import SENTRY_DSN
|
||||
|
||||
logger = setup_logger()
|
||||
@@ -141,7 +140,7 @@ def _docfetching_task(
|
||||
|
||||
sentry_sdk.init(
|
||||
dsn=SENTRY_DSN,
|
||||
traces_sample_rate=SENTRY_CELERY_TRACES_SAMPLE_RATE,
|
||||
traces_sample_rate=0.1,
|
||||
release=__version__,
|
||||
before_send=_add_instance_tags,
|
||||
)
|
||||
|
||||
@@ -58,8 +58,6 @@ from onyx.db.indexing_coordination import IndexingCoordination
|
||||
from onyx.db.models import IndexAttempt
|
||||
from onyx.file_store.document_batch_storage import DocumentBatchStorage
|
||||
from onyx.file_store.document_batch_storage import get_document_batch_storage
|
||||
from onyx.file_store.staging import build_raw_file_callback
|
||||
from onyx.file_store.staging import RawFileCallback
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
|
||||
from onyx.redis.redis_hierarchy import cache_hierarchy_nodes_batch
|
||||
@@ -92,7 +90,6 @@ def _get_connector_runner(
|
||||
end_time: datetime,
|
||||
include_permissions: bool,
|
||||
leave_connector_active: bool = LEAVE_CONNECTOR_ACTIVE_ON_INITIALIZATION_FAILURE,
|
||||
raw_file_callback: RawFileCallback | None = None,
|
||||
) -> ConnectorRunner:
|
||||
"""
|
||||
NOTE: `start_time` and `end_time` are only used for poll connectors
|
||||
@@ -111,7 +108,6 @@ def _get_connector_runner(
|
||||
input_type=task,
|
||||
connector_specific_config=attempt.connector_credential_pair.connector.connector_specific_config,
|
||||
credential=attempt.connector_credential_pair.credential,
|
||||
raw_file_callback=raw_file_callback,
|
||||
)
|
||||
|
||||
# validate the connector settings
|
||||
@@ -279,12 +275,6 @@ def run_docfetching_entrypoint(
|
||||
f"credentials='{credential_id}'"
|
||||
)
|
||||
|
||||
raw_file_callback = build_raw_file_callback(
|
||||
index_attempt_id=index_attempt_id,
|
||||
cc_pair_id=connector_credential_pair_id,
|
||||
tenant_id=tenant_id,
|
||||
)
|
||||
|
||||
connector_document_extraction(
|
||||
app,
|
||||
index_attempt_id,
|
||||
@@ -292,7 +282,6 @@ def run_docfetching_entrypoint(
|
||||
attempt.search_settings_id,
|
||||
tenant_id,
|
||||
callback,
|
||||
raw_file_callback=raw_file_callback,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
@@ -312,7 +301,6 @@ def connector_document_extraction(
|
||||
search_settings_id: int,
|
||||
tenant_id: str,
|
||||
callback: IndexingHeartbeatInterface | None = None,
|
||||
raw_file_callback: RawFileCallback | None = None,
|
||||
) -> None:
|
||||
"""Extract documents from connector and queue them for indexing pipeline processing.
|
||||
|
||||
@@ -463,7 +451,6 @@ def connector_document_extraction(
|
||||
start_time=window_start,
|
||||
end_time=window_end,
|
||||
include_permissions=should_fetch_permissions_during_indexing,
|
||||
raw_file_callback=raw_file_callback,
|
||||
)
|
||||
|
||||
# don't use a checkpoint if we're explicitly indexing from
|
||||
|
||||
@@ -868,15 +868,6 @@ MAX_EMBEDDED_IMAGES_PER_UPLOAD = max(
|
||||
0, int(os.environ.get("MAX_EMBEDDED_IMAGES_PER_UPLOAD") or 1000)
|
||||
)
|
||||
|
||||
# Maximum non-empty cells to extract from a single xlsx worksheet. Protects
|
||||
# from OOM on honestly-huge spreadsheets: memory cost in the extractor is
|
||||
# roughly proportional to this count. Once exceeded, the scan stops and a
|
||||
# truncation marker row is appended to the sheet's CSV.
|
||||
# Peak Memory ~= 100 B * MAX_CELLS
|
||||
MAX_XLSX_CELLS_PER_SHEET = max(
|
||||
0, int(os.environ.get("MAX_XLSX_CELLS_PER_SHEET") or 10_000_000)
|
||||
)
|
||||
|
||||
# Use document summary for contextual rag
|
||||
USE_DOCUMENT_SUMMARY = os.environ.get("USE_DOCUMENT_SUMMARY", "true").lower() == "true"
|
||||
# Use chunk summary for contextual rag
|
||||
|
||||
@@ -372,7 +372,6 @@ class FileOrigin(str, Enum):
|
||||
CONNECTOR_METADATA = "connector_metadata"
|
||||
GENERATED_REPORT = "generated_report"
|
||||
INDEXING_CHECKPOINT = "indexing_checkpoint"
|
||||
INDEXING_STAGING = "indexing_staging"
|
||||
PLAINTEXT_CACHE = "plaintext_cache"
|
||||
OTHER = "other"
|
||||
QUERY_HISTORY_CSV = "query_history_csv"
|
||||
|
||||
@@ -22,7 +22,6 @@ from onyx.db.credentials import backend_update_credential_json
|
||||
from onyx.db.credentials import fetch_credential_by_id
|
||||
from onyx.db.enums import AccessType
|
||||
from onyx.db.models import Credential
|
||||
from onyx.file_store.staging import RawFileCallback
|
||||
from shared_configs.contextvars import get_current_tenant_id
|
||||
|
||||
|
||||
@@ -108,7 +107,6 @@ def instantiate_connector(
|
||||
input_type: InputType,
|
||||
connector_specific_config: dict[str, Any],
|
||||
credential: Credential,
|
||||
raw_file_callback: RawFileCallback | None = None,
|
||||
) -> BaseConnector:
|
||||
connector_class = identify_connector_class(source, input_type)
|
||||
|
||||
@@ -132,9 +130,6 @@ def instantiate_connector(
|
||||
|
||||
connector.set_allow_images(get_image_extraction_and_analysis_enabled())
|
||||
|
||||
if raw_file_callback is not None:
|
||||
connector.set_raw_file_callback(raw_file_callback)
|
||||
|
||||
return connector
|
||||
|
||||
|
||||
|
||||
@@ -40,22 +40,6 @@ class GongConnectorCheckpoint(ConnectorCheckpoint):
|
||||
cursor: str | None = None
|
||||
# Cached time range — computed once, reused across checkpoint calls
|
||||
time_range: tuple[str, str] | None = None
|
||||
# Transcripts whose call details were not yet available from /v2/calls/extensive
|
||||
# (Gong has a known race where transcript call IDs take time to propagate).
|
||||
# Keyed by call_id. Retried on subsequent checkpoint invocations.
|
||||
#
|
||||
# Invariant: all entries share one resolution session — they're stashed
|
||||
# together from a single page and share the attempt counter and retry
|
||||
# deadline. load_from_checkpoint only fetches a new page when this dict
|
||||
# is empty, so entries from different pages can't mix.
|
||||
pending_transcripts: dict[str, dict[str, Any]] = {}
|
||||
# Number of resolution attempts made for pending_transcripts so far.
|
||||
pending_call_details_attempts: int = 0
|
||||
# Unix timestamp before which we should not retry pending_transcripts.
|
||||
# Enforces exponential backoff independent of worker cadence — Gong's
|
||||
# transcript-ID propagation race can take tens of seconds to minutes,
|
||||
# longer than typical worker reinvocation intervals.
|
||||
pending_retry_after: float | None = None
|
||||
|
||||
|
||||
class _TranscriptPage(BaseModel):
|
||||
@@ -78,15 +62,8 @@ class _CursorExpiredError(Exception):
|
||||
|
||||
class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
|
||||
BASE_URL = "https://api.gong.io"
|
||||
# Max number of attempts to resolve missing call details across checkpoint
|
||||
# invocations before giving up and emitting ConnectorFailure.
|
||||
MAX_CALL_DETAILS_ATTEMPTS = 6
|
||||
# Base delay for exponential backoff between pending-transcript retry
|
||||
# attempts. Delay before attempt N (N >= 2) is CALL_DETAILS_DELAY * 2^(N-2)
|
||||
# seconds (30, 60, 120, 240, 480 = ~15.5min total) — matching the original
|
||||
# blocking-retry schedule, but enforced via checkpoint deadline rather
|
||||
# than in-call time.sleep.
|
||||
CALL_DETAILS_DELAY = 30
|
||||
CALL_DETAILS_DELAY = 30 # in seconds
|
||||
# Gong API limit is 3 calls/sec — stay safely under it
|
||||
MIN_REQUEST_INTERVAL = 0.5 # seconds between requests
|
||||
|
||||
@@ -210,6 +187,50 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
|
||||
|
||||
return call_to_metadata
|
||||
|
||||
def _fetch_call_details_with_retry(self, call_ids: list[str]) -> dict[str, Any]:
|
||||
"""Fetch call details with retry for the Gong API race condition.
|
||||
|
||||
The Gong API has a known race where transcript call IDs don't immediately
|
||||
appear in /v2/calls/extensive. Retries with exponential backoff, only
|
||||
re-requesting the missing IDs on each attempt.
|
||||
"""
|
||||
call_details_map = self._get_call_details_by_ids(call_ids)
|
||||
if set(call_ids) == set(call_details_map.keys()):
|
||||
return call_details_map
|
||||
|
||||
for attempt in range(2, self.MAX_CALL_DETAILS_ATTEMPTS + 1):
|
||||
missing_ids = list(set(call_ids) - set(call_details_map.keys()))
|
||||
logger.warning(
|
||||
f"_get_call_details_by_ids is missing call id's: current_attempt={attempt - 1} missing_call_ids={missing_ids}"
|
||||
)
|
||||
|
||||
wait_seconds = self.CALL_DETAILS_DELAY * pow(2, attempt - 2)
|
||||
logger.warning(
|
||||
f"_get_call_details_by_ids waiting to retry: "
|
||||
f"wait={wait_seconds}s "
|
||||
f"current_attempt={attempt - 1} "
|
||||
f"next_attempt={attempt} "
|
||||
f"max_attempts={self.MAX_CALL_DETAILS_ATTEMPTS}"
|
||||
)
|
||||
time.sleep(wait_seconds)
|
||||
|
||||
# Only re-fetch the missing IDs, merge into existing results
|
||||
new_details = self._get_call_details_by_ids(missing_ids)
|
||||
call_details_map.update(new_details)
|
||||
|
||||
if set(call_ids) == set(call_details_map.keys()):
|
||||
return call_details_map
|
||||
|
||||
missing_ids = list(set(call_ids) - set(call_details_map.keys()))
|
||||
logger.error(
|
||||
f"Giving up on missing call id's after "
|
||||
f"{self.MAX_CALL_DETAILS_ATTEMPTS} attempts: "
|
||||
f"missing_call_ids={missing_ids} — "
|
||||
f"proceeding with {len(call_details_map)} of "
|
||||
f"{len(call_ids)} calls"
|
||||
)
|
||||
return call_details_map
|
||||
|
||||
@staticmethod
|
||||
def _parse_parties(parties: list[dict]) -> dict[str, str]:
|
||||
id_mapping = {}
|
||||
@@ -292,119 +313,87 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
|
||||
|
||||
return start_time, end_time
|
||||
|
||||
def _build_document(
|
||||
self,
|
||||
transcript: dict[str, Any],
|
||||
call_details: dict[str, Any],
|
||||
) -> Document:
|
||||
"""Build a single Document from a transcript and its resolved call details."""
|
||||
call_id = transcript["callId"]
|
||||
call_metadata = call_details["metaData"]
|
||||
|
||||
call_time_str = call_metadata["started"]
|
||||
call_title = call_metadata["title"]
|
||||
logger.info(
|
||||
f"Indexing Gong call id {call_id} from {call_time_str.split('T', 1)[0]}: {call_title}"
|
||||
)
|
||||
|
||||
call_parties = cast(list[dict] | None, call_details.get("parties"))
|
||||
if call_parties is None:
|
||||
logger.error(f"Couldn't get parties for Call ID: {call_id}")
|
||||
call_parties = []
|
||||
|
||||
id_to_name_map = self._parse_parties(call_parties)
|
||||
|
||||
speaker_to_name: dict[str, str] = {}
|
||||
|
||||
transcript_text = ""
|
||||
call_purpose = call_metadata["purpose"]
|
||||
if call_purpose:
|
||||
transcript_text += f"Call Description: {call_purpose}\n\n"
|
||||
|
||||
contents = transcript["transcript"]
|
||||
for segment in contents:
|
||||
speaker_id = segment.get("speakerId", "")
|
||||
if speaker_id not in speaker_to_name:
|
||||
if self.hide_user_info:
|
||||
speaker_to_name[speaker_id] = f"User {len(speaker_to_name) + 1}"
|
||||
else:
|
||||
speaker_to_name[speaker_id] = id_to_name_map.get(
|
||||
speaker_id, "Unknown"
|
||||
)
|
||||
|
||||
speaker_name = speaker_to_name[speaker_id]
|
||||
|
||||
sentences = segment.get("sentences", {})
|
||||
monolog = " ".join([sentence.get("text", "") for sentence in sentences])
|
||||
transcript_text += f"{speaker_name}: {monolog}\n\n"
|
||||
|
||||
return Document(
|
||||
id=call_id,
|
||||
sections=[TextSection(link=call_metadata["url"], text=transcript_text)],
|
||||
source=DocumentSource.GONG,
|
||||
semantic_identifier=call_title or "Untitled",
|
||||
doc_updated_at=datetime.fromisoformat(call_time_str).astimezone(
|
||||
timezone.utc
|
||||
),
|
||||
metadata={"client": call_metadata.get("system")},
|
||||
)
|
||||
|
||||
def _process_transcripts(
|
||||
self,
|
||||
transcripts: list[dict[str, Any]],
|
||||
checkpoint: GongConnectorCheckpoint,
|
||||
) -> Generator[Document | ConnectorFailure, None, None]:
|
||||
"""Fetch call details for a page of transcripts and yield resulting
|
||||
Documents. Transcripts whose call details are missing (Gong race
|
||||
condition) are stashed into `checkpoint.pending_transcripts` for retry
|
||||
on a future checkpoint invocation rather than blocking here.
|
||||
"""
|
||||
"""Process a batch of transcripts into Documents or ConnectorFailures."""
|
||||
transcript_call_ids = cast(
|
||||
list[str],
|
||||
[t.get("callId") for t in transcripts if t.get("callId")],
|
||||
)
|
||||
|
||||
call_details_map = (
|
||||
self._get_call_details_by_ids(transcript_call_ids)
|
||||
if transcript_call_ids
|
||||
else {}
|
||||
)
|
||||
|
||||
newly_stashed: list[str] = []
|
||||
call_details_map = self._fetch_call_details_with_retry(transcript_call_ids)
|
||||
|
||||
for transcript in transcripts:
|
||||
call_id = transcript.get("callId")
|
||||
|
||||
if not call_id:
|
||||
logger.error(
|
||||
"Couldn't get call information for transcript missing callId"
|
||||
)
|
||||
if not call_id or call_id not in call_details_map:
|
||||
logger.error(f"Couldn't get call information for Call ID: {call_id}")
|
||||
if call_id:
|
||||
logger.error(
|
||||
f"Call debug info: call_id={call_id} "
|
||||
f"call_ids={transcript_call_ids} "
|
||||
f"call_details_map={call_details_map.keys()}"
|
||||
)
|
||||
yield ConnectorFailure(
|
||||
failed_document=DocumentFailure(document_id="unknown"),
|
||||
failure_message="Transcript missing callId",
|
||||
failed_document=DocumentFailure(
|
||||
document_id=call_id or "unknown",
|
||||
),
|
||||
failure_message=f"Couldn't get call information for Call ID: {call_id}",
|
||||
)
|
||||
continue
|
||||
|
||||
if call_id in call_details_map:
|
||||
yield self._build_document(transcript, call_details_map[call_id])
|
||||
continue
|
||||
call_details = call_details_map[call_id]
|
||||
call_metadata = call_details["metaData"]
|
||||
|
||||
# Details not available yet — stash for retry on next invocation.
|
||||
checkpoint.pending_transcripts[call_id] = transcript
|
||||
newly_stashed.append(call_id)
|
||||
|
||||
if newly_stashed:
|
||||
logger.warning(
|
||||
f"Gong call details not yet available (race condition); "
|
||||
f"deferring to next checkpoint invocation: "
|
||||
f"call_ids={newly_stashed}"
|
||||
call_time_str = call_metadata["started"]
|
||||
call_title = call_metadata["title"]
|
||||
logger.info(
|
||||
f"Indexing Gong call id {call_id} from {call_time_str.split('T', 1)[0]}: {call_title}"
|
||||
)
|
||||
|
||||
call_parties = cast(list[dict] | None, call_details.get("parties"))
|
||||
if call_parties is None:
|
||||
logger.error(f"Couldn't get parties for Call ID: {call_id}")
|
||||
call_parties = []
|
||||
|
||||
id_to_name_map = self._parse_parties(call_parties)
|
||||
|
||||
speaker_to_name: dict[str, str] = {}
|
||||
|
||||
transcript_text = ""
|
||||
call_purpose = call_metadata["purpose"]
|
||||
if call_purpose:
|
||||
transcript_text += f"Call Description: {call_purpose}\n\n"
|
||||
|
||||
contents = transcript["transcript"]
|
||||
for segment in contents:
|
||||
speaker_id = segment.get("speakerId", "")
|
||||
if speaker_id not in speaker_to_name:
|
||||
if self.hide_user_info:
|
||||
speaker_to_name[speaker_id] = f"User {len(speaker_to_name) + 1}"
|
||||
else:
|
||||
speaker_to_name[speaker_id] = id_to_name_map.get(
|
||||
speaker_id, "Unknown"
|
||||
)
|
||||
|
||||
speaker_name = speaker_to_name[speaker_id]
|
||||
|
||||
sentences = segment.get("sentences", {})
|
||||
monolog = " ".join([sentence.get("text", "") for sentence in sentences])
|
||||
transcript_text += f"{speaker_name}: {monolog}\n\n"
|
||||
|
||||
yield Document(
|
||||
id=call_id,
|
||||
sections=[TextSection(link=call_metadata["url"], text=transcript_text)],
|
||||
source=DocumentSource.GONG,
|
||||
semantic_identifier=call_title or "Untitled",
|
||||
doc_updated_at=datetime.fromisoformat(call_time_str).astimezone(
|
||||
timezone.utc
|
||||
),
|
||||
metadata={"client": call_metadata.get("system")},
|
||||
)
|
||||
# First attempt on any newly-stashed transcripts counts as attempt #1.
|
||||
# pending_call_details_attempts is guaranteed 0 here because
|
||||
# load_from_checkpoint only reaches _process_transcripts when
|
||||
# pending_transcripts was empty at entry (see early-return above).
|
||||
checkpoint.pending_call_details_attempts = 1
|
||||
checkpoint.pending_retry_after = time.time() + self._next_retry_delay(1)
|
||||
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||
combined = (
|
||||
@@ -443,18 +432,6 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
|
||||
checkpoint.has_more = True
|
||||
return checkpoint
|
||||
|
||||
# Step 2: Resolve any transcripts stashed by a prior invocation whose
|
||||
# call details were missing due to Gong's propagation race. Worker
|
||||
# cadence between checkpoint calls provides the spacing between retry
|
||||
# attempts — no in-call sleep needed.
|
||||
if checkpoint.pending_transcripts:
|
||||
yield from self._resolve_pending_transcripts(checkpoint)
|
||||
# If pending still exists and we haven't exhausted attempts, defer
|
||||
# the rest of this invocation — _resolve_pending_transcripts set
|
||||
# has_more=True for us.
|
||||
if checkpoint.pending_transcripts:
|
||||
return checkpoint
|
||||
|
||||
workspace_ids = checkpoint.workspace_ids
|
||||
|
||||
# If we've exhausted all workspaces, we're done
|
||||
@@ -473,7 +450,7 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
|
||||
|
||||
workspace_id = workspace_ids[checkpoint.workspace_index]
|
||||
|
||||
# Step 3: Fetch one page of transcripts
|
||||
# Step 2: Fetch one page of transcripts
|
||||
try:
|
||||
page = self._fetch_transcript_page(
|
||||
start_datetime=start_time,
|
||||
@@ -496,102 +473,23 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
|
||||
checkpoint.has_more = True
|
||||
return checkpoint
|
||||
|
||||
# Step 4: Process transcripts into documents. Missing-details
|
||||
# transcripts get stashed into checkpoint.pending_transcripts.
|
||||
# Step 3: Process transcripts into documents
|
||||
if page.transcripts:
|
||||
yield from self._process_transcripts(page.transcripts, checkpoint)
|
||||
yield from self._process_transcripts(page.transcripts)
|
||||
|
||||
# Step 5: Update cursor/workspace state
|
||||
# Step 4: Update checkpoint state
|
||||
if page.next_cursor:
|
||||
# More pages in this workspace
|
||||
checkpoint.cursor = page.next_cursor
|
||||
checkpoint.has_more = True
|
||||
else:
|
||||
# This workspace is exhausted — advance to next
|
||||
checkpoint.workspace_index += 1
|
||||
checkpoint.cursor = None
|
||||
checkpoint.has_more = checkpoint.workspace_index < len(workspace_ids)
|
||||
|
||||
# If pending transcripts were stashed this invocation, we still have
|
||||
# work to do on a future invocation even if pagination is exhausted.
|
||||
if checkpoint.pending_transcripts:
|
||||
checkpoint.has_more = True
|
||||
|
||||
return checkpoint
|
||||
|
||||
def _next_retry_delay(self, attempts_done: int) -> float:
|
||||
"""Seconds to wait before attempt #(attempts_done + 1).
|
||||
Matches the original exponential backoff: 30, 60, 120, 240, 480.
|
||||
"""
|
||||
return self.CALL_DETAILS_DELAY * pow(2, attempts_done - 1)
|
||||
|
||||
def _resolve_pending_transcripts(
|
||||
self,
|
||||
checkpoint: GongConnectorCheckpoint,
|
||||
) -> Generator[Document | ConnectorFailure, None, None]:
|
||||
"""Attempt to resolve transcripts whose call details were unavailable
|
||||
in a prior invocation. Mutates checkpoint in place: resolved transcripts
|
||||
are removed from pending_transcripts; on attempt exhaustion, emits
|
||||
ConnectorFailure for each unresolved call_id and clears pending state.
|
||||
|
||||
If the backoff deadline hasn't elapsed yet, returns without issuing
|
||||
any API call so the next invocation can try again later.
|
||||
"""
|
||||
if (
|
||||
checkpoint.pending_retry_after is not None
|
||||
and time.time() < checkpoint.pending_retry_after
|
||||
):
|
||||
# Backoff still in effect — defer to a later invocation without
|
||||
# burning an attempt or an API call.
|
||||
checkpoint.has_more = True
|
||||
return
|
||||
|
||||
pending_call_ids = list(checkpoint.pending_transcripts.keys())
|
||||
resolved = self._get_call_details_by_ids(pending_call_ids)
|
||||
|
||||
for call_id, details in resolved.items():
|
||||
transcript = checkpoint.pending_transcripts.pop(call_id, None)
|
||||
if transcript is None:
|
||||
continue
|
||||
yield self._build_document(transcript, details)
|
||||
|
||||
if not checkpoint.pending_transcripts:
|
||||
checkpoint.pending_call_details_attempts = 0
|
||||
checkpoint.pending_retry_after = None
|
||||
return
|
||||
|
||||
checkpoint.pending_call_details_attempts += 1
|
||||
logger.warning(
|
||||
f"Gong call details still missing after "
|
||||
f"{checkpoint.pending_call_details_attempts}/"
|
||||
f"{self.MAX_CALL_DETAILS_ATTEMPTS} attempts: "
|
||||
f"missing_call_ids={list(checkpoint.pending_transcripts.keys())}"
|
||||
)
|
||||
|
||||
if checkpoint.pending_call_details_attempts >= self.MAX_CALL_DETAILS_ATTEMPTS:
|
||||
logger.error(
|
||||
f"Giving up on missing Gong call details after "
|
||||
f"{self.MAX_CALL_DETAILS_ATTEMPTS} attempts: "
|
||||
f"missing_call_ids={list(checkpoint.pending_transcripts.keys())}"
|
||||
)
|
||||
for call_id in list(checkpoint.pending_transcripts.keys()):
|
||||
yield ConnectorFailure(
|
||||
failed_document=DocumentFailure(document_id=call_id),
|
||||
failure_message=(
|
||||
f"Couldn't get call details after {self.MAX_CALL_DETAILS_ATTEMPTS} attempts for Call ID: {call_id}"
|
||||
),
|
||||
)
|
||||
checkpoint.pending_transcripts = {}
|
||||
checkpoint.pending_call_details_attempts = 0
|
||||
checkpoint.pending_retry_after = None
|
||||
# has_more is recomputed by the workspace iteration that follows;
|
||||
# reset to False here so a stale True from a prior invocation
|
||||
# can't leak out via any future early-return path.
|
||||
checkpoint.has_more = False
|
||||
else:
|
||||
checkpoint.pending_retry_after = time.time() + self._next_retry_delay(
|
||||
checkpoint.pending_call_details_attempts
|
||||
)
|
||||
checkpoint.has_more = True
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
|
||||
@@ -578,18 +578,8 @@ class GoogleDriveConnector(
|
||||
current_id, file.user_email, field_type, failed_folder_ids_by_email
|
||||
)
|
||||
if not folder:
|
||||
# Can't access this folder - stop climbing.
|
||||
# If the terminal node is a confirmed orphan, backfill all
|
||||
# intermediate folders into failed_folder_ids_by_email so
|
||||
# future files short-circuit via _get_folder_metadata's
|
||||
# cache check instead of re-climbing the whole chain.
|
||||
if failed_folder_ids_by_email is not None:
|
||||
for email in {file.user_email, self.primary_admin_email}:
|
||||
email_failed_ids = failed_folder_ids_by_email.get(email)
|
||||
if email_failed_ids and current_id in email_failed_ids:
|
||||
failed_folder_ids_by_email.setdefault(
|
||||
email, ThreadSafeSet()
|
||||
).update(set(node_ids_in_walk))
|
||||
# Can't access this folder - stop climbing
|
||||
# Don't mark as fully walked since we didn't reach root
|
||||
break
|
||||
|
||||
folder_parent_id = _get_parent_id_from_file(folder)
|
||||
|
||||
@@ -167,12 +167,9 @@ class GoogleDriveCheckpoint(ConnectorCheckpoint):
|
||||
default_factory=ThreadSafeSet
|
||||
)
|
||||
|
||||
# Maps email → set of folder IDs that email should skip when walking the
|
||||
# parent chain. Covers two cases:
|
||||
# 1. Folders where that email confirmed no accessible parent (true orphans).
|
||||
# 2. Intermediate folders on a path that dead-ended at a confirmed orphan —
|
||||
# backfilled so future walks short-circuit earlier in the chain.
|
||||
# In both cases _get_folder_metadata skips the API call and returns None.
|
||||
# Maps email → set of IDs of folders where that email confirmed no accessible parent.
|
||||
# Avoids redundant API calls when the same (folder, email) pair is
|
||||
# encountered again within the same retrieval run.
|
||||
failed_folder_ids_by_email: ThreadSafeDict[str, ThreadSafeSet[str]] = Field(
|
||||
default_factory=ThreadSafeDict
|
||||
)
|
||||
|
||||
@@ -15,7 +15,6 @@ from onyx.connectors.models import ConnectorFailure
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import HierarchyNode
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.file_store.staging import RawFileCallback
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.variable_functionality import fetch_ee_implementation_or_noop
|
||||
|
||||
@@ -43,9 +42,6 @@ class NormalizationResult(BaseModel):
|
||||
class BaseConnector(abc.ABC, Generic[CT]):
|
||||
REDIS_KEY_PREFIX = "da_connector_data:"
|
||||
|
||||
# Optional raw-file persistence hook to save original file
|
||||
raw_file_callback: RawFileCallback | None = None
|
||||
|
||||
@abc.abstractmethod
|
||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||
raise NotImplementedError
|
||||
@@ -92,15 +88,6 @@ class BaseConnector(abc.ABC, Generic[CT]):
|
||||
"""Implement if the underlying connector wants to skip/allow image downloading
|
||||
based on the application level image analysis setting."""
|
||||
|
||||
def set_raw_file_callback(self, callback: RawFileCallback) -> None:
|
||||
"""Inject the per-attempt raw-file persistence callback.
|
||||
|
||||
Wired up by the docfetching entrypoint via `instantiate_connector`.
|
||||
Connectors that don't care about persisting raw bytes can ignore this
|
||||
— `raw_file_callback` simply stays `None`.
|
||||
"""
|
||||
self.raw_file_callback = callback
|
||||
|
||||
@classmethod
|
||||
def normalize_url(cls, url: str) -> "NormalizationResult": # noqa: ARG003
|
||||
"""Normalize a URL to match the canonical Document.id format used during ingestion.
|
||||
|
||||
@@ -19,7 +19,6 @@ from playwright.sync_api import Playwright
|
||||
from playwright.sync_api import sync_playwright
|
||||
from playwright.sync_api import TimeoutError
|
||||
from requests_oauthlib import OAuth2Session
|
||||
from typing_extensions import override
|
||||
from urllib3.exceptions import MaxRetryError
|
||||
|
||||
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
||||
@@ -33,16 +32,11 @@ from onyx.connectors.exceptions import CredentialExpiredError
|
||||
from onyx.connectors.exceptions import InsufficientPermissionsError
|
||||
from onyx.connectors.exceptions import UnexpectedValidationError
|
||||
from onyx.connectors.interfaces import GenerateDocumentsOutput
|
||||
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
||||
from onyx.connectors.interfaces import LoadConnector
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.interfaces import SlimConnector
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import HierarchyNode
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.file_processing.html_utils import web_html_cleanup
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.sitemap import list_pages_for_site
|
||||
from onyx.utils.web_content import extract_pdf_text
|
||||
@@ -61,6 +55,8 @@ class ScrapeSessionContext:
|
||||
self.visited_links: set[str] = set()
|
||||
self.content_hashes: set[int] = set()
|
||||
|
||||
self.doc_batch: list[Document | HierarchyNode] = []
|
||||
|
||||
self.at_least_one_doc: bool = False
|
||||
self.last_error: str | None = None
|
||||
self.needs_retry: bool = False
|
||||
@@ -442,7 +438,7 @@ def _handle_cookies(context: BrowserContext, url: str) -> None:
|
||||
)
|
||||
|
||||
|
||||
class WebConnector(LoadConnector, SlimConnector):
|
||||
class WebConnector(LoadConnector):
|
||||
MAX_RETRIES = 3
|
||||
|
||||
def __init__(
|
||||
@@ -497,14 +493,8 @@ class WebConnector(LoadConnector, SlimConnector):
|
||||
index: int,
|
||||
initial_url: str,
|
||||
session_ctx: ScrapeSessionContext,
|
||||
slim: bool = False,
|
||||
) -> ScrapeResult:
|
||||
"""Returns a ScrapeResult object with a doc and retry flag.
|
||||
|
||||
When slim=True, skips scroll, PDF content download, and content extraction.
|
||||
The bot-detection render wait (5s) fires on CF/403 responses regardless of slim.
|
||||
networkidle is always awaited so JS-rendered links are discovered correctly.
|
||||
"""
|
||||
"""Returns a ScrapeResult object with a doc and retry flag."""
|
||||
|
||||
if session_ctx.playwright is None:
|
||||
raise RuntimeError("scrape_context.playwright is None")
|
||||
@@ -525,16 +515,7 @@ class WebConnector(LoadConnector, SlimConnector):
|
||||
is_pdf = is_pdf_resource(initial_url, content_type)
|
||||
|
||||
if is_pdf:
|
||||
if slim:
|
||||
result.doc = Document(
|
||||
id=initial_url,
|
||||
sections=[],
|
||||
source=DocumentSource.WEB,
|
||||
semantic_identifier=initial_url,
|
||||
metadata={},
|
||||
)
|
||||
return result
|
||||
|
||||
# PDF files are not checked for links
|
||||
response = requests.get(initial_url, headers=DEFAULT_HEADERS)
|
||||
page_text, metadata = extract_pdf_text(response.content)
|
||||
last_modified = response.headers.get("Last-Modified")
|
||||
@@ -565,20 +546,14 @@ class WebConnector(LoadConnector, SlimConnector):
|
||||
timeout=30000, # 30 seconds
|
||||
wait_until="commit", # Wait for navigation to commit
|
||||
)
|
||||
# Give the page a moment to start rendering after navigation commits.
|
||||
# Allows CloudFlare and other bot-detection challenges to complete.
|
||||
page.wait_for_timeout(PAGE_RENDER_TIMEOUT_MS)
|
||||
|
||||
# Bot-detection JS challenges (CloudFlare, Imperva, etc.) need a moment
|
||||
# to start network activity after commit before networkidle is meaningful.
|
||||
# We detect this via the cf-ray header (CloudFlare) or a 403 response,
|
||||
# which is the common entry point for JS-challenge-based bot detection.
|
||||
is_bot_challenge = page_response is not None and (
|
||||
page_response.header_value("cf-ray") is not None
|
||||
or page_response.status == 403
|
||||
)
|
||||
if is_bot_challenge:
|
||||
page.wait_for_timeout(PAGE_RENDER_TIMEOUT_MS)
|
||||
|
||||
# Wait for network activity to settle (handles SPAs, CF challenges, etc.)
|
||||
# Wait for network activity to settle so SPAs that fetch content
|
||||
# asynchronously after the initial JS bundle have time to render.
|
||||
try:
|
||||
# A bit of extra time to account for long-polling, websockets, etc.
|
||||
page.wait_for_load_state("networkidle", timeout=PAGE_RENDER_TIMEOUT_MS)
|
||||
except TimeoutError:
|
||||
pass
|
||||
@@ -601,7 +576,7 @@ class WebConnector(LoadConnector, SlimConnector):
|
||||
session_ctx.visited_links.add(initial_url)
|
||||
|
||||
# If we got here, the request was successful
|
||||
if not slim and self.scroll_before_scraping:
|
||||
if self.scroll_before_scraping:
|
||||
scroll_attempts = 0
|
||||
previous_height = page.evaluate("document.body.scrollHeight")
|
||||
while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
|
||||
@@ -640,16 +615,6 @@ class WebConnector(LoadConnector, SlimConnector):
|
||||
result.retry = True
|
||||
return result
|
||||
|
||||
if slim:
|
||||
result.doc = Document(
|
||||
id=initial_url,
|
||||
sections=[],
|
||||
source=DocumentSource.WEB,
|
||||
semantic_identifier=initial_url,
|
||||
metadata={},
|
||||
)
|
||||
return result
|
||||
|
||||
# after this point, we don't need the caller to retry
|
||||
parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)
|
||||
|
||||
@@ -701,13 +666,9 @@ class WebConnector(LoadConnector, SlimConnector):
|
||||
|
||||
return result
|
||||
|
||||
def load_from_state(self, slim: bool = False) -> GenerateDocumentsOutput:
|
||||
"""Traverses through all pages found on the website and converts them into
|
||||
documents.
|
||||
|
||||
When slim=True, yields SlimDocument objects (URL id only, no content).
|
||||
Playwright is used in all modes — slim skips content extraction only.
|
||||
"""
|
||||
def load_from_state(self) -> GenerateDocumentsOutput:
|
||||
"""Traverses through all pages found on the website
|
||||
and converts them into documents"""
|
||||
|
||||
if not self.to_visit_list:
|
||||
raise ValueError("No URLs to visit")
|
||||
@@ -718,8 +679,6 @@ class WebConnector(LoadConnector, SlimConnector):
|
||||
session_ctx = ScrapeSessionContext(base_url, self.to_visit_list)
|
||||
session_ctx.initialize()
|
||||
|
||||
batch: list[Document | SlimDocument | HierarchyNode] = []
|
||||
|
||||
while session_ctx.to_visit:
|
||||
initial_url = session_ctx.to_visit.pop()
|
||||
if initial_url in session_ctx.visited_links:
|
||||
@@ -734,9 +693,7 @@ class WebConnector(LoadConnector, SlimConnector):
|
||||
continue
|
||||
|
||||
index = len(session_ctx.visited_links)
|
||||
logger.info(
|
||||
f"{index}: {'Slim-visiting' if slim else 'Visiting'} {initial_url}"
|
||||
)
|
||||
logger.info(f"{index}: Visiting {initial_url}")
|
||||
|
||||
# Add retry mechanism with exponential backoff
|
||||
retry_count = 0
|
||||
@@ -751,14 +708,12 @@ class WebConnector(LoadConnector, SlimConnector):
|
||||
time.sleep(delay)
|
||||
|
||||
try:
|
||||
result = self._do_scrape(index, initial_url, session_ctx, slim=slim)
|
||||
result = self._do_scrape(index, initial_url, session_ctx)
|
||||
if result.retry:
|
||||
continue
|
||||
|
||||
if result.doc:
|
||||
batch.append(
|
||||
SlimDocument(id=result.doc.id) if slim else result.doc
|
||||
)
|
||||
session_ctx.doc_batch.append(result.doc)
|
||||
except Exception as e:
|
||||
session_ctx.last_error = f"Failed to fetch '{initial_url}': {e}"
|
||||
logger.exception(session_ctx.last_error)
|
||||
@@ -769,16 +724,16 @@ class WebConnector(LoadConnector, SlimConnector):
|
||||
|
||||
break # success / don't retry
|
||||
|
||||
if len(batch) >= self.batch_size:
|
||||
if len(session_ctx.doc_batch) >= self.batch_size:
|
||||
session_ctx.initialize()
|
||||
session_ctx.at_least_one_doc = True
|
||||
yield batch # ty: ignore[invalid-yield]
|
||||
batch = []
|
||||
yield session_ctx.doc_batch
|
||||
session_ctx.doc_batch = []
|
||||
|
||||
if batch:
|
||||
if session_ctx.doc_batch:
|
||||
session_ctx.stop()
|
||||
session_ctx.at_least_one_doc = True
|
||||
yield batch # ty: ignore[invalid-yield]
|
||||
yield session_ctx.doc_batch
|
||||
|
||||
if not session_ctx.at_least_one_doc:
|
||||
if session_ctx.last_error:
|
||||
@@ -787,22 +742,6 @@ class WebConnector(LoadConnector, SlimConnector):
|
||||
|
||||
session_ctx.stop()
|
||||
|
||||
@override
|
||||
def retrieve_all_slim_docs(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch | None = None,
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
callback: IndexingHeartbeatInterface | None = None,
|
||||
) -> GenerateSlimDocumentOutput:
|
||||
"""Yields SlimDocuments for all pages reachable from the configured URLs.
|
||||
|
||||
Uses the same Playwright crawl as full indexing but skips content extraction,
|
||||
scroll, and PDF downloads. The 5s render wait fires only on bot-detection
|
||||
responses (CloudFlare cf-ray header or HTTP 403).
|
||||
The start/end parameters are ignored — WEB connector has no incremental path.
|
||||
"""
|
||||
yield from self.load_from_state(slim=True) # ty: ignore[invalid-yield]
|
||||
|
||||
def validate_connector_settings(self) -> None:
|
||||
# Make sure we have at least one valid URL to check
|
||||
if not self.to_visit_list:
|
||||
|
||||
@@ -952,7 +952,6 @@ class Document(Base):
|
||||
semantic_id: Mapped[str] = mapped_column(NullFilteredString)
|
||||
# First Section's link
|
||||
link: Mapped[str | None] = mapped_column(NullFilteredString, nullable=True)
|
||||
file_id: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
|
||||
# The updated time is also used as a measure of the last successful state of the doc
|
||||
# pulled from the source (to help skip reindexing already updated docs in case of
|
||||
@@ -1055,9 +1054,9 @@ class Document(Base):
|
||||
|
||||
__table_args__ = (
|
||||
Index(
|
||||
"ix_document_needs_sync",
|
||||
"id",
|
||||
postgresql_where=text("last_modified > last_synced OR last_synced IS NULL"),
|
||||
"ix_document_sync_status",
|
||||
last_modified,
|
||||
last_synced,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -129,13 +129,9 @@ def update_sync_record_status(
|
||||
"""
|
||||
sync_record = fetch_latest_sync_record(db_session, entity_id, sync_type)
|
||||
if sync_record is None:
|
||||
logger.warning(
|
||||
f"No sync record found for entity_id={entity_id} "
|
||||
f"sync_type={sync_type} — skipping status update. "
|
||||
f"This typically means the record was never created "
|
||||
f"(insert_sync_record failed silently) or was cleaned up."
|
||||
raise ValueError(
|
||||
f"No sync record found for entity_id={entity_id} sync_type={sync_type}"
|
||||
)
|
||||
return
|
||||
|
||||
sync_record.sync_status = sync_status
|
||||
if num_docs_synced is not None:
|
||||
|
||||
@@ -2,17 +2,11 @@ import datetime
|
||||
from uuid import UUID
|
||||
|
||||
from sqlalchemy import func
|
||||
from sqlalchemy import or_
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import joinedload
|
||||
from sqlalchemy.orm import selectinload
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.configs.constants import FileOrigin
|
||||
from onyx.db.models import ChatMessage
|
||||
from onyx.db.models import ChatSession
|
||||
from onyx.db.models import ChatSessionSharedStatus
|
||||
from onyx.db.models import FileRecord
|
||||
from onyx.db.models import Persona
|
||||
from onyx.db.models import Project__UserFile
|
||||
from onyx.db.models import UserFile
|
||||
@@ -114,73 +108,13 @@ def update_last_accessed_at_for_user_files(
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def get_file_id_by_user_file_id(
|
||||
user_file_id: str, user_id: UUID, db_session: Session
|
||||
) -> str | None:
|
||||
user_file = (
|
||||
db_session.query(UserFile)
|
||||
.filter(UserFile.id == user_file_id, UserFile.user_id == user_id)
|
||||
.first()
|
||||
)
|
||||
def get_file_id_by_user_file_id(user_file_id: str, db_session: Session) -> str | None:
|
||||
user_file = db_session.query(UserFile).filter(UserFile.id == user_file_id).first()
|
||||
if user_file:
|
||||
return user_file.file_id
|
||||
return None
|
||||
|
||||
|
||||
def user_can_access_chat_file(file_id: str, user_id: UUID, db_session: Session) -> bool:
|
||||
"""Return True if `user_id` is allowed to read the raw `file_id` served by
|
||||
`GET /chat/file/{file_id}`. Access is granted when any of:
|
||||
|
||||
- The `file_id` is the storage id of a `UserFile` owned by the user.
|
||||
- The `file_id` is a persona avatar (`Persona.uploaded_image_id`); avatars
|
||||
are visible to any authenticated user.
|
||||
- The `file_id` appears in a `ChatMessage.files` descriptor of a chat
|
||||
session the user owns or a session publicly shared via
|
||||
`ChatSessionSharedStatus.PUBLIC`.
|
||||
"""
|
||||
owns_user_file = db_session.query(
|
||||
select(UserFile.id)
|
||||
.where(UserFile.file_id == file_id, UserFile.user_id == user_id)
|
||||
.exists()
|
||||
).scalar()
|
||||
if owns_user_file:
|
||||
return True
|
||||
|
||||
# TODO: move persona avatars to a dedicated endpoint (e.g.
|
||||
# /assistants/{id}/avatar) so this branch can be removed. /chat/file is
|
||||
# currently overloaded with multiple asset classes (user files, chat
|
||||
# attachments, tool outputs, avatars), forcing this access-check fan-out.
|
||||
#
|
||||
# Restrict the avatar path to CHAT_UPLOAD-origin files so an attacker
|
||||
# cannot bind another user's USER_FILE (or any other origin) to their
|
||||
# own persona and read it through this check.
|
||||
is_persona_avatar = db_session.query(
|
||||
select(Persona.id)
|
||||
.join(FileRecord, FileRecord.file_id == Persona.uploaded_image_id)
|
||||
.where(
|
||||
Persona.uploaded_image_id == file_id,
|
||||
FileRecord.file_origin == FileOrigin.CHAT_UPLOAD,
|
||||
)
|
||||
.exists()
|
||||
).scalar()
|
||||
if is_persona_avatar:
|
||||
return True
|
||||
|
||||
chat_file_stmt = (
|
||||
select(ChatMessage.id)
|
||||
.join(ChatSession, ChatMessage.chat_session_id == ChatSession.id)
|
||||
.where(ChatMessage.files.op("@>")([{"id": file_id}]))
|
||||
.where(
|
||||
or_(
|
||||
ChatSession.user_id == user_id,
|
||||
ChatSession.shared_status == ChatSessionSharedStatus.PUBLIC,
|
||||
)
|
||||
)
|
||||
.limit(1)
|
||||
)
|
||||
return db_session.execute(chat_file_stmt).first() is not None
|
||||
|
||||
|
||||
def get_file_ids_by_user_file_ids(
|
||||
user_file_ids: list[UUID], db_session: Session
|
||||
) -> list[str]:
|
||||
|
||||
@@ -12,7 +12,6 @@ from email.parser import Parser as EmailParser
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
from typing import IO
|
||||
from typing import NamedTuple
|
||||
from typing import Optional
|
||||
@@ -21,11 +20,10 @@ from zipfile import BadZipFile
|
||||
|
||||
import chardet
|
||||
import openpyxl
|
||||
from openpyxl.worksheet._read_only import ReadOnlyWorksheet
|
||||
from openpyxl.worksheet.worksheet import Worksheet
|
||||
from PIL import Image
|
||||
|
||||
from onyx.configs.app_configs import MAX_EMBEDDED_IMAGES_PER_FILE
|
||||
from onyx.configs.app_configs import MAX_XLSX_CELLS_PER_SHEET
|
||||
from onyx.configs.constants import ONYX_METADATA_FILENAME
|
||||
from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
|
||||
from onyx.file_processing.file_types import OnyxFileExtensions
|
||||
@@ -51,7 +49,6 @@ KNOWN_OPENPYXL_BUGS = [
|
||||
"Unable to read workbook: could not read stylesheet from None",
|
||||
"Colors must be aRGB hex values",
|
||||
"Max value is",
|
||||
"There is no item named",
|
||||
]
|
||||
|
||||
|
||||
@@ -450,83 +447,104 @@ def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
|
||||
return presentation.markdown
|
||||
|
||||
|
||||
def _columns_to_keep(col_has_data: bytearray, max_empty: int) -> list[int]:
|
||||
"""Keep non-empty columns, plus runs of up to ``max_empty`` empty columns
|
||||
between them. Trailing empty columns are dropped."""
|
||||
def _worksheet_to_matrix(
|
||||
worksheet: Worksheet,
|
||||
) -> list[list[str]]:
|
||||
"""
|
||||
Converts a singular worksheet to a matrix of values.
|
||||
|
||||
Rows are padded to a uniform width. In openpyxl's read_only mode,
|
||||
iter_rows can yield rows of differing lengths (trailing empty cells
|
||||
are sometimes omitted), and downstream column cleanup assumes a
|
||||
rectangular matrix.
|
||||
"""
|
||||
rows: list[list[str]] = []
|
||||
max_len = 0
|
||||
for worksheet_row in worksheet.iter_rows(min_row=1, values_only=True):
|
||||
row = ["" if cell is None else str(cell) for cell in worksheet_row]
|
||||
if len(row) > max_len:
|
||||
max_len = len(row)
|
||||
rows.append(row)
|
||||
|
||||
for row in rows:
|
||||
if len(row) < max_len:
|
||||
row.extend([""] * (max_len - len(row)))
|
||||
|
||||
return rows
|
||||
|
||||
|
||||
def _clean_worksheet_matrix(matrix: list[list[str]]) -> list[list[str]]:
|
||||
"""
|
||||
Cleans a worksheet matrix by removing rows if there are N consecutive empty
|
||||
rows and removing cols if there are M consecutive empty columns
|
||||
"""
|
||||
MAX_EMPTY_ROWS = 2 # Runs longer than this are capped to max_empty; shorter runs are preserved as-is
|
||||
MAX_EMPTY_COLS = 2
|
||||
|
||||
# Row cleanup
|
||||
matrix = _remove_empty_runs(matrix, max_empty=MAX_EMPTY_ROWS)
|
||||
|
||||
if not matrix:
|
||||
return matrix
|
||||
|
||||
# Column cleanup — determine which columns to keep without transposing.
|
||||
num_cols = len(matrix[0])
|
||||
keep_cols = _columns_to_keep(matrix, num_cols, max_empty=MAX_EMPTY_COLS)
|
||||
if len(keep_cols) < num_cols:
|
||||
matrix = [[row[c] for c in keep_cols] for row in matrix]
|
||||
|
||||
return matrix
|
||||
|
||||
|
||||
def _columns_to_keep(
|
||||
matrix: list[list[str]], num_cols: int, max_empty: int
|
||||
) -> list[int]:
|
||||
"""Return the indices of columns to keep after removing empty-column runs.
|
||||
|
||||
Uses the same logic as ``_remove_empty_runs`` but operates on column
|
||||
indices so no transpose is needed.
|
||||
"""
|
||||
kept: list[int] = []
|
||||
empty_buffer: list[int] = []
|
||||
for c, has in enumerate(col_has_data):
|
||||
if has:
|
||||
kept.extend(empty_buffer[:max_empty])
|
||||
kept.append(c)
|
||||
empty_buffer = []
|
||||
|
||||
for col_idx in range(num_cols):
|
||||
col_is_empty = all(not row[col_idx] for row in matrix)
|
||||
if col_is_empty:
|
||||
empty_buffer.append(col_idx)
|
||||
else:
|
||||
empty_buffer.append(c)
|
||||
kept.extend(empty_buffer[:max_empty])
|
||||
kept.append(col_idx)
|
||||
empty_buffer = []
|
||||
|
||||
return kept
|
||||
|
||||
|
||||
def _sheet_to_csv(rows: Iterator[tuple[Any, ...]]) -> str:
|
||||
"""Stream worksheet rows into CSV text without materializing a dense matrix.
|
||||
def _remove_empty_runs(
|
||||
rows: list[list[str]],
|
||||
max_empty: int,
|
||||
) -> list[list[str]]:
|
||||
"""Removes entire runs of empty rows when the run length exceeds max_empty.
|
||||
|
||||
Empty rows are never stored. Column occupancy is tracked as a ``bytearray``
|
||||
bitmap so column trimming needs no transpose or copy. Runs of empty
|
||||
rows/columns longer than 2 are collapsed; shorter runs are preserved.
|
||||
|
||||
Scanning stops once ``MAX_XLSX_CELLS_PER_SHEET`` non-empty cells have been
|
||||
seen; the output gets a truncation marker row appended so downstream
|
||||
indexing sees that the sheet was cut off.
|
||||
Leading empty runs are capped to max_empty, just like interior runs.
|
||||
Trailing empty rows are always dropped since there is no subsequent
|
||||
non-empty row to flush them.
|
||||
"""
|
||||
MAX_EMPTY_ROWS_IN_OUTPUT = 2
|
||||
MAX_EMPTY_COLS_IN_OUTPUT = 2
|
||||
TRUNCATION_MARKER = "[truncated: sheet exceeded cell limit]"
|
||||
result: list[list[str]] = []
|
||||
empty_buffer: list[list[str]] = []
|
||||
|
||||
non_empty_rows: list[tuple[int, list[str]]] = []
|
||||
col_has_data = bytearray()
|
||||
total_non_empty = 0
|
||||
truncated = False
|
||||
for row in rows:
|
||||
# Check if empty
|
||||
if not any(row):
|
||||
if len(empty_buffer) < max_empty:
|
||||
empty_buffer.append(row)
|
||||
else:
|
||||
# Add upto max empty rows onto the result - that's what we allow
|
||||
result.extend(empty_buffer[:max_empty])
|
||||
# Add the new non-empty row
|
||||
result.append(row)
|
||||
empty_buffer = []
|
||||
|
||||
for row_idx, row_vals in enumerate(rows):
|
||||
# Fast-reject empty rows before allocating a list of "".
|
||||
if not any(v is not None and v != "" for v in row_vals):
|
||||
continue
|
||||
|
||||
cells = ["" if v is None else str(v) for v in row_vals]
|
||||
non_empty_rows.append((row_idx, cells))
|
||||
|
||||
if len(cells) > len(col_has_data):
|
||||
col_has_data.extend(b"\x00" * (len(cells) - len(col_has_data)))
|
||||
for i, v in enumerate(cells):
|
||||
if v:
|
||||
col_has_data[i] = 1
|
||||
total_non_empty += 1
|
||||
|
||||
if total_non_empty > MAX_XLSX_CELLS_PER_SHEET:
|
||||
truncated = True
|
||||
break
|
||||
|
||||
if not non_empty_rows:
|
||||
return ""
|
||||
|
||||
keep_cols = _columns_to_keep(col_has_data, MAX_EMPTY_COLS_IN_OUTPUT)
|
||||
if not keep_cols:
|
||||
return ""
|
||||
|
||||
buf = io.StringIO()
|
||||
writer = csv.writer(buf, lineterminator="\n")
|
||||
blank_row = [""] * len(keep_cols)
|
||||
last_idx = -1
|
||||
for row_idx, cells in non_empty_rows:
|
||||
gap = row_idx - last_idx - 1
|
||||
if gap > 0:
|
||||
for _ in range(min(gap, MAX_EMPTY_ROWS_IN_OUTPUT)):
|
||||
writer.writerow(blank_row)
|
||||
writer.writerow([cells[c] if c < len(cells) else "" for c in keep_cols])
|
||||
last_idx = row_idx
|
||||
|
||||
if truncated:
|
||||
writer.writerow([TRUNCATION_MARKER])
|
||||
|
||||
return buf.getvalue().rstrip("\n")
|
||||
return result
|
||||
|
||||
|
||||
def xlsx_sheet_extraction(file: IO[Any], file_name: str = "") -> list[tuple[str, str]]:
|
||||
@@ -554,24 +572,20 @@ def xlsx_sheet_extraction(file: IO[Any], file_name: str = "") -> list[tuple[str,
|
||||
raise
|
||||
|
||||
sheets: list[tuple[str, str]] = []
|
||||
try:
|
||||
for sheet in workbook.worksheets:
|
||||
# Declared dimensions can be different to what is actually there
|
||||
ro_sheet = cast(ReadOnlyWorksheet, sheet)
|
||||
ro_sheet.reset_dimensions()
|
||||
csv_text = _sheet_to_csv(ro_sheet.iter_rows(values_only=True))
|
||||
sheets.append((csv_text.strip(), ro_sheet.title))
|
||||
finally:
|
||||
workbook.close()
|
||||
|
||||
for sheet in workbook.worksheets:
|
||||
sheet_matrix = _clean_worksheet_matrix(_worksheet_to_matrix(sheet))
|
||||
buf = io.StringIO()
|
||||
writer = csv.writer(buf, lineterminator="\n")
|
||||
writer.writerows(sheet_matrix)
|
||||
csv_text = buf.getvalue().rstrip("\n")
|
||||
if csv_text.strip():
|
||||
sheets.append((csv_text, sheet.title))
|
||||
return sheets
|
||||
|
||||
|
||||
def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
|
||||
sheets = xlsx_sheet_extraction(file, file_name)
|
||||
return TEXT_SECTION_SEPARATOR.join(
|
||||
csv_text for csv_text, _title in sheets if csv_text
|
||||
)
|
||||
return TEXT_SECTION_SEPARATOR.join(csv_text for csv_text, _title in sheets)
|
||||
|
||||
|
||||
def eml_to_text(file: IO[Any]) -> str:
|
||||
|
||||
@@ -1,63 +0,0 @@
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
from typing import IO
|
||||
|
||||
from onyx.configs.constants import FileOrigin
|
||||
from onyx.file_store.file_store import get_default_file_store
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
# (content, content_type) -> file_id
|
||||
RawFileCallback = Callable[[IO[bytes], str], str]
|
||||
|
||||
|
||||
def stage_raw_file(
|
||||
content: IO,
|
||||
content_type: str,
|
||||
*,
|
||||
metadata: dict[str, Any],
|
||||
) -> str:
|
||||
"""Persist raw bytes to the file store with FileOrigin.INDEXING_STAGING.
|
||||
|
||||
`metadata` is attached to the file_record so that downstream promotion
|
||||
(in docprocessing) and orphan reaping (TTL janitor) can locate the file
|
||||
by its originating context.
|
||||
"""
|
||||
file_store = get_default_file_store()
|
||||
file_id = file_store.save_file(
|
||||
content=content,
|
||||
display_name=None,
|
||||
file_origin=FileOrigin.INDEXING_STAGING,
|
||||
file_type=content_type,
|
||||
file_metadata=metadata,
|
||||
)
|
||||
return file_id
|
||||
|
||||
|
||||
def build_raw_file_callback(
|
||||
*,
|
||||
index_attempt_id: int,
|
||||
cc_pair_id: int,
|
||||
tenant_id: str,
|
||||
) -> RawFileCallback:
|
||||
"""Build a per-attempt callback that connectors can invoke to opt in to
|
||||
raw-file persistence. The closure binds the attempt-level context as the
|
||||
staging metadata so the connector only needs to pass per-call info
|
||||
(bytes, content_type) and gets back a file_id to attach to its Document.
|
||||
"""
|
||||
metadata: dict[str, Any] = {
|
||||
"index_attempt_id": index_attempt_id,
|
||||
"cc_pair_id": cc_pair_id,
|
||||
"tenant_id": tenant_id,
|
||||
}
|
||||
|
||||
def _callback(content: IO[bytes], content_type: str) -> str:
|
||||
return stage_raw_file(
|
||||
content=content,
|
||||
content_type=content_type,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
return _callback
|
||||
@@ -166,7 +166,6 @@ from shared_configs.configs import CORS_ALLOWED_ORIGIN
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
|
||||
from shared_configs.configs import SENTRY_DSN
|
||||
from shared_configs.configs import SENTRY_TRACES_SAMPLE_RATE
|
||||
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
|
||||
|
||||
warnings.filterwarnings(
|
||||
@@ -440,7 +439,7 @@ def get_application(lifespan_override: Lifespan | None = None) -> FastAPI:
|
||||
sentry_sdk.init(
|
||||
dsn=SENTRY_DSN,
|
||||
integrations=[StarletteIntegration(), FastApiIntegration()],
|
||||
traces_sample_rate=SENTRY_TRACES_SAMPLE_RATE,
|
||||
traces_sample_rate=0.1,
|
||||
release=__version__,
|
||||
before_send=_add_instance_tags,
|
||||
)
|
||||
|
||||
@@ -63,7 +63,6 @@ from onyx.db.persona import get_persona_by_id
|
||||
from onyx.db.usage import increment_usage
|
||||
from onyx.db.usage import UsageType
|
||||
from onyx.db.user_file import get_file_id_by_user_file_id
|
||||
from onyx.db.user_file import user_can_access_chat_file
|
||||
from onyx.error_handling.error_codes import OnyxErrorCode
|
||||
from onyx.error_handling.exceptions import OnyxError
|
||||
from onyx.file_store.file_store import get_default_file_store
|
||||
@@ -867,18 +866,14 @@ def seed_chat_from_slack(
|
||||
def fetch_chat_file(
|
||||
file_id: str,
|
||||
request: Request,
|
||||
user: User = Depends(require_permission(Permission.BASIC_ACCESS)),
|
||||
_: User = Depends(require_permission(Permission.BASIC_ACCESS)),
|
||||
db_session: Session = Depends(get_session),
|
||||
) -> Response:
|
||||
|
||||
# For user files, we need to get the file id from the user file id
|
||||
file_id_from_user_file = get_file_id_by_user_file_id(file_id, user.id, db_session)
|
||||
file_id_from_user_file = get_file_id_by_user_file_id(file_id, db_session)
|
||||
if file_id_from_user_file:
|
||||
file_id = file_id_from_user_file
|
||||
elif not user_can_access_chat_file(file_id, user.id, db_session):
|
||||
# Return 404 (rather than 403) so callers cannot probe for file
|
||||
# existence across ownership boundaries.
|
||||
raise OnyxError(OnyxErrorCode.NOT_FOUND, "File not found")
|
||||
|
||||
file_store = get_default_file_store()
|
||||
file_record = file_store.read_file_record(file_id)
|
||||
|
||||
@@ -99,14 +99,6 @@ STRICT_CHUNK_TOKEN_LIMIT = (
|
||||
# Set up Sentry integration (for error logging)
|
||||
SENTRY_DSN = os.environ.get("SENTRY_DSN")
|
||||
|
||||
# Celery task spans dominate ingestion volume (~94%), so default celery
|
||||
# tracing to 0. Web/API traces stay at a small non-zero rate so http.server
|
||||
# traces remain available. Both are env-tunable without a code change.
|
||||
SENTRY_TRACES_SAMPLE_RATE = float(os.environ.get("SENTRY_TRACES_SAMPLE_RATE", "0.01"))
|
||||
SENTRY_CELERY_TRACES_SAMPLE_RATE = float(
|
||||
os.environ.get("SENTRY_CELERY_TRACES_SAMPLE_RATE", "0.0")
|
||||
)
|
||||
|
||||
|
||||
# Fields which should only be set on new search setting
|
||||
PRESERVED_SEARCH_FIELDS = [
|
||||
|
||||
@@ -1,130 +0,0 @@
|
||||
"""External dependency tests for onyx.file_store.staging.
|
||||
|
||||
Exercises the raw-file persistence hook used by the docfetching pipeline
|
||||
against a real file store (Postgres + MinIO/S3), since mocking the store
|
||||
would defeat the point of verifying that metadata round-trips through
|
||||
FileRecord.
|
||||
"""
|
||||
|
||||
from collections.abc import Generator
|
||||
from io import BytesIO
|
||||
from typing import Any
|
||||
from uuid import uuid4
|
||||
|
||||
import pytest
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from onyx.configs.constants import FileOrigin
|
||||
from onyx.connectors.interfaces import BaseConnector
|
||||
from onyx.db.file_record import delete_filerecord_by_file_id
|
||||
from onyx.db.file_record import get_filerecord_by_file_id
|
||||
from onyx.file_store.file_store import get_default_file_store
|
||||
from onyx.file_store.staging import build_raw_file_callback
|
||||
from onyx.file_store.staging import stage_raw_file
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def cleanup_file_ids(
|
||||
db_session: Session,
|
||||
) -> Generator[list[str], None, None]:
|
||||
created: list[str] = []
|
||||
yield created
|
||||
file_store = get_default_file_store()
|
||||
for fid in created:
|
||||
try:
|
||||
file_store.delete_file(fid)
|
||||
except Exception:
|
||||
delete_filerecord_by_file_id(file_id=fid, db_session=db_session)
|
||||
db_session.commit()
|
||||
|
||||
|
||||
def test_stage_raw_file_persists_with_origin_and_metadata(
|
||||
db_session: Session,
|
||||
tenant_context: None, # noqa: ARG001
|
||||
initialize_file_store: None, # noqa: ARG001
|
||||
cleanup_file_ids: list[str],
|
||||
) -> None:
|
||||
"""stage_raw_file writes a FileRecord with INDEXING_STAGING origin and
|
||||
round-trips the provided metadata verbatim."""
|
||||
metadata: dict[str, Any] = {
|
||||
"index_attempt_id": 42,
|
||||
"cc_pair_id": 7,
|
||||
"tenant_id": "tenant-abc",
|
||||
"extra": "payload",
|
||||
}
|
||||
content_bytes = b"hello raw file"
|
||||
content_type = "application/pdf"
|
||||
|
||||
file_id = stage_raw_file(
|
||||
content=BytesIO(content_bytes),
|
||||
content_type=content_type,
|
||||
metadata=metadata,
|
||||
)
|
||||
cleanup_file_ids.append(file_id)
|
||||
db_session.commit()
|
||||
|
||||
record = get_filerecord_by_file_id(file_id=file_id, db_session=db_session)
|
||||
assert record.file_origin == FileOrigin.INDEXING_STAGING
|
||||
assert record.file_type == content_type
|
||||
assert record.file_metadata == metadata
|
||||
|
||||
|
||||
def test_build_raw_file_callback_binds_attempt_context_per_call(
|
||||
db_session: Session,
|
||||
tenant_context: None, # noqa: ARG001
|
||||
initialize_file_store: None, # noqa: ARG001
|
||||
cleanup_file_ids: list[str],
|
||||
) -> None:
|
||||
"""The callback returned by build_raw_file_callback must bind the
|
||||
attempt-level context into every FileRecord it produces, without
|
||||
leaking state across invocations."""
|
||||
callback = build_raw_file_callback(
|
||||
index_attempt_id=1001,
|
||||
cc_pair_id=202,
|
||||
tenant_id="tenant-xyz",
|
||||
)
|
||||
|
||||
file_id_a = callback(BytesIO(b"alpha"), "text/plain")
|
||||
file_id_b = callback(BytesIO(b"beta"), "application/octet-stream")
|
||||
cleanup_file_ids.extend([file_id_a, file_id_b])
|
||||
db_session.commit()
|
||||
|
||||
assert file_id_a != file_id_b
|
||||
|
||||
for fid, expected_content_type in (
|
||||
(file_id_a, "text/plain"),
|
||||
(file_id_b, "application/octet-stream"),
|
||||
):
|
||||
record = get_filerecord_by_file_id(file_id=fid, db_session=db_session)
|
||||
assert record.file_origin == FileOrigin.INDEXING_STAGING
|
||||
assert record.file_type == expected_content_type
|
||||
assert record.file_metadata == {
|
||||
"index_attempt_id": 1001,
|
||||
"cc_pair_id": 202,
|
||||
"tenant_id": "tenant-xyz",
|
||||
}
|
||||
|
||||
|
||||
def test_set_raw_file_callback_on_base_connector() -> None:
|
||||
"""set_raw_file_callback must install the callback as an instance
|
||||
attribute usable by the connector."""
|
||||
|
||||
class _MinimalConnector(BaseConnector):
|
||||
def load_credentials(
|
||||
self,
|
||||
credentials: dict[str, Any], # noqa: ARG002
|
||||
) -> dict[str, Any] | None:
|
||||
return None
|
||||
|
||||
connector = _MinimalConnector()
|
||||
assert connector.raw_file_callback is None
|
||||
|
||||
sentinel_file_id = f"sentinel-{uuid4().hex[:8]}"
|
||||
|
||||
def _fake_callback(_content: Any, _content_type: str) -> str:
|
||||
return sentinel_file_id
|
||||
|
||||
connector.set_raw_file_callback(_fake_callback)
|
||||
|
||||
assert connector.raw_file_callback is _fake_callback
|
||||
assert connector.raw_file_callback(BytesIO(b""), "text/plain") == sentinel_file_id
|
||||
@@ -8,10 +8,8 @@ import io
|
||||
from typing import NamedTuple
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from onyx.file_store.models import FileDescriptor
|
||||
from tests.integration.common_utils.constants import API_SERVER_URL
|
||||
from tests.integration.common_utils.managers.chat import ChatSessionManager
|
||||
from tests.integration.common_utils.managers.file import FileManager
|
||||
from tests.integration.common_utils.managers.llm_provider import LLMProviderManager
|
||||
@@ -121,31 +119,3 @@ def test_public_assistant_with_user_files(
|
||||
assert (
|
||||
len(chat_history) >= 2
|
||||
), "Expected at least 2 messages (user message and assistant response)"
|
||||
|
||||
|
||||
def test_cannot_download_other_users_file_via_chat_file_endpoint(
|
||||
user_file_setup: UserFileTestSetup,
|
||||
) -> None:
|
||||
storage_file_id = user_file_setup.user1_file_descriptor["id"]
|
||||
user_file_id = user_file_setup.user1_file_id
|
||||
|
||||
owner_response = requests.get(
|
||||
f"{API_SERVER_URL}/chat/file/{storage_file_id}",
|
||||
headers=user_file_setup.user1_file_owner.headers,
|
||||
)
|
||||
assert owner_response.status_code == 200
|
||||
assert owner_response.content, "Owner should receive the file contents"
|
||||
|
||||
for file_id in (storage_file_id, user_file_id):
|
||||
user2_response = requests.get(
|
||||
f"{API_SERVER_URL}/chat/file/{file_id}",
|
||||
headers=user_file_setup.user2_non_owner.headers,
|
||||
)
|
||||
assert user2_response.status_code in (
|
||||
403,
|
||||
404,
|
||||
), (
|
||||
f"Expected access denied for non-owner, got {user2_response.status_code} "
|
||||
f"when fetching file_id={file_id}"
|
||||
)
|
||||
assert user2_response.content != owner_response.content
|
||||
|
||||
@@ -69,9 +69,6 @@ class TestGongConnectorCheckpoint:
|
||||
workspace_ids=["ws1", None],
|
||||
workspace_index=1,
|
||||
cursor="abc123",
|
||||
pending_transcripts={"call1": _make_transcript("call1")},
|
||||
pending_call_details_attempts=2,
|
||||
pending_retry_after=1234567890.5,
|
||||
)
|
||||
json_str = original.model_dump_json()
|
||||
restored = connector.validate_checkpoint_json(json_str)
|
||||
@@ -234,11 +231,7 @@ class TestGongConnectorCheckpoint:
|
||||
mock_request: MagicMock,
|
||||
connector: GongConnector,
|
||||
) -> None:
|
||||
"""Missing call details persist across checkpoint invocations and
|
||||
eventually yield ConnectorFailure once MAX_CALL_DETAILS_ATTEMPTS is hit.
|
||||
No in-call sleep — retries happen on subsequent invocations, gated by
|
||||
the wall-clock retry-after deadline on the checkpoint.
|
||||
"""
|
||||
"""When call details are missing after retries, yield ConnectorFailure."""
|
||||
transcript_response = MagicMock()
|
||||
transcript_response.status_code = 200
|
||||
transcript_response.json.return_value = {
|
||||
@@ -264,42 +257,23 @@ class TestGongConnectorCheckpoint:
|
||||
failures: list[ConnectorFailure] = []
|
||||
docs: list[Document] = []
|
||||
|
||||
# Jump the clock past any retry deadline on each invocation so we
|
||||
# exercise the retry path without real sleeping. The test for the
|
||||
# backoff-gate itself lives in test_backoff_gate_prevents_retry_too_soon.
|
||||
fake_now = [1_000_000.0]
|
||||
|
||||
def _advance_clock() -> float:
|
||||
fake_now[0] += 10_000.0
|
||||
return fake_now[0]
|
||||
|
||||
invocation_cap = GongConnector.MAX_CALL_DETAILS_ATTEMPTS + 5
|
||||
with patch(
|
||||
"onyx.connectors.gong.connector.time.time", side_effect=_advance_clock
|
||||
):
|
||||
for _ in range(invocation_cap):
|
||||
if not checkpoint.has_more:
|
||||
break
|
||||
generator = connector.load_from_checkpoint(0, fake_now[0], checkpoint)
|
||||
try:
|
||||
while True:
|
||||
item = next(generator)
|
||||
if isinstance(item, ConnectorFailure):
|
||||
failures.append(item)
|
||||
elif isinstance(item, Document):
|
||||
docs.append(item)
|
||||
except StopIteration as e:
|
||||
checkpoint = e.value
|
||||
with patch("onyx.connectors.gong.connector.time.sleep"):
|
||||
generator = connector.load_from_checkpoint(0, time.time(), checkpoint)
|
||||
try:
|
||||
while True:
|
||||
item = next(generator)
|
||||
if isinstance(item, ConnectorFailure):
|
||||
failures.append(item)
|
||||
elif isinstance(item, Document):
|
||||
docs.append(item)
|
||||
except StopIteration as e:
|
||||
checkpoint = e.value
|
||||
|
||||
assert len(docs) == 0
|
||||
assert len(failures) == 1
|
||||
assert failures[0].failed_document is not None
|
||||
assert failures[0].failed_document.document_id == "call1"
|
||||
assert checkpoint.has_more is False
|
||||
assert checkpoint.pending_transcripts == {}
|
||||
assert checkpoint.pending_call_details_attempts == 0
|
||||
assert checkpoint.pending_retry_after is None
|
||||
assert mock_request.call_count == 1 + GongConnector.MAX_CALL_DETAILS_ATTEMPTS
|
||||
|
||||
@patch.object(GongConnector, "_throttled_request")
|
||||
def test_multi_workspace_iteration(
|
||||
@@ -407,14 +381,12 @@ class TestGongConnectorCheckpoint:
|
||||
assert checkpoint.workspace_index == 1
|
||||
|
||||
@patch.object(GongConnector, "_throttled_request")
|
||||
def test_partial_details_defers_and_resolves_next_invocation(
|
||||
def test_retry_only_fetches_missing_ids(
|
||||
self,
|
||||
mock_request: MagicMock,
|
||||
connector: GongConnector,
|
||||
) -> None:
|
||||
"""A transcript whose call details are missing gets stashed into
|
||||
pending_transcripts and resolves on a later checkpoint invocation.
|
||||
Resolved docs are yielded in the order they become available."""
|
||||
"""Retry for missing call details should only re-request the missing IDs."""
|
||||
transcript_response = MagicMock()
|
||||
transcript_response.status_code = 200
|
||||
transcript_response.json.return_value = {
|
||||
@@ -432,7 +404,7 @@ class TestGongConnectorCheckpoint:
|
||||
"calls": [_make_call_detail("call1", "Call One")]
|
||||
}
|
||||
|
||||
# Second fetch (next invocation): returns call2
|
||||
# Second fetch (retry): returns call2
|
||||
missing_details = MagicMock()
|
||||
missing_details.status_code = 200
|
||||
missing_details.json.return_value = {
|
||||
@@ -452,48 +424,19 @@ class TestGongConnectorCheckpoint:
|
||||
)
|
||||
|
||||
docs: list[Document] = []
|
||||
|
||||
fake_now = [1_000_000.0]
|
||||
|
||||
def _advance_clock() -> float:
|
||||
fake_now[0] += 10_000.0
|
||||
return fake_now[0]
|
||||
|
||||
with patch(
|
||||
"onyx.connectors.gong.connector.time.time", side_effect=_advance_clock
|
||||
):
|
||||
# Invocation 1: fetches page + details, yields call1, stashes call2
|
||||
generator = connector.load_from_checkpoint(0, fake_now[0], checkpoint)
|
||||
with patch("onyx.connectors.gong.connector.time.sleep"):
|
||||
generator = connector.load_from_checkpoint(0, time.time(), checkpoint)
|
||||
try:
|
||||
while True:
|
||||
item = next(generator)
|
||||
if isinstance(item, Document):
|
||||
docs.append(item)
|
||||
except StopIteration as e:
|
||||
checkpoint = e.value
|
||||
|
||||
assert len(docs) == 1
|
||||
assert docs[0].semantic_identifier == "Call One"
|
||||
assert "call2" in checkpoint.pending_transcripts
|
||||
assert checkpoint.pending_call_details_attempts == 1
|
||||
assert checkpoint.pending_retry_after is not None
|
||||
assert checkpoint.has_more is True
|
||||
|
||||
# Invocation 2: retries missing (only call2), yields it, clears pending
|
||||
generator = connector.load_from_checkpoint(0, fake_now[0], checkpoint)
|
||||
try:
|
||||
while True:
|
||||
item = next(generator)
|
||||
if isinstance(item, Document):
|
||||
docs.append(item)
|
||||
except StopIteration as e:
|
||||
checkpoint = e.value
|
||||
except StopIteration:
|
||||
pass
|
||||
|
||||
assert len(docs) == 2
|
||||
assert docs[0].semantic_identifier == "Call One"
|
||||
assert docs[1].semantic_identifier == "Call Two"
|
||||
assert checkpoint.pending_transcripts == {}
|
||||
assert checkpoint.pending_call_details_attempts == 0
|
||||
assert checkpoint.pending_retry_after is None
|
||||
|
||||
# Verify: 3 API calls total (1 transcript + 1 full details + 1 retry for missing only)
|
||||
assert mock_request.call_count == 3
|
||||
@@ -501,107 +444,6 @@ class TestGongConnectorCheckpoint:
|
||||
retry_call_body = mock_request.call_args_list[2][1]["json"]
|
||||
assert retry_call_body["filter"]["callIds"] == ["call2"]
|
||||
|
||||
@patch.object(GongConnector, "_throttled_request")
|
||||
def test_backoff_gate_prevents_retry_too_soon(
|
||||
self,
|
||||
mock_request: MagicMock,
|
||||
connector: GongConnector,
|
||||
) -> None:
|
||||
"""If the retry-after deadline hasn't elapsed, _resolve_pending must
|
||||
NOT issue a /v2/calls/extensive request. Prevents burning through
|
||||
MAX_CALL_DETAILS_ATTEMPTS when workers re-invoke tightly.
|
||||
"""
|
||||
pending_transcript = _make_transcript("call1")
|
||||
fixed_now = 1_000_000.0
|
||||
# Deadline is 30s in the future from fixed_now
|
||||
retry_after = fixed_now + 30
|
||||
|
||||
checkpoint = GongConnectorCheckpoint(
|
||||
has_more=True,
|
||||
workspace_ids=[None],
|
||||
workspace_index=0,
|
||||
pending_transcripts={"call1": pending_transcript},
|
||||
pending_call_details_attempts=1,
|
||||
pending_retry_after=retry_after,
|
||||
)
|
||||
|
||||
with patch("onyx.connectors.gong.connector.time.time", return_value=fixed_now):
|
||||
generator = connector.load_from_checkpoint(0, fixed_now, checkpoint)
|
||||
try:
|
||||
while True:
|
||||
next(generator)
|
||||
except StopIteration as e:
|
||||
checkpoint = e.value
|
||||
|
||||
# No API calls should have been made — we were inside the backoff window
|
||||
mock_request.assert_not_called()
|
||||
# Pending state preserved for later retry
|
||||
assert "call1" in checkpoint.pending_transcripts
|
||||
assert checkpoint.pending_call_details_attempts == 1
|
||||
assert checkpoint.pending_retry_after == retry_after
|
||||
assert checkpoint.has_more is True
|
||||
|
||||
@patch.object(GongConnector, "_throttled_request")
|
||||
def test_pending_retry_does_not_block_on_time_sleep(
|
||||
self,
|
||||
mock_request: MagicMock,
|
||||
connector: GongConnector,
|
||||
) -> None:
|
||||
"""Pending-transcript retry must never call time.sleep() with a
|
||||
non-trivial delay — spacing between retries is enforced via the
|
||||
wall-clock retry-after deadline stored on the checkpoint, not by
|
||||
blocking inside load_from_checkpoint.
|
||||
"""
|
||||
transcript_response = MagicMock()
|
||||
transcript_response.status_code = 200
|
||||
transcript_response.json.return_value = {
|
||||
"callTranscripts": [_make_transcript("call1")],
|
||||
"records": {},
|
||||
}
|
||||
empty_details = MagicMock()
|
||||
empty_details.status_code = 200
|
||||
empty_details.json.return_value = {"calls": []}
|
||||
|
||||
mock_request.side_effect = [transcript_response] + [
|
||||
empty_details
|
||||
] * GongConnector.MAX_CALL_DETAILS_ATTEMPTS
|
||||
|
||||
checkpoint = GongConnectorCheckpoint(
|
||||
has_more=True,
|
||||
workspace_ids=[None],
|
||||
workspace_index=0,
|
||||
)
|
||||
|
||||
fake_now = [1_000_000.0]
|
||||
|
||||
def _advance_clock() -> float:
|
||||
fake_now[0] += 10_000.0
|
||||
return fake_now[0]
|
||||
|
||||
with (
|
||||
patch("onyx.connectors.gong.connector.time.sleep") as mock_sleep,
|
||||
patch(
|
||||
"onyx.connectors.gong.connector.time.time", side_effect=_advance_clock
|
||||
),
|
||||
):
|
||||
invocation_cap = GongConnector.MAX_CALL_DETAILS_ATTEMPTS + 5
|
||||
for _ in range(invocation_cap):
|
||||
if not checkpoint.has_more:
|
||||
break
|
||||
generator = connector.load_from_checkpoint(0, fake_now[0], checkpoint)
|
||||
try:
|
||||
while True:
|
||||
next(generator)
|
||||
except StopIteration as e:
|
||||
checkpoint = e.value
|
||||
|
||||
# The only legitimate sleep is the sub-second throttle in
|
||||
# _throttled_request (<= MIN_REQUEST_INTERVAL). Assert we never
|
||||
# sleep for anything close to the per-retry backoff delays.
|
||||
for call in mock_sleep.call_args_list:
|
||||
delay_arg = call.args[0] if call.args else 0
|
||||
assert delay_arg <= GongConnector.MIN_REQUEST_INTERVAL
|
||||
|
||||
@patch.object(GongConnector, "_throttled_request")
|
||||
def test_expired_cursor_restarts_workspace(
|
||||
self,
|
||||
|
||||
@@ -287,140 +287,3 @@ class TestFailedFolderIdsByEmail:
|
||||
)
|
||||
|
||||
assert len(failed_map) == 0
|
||||
|
||||
|
||||
class TestOrphanedPathBackfill:
|
||||
def _make_failed_map(
|
||||
self, entries: dict[str, set[str]]
|
||||
) -> ThreadSafeDict[str, ThreadSafeSet[str]]:
|
||||
return ThreadSafeDict({k: ThreadSafeSet(v) for k, v in entries.items()})
|
||||
|
||||
def _make_file(self, parent_id: str) -> MagicMock:
|
||||
file = MagicMock()
|
||||
file.user_email = "retriever@example.com"
|
||||
file.drive_file = {"parents": [parent_id]}
|
||||
return file
|
||||
|
||||
def test_backfills_intermediate_folders_into_failed_map(self) -> None:
|
||||
"""When a walk dead-ends at a confirmed orphan, all intermediate folder
|
||||
IDs must be added to failed_folder_ids_by_email for both emails so
|
||||
future files short-circuit via _get_folder_metadata's cache check."""
|
||||
connector = _make_connector()
|
||||
|
||||
# Chain: folderA -> folderB -> folderC (confirmed orphan)
|
||||
failed_map = self._make_failed_map(
|
||||
{
|
||||
"retriever@example.com": {"folderC"},
|
||||
"admin@example.com": {"folderC"},
|
||||
}
|
||||
)
|
||||
|
||||
folder_a = {"id": "folderA", "name": "A", "parents": ["folderB"]}
|
||||
folder_b = {"id": "folderB", "name": "B", "parents": ["folderC"]}
|
||||
|
||||
def mock_get_folder(
|
||||
_service: MagicMock, folder_id: str, _field_type: DriveFileFieldType
|
||||
) -> dict | None:
|
||||
if folder_id == "folderA":
|
||||
return folder_a
|
||||
if folder_id == "folderB":
|
||||
return folder_b
|
||||
return None
|
||||
|
||||
with (
|
||||
patch(
|
||||
"onyx.connectors.google_drive.connector.get_drive_service",
|
||||
return_value=MagicMock(),
|
||||
),
|
||||
patch(
|
||||
"onyx.connectors.google_drive.connector.get_folder_metadata",
|
||||
side_effect=mock_get_folder,
|
||||
),
|
||||
):
|
||||
connector._get_new_ancestors_for_files(
|
||||
files=[self._make_file("folderA")],
|
||||
seen_hierarchy_node_raw_ids=ThreadSafeSet(),
|
||||
fully_walked_hierarchy_node_raw_ids=ThreadSafeSet(),
|
||||
failed_folder_ids_by_email=failed_map,
|
||||
)
|
||||
|
||||
# Both emails confirmed folderC as orphan, so both get the backfill
|
||||
for email in ("retriever@example.com", "admin@example.com"):
|
||||
cached = failed_map.get(email, ThreadSafeSet())
|
||||
assert "folderA" in cached
|
||||
assert "folderB" in cached
|
||||
assert "folderC" in cached
|
||||
|
||||
def test_backfills_only_for_confirming_email(self) -> None:
|
||||
"""Only the email that confirmed the orphan gets the path backfilled."""
|
||||
connector = _make_connector()
|
||||
|
||||
# Only retriever confirmed folderC as orphan; admin has no entry
|
||||
failed_map = self._make_failed_map({"retriever@example.com": {"folderC"}})
|
||||
|
||||
folder_a = {"id": "folderA", "name": "A", "parents": ["folderB"]}
|
||||
folder_b = {"id": "folderB", "name": "B", "parents": ["folderC"]}
|
||||
|
||||
def mock_get_folder(
|
||||
_service: MagicMock, folder_id: str, _field_type: DriveFileFieldType
|
||||
) -> dict | None:
|
||||
if folder_id == "folderA":
|
||||
return folder_a
|
||||
if folder_id == "folderB":
|
||||
return folder_b
|
||||
return None
|
||||
|
||||
with (
|
||||
patch(
|
||||
"onyx.connectors.google_drive.connector.get_drive_service",
|
||||
return_value=MagicMock(),
|
||||
),
|
||||
patch(
|
||||
"onyx.connectors.google_drive.connector.get_folder_metadata",
|
||||
side_effect=mock_get_folder,
|
||||
),
|
||||
):
|
||||
connector._get_new_ancestors_for_files(
|
||||
files=[self._make_file("folderA")],
|
||||
seen_hierarchy_node_raw_ids=ThreadSafeSet(),
|
||||
fully_walked_hierarchy_node_raw_ids=ThreadSafeSet(),
|
||||
failed_folder_ids_by_email=failed_map,
|
||||
)
|
||||
|
||||
retriever_cached = failed_map.get("retriever@example.com", ThreadSafeSet())
|
||||
assert "folderA" in retriever_cached
|
||||
assert "folderB" in retriever_cached
|
||||
|
||||
# admin did not confirm the orphan — must not get the backfill
|
||||
assert failed_map.get("admin@example.com") is None
|
||||
|
||||
def test_short_circuits_on_backfilled_intermediate(self) -> None:
|
||||
"""A second file whose parent is already in failed_folder_ids_by_email
|
||||
must not trigger any folder metadata API calls."""
|
||||
connector = _make_connector()
|
||||
|
||||
# folderA already in the failed map from a previous walk
|
||||
failed_map = self._make_failed_map(
|
||||
{
|
||||
"retriever@example.com": {"folderA"},
|
||||
"admin@example.com": {"folderA"},
|
||||
}
|
||||
)
|
||||
|
||||
with (
|
||||
patch(
|
||||
"onyx.connectors.google_drive.connector.get_drive_service",
|
||||
return_value=MagicMock(),
|
||||
),
|
||||
patch(
|
||||
"onyx.connectors.google_drive.connector.get_folder_metadata"
|
||||
) as mock_api,
|
||||
):
|
||||
connector._get_new_ancestors_for_files(
|
||||
files=[self._make_file("folderA")],
|
||||
seen_hierarchy_node_raw_ids=ThreadSafeSet(),
|
||||
fully_walked_hierarchy_node_raw_ids=ThreadSafeSet(),
|
||||
failed_folder_ids_by_email=failed_map,
|
||||
)
|
||||
|
||||
mock_api.assert_not_called()
|
||||
|
||||
@@ -1,243 +0,0 @@
|
||||
"""Unit tests for WebConnector.retrieve_all_slim_docs (slim pruning path)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS
|
||||
from onyx.connectors.web.connector import WebConnector
|
||||
|
||||
BASE_URL = "http://example.com"
|
||||
|
||||
SINGLE_PAGE_HTML = (
|
||||
"<html><body><p>Content that should not appear in slim output</p></body></html>"
|
||||
)
|
||||
|
||||
RECURSIVE_ROOT_HTML = """
|
||||
<html><body>
|
||||
<a href="/page2">Page 2</a>
|
||||
<a href="/page3">Page 3</a>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
PAGE2_HTML = "<html><body><p>page 2</p></body></html>"
|
||||
PAGE3_HTML = "<html><body><p>page 3</p></body></html>"
|
||||
|
||||
|
||||
def _make_playwright_context_mock(url_to_html: dict[str, str]) -> MagicMock:
|
||||
"""Return a BrowserContext mock whose pages respond based on goto URL."""
|
||||
context = MagicMock()
|
||||
|
||||
def _new_page() -> MagicMock:
|
||||
page = MagicMock()
|
||||
visited: list[str] = []
|
||||
|
||||
def _goto(url: str, **kwargs: Any) -> MagicMock: # noqa: ARG001
|
||||
visited.append(url)
|
||||
page.url = url
|
||||
response = MagicMock()
|
||||
response.status = 200
|
||||
response.header_value.return_value = None # no cf-ray
|
||||
return response
|
||||
|
||||
def _content() -> str:
|
||||
return url_to_html.get(
|
||||
visited[-1] if visited else "", "<html><body></body></html>"
|
||||
)
|
||||
|
||||
page.goto.side_effect = _goto
|
||||
page.content.side_effect = _content
|
||||
return page
|
||||
|
||||
context.new_page.side_effect = _new_page
|
||||
return context
|
||||
|
||||
|
||||
def _make_playwright_mock() -> MagicMock:
|
||||
playwright = MagicMock()
|
||||
playwright.stop = MagicMock()
|
||||
return playwright
|
||||
|
||||
|
||||
def _make_page_mock(
|
||||
html: str, cf_ray: str | None = None, status: int = 200
|
||||
) -> MagicMock:
|
||||
"""Return a Playwright page mock with configurable status and CF header."""
|
||||
page = MagicMock()
|
||||
page.url = BASE_URL + "/"
|
||||
response = MagicMock()
|
||||
response.status = status
|
||||
response.header_value.side_effect = lambda h: cf_ray if h == "cf-ray" else None
|
||||
page.goto.return_value = response
|
||||
page.content.return_value = html
|
||||
return page
|
||||
|
||||
|
||||
@patch("onyx.connectors.web.connector.check_internet_connection")
|
||||
@patch("onyx.connectors.web.connector.requests.head")
|
||||
@patch("onyx.connectors.web.connector.start_playwright")
|
||||
def test_slim_yields_slim_documents(
|
||||
mock_start_playwright: MagicMock,
|
||||
mock_head: MagicMock,
|
||||
_mock_check: MagicMock,
|
||||
) -> None:
|
||||
"""retrieve_all_slim_docs yields SlimDocuments with the correct URL as id."""
|
||||
context = _make_playwright_context_mock({BASE_URL + "/": SINGLE_PAGE_HTML})
|
||||
mock_start_playwright.return_value = (_make_playwright_mock(), context)
|
||||
mock_head.return_value.headers = {"content-type": "text/html"}
|
||||
|
||||
connector = WebConnector(
|
||||
base_url=BASE_URL + "/",
|
||||
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
|
||||
)
|
||||
|
||||
docs = [doc for batch in connector.retrieve_all_slim_docs() for doc in batch]
|
||||
|
||||
assert len(docs) == 1
|
||||
assert isinstance(docs[0], SlimDocument)
|
||||
assert docs[0].id == BASE_URL + "/"
|
||||
|
||||
|
||||
@patch("onyx.connectors.web.connector.check_internet_connection")
|
||||
@patch("onyx.connectors.web.connector.requests.head")
|
||||
@patch("onyx.connectors.web.connector.start_playwright")
|
||||
def test_slim_skips_content_extraction(
|
||||
mock_start_playwright: MagicMock,
|
||||
mock_head: MagicMock,
|
||||
_mock_check: MagicMock,
|
||||
) -> None:
|
||||
"""web_html_cleanup is never called in slim mode."""
|
||||
context = _make_playwright_context_mock({BASE_URL + "/": SINGLE_PAGE_HTML})
|
||||
mock_start_playwright.return_value = (_make_playwright_mock(), context)
|
||||
mock_head.return_value.headers = {"content-type": "text/html"}
|
||||
|
||||
connector = WebConnector(
|
||||
base_url=BASE_URL + "/",
|
||||
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
|
||||
)
|
||||
|
||||
with patch("onyx.connectors.web.connector.web_html_cleanup") as mock_cleanup:
|
||||
list(connector.retrieve_all_slim_docs())
|
||||
mock_cleanup.assert_not_called()
|
||||
|
||||
|
||||
@patch("onyx.connectors.web.connector.check_internet_connection")
|
||||
@patch("onyx.connectors.web.connector.requests.head")
|
||||
@patch("onyx.connectors.web.connector.start_playwright")
|
||||
def test_slim_discovers_links_recursively(
|
||||
mock_start_playwright: MagicMock,
|
||||
mock_head: MagicMock,
|
||||
_mock_check: MagicMock,
|
||||
) -> None:
|
||||
"""In RECURSIVE mode, internal <a href> links are followed and all URLs yielded."""
|
||||
url_to_html = {
|
||||
BASE_URL + "/": RECURSIVE_ROOT_HTML,
|
||||
BASE_URL + "/page2": PAGE2_HTML,
|
||||
BASE_URL + "/page3": PAGE3_HTML,
|
||||
}
|
||||
context = _make_playwright_context_mock(url_to_html)
|
||||
mock_start_playwright.return_value = (_make_playwright_mock(), context)
|
||||
mock_head.return_value.headers = {"content-type": "text/html"}
|
||||
|
||||
connector = WebConnector(
|
||||
base_url=BASE_URL + "/",
|
||||
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value,
|
||||
)
|
||||
|
||||
ids = {
|
||||
doc.id
|
||||
for batch in connector.retrieve_all_slim_docs()
|
||||
for doc in batch
|
||||
if isinstance(doc, SlimDocument)
|
||||
}
|
||||
|
||||
assert ids == {
|
||||
BASE_URL + "/",
|
||||
BASE_URL + "/page2",
|
||||
BASE_URL + "/page3",
|
||||
}
|
||||
|
||||
|
||||
@patch("onyx.connectors.web.connector.check_internet_connection")
|
||||
@patch("onyx.connectors.web.connector.requests.head")
|
||||
@patch("onyx.connectors.web.connector.start_playwright")
|
||||
def test_normal_200_skips_5s_wait(
|
||||
mock_start_playwright: MagicMock,
|
||||
mock_head: MagicMock,
|
||||
_mock_check: MagicMock,
|
||||
) -> None:
|
||||
"""Normal 200 responses without bot-detection signals skip the 5s render wait."""
|
||||
page = _make_page_mock(SINGLE_PAGE_HTML, cf_ray=None, status=200)
|
||||
context = MagicMock()
|
||||
context.new_page.return_value = page
|
||||
mock_start_playwright.return_value = (_make_playwright_mock(), context)
|
||||
mock_head.return_value.headers = {"content-type": "text/html"}
|
||||
|
||||
connector = WebConnector(
|
||||
base_url=BASE_URL + "/",
|
||||
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
|
||||
)
|
||||
|
||||
list(connector.retrieve_all_slim_docs())
|
||||
|
||||
page.wait_for_timeout.assert_not_called()
|
||||
|
||||
|
||||
@patch("onyx.connectors.web.connector.check_internet_connection")
|
||||
@patch("onyx.connectors.web.connector.requests.head")
|
||||
@patch("onyx.connectors.web.connector.start_playwright")
|
||||
def test_cloudflare_applies_5s_wait(
|
||||
mock_start_playwright: MagicMock,
|
||||
mock_head: MagicMock,
|
||||
_mock_check: MagicMock,
|
||||
) -> None:
|
||||
"""Pages with a cf-ray header trigger the 5s wait before networkidle."""
|
||||
page = _make_page_mock(SINGLE_PAGE_HTML, cf_ray="abc123-LAX")
|
||||
context = MagicMock()
|
||||
context.new_page.return_value = page
|
||||
mock_start_playwright.return_value = (_make_playwright_mock(), context)
|
||||
mock_head.return_value.headers = {"content-type": "text/html"}
|
||||
|
||||
connector = WebConnector(
|
||||
base_url=BASE_URL + "/",
|
||||
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
|
||||
)
|
||||
|
||||
list(connector.retrieve_all_slim_docs())
|
||||
|
||||
page.wait_for_timeout.assert_called_once_with(5000)
|
||||
|
||||
|
||||
@patch("onyx.connectors.web.connector.time")
|
||||
@patch("onyx.connectors.web.connector.check_internet_connection")
|
||||
@patch("onyx.connectors.web.connector.requests.head")
|
||||
@patch("onyx.connectors.web.connector.start_playwright")
|
||||
def test_403_applies_5s_wait(
|
||||
mock_start_playwright: MagicMock,
|
||||
mock_head: MagicMock,
|
||||
_mock_check: MagicMock,
|
||||
_mock_time: MagicMock,
|
||||
) -> None:
|
||||
"""A 403 response triggers the 5s wait (common bot-detection challenge entry point)."""
|
||||
page = _make_page_mock(SINGLE_PAGE_HTML, cf_ray=None, status=403)
|
||||
context = MagicMock()
|
||||
context.new_page.return_value = page
|
||||
mock_start_playwright.return_value = (_make_playwright_mock(), context)
|
||||
mock_head.return_value.headers = {"content-type": "text/html"}
|
||||
|
||||
connector = WebConnector(
|
||||
base_url=BASE_URL + "/",
|
||||
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
|
||||
)
|
||||
|
||||
# All retries return 403 so no docs are found — that's expected here.
|
||||
# We only care that the 5s wait fired.
|
||||
try:
|
||||
list(connector.retrieve_all_slim_docs())
|
||||
except RuntimeError:
|
||||
pass
|
||||
|
||||
page.wait_for_timeout.assert_called_with(5000)
|
||||
@@ -1,11 +1,13 @@
|
||||
import io
|
||||
from typing import cast
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
import openpyxl
|
||||
from openpyxl.worksheet.worksheet import Worksheet
|
||||
|
||||
from onyx.file_processing.extract_file_text import _sheet_to_csv
|
||||
from onyx.file_processing.extract_file_text import _clean_worksheet_matrix
|
||||
from onyx.file_processing.extract_file_text import _worksheet_to_matrix
|
||||
from onyx.file_processing.extract_file_text import xlsx_sheet_extraction
|
||||
from onyx.file_processing.extract_file_text import xlsx_to_text
|
||||
|
||||
@@ -201,179 +203,50 @@ class TestXlsxToText:
|
||||
assert "r3c1" in lines[2] and "r3c2" in lines[2]
|
||||
|
||||
|
||||
class TestSheetToCsvJaggedRows:
|
||||
"""openpyxl's read-only mode yields rows of differing widths when
|
||||
trailing cells are empty. These tests exercise ``_sheet_to_csv``
|
||||
directly because ``_make_xlsx`` (via ``ws.append``) normalizes row
|
||||
widths, so jagged input can only be produced in-memory."""
|
||||
class TestWorksheetToMatrixJaggedRows:
|
||||
"""openpyxl read_only mode can yield rows of differing widths when
|
||||
trailing cells are empty. The matrix must be padded to a rectangle
|
||||
so downstream column cleanup can index safely."""
|
||||
|
||||
def test_shorter_trailing_rows_padded_in_output(self) -> None:
|
||||
csv_text = _sheet_to_csv(
|
||||
iter(
|
||||
[
|
||||
("A", "B", "C"),
|
||||
("X", "Y"),
|
||||
("P",),
|
||||
]
|
||||
)
|
||||
def test_pads_shorter_trailing_rows(self) -> None:
|
||||
ws = MagicMock()
|
||||
ws.iter_rows.return_value = iter(
|
||||
[
|
||||
("A", "B", "C"),
|
||||
("X", "Y"),
|
||||
("P",),
|
||||
]
|
||||
)
|
||||
assert csv_text.split("\n") == ["A,B,C", "X,Y,", "P,,"]
|
||||
matrix = _worksheet_to_matrix(ws)
|
||||
assert matrix == [["A", "B", "C"], ["X", "Y", ""], ["P", "", ""]]
|
||||
|
||||
def test_shorter_leading_row_padded_in_output(self) -> None:
|
||||
csv_text = _sheet_to_csv(
|
||||
iter(
|
||||
[
|
||||
("A",),
|
||||
("X", "Y", "Z"),
|
||||
]
|
||||
)
|
||||
def test_pads_when_first_row_is_shorter(self) -> None:
|
||||
ws = MagicMock()
|
||||
ws.iter_rows.return_value = iter(
|
||||
[
|
||||
("A",),
|
||||
("X", "Y", "Z"),
|
||||
]
|
||||
)
|
||||
assert csv_text.split("\n") == ["A,,", "X,Y,Z"]
|
||||
matrix = _worksheet_to_matrix(ws)
|
||||
assert matrix == [["A", "", ""], ["X", "Y", "Z"]]
|
||||
|
||||
def test_no_index_error_on_jagged_rows(self) -> None:
|
||||
"""Regression: the original dense-matrix version raised IndexError
|
||||
when a later row was shorter than an earlier row whose out-of-range
|
||||
columns happened to be empty."""
|
||||
csv_text = _sheet_to_csv(
|
||||
iter(
|
||||
[
|
||||
("A", "", "", "B"),
|
||||
("X", "Y"),
|
||||
]
|
||||
)
|
||||
def test_clean_worksheet_matrix_no_index_error_on_jagged_rows(self) -> None:
|
||||
"""Regression: previously raised IndexError when a later row was
|
||||
shorter than the first row and the out-of-range column on the
|
||||
first row was empty (so the short-circuit in `all()` did not
|
||||
save us)."""
|
||||
ws = MagicMock()
|
||||
ws.iter_rows.return_value = iter(
|
||||
[
|
||||
("A", "", "", "B"),
|
||||
("X", "Y"),
|
||||
]
|
||||
)
|
||||
assert csv_text.split("\n") == ["A,,,B", "X,Y,,"]
|
||||
|
||||
|
||||
class TestSheetToCsvStreaming:
|
||||
"""Pin the memory-safe streaming contract: empty rows are skipped
|
||||
cheaply, empty-row/column runs are collapsed to at most 2, and sheets
|
||||
with no data return the empty string."""
|
||||
|
||||
def test_empty_rows_between_data_capped_at_two(self) -> None:
|
||||
csv_text = _sheet_to_csv(
|
||||
iter(
|
||||
[
|
||||
("A", "B"),
|
||||
(None, None),
|
||||
(None, None),
|
||||
(None, None),
|
||||
(None, None),
|
||||
(None, None),
|
||||
("C", "D"),
|
||||
]
|
||||
)
|
||||
)
|
||||
# 5 empty rows collapsed to 2
|
||||
assert csv_text.split("\n") == ["A,B", ",", ",", "C,D"]
|
||||
|
||||
def test_empty_rows_at_or_below_cap_preserved(self) -> None:
|
||||
csv_text = _sheet_to_csv(
|
||||
iter(
|
||||
[
|
||||
("A", "B"),
|
||||
(None, None),
|
||||
(None, None),
|
||||
("C", "D"),
|
||||
]
|
||||
)
|
||||
)
|
||||
assert csv_text.split("\n") == ["A,B", ",", ",", "C,D"]
|
||||
|
||||
def test_empty_column_run_capped_at_two(self) -> None:
|
||||
csv_text = _sheet_to_csv(
|
||||
iter(
|
||||
[
|
||||
("A", None, None, None, None, "B"),
|
||||
("C", None, None, None, None, "D"),
|
||||
]
|
||||
)
|
||||
)
|
||||
# 4 empty cols between A and B collapsed to 2
|
||||
assert csv_text.split("\n") == ["A,,,B", "C,,,D"]
|
||||
|
||||
def test_completely_empty_stream_returns_empty_string(self) -> None:
|
||||
assert _sheet_to_csv(iter([])) == ""
|
||||
|
||||
def test_all_rows_empty_returns_empty_string(self) -> None:
|
||||
csv_text = _sheet_to_csv(
|
||||
iter(
|
||||
[
|
||||
(None, None),
|
||||
("", ""),
|
||||
(None,),
|
||||
]
|
||||
)
|
||||
)
|
||||
assert csv_text == ""
|
||||
|
||||
def test_trailing_empty_rows_dropped(self) -> None:
|
||||
csv_text = _sheet_to_csv(
|
||||
iter(
|
||||
[
|
||||
("A",),
|
||||
("B",),
|
||||
(None,),
|
||||
(None,),
|
||||
(None,),
|
||||
]
|
||||
)
|
||||
)
|
||||
# Trailing empties are never emitted (no subsequent non-empty row
|
||||
# to flush them against).
|
||||
assert csv_text.split("\n") == ["A", "B"]
|
||||
|
||||
def test_leading_empty_rows_capped_at_two(self) -> None:
|
||||
csv_text = _sheet_to_csv(
|
||||
iter(
|
||||
[
|
||||
(None, None),
|
||||
(None, None),
|
||||
(None, None),
|
||||
(None, None),
|
||||
(None, None),
|
||||
("A", "B"),
|
||||
]
|
||||
)
|
||||
)
|
||||
# 5 leading empty rows collapsed to 2
|
||||
assert csv_text.split("\n") == [",", ",", "A,B"]
|
||||
|
||||
def test_cell_cap_truncates_and_appends_marker(self) -> None:
|
||||
"""When total non-empty cells exceeds the cap, scanning stops and
|
||||
a truncation marker row is appended so downstream indexing sees
|
||||
the sheet was cut off."""
|
||||
with patch(
|
||||
"onyx.file_processing.extract_file_text.MAX_XLSX_CELLS_PER_SHEET", 5
|
||||
):
|
||||
csv_text = _sheet_to_csv(
|
||||
iter(
|
||||
[
|
||||
("A", "B", "C"),
|
||||
("D", "E", "F"),
|
||||
("G", "H", "I"),
|
||||
("J", "K", "L"),
|
||||
]
|
||||
)
|
||||
)
|
||||
lines = csv_text.split("\n")
|
||||
assert lines[-1] == "[truncated: sheet exceeded cell limit]"
|
||||
# First two rows (6 cells) trip the cap=5 check after row 2; the
|
||||
# third and fourth rows are never scanned.
|
||||
assert "G" not in csv_text
|
||||
assert "J" not in csv_text
|
||||
|
||||
def test_cell_cap_not_hit_no_marker(self) -> None:
|
||||
"""Under the cap, no truncation marker is appended."""
|
||||
csv_text = _sheet_to_csv(
|
||||
iter(
|
||||
[
|
||||
("A", "B"),
|
||||
("C", "D"),
|
||||
]
|
||||
)
|
||||
)
|
||||
assert "[truncated" not in csv_text
|
||||
matrix = _worksheet_to_matrix(ws)
|
||||
# Must not raise.
|
||||
cleaned = _clean_worksheet_matrix(matrix)
|
||||
assert cleaned == [["A", "", "", "B"], ["X", "Y", "", ""]]
|
||||
|
||||
|
||||
class TestXlsxSheetExtraction:
|
||||
@@ -408,9 +281,10 @@ class TestXlsxSheetExtraction:
|
||||
assert "a" in csv_text
|
||||
assert "b" in csv_text
|
||||
|
||||
def test_empty_sheet_included_with_empty_csv(self) -> None:
|
||||
"""Every sheet in the workbook appears in the result; an empty
|
||||
sheet contributes an empty csv_text alongside its title."""
|
||||
def test_empty_sheet_is_skipped(self) -> None:
|
||||
"""A sheet whose CSV output is empty/whitespace-only should NOT
|
||||
appear in the result — the `if csv_text.strip():` guard filters
|
||||
it out."""
|
||||
xlsx = _make_xlsx(
|
||||
{
|
||||
"Data": [["a", "b"]],
|
||||
@@ -418,17 +292,14 @@ class TestXlsxSheetExtraction:
|
||||
}
|
||||
)
|
||||
sheets = xlsx_sheet_extraction(xlsx)
|
||||
assert len(sheets) == 2
|
||||
titles = [title for _csv, title in sheets]
|
||||
assert titles == ["Data", "Empty"]
|
||||
empty_csv = next(csv_text for csv_text, title in sheets if title == "Empty")
|
||||
assert empty_csv == ""
|
||||
assert len(sheets) == 1
|
||||
assert sheets[0][1] == "Data"
|
||||
|
||||
def test_empty_workbook_returns_one_tuple_per_sheet(self) -> None:
|
||||
"""All sheets empty → one empty-csv tuple per sheet."""
|
||||
def test_empty_workbook_returns_empty_list(self) -> None:
|
||||
"""All sheets empty → empty list (not a list of empty tuples)."""
|
||||
xlsx = _make_xlsx({"Sheet1": [], "Sheet2": []})
|
||||
sheets = xlsx_sheet_extraction(xlsx)
|
||||
assert sheets == [("", "Sheet1"), ("", "Sheet2")]
|
||||
assert sheets == []
|
||||
|
||||
def test_single_sheet(self) -> None:
|
||||
xlsx = _make_xlsx({"Only": [["x", "y"], ["1", "2"]]})
|
||||
|
||||
@@ -5,7 +5,7 @@ home: https://www.onyx.app/
|
||||
sources:
|
||||
- "https://github.com/onyx-dot-app/onyx"
|
||||
type: application
|
||||
version: 0.4.47
|
||||
version: 0.4.46
|
||||
appVersion: latest
|
||||
annotations:
|
||||
category: Productivity
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
{{- range .Values.extraManifests }}
|
||||
---
|
||||
{{- if kindIs "string" . }}
|
||||
{{ tpl . $ }}
|
||||
{{- else }}
|
||||
{{ tpl (toYaml .) $ }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
@@ -1316,26 +1316,3 @@ configMap:
|
||||
HARD_DELETE_CHATS: ""
|
||||
MAX_ALLOWED_UPLOAD_SIZE_MB: ""
|
||||
DEFAULT_USER_FILE_MAX_UPLOAD_SIZE_MB: ""
|
||||
|
||||
# -- Additional arbitrary manifests to render as part of the release. Each entry
|
||||
# may be either a YAML mapping (a single Kubernetes object) or a multi-line
|
||||
# string that will be passed through `tpl` so it can reference release values.
|
||||
# Useful for injecting resources not covered by the chart (e.g. NetworkPolicies,
|
||||
# ExternalSecrets, custom CRs) without forking the chart.
|
||||
extraManifests: []
|
||||
# extraManifests:
|
||||
# - apiVersion: v1
|
||||
# kind: ConfigMap
|
||||
# metadata:
|
||||
# name: my-extra-config
|
||||
# data:
|
||||
# key: value
|
||||
# - |
|
||||
# apiVersion: networking.k8s.io/v1
|
||||
# kind: NetworkPolicy
|
||||
# metadata:
|
||||
# name: {{ include "onyx.fullname" . }}-extra
|
||||
# spec:
|
||||
# podSelector: {}
|
||||
# policyTypes:
|
||||
# - Ingress
|
||||
|
||||
377
desktop/src-tauri/Cargo.lock
generated
377
desktop/src-tauri/Cargo.lock
generated
@@ -82,28 +82,6 @@ version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
|
||||
|
||||
[[package]]
|
||||
name = "aws-lc-rs"
|
||||
version = "1.16.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc"
|
||||
dependencies = [
|
||||
"aws-lc-sys",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-lc-sys"
|
||||
version = "0.39.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "83a25cf98105baa966497416dbd42565ce3a8cf8dbfd59803ec9ad46f3126399"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"cmake",
|
||||
"dunce",
|
||||
"fs_extra",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.21.7"
|
||||
@@ -277,8 +255,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2"
|
||||
dependencies = [
|
||||
"find-msvc-tools",
|
||||
"jobserver",
|
||||
"libc",
|
||||
"shlex",
|
||||
]
|
||||
|
||||
@@ -315,12 +291,6 @@ version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
||||
|
||||
[[package]]
|
||||
name = "cfg_aliases"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.44"
|
||||
@@ -333,15 +303,6 @@ dependencies = [
|
||||
"windows-link 0.2.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cmake"
|
||||
version = "0.1.58"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "combine"
|
||||
version = "4.6.7"
|
||||
@@ -819,12 +780,6 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fs_extra"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
|
||||
|
||||
[[package]]
|
||||
name = "futf"
|
||||
version = "0.1.5"
|
||||
@@ -1042,10 +997,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"libc",
|
||||
"wasi 0.11.1+wasi-snapshot-preview1",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1055,11 +1008,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"libc",
|
||||
"r-efi 5.3.0",
|
||||
"wasip2",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1334,22 +1285,6 @@ dependencies = [
|
||||
"want",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-rustls"
|
||||
version = "0.27.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
|
||||
dependencies = [
|
||||
"http",
|
||||
"hyper",
|
||||
"hyper-util",
|
||||
"rustls",
|
||||
"rustls-pki-types",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-util"
|
||||
version = "0.1.20"
|
||||
@@ -1652,16 +1587,6 @@ version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130"
|
||||
|
||||
[[package]]
|
||||
name = "jobserver"
|
||||
version = "0.1.34"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
|
||||
dependencies = [
|
||||
"getrandom 0.3.4",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.91"
|
||||
@@ -1793,12 +1718,6 @@ version = "0.4.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
|
||||
|
||||
[[package]]
|
||||
name = "lru-slab"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
|
||||
|
||||
[[package]]
|
||||
name = "mac"
|
||||
version = "0.1.1"
|
||||
@@ -2122,7 +2041,6 @@ name = "onyx"
|
||||
version = "0.0.0-dev"
|
||||
dependencies = [
|
||||
"directories",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tauri",
|
||||
@@ -2147,12 +2065,6 @@ dependencies = [
|
||||
"pathdiff",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openssl-probe"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
|
||||
|
||||
[[package]]
|
||||
name = "option-ext"
|
||||
version = "0.2.0"
|
||||
@@ -2543,62 +2455,6 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quinn"
|
||||
version = "0.11.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"cfg_aliases",
|
||||
"pin-project-lite",
|
||||
"quinn-proto",
|
||||
"quinn-udp",
|
||||
"rustc-hash",
|
||||
"rustls",
|
||||
"socket2",
|
||||
"thiserror 2.0.18",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"web-time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quinn-proto"
|
||||
version = "0.11.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
|
||||
dependencies = [
|
||||
"aws-lc-rs",
|
||||
"bytes",
|
||||
"getrandom 0.3.4",
|
||||
"lru-slab",
|
||||
"rand 0.9.2",
|
||||
"ring",
|
||||
"rustc-hash",
|
||||
"rustls",
|
||||
"rustls-pki-types",
|
||||
"slab",
|
||||
"thiserror 2.0.18",
|
||||
"tinyvec",
|
||||
"tracing",
|
||||
"web-time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quinn-udp"
|
||||
version = "0.5.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
|
||||
dependencies = [
|
||||
"cfg_aliases",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"socket2",
|
||||
"tracing",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.45"
|
||||
@@ -2645,16 +2501,6 @@ dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.9.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
|
||||
dependencies = [
|
||||
"rand_chacha 0.9.0",
|
||||
"rand_core 0.9.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.2.2"
|
||||
@@ -2675,16 +2521,6 @@ dependencies = [
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.9.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.5.1"
|
||||
@@ -2703,15 +2539,6 @@ dependencies = [
|
||||
"getrandom 0.2.17",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
|
||||
dependencies = [
|
||||
"getrandom 0.3.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.2.0"
|
||||
@@ -2830,21 +2657,15 @@ dependencies = [
|
||||
"http-body",
|
||||
"http-body-util",
|
||||
"hyper",
|
||||
"hyper-rustls",
|
||||
"hyper-util",
|
||||
"js-sys",
|
||||
"log",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"quinn",
|
||||
"rustls",
|
||||
"rustls-pki-types",
|
||||
"rustls-platform-verifier",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tokio-util",
|
||||
"tower",
|
||||
"tower-http",
|
||||
@@ -2856,26 +2677,6 @@ dependencies = [
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.17.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"getrandom 0.2.17",
|
||||
"libc",
|
||||
"untrusted",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "2.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
|
||||
|
||||
[[package]]
|
||||
name = "rustc_version"
|
||||
version = "0.4.1"
|
||||
@@ -2885,81 +2686,6 @@ dependencies = [
|
||||
"semver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls"
|
||||
version = "0.23.37"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
|
||||
dependencies = [
|
||||
"aws-lc-rs",
|
||||
"once_cell",
|
||||
"rustls-pki-types",
|
||||
"rustls-webpki",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-native-certs"
|
||||
version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63"
|
||||
dependencies = [
|
||||
"openssl-probe",
|
||||
"rustls-pki-types",
|
||||
"schannel",
|
||||
"security-framework",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-pki-types"
|
||||
version = "1.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd"
|
||||
dependencies = [
|
||||
"web-time",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-platform-verifier"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d99feebc72bae7ab76ba994bb5e121b8d83d910ca40b36e0921f53becc41784"
|
||||
dependencies = [
|
||||
"core-foundation",
|
||||
"core-foundation-sys",
|
||||
"jni",
|
||||
"log",
|
||||
"once_cell",
|
||||
"rustls",
|
||||
"rustls-native-certs",
|
||||
"rustls-platform-verifier-android",
|
||||
"rustls-webpki",
|
||||
"security-framework",
|
||||
"security-framework-sys",
|
||||
"webpki-root-certs",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-platform-verifier-android"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f"
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
version = "0.103.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef"
|
||||
dependencies = [
|
||||
"aws-lc-rs",
|
||||
"ring",
|
||||
"rustls-pki-types",
|
||||
"untrusted",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustversion"
|
||||
version = "1.0.22"
|
||||
@@ -2975,15 +2701,6 @@ dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "schannel"
|
||||
version = "0.1.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939"
|
||||
dependencies = [
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "schemars"
|
||||
version = "0.8.22"
|
||||
@@ -3041,29 +2758,6 @@ version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
||||
|
||||
[[package]]
|
||||
name = "security-framework"
|
||||
version = "3.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
|
||||
dependencies = [
|
||||
"bitflags 2.11.0",
|
||||
"core-foundation",
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
"security-framework-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "security-framework-sys"
|
||||
version = "2.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3"
|
||||
dependencies = [
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "selectors"
|
||||
version = "0.24.0"
|
||||
@@ -3434,12 +3128,6 @@ version = "0.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||
|
||||
[[package]]
|
||||
name = "subtle"
|
||||
version = "2.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
|
||||
|
||||
[[package]]
|
||||
name = "swift-rs"
|
||||
version = "1.0.7"
|
||||
@@ -3921,21 +3609,6 @@ dependencies = [
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3"
|
||||
dependencies = [
|
||||
"tinyvec_macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec_macros"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.50.0"
|
||||
@@ -3950,16 +3623,6 @@ dependencies = [
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-rustls"
|
||||
version = "0.26.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61"
|
||||
dependencies = [
|
||||
"rustls",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-util"
|
||||
version = "0.7.18"
|
||||
@@ -4241,12 +3904,6 @@ version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
|
||||
|
||||
[[package]]
|
||||
name = "untrusted"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
|
||||
|
||||
[[package]]
|
||||
name = "url"
|
||||
version = "2.5.8"
|
||||
@@ -4493,16 +4150,6 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "web-time"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webkit2gtk"
|
||||
version = "2.0.2"
|
||||
@@ -4547,15 +4194,6 @@ dependencies = [
|
||||
"system-deps",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webpki-root-certs"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca"
|
||||
dependencies = [
|
||||
"rustls-pki-types",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webview2-com"
|
||||
version = "0.38.2"
|
||||
@@ -4810,15 +4448,6 @@ dependencies = [
|
||||
"windows-targets 0.48.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
|
||||
dependencies = [
|
||||
"windows-targets 0.52.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.59.0"
|
||||
@@ -5359,12 +4988,6 @@ dependencies = [
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zeroize"
|
||||
version = "1.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
|
||||
|
||||
[[package]]
|
||||
name = "zerotrie"
|
||||
version = "0.2.3"
|
||||
|
||||
@@ -19,7 +19,6 @@ directories = "5.0"
|
||||
tokio = { version = "1", features = ["time"] }
|
||||
window-vibrancy = "0.7.1"
|
||||
url = "2.5"
|
||||
reqwest = { version = "0.13", default-features = false, features = ["rustls"] }
|
||||
|
||||
[features]
|
||||
default = ["custom-protocol"]
|
||||
|
||||
@@ -583,31 +583,6 @@ fn open_in_default_browser(url: &str) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
#[tauri::command]
|
||||
async fn check_server_reachable(state: tauri::State<'_, ConfigState>) -> Result<(), String> {
|
||||
let url = state.config.read().unwrap().server_url.clone();
|
||||
let parsed = Url::parse(&url).map_err(|e| format!("Invalid URL: {}", e))?;
|
||||
match parsed.scheme() {
|
||||
"http" | "https" => {}
|
||||
_ => return Err("URL must use http or https".to_string()),
|
||||
}
|
||||
|
||||
let client = reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(5))
|
||||
.build()
|
||||
.map_err(|e| format!("Failed to build HTTP client: {}", e))?;
|
||||
|
||||
match client.head(parsed).send().await {
|
||||
Ok(_) => Ok(()),
|
||||
// Only definitive "server didn't answer" errors count as unreachable.
|
||||
// TLS / decode / redirect errors imply the server is listening — the
|
||||
// webview, which has its own trust store, is likely to succeed even
|
||||
// when rustls rejects a self-signed cert.
|
||||
Err(e) if e.is_connect() || e.is_timeout() => Err(e.to_string()),
|
||||
Err(_) => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
#[tauri::command]
|
||||
fn open_in_browser(url: String) -> Result<(), String> {
|
||||
let parsed_url = Url::parse(&url).map_err(|_| "Invalid URL".to_string())?;
|
||||
@@ -1319,7 +1294,6 @@ fn main() {
|
||||
get_bootstrap_state,
|
||||
set_server_url,
|
||||
get_config_path_cmd,
|
||||
check_server_reachable,
|
||||
open_in_browser,
|
||||
open_config_file,
|
||||
open_config_directory,
|
||||
|
||||
@@ -459,19 +459,8 @@
|
||||
return;
|
||||
}
|
||||
|
||||
// Not first launch and not explicit settings — confirm the server
|
||||
// is reachable before handing the webview over. Otherwise the user
|
||||
// lands on a native "connection refused" page with no way back.
|
||||
try {
|
||||
await invoke("check_server_reachable");
|
||||
} catch (reachErr) {
|
||||
showSettings();
|
||||
showError(
|
||||
`Could not connect to ${currentServerUrl}. Check the URL and your network connection.`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// Not first launch and not explicit settings
|
||||
// Auto-redirect to configured domain
|
||||
window.location.href = currentServerUrl;
|
||||
} catch (error) {
|
||||
// On error, default to cloud
|
||||
|
||||
@@ -6,6 +6,7 @@ import { LlmDescriptor, LlmManager } from "@/lib/hooks";
|
||||
import { structureValue } from "@/lib/llmConfig/utils";
|
||||
import { getModelIcon } from "@/lib/llmConfig";
|
||||
import { AGGREGATOR_PROVIDERS } from "@/lib/llmConfig/svc";
|
||||
|
||||
import { Slider } from "@/components/ui/slider";
|
||||
import { useUser } from "@/providers/UserProvider";
|
||||
import Text from "@/refresh-components/texts/Text";
|
||||
|
||||
@@ -3,20 +3,18 @@
|
||||
import { useState, useMemo, useRef, useEffect } from "react";
|
||||
import { PopoverMenu } from "@/refresh-components/Popover";
|
||||
import InputTypeIn from "@/refresh-components/inputs/InputTypeIn";
|
||||
import { Button, LineItemButton, Text } from "@opal/components";
|
||||
import { SvgCheck, SvgChevronRight } from "@opal/icons";
|
||||
import { Text } from "@opal/components";
|
||||
import { SvgCheck, SvgChevronDown, SvgChevronRight } from "@opal/icons";
|
||||
import { Section } from "@/layouts/general-layouts";
|
||||
import { LLMOption } from "./interfaces";
|
||||
import { buildLlmOptions, groupLlmOptions } from "./LLMPopover";
|
||||
import LineItem from "@/refresh-components/buttons/LineItem";
|
||||
import { LLMProviderDescriptor } from "@/interfaces/llm";
|
||||
import {
|
||||
Collapsible,
|
||||
CollapsibleContent,
|
||||
CollapsibleTrigger,
|
||||
} from "@/refresh-components/Collapsible";
|
||||
import { cn } from "@opal/utils";
|
||||
import { Interactive } from "@opal/core";
|
||||
import { ContentAction } from "@opal/layouts";
|
||||
|
||||
export interface ModelListContentProps {
|
||||
llmProviders: LLMProviderDescriptor[] | undefined;
|
||||
@@ -116,22 +114,20 @@ export default function ModelListContent({
|
||||
capabilities.length > 0 ? capabilities.join(", ") : undefined;
|
||||
|
||||
return (
|
||||
<LineItemButton
|
||||
<LineItem
|
||||
key={`${option.provider}:${option.modelName}`}
|
||||
selectVariant="select-heavy"
|
||||
state={selected ? "selected" : "empty"}
|
||||
icon={(props) => <div {...(props as any)} />}
|
||||
title={option.displayName}
|
||||
selected={selected}
|
||||
disabled={disabled}
|
||||
description={description}
|
||||
onClick={() => onSelect(option)}
|
||||
rightChildren={
|
||||
selected ? (
|
||||
<SvgCheck className="text-action-link-05" size={16} />
|
||||
<SvgCheck className="h-4 w-4 stroke-action-link-05 shrink-0" />
|
||||
) : null
|
||||
}
|
||||
sizePreset="main-ui"
|
||||
rounding="sm"
|
||||
/>
|
||||
>
|
||||
{option.displayName}
|
||||
</LineItem>
|
||||
);
|
||||
};
|
||||
|
||||
@@ -160,7 +156,7 @@ export default function ModelListContent({
|
||||
]
|
||||
: groupedOptions.length === 1
|
||||
? [
|
||||
<Section key="single-provider" gap={1}>
|
||||
<Section key="single-provider" gap={0.25}>
|
||||
{groupedOptions[0]!.options.map(renderModelItem)}
|
||||
</Section>,
|
||||
]
|
||||
@@ -171,45 +167,22 @@ export default function ModelListContent({
|
||||
key={group.key}
|
||||
open={open}
|
||||
onOpenChange={() => toggleGroup(group.key)}
|
||||
className="flex flex-col gap-1"
|
||||
>
|
||||
<CollapsibleTrigger asChild>
|
||||
<Interactive.Stateless prominence="tertiary">
|
||||
<Interactive.Container
|
||||
size="fit"
|
||||
rounding="sm"
|
||||
width="full"
|
||||
>
|
||||
<div className="pl-2 pr-1 py-1 w-full">
|
||||
<ContentAction
|
||||
sizePreset="secondary"
|
||||
variant="body"
|
||||
prominence="muted"
|
||||
icon={group.Icon}
|
||||
title={group.displayName}
|
||||
padding="fit"
|
||||
rightChildren={
|
||||
<Section>
|
||||
<Button
|
||||
icon={(props) => (
|
||||
<SvgChevronRight
|
||||
{...props}
|
||||
className={cn(
|
||||
"transition-all",
|
||||
open && "rotate-90",
|
||||
props.className
|
||||
)}
|
||||
/>
|
||||
)}
|
||||
prominence="tertiary"
|
||||
size="sm"
|
||||
/>
|
||||
</Section>
|
||||
}
|
||||
/>
|
||||
</div>
|
||||
</Interactive.Container>
|
||||
</Interactive.Stateless>
|
||||
<LineItem
|
||||
muted
|
||||
icon={group.Icon}
|
||||
strokeIcon={false}
|
||||
rightChildren={
|
||||
open ? (
|
||||
<SvgChevronDown className="h-4 w-4 stroke-text-04 shrink-0" />
|
||||
) : (
|
||||
<SvgChevronRight className="h-4 w-4 stroke-text-04 shrink-0" />
|
||||
)
|
||||
}
|
||||
>
|
||||
{group.displayName}
|
||||
</LineItem>
|
||||
</CollapsibleTrigger>
|
||||
|
||||
<CollapsibleContent>
|
||||
|
||||
@@ -51,16 +51,14 @@ test.describe("LLM Ordering", () => {
|
||||
await page.waitForSelector('[role="dialog"]', { timeout: 5000 });
|
||||
|
||||
const dialog = page.locator('[role="dialog"]');
|
||||
const allModelItems = dialog.locator("[data-interactive-state]");
|
||||
const allModelItems = dialog.locator("[data-selected]");
|
||||
await expect(allModelItems.first()).toBeVisible({ timeout: 5000 });
|
||||
|
||||
const count = await allModelItems.count();
|
||||
expect(count).toBeGreaterThan(0);
|
||||
|
||||
// Pick the first non-selected model so the trigger text changes after click
|
||||
const nonSelectedItem = dialog
|
||||
.locator('[data-interactive-state="empty"]')
|
||||
.first();
|
||||
const nonSelectedItem = dialog.locator('[data-selected="false"]').first();
|
||||
const hasNonSelected = (await nonSelectedItem.count()) > 0;
|
||||
const targetItem = hasNonSelected ? nonSelectedItem : allModelItems.first();
|
||||
|
||||
|
||||
@@ -300,13 +300,11 @@ test.describe("LLM Runtime Selection", () => {
|
||||
|
||||
const regenerateDialog = page.locator('[role="dialog"]');
|
||||
const alternateModelOption = regenerateDialog
|
||||
.locator('[data-interactive-state="empty"]')
|
||||
.locator('[data-selected="false"]')
|
||||
.first();
|
||||
|
||||
test.skip(
|
||||
(await regenerateDialog
|
||||
.locator('[data-interactive-state="empty"]')
|
||||
.count()) === 0,
|
||||
(await regenerateDialog.locator('[data-selected="false"]').count()) === 0,
|
||||
"Regenerate model picker requires at least two runtime model options"
|
||||
);
|
||||
|
||||
@@ -410,11 +408,13 @@ test.describe("LLM Runtime Selection", () => {
|
||||
const dialog = page.locator('[role="dialog"]');
|
||||
await dialog.getByPlaceholder("Search models...").fill(sharedModelName);
|
||||
|
||||
const sharedModelOptions = dialog.locator("[data-interactive-state]");
|
||||
const sharedModelOptions = dialog.locator("[data-selected]");
|
||||
await expect(sharedModelOptions).toHaveCount(2);
|
||||
// Two results with the same model name under different providers.
|
||||
// Groups are sorted alphabetically, so OpenAI (index 1) comes after Anthropic (index 0).
|
||||
const openAiModelOption = sharedModelOptions.nth(1);
|
||||
const openAiModelOption = dialog
|
||||
.getByRole("button", { name: /openai/i })
|
||||
.locator("..")
|
||||
.locator("[data-selected]")
|
||||
.first();
|
||||
await expect(openAiModelOption).toBeVisible();
|
||||
await openAiModelOption.click();
|
||||
await page.waitForSelector('[role="dialog"]', { state: "hidden" });
|
||||
@@ -434,12 +434,13 @@ test.describe("LLM Runtime Selection", () => {
|
||||
.getByPlaceholder("Search models...")
|
||||
.fill(sharedModelName);
|
||||
|
||||
const secondSharedModelOptions = secondDialog.locator(
|
||||
"[data-interactive-state]"
|
||||
);
|
||||
const secondSharedModelOptions = secondDialog.locator("[data-selected]");
|
||||
await expect(secondSharedModelOptions).toHaveCount(2);
|
||||
// Anthropic is index 0 (alphabetically first).
|
||||
const anthropicModelOption = secondSharedModelOptions.nth(0);
|
||||
const anthropicModelOption = secondDialog
|
||||
.getByRole("button", { name: /anthropic/i })
|
||||
.locator("..")
|
||||
.locator("[data-selected]")
|
||||
.first();
|
||||
await expect(anthropicModelOption).toBeVisible();
|
||||
await anthropicModelOption.click();
|
||||
await page.waitForSelector('[role="dialog"]', { state: "hidden" });
|
||||
@@ -447,11 +448,11 @@ test.describe("LLM Runtime Selection", () => {
|
||||
await page.getByTestId("model-selector").locator("button").last().click();
|
||||
await page.waitForSelector('[role="dialog"]', { state: "visible" });
|
||||
const verifyDialog = page.locator('[role="dialog"]');
|
||||
// Verify the Anthropic option (index 0) is selected.
|
||||
const selectedOption = verifyDialog.locator(
|
||||
'[data-interactive-state="selected"]'
|
||||
);
|
||||
await expect(selectedOption).toHaveCount(1);
|
||||
const selectedAnthropicOption = verifyDialog
|
||||
.getByRole("button", { name: /anthropic/i })
|
||||
.locator("..")
|
||||
.locator('[data-selected="true"]');
|
||||
await expect(selectedAnthropicOption).toHaveCount(1);
|
||||
await page.keyboard.press("Escape");
|
||||
await page.waitForSelector('[role="dialog"]', { state: "hidden" });
|
||||
|
||||
@@ -517,7 +518,7 @@ test.describe("LLM Runtime Selection", () => {
|
||||
await dialog.getByPlaceholder("Search models...").fill(restrictedModelName);
|
||||
|
||||
const restrictedModelOption = dialog
|
||||
.locator("[data-interactive-state]")
|
||||
.locator("[data-selected]")
|
||||
.filter({ hasText: restrictedModelName });
|
||||
|
||||
await expect(restrictedModelOption).toHaveCount(0);
|
||||
|
||||
@@ -85,10 +85,8 @@ export async function selectModelFromInputPopover(
|
||||
|
||||
for (const modelName of preferredModels) {
|
||||
await searchInput.fill(modelName);
|
||||
const modelOptions = dialog.locator("[data-interactive-state]");
|
||||
const nonSelectedOptions = dialog.locator(
|
||||
'[data-interactive-state="empty"]'
|
||||
);
|
||||
const modelOptions = dialog.locator("[data-selected]");
|
||||
const nonSelectedOptions = dialog.locator('[data-selected="false"]');
|
||||
|
||||
if ((await modelOptions.count()) > 0) {
|
||||
const candidate =
|
||||
@@ -112,7 +110,7 @@ export async function selectModelFromInputPopover(
|
||||
// Reset search so fallback sees all available models.
|
||||
await searchInput.fill("");
|
||||
|
||||
const nonSelectedOptions = dialog.locator('[data-interactive-state="empty"]');
|
||||
const nonSelectedOptions = dialog.locator('[data-selected="false"]');
|
||||
if ((await nonSelectedOptions.count()) > 0) {
|
||||
const fallback = nonSelectedOptions.first();
|
||||
await expect(fallback).toBeVisible();
|
||||
@@ -147,7 +145,7 @@ export async function switchModel(page: Page, modelName: string) {
|
||||
|
||||
const modelButton = page
|
||||
.locator('[role="dialog"]')
|
||||
.getByRole("button")
|
||||
.locator('[role="button"]')
|
||||
.filter({ hasText: modelName })
|
||||
.first();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user