Compare commits

..

1 Commit

Author SHA1 Message Date
Martin Bergamasco
a7b95c42d3 fix: consolidate Button/ and button/ case-split for Linux builds 2026-04-18 16:06:50 +02:00
47 changed files with 435 additions and 2204 deletions

View File

@@ -16,13 +16,9 @@
"source=onyx-devcontainer-local,target=/home/dev/.local,type=volume"
],
"containerEnv": {
"MODEL_SERVER_HOST": "inference_model_server",
"OPENSEARCH_HOST": "opensearch",
"POSTGRES_HOST": "relational_db",
"REDIS_HOST": "cache",
"S3_ENDPOINT_URL": "http://minio:9000",
"SSH_AUTH_SOCK": "/tmp/ssh-agent.sock",
"VESPA_HOST": "index"
"POSTGRES_HOST": "relational_db",
"REDIS_HOST": "cache"
},
"remoteUser": "${localEnv:DEVCONTAINER_REMOTE_USER:dev}",
"updateRemoteUserUID": false,

View File

@@ -40,7 +40,6 @@ ALLOWED_DOMAINS=(
"api.anthropic.com"
"api-staging.anthropic.com"
"files.anthropic.com"
"huggingface.co"
"sentry.io"
"update.code.visualstudio.com"
"pypi.org"

View File

@@ -403,7 +403,7 @@ jobs:
echo "CERT_ID=$CERT_ID" >> $GITHUB_ENV
echo "Certificate imported."
- uses: tauri-apps/tauri-action@84b9d35b5fc46c1e45415bdb6144030364f7ebc5 # ratchet:tauri-apps/tauri-action@action-v0.6.2
- uses: tauri-apps/tauri-action@73fb865345c54760d875b94642314f8c0c894afa # ratchet:tauri-apps/tauri-action@action-v0.6.1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
APPLE_ID: ${{ env.APPLE_ID }}

View File

@@ -42,7 +42,7 @@ jobs:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # ratchet:actions/checkout@v6
with:
persist-credentials: false
- uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # zizmor: ignore[cache-poisoning]
- uses: actions/setup-go@4dc6199c7b1a012772edbd06daecab0f50c9053c # zizmor: ignore[cache-poisoning]
with:
go-version: ${{ env.GO_VERSION }}
cache-dependency-path: "**/go.sum"

View File

@@ -1,27 +0,0 @@
"""Add file_id to documents
Revision ID: 91d150c361f6
Revises: a6fcd3d631f9
Create Date: 2026-04-16 15:43:30.314823
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "91d150c361f6"
down_revision = "a6fcd3d631f9"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Add the nullable ``file_id`` column to the ``document`` table."""
    file_id_column = sa.Column("file_id", sa.String(), nullable=True)
    op.add_column("document", file_id_column)
def downgrade() -> None:
    """Drop the ``file_id`` column from the ``document`` table."""
    op.drop_column("document", "file_id")

View File

@@ -1,48 +0,0 @@
"""replace document sync index with partial index
Replaces the composite index ix_document_sync_status (last_modified, last_synced)
with a partial index ix_document_needs_sync that only indexes rows where
last_modified > last_synced OR last_synced IS NULL.
The old index was never used by the query planner (0 scans in pg_stat_user_indexes)
because Postgres cannot use a B-tree composite index to evaluate a comparison
between two columns in the same row combined with an OR/IS NULL condition.
The partial index makes count_documents_by_needs_sync ~4000x faster for tenants
with no stale documents (161ms -> 0.04ms on a 929K row table) and ~17x faster
for tenants with large backlogs (846ms -> 50ms on a 164K row table).
Revision ID: a6fcd3d631f9
Revises: d129f37b3d87
Create Date: 2026-04-17 16:00:00.000000
"""
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision = "a6fcd3d631f9"
down_revision = "d129f37b3d87"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Create the partial needs-sync index, then drop the unused composite one.

    The partial index only covers rows that actually need syncing
    (last_modified > last_synced OR last_synced IS NULL), which the old
    composite B-tree index could never serve.
    """
    needs_sync_predicate = sa.text(
        "last_modified > last_synced OR last_synced IS NULL"
    )
    op.create_index(
        "ix_document_needs_sync",
        "document",
        ["id"],
        postgresql_where=needs_sync_predicate,
    )
    op.drop_index("ix_document_sync_status", table_name="document")
def downgrade() -> None:
    """Restore the old composite sync-status index, then drop the partial one."""
    composite_columns = ["last_modified", "last_synced"]
    op.create_index(
        "ix_document_sync_status",
        "document",
        composite_columns,
    )
    op.drop_index("ix_document_needs_sync", table_name="document")

View File

@@ -506,18 +506,6 @@ def _perform_external_group_sync(
ext_group_sync_func = sync_config.group_sync_config.group_sync_func
# Clean up stale rows from previous cycle BEFORE marking new ones.
# This ensures cleanup always runs regardless of whether the current
# sync succeeds — previously, cleanup only ran at the END of the sync,
# so if the sync failed (e.g. DB connection killed by
# idle_in_transaction_session_timeout during long API calls), stale
# rows would accumulate indefinitely.
logger.info(
f"Removing stale external groups from prior cycle for {source_type} "
f"for cc_pair: {cc_pair_id}"
)
remove_stale_external_groups(db_session, cc_pair_id)
logger.info(
f"Marking old external groups as stale for {source_type} for cc_pair: {cc_pair_id}"
)

View File

@@ -5,7 +5,6 @@ from pydantic import BaseModel
from sqlalchemy import delete
from sqlalchemy import select
from sqlalchemy import update
from sqlalchemy.dialects.postgresql import insert as pg_insert
from sqlalchemy.orm import Session
from onyx.access.utils import build_ext_group_name_for_onyx
@@ -78,15 +77,6 @@ def mark_old_external_groups_as_stale(
.where(PublicExternalUserGroup.cc_pair_id == cc_pair_id)
.values(stale=True)
)
# Commit immediately so the transaction closes before potentially long
# external API calls (e.g. Google Drive folder iteration). Without this,
# the DB connection sits idle-in-transaction during API calls and gets
# killed by idle_in_transaction_session_timeout, causing the entire sync
# to fail and stale cleanup to never run.
db_session.commit()
_UPSERT_BATCH_SIZE = 5000
def upsert_external_groups(
@@ -96,102 +86,91 @@ def upsert_external_groups(
source: DocumentSource,
) -> None:
"""
Batch upsert external user groups using INSERT ... ON CONFLICT DO UPDATE.
- For existing rows (same user_id, external_user_group_id, cc_pair_id),
sets stale=False
- For new rows, inserts with stale=False
- Same logic for PublicExternalUserGroup
Performs a true upsert operation for external user groups:
- For existing groups (same user_id, external_user_group_id, cc_pair_id), updates the stale flag to False
- For new groups, inserts them with stale=False
- For public groups, uses upsert logic as well
"""
# If there are no groups to add, return early
if not external_groups:
return
# Collect all emails from all groups to batch-add users at once
all_group_member_emails: set[str] = set()
# collect all emails from all groups to batch add all users at once for efficiency
all_group_member_emails = set()
for external_group in external_groups:
all_group_member_emails.update(external_group.user_emails)
for user_email in external_group.user_emails:
all_group_member_emails.add(user_email)
# Batch add users if they don't exist and get their ids
# batch add users if they don't exist and get their ids
all_group_members: list[User] = batch_add_ext_perm_user_if_not_exists(
db_session=db_session,
# NOTE: this function handles case sensitivity for emails
emails=list(all_group_member_emails),
)
# map emails to ids
email_id_map = {user.email.lower(): user.id for user in all_group_members}
# Build all user-group mappings and public-group mappings
user_group_mappings: list[dict] = []
public_group_mappings: list[dict] = []
# Process each external group
for external_group in external_groups:
external_group_id = build_ext_group_name_for_onyx(
ext_group_name=external_group.id,
source=source,
)
# Handle user-group mappings
for user_email in external_group.user_emails:
user_id = email_id_map.get(user_email.lower())
if user_id is None:
logger.warning(
f"User in group {external_group.id}"
f" with email {user_email} not found"
f"User in group {external_group.id} with email {user_email} not found"
)
continue
user_group_mappings.append(
{
"user_id": user_id,
"external_user_group_id": external_group_id,
"cc_pair_id": cc_pair_id,
"stale": False,
}
# Check if the user-group mapping already exists
existing_user_group = db_session.scalar(
select(User__ExternalUserGroupId).where(
User__ExternalUserGroupId.user_id == user_id,
User__ExternalUserGroupId.external_user_group_id
== external_group_id,
User__ExternalUserGroupId.cc_pair_id == cc_pair_id,
)
)
if existing_user_group:
# Update existing record
existing_user_group.stale = False
else:
# Insert new record
new_user_group = User__ExternalUserGroupId(
user_id=user_id,
external_user_group_id=external_group_id,
cc_pair_id=cc_pair_id,
stale=False,
)
db_session.add(new_user_group)
# Handle public group if needed
if external_group.gives_anyone_access:
public_group_mappings.append(
{
"external_user_group_id": external_group_id,
"cc_pair_id": cc_pair_id,
"stale": False,
}
# Check if the public group already exists
existing_public_group = db_session.scalar(
select(PublicExternalUserGroup).where(
PublicExternalUserGroup.external_user_group_id == external_group_id,
PublicExternalUserGroup.cc_pair_id == cc_pair_id,
)
)
# Deduplicate to avoid "ON CONFLICT DO UPDATE command cannot affect row
# a second time" when duplicate emails or overlapping groups produce
# identical (user_id, external_user_group_id, cc_pair_id) tuples.
user_group_mappings_deduped = list(
{
(m["user_id"], m["external_user_group_id"], m["cc_pair_id"]): m
for m in user_group_mappings
}.values()
)
# Batch upsert user-group mappings
for i in range(0, len(user_group_mappings_deduped), _UPSERT_BATCH_SIZE):
chunk = user_group_mappings_deduped[i : i + _UPSERT_BATCH_SIZE]
stmt = pg_insert(User__ExternalUserGroupId).values(chunk)
stmt = stmt.on_conflict_do_update(
index_elements=["user_id", "external_user_group_id", "cc_pair_id"],
set_={"stale": False},
)
db_session.execute(stmt)
# Deduplicate public group mappings as well
public_group_mappings_deduped = list(
{
(m["external_user_group_id"], m["cc_pair_id"]): m
for m in public_group_mappings
}.values()
)
# Batch upsert public group mappings
for i in range(0, len(public_group_mappings_deduped), _UPSERT_BATCH_SIZE):
chunk = public_group_mappings_deduped[i : i + _UPSERT_BATCH_SIZE]
stmt = pg_insert(PublicExternalUserGroup).values(chunk)
stmt = stmt.on_conflict_do_update(
index_elements=["external_user_group_id", "cc_pair_id"],
set_={"stale": False},
)
db_session.execute(stmt)
if existing_public_group:
# Update existing record
existing_public_group.stale = False
else:
# Insert new record
new_public_group = PublicExternalUserGroup(
external_user_group_id=external_group_id,
cc_pair_id=cc_pair_id,
stale=False,
)
db_session.add(new_public_group)
db_session.commit()

View File

@@ -27,7 +27,6 @@ from shared_configs.configs import MIN_THREADS_ML_MODELS
from shared_configs.configs import MODEL_SERVER_ALLOWED_HOST
from shared_configs.configs import MODEL_SERVER_PORT
from shared_configs.configs import SENTRY_DSN
from shared_configs.configs import SENTRY_TRACES_SAMPLE_RATE
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
@@ -102,7 +101,7 @@ def get_model_app() -> FastAPI:
sentry_sdk.init(
dsn=SENTRY_DSN,
integrations=[StarletteIntegration(), FastApiIntegration()],
traces_sample_rate=SENTRY_TRACES_SAMPLE_RATE,
traces_sample_rate=0.1,
release=__version__,
before_send=_add_instance_tags,
)

View File

@@ -55,7 +55,6 @@ from onyx.utils.logger import setup_logger
from shared_configs.configs import DEV_LOGGING_ENABLED
from shared_configs.configs import MULTI_TENANT
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
from shared_configs.configs import SENTRY_CELERY_TRACES_SAMPLE_RATE
from shared_configs.configs import SENTRY_DSN
from shared_configs.configs import TENANT_ID_PREFIX
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
@@ -70,7 +69,7 @@ if SENTRY_DSN:
sentry_sdk.init(
dsn=SENTRY_DSN,
integrations=[CeleryIntegration()],
traces_sample_rate=SENTRY_CELERY_TRACES_SAMPLE_RATE,
traces_sample_rate=0.1,
release=__version__,
before_send=_add_instance_tags,
)

View File

@@ -37,7 +37,6 @@ from onyx.redis.redis_connector import RedisConnector
from onyx.server.metrics.connector_health_metrics import on_index_attempt_status_change
from onyx.utils.logger import setup_logger
from onyx.utils.variable_functionality import global_version
from shared_configs.configs import SENTRY_CELERY_TRACES_SAMPLE_RATE
from shared_configs.configs import SENTRY_DSN
logger = setup_logger()
@@ -141,7 +140,7 @@ def _docfetching_task(
sentry_sdk.init(
dsn=SENTRY_DSN,
traces_sample_rate=SENTRY_CELERY_TRACES_SAMPLE_RATE,
traces_sample_rate=0.1,
release=__version__,
before_send=_add_instance_tags,
)

View File

@@ -58,8 +58,6 @@ from onyx.db.indexing_coordination import IndexingCoordination
from onyx.db.models import IndexAttempt
from onyx.file_store.document_batch_storage import DocumentBatchStorage
from onyx.file_store.document_batch_storage import get_document_batch_storage
from onyx.file_store.staging import build_raw_file_callback
from onyx.file_store.staging import RawFileCallback
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.indexing.indexing_pipeline import index_doc_batch_prepare
from onyx.redis.redis_hierarchy import cache_hierarchy_nodes_batch
@@ -92,7 +90,6 @@ def _get_connector_runner(
end_time: datetime,
include_permissions: bool,
leave_connector_active: bool = LEAVE_CONNECTOR_ACTIVE_ON_INITIALIZATION_FAILURE,
raw_file_callback: RawFileCallback | None = None,
) -> ConnectorRunner:
"""
NOTE: `start_time` and `end_time` are only used for poll connectors
@@ -111,7 +108,6 @@ def _get_connector_runner(
input_type=task,
connector_specific_config=attempt.connector_credential_pair.connector.connector_specific_config,
credential=attempt.connector_credential_pair.credential,
raw_file_callback=raw_file_callback,
)
# validate the connector settings
@@ -279,12 +275,6 @@ def run_docfetching_entrypoint(
f"credentials='{credential_id}'"
)
raw_file_callback = build_raw_file_callback(
index_attempt_id=index_attempt_id,
cc_pair_id=connector_credential_pair_id,
tenant_id=tenant_id,
)
connector_document_extraction(
app,
index_attempt_id,
@@ -292,7 +282,6 @@ def run_docfetching_entrypoint(
attempt.search_settings_id,
tenant_id,
callback,
raw_file_callback=raw_file_callback,
)
logger.info(
@@ -312,7 +301,6 @@ def connector_document_extraction(
search_settings_id: int,
tenant_id: str,
callback: IndexingHeartbeatInterface | None = None,
raw_file_callback: RawFileCallback | None = None,
) -> None:
"""Extract documents from connector and queue them for indexing pipeline processing.
@@ -463,7 +451,6 @@ def connector_document_extraction(
start_time=window_start,
end_time=window_end,
include_permissions=should_fetch_permissions_during_indexing,
raw_file_callback=raw_file_callback,
)
# don't use a checkpoint if we're explicitly indexing from

View File

@@ -868,15 +868,6 @@ MAX_EMBEDDED_IMAGES_PER_UPLOAD = max(
0, int(os.environ.get("MAX_EMBEDDED_IMAGES_PER_UPLOAD") or 1000)
)
# Maximum non-empty cells to extract from a single xlsx worksheet. Protects
# from OOM on honestly-huge spreadsheets: memory cost in the extractor is
# roughly proportional to this count. Once exceeded, the scan stops and a
# truncation marker row is appended to the sheet's CSV.
# Peak Memory ~= 100 B * MAX_CELLS
MAX_XLSX_CELLS_PER_SHEET = max(
0, int(os.environ.get("MAX_XLSX_CELLS_PER_SHEET") or 10_000_000)
)
# Use document summary for contextual rag
USE_DOCUMENT_SUMMARY = os.environ.get("USE_DOCUMENT_SUMMARY", "true").lower() == "true"
# Use chunk summary for contextual rag

View File

@@ -372,7 +372,6 @@ class FileOrigin(str, Enum):
CONNECTOR_METADATA = "connector_metadata"
GENERATED_REPORT = "generated_report"
INDEXING_CHECKPOINT = "indexing_checkpoint"
INDEXING_STAGING = "indexing_staging"
PLAINTEXT_CACHE = "plaintext_cache"
OTHER = "other"
QUERY_HISTORY_CSV = "query_history_csv"

View File

@@ -22,7 +22,6 @@ from onyx.db.credentials import backend_update_credential_json
from onyx.db.credentials import fetch_credential_by_id
from onyx.db.enums import AccessType
from onyx.db.models import Credential
from onyx.file_store.staging import RawFileCallback
from shared_configs.contextvars import get_current_tenant_id
@@ -108,7 +107,6 @@ def instantiate_connector(
input_type: InputType,
connector_specific_config: dict[str, Any],
credential: Credential,
raw_file_callback: RawFileCallback | None = None,
) -> BaseConnector:
connector_class = identify_connector_class(source, input_type)
@@ -132,9 +130,6 @@ def instantiate_connector(
connector.set_allow_images(get_image_extraction_and_analysis_enabled())
if raw_file_callback is not None:
connector.set_raw_file_callback(raw_file_callback)
return connector

View File

@@ -40,22 +40,6 @@ class GongConnectorCheckpoint(ConnectorCheckpoint):
cursor: str | None = None
# Cached time range — computed once, reused across checkpoint calls
time_range: tuple[str, str] | None = None
# Transcripts whose call details were not yet available from /v2/calls/extensive
# (Gong has a known race where transcript call IDs take time to propagate).
# Keyed by call_id. Retried on subsequent checkpoint invocations.
#
# Invariant: all entries share one resolution session — they're stashed
# together from a single page and share the attempt counter and retry
# deadline. load_from_checkpoint only fetches a new page when this dict
# is empty, so entries from different pages can't mix.
pending_transcripts: dict[str, dict[str, Any]] = {}
# Number of resolution attempts made for pending_transcripts so far.
pending_call_details_attempts: int = 0
# Unix timestamp before which we should not retry pending_transcripts.
# Enforces exponential backoff independent of worker cadence — Gong's
# transcript-ID propagation race can take tens of seconds to minutes,
# longer than typical worker reinvocation intervals.
pending_retry_after: float | None = None
class _TranscriptPage(BaseModel):
@@ -78,15 +62,8 @@ class _CursorExpiredError(Exception):
class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
BASE_URL = "https://api.gong.io"
# Max number of attempts to resolve missing call details across checkpoint
# invocations before giving up and emitting ConnectorFailure.
MAX_CALL_DETAILS_ATTEMPTS = 6
# Base delay for exponential backoff between pending-transcript retry
# attempts. Delay before attempt N (N >= 2) is CALL_DETAILS_DELAY * 2^(N-2)
# seconds (30, 60, 120, 240, 480 = ~15.5min total) — matching the original
# blocking-retry schedule, but enforced via checkpoint deadline rather
# than in-call time.sleep.
CALL_DETAILS_DELAY = 30
CALL_DETAILS_DELAY = 30 # in seconds
# Gong API limit is 3 calls/sec — stay safely under it
MIN_REQUEST_INTERVAL = 0.5 # seconds between requests
@@ -210,6 +187,50 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
return call_to_metadata
def _fetch_call_details_with_retry(self, call_ids: list[str]) -> dict[str, Any]:
"""Fetch call details with retry for the Gong API race condition.
The Gong API has a known race where transcript call IDs don't immediately
appear in /v2/calls/extensive. Retries with exponential backoff, only
re-requesting the missing IDs on each attempt.
"""
call_details_map = self._get_call_details_by_ids(call_ids)
if set(call_ids) == set(call_details_map.keys()):
return call_details_map
for attempt in range(2, self.MAX_CALL_DETAILS_ATTEMPTS + 1):
missing_ids = list(set(call_ids) - set(call_details_map.keys()))
logger.warning(
f"_get_call_details_by_ids is missing call id's: current_attempt={attempt - 1} missing_call_ids={missing_ids}"
)
wait_seconds = self.CALL_DETAILS_DELAY * pow(2, attempt - 2)
logger.warning(
f"_get_call_details_by_ids waiting to retry: "
f"wait={wait_seconds}s "
f"current_attempt={attempt - 1} "
f"next_attempt={attempt} "
f"max_attempts={self.MAX_CALL_DETAILS_ATTEMPTS}"
)
time.sleep(wait_seconds)
# Only re-fetch the missing IDs, merge into existing results
new_details = self._get_call_details_by_ids(missing_ids)
call_details_map.update(new_details)
if set(call_ids) == set(call_details_map.keys()):
return call_details_map
missing_ids = list(set(call_ids) - set(call_details_map.keys()))
logger.error(
f"Giving up on missing call id's after "
f"{self.MAX_CALL_DETAILS_ATTEMPTS} attempts: "
f"missing_call_ids={missing_ids}"
f"proceeding with {len(call_details_map)} of "
f"{len(call_ids)} calls"
)
return call_details_map
@staticmethod
def _parse_parties(parties: list[dict]) -> dict[str, str]:
id_mapping = {}
@@ -292,119 +313,87 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
return start_time, end_time
def _build_document(
    self,
    transcript: dict[str, Any],
    call_details: dict[str, Any],
) -> Document:
    """Build a single Document from a transcript and its resolved call details.

    Raises KeyError if `transcript` lacks "callId"/"transcript" or
    `call_details["metaData"]` lacks "started"/"title"/"purpose"/"url"
    — callers are expected to pass details resolved for this transcript.
    """
    call_id = transcript["callId"]
    call_metadata = call_details["metaData"]
    # "started" is parsed below with datetime.fromisoformat, so it is
    # expected to be an ISO-8601 timestamp string.
    call_time_str = call_metadata["started"]
    call_title = call_metadata["title"]
    logger.info(
        f"Indexing Gong call id {call_id} from {call_time_str.split('T', 1)[0]}: {call_title}"
    )
    call_parties = cast(list[dict] | None, call_details.get("parties"))
    if call_parties is None:
        # Missing parties is tolerated: speakers fall back to "Unknown".
        logger.error(f"Couldn't get parties for Call ID: {call_id}")
        call_parties = []
    id_to_name_map = self._parse_parties(call_parties)
    speaker_to_name: dict[str, str] = {}
    transcript_text = ""
    call_purpose = call_metadata["purpose"]
    if call_purpose:
        transcript_text += f"Call Description: {call_purpose}\n\n"
    contents = transcript["transcript"]
    for segment in contents:
        speaker_id = segment.get("speakerId", "")
        if speaker_id not in speaker_to_name:
            if self.hide_user_info:
                # Anonymize speakers as "User 1", "User 2", ... in first-seen
                # order instead of exposing real names.
                speaker_to_name[speaker_id] = f"User {len(speaker_to_name) + 1}"
            else:
                speaker_to_name[speaker_id] = id_to_name_map.get(
                    speaker_id, "Unknown"
                )
        speaker_name = speaker_to_name[speaker_id]
        # NOTE(review): default of {} assumes "sentences" iterates like the
        # usual list of {"text": ...} dicts when absent — confirm with Gong API.
        sentences = segment.get("sentences", {})
        monolog = " ".join([sentence.get("text", "") for sentence in sentences])
        transcript_text += f"{speaker_name}: {monolog}\n\n"
    return Document(
        id=call_id,
        sections=[TextSection(link=call_metadata["url"], text=transcript_text)],
        source=DocumentSource.GONG,
        semantic_identifier=call_title or "Untitled",
        # Normalize the call start time to UTC for doc_updated_at.
        doc_updated_at=datetime.fromisoformat(call_time_str).astimezone(
            timezone.utc
        ),
        metadata={"client": call_metadata.get("system")},
    )
def _process_transcripts(
self,
transcripts: list[dict[str, Any]],
checkpoint: GongConnectorCheckpoint,
) -> Generator[Document | ConnectorFailure, None, None]:
"""Fetch call details for a page of transcripts and yield resulting
Documents. Transcripts whose call details are missing (Gong race
condition) are stashed into `checkpoint.pending_transcripts` for retry
on a future checkpoint invocation rather than blocking here.
"""
"""Process a batch of transcripts into Documents or ConnectorFailures."""
transcript_call_ids = cast(
list[str],
[t.get("callId") for t in transcripts if t.get("callId")],
)
call_details_map = (
self._get_call_details_by_ids(transcript_call_ids)
if transcript_call_ids
else {}
)
newly_stashed: list[str] = []
call_details_map = self._fetch_call_details_with_retry(transcript_call_ids)
for transcript in transcripts:
call_id = transcript.get("callId")
if not call_id:
logger.error(
"Couldn't get call information for transcript missing callId"
)
if not call_id or call_id not in call_details_map:
logger.error(f"Couldn't get call information for Call ID: {call_id}")
if call_id:
logger.error(
f"Call debug info: call_id={call_id} "
f"call_ids={transcript_call_ids} "
f"call_details_map={call_details_map.keys()}"
)
yield ConnectorFailure(
failed_document=DocumentFailure(document_id="unknown"),
failure_message="Transcript missing callId",
failed_document=DocumentFailure(
document_id=call_id or "unknown",
),
failure_message=f"Couldn't get call information for Call ID: {call_id}",
)
continue
if call_id in call_details_map:
yield self._build_document(transcript, call_details_map[call_id])
continue
call_details = call_details_map[call_id]
call_metadata = call_details["metaData"]
# Details not available yet — stash for retry on next invocation.
checkpoint.pending_transcripts[call_id] = transcript
newly_stashed.append(call_id)
if newly_stashed:
logger.warning(
f"Gong call details not yet available (race condition); "
f"deferring to next checkpoint invocation: "
f"call_ids={newly_stashed}"
call_time_str = call_metadata["started"]
call_title = call_metadata["title"]
logger.info(
f"Indexing Gong call id {call_id} from {call_time_str.split('T', 1)[0]}: {call_title}"
)
call_parties = cast(list[dict] | None, call_details.get("parties"))
if call_parties is None:
logger.error(f"Couldn't get parties for Call ID: {call_id}")
call_parties = []
id_to_name_map = self._parse_parties(call_parties)
speaker_to_name: dict[str, str] = {}
transcript_text = ""
call_purpose = call_metadata["purpose"]
if call_purpose:
transcript_text += f"Call Description: {call_purpose}\n\n"
contents = transcript["transcript"]
for segment in contents:
speaker_id = segment.get("speakerId", "")
if speaker_id not in speaker_to_name:
if self.hide_user_info:
speaker_to_name[speaker_id] = f"User {len(speaker_to_name) + 1}"
else:
speaker_to_name[speaker_id] = id_to_name_map.get(
speaker_id, "Unknown"
)
speaker_name = speaker_to_name[speaker_id]
sentences = segment.get("sentences", {})
monolog = " ".join([sentence.get("text", "") for sentence in sentences])
transcript_text += f"{speaker_name}: {monolog}\n\n"
yield Document(
id=call_id,
sections=[TextSection(link=call_metadata["url"], text=transcript_text)],
source=DocumentSource.GONG,
semantic_identifier=call_title or "Untitled",
doc_updated_at=datetime.fromisoformat(call_time_str).astimezone(
timezone.utc
),
metadata={"client": call_metadata.get("system")},
)
# First attempt on any newly-stashed transcripts counts as attempt #1.
# pending_call_details_attempts is guaranteed 0 here because
# load_from_checkpoint only reaches _process_transcripts when
# pending_transcripts was empty at entry (see early-return above).
checkpoint.pending_call_details_attempts = 1
checkpoint.pending_retry_after = time.time() + self._next_retry_delay(1)
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
combined = (
@@ -443,18 +432,6 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
checkpoint.has_more = True
return checkpoint
# Step 2: Resolve any transcripts stashed by a prior invocation whose
# call details were missing due to Gong's propagation race. Worker
# cadence between checkpoint calls provides the spacing between retry
# attempts — no in-call sleep needed.
if checkpoint.pending_transcripts:
yield from self._resolve_pending_transcripts(checkpoint)
# If pending still exists and we haven't exhausted attempts, defer
# the rest of this invocation — _resolve_pending_transcripts set
# has_more=True for us.
if checkpoint.pending_transcripts:
return checkpoint
workspace_ids = checkpoint.workspace_ids
# If we've exhausted all workspaces, we're done
@@ -473,7 +450,7 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
workspace_id = workspace_ids[checkpoint.workspace_index]
# Step 3: Fetch one page of transcripts
# Step 2: Fetch one page of transcripts
try:
page = self._fetch_transcript_page(
start_datetime=start_time,
@@ -496,102 +473,23 @@ class GongConnector(CheckpointedConnector[GongConnectorCheckpoint]):
checkpoint.has_more = True
return checkpoint
# Step 4: Process transcripts into documents. Missing-details
# transcripts get stashed into checkpoint.pending_transcripts.
# Step 3: Process transcripts into documents
if page.transcripts:
yield from self._process_transcripts(page.transcripts, checkpoint)
yield from self._process_transcripts(page.transcripts)
# Step 5: Update cursor/workspace state
# Step 4: Update checkpoint state
if page.next_cursor:
# More pages in this workspace
checkpoint.cursor = page.next_cursor
checkpoint.has_more = True
else:
# This workspace is exhausted — advance to next
checkpoint.workspace_index += 1
checkpoint.cursor = None
checkpoint.has_more = checkpoint.workspace_index < len(workspace_ids)
# If pending transcripts were stashed this invocation, we still have
# work to do on a future invocation even if pagination is exhausted.
if checkpoint.pending_transcripts:
checkpoint.has_more = True
return checkpoint
def _next_retry_delay(self, attempts_done: int) -> float:
"""Seconds to wait before attempt #(attempts_done + 1).
Matches the original exponential backoff: 30, 60, 120, 240, 480.
"""
return self.CALL_DETAILS_DELAY * pow(2, attempts_done - 1)
def _resolve_pending_transcripts(
    self,
    checkpoint: GongConnectorCheckpoint,
) -> Generator[Document | ConnectorFailure, None, None]:
    """Attempt to resolve transcripts whose call details were unavailable
    in a prior invocation. Mutates checkpoint in place: resolved transcripts
    are removed from pending_transcripts; on attempt exhaustion, emits
    ConnectorFailure for each unresolved call_id and clears pending state.
    If the backoff deadline hasn't elapsed yet, returns without issuing
    any API call so the next invocation can try again later.
    """
    if (
        checkpoint.pending_retry_after is not None
        and time.time() < checkpoint.pending_retry_after
    ):
        # Backoff still in effect — defer to a later invocation without
        # burning an attempt or an API call.
        checkpoint.has_more = True
        return
    pending_call_ids = list(checkpoint.pending_transcripts.keys())
    resolved = self._get_call_details_by_ids(pending_call_ids)
    for call_id, details in resolved.items():
        # pop() removes the transcript from pending state as it resolves;
        # None guards against an ID we weren't actually waiting on.
        transcript = checkpoint.pending_transcripts.pop(call_id, None)
        if transcript is None:
            continue
        yield self._build_document(transcript, details)
    if not checkpoint.pending_transcripts:
        # Everything resolved — clear the retry bookkeeping entirely.
        checkpoint.pending_call_details_attempts = 0
        checkpoint.pending_retry_after = None
        return
    checkpoint.pending_call_details_attempts += 1
    logger.warning(
        f"Gong call details still missing after "
        f"{checkpoint.pending_call_details_attempts}/"
        f"{self.MAX_CALL_DETAILS_ATTEMPTS} attempts: "
        f"missing_call_ids={list(checkpoint.pending_transcripts.keys())}"
    )
    if checkpoint.pending_call_details_attempts >= self.MAX_CALL_DETAILS_ATTEMPTS:
        # Out of attempts: surface one ConnectorFailure per unresolved call
        # and reset pending state so the run can move on.
        logger.error(
            f"Giving up on missing Gong call details after "
            f"{self.MAX_CALL_DETAILS_ATTEMPTS} attempts: "
            f"missing_call_ids={list(checkpoint.pending_transcripts.keys())}"
        )
        for call_id in list(checkpoint.pending_transcripts.keys()):
            yield ConnectorFailure(
                failed_document=DocumentFailure(document_id=call_id),
                failure_message=(
                    f"Couldn't get call details after {self.MAX_CALL_DETAILS_ATTEMPTS} attempts for Call ID: {call_id}"
                ),
            )
        checkpoint.pending_transcripts = {}
        checkpoint.pending_call_details_attempts = 0
        checkpoint.pending_retry_after = None
        # has_more is recomputed by the workspace iteration that follows;
        # reset to False here so a stale True from a prior invocation
        # can't leak out via any future early-return path.
        checkpoint.has_more = False
    else:
        # Still missing and attempts remain: schedule the next retry with
        # exponential backoff and signal more work.
        checkpoint.pending_retry_after = time.time() + self._next_retry_delay(
            checkpoint.pending_call_details_attempts
        )
        checkpoint.has_more = True
if __name__ == "__main__":
import os

View File

@@ -578,18 +578,8 @@ class GoogleDriveConnector(
current_id, file.user_email, field_type, failed_folder_ids_by_email
)
if not folder:
# Can't access this folder - stop climbing.
# If the terminal node is a confirmed orphan, backfill all
# intermediate folders into failed_folder_ids_by_email so
# future files short-circuit via _get_folder_metadata's
# cache check instead of re-climbing the whole chain.
if failed_folder_ids_by_email is not None:
for email in {file.user_email, self.primary_admin_email}:
email_failed_ids = failed_folder_ids_by_email.get(email)
if email_failed_ids and current_id in email_failed_ids:
failed_folder_ids_by_email.setdefault(
email, ThreadSafeSet()
).update(set(node_ids_in_walk))
# Can't access this folder - stop climbing
# Don't mark as fully walked since we didn't reach root
break
folder_parent_id = _get_parent_id_from_file(folder)

View File

@@ -167,12 +167,9 @@ class GoogleDriveCheckpoint(ConnectorCheckpoint):
default_factory=ThreadSafeSet
)
# Maps email → set of folder IDs that email should skip when walking the
# parent chain. Covers two cases:
# 1. Folders where that email confirmed no accessible parent (true orphans).
# 2. Intermediate folders on a path that dead-ended at a confirmed orphan —
# backfilled so future walks short-circuit earlier in the chain.
# In both cases _get_folder_metadata skips the API call and returns None.
# Maps email → set of IDs of folders where that email confirmed no accessible parent.
# Avoids redundant API calls when the same (folder, email) pair is
# encountered again within the same retrieval run.
failed_folder_ids_by_email: ThreadSafeDict[str, ThreadSafeSet[str]] = Field(
default_factory=ThreadSafeDict
)

View File

@@ -15,7 +15,6 @@ from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import Document
from onyx.connectors.models import HierarchyNode
from onyx.connectors.models import SlimDocument
from onyx.file_store.staging import RawFileCallback
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.variable_functionality import fetch_ee_implementation_or_noop
@@ -43,9 +42,6 @@ class NormalizationResult(BaseModel):
class BaseConnector(abc.ABC, Generic[CT]):
REDIS_KEY_PREFIX = "da_connector_data:"
# Optional raw-file persistence hook to save original file
raw_file_callback: RawFileCallback | None = None
@abc.abstractmethod
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
raise NotImplementedError
@@ -92,15 +88,6 @@ class BaseConnector(abc.ABC, Generic[CT]):
"""Implement if the underlying connector wants to skip/allow image downloading
based on the application level image analysis setting."""
def set_raw_file_callback(self, callback: RawFileCallback) -> None:
"""Inject the per-attempt raw-file persistence callback.
Wired up by the docfetching entrypoint via `instantiate_connector`.
Connectors that don't care about persisting raw bytes can ignore this
— `raw_file_callback` simply stays `None`.
"""
self.raw_file_callback = callback
@classmethod
def normalize_url(cls, url: str) -> "NormalizationResult": # noqa: ARG003
"""Normalize a URL to match the canonical Document.id format used during ingestion.

View File

@@ -19,7 +19,6 @@ from playwright.sync_api import Playwright
from playwright.sync_api import sync_playwright
from playwright.sync_api import TimeoutError
from requests_oauthlib import OAuth2Session
from typing_extensions import override
from urllib3.exceptions import MaxRetryError
from onyx.configs.app_configs import INDEX_BATCH_SIZE
@@ -33,16 +32,11 @@ from onyx.connectors.exceptions import CredentialExpiredError
from onyx.connectors.exceptions import InsufficientPermissionsError
from onyx.connectors.exceptions import UnexpectedValidationError
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.interfaces import SlimConnector
from onyx.connectors.models import Document
from onyx.connectors.models import HierarchyNode
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.file_processing.html_utils import web_html_cleanup
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
from onyx.utils.sitemap import list_pages_for_site
from onyx.utils.web_content import extract_pdf_text
@@ -61,6 +55,8 @@ class ScrapeSessionContext:
self.visited_links: set[str] = set()
self.content_hashes: set[int] = set()
self.doc_batch: list[Document | HierarchyNode] = []
self.at_least_one_doc: bool = False
self.last_error: str | None = None
self.needs_retry: bool = False
@@ -442,7 +438,7 @@ def _handle_cookies(context: BrowserContext, url: str) -> None:
)
class WebConnector(LoadConnector, SlimConnector):
class WebConnector(LoadConnector):
MAX_RETRIES = 3
def __init__(
@@ -497,14 +493,8 @@ class WebConnector(LoadConnector, SlimConnector):
index: int,
initial_url: str,
session_ctx: ScrapeSessionContext,
slim: bool = False,
) -> ScrapeResult:
"""Returns a ScrapeResult object with a doc and retry flag.
When slim=True, skips scroll, PDF content download, and content extraction.
The bot-detection render wait (5s) fires on CF/403 responses regardless of slim.
networkidle is always awaited so JS-rendered links are discovered correctly.
"""
"""Returns a ScrapeResult object with a doc and retry flag."""
if session_ctx.playwright is None:
raise RuntimeError("scrape_context.playwright is None")
@@ -525,16 +515,7 @@ class WebConnector(LoadConnector, SlimConnector):
is_pdf = is_pdf_resource(initial_url, content_type)
if is_pdf:
if slim:
result.doc = Document(
id=initial_url,
sections=[],
source=DocumentSource.WEB,
semantic_identifier=initial_url,
metadata={},
)
return result
# PDF files are not checked for links
response = requests.get(initial_url, headers=DEFAULT_HEADERS)
page_text, metadata = extract_pdf_text(response.content)
last_modified = response.headers.get("Last-Modified")
@@ -565,20 +546,14 @@ class WebConnector(LoadConnector, SlimConnector):
timeout=30000, # 30 seconds
wait_until="commit", # Wait for navigation to commit
)
# Give the page a moment to start rendering after navigation commits.
# Allows CloudFlare and other bot-detection challenges to complete.
page.wait_for_timeout(PAGE_RENDER_TIMEOUT_MS)
# Bot-detection JS challenges (CloudFlare, Imperva, etc.) need a moment
# to start network activity after commit before networkidle is meaningful.
# We detect this via the cf-ray header (CloudFlare) or a 403 response,
# which is the common entry point for JS-challenge-based bot detection.
is_bot_challenge = page_response is not None and (
page_response.header_value("cf-ray") is not None
or page_response.status == 403
)
if is_bot_challenge:
page.wait_for_timeout(PAGE_RENDER_TIMEOUT_MS)
# Wait for network activity to settle (handles SPAs, CF challenges, etc.)
# Wait for network activity to settle so SPAs that fetch content
# asynchronously after the initial JS bundle have time to render.
try:
# A bit of extra time to account for long-polling, websockets, etc.
page.wait_for_load_state("networkidle", timeout=PAGE_RENDER_TIMEOUT_MS)
except TimeoutError:
pass
@@ -601,7 +576,7 @@ class WebConnector(LoadConnector, SlimConnector):
session_ctx.visited_links.add(initial_url)
# If we got here, the request was successful
if not slim and self.scroll_before_scraping:
if self.scroll_before_scraping:
scroll_attempts = 0
previous_height = page.evaluate("document.body.scrollHeight")
while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
@@ -640,16 +615,6 @@ class WebConnector(LoadConnector, SlimConnector):
result.retry = True
return result
if slim:
result.doc = Document(
id=initial_url,
sections=[],
source=DocumentSource.WEB,
semantic_identifier=initial_url,
metadata={},
)
return result
# after this point, we don't need the caller to retry
parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)
@@ -701,13 +666,9 @@ class WebConnector(LoadConnector, SlimConnector):
return result
def load_from_state(self, slim: bool = False) -> GenerateDocumentsOutput:
"""Traverses through all pages found on the website and converts them into
documents.
When slim=True, yields SlimDocument objects (URL id only, no content).
Playwright is used in all modes — slim skips content extraction only.
"""
def load_from_state(self) -> GenerateDocumentsOutput:
"""Traverses through all pages found on the website
and converts them into documents"""
if not self.to_visit_list:
raise ValueError("No URLs to visit")
@@ -718,8 +679,6 @@ class WebConnector(LoadConnector, SlimConnector):
session_ctx = ScrapeSessionContext(base_url, self.to_visit_list)
session_ctx.initialize()
batch: list[Document | SlimDocument | HierarchyNode] = []
while session_ctx.to_visit:
initial_url = session_ctx.to_visit.pop()
if initial_url in session_ctx.visited_links:
@@ -734,9 +693,7 @@ class WebConnector(LoadConnector, SlimConnector):
continue
index = len(session_ctx.visited_links)
logger.info(
f"{index}: {'Slim-visiting' if slim else 'Visiting'} {initial_url}"
)
logger.info(f"{index}: Visiting {initial_url}")
# Add retry mechanism with exponential backoff
retry_count = 0
@@ -751,14 +708,12 @@ class WebConnector(LoadConnector, SlimConnector):
time.sleep(delay)
try:
result = self._do_scrape(index, initial_url, session_ctx, slim=slim)
result = self._do_scrape(index, initial_url, session_ctx)
if result.retry:
continue
if result.doc:
batch.append(
SlimDocument(id=result.doc.id) if slim else result.doc
)
session_ctx.doc_batch.append(result.doc)
except Exception as e:
session_ctx.last_error = f"Failed to fetch '{initial_url}': {e}"
logger.exception(session_ctx.last_error)
@@ -769,16 +724,16 @@ class WebConnector(LoadConnector, SlimConnector):
break # success / don't retry
if len(batch) >= self.batch_size:
if len(session_ctx.doc_batch) >= self.batch_size:
session_ctx.initialize()
session_ctx.at_least_one_doc = True
yield batch # ty: ignore[invalid-yield]
batch = []
yield session_ctx.doc_batch
session_ctx.doc_batch = []
if batch:
if session_ctx.doc_batch:
session_ctx.stop()
session_ctx.at_least_one_doc = True
yield batch # ty: ignore[invalid-yield]
yield session_ctx.doc_batch
if not session_ctx.at_least_one_doc:
if session_ctx.last_error:
@@ -787,22 +742,6 @@ class WebConnector(LoadConnector, SlimConnector):
session_ctx.stop()
@override
def retrieve_all_slim_docs(
self,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
callback: IndexingHeartbeatInterface | None = None,
) -> GenerateSlimDocumentOutput:
"""Yields SlimDocuments for all pages reachable from the configured URLs.
Uses the same Playwright crawl as full indexing but skips content extraction,
scroll, and PDF downloads. The 5s render wait fires only on bot-detection
responses (CloudFlare cf-ray header or HTTP 403).
The start/end parameters are ignored — WEB connector has no incremental path.
"""
yield from self.load_from_state(slim=True) # ty: ignore[invalid-yield]
def validate_connector_settings(self) -> None:
# Make sure we have at least one valid URL to check
if not self.to_visit_list:

View File

@@ -952,7 +952,6 @@ class Document(Base):
semantic_id: Mapped[str] = mapped_column(NullFilteredString)
# First Section's link
link: Mapped[str | None] = mapped_column(NullFilteredString, nullable=True)
file_id: Mapped[str | None] = mapped_column(String, nullable=True)
# The updated time is also used as a measure of the last successful state of the doc
# pulled from the source (to help skip reindexing already updated docs in case of
@@ -1055,9 +1054,9 @@ class Document(Base):
__table_args__ = (
Index(
"ix_document_needs_sync",
"id",
postgresql_where=text("last_modified > last_synced OR last_synced IS NULL"),
"ix_document_sync_status",
last_modified,
last_synced,
),
)

View File

@@ -129,13 +129,9 @@ def update_sync_record_status(
"""
sync_record = fetch_latest_sync_record(db_session, entity_id, sync_type)
if sync_record is None:
logger.warning(
f"No sync record found for entity_id={entity_id} "
f"sync_type={sync_type} — skipping status update. "
f"This typically means the record was never created "
f"(insert_sync_record failed silently) or was cleaned up."
raise ValueError(
f"No sync record found for entity_id={entity_id} sync_type={sync_type}"
)
return
sync_record.sync_status = sync_status
if num_docs_synced is not None:

View File

@@ -2,17 +2,11 @@ import datetime
from uuid import UUID
from sqlalchemy import func
from sqlalchemy import or_
from sqlalchemy import select
from sqlalchemy.orm import joinedload
from sqlalchemy.orm import selectinload
from sqlalchemy.orm import Session
from onyx.configs.constants import FileOrigin
from onyx.db.models import ChatMessage
from onyx.db.models import ChatSession
from onyx.db.models import ChatSessionSharedStatus
from onyx.db.models import FileRecord
from onyx.db.models import Persona
from onyx.db.models import Project__UserFile
from onyx.db.models import UserFile
@@ -114,73 +108,13 @@ def update_last_accessed_at_for_user_files(
db_session.commit()
def get_file_id_by_user_file_id(
user_file_id: str, user_id: UUID, db_session: Session
) -> str | None:
user_file = (
db_session.query(UserFile)
.filter(UserFile.id == user_file_id, UserFile.user_id == user_id)
.first()
)
def get_file_id_by_user_file_id(user_file_id: str, db_session: Session) -> str | None:
user_file = db_session.query(UserFile).filter(UserFile.id == user_file_id).first()
if user_file:
return user_file.file_id
return None
def user_can_access_chat_file(file_id: str, user_id: UUID, db_session: Session) -> bool:
"""Return True if `user_id` is allowed to read the raw `file_id` served by
`GET /chat/file/{file_id}`. Access is granted when any of:
- The `file_id` is the storage id of a `UserFile` owned by the user.
- The `file_id` is a persona avatar (`Persona.uploaded_image_id`); avatars
are visible to any authenticated user.
- The `file_id` appears in a `ChatMessage.files` descriptor of a chat
session the user owns or a session publicly shared via
`ChatSessionSharedStatus.PUBLIC`.
"""
owns_user_file = db_session.query(
select(UserFile.id)
.where(UserFile.file_id == file_id, UserFile.user_id == user_id)
.exists()
).scalar()
if owns_user_file:
return True
# TODO: move persona avatars to a dedicated endpoint (e.g.
# /assistants/{id}/avatar) so this branch can be removed. /chat/file is
# currently overloaded with multiple asset classes (user files, chat
# attachments, tool outputs, avatars), forcing this access-check fan-out.
#
# Restrict the avatar path to CHAT_UPLOAD-origin files so an attacker
# cannot bind another user's USER_FILE (or any other origin) to their
# own persona and read it through this check.
is_persona_avatar = db_session.query(
select(Persona.id)
.join(FileRecord, FileRecord.file_id == Persona.uploaded_image_id)
.where(
Persona.uploaded_image_id == file_id,
FileRecord.file_origin == FileOrigin.CHAT_UPLOAD,
)
.exists()
).scalar()
if is_persona_avatar:
return True
chat_file_stmt = (
select(ChatMessage.id)
.join(ChatSession, ChatMessage.chat_session_id == ChatSession.id)
.where(ChatMessage.files.op("@>")([{"id": file_id}]))
.where(
or_(
ChatSession.user_id == user_id,
ChatSession.shared_status == ChatSessionSharedStatus.PUBLIC,
)
)
.limit(1)
)
return db_session.execute(chat_file_stmt).first() is not None
def get_file_ids_by_user_file_ids(
user_file_ids: list[UUID], db_session: Session
) -> list[str]:

View File

@@ -12,7 +12,6 @@ from email.parser import Parser as EmailParser
from io import BytesIO
from pathlib import Path
from typing import Any
from typing import cast
from typing import IO
from typing import NamedTuple
from typing import Optional
@@ -21,11 +20,10 @@ from zipfile import BadZipFile
import chardet
import openpyxl
from openpyxl.worksheet._read_only import ReadOnlyWorksheet
from openpyxl.worksheet.worksheet import Worksheet
from PIL import Image
from onyx.configs.app_configs import MAX_EMBEDDED_IMAGES_PER_FILE
from onyx.configs.app_configs import MAX_XLSX_CELLS_PER_SHEET
from onyx.configs.constants import ONYX_METADATA_FILENAME
from onyx.configs.llm_configs import get_image_extraction_and_analysis_enabled
from onyx.file_processing.file_types import OnyxFileExtensions
@@ -51,7 +49,6 @@ KNOWN_OPENPYXL_BUGS = [
"Unable to read workbook: could not read stylesheet from None",
"Colors must be aRGB hex values",
"Max value is",
"There is no item named",
]
@@ -450,83 +447,104 @@ def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
return presentation.markdown
def _columns_to_keep(col_has_data: bytearray, max_empty: int) -> list[int]:
"""Keep non-empty columns, plus runs of up to ``max_empty`` empty columns
between them. Trailing empty columns are dropped."""
def _worksheet_to_matrix(
worksheet: Worksheet,
) -> list[list[str]]:
"""
Converts a singular worksheet to a matrix of values.
Rows are padded to a uniform width. In openpyxl's read_only mode,
iter_rows can yield rows of differing lengths (trailing empty cells
are sometimes omitted), and downstream column cleanup assumes a
rectangular matrix.
"""
rows: list[list[str]] = []
max_len = 0
for worksheet_row in worksheet.iter_rows(min_row=1, values_only=True):
row = ["" if cell is None else str(cell) for cell in worksheet_row]
if len(row) > max_len:
max_len = len(row)
rows.append(row)
for row in rows:
if len(row) < max_len:
row.extend([""] * (max_len - len(row)))
return rows
def _clean_worksheet_matrix(matrix: list[list[str]]) -> list[list[str]]:
"""
Cleans a worksheet matrix by removing rows if there are N consecutive empty
rows and removing cols if there are M consecutive empty columns
"""
MAX_EMPTY_ROWS = 2 # Runs longer than this are capped to max_empty; shorter runs are preserved as-is
MAX_EMPTY_COLS = 2
# Row cleanup
matrix = _remove_empty_runs(matrix, max_empty=MAX_EMPTY_ROWS)
if not matrix:
return matrix
# Column cleanup — determine which columns to keep without transposing.
num_cols = len(matrix[0])
keep_cols = _columns_to_keep(matrix, num_cols, max_empty=MAX_EMPTY_COLS)
if len(keep_cols) < num_cols:
matrix = [[row[c] for c in keep_cols] for row in matrix]
return matrix
def _columns_to_keep(
matrix: list[list[str]], num_cols: int, max_empty: int
) -> list[int]:
"""Return the indices of columns to keep after removing empty-column runs.
Uses the same logic as ``_remove_empty_runs`` but operates on column
indices so no transpose is needed.
"""
kept: list[int] = []
empty_buffer: list[int] = []
for c, has in enumerate(col_has_data):
if has:
kept.extend(empty_buffer[:max_empty])
kept.append(c)
empty_buffer = []
for col_idx in range(num_cols):
col_is_empty = all(not row[col_idx] for row in matrix)
if col_is_empty:
empty_buffer.append(col_idx)
else:
empty_buffer.append(c)
kept.extend(empty_buffer[:max_empty])
kept.append(col_idx)
empty_buffer = []
return kept
def _sheet_to_csv(rows: Iterator[tuple[Any, ...]]) -> str:
"""Stream worksheet rows into CSV text without materializing a dense matrix.
def _remove_empty_runs(
rows: list[list[str]],
max_empty: int,
) -> list[list[str]]:
"""Removes entire runs of empty rows when the run length exceeds max_empty.
Empty rows are never stored. Column occupancy is tracked as a ``bytearray``
bitmap so column trimming needs no transpose or copy. Runs of empty
rows/columns longer than 2 are collapsed; shorter runs are preserved.
Scanning stops once ``MAX_XLSX_CELLS_PER_SHEET`` non-empty cells have been
seen; the output gets a truncation marker row appended so downstream
indexing sees that the sheet was cut off.
Leading empty runs are capped to max_empty, just like interior runs.
Trailing empty rows are always dropped since there is no subsequent
non-empty row to flush them.
"""
MAX_EMPTY_ROWS_IN_OUTPUT = 2
MAX_EMPTY_COLS_IN_OUTPUT = 2
TRUNCATION_MARKER = "[truncated: sheet exceeded cell limit]"
result: list[list[str]] = []
empty_buffer: list[list[str]] = []
non_empty_rows: list[tuple[int, list[str]]] = []
col_has_data = bytearray()
total_non_empty = 0
truncated = False
for row in rows:
# Check if empty
if not any(row):
if len(empty_buffer) < max_empty:
empty_buffer.append(row)
else:
# Add upto max empty rows onto the result - that's what we allow
result.extend(empty_buffer[:max_empty])
# Add the new non-empty row
result.append(row)
empty_buffer = []
for row_idx, row_vals in enumerate(rows):
# Fast-reject empty rows before allocating a list of "".
if not any(v is not None and v != "" for v in row_vals):
continue
cells = ["" if v is None else str(v) for v in row_vals]
non_empty_rows.append((row_idx, cells))
if len(cells) > len(col_has_data):
col_has_data.extend(b"\x00" * (len(cells) - len(col_has_data)))
for i, v in enumerate(cells):
if v:
col_has_data[i] = 1
total_non_empty += 1
if total_non_empty > MAX_XLSX_CELLS_PER_SHEET:
truncated = True
break
if not non_empty_rows:
return ""
keep_cols = _columns_to_keep(col_has_data, MAX_EMPTY_COLS_IN_OUTPUT)
if not keep_cols:
return ""
buf = io.StringIO()
writer = csv.writer(buf, lineterminator="\n")
blank_row = [""] * len(keep_cols)
last_idx = -1
for row_idx, cells in non_empty_rows:
gap = row_idx - last_idx - 1
if gap > 0:
for _ in range(min(gap, MAX_EMPTY_ROWS_IN_OUTPUT)):
writer.writerow(blank_row)
writer.writerow([cells[c] if c < len(cells) else "" for c in keep_cols])
last_idx = row_idx
if truncated:
writer.writerow([TRUNCATION_MARKER])
return buf.getvalue().rstrip("\n")
return result
def xlsx_sheet_extraction(file: IO[Any], file_name: str = "") -> list[tuple[str, str]]:
@@ -554,24 +572,20 @@ def xlsx_sheet_extraction(file: IO[Any], file_name: str = "") -> list[tuple[str,
raise
sheets: list[tuple[str, str]] = []
try:
for sheet in workbook.worksheets:
# Declared dimensions can be different to what is actually there
ro_sheet = cast(ReadOnlyWorksheet, sheet)
ro_sheet.reset_dimensions()
csv_text = _sheet_to_csv(ro_sheet.iter_rows(values_only=True))
sheets.append((csv_text.strip(), ro_sheet.title))
finally:
workbook.close()
for sheet in workbook.worksheets:
sheet_matrix = _clean_worksheet_matrix(_worksheet_to_matrix(sheet))
buf = io.StringIO()
writer = csv.writer(buf, lineterminator="\n")
writer.writerows(sheet_matrix)
csv_text = buf.getvalue().rstrip("\n")
if csv_text.strip():
sheets.append((csv_text, sheet.title))
return sheets
def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
sheets = xlsx_sheet_extraction(file, file_name)
return TEXT_SECTION_SEPARATOR.join(
csv_text for csv_text, _title in sheets if csv_text
)
return TEXT_SECTION_SEPARATOR.join(csv_text for csv_text, _title in sheets)
def eml_to_text(file: IO[Any]) -> str:

View File

@@ -1,63 +0,0 @@
from collections.abc import Callable
from typing import Any
from typing import IO
from onyx.configs.constants import FileOrigin
from onyx.file_store.file_store import get_default_file_store
from onyx.utils.logger import setup_logger
logger = setup_logger()
# (content, content_type) -> file_id
RawFileCallback = Callable[[IO[bytes], str], str]
def stage_raw_file(
content: IO,
content_type: str,
*,
metadata: dict[str, Any],
) -> str:
"""Persist raw bytes to the file store with FileOrigin.INDEXING_STAGING.
`metadata` is attached to the file_record so that downstream promotion
(in docprocessing) and orphan reaping (TTL janitor) can locate the file
by its originating context.
"""
file_store = get_default_file_store()
file_id = file_store.save_file(
content=content,
display_name=None,
file_origin=FileOrigin.INDEXING_STAGING,
file_type=content_type,
file_metadata=metadata,
)
return file_id
def build_raw_file_callback(
*,
index_attempt_id: int,
cc_pair_id: int,
tenant_id: str,
) -> RawFileCallback:
"""Build a per-attempt callback that connectors can invoke to opt in to
raw-file persistence. The closure binds the attempt-level context as the
staging metadata so the connector only needs to pass per-call info
(bytes, content_type) and gets back a file_id to attach to its Document.
"""
metadata: dict[str, Any] = {
"index_attempt_id": index_attempt_id,
"cc_pair_id": cc_pair_id,
"tenant_id": tenant_id,
}
def _callback(content: IO[bytes], content_type: str) -> str:
return stage_raw_file(
content=content,
content_type=content_type,
metadata=metadata,
)
return _callback

View File

@@ -166,7 +166,6 @@ from shared_configs.configs import CORS_ALLOWED_ORIGIN
from shared_configs.configs import MULTI_TENANT
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
from shared_configs.configs import SENTRY_DSN
from shared_configs.configs import SENTRY_TRACES_SAMPLE_RATE
from shared_configs.contextvars import CURRENT_TENANT_ID_CONTEXTVAR
warnings.filterwarnings(
@@ -440,7 +439,7 @@ def get_application(lifespan_override: Lifespan | None = None) -> FastAPI:
sentry_sdk.init(
dsn=SENTRY_DSN,
integrations=[StarletteIntegration(), FastApiIntegration()],
traces_sample_rate=SENTRY_TRACES_SAMPLE_RATE,
traces_sample_rate=0.1,
release=__version__,
before_send=_add_instance_tags,
)

View File

@@ -63,7 +63,6 @@ from onyx.db.persona import get_persona_by_id
from onyx.db.usage import increment_usage
from onyx.db.usage import UsageType
from onyx.db.user_file import get_file_id_by_user_file_id
from onyx.db.user_file import user_can_access_chat_file
from onyx.error_handling.error_codes import OnyxErrorCode
from onyx.error_handling.exceptions import OnyxError
from onyx.file_store.file_store import get_default_file_store
@@ -867,18 +866,14 @@ def seed_chat_from_slack(
def fetch_chat_file(
file_id: str,
request: Request,
user: User = Depends(require_permission(Permission.BASIC_ACCESS)),
_: User = Depends(require_permission(Permission.BASIC_ACCESS)),
db_session: Session = Depends(get_session),
) -> Response:
# For user files, we need to get the file id from the user file id
file_id_from_user_file = get_file_id_by_user_file_id(file_id, user.id, db_session)
file_id_from_user_file = get_file_id_by_user_file_id(file_id, db_session)
if file_id_from_user_file:
file_id = file_id_from_user_file
elif not user_can_access_chat_file(file_id, user.id, db_session):
# Return 404 (rather than 403) so callers cannot probe for file
# existence across ownership boundaries.
raise OnyxError(OnyxErrorCode.NOT_FOUND, "File not found")
file_store = get_default_file_store()
file_record = file_store.read_file_record(file_id)

View File

@@ -99,14 +99,6 @@ STRICT_CHUNK_TOKEN_LIMIT = (
# Set up Sentry integration (for error logging)
SENTRY_DSN = os.environ.get("SENTRY_DSN")
# Celery task spans dominate ingestion volume (~94%), so default celery
# tracing to 0. Web/API traces stay at a small non-zero rate so http.server
# traces remain available. Both are env-tunable without a code change.
SENTRY_TRACES_SAMPLE_RATE = float(os.environ.get("SENTRY_TRACES_SAMPLE_RATE", "0.01"))
SENTRY_CELERY_TRACES_SAMPLE_RATE = float(
os.environ.get("SENTRY_CELERY_TRACES_SAMPLE_RATE", "0.0")
)
# Fields which should only be set on new search setting
PRESERVED_SEARCH_FIELDS = [

View File

@@ -1,130 +0,0 @@
"""External dependency tests for onyx.file_store.staging.
Exercises the raw-file persistence hook used by the docfetching pipeline
against a real file store (Postgres + MinIO/S3), since mocking the store
would defeat the point of verifying that metadata round-trips through
FileRecord.
"""
from collections.abc import Generator
from io import BytesIO
from typing import Any
from uuid import uuid4
import pytest
from sqlalchemy.orm import Session
from onyx.configs.constants import FileOrigin
from onyx.connectors.interfaces import BaseConnector
from onyx.db.file_record import delete_filerecord_by_file_id
from onyx.db.file_record import get_filerecord_by_file_id
from onyx.file_store.file_store import get_default_file_store
from onyx.file_store.staging import build_raw_file_callback
from onyx.file_store.staging import stage_raw_file
@pytest.fixture(scope="function")
def cleanup_file_ids(
db_session: Session,
) -> Generator[list[str], None, None]:
created: list[str] = []
yield created
file_store = get_default_file_store()
for fid in created:
try:
file_store.delete_file(fid)
except Exception:
delete_filerecord_by_file_id(file_id=fid, db_session=db_session)
db_session.commit()
def test_stage_raw_file_persists_with_origin_and_metadata(
db_session: Session,
tenant_context: None, # noqa: ARG001
initialize_file_store: None, # noqa: ARG001
cleanup_file_ids: list[str],
) -> None:
"""stage_raw_file writes a FileRecord with INDEXING_STAGING origin and
round-trips the provided metadata verbatim."""
metadata: dict[str, Any] = {
"index_attempt_id": 42,
"cc_pair_id": 7,
"tenant_id": "tenant-abc",
"extra": "payload",
}
content_bytes = b"hello raw file"
content_type = "application/pdf"
file_id = stage_raw_file(
content=BytesIO(content_bytes),
content_type=content_type,
metadata=metadata,
)
cleanup_file_ids.append(file_id)
db_session.commit()
record = get_filerecord_by_file_id(file_id=file_id, db_session=db_session)
assert record.file_origin == FileOrigin.INDEXING_STAGING
assert record.file_type == content_type
assert record.file_metadata == metadata
def test_build_raw_file_callback_binds_attempt_context_per_call(
db_session: Session,
tenant_context: None, # noqa: ARG001
initialize_file_store: None, # noqa: ARG001
cleanup_file_ids: list[str],
) -> None:
"""The callback returned by build_raw_file_callback must bind the
attempt-level context into every FileRecord it produces, without
leaking state across invocations."""
callback = build_raw_file_callback(
index_attempt_id=1001,
cc_pair_id=202,
tenant_id="tenant-xyz",
)
file_id_a = callback(BytesIO(b"alpha"), "text/plain")
file_id_b = callback(BytesIO(b"beta"), "application/octet-stream")
cleanup_file_ids.extend([file_id_a, file_id_b])
db_session.commit()
assert file_id_a != file_id_b
for fid, expected_content_type in (
(file_id_a, "text/plain"),
(file_id_b, "application/octet-stream"),
):
record = get_filerecord_by_file_id(file_id=fid, db_session=db_session)
assert record.file_origin == FileOrigin.INDEXING_STAGING
assert record.file_type == expected_content_type
assert record.file_metadata == {
"index_attempt_id": 1001,
"cc_pair_id": 202,
"tenant_id": "tenant-xyz",
}
def test_set_raw_file_callback_on_base_connector() -> None:
"""set_raw_file_callback must install the callback as an instance
attribute usable by the connector."""
class _MinimalConnector(BaseConnector):
def load_credentials(
self,
credentials: dict[str, Any], # noqa: ARG002
) -> dict[str, Any] | None:
return None
connector = _MinimalConnector()
assert connector.raw_file_callback is None
sentinel_file_id = f"sentinel-{uuid4().hex[:8]}"
def _fake_callback(_content: Any, _content_type: str) -> str:
return sentinel_file_id
connector.set_raw_file_callback(_fake_callback)
assert connector.raw_file_callback is _fake_callback
assert connector.raw_file_callback(BytesIO(b""), "text/plain") == sentinel_file_id

View File

@@ -8,10 +8,8 @@ import io
from typing import NamedTuple
import pytest
import requests
from onyx.file_store.models import FileDescriptor
from tests.integration.common_utils.constants import API_SERVER_URL
from tests.integration.common_utils.managers.chat import ChatSessionManager
from tests.integration.common_utils.managers.file import FileManager
from tests.integration.common_utils.managers.llm_provider import LLMProviderManager
@@ -121,31 +119,3 @@ def test_public_assistant_with_user_files(
assert (
len(chat_history) >= 2
), "Expected at least 2 messages (user message and assistant response)"
def test_cannot_download_other_users_file_via_chat_file_endpoint(
user_file_setup: UserFileTestSetup,
) -> None:
storage_file_id = user_file_setup.user1_file_descriptor["id"]
user_file_id = user_file_setup.user1_file_id
owner_response = requests.get(
f"{API_SERVER_URL}/chat/file/{storage_file_id}",
headers=user_file_setup.user1_file_owner.headers,
)
assert owner_response.status_code == 200
assert owner_response.content, "Owner should receive the file contents"
for file_id in (storage_file_id, user_file_id):
user2_response = requests.get(
f"{API_SERVER_URL}/chat/file/{file_id}",
headers=user_file_setup.user2_non_owner.headers,
)
assert user2_response.status_code in (
403,
404,
), (
f"Expected access denied for non-owner, got {user2_response.status_code} "
f"when fetching file_id={file_id}"
)
assert user2_response.content != owner_response.content

View File

@@ -69,9 +69,6 @@ class TestGongConnectorCheckpoint:
workspace_ids=["ws1", None],
workspace_index=1,
cursor="abc123",
pending_transcripts={"call1": _make_transcript("call1")},
pending_call_details_attempts=2,
pending_retry_after=1234567890.5,
)
json_str = original.model_dump_json()
restored = connector.validate_checkpoint_json(json_str)
@@ -234,11 +231,7 @@ class TestGongConnectorCheckpoint:
mock_request: MagicMock,
connector: GongConnector,
) -> None:
"""Missing call details persist across checkpoint invocations and
eventually yield ConnectorFailure once MAX_CALL_DETAILS_ATTEMPTS is hit.
No in-call sleep — retries happen on subsequent invocations, gated by
the wall-clock retry-after deadline on the checkpoint.
"""
"""When call details are missing after retries, yield ConnectorFailure."""
transcript_response = MagicMock()
transcript_response.status_code = 200
transcript_response.json.return_value = {
@@ -264,42 +257,23 @@ class TestGongConnectorCheckpoint:
failures: list[ConnectorFailure] = []
docs: list[Document] = []
# Jump the clock past any retry deadline on each invocation so we
# exercise the retry path without real sleeping. The test for the
# backoff-gate itself lives in test_backoff_gate_prevents_retry_too_soon.
fake_now = [1_000_000.0]
def _advance_clock() -> float:
fake_now[0] += 10_000.0
return fake_now[0]
invocation_cap = GongConnector.MAX_CALL_DETAILS_ATTEMPTS + 5
with patch(
"onyx.connectors.gong.connector.time.time", side_effect=_advance_clock
):
for _ in range(invocation_cap):
if not checkpoint.has_more:
break
generator = connector.load_from_checkpoint(0, fake_now[0], checkpoint)
try:
while True:
item = next(generator)
if isinstance(item, ConnectorFailure):
failures.append(item)
elif isinstance(item, Document):
docs.append(item)
except StopIteration as e:
checkpoint = e.value
with patch("onyx.connectors.gong.connector.time.sleep"):
generator = connector.load_from_checkpoint(0, time.time(), checkpoint)
try:
while True:
item = next(generator)
if isinstance(item, ConnectorFailure):
failures.append(item)
elif isinstance(item, Document):
docs.append(item)
except StopIteration as e:
checkpoint = e.value
assert len(docs) == 0
assert len(failures) == 1
assert failures[0].failed_document is not None
assert failures[0].failed_document.document_id == "call1"
assert checkpoint.has_more is False
assert checkpoint.pending_transcripts == {}
assert checkpoint.pending_call_details_attempts == 0
assert checkpoint.pending_retry_after is None
assert mock_request.call_count == 1 + GongConnector.MAX_CALL_DETAILS_ATTEMPTS
@patch.object(GongConnector, "_throttled_request")
def test_multi_workspace_iteration(
@@ -407,14 +381,12 @@ class TestGongConnectorCheckpoint:
assert checkpoint.workspace_index == 1
@patch.object(GongConnector, "_throttled_request")
def test_partial_details_defers_and_resolves_next_invocation(
def test_retry_only_fetches_missing_ids(
self,
mock_request: MagicMock,
connector: GongConnector,
) -> None:
"""A transcript whose call details are missing gets stashed into
pending_transcripts and resolves on a later checkpoint invocation.
Resolved docs are yielded in the order they become available."""
"""Retry for missing call details should only re-request the missing IDs."""
transcript_response = MagicMock()
transcript_response.status_code = 200
transcript_response.json.return_value = {
@@ -432,7 +404,7 @@ class TestGongConnectorCheckpoint:
"calls": [_make_call_detail("call1", "Call One")]
}
# Second fetch (next invocation): returns call2
# Second fetch (retry): returns call2
missing_details = MagicMock()
missing_details.status_code = 200
missing_details.json.return_value = {
@@ -452,48 +424,19 @@ class TestGongConnectorCheckpoint:
)
docs: list[Document] = []
fake_now = [1_000_000.0]
def _advance_clock() -> float:
fake_now[0] += 10_000.0
return fake_now[0]
with patch(
"onyx.connectors.gong.connector.time.time", side_effect=_advance_clock
):
# Invocation 1: fetches page + details, yields call1, stashes call2
generator = connector.load_from_checkpoint(0, fake_now[0], checkpoint)
with patch("onyx.connectors.gong.connector.time.sleep"):
generator = connector.load_from_checkpoint(0, time.time(), checkpoint)
try:
while True:
item = next(generator)
if isinstance(item, Document):
docs.append(item)
except StopIteration as e:
checkpoint = e.value
assert len(docs) == 1
assert docs[0].semantic_identifier == "Call One"
assert "call2" in checkpoint.pending_transcripts
assert checkpoint.pending_call_details_attempts == 1
assert checkpoint.pending_retry_after is not None
assert checkpoint.has_more is True
# Invocation 2: retries missing (only call2), yields it, clears pending
generator = connector.load_from_checkpoint(0, fake_now[0], checkpoint)
try:
while True:
item = next(generator)
if isinstance(item, Document):
docs.append(item)
except StopIteration as e:
checkpoint = e.value
except StopIteration:
pass
assert len(docs) == 2
assert docs[0].semantic_identifier == "Call One"
assert docs[1].semantic_identifier == "Call Two"
assert checkpoint.pending_transcripts == {}
assert checkpoint.pending_call_details_attempts == 0
assert checkpoint.pending_retry_after is None
# Verify: 3 API calls total (1 transcript + 1 full details + 1 retry for missing only)
assert mock_request.call_count == 3
@@ -501,107 +444,6 @@ class TestGongConnectorCheckpoint:
retry_call_body = mock_request.call_args_list[2][1]["json"]
assert retry_call_body["filter"]["callIds"] == ["call2"]
@patch.object(GongConnector, "_throttled_request")
def test_backoff_gate_prevents_retry_too_soon(
self,
mock_request: MagicMock,
connector: GongConnector,
) -> None:
"""If the retry-after deadline hasn't elapsed, _resolve_pending must
NOT issue a /v2/calls/extensive request. Prevents burning through
MAX_CALL_DETAILS_ATTEMPTS when workers re-invoke tightly.
"""
pending_transcript = _make_transcript("call1")
fixed_now = 1_000_000.0
# Deadline is 30s in the future from fixed_now
retry_after = fixed_now + 30
checkpoint = GongConnectorCheckpoint(
has_more=True,
workspace_ids=[None],
workspace_index=0,
pending_transcripts={"call1": pending_transcript},
pending_call_details_attempts=1,
pending_retry_after=retry_after,
)
with patch("onyx.connectors.gong.connector.time.time", return_value=fixed_now):
generator = connector.load_from_checkpoint(0, fixed_now, checkpoint)
try:
while True:
next(generator)
except StopIteration as e:
checkpoint = e.value
# No API calls should have been made — we were inside the backoff window
mock_request.assert_not_called()
# Pending state preserved for later retry
assert "call1" in checkpoint.pending_transcripts
assert checkpoint.pending_call_details_attempts == 1
assert checkpoint.pending_retry_after == retry_after
assert checkpoint.has_more is True
@patch.object(GongConnector, "_throttled_request")
def test_pending_retry_does_not_block_on_time_sleep(
self,
mock_request: MagicMock,
connector: GongConnector,
) -> None:
"""Pending-transcript retry must never call time.sleep() with a
non-trivial delay — spacing between retries is enforced via the
wall-clock retry-after deadline stored on the checkpoint, not by
blocking inside load_from_checkpoint.
"""
transcript_response = MagicMock()
transcript_response.status_code = 200
transcript_response.json.return_value = {
"callTranscripts": [_make_transcript("call1")],
"records": {},
}
empty_details = MagicMock()
empty_details.status_code = 200
empty_details.json.return_value = {"calls": []}
mock_request.side_effect = [transcript_response] + [
empty_details
] * GongConnector.MAX_CALL_DETAILS_ATTEMPTS
checkpoint = GongConnectorCheckpoint(
has_more=True,
workspace_ids=[None],
workspace_index=0,
)
fake_now = [1_000_000.0]
def _advance_clock() -> float:
fake_now[0] += 10_000.0
return fake_now[0]
with (
patch("onyx.connectors.gong.connector.time.sleep") as mock_sleep,
patch(
"onyx.connectors.gong.connector.time.time", side_effect=_advance_clock
),
):
invocation_cap = GongConnector.MAX_CALL_DETAILS_ATTEMPTS + 5
for _ in range(invocation_cap):
if not checkpoint.has_more:
break
generator = connector.load_from_checkpoint(0, fake_now[0], checkpoint)
try:
while True:
next(generator)
except StopIteration as e:
checkpoint = e.value
# The only legitimate sleep is the sub-second throttle in
# _throttled_request (<= MIN_REQUEST_INTERVAL). Assert we never
# sleep for anything close to the per-retry backoff delays.
for call in mock_sleep.call_args_list:
delay_arg = call.args[0] if call.args else 0
assert delay_arg <= GongConnector.MIN_REQUEST_INTERVAL
@patch.object(GongConnector, "_throttled_request")
def test_expired_cursor_restarts_workspace(
self,

View File

@@ -287,140 +287,3 @@ class TestFailedFolderIdsByEmail:
)
assert len(failed_map) == 0
class TestOrphanedPathBackfill:
def _make_failed_map(
self, entries: dict[str, set[str]]
) -> ThreadSafeDict[str, ThreadSafeSet[str]]:
return ThreadSafeDict({k: ThreadSafeSet(v) for k, v in entries.items()})
def _make_file(self, parent_id: str) -> MagicMock:
file = MagicMock()
file.user_email = "retriever@example.com"
file.drive_file = {"parents": [parent_id]}
return file
def test_backfills_intermediate_folders_into_failed_map(self) -> None:
"""When a walk dead-ends at a confirmed orphan, all intermediate folder
IDs must be added to failed_folder_ids_by_email for both emails so
future files short-circuit via _get_folder_metadata's cache check."""
connector = _make_connector()
# Chain: folderA -> folderB -> folderC (confirmed orphan)
failed_map = self._make_failed_map(
{
"retriever@example.com": {"folderC"},
"admin@example.com": {"folderC"},
}
)
folder_a = {"id": "folderA", "name": "A", "parents": ["folderB"]}
folder_b = {"id": "folderB", "name": "B", "parents": ["folderC"]}
def mock_get_folder(
_service: MagicMock, folder_id: str, _field_type: DriveFileFieldType
) -> dict | None:
if folder_id == "folderA":
return folder_a
if folder_id == "folderB":
return folder_b
return None
with (
patch(
"onyx.connectors.google_drive.connector.get_drive_service",
return_value=MagicMock(),
),
patch(
"onyx.connectors.google_drive.connector.get_folder_metadata",
side_effect=mock_get_folder,
),
):
connector._get_new_ancestors_for_files(
files=[self._make_file("folderA")],
seen_hierarchy_node_raw_ids=ThreadSafeSet(),
fully_walked_hierarchy_node_raw_ids=ThreadSafeSet(),
failed_folder_ids_by_email=failed_map,
)
# Both emails confirmed folderC as orphan, so both get the backfill
for email in ("retriever@example.com", "admin@example.com"):
cached = failed_map.get(email, ThreadSafeSet())
assert "folderA" in cached
assert "folderB" in cached
assert "folderC" in cached
def test_backfills_only_for_confirming_email(self) -> None:
"""Only the email that confirmed the orphan gets the path backfilled."""
connector = _make_connector()
# Only retriever confirmed folderC as orphan; admin has no entry
failed_map = self._make_failed_map({"retriever@example.com": {"folderC"}})
folder_a = {"id": "folderA", "name": "A", "parents": ["folderB"]}
folder_b = {"id": "folderB", "name": "B", "parents": ["folderC"]}
def mock_get_folder(
_service: MagicMock, folder_id: str, _field_type: DriveFileFieldType
) -> dict | None:
if folder_id == "folderA":
return folder_a
if folder_id == "folderB":
return folder_b
return None
with (
patch(
"onyx.connectors.google_drive.connector.get_drive_service",
return_value=MagicMock(),
),
patch(
"onyx.connectors.google_drive.connector.get_folder_metadata",
side_effect=mock_get_folder,
),
):
connector._get_new_ancestors_for_files(
files=[self._make_file("folderA")],
seen_hierarchy_node_raw_ids=ThreadSafeSet(),
fully_walked_hierarchy_node_raw_ids=ThreadSafeSet(),
failed_folder_ids_by_email=failed_map,
)
retriever_cached = failed_map.get("retriever@example.com", ThreadSafeSet())
assert "folderA" in retriever_cached
assert "folderB" in retriever_cached
# admin did not confirm the orphan — must not get the backfill
assert failed_map.get("admin@example.com") is None
def test_short_circuits_on_backfilled_intermediate(self) -> None:
"""A second file whose parent is already in failed_folder_ids_by_email
must not trigger any folder metadata API calls."""
connector = _make_connector()
# folderA already in the failed map from a previous walk
failed_map = self._make_failed_map(
{
"retriever@example.com": {"folderA"},
"admin@example.com": {"folderA"},
}
)
with (
patch(
"onyx.connectors.google_drive.connector.get_drive_service",
return_value=MagicMock(),
),
patch(
"onyx.connectors.google_drive.connector.get_folder_metadata"
) as mock_api,
):
connector._get_new_ancestors_for_files(
files=[self._make_file("folderA")],
seen_hierarchy_node_raw_ids=ThreadSafeSet(),
fully_walked_hierarchy_node_raw_ids=ThreadSafeSet(),
failed_folder_ids_by_email=failed_map,
)
mock_api.assert_not_called()

View File

@@ -1,243 +0,0 @@
"""Unit tests for WebConnector.retrieve_all_slim_docs (slim pruning path)."""
from __future__ import annotations
from typing import Any
from unittest.mock import MagicMock
from unittest.mock import patch
from onyx.connectors.models import SlimDocument
from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS
from onyx.connectors.web.connector import WebConnector
BASE_URL = "http://example.com"
SINGLE_PAGE_HTML = (
"<html><body><p>Content that should not appear in slim output</p></body></html>"
)
RECURSIVE_ROOT_HTML = """
<html><body>
<a href="/page2">Page 2</a>
<a href="/page3">Page 3</a>
</body></html>
"""
PAGE2_HTML = "<html><body><p>page 2</p></body></html>"
PAGE3_HTML = "<html><body><p>page 3</p></body></html>"
def _make_playwright_context_mock(url_to_html: dict[str, str]) -> MagicMock:
"""Return a BrowserContext mock whose pages respond based on goto URL."""
context = MagicMock()
def _new_page() -> MagicMock:
page = MagicMock()
visited: list[str] = []
def _goto(url: str, **kwargs: Any) -> MagicMock: # noqa: ARG001
visited.append(url)
page.url = url
response = MagicMock()
response.status = 200
response.header_value.return_value = None # no cf-ray
return response
def _content() -> str:
return url_to_html.get(
visited[-1] if visited else "", "<html><body></body></html>"
)
page.goto.side_effect = _goto
page.content.side_effect = _content
return page
context.new_page.side_effect = _new_page
return context
def _make_playwright_mock() -> MagicMock:
playwright = MagicMock()
playwright.stop = MagicMock()
return playwright
def _make_page_mock(
html: str, cf_ray: str | None = None, status: int = 200
) -> MagicMock:
"""Return a Playwright page mock with configurable status and CF header."""
page = MagicMock()
page.url = BASE_URL + "/"
response = MagicMock()
response.status = status
response.header_value.side_effect = lambda h: cf_ray if h == "cf-ray" else None
page.goto.return_value = response
page.content.return_value = html
return page
@patch("onyx.connectors.web.connector.check_internet_connection")
@patch("onyx.connectors.web.connector.requests.head")
@patch("onyx.connectors.web.connector.start_playwright")
def test_slim_yields_slim_documents(
mock_start_playwright: MagicMock,
mock_head: MagicMock,
_mock_check: MagicMock,
) -> None:
"""retrieve_all_slim_docs yields SlimDocuments with the correct URL as id."""
context = _make_playwright_context_mock({BASE_URL + "/": SINGLE_PAGE_HTML})
mock_start_playwright.return_value = (_make_playwright_mock(), context)
mock_head.return_value.headers = {"content-type": "text/html"}
connector = WebConnector(
base_url=BASE_URL + "/",
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
)
docs = [doc for batch in connector.retrieve_all_slim_docs() for doc in batch]
assert len(docs) == 1
assert isinstance(docs[0], SlimDocument)
assert docs[0].id == BASE_URL + "/"
@patch("onyx.connectors.web.connector.check_internet_connection")
@patch("onyx.connectors.web.connector.requests.head")
@patch("onyx.connectors.web.connector.start_playwright")
def test_slim_skips_content_extraction(
mock_start_playwright: MagicMock,
mock_head: MagicMock,
_mock_check: MagicMock,
) -> None:
"""web_html_cleanup is never called in slim mode."""
context = _make_playwright_context_mock({BASE_URL + "/": SINGLE_PAGE_HTML})
mock_start_playwright.return_value = (_make_playwright_mock(), context)
mock_head.return_value.headers = {"content-type": "text/html"}
connector = WebConnector(
base_url=BASE_URL + "/",
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
)
with patch("onyx.connectors.web.connector.web_html_cleanup") as mock_cleanup:
list(connector.retrieve_all_slim_docs())
mock_cleanup.assert_not_called()
@patch("onyx.connectors.web.connector.check_internet_connection")
@patch("onyx.connectors.web.connector.requests.head")
@patch("onyx.connectors.web.connector.start_playwright")
def test_slim_discovers_links_recursively(
mock_start_playwright: MagicMock,
mock_head: MagicMock,
_mock_check: MagicMock,
) -> None:
"""In RECURSIVE mode, internal <a href> links are followed and all URLs yielded."""
url_to_html = {
BASE_URL + "/": RECURSIVE_ROOT_HTML,
BASE_URL + "/page2": PAGE2_HTML,
BASE_URL + "/page3": PAGE3_HTML,
}
context = _make_playwright_context_mock(url_to_html)
mock_start_playwright.return_value = (_make_playwright_mock(), context)
mock_head.return_value.headers = {"content-type": "text/html"}
connector = WebConnector(
base_url=BASE_URL + "/",
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value,
)
ids = {
doc.id
for batch in connector.retrieve_all_slim_docs()
for doc in batch
if isinstance(doc, SlimDocument)
}
assert ids == {
BASE_URL + "/",
BASE_URL + "/page2",
BASE_URL + "/page3",
}
@patch("onyx.connectors.web.connector.check_internet_connection")
@patch("onyx.connectors.web.connector.requests.head")
@patch("onyx.connectors.web.connector.start_playwright")
def test_normal_200_skips_5s_wait(
mock_start_playwright: MagicMock,
mock_head: MagicMock,
_mock_check: MagicMock,
) -> None:
"""Normal 200 responses without bot-detection signals skip the 5s render wait."""
page = _make_page_mock(SINGLE_PAGE_HTML, cf_ray=None, status=200)
context = MagicMock()
context.new_page.return_value = page
mock_start_playwright.return_value = (_make_playwright_mock(), context)
mock_head.return_value.headers = {"content-type": "text/html"}
connector = WebConnector(
base_url=BASE_URL + "/",
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
)
list(connector.retrieve_all_slim_docs())
page.wait_for_timeout.assert_not_called()
@patch("onyx.connectors.web.connector.check_internet_connection")
@patch("onyx.connectors.web.connector.requests.head")
@patch("onyx.connectors.web.connector.start_playwright")
def test_cloudflare_applies_5s_wait(
mock_start_playwright: MagicMock,
mock_head: MagicMock,
_mock_check: MagicMock,
) -> None:
"""Pages with a cf-ray header trigger the 5s wait before networkidle."""
page = _make_page_mock(SINGLE_PAGE_HTML, cf_ray="abc123-LAX")
context = MagicMock()
context.new_page.return_value = page
mock_start_playwright.return_value = (_make_playwright_mock(), context)
mock_head.return_value.headers = {"content-type": "text/html"}
connector = WebConnector(
base_url=BASE_URL + "/",
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
)
list(connector.retrieve_all_slim_docs())
page.wait_for_timeout.assert_called_once_with(5000)
@patch("onyx.connectors.web.connector.time")
@patch("onyx.connectors.web.connector.check_internet_connection")
@patch("onyx.connectors.web.connector.requests.head")
@patch("onyx.connectors.web.connector.start_playwright")
def test_403_applies_5s_wait(
mock_start_playwright: MagicMock,
mock_head: MagicMock,
_mock_check: MagicMock,
_mock_time: MagicMock,
) -> None:
"""A 403 response triggers the 5s wait (common bot-detection challenge entry point)."""
page = _make_page_mock(SINGLE_PAGE_HTML, cf_ray=None, status=403)
context = MagicMock()
context.new_page.return_value = page
mock_start_playwright.return_value = (_make_playwright_mock(), context)
mock_head.return_value.headers = {"content-type": "text/html"}
connector = WebConnector(
base_url=BASE_URL + "/",
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
)
# All retries return 403 so no docs are found — that's expected here.
# We only care that the 5s wait fired.
try:
list(connector.retrieve_all_slim_docs())
except RuntimeError:
pass
page.wait_for_timeout.assert_called_with(5000)

View File

@@ -1,11 +1,13 @@
import io
from typing import cast
from unittest.mock import MagicMock
from unittest.mock import patch
import openpyxl
from openpyxl.worksheet.worksheet import Worksheet
from onyx.file_processing.extract_file_text import _sheet_to_csv
from onyx.file_processing.extract_file_text import _clean_worksheet_matrix
from onyx.file_processing.extract_file_text import _worksheet_to_matrix
from onyx.file_processing.extract_file_text import xlsx_sheet_extraction
from onyx.file_processing.extract_file_text import xlsx_to_text
@@ -201,179 +203,50 @@ class TestXlsxToText:
assert "r3c1" in lines[2] and "r3c2" in lines[2]
class TestSheetToCsvJaggedRows:
"""openpyxl's read-only mode yields rows of differing widths when
trailing cells are empty. These tests exercise ``_sheet_to_csv``
directly because ``_make_xlsx`` (via ``ws.append``) normalizes row
widths, so jagged input can only be produced in-memory."""
class TestWorksheetToMatrixJaggedRows:
"""openpyxl read_only mode can yield rows of differing widths when
trailing cells are empty. The matrix must be padded to a rectangle
so downstream column cleanup can index safely."""
def test_shorter_trailing_rows_padded_in_output(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
("A", "B", "C"),
("X", "Y"),
("P",),
]
)
def test_pads_shorter_trailing_rows(self) -> None:
ws = MagicMock()
ws.iter_rows.return_value = iter(
[
("A", "B", "C"),
("X", "Y"),
("P",),
]
)
assert csv_text.split("\n") == ["A,B,C", "X,Y,", "P,,"]
matrix = _worksheet_to_matrix(ws)
assert matrix == [["A", "B", "C"], ["X", "Y", ""], ["P", "", ""]]
def test_shorter_leading_row_padded_in_output(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
("A",),
("X", "Y", "Z"),
]
)
def test_pads_when_first_row_is_shorter(self) -> None:
ws = MagicMock()
ws.iter_rows.return_value = iter(
[
("A",),
("X", "Y", "Z"),
]
)
assert csv_text.split("\n") == ["A,,", "X,Y,Z"]
matrix = _worksheet_to_matrix(ws)
assert matrix == [["A", "", ""], ["X", "Y", "Z"]]
def test_no_index_error_on_jagged_rows(self) -> None:
"""Regression: the original dense-matrix version raised IndexError
when a later row was shorter than an earlier row whose out-of-range
columns happened to be empty."""
csv_text = _sheet_to_csv(
iter(
[
("A", "", "", "B"),
("X", "Y"),
]
)
def test_clean_worksheet_matrix_no_index_error_on_jagged_rows(self) -> None:
"""Regression: previously raised IndexError when a later row was
shorter than the first row and the out-of-range column on the
first row was empty (so the short-circuit in `all()` did not
save us)."""
ws = MagicMock()
ws.iter_rows.return_value = iter(
[
("A", "", "", "B"),
("X", "Y"),
]
)
assert csv_text.split("\n") == ["A,,,B", "X,Y,,"]
class TestSheetToCsvStreaming:
"""Pin the memory-safe streaming contract: empty rows are skipped
cheaply, empty-row/column runs are collapsed to at most 2, and sheets
with no data return the empty string."""
def test_empty_rows_between_data_capped_at_two(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
("A", "B"),
(None, None),
(None, None),
(None, None),
(None, None),
(None, None),
("C", "D"),
]
)
)
# 5 empty rows collapsed to 2
assert csv_text.split("\n") == ["A,B", ",", ",", "C,D"]
def test_empty_rows_at_or_below_cap_preserved(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
("A", "B"),
(None, None),
(None, None),
("C", "D"),
]
)
)
assert csv_text.split("\n") == ["A,B", ",", ",", "C,D"]
def test_empty_column_run_capped_at_two(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
("A", None, None, None, None, "B"),
("C", None, None, None, None, "D"),
]
)
)
# 4 empty cols between A and B collapsed to 2
assert csv_text.split("\n") == ["A,,,B", "C,,,D"]
def test_completely_empty_stream_returns_empty_string(self) -> None:
assert _sheet_to_csv(iter([])) == ""
def test_all_rows_empty_returns_empty_string(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
(None, None),
("", ""),
(None,),
]
)
)
assert csv_text == ""
def test_trailing_empty_rows_dropped(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
("A",),
("B",),
(None,),
(None,),
(None,),
]
)
)
# Trailing empties are never emitted (no subsequent non-empty row
# to flush them against).
assert csv_text.split("\n") == ["A", "B"]
def test_leading_empty_rows_capped_at_two(self) -> None:
csv_text = _sheet_to_csv(
iter(
[
(None, None),
(None, None),
(None, None),
(None, None),
(None, None),
("A", "B"),
]
)
)
# 5 leading empty rows collapsed to 2
assert csv_text.split("\n") == [",", ",", "A,B"]
def test_cell_cap_truncates_and_appends_marker(self) -> None:
"""When total non-empty cells exceeds the cap, scanning stops and
a truncation marker row is appended so downstream indexing sees
the sheet was cut off."""
with patch(
"onyx.file_processing.extract_file_text.MAX_XLSX_CELLS_PER_SHEET", 5
):
csv_text = _sheet_to_csv(
iter(
[
("A", "B", "C"),
("D", "E", "F"),
("G", "H", "I"),
("J", "K", "L"),
]
)
)
lines = csv_text.split("\n")
assert lines[-1] == "[truncated: sheet exceeded cell limit]"
# First two rows (6 cells) trip the cap=5 check after row 2; the
# third and fourth rows are never scanned.
assert "G" not in csv_text
assert "J" not in csv_text
def test_cell_cap_not_hit_no_marker(self) -> None:
"""Under the cap, no truncation marker is appended."""
csv_text = _sheet_to_csv(
iter(
[
("A", "B"),
("C", "D"),
]
)
)
assert "[truncated" not in csv_text
matrix = _worksheet_to_matrix(ws)
# Must not raise.
cleaned = _clean_worksheet_matrix(matrix)
assert cleaned == [["A", "", "", "B"], ["X", "Y", "", ""]]
class TestXlsxSheetExtraction:
@@ -408,9 +281,10 @@ class TestXlsxSheetExtraction:
assert "a" in csv_text
assert "b" in csv_text
def test_empty_sheet_included_with_empty_csv(self) -> None:
"""Every sheet in the workbook appears in the result; an empty
sheet contributes an empty csv_text alongside its title."""
def test_empty_sheet_is_skipped(self) -> None:
"""A sheet whose CSV output is empty/whitespace-only should NOT
appear in the result — the `if csv_text.strip():` guard filters
it out."""
xlsx = _make_xlsx(
{
"Data": [["a", "b"]],
@@ -418,17 +292,14 @@ class TestXlsxSheetExtraction:
}
)
sheets = xlsx_sheet_extraction(xlsx)
assert len(sheets) == 2
titles = [title for _csv, title in sheets]
assert titles == ["Data", "Empty"]
empty_csv = next(csv_text for csv_text, title in sheets if title == "Empty")
assert empty_csv == ""
assert len(sheets) == 1
assert sheets[0][1] == "Data"
def test_empty_workbook_returns_one_tuple_per_sheet(self) -> None:
"""All sheets empty → one empty-csv tuple per sheet."""
def test_empty_workbook_returns_empty_list(self) -> None:
"""All sheets empty → empty list (not a list of empty tuples)."""
xlsx = _make_xlsx({"Sheet1": [], "Sheet2": []})
sheets = xlsx_sheet_extraction(xlsx)
assert sheets == [("", "Sheet1"), ("", "Sheet2")]
assert sheets == []
def test_single_sheet(self) -> None:
xlsx = _make_xlsx({"Only": [["x", "y"], ["1", "2"]]})

View File

@@ -5,7 +5,7 @@ home: https://www.onyx.app/
sources:
- "https://github.com/onyx-dot-app/onyx"
type: application
version: 0.4.47
version: 0.4.46
appVersion: latest
annotations:
category: Productivity

View File

@@ -1,8 +0,0 @@
{{- range .Values.extraManifests }}
---
{{- if kindIs "string" . }}
{{ tpl . $ }}
{{- else }}
{{ tpl (toYaml .) $ }}
{{- end }}
{{- end }}

View File

@@ -1316,26 +1316,3 @@ configMap:
HARD_DELETE_CHATS: ""
MAX_ALLOWED_UPLOAD_SIZE_MB: ""
DEFAULT_USER_FILE_MAX_UPLOAD_SIZE_MB: ""
# -- Additional arbitrary manifests to render as part of the release. Each entry
# may be either a YAML mapping (a single Kubernetes object) or a multi-line
# string that will be passed through `tpl` so it can reference release values.
# Useful for injecting resources not covered by the chart (e.g. NetworkPolicies,
# ExternalSecrets, custom CRs) without forking the chart.
extraManifests: []
# extraManifests:
# - apiVersion: v1
# kind: ConfigMap
# metadata:
# name: my-extra-config
# data:
# key: value
# - |
# apiVersion: networking.k8s.io/v1
# kind: NetworkPolicy
# metadata:
# name: {{ include "onyx.fullname" . }}-extra
# spec:
# podSelector: {}
# policyTypes:
# - Ingress

View File

@@ -82,28 +82,6 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
[[package]]
name = "aws-lc-rs"
version = "1.16.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc"
dependencies = [
"aws-lc-sys",
"zeroize",
]
[[package]]
name = "aws-lc-sys"
version = "0.39.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83a25cf98105baa966497416dbd42565ce3a8cf8dbfd59803ec9ad46f3126399"
dependencies = [
"cc",
"cmake",
"dunce",
"fs_extra",
]
[[package]]
name = "base64"
version = "0.21.7"
@@ -277,8 +255,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2"
dependencies = [
"find-msvc-tools",
"jobserver",
"libc",
"shlex",
]
@@ -315,12 +291,6 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "cfg_aliases"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
[[package]]
name = "chrono"
version = "0.4.44"
@@ -333,15 +303,6 @@ dependencies = [
"windows-link 0.2.1",
]
[[package]]
name = "cmake"
version = "0.1.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678"
dependencies = [
"cc",
]
[[package]]
name = "combine"
version = "4.6.7"
@@ -819,12 +780,6 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "fs_extra"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
[[package]]
name = "futf"
version = "0.1.5"
@@ -1042,10 +997,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
dependencies = [
"cfg-if",
"js-sys",
"libc",
"wasi 0.11.1+wasi-snapshot-preview1",
"wasm-bindgen",
]
[[package]]
@@ -1055,11 +1008,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
dependencies = [
"cfg-if",
"js-sys",
"libc",
"r-efi 5.3.0",
"wasip2",
"wasm-bindgen",
]
[[package]]
@@ -1334,22 +1285,6 @@ dependencies = [
"want",
]
[[package]]
name = "hyper-rustls"
version = "0.27.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
dependencies = [
"http",
"hyper",
"hyper-util",
"rustls",
"rustls-pki-types",
"tokio",
"tokio-rustls",
"tower-service",
]
[[package]]
name = "hyper-util"
version = "0.1.20"
@@ -1652,16 +1587,6 @@ version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130"
[[package]]
name = "jobserver"
version = "0.1.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
dependencies = [
"getrandom 0.3.4",
"libc",
]
[[package]]
name = "js-sys"
version = "0.3.91"
@@ -1793,12 +1718,6 @@ version = "0.4.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
[[package]]
name = "lru-slab"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
[[package]]
name = "mac"
version = "0.1.1"
@@ -2122,7 +2041,6 @@ name = "onyx"
version = "0.0.0-dev"
dependencies = [
"directories",
"reqwest",
"serde",
"serde_json",
"tauri",
@@ -2147,12 +2065,6 @@ dependencies = [
"pathdiff",
]
[[package]]
name = "openssl-probe"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe"
[[package]]
name = "option-ext"
version = "0.2.0"
@@ -2543,62 +2455,6 @@ dependencies = [
"memchr",
]
[[package]]
name = "quinn"
version = "0.11.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
dependencies = [
"bytes",
"cfg_aliases",
"pin-project-lite",
"quinn-proto",
"quinn-udp",
"rustc-hash",
"rustls",
"socket2",
"thiserror 2.0.18",
"tokio",
"tracing",
"web-time",
]
[[package]]
name = "quinn-proto"
version = "0.11.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
dependencies = [
"aws-lc-rs",
"bytes",
"getrandom 0.3.4",
"lru-slab",
"rand 0.9.2",
"ring",
"rustc-hash",
"rustls",
"rustls-pki-types",
"slab",
"thiserror 2.0.18",
"tinyvec",
"tracing",
"web-time",
]
[[package]]
name = "quinn-udp"
version = "0.5.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
dependencies = [
"cfg_aliases",
"libc",
"once_cell",
"socket2",
"tracing",
"windows-sys 0.59.0",
]
[[package]]
name = "quote"
version = "1.0.45"
@@ -2645,16 +2501,6 @@ dependencies = [
"rand_core 0.6.4",
]
[[package]]
name = "rand"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
dependencies = [
"rand_chacha 0.9.0",
"rand_core 0.9.5",
]
[[package]]
name = "rand_chacha"
version = "0.2.2"
@@ -2675,16 +2521,6 @@ dependencies = [
"rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
"rand_core 0.9.5",
]
[[package]]
name = "rand_core"
version = "0.5.1"
@@ -2703,15 +2539,6 @@ dependencies = [
"getrandom 0.2.17",
]
[[package]]
name = "rand_core"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
dependencies = [
"getrandom 0.3.4",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
@@ -2830,21 +2657,15 @@ dependencies = [
"http-body",
"http-body-util",
"hyper",
"hyper-rustls",
"hyper-util",
"js-sys",
"log",
"percent-encoding",
"pin-project-lite",
"quinn",
"rustls",
"rustls-pki-types",
"rustls-platform-verifier",
"serde",
"serde_json",
"sync_wrapper",
"tokio",
"tokio-rustls",
"tokio-util",
"tower",
"tower-http",
@@ -2856,26 +2677,6 @@ dependencies = [
"web-sys",
]
[[package]]
name = "ring"
version = "0.17.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
dependencies = [
"cc",
"cfg-if",
"getrandom 0.2.17",
"libc",
"untrusted",
"windows-sys 0.52.0",
]
[[package]]
name = "rustc-hash"
version = "2.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
[[package]]
name = "rustc_version"
version = "0.4.1"
@@ -2885,81 +2686,6 @@ dependencies = [
"semver",
]
[[package]]
name = "rustls"
version = "0.23.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4"
dependencies = [
"aws-lc-rs",
"once_cell",
"rustls-pki-types",
"rustls-webpki",
"subtle",
"zeroize",
]
[[package]]
name = "rustls-native-certs"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63"
dependencies = [
"openssl-probe",
"rustls-pki-types",
"schannel",
"security-framework",
]
[[package]]
name = "rustls-pki-types"
version = "1.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd"
dependencies = [
"web-time",
"zeroize",
]
[[package]]
name = "rustls-platform-verifier"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d99feebc72bae7ab76ba994bb5e121b8d83d910ca40b36e0921f53becc41784"
dependencies = [
"core-foundation",
"core-foundation-sys",
"jni",
"log",
"once_cell",
"rustls",
"rustls-native-certs",
"rustls-platform-verifier-android",
"rustls-webpki",
"security-framework",
"security-framework-sys",
"webpki-root-certs",
"windows-sys 0.59.0",
]
[[package]]
name = "rustls-platform-verifier-android"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f"
[[package]]
name = "rustls-webpki"
version = "0.103.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef"
dependencies = [
"aws-lc-rs",
"ring",
"rustls-pki-types",
"untrusted",
]
[[package]]
name = "rustversion"
version = "1.0.22"
@@ -2975,15 +2701,6 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "schannel"
version = "0.1.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939"
dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "schemars"
version = "0.8.22"
@@ -3041,29 +2758,6 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "security-framework"
version = "3.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d"
dependencies = [
"bitflags 2.11.0",
"core-foundation",
"core-foundation-sys",
"libc",
"security-framework-sys",
]
[[package]]
name = "security-framework-sys"
version = "2.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "selectors"
version = "0.24.0"
@@ -3434,12 +3128,6 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "subtle"
version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]]
name = "swift-rs"
version = "1.0.7"
@@ -3921,21 +3609,6 @@ dependencies = [
"zerovec",
]
[[package]]
name = "tinyvec"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.50.0"
@@ -3950,16 +3623,6 @@ dependencies = [
"windows-sys 0.61.2",
]
[[package]]
name = "tokio-rustls"
version = "0.26.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61"
dependencies = [
"rustls",
"tokio",
]
[[package]]
name = "tokio-util"
version = "0.7.18"
@@ -4241,12 +3904,6 @@ version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
[[package]]
name = "untrusted"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
[[package]]
name = "url"
version = "2.5.8"
@@ -4493,16 +4150,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "web-time"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "webkit2gtk"
version = "2.0.2"
@@ -4547,15 +4194,6 @@ dependencies = [
"system-deps",
]
[[package]]
name = "webpki-root-certs"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca"
dependencies = [
"rustls-pki-types",
]
[[package]]
name = "webview2-com"
version = "0.38.2"
@@ -4810,15 +4448,6 @@ dependencies = [
"windows-targets 0.48.5",
]
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-sys"
version = "0.59.0"
@@ -5359,12 +4988,6 @@ dependencies = [
"synstructure",
]
[[package]]
name = "zeroize"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
[[package]]
name = "zerotrie"
version = "0.2.3"

View File

@@ -19,7 +19,6 @@ directories = "5.0"
tokio = { version = "1", features = ["time"] }
window-vibrancy = "0.7.1"
url = "2.5"
reqwest = { version = "0.13", default-features = false, features = ["rustls"] }
[features]
default = ["custom-protocol"]

View File

@@ -583,31 +583,6 @@ fn open_in_default_browser(url: &str) -> bool {
false
}
#[tauri::command]
async fn check_server_reachable(state: tauri::State<'_, ConfigState>) -> Result<(), String> {
let url = state.config.read().unwrap().server_url.clone();
let parsed = Url::parse(&url).map_err(|e| format!("Invalid URL: {}", e))?;
match parsed.scheme() {
"http" | "https" => {}
_ => return Err("URL must use http or https".to_string()),
}
let client = reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(5))
.build()
.map_err(|e| format!("Failed to build HTTP client: {}", e))?;
match client.head(parsed).send().await {
Ok(_) => Ok(()),
// Only definitive "server didn't answer" errors count as unreachable.
// TLS / decode / redirect errors imply the server is listening — the
// webview, which has its own trust store, is likely to succeed even
// when rustls rejects a self-signed cert.
Err(e) if e.is_connect() || e.is_timeout() => Err(e.to_string()),
Err(_) => Ok(()),
}
}
#[tauri::command]
fn open_in_browser(url: String) -> Result<(), String> {
let parsed_url = Url::parse(&url).map_err(|_| "Invalid URL".to_string())?;
@@ -1319,7 +1294,6 @@ fn main() {
get_bootstrap_state,
set_server_url,
get_config_path_cmd,
check_server_reachable,
open_in_browser,
open_config_file,
open_config_directory,

View File

@@ -459,19 +459,8 @@
return;
}
// Not first launch and not explicit settings — confirm the server
// is reachable before handing the webview over. Otherwise the user
// lands on a native "connection refused" page with no way back.
try {
await invoke("check_server_reachable");
} catch (reachErr) {
showSettings();
showError(
`Could not connect to ${currentServerUrl}. Check the URL and your network connection.`,
);
return;
}
// Not first launch and not explicit settings
// Auto-redirect to configured domain
window.location.href = currentServerUrl;
} catch (error) {
// On error, default to cloud

View File

@@ -6,6 +6,7 @@ import { LlmDescriptor, LlmManager } from "@/lib/hooks";
import { structureValue } from "@/lib/llmConfig/utils";
import { getModelIcon } from "@/lib/llmConfig";
import { AGGREGATOR_PROVIDERS } from "@/lib/llmConfig/svc";
import { Slider } from "@/components/ui/slider";
import { useUser } from "@/providers/UserProvider";
import Text from "@/refresh-components/texts/Text";

View File

@@ -3,20 +3,18 @@
import { useState, useMemo, useRef, useEffect } from "react";
import { PopoverMenu } from "@/refresh-components/Popover";
import InputTypeIn from "@/refresh-components/inputs/InputTypeIn";
import { Button, LineItemButton, Text } from "@opal/components";
import { SvgCheck, SvgChevronRight } from "@opal/icons";
import { Text } from "@opal/components";
import { SvgCheck, SvgChevronDown, SvgChevronRight } from "@opal/icons";
import { Section } from "@/layouts/general-layouts";
import { LLMOption } from "./interfaces";
import { buildLlmOptions, groupLlmOptions } from "./LLMPopover";
import LineItem from "@/refresh-components/buttons/LineItem";
import { LLMProviderDescriptor } from "@/interfaces/llm";
import {
Collapsible,
CollapsibleContent,
CollapsibleTrigger,
} from "@/refresh-components/Collapsible";
import { cn } from "@opal/utils";
import { Interactive } from "@opal/core";
import { ContentAction } from "@opal/layouts";
export interface ModelListContentProps {
llmProviders: LLMProviderDescriptor[] | undefined;
@@ -116,22 +114,20 @@ export default function ModelListContent({
capabilities.length > 0 ? capabilities.join(", ") : undefined;
return (
<LineItemButton
<LineItem
key={`${option.provider}:${option.modelName}`}
selectVariant="select-heavy"
state={selected ? "selected" : "empty"}
icon={(props) => <div {...(props as any)} />}
title={option.displayName}
selected={selected}
disabled={disabled}
description={description}
onClick={() => onSelect(option)}
rightChildren={
selected ? (
<SvgCheck className="text-action-link-05" size={16} />
<SvgCheck className="h-4 w-4 stroke-action-link-05 shrink-0" />
) : null
}
sizePreset="main-ui"
rounding="sm"
/>
>
{option.displayName}
</LineItem>
);
};
@@ -160,7 +156,7 @@ export default function ModelListContent({
]
: groupedOptions.length === 1
? [
<Section key="single-provider" gap={1}>
<Section key="single-provider" gap={0.25}>
{groupedOptions[0]!.options.map(renderModelItem)}
</Section>,
]
@@ -171,45 +167,22 @@ export default function ModelListContent({
key={group.key}
open={open}
onOpenChange={() => toggleGroup(group.key)}
className="flex flex-col gap-1"
>
<CollapsibleTrigger asChild>
<Interactive.Stateless prominence="tertiary">
<Interactive.Container
size="fit"
rounding="sm"
width="full"
>
<div className="pl-2 pr-1 py-1 w-full">
<ContentAction
sizePreset="secondary"
variant="body"
prominence="muted"
icon={group.Icon}
title={group.displayName}
padding="fit"
rightChildren={
<Section>
<Button
icon={(props) => (
<SvgChevronRight
{...props}
className={cn(
"transition-all",
open && "rotate-90",
props.className
)}
/>
)}
prominence="tertiary"
size="sm"
/>
</Section>
}
/>
</div>
</Interactive.Container>
</Interactive.Stateless>
<LineItem
muted
icon={group.Icon}
strokeIcon={false}
rightChildren={
open ? (
<SvgChevronDown className="h-4 w-4 stroke-text-04 shrink-0" />
) : (
<SvgChevronRight className="h-4 w-4 stroke-text-04 shrink-0" />
)
}
>
{group.displayName}
</LineItem>
</CollapsibleTrigger>
<CollapsibleContent>

View File

@@ -51,16 +51,14 @@ test.describe("LLM Ordering", () => {
await page.waitForSelector('[role="dialog"]', { timeout: 5000 });
const dialog = page.locator('[role="dialog"]');
const allModelItems = dialog.locator("[data-interactive-state]");
const allModelItems = dialog.locator("[data-selected]");
await expect(allModelItems.first()).toBeVisible({ timeout: 5000 });
const count = await allModelItems.count();
expect(count).toBeGreaterThan(0);
// Pick the first non-selected model so the trigger text changes after click
const nonSelectedItem = dialog
.locator('[data-interactive-state="empty"]')
.first();
const nonSelectedItem = dialog.locator('[data-selected="false"]').first();
const hasNonSelected = (await nonSelectedItem.count()) > 0;
const targetItem = hasNonSelected ? nonSelectedItem : allModelItems.first();

View File

@@ -300,13 +300,11 @@ test.describe("LLM Runtime Selection", () => {
const regenerateDialog = page.locator('[role="dialog"]');
const alternateModelOption = regenerateDialog
.locator('[data-interactive-state="empty"]')
.locator('[data-selected="false"]')
.first();
test.skip(
(await regenerateDialog
.locator('[data-interactive-state="empty"]')
.count()) === 0,
(await regenerateDialog.locator('[data-selected="false"]').count()) === 0,
"Regenerate model picker requires at least two runtime model options"
);
@@ -410,11 +408,13 @@ test.describe("LLM Runtime Selection", () => {
const dialog = page.locator('[role="dialog"]');
await dialog.getByPlaceholder("Search models...").fill(sharedModelName);
const sharedModelOptions = dialog.locator("[data-interactive-state]");
const sharedModelOptions = dialog.locator("[data-selected]");
await expect(sharedModelOptions).toHaveCount(2);
// Two results with the same model name under different providers.
// Groups are sorted alphabetically, so OpenAI (index 1) comes after Anthropic (index 0).
const openAiModelOption = sharedModelOptions.nth(1);
const openAiModelOption = dialog
.getByRole("button", { name: /openai/i })
.locator("..")
.locator("[data-selected]")
.first();
await expect(openAiModelOption).toBeVisible();
await openAiModelOption.click();
await page.waitForSelector('[role="dialog"]', { state: "hidden" });
@@ -434,12 +434,13 @@ test.describe("LLM Runtime Selection", () => {
.getByPlaceholder("Search models...")
.fill(sharedModelName);
const secondSharedModelOptions = secondDialog.locator(
"[data-interactive-state]"
);
const secondSharedModelOptions = secondDialog.locator("[data-selected]");
await expect(secondSharedModelOptions).toHaveCount(2);
// Anthropic is index 0 (alphabetically first).
const anthropicModelOption = secondSharedModelOptions.nth(0);
const anthropicModelOption = secondDialog
.getByRole("button", { name: /anthropic/i })
.locator("..")
.locator("[data-selected]")
.first();
await expect(anthropicModelOption).toBeVisible();
await anthropicModelOption.click();
await page.waitForSelector('[role="dialog"]', { state: "hidden" });
@@ -447,11 +448,11 @@ test.describe("LLM Runtime Selection", () => {
await page.getByTestId("model-selector").locator("button").last().click();
await page.waitForSelector('[role="dialog"]', { state: "visible" });
const verifyDialog = page.locator('[role="dialog"]');
// Verify the Anthropic option (index 0) is selected.
const selectedOption = verifyDialog.locator(
'[data-interactive-state="selected"]'
);
await expect(selectedOption).toHaveCount(1);
const selectedAnthropicOption = verifyDialog
.getByRole("button", { name: /anthropic/i })
.locator("..")
.locator('[data-selected="true"]');
await expect(selectedAnthropicOption).toHaveCount(1);
await page.keyboard.press("Escape");
await page.waitForSelector('[role="dialog"]', { state: "hidden" });
@@ -517,7 +518,7 @@ test.describe("LLM Runtime Selection", () => {
await dialog.getByPlaceholder("Search models...").fill(restrictedModelName);
const restrictedModelOption = dialog
.locator("[data-interactive-state]")
.locator("[data-selected]")
.filter({ hasText: restrictedModelName });
await expect(restrictedModelOption).toHaveCount(0);

View File

@@ -85,10 +85,8 @@ export async function selectModelFromInputPopover(
for (const modelName of preferredModels) {
await searchInput.fill(modelName);
const modelOptions = dialog.locator("[data-interactive-state]");
const nonSelectedOptions = dialog.locator(
'[data-interactive-state="empty"]'
);
const modelOptions = dialog.locator("[data-selected]");
const nonSelectedOptions = dialog.locator('[data-selected="false"]');
if ((await modelOptions.count()) > 0) {
const candidate =
@@ -112,7 +110,7 @@ export async function selectModelFromInputPopover(
// Reset search so fallback sees all available models.
await searchInput.fill("");
const nonSelectedOptions = dialog.locator('[data-interactive-state="empty"]');
const nonSelectedOptions = dialog.locator('[data-selected="false"]');
if ((await nonSelectedOptions.count()) > 0) {
const fallback = nonSelectedOptions.first();
await expect(fallback).toBeVisible();
@@ -147,7 +145,7 @@ export async function switchModel(page: Page, modelName: string) {
const modelButton = page
.locator('[role="dialog"]')
.getByRole("button")
.locator('[role="button"]')
.filter({ hasText: modelName })
.first();