Compare commits

..

13 Commits

Author  SHA1  Message  Date
pablonyx  6fb85d53c9  quick nit  2025-02-19 11:28:13 -08:00
pablonyx  3b92cf2f38  rate limit github fix  2025-02-19 11:28:13 -08:00
pablonyx  65485e0ea1  k  2025-02-19 11:28:13 -08:00
pablonyx  67028782f0  k  2025-02-19 11:28:13 -08:00
pablonyx  09b14c68ca  full gmail fix  2025-02-19 11:28:13 -08:00
pablonyx  8347bfe5ee  k  2025-02-19 11:28:13 -08:00
pablonyx  bf175d0749  k  2025-02-19 11:28:13 -08:00
pablonyx  c892dd9c6f  finalize  2025-02-19 11:28:13 -08:00
pablonyx  bf51ac5dc0  update  2025-02-19 11:28:13 -08:00
pablonyx  353c185856  Update error class (#4006)  2025-02-19 10:52:23 -08:00
pablonyx  7c96b7f24e  minor alembic nit  2025-02-19 10:47:33 -08:00
pablonyx  31524a3eff  add connector validation (#4016)  2025-02-19 10:46:06 -08:00
rkuo-danswer  c9f618798e  support scrolling before scraping (#4040)  2025-02-19 17:54:58 +00:00
    * support scrolling before scraping
    * fix mypy
    * install playwright deps
    ---------
    Co-authored-by: Richard Kuo <rkuo@rkuo.com>
15 changed files with 237 additions and 40 deletions

View File

@@ -74,7 +74,9 @@ jobs:
python -m pip install --upgrade pip
pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+ playwright install chromium
+ playwright install-deps chromium
- name: Run Tests
shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
run: py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/connectors

View File

@@ -1,7 +1,7 @@
"""remove inactive ccpair status on downgrade
Revision ID: acaab4ef4507
- Revises: b7a7eee5aa15
+ Revises: b388730a2899
Create Date: 2025-02-16 18:21:41.330212
"""
@@ -12,7 +12,7 @@ from sqlalchemy import update
# revision identifiers, used by Alembic.
revision = "acaab4ef4507"
down_revision = "b7a7eee5aa15"
down_revision = "b388730a2899"
branch_labels = None
depends_on = None

View File

@@ -95,6 +95,7 @@ from onyx.db.models import User
from onyx.db.users import get_user_by_email
from onyx.redis.redis_pool import get_async_redis_connection
from onyx.redis.redis_pool import get_redis_client
+ from onyx.server.utils import BasicAuthenticationError
from onyx.utils.logger import setup_logger
from onyx.utils.telemetry import create_milestone_and_report
from onyx.utils.telemetry import optional_telemetry
@@ -109,11 +110,6 @@ from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()
- class BasicAuthenticationError(HTTPException):
- def __init__(self, detail: str):
- super().__init__(status_code=status.HTTP_403_FORBIDDEN, detail=detail)
def is_user_admin(user: User | None) -> bool:
if AUTH_TYPE == AuthType.DISABLED:
return True
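Note: the class removed here matches the new import above, so it presumably now lives in onyx.server.utils. A minimal sketch consistent with the removed definition (the new location is an assumption based on the import path):

    from fastapi import HTTPException, status

    class BasicAuthenticationError(HTTPException):
        # Same behavior as the definition removed from users.py: an HTTP 403
        # with a caller-supplied detail message. Assumed to live in
        # onyx/server/utils.py to match the new import.
        def __init__(self, detail: str):
            super().__init__(status_code=status.HTTP_403_FORBIDDEN, detail=detail)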

View File

@@ -5,6 +5,8 @@ import requests
class BookStackClientRequestFailedError(ConnectionError):
def __init__(self, status: int, error: str) -> None:
+ self.status_code = status
+ self.error = error
super().__init__(
"BookStack Client request failed with status {status}: {error}".format(
status=status, error=error
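Note: a small hedged sketch of how the new structured fields can be consumed; the constructor call below is illustrative, not taken from the diff. This is what the connector-side validation further down relies on:

    from onyx.connectors.bookstack.client import BookStackClientRequestFailedError

    # The exception now carries the HTTP status and error text, so callers can
    # branch on status_code instead of parsing the formatted message.
    try:
        raise BookStackClientRequestFailedError(401, "Unauthorized")
    except BookStackClientRequestFailedError as e:
        print(e.status_code, e.error)  # -> 401 Unauthorized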

View File

@@ -7,8 +7,12 @@ from typing import Any
from onyx.configs.app_configs import INDEX_BATCH_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.bookstack.client import BookStackApiClient
+ from onyx.connectors.bookstack.client import BookStackClientRequestFailedError
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
+ from onyx.connectors.interfaces import ConnectorValidationError
+ from onyx.connectors.interfaces import CredentialExpiredError
from onyx.connectors.interfaces import GenerateDocumentsOutput
+ from onyx.connectors.interfaces import InsufficientPermissionsError
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
@@ -214,3 +218,39 @@ class BookstackConnector(LoadConnector, PollConnector):
break
else:
time.sleep(0.2)
+ def validate_connector_settings(self) -> None:
+ """
+ Validate that the BookStack credentials and connector settings are correct.
+ Specifically checks that we can make an authenticated request to BookStack.
+ """
+ if not self.bookstack_client:
+ raise ConnectorMissingCredentialError(
+ "BookStack credentials have not been loaded."
+ )
+ try:
+ # Attempt to fetch a small batch of books (arbitrary endpoint) to verify credentials
+ _ = self.bookstack_client.get(
+ "/books", params={"count": "1", "offset": "0"}
+ )
+ except BookStackClientRequestFailedError as e:
+ # Check for HTTP status codes
+ if e.status_code == 401:
+ raise CredentialExpiredError(
+ "Your BookStack credentials appear to be invalid or expired (HTTP 401)."
+ ) from e
+ elif e.status_code == 403:
+ raise InsufficientPermissionsError(
+ "The configured BookStack token does not have sufficient permissions (HTTP 403)."
+ ) from e
+ else:
+ raise ConnectorValidationError(
+ f"Unexpected BookStack error (status={e.status_code}): {e}"
+ ) from e
+ except Exception as exc:
+ raise ConnectorValidationError(
+ f"Unexpected error while validating BookStack connector settings: {exc}"
+ ) from exc
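Note: for context, a minimal sketch of how a caller might surface these validation errors, assuming a BookstackConnector whose credentials were already loaded (constructor arguments are not shown in this diff):

    from onyx.connectors.bookstack.connector import BookstackConnector
    from onyx.connectors.interfaces import (
        ConnectorValidationError,
        CredentialExpiredError,
        InsufficientPermissionsError,
    )

    def describe_bookstack_validation(connector: BookstackConnector) -> str:
        # Maps the exceptions raised by validate_connector_settings() onto
        # short, user-facing messages.
        try:
            connector.validate_connector_settings()
        except CredentialExpiredError:
            return "BookStack token is invalid or expired (HTTP 401)"
        except InsufficientPermissionsError:
            return "BookStack token lacks permission to list books (HTTP 403)"
        except ConnectorValidationError as e:
            return f"BookStack validation failed: {e}"
        return "BookStack connector settings look valid"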

View File

@@ -1,3 +1,4 @@
+ import re
from collections.abc import Callable
from collections.abc import Iterator
from datetime import datetime
@@ -24,16 +25,22 @@ def datetime_to_utc(dt: datetime) -> datetime:
def time_str_to_utc(datetime_str: str) -> datetime:
+ # Remove all timezone abbreviations in parentheses
+ datetime_str = re.sub(r"\([A-Z]+\)", "", datetime_str).strip()
+ # Remove any remaining parentheses and their contents
+ datetime_str = re.sub(r"\(.*?\)", "", datetime_str).strip()
try:
dt = parse(datetime_str)
except ValueError:
- # Handle malformed timezone by attempting to fix common format issues
+ # Fix common format issues (e.g. "0000" => "+0000")
if "0000" in datetime_str:
- # Convert "0000" to "+0000" for proper timezone parsing
- fixed_dt_str = datetime_str.replace(" 0000", " +0000")
- dt = parse(fixed_dt_str)
+ datetime_str = datetime_str.replace(" 0000", " +0000")
+ dt = parse(datetime_str)
else:
raise
return datetime_to_utc(dt)
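Note: a couple of illustrative inputs (not from the diff) that exercise the new cleanup steps, i.e. stripping parenthesized timezone abbreviations and rewriting a bare "0000" offset as "+0000" before retrying dateutil's parse:

    from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc

    # "(UTC)" is removed by the new regexes before parsing.
    print(time_str_to_utc("Wed, 19 Feb 2025 11:28:13 +0000 (UTC)"))
    # A bare "0000" offset is the case the fallback path targets.
    print(time_str_to_utc("Wed, 19 Feb 2025 11:28:13 0000"))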

View File

@@ -210,7 +210,6 @@ def validate_ccpair_for_user(
tenant_id=tenant_id,
)
except Exception as e:
error_msg = f"Unexpected error creating connector: {e}"
raise ConnectorValidationError(error_msg)
raise ConnectorValidationError(str(e))
runnable_connector.validate_connector_settings()

View File

@@ -187,12 +187,12 @@ class FirefliesConnector(PollConnector, LoadConnector):
return self._process_transcripts()
def poll_source(
- self, start_unixtime: SecondsSinceUnixEpoch, end_unixtime: SecondsSinceUnixEpoch
+ self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
- start_datetime = datetime.fromtimestamp(
- start_unixtime, tz=timezone.utc
- ).strftime("%Y-%m-%dT%H:%M:%S.000Z")
- end_datetime = datetime.fromtimestamp(end_unixtime, tz=timezone.utc).strftime(
+ start_datetime = datetime.fromtimestamp(start, tz=timezone.utc).strftime(
+ "%Y-%m-%dT%H:%M:%S.000Z"
+ )
+ end_datetime = datetime.fromtimestamp(end, tz=timezone.utc).strftime(
"%Y-%m-%dT%H:%M:%S.000Z"
)
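Note: the rename does not change the timestamp formatting; a quick illustration with an arbitrary epoch value (the number below is an example, not from the diff):

    from datetime import datetime, timezone

    start = 1739987698  # example epoch, not from the diff
    print(datetime.fromtimestamp(start, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z"))
    # -> 2025-02-19T17:54:58.000Z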

View File

@@ -24,6 +24,7 @@ from onyx.connectors.interfaces import InsufficientPermissionsError
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.interfaces import PollConnector
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
+ from onyx.connectors.interfaces import UnexpectedError
from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
@@ -245,7 +246,7 @@ class GithubConnector(LoadConnector, PollConnector):
test_repo.get_contents("")
except RateLimitExceededException:
- raise ConnectorValidationError(
+ raise UnexpectedError(
"Validation failed due to GitHub rate-limits being exceeded. Please try again later."
)
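Note: a standalone sketch of the new behavior, assuming PyGithub: a rate-limited probe is reported as UnexpectedError (transient) rather than as a hard validation failure. The helper name and arguments are illustrative:

    from github import Github, RateLimitExceededException
    from onyx.connectors.interfaces import UnexpectedError

    def probe_repo(token: str, repo_name: str) -> None:
        # Mirrors the change above: the same content probe, but rate-limit
        # errors are surfaced as UnexpectedError so the caller can retry later.
        client = Github(token)
        try:
            client.get_repo(repo_name).get_contents("")
        except RateLimitExceededException:
            raise UnexpectedError(
                "Validation failed due to GitHub rate-limits being exceeded. Please try again later."
            )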

View File

@@ -297,6 +297,7 @@ class GmailConnector(LoadConnector, PollConnector, SlimConnector):
userId=user_email,
fields=THREAD_LIST_FIELDS,
q=query,
+ continue_on_404_or_403=True,
):
full_threads = execute_paginated_retrieval(
retrieval_function=gmail_service.users().threads().get,

View File

@@ -25,8 +25,12 @@ from onyx.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
from onyx.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
from onyx.configs.app_configs import WEB_CONNECTOR_VALIDATE_URLS
from onyx.configs.constants import DocumentSource
+ from onyx.connectors.interfaces import ConnectorValidationError
+ from onyx.connectors.interfaces import CredentialExpiredError
from onyx.connectors.interfaces import GenerateDocumentsOutput
+ from onyx.connectors.interfaces import InsufficientPermissionsError
from onyx.connectors.interfaces import LoadConnector
+ from onyx.connectors.interfaces import UnexpectedError
from onyx.connectors.models import Document
from onyx.connectors.models import Section
from onyx.file_processing.extract_file_text import read_pdf_file
@@ -37,6 +41,8 @@ from shared_configs.configs import MULTI_TENANT
logger = setup_logger()
+ WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
# Given a base site, index everything under that path
@@ -170,26 +176,35 @@ def start_playwright() -> Tuple[Playwright, BrowserContext]:
def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
- response = requests.get(sitemap_url)
- response.raise_for_status()
+ try:
+ response = requests.get(sitemap_url)
+ response.raise_for_status()
- soup = BeautifulSoup(response.content, "html.parser")
- urls = [
- _ensure_absolute_url(sitemap_url, loc_tag.text)
- for loc_tag in soup.find_all("loc")
- ]
+ soup = BeautifulSoup(response.content, "html.parser")
+ urls = [
+ _ensure_absolute_url(sitemap_url, loc_tag.text)
+ for loc_tag in soup.find_all("loc")
+ ]
- if len(urls) == 0 and len(soup.find_all("urlset")) == 0:
- # the given url doesn't look like a sitemap, let's try to find one
- urls = list_pages_for_site(sitemap_url)
+ if len(urls) == 0 and len(soup.find_all("urlset")) == 0:
+ # the given url doesn't look like a sitemap, let's try to find one
+ urls = list_pages_for_site(sitemap_url)
- if len(urls) == 0:
- raise ValueError(
- f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
+ if len(urls) == 0:
+ raise ValueError(
+ f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
+ )
+ return urls
+ except requests.RequestException as e:
+ raise RuntimeError(f"Failed to fetch sitemap from {sitemap_url}: {e}")
+ except ValueError as e:
+ raise RuntimeError(f"Error processing sitemap {sitemap_url}: {e}")
+ except Exception as e:
+ raise RuntimeError(
+ f"Unexpected error while processing sitemap {sitemap_url}: {e}"
+ )
- return urls
def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:
if not urlparse(maybe_relative_url).netloc:
@@ -225,10 +240,14 @@ class WebConnector(LoadConnector):
web_connector_type: str = WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value,
mintlify_cleanup: bool = True, # Mostly ok to apply to other websites as well
batch_size: int = INDEX_BATCH_SIZE,
+ scroll_before_scraping: bool = False,
**kwargs: Any,
) -> None:
self.mintlify_cleanup = mintlify_cleanup
self.batch_size = batch_size
self.recursive = False
+ self.scroll_before_scraping = scroll_before_scraping
+ self.web_connector_type = web_connector_type
if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
self.recursive = True
@@ -344,6 +363,18 @@ class WebConnector(LoadConnector):
continue
visited_links.add(current_url)
+ if self.scroll_before_scraping:
+ scroll_attempts = 0
+ previous_height = page.evaluate("document.body.scrollHeight")
+ while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
+ page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+ page.wait_for_load_state("networkidle", timeout=30000)
+ new_height = page.evaluate("document.body.scrollHeight")
+ if new_height == previous_height:
+ break  # Stop scrolling when no more content is loaded
+ previous_height = new_height
+ scroll_attempts += 1
content = page.content()
soup = BeautifulSoup(content, "html.parser")
@@ -402,6 +433,53 @@ class WebConnector(LoadConnector):
raise RuntimeError(last_error)
raise RuntimeError("No valid pages found.")
+ def validate_connector_settings(self) -> None:
+ # Make sure we have at least one valid URL to check
+ if not self.to_visit_list:
+ raise ConnectorValidationError(
+ "No URL configured. Please provide at least one valid URL."
+ )
+ if self.web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.SITEMAP.value:
+ return None
+ # We'll just test the first URL for connectivity and correctness
+ test_url = self.to_visit_list[0]
+ # Check that the URL is allowed and well-formed
+ try:
+ protected_url_check(test_url)
+ except ValueError as e:
+ raise ConnectorValidationError(
+ f"Protected URL check failed for '{test_url}': {e}"
+ )
+ except ConnectionError as e:
+ # Typically DNS or other network issues
+ raise ConnectorValidationError(str(e))
+ # Make a quick request to see if we get a valid response
+ try:
+ check_internet_connection(test_url)
+ except Exception as e:
+ err_str = str(e)
+ if "401" in err_str:
+ raise CredentialExpiredError(
+ f"Unauthorized access to '{test_url}': {e}"
+ )
+ elif "403" in err_str:
+ raise InsufficientPermissionsError(
+ f"Forbidden access to '{test_url}': {e}"
+ )
+ elif "404" in err_str:
+ raise ConnectorValidationError(f"Page not found for '{test_url}': {e}")
+ elif "Max retries exceeded" in err_str and "NameResolutionError" in err_str:
+ raise ConnectorValidationError(
+ f"Unable to resolve hostname for '{test_url}'. Please check the URL and your internet connection."
+ )
+ else:
+ # Could be a 5xx or another error, treat as unexpected
+ raise UnexpectedError(f"Unexpected error validating '{test_url}': {e}")
if __name__ == "__main__":
connector = WebConnector("https://docs.onyx.app/")
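Note: for reference, a self-contained sketch of the scroll-before-scrape loop added above, using Playwright's sync API outside the connector class (the function name and the get_text() post-processing are illustrative):

    from bs4 import BeautifulSoup
    from playwright.sync_api import sync_playwright

    WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20  # mirrors the constant added above

    def scrape_with_scroll(url: str) -> str:
        # Scroll to the bottom until the page height stops growing (or the
        # attempt cap is hit), then hand the settled DOM to BeautifulSoup.
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto(url)

            scroll_attempts = 0
            previous_height = page.evaluate("document.body.scrollHeight")
            while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_load_state("networkidle", timeout=30000)
                new_height = page.evaluate("document.body.scrollHeight")
                if new_height == previous_height:
                    break  # no more lazy-loaded content
                previous_height = new_height
                scroll_attempts += 1

            html = page.content()
            browser.close()
        return BeautifulSoup(html, "html.parser").get_text()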

View File

@@ -665,7 +665,8 @@ def associate_credential_to_connector(
logger.error(f"IntegrityError: {e}")
raise HTTPException(status_code=400, detail="Name must be unique")
- except Exception:
+ except Exception as e:
+ logger.exception(f"Unexpected error: {e}")
raise HTTPException(status_code=500, detail="Unexpected error")

View File

@@ -28,6 +28,7 @@ from onyx.configs.constants import FileOrigin
from onyx.configs.constants import MilestoneRecordType
from onyx.configs.constants import OnyxCeleryPriority
from onyx.configs.constants import OnyxCeleryTask
+ from onyx.connectors.factory import validate_ccpair_for_user
from onyx.connectors.google_utils.google_auth import (
get_google_oauth_creds,
)
@@ -61,6 +62,7 @@ from onyx.connectors.google_utils.shared_constants import DB_CREDENTIALS_DICT_TO
from onyx.connectors.google_utils.shared_constants import (
GoogleOAuthAuthenticationMethod,
)
+ from onyx.connectors.interfaces import ConnectorValidationError
from onyx.db.connector import create_connector
from onyx.db.connector import delete_connector
from onyx.db.connector import fetch_connector_by_id
@@ -844,11 +846,22 @@ def create_connector_with_mock_credential(
db_session=db_session,
)
+ # Store the created connector and credential IDs
+ connector_id = cast(int, connector_response.id)
+ credential_id = credential.id
+ validate_ccpair_for_user(
+ connector_id=connector_id,
+ credential_id=credential_id,
+ db_session=db_session,
+ user=user,
+ tenant_id=tenant_id,
+ )
response = add_credential_to_connector(
db_session=db_session,
user=user,
- connector_id=cast(int, connector_response.id), # will aways be an int
- credential_id=credential.id,
+ connector_id=connector_id,
+ credential_id=credential_id,
access_type=connector_data.access_type,
cc_pair_name=connector_data.name,
groups=connector_data.groups,
@@ -873,9 +886,12 @@ def create_connector_with_mock_credential(
properties=None,
db_session=db_session,
)
return response
+ except ConnectorValidationError as e:
+ raise HTTPException(
+ status_code=400, detail="Connector validation error: " + str(e)
+ )
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
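Note: the pattern introduced here, running validation before persisting the cc-pair and translating ConnectorValidationError into an HTTP 400, can be summarized with a small hedged helper (the wrapper name is illustrative):

    from collections.abc import Callable

    from fastapi import HTTPException
    from onyx.connectors.interfaces import ConnectorValidationError

    def run_with_validation_mapping(action: Callable[[], object]) -> object:
        # Validation problems become a descriptive HTTP 400 instead of
        # surfacing as a generic 500.
        try:
            return action()
        except ConnectorValidationError as e:
            raise HTTPException(
                status_code=400, detail="Connector validation error: " + str(e)
            )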

View File

@@ -0,0 +1,44 @@
import pytest
from onyx.connectors.models import Document
from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS
from onyx.connectors.web.connector import WebConnector
# NOTE(rkuo): we will probably need to adjust this test to point at our own test site
# to avoid depending on a third party site
@pytest.fixture
def web_connector(request: pytest.FixtureRequest) -> WebConnector:
scroll_before_scraping = request.param
connector = WebConnector(
base_url="https://developer.onewelcome.com",
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
scroll_before_scraping=scroll_before_scraping,
)
return connector
@pytest.mark.parametrize("web_connector", [True], indirect=True)
def test_web_connector_scroll(web_connector: WebConnector) -> None:
all_docs: list[Document] = []
document_batches = web_connector.load_from_state()
for doc_batch in document_batches:
for doc in doc_batch:
all_docs.append(doc)
assert len(all_docs) == 1
doc = all_docs[0]
assert "Onegini Identity Cloud" in doc.sections[0].text
@pytest.mark.parametrize("web_connector", [False], indirect=True)
def test_web_connector_no_scroll(web_connector: WebConnector) -> None:
all_docs: list[Document] = []
document_batches = web_connector.load_from_state()
for doc_batch in document_batches:
for doc in doc_batch:
all_docs.append(doc)
assert len(all_docs) == 1
doc = all_docs[0]
assert "Onegini Identity Cloud" not in doc.sections[0].text

View File

@@ -152,7 +152,17 @@ export const connectorConfigs: Record<
],
},
],
- advanced_values: [],
+ advanced_values: [
+ {
+ type: "checkbox",
+ query: "Scroll before scraping:",
+ label: "Scroll before scraping",
+ description:
+ "Enable if the website requires scrolling for the desired content to load",
+ name: "scroll_before_scraping",
+ optional: true,
+ },
+ ],
overrideDefaultFreq: 60 * 60 * 24,
},
github: {