Mirror of https://github.com/onyx-dot-app/onyx.git (synced 2026-02-17 15:55:45 +00:00)

Compare commits: connector_ ... bookstack_ (13 commits)
| Commit SHA1 |
|---|
| 6fb85d53c9 |
| 3b92cf2f38 |
| 65485e0ea1 |
| 67028782f0 |
| 09b14c68ca |
| 8347bfe5ee |
| bf175d0749 |
| c892dd9c6f |
| bf51ac5dc0 |
| 353c185856 |
| 7c96b7f24e |
| 31524a3eff |
| c9f618798e |
@@ -74,7 +74,9 @@ jobs:
           python -m pip install --upgrade pip
           pip install --retries 5 --timeout 30 -r backend/requirements/default.txt
           pip install --retries 5 --timeout 30 -r backend/requirements/dev.txt
+          playwright install chromium
+          playwright install-deps chromium

       - name: Run Tests
         shell: script -q -e -c "bash --noprofile --norc -eo pipefail {0}"
         run: py.test -o junit_family=xunit2 -xv --ff backend/tests/daily/connectors
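The workflow change above installs Playwright's Chromium before the daily connector tests run. For illustration only, here is a hedged sketch of reproducing that test step locally through pytest's Python API; the path and flags are copied from the workflow, while the assumption is that the backend requirements and Playwright's Chromium are already installed in the current environment.

```python
# Hedged local equivalent of the "Run Tests" step above; assumes
# `pip install -r backend/requirements/{default,dev}.txt` and
# `playwright install chromium` have already been run.
import pytest

exit_code = pytest.main(
    ["-o", "junit_family=xunit2", "-xv", "--ff", "backend/tests/daily/connectors"]
)
raise SystemExit(int(exit_code))
```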
@@ -1,7 +1,7 @@
|
||||
"""remove inactive ccpair status on downgrade
|
||||
|
||||
Revision ID: acaab4ef4507
|
||||
Revises: b7a7eee5aa15
|
||||
Revises: b388730a2899
|
||||
Create Date: 2025-02-16 18:21:41.330212
|
||||
|
||||
"""
|
||||
@@ -12,7 +12,7 @@ from sqlalchemy import update
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "acaab4ef4507"
|
||||
down_revision = "b7a7eee5aa15"
|
||||
down_revision = "b388730a2899"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
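The only change to this migration is its parent: `down_revision` now points at `b388730a2899` instead of `b7a7eee5aa15`. As an illustration of why that matters, the sketch below (not Onyx code) walks a revision chain the way Alembic does, by following `down_revision` pointers; the parent assigned to `b388730a2899` is an assumption made up for the example.

```python
# Toy model of Alembic's revision ordering: each revision points at its parent
# via down_revision, and the history is recovered by walking those pointers.
revisions: dict[str, str | None] = {
    "acaab4ef4507": "b388730a2899",  # re-parented by the change above
    "b388730a2899": "b7a7eee5aa15",  # assumed parent, for illustration only
    "b7a7eee5aa15": None,            # treated as the base of this toy chain
}


def history(head: str) -> list[str]:
    """Return the chain from `head` back to the base revision."""
    chain: list[str] = []
    current: str | None = head
    while current is not None:
        chain.append(current)
        current = revisions[current]
    return chain


print(history("acaab4ef4507"))
# ['acaab4ef4507', 'b388730a2899', 'b7a7eee5aa15']
```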
@@ -95,6 +95,7 @@ from onyx.db.models import User
 from onyx.db.users import get_user_by_email
 from onyx.redis.redis_pool import get_async_redis_connection
 from onyx.redis.redis_pool import get_redis_client
+from onyx.server.utils import BasicAuthenticationError
 from onyx.utils.logger import setup_logger
 from onyx.utils.telemetry import create_milestone_and_report
 from onyx.utils.telemetry import optional_telemetry
@@ -109,11 +110,6 @@ from shared_configs.contextvars import get_current_tenant_id
 logger = setup_logger()


-class BasicAuthenticationError(HTTPException):
-    def __init__(self, detail: str):
-        super().__init__(status_code=status.HTTP_403_FORBIDDEN, detail=detail)
-
-
 def is_user_admin(user: User | None) -> bool:
     if AUTH_TYPE == AuthType.DISABLED:
         return True
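The hunk above stops defining `BasicAuthenticationError` locally and imports it from `onyx.server.utils` instead. Assuming the relocated class keeps the definition removed here, it would look like the sketch below; the `fastapi` import is an assumption, since this diff only shows that the class uses `HTTPException` and `status`.

```python
# Sketch of the relocated exception, mirroring the removed definition above.
from fastapi import HTTPException, status  # assumed source of these names


class BasicAuthenticationError(HTTPException):
    def __init__(self, detail: str):
        # 403 keeps the original semantics: failed auth checks are "forbidden".
        super().__init__(status_code=status.HTTP_403_FORBIDDEN, detail=detail)
```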
@@ -5,6 +5,8 @@ import requests

 class BookStackClientRequestFailedError(ConnectionError):
     def __init__(self, status: int, error: str) -> None:
+        self.status_code = status
+        self.error = error
         super().__init__(
             "BookStack Client request failed with status {status}: {error}".format(
                 status=status, error=error
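Storing `status_code` and `error` on the exception lets callers branch on the HTTP status without parsing the message string, which the BookStack connector's new validation (next hunk) relies on. A minimal sketch of that usage; the raised values are made up for illustration, while the constructor signature matches the hunk above.

```python
# Minimal usage sketch of the enriched exception.
from onyx.connectors.bookstack.client import BookStackClientRequestFailedError

try:
    raise BookStackClientRequestFailedError(401, "invalid or expired API token")
except BookStackClientRequestFailedError as e:
    if e.status_code == 401:
        print(f"credential problem: {e.error}")
    else:
        print(f"request failed with HTTP {e.status_code}: {e.error}")
```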
@@ -7,8 +7,12 @@ from typing import Any
 from onyx.configs.app_configs import INDEX_BATCH_SIZE
 from onyx.configs.constants import DocumentSource
 from onyx.connectors.bookstack.client import BookStackApiClient
+from onyx.connectors.bookstack.client import BookStackClientRequestFailedError
 from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
+from onyx.connectors.interfaces import ConnectorValidationError
+from onyx.connectors.interfaces import CredentialExpiredError
 from onyx.connectors.interfaces import GenerateDocumentsOutput
+from onyx.connectors.interfaces import InsufficientPermissionsError
 from onyx.connectors.interfaces import LoadConnector
 from onyx.connectors.interfaces import PollConnector
 from onyx.connectors.interfaces import SecondsSinceUnixEpoch
@@ -214,3 +218,39 @@ class BookstackConnector(LoadConnector, PollConnector):
                     break
                 else:
                     time.sleep(0.2)
+
+    def validate_connector_settings(self) -> None:
+        """
+        Validate that the BookStack credentials and connector settings are correct.
+        Specifically checks that we can make an authenticated request to BookStack.
+        """
+        if not self.bookstack_client:
+            raise ConnectorMissingCredentialError(
+                "BookStack credentials have not been loaded."
+            )
+
+        try:
+            # Attempt to fetch a small batch of books (arbitrary endpoint) to verify credentials
+            _ = self.bookstack_client.get(
+                "/books", params={"count": "1", "offset": "0"}
+            )
+
+        except BookStackClientRequestFailedError as e:
+            # Check for HTTP status codes
+            if e.status_code == 401:
+                raise CredentialExpiredError(
+                    "Your BookStack credentials appear to be invalid or expired (HTTP 401)."
+                ) from e
+            elif e.status_code == 403:
+                raise InsufficientPermissionsError(
+                    "The configured BookStack token does not have sufficient permissions (HTTP 403)."
+                ) from e
+            else:
+                raise ConnectorValidationError(
+                    f"Unexpected BookStack error (status={e.status_code}): {e}"
+                ) from e
+
+        except Exception as exc:
+            raise ConnectorValidationError(
+                f"Unexpected error while validating BookStack connector settings: {exc}"
+            ) from exc
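A hedged sketch of how the new `validate_connector_settings` might be exercised. The credential dictionary keys and the no-argument constructor are assumptions (they are not shown in this diff); the exception types are the ones imported in the hunks above.

```python
# Usage sketch only; credential keys and constructor defaults are assumed.
from onyx.connectors.bookstack.connector import BookstackConnector
from onyx.connectors.interfaces import (
    ConnectorValidationError,
    CredentialExpiredError,
    InsufficientPermissionsError,
)

connector = BookstackConnector()  # assumed: default constructor arguments suffice
connector.load_credentials(
    {
        "bookstack_base_url": "https://bookstack.example.com",  # assumed key name
        "bookstack_api_token_id": "token-id",                   # assumed key name
        "bookstack_api_token_secret": "token-secret",           # assumed key name
    }
)

try:
    connector.validate_connector_settings()
except (CredentialExpiredError, InsufficientPermissionsError) as e:
    print(f"BookStack credentials rejected: {e}")
except ConnectorValidationError as e:
    print(f"BookStack settings look wrong: {e}")
```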
@@ -1,3 +1,4 @@
|
||||
import re
|
||||
from collections.abc import Callable
|
||||
from collections.abc import Iterator
|
||||
from datetime import datetime
|
||||
@@ -24,16 +25,22 @@ def datetime_to_utc(dt: datetime) -> datetime:


 def time_str_to_utc(datetime_str: str) -> datetime:
+    # Remove all timezone abbreviations in parentheses
+    datetime_str = re.sub(r"\([A-Z]+\)", "", datetime_str).strip()
+
+    # Remove any remaining parentheses and their contents
+    datetime_str = re.sub(r"\(.*?\)", "", datetime_str).strip()
+
     try:
         dt = parse(datetime_str)
     except ValueError:
-        # Handle malformed timezone by attempting to fix common format issues
+        # Fix common format issues (e.g. "0000" => "+0000")
         if "0000" in datetime_str:
-            # Convert "0000" to "+0000" for proper timezone parsing
-            fixed_dt_str = datetime_str.replace(" 0000", " +0000")
-            dt = parse(fixed_dt_str)
+            datetime_str = datetime_str.replace(" 0000", " +0000")
+            dt = parse(datetime_str)
         else:
             raise

     return datetime_to_utc(dt)

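For context, illustrative calls against the updated helper; the example strings are assumptions chosen to exercise the two paths shown above (a parenthesized timezone abbreviation, and a bare `0000` offset that is rewritten to `+0000` when the first parse attempt fails).

```python
# Example inputs are assumptions; time_str_to_utc is the function in the hunk above.
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc

# "(UTC)" is stripped by the re.sub preprocessing before dateutil parses the rest.
print(time_str_to_utc("Mon, 17 Feb 2025 10:30:00 +0000 (UTC)"))

# If the initial parse raises ValueError, " 0000" is rewritten to " +0000" and retried.
print(time_str_to_utc("Mon, 17 Feb 2025 10:30:00 0000"))
```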
@@ -210,7 +210,6 @@ def validate_ccpair_for_user(
             tenant_id=tenant_id,
         )
     except Exception as e:
-        error_msg = f"Unexpected error creating connector: {e}"
-        raise ConnectorValidationError(error_msg)
+        raise ConnectorValidationError(str(e))

     runnable_connector.validate_connector_settings()
@@ -187,12 +187,12 @@ class FirefliesConnector(PollConnector, LoadConnector):
         return self._process_transcripts()

     def poll_source(
-        self, start_unixtime: SecondsSinceUnixEpoch, end_unixtime: SecondsSinceUnixEpoch
+        self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
     ) -> GenerateDocumentsOutput:
-        start_datetime = datetime.fromtimestamp(
-            start_unixtime, tz=timezone.utc
-        ).strftime("%Y-%m-%dT%H:%M:%S.000Z")
-        end_datetime = datetime.fromtimestamp(end_unixtime, tz=timezone.utc).strftime(
+        start_datetime = datetime.fromtimestamp(start, tz=timezone.utc).strftime(
+            "%Y-%m-%dT%H:%M:%S.000Z"
+        )
+        end_datetime = datetime.fromtimestamp(end, tz=timezone.utc).strftime(
             "%Y-%m-%dT%H:%M:%S.000Z"
         )

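The rename aligns `poll_source` with the shorter `start`/`end` names for `SecondsSinceUnixEpoch` windows; the reformatted body is behaviorally the same. As a standalone illustration of the timestamp conversion it performs (the epoch values are examples):

```python
# Standalone sketch of the conversion inside poll_source; values are examples.
from datetime import datetime, timezone

start = 1_700_000_000.0  # seconds since the Unix epoch
end = start + 3600

start_datetime = datetime.fromtimestamp(start, tz=timezone.utc).strftime(
    "%Y-%m-%dT%H:%M:%S.000Z"
)
end_datetime = datetime.fromtimestamp(end, tz=timezone.utc).strftime(
    "%Y-%m-%dT%H:%M:%S.000Z"
)

print(start_datetime)  # 2023-11-14T22:13:20.000Z
print(end_datetime)    # 2023-11-14T23:13:20.000Z
```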
@@ -24,6 +24,7 @@ from onyx.connectors.interfaces import InsufficientPermissionsError
 from onyx.connectors.interfaces import LoadConnector
 from onyx.connectors.interfaces import PollConnector
 from onyx.connectors.interfaces import SecondsSinceUnixEpoch
+from onyx.connectors.interfaces import UnexpectedError
 from onyx.connectors.models import ConnectorMissingCredentialError
 from onyx.connectors.models import Document
 from onyx.connectors.models import Section
@@ -245,7 +246,7 @@ class GithubConnector(LoadConnector, PollConnector):
             test_repo.get_contents("")

         except RateLimitExceededException:
-            raise ConnectorValidationError(
+            raise UnexpectedError(
                 "Validation failed due to GitHub rate-limits being exceeded. Please try again later."
             )

@@ -297,6 +297,7 @@ class GmailConnector(LoadConnector, PollConnector, SlimConnector):
             userId=user_email,
             fields=THREAD_LIST_FIELDS,
             q=query,
+            continue_on_404_or_403=True,
         ):
             full_threads = execute_paginated_retrieval(
                 retrieval_function=gmail_service.users().threads().get,
@@ -25,8 +25,12 @@ from onyx.configs.app_configs import WEB_CONNECTOR_OAUTH_CLIENT_SECRET
 from onyx.configs.app_configs import WEB_CONNECTOR_OAUTH_TOKEN_URL
 from onyx.configs.app_configs import WEB_CONNECTOR_VALIDATE_URLS
 from onyx.configs.constants import DocumentSource
+from onyx.connectors.interfaces import ConnectorValidationError
+from onyx.connectors.interfaces import CredentialExpiredError
 from onyx.connectors.interfaces import GenerateDocumentsOutput
+from onyx.connectors.interfaces import InsufficientPermissionsError
 from onyx.connectors.interfaces import LoadConnector
+from onyx.connectors.interfaces import UnexpectedError
 from onyx.connectors.models import Document
 from onyx.connectors.models import Section
 from onyx.file_processing.extract_file_text import read_pdf_file
@@ -37,6 +41,8 @@ from shared_configs.configs import MULTI_TENANT

 logger = setup_logger()

+WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS = 20
+

 class WEB_CONNECTOR_VALID_SETTINGS(str, Enum):
     # Given a base site, index everything under that path
@@ -170,26 +176,35 @@ def start_playwright() -> Tuple[Playwright, BrowserContext]:


 def extract_urls_from_sitemap(sitemap_url: str) -> list[str]:
-    response = requests.get(sitemap_url)
-    response.raise_for_status()
+    try:
+        response = requests.get(sitemap_url)
+        response.raise_for_status()

-    soup = BeautifulSoup(response.content, "html.parser")
-    urls = [
-        _ensure_absolute_url(sitemap_url, loc_tag.text)
-        for loc_tag in soup.find_all("loc")
-    ]
+        soup = BeautifulSoup(response.content, "html.parser")
+        urls = [
+            _ensure_absolute_url(sitemap_url, loc_tag.text)
+            for loc_tag in soup.find_all("loc")
+        ]

-    if len(urls) == 0 and len(soup.find_all("urlset")) == 0:
-        # the given url doesn't look like a sitemap, let's try to find one
-        urls = list_pages_for_site(sitemap_url)
+        if len(urls) == 0 and len(soup.find_all("urlset")) == 0:
+            # the given url doesn't look like a sitemap, let's try to find one
+            urls = list_pages_for_site(sitemap_url)

-    if len(urls) == 0:
-        raise ValueError(
-            f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
-        )
+        if len(urls) == 0:
+            raise ValueError(
+                f"No URLs found in sitemap {sitemap_url}. Try using the 'single' or 'recursive' scraping options instead."
+            )

-    return urls
+    except requests.RequestException as e:
+        raise RuntimeError(f"Failed to fetch sitemap from {sitemap_url}: {e}")
+    except ValueError as e:
+        raise RuntimeError(f"Error processing sitemap {sitemap_url}: {e}")
+    except Exception as e:
+        raise RuntimeError(
+            f"Unexpected error while processing sitemap {sitemap_url}: {e}"
+        )
+
+    return urls


 def _ensure_absolute_url(source_url: str, maybe_relative_url: str) -> str:
     if not urlparse(maybe_relative_url).netloc:
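With the function now wrapped in try/except, callers see a single `RuntimeError` for fetch failures, empty sitemaps, and anything unexpected. A hedged usage sketch follows; the sitemap URL is an assumption, not taken from this diff.

```python
# Usage sketch; the sitemap URL is an example.
from onyx.connectors.web.connector import extract_urls_from_sitemap

try:
    urls = extract_urls_from_sitemap("https://docs.onyx.app/sitemap.xml")
    print(f"found {len(urls)} URLs")
except RuntimeError as e:
    print(f"sitemap could not be used: {e}")
```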
@@ -225,10 +240,14 @@ class WebConnector(LoadConnector):
         web_connector_type: str = WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value,
         mintlify_cleanup: bool = True,  # Mostly ok to apply to other websites as well
         batch_size: int = INDEX_BATCH_SIZE,
+        scroll_before_scraping: bool = False,
         **kwargs: Any,
     ) -> None:
         self.mintlify_cleanup = mintlify_cleanup
         self.batch_size = batch_size
         self.recursive = False
+        self.scroll_before_scraping = scroll_before_scraping
+        self.web_connector_type = web_connector_type

         if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value:
             self.recursive = True
@@ -344,6 +363,18 @@ class WebConnector(LoadConnector):
                     continue
                 visited_links.add(current_url)

+                if self.scroll_before_scraping:
+                    scroll_attempts = 0
+                    previous_height = page.evaluate("document.body.scrollHeight")
+                    while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
+                        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                        page.wait_for_load_state("networkidle", timeout=30000)
+                        new_height = page.evaluate("document.body.scrollHeight")
+                        if new_height == previous_height:
+                            break  # Stop scrolling when no more content is loaded
+                        previous_height = new_height
+                        scroll_attempts += 1
+
                 content = page.content()
                 soup = BeautifulSoup(content, "html.parser")
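The added block scrolls the page until its height stops growing (or until `WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS` is reached) so lazily loaded content is present before the HTML is captured. Below is a self-contained sketch of the same pattern using Playwright's sync API directly, outside the connector; the target URL is borrowed from the new test at the end of this diff.

```python
# Standalone sketch of the scroll-until-stable pattern used above.
from playwright.sync_api import sync_playwright

MAX_SCROLL_ATTEMPTS = 20  # mirrors WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://developer.onewelcome.com")

    attempts = 0
    previous_height = page.evaluate("document.body.scrollHeight")
    while attempts < MAX_SCROLL_ATTEMPTS:
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_load_state("networkidle", timeout=30000)
        new_height = page.evaluate("document.body.scrollHeight")
        if new_height == previous_height:
            break  # nothing new loaded; the page height has stabilized
        previous_height = new_height
        attempts += 1

    print(len(page.content()), "characters of HTML after scrolling")
    browser.close()
```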
@@ -402,6 +433,53 @@
                 raise RuntimeError(last_error)
         raise RuntimeError("No valid pages found.")

+    def validate_connector_settings(self) -> None:
+        # Make sure we have at least one valid URL to check
+        if not self.to_visit_list:
+            raise ConnectorValidationError(
+                "No URL configured. Please provide at least one valid URL."
+            )
+
+        if self.web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.SITEMAP.value:
+            return None
+
+        # We'll just test the first URL for connectivity and correctness
+        test_url = self.to_visit_list[0]
+
+        # Check that the URL is allowed and well-formed
+        try:
+            protected_url_check(test_url)
+        except ValueError as e:
+            raise ConnectorValidationError(
+                f"Protected URL check failed for '{test_url}': {e}"
+            )
+        except ConnectionError as e:
+            # Typically DNS or other network issues
+            raise ConnectorValidationError(str(e))
+
+        # Make a quick request to see if we get a valid response
+        try:
+            check_internet_connection(test_url)
+        except Exception as e:
+            err_str = str(e)
+            if "401" in err_str:
+                raise CredentialExpiredError(
+                    f"Unauthorized access to '{test_url}': {e}"
+                )
+            elif "403" in err_str:
+                raise InsufficientPermissionsError(
+                    f"Forbidden access to '{test_url}': {e}"
+                )
+            elif "404" in err_str:
+                raise ConnectorValidationError(f"Page not found for '{test_url}': {e}")
+            elif "Max retries exceeded" in err_str and "NameResolutionError" in err_str:
+                raise ConnectorValidationError(
+                    f"Unable to resolve hostname for '{test_url}'. Please check the URL and your internet connection."
+                )
+            else:
+                # Could be a 5xx or another error, treat as unexpected
+                raise UnexpectedError(f"Unexpected error validating '{test_url}': {e}")
+

 if __name__ == "__main__":
     connector = WebConnector("https://docs.onyx.app/")
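A hedged sketch of driving the new validation before indexing a single page; the URL reuses the one from the `__main__` block above, and the keyword arguments follow the constructor parameters visible in this diff.

```python
# Usage sketch; arguments mirror those visible in this diff.
from onyx.connectors.interfaces import ConnectorValidationError
from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS, WebConnector

connector = WebConnector(
    base_url="https://docs.onyx.app/",
    web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
    scroll_before_scraping=False,
)

try:
    connector.validate_connector_settings()
except ConnectorValidationError as e:
    print(f"web connector settings rejected: {e}")
else:
    first_batch = next(iter(connector.load_from_state()), [])
    print(f"fetched {len(first_batch)} document(s) in the first batch")
```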
@@ -665,7 +665,8 @@ def associate_credential_to_connector(
         logger.error(f"IntegrityError: {e}")
         raise HTTPException(status_code=400, detail="Name must be unique")

-    except Exception:
+    except Exception as e:
+        logger.exception(f"Unexpected error: {e}")
         raise HTTPException(status_code=500, detail="Unexpected error")

@@ -28,6 +28,7 @@ from onyx.configs.constants import FileOrigin
 from onyx.configs.constants import MilestoneRecordType
 from onyx.configs.constants import OnyxCeleryPriority
 from onyx.configs.constants import OnyxCeleryTask
+from onyx.connectors.factory import validate_ccpair_for_user
 from onyx.connectors.google_utils.google_auth import (
     get_google_oauth_creds,
 )
@@ -61,6 +62,7 @@ from onyx.connectors.google_utils.shared_constants import DB_CREDENTIALS_DICT_TO
 from onyx.connectors.google_utils.shared_constants import (
     GoogleOAuthAuthenticationMethod,
 )
+from onyx.connectors.interfaces import ConnectorValidationError
 from onyx.db.connector import create_connector
 from onyx.db.connector import delete_connector
 from onyx.db.connector import fetch_connector_by_id
@@ -844,11 +846,22 @@ def create_connector_with_mock_credential(
             db_session=db_session,
         )

+        # Store the created connector and credential IDs
+        connector_id = cast(int, connector_response.id)
+        credential_id = credential.id
+
+        validate_ccpair_for_user(
+            connector_id=connector_id,
+            credential_id=credential_id,
+            db_session=db_session,
+            user=user,
+            tenant_id=tenant_id,
+        )
         response = add_credential_to_connector(
             db_session=db_session,
             user=user,
-            connector_id=cast(int, connector_response.id),  # will aways be an int
-            credential_id=credential.id,
+            connector_id=connector_id,
+            credential_id=credential_id,
             access_type=connector_data.access_type,
             cc_pair_name=connector_data.name,
             groups=connector_data.groups,
@@ -873,9 +886,12 @@
             properties=None,
             db_session=db_session,
         )

         return response

+    except ConnectorValidationError as e:
+        raise HTTPException(
+            status_code=400, detail="Connector validation error: " + str(e)
+        )
     except ValueError as e:
         raise HTTPException(status_code=400, detail=str(e))

backend/tests/daily/connectors/web/test_web_connector.py (new file, +44 lines)
@@ -0,0 +1,44 @@
+import pytest
+
+from onyx.connectors.models import Document
+from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS
+from onyx.connectors.web.connector import WebConnector
+
+
+# NOTE(rkuo): we will probably need to adjust this test to point at our own test site
+# to avoid depending on a third party site
+@pytest.fixture
+def web_connector(request: pytest.FixtureRequest) -> WebConnector:
+    scroll_before_scraping = request.param
+    connector = WebConnector(
+        base_url="https://developer.onewelcome.com",
+        web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
+        scroll_before_scraping=scroll_before_scraping,
+    )
+    return connector
+
+
+@pytest.mark.parametrize("web_connector", [True], indirect=True)
+def test_web_connector_scroll(web_connector: WebConnector) -> None:
+    all_docs: list[Document] = []
+    document_batches = web_connector.load_from_state()
+    for doc_batch in document_batches:
+        for doc in doc_batch:
+            all_docs.append(doc)
+
+    assert len(all_docs) == 1
+    doc = all_docs[0]
+    assert "Onegini Identity Cloud" in doc.sections[0].text
+
+
+@pytest.mark.parametrize("web_connector", [False], indirect=True)
+def test_web_connector_no_scroll(web_connector: WebConnector) -> None:
+    all_docs: list[Document] = []
+    document_batches = web_connector.load_from_state()
+    for doc_batch in document_batches:
+        for doc in doc_batch:
+            all_docs.append(doc)
+
+    assert len(all_docs) == 1
+    doc = all_docs[0]
+    assert "Onegini Identity Cloud" not in doc.sections[0].text
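The new tests parametrize the fixture indirectly: the boolean passed to `@pytest.mark.parametrize(..., indirect=True)` arrives at the fixture as `request.param` and decides whether the connector scrolls before scraping. A minimal standalone illustration of that pytest pattern (not Onyx code):

```python
# Minimal illustration of indirect parametrization, the pattern used above.
import pytest


@pytest.fixture
def mode(request: pytest.FixtureRequest) -> str:
    # The parametrize value is routed here via request.param.
    return "scroll" if request.param else "no-scroll"


@pytest.mark.parametrize("mode", [True, False], indirect=True)
def test_mode(mode: str) -> None:
    assert mode in ("scroll", "no-scroll")
```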
@@ -152,7 +152,17 @@ export const connectorConfigs: Record<
         ],
       },
     ],
-    advanced_values: [],
+    advanced_values: [
+      {
+        type: "checkbox",
+        query: "Scroll before scraping:",
+        label: "Scroll before scraping",
+        description:
+          "Enable if the website requires scrolling for the desired content to load",
+        name: "scroll_before_scraping",
+        optional: true,
+      },
+    ],
     overrideDefaultFreq: 60 * 60 * 24,
   },
   github: {