mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-04-17 23:46:47 +00:00
Compare commits
6 Commits
v3.2.0-clo
...
bo/pruning
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c3e857c338 | ||
|
|
146d8522df | ||
|
|
ac5bae3631 | ||
|
|
b5434b2391 | ||
|
|
28e13b503b | ||
|
|
99a90ec196 |
@@ -45,7 +45,7 @@ if [ "$ACTIVE_HOME" != "$MOUNT_HOME" ]; then
|
||||
[ -d "$MOUNT_HOME/$item" ] || continue
|
||||
if [ -e "$ACTIVE_HOME/$item" ] && [ ! -L "$ACTIVE_HOME/$item" ]; then
|
||||
echo "warning: replacing $ACTIVE_HOME/$item with symlink to $MOUNT_HOME/$item" >&2
|
||||
rm -rf "$ACTIVE_HOME/$item"
|
||||
rm -rf "${ACTIVE_HOME:?}/$item"
|
||||
fi
|
||||
ln -sfn "$MOUNT_HOME/$item" "$ACTIVE_HOME/$item"
|
||||
done
|
||||
|
||||
@@ -86,6 +86,17 @@ repos:
|
||||
hooks:
|
||||
- id: actionlint
|
||||
|
||||
- repo: https://github.com/shellcheck-py/shellcheck-py
|
||||
rev: 745eface02aef23e168a8afb6b5737818efbea95 # frozen: v0.11.0.1
|
||||
hooks:
|
||||
- id: shellcheck
|
||||
exclude: >-
|
||||
(?x)^(
|
||||
backend/scripts/setup_craft_templates\.sh|
|
||||
deployment/docker_compose/init-letsencrypt\.sh|
|
||||
deployment/docker_compose/install\.sh
|
||||
)$
|
||||
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 8a737e727ac5ab2f1d4cf5876720ed276dc8dc4b # frozen: 25.1.0
|
||||
hooks:
|
||||
|
||||
@@ -282,6 +282,7 @@ OPENSEARCH_ADMIN_USERNAME = os.environ.get("OPENSEARCH_ADMIN_USERNAME", "admin")
|
||||
OPENSEARCH_ADMIN_PASSWORD = os.environ.get(
|
||||
"OPENSEARCH_ADMIN_PASSWORD", "StrongPassword123!"
|
||||
)
|
||||
OPENSEARCH_USE_SSL = os.environ.get("OPENSEARCH_USE_SSL", "true").lower() == "true"
|
||||
USING_AWS_MANAGED_OPENSEARCH = (
|
||||
os.environ.get("USING_AWS_MANAGED_OPENSEARCH", "").lower() == "true"
|
||||
)
|
||||
|
||||
@@ -62,17 +62,19 @@ def best_effort_get_field_from_issue(jira_issue: Issue, field: str) -> Any:
|
||||
def extract_text_from_adf(adf: dict | None) -> str:
|
||||
"""Extracts plain text from Atlassian Document Format:
|
||||
https://developer.atlassian.com/cloud/jira/platform/apis/document/structure/
|
||||
|
||||
WARNING: This function is incomplete and will e.g. skip lists!
|
||||
"""
|
||||
# TODO: complete this function
|
||||
texts = []
|
||||
if adf is not None and "content" in adf:
|
||||
for block in adf["content"]:
|
||||
if "content" in block:
|
||||
for item in block["content"]:
|
||||
if item["type"] == "text":
|
||||
texts.append(item["text"])
|
||||
texts: list[str] = []
|
||||
|
||||
def _extract(node: dict) -> None:
|
||||
if node.get("type") == "text":
|
||||
text = node.get("text", "")
|
||||
if text:
|
||||
texts.append(text)
|
||||
for child in node.get("content", []):
|
||||
_extract(child)
|
||||
|
||||
if adf is not None:
|
||||
_extract(adf)
|
||||
return " ".join(texts)
|
||||
|
||||
|
||||
|
||||
@@ -32,11 +32,16 @@ from onyx.connectors.exceptions import CredentialExpiredError
|
||||
from onyx.connectors.exceptions import InsufficientPermissionsError
|
||||
from onyx.connectors.exceptions import UnexpectedValidationError
|
||||
from onyx.connectors.interfaces import GenerateDocumentsOutput
|
||||
from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
||||
from onyx.connectors.interfaces import LoadConnector
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.interfaces import SlimConnector
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import HierarchyNode
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.file_processing.html_utils import web_html_cleanup
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.sitemap import list_pages_for_site
|
||||
from onyx.utils.web_content import extract_pdf_text
|
||||
@@ -438,7 +443,7 @@ def _handle_cookies(context: BrowserContext, url: str) -> None:
|
||||
)
|
||||
|
||||
|
||||
class WebConnector(LoadConnector):
|
||||
class WebConnector(LoadConnector, SlimConnector):
|
||||
MAX_RETRIES = 3
|
||||
|
||||
def __init__(
|
||||
@@ -493,8 +498,14 @@ class WebConnector(LoadConnector):
|
||||
index: int,
|
||||
initial_url: str,
|
||||
session_ctx: ScrapeSessionContext,
|
||||
slim: bool = False,
|
||||
) -> ScrapeResult:
|
||||
"""Returns a ScrapeResult object with a doc and retry flag."""
|
||||
"""Returns a ScrapeResult object with a doc and retry flag.
|
||||
|
||||
When slim=True, skips all content extraction and render waits.
|
||||
result.url is set to the resolved URL; result.doc is always None.
|
||||
Link discovery via <a href> tags is performed in both modes.
|
||||
"""
|
||||
|
||||
if session_ctx.playwright is None:
|
||||
raise RuntimeError("scrape_context.playwright is None")
|
||||
@@ -516,6 +527,17 @@ class WebConnector(LoadConnector):
|
||||
|
||||
if is_pdf:
|
||||
# PDF files are not checked for links
|
||||
if slim:
|
||||
# No content needed; record the URL via a minimal Document
|
||||
result.doc = Document(
|
||||
id=initial_url,
|
||||
sections=[],
|
||||
source=DocumentSource.WEB,
|
||||
semantic_identifier=initial_url,
|
||||
metadata={},
|
||||
)
|
||||
return result
|
||||
|
||||
response = requests.get(initial_url, headers=DEFAULT_HEADERS)
|
||||
page_text, metadata = extract_pdf_text(response.content)
|
||||
last_modified = response.headers.get("Last-Modified")
|
||||
@@ -546,17 +568,21 @@ class WebConnector(LoadConnector):
|
||||
timeout=30000, # 30 seconds
|
||||
wait_until="commit", # Wait for navigation to commit
|
||||
)
|
||||
# Give the page a moment to start rendering after navigation commits.
|
||||
# Allows CloudFlare and other bot-detection challenges to complete.
|
||||
page.wait_for_timeout(PAGE_RENDER_TIMEOUT_MS)
|
||||
|
||||
# Wait for network activity to settle so SPAs that fetch content
|
||||
# asynchronously after the initial JS bundle have time to render.
|
||||
try:
|
||||
# A bit of extra time to account for long-polling, websockets, etc.
|
||||
page.wait_for_load_state("networkidle", timeout=PAGE_RENDER_TIMEOUT_MS)
|
||||
except TimeoutError:
|
||||
pass
|
||||
if not slim:
|
||||
# Give the page a moment to start rendering after navigation commits.
|
||||
# Allows CloudFlare and other bot-detection challenges to complete.
|
||||
page.wait_for_timeout(PAGE_RENDER_TIMEOUT_MS)
|
||||
|
||||
# Wait for network activity to settle so SPAs that fetch content
|
||||
# asynchronously after the initial JS bundle have time to render.
|
||||
try:
|
||||
# A bit of extra time to account for long-polling, websockets, etc.
|
||||
page.wait_for_load_state(
|
||||
"networkidle", timeout=PAGE_RENDER_TIMEOUT_MS
|
||||
)
|
||||
except TimeoutError:
|
||||
pass
|
||||
|
||||
last_modified = (
|
||||
page_response.header_value("Last-Modified") if page_response else None
|
||||
@@ -576,7 +602,7 @@ class WebConnector(LoadConnector):
|
||||
session_ctx.visited_links.add(initial_url)
|
||||
|
||||
# If we got here, the request was successful
|
||||
if self.scroll_before_scraping:
|
||||
if not slim and self.scroll_before_scraping:
|
||||
scroll_attempts = 0
|
||||
previous_height = page.evaluate("document.body.scrollHeight")
|
||||
while scroll_attempts < WEB_CONNECTOR_MAX_SCROLL_ATTEMPTS:
|
||||
@@ -615,6 +641,16 @@ class WebConnector(LoadConnector):
|
||||
result.retry = True
|
||||
return result
|
||||
|
||||
if slim:
|
||||
result.doc = Document(
|
||||
id=initial_url,
|
||||
sections=[],
|
||||
source=DocumentSource.WEB,
|
||||
semantic_identifier=initial_url,
|
||||
metadata={},
|
||||
)
|
||||
return result
|
||||
|
||||
# after this point, we don't need the caller to retry
|
||||
parsed_html = web_html_cleanup(soup, self.mintlify_cleanup)
|
||||
|
||||
@@ -742,6 +778,93 @@ class WebConnector(LoadConnector):
|
||||
|
||||
session_ctx.stop()
|
||||
|
||||
def retrieve_all_slim_docs(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch | None = None, # noqa: ARG002
|
||||
end: SecondsSinceUnixEpoch | None = None, # noqa: ARG002
|
||||
callback: IndexingHeartbeatInterface | None = None, # noqa: ARG002
|
||||
) -> GenerateSlimDocumentOutput:
|
||||
"""Yields SlimDocuments for all pages reachable from the configured URLs.
|
||||
|
||||
Uses the same Playwright-based crawl as load_from_state for accurate
|
||||
JS-rendered link discovery, but skips all content extraction and render
|
||||
waits. The start/end parameters are ignored — WEB connector has no
|
||||
incremental path.
|
||||
"""
|
||||
if not self.to_visit_list:
|
||||
raise ValueError("No URLs to visit")
|
||||
|
||||
base_url = self.to_visit_list[0]
|
||||
check_internet_connection(base_url)
|
||||
|
||||
session_ctx = ScrapeSessionContext(base_url, self.to_visit_list)
|
||||
session_ctx.initialize()
|
||||
|
||||
slim_batch: list[SlimDocument | HierarchyNode] = []
|
||||
|
||||
while session_ctx.to_visit:
|
||||
initial_url = session_ctx.to_visit.pop()
|
||||
if initial_url in session_ctx.visited_links:
|
||||
continue
|
||||
session_ctx.visited_links.add(initial_url)
|
||||
|
||||
try:
|
||||
protected_url_check(initial_url)
|
||||
except Exception as e:
|
||||
session_ctx.last_error = f"Invalid URL {initial_url} due to {e}"
|
||||
logger.warning(session_ctx.last_error)
|
||||
continue
|
||||
|
||||
index = len(session_ctx.visited_links)
|
||||
logger.info(f"{index}: Slim-visiting {initial_url}")
|
||||
|
||||
retry_count = 0
|
||||
while retry_count < self.MAX_RETRIES:
|
||||
if retry_count > 0:
|
||||
delay = min(2**retry_count + random.uniform(0, 1), 10)
|
||||
logger.info(
|
||||
f"Retry {retry_count}/{self.MAX_RETRIES} for {initial_url} after {delay:.2f}s delay"
|
||||
)
|
||||
time.sleep(delay)
|
||||
|
||||
try:
|
||||
result = self._do_scrape(index, initial_url, session_ctx, slim=True)
|
||||
if result.retry:
|
||||
continue
|
||||
|
||||
if result.doc:
|
||||
slim_batch.append(SlimDocument(id=result.doc.id))
|
||||
except Exception as e:
|
||||
session_ctx.last_error = (
|
||||
f"Failed to slim-fetch '{initial_url}': {e}"
|
||||
)
|
||||
logger.exception(session_ctx.last_error)
|
||||
session_ctx.initialize()
|
||||
continue
|
||||
finally:
|
||||
retry_count += 1
|
||||
|
||||
break
|
||||
|
||||
if len(slim_batch) >= self.batch_size:
|
||||
session_ctx.initialize()
|
||||
session_ctx.at_least_one_doc = True
|
||||
yield slim_batch
|
||||
slim_batch = []
|
||||
|
||||
if slim_batch:
|
||||
session_ctx.stop()
|
||||
session_ctx.at_least_one_doc = True
|
||||
yield slim_batch
|
||||
|
||||
if not session_ctx.at_least_one_doc:
|
||||
session_ctx.stop()
|
||||
if session_ctx.last_error:
|
||||
raise RuntimeError(session_ctx.last_error)
|
||||
raise RuntimeError("No valid pages found.")
|
||||
|
||||
session_ctx.stop()
|
||||
|
||||
def validate_connector_settings(self) -> None:
|
||||
# Make sure we have at least one valid URL to check
|
||||
if not self.to_visit_list:
|
||||
|
||||
@@ -17,6 +17,7 @@ from onyx.configs.app_configs import OPENSEARCH_ADMIN_PASSWORD
|
||||
from onyx.configs.app_configs import OPENSEARCH_ADMIN_USERNAME
|
||||
from onyx.configs.app_configs import OPENSEARCH_HOST
|
||||
from onyx.configs.app_configs import OPENSEARCH_REST_API_PORT
|
||||
from onyx.configs.app_configs import OPENSEARCH_USE_SSL
|
||||
from onyx.document_index.interfaces_new import TenantState
|
||||
from onyx.document_index.opensearch.constants import OpenSearchSearchType
|
||||
from onyx.document_index.opensearch.schema import DocumentChunk
|
||||
@@ -132,7 +133,7 @@ class OpenSearchClient(AbstractContextManager):
|
||||
host: str = OPENSEARCH_HOST,
|
||||
port: int = OPENSEARCH_REST_API_PORT,
|
||||
auth: tuple[str, str] = (OPENSEARCH_ADMIN_USERNAME, OPENSEARCH_ADMIN_PASSWORD),
|
||||
use_ssl: bool = True,
|
||||
use_ssl: bool = OPENSEARCH_USE_SSL,
|
||||
verify_certs: bool = False,
|
||||
ssl_show_warn: bool = False,
|
||||
timeout: int = DEFAULT_OPENSEARCH_CLIENT_TIMEOUT_S,
|
||||
@@ -302,7 +303,7 @@ class OpenSearchIndexClient(OpenSearchClient):
|
||||
host: str = OPENSEARCH_HOST,
|
||||
port: int = OPENSEARCH_REST_API_PORT,
|
||||
auth: tuple[str, str] = (OPENSEARCH_ADMIN_USERNAME, OPENSEARCH_ADMIN_PASSWORD),
|
||||
use_ssl: bool = True,
|
||||
use_ssl: bool = OPENSEARCH_USE_SSL,
|
||||
verify_certs: bool = False,
|
||||
ssl_show_warn: bool = False,
|
||||
timeout: int = DEFAULT_OPENSEARCH_CLIENT_TIMEOUT_S,
|
||||
|
||||
@@ -5,6 +5,7 @@ import uvicorn
|
||||
from onyx.configs.app_configs import MCP_SERVER_ENABLED
|
||||
from onyx.configs.app_configs import MCP_SERVER_HOST
|
||||
from onyx.configs.app_configs import MCP_SERVER_PORT
|
||||
from onyx.tracing.setup import setup_tracing
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.variable_functionality import set_is_ee_based_on_env_variable
|
||||
|
||||
@@ -18,6 +19,7 @@ def main() -> None:
|
||||
return
|
||||
|
||||
set_is_ee_based_on_env_variable()
|
||||
setup_tracing()
|
||||
logger.info(f"Starting MCP server on {MCP_SERVER_HOST}:{MCP_SERVER_PORT}")
|
||||
|
||||
from onyx.mcp_server.api import mcp_app
|
||||
|
||||
@@ -46,7 +46,7 @@ stop_and_remove_containers
|
||||
# Start the PostgreSQL container with optional volume
|
||||
echo "Starting PostgreSQL container..."
|
||||
if [[ -n "$POSTGRES_VOLUME" ]]; then
|
||||
docker run -p 5432:5432 --name onyx_postgres -e POSTGRES_PASSWORD=password -d -v $POSTGRES_VOLUME:/var/lib/postgresql/data postgres -c max_connections=250
|
||||
docker run -p 5432:5432 --name onyx_postgres -e POSTGRES_PASSWORD=password -d -v "$POSTGRES_VOLUME":/var/lib/postgresql/data postgres -c max_connections=250
|
||||
else
|
||||
docker run -p 5432:5432 --name onyx_postgres -e POSTGRES_PASSWORD=password -d postgres -c max_connections=250
|
||||
fi
|
||||
@@ -54,7 +54,7 @@ fi
|
||||
# Start the Vespa container with optional volume
|
||||
echo "Starting Vespa container..."
|
||||
if [[ -n "$VESPA_VOLUME" ]]; then
|
||||
docker run --detach --name onyx_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 -v $VESPA_VOLUME:/opt/vespa/var vespaengine/vespa:8
|
||||
docker run --detach --name onyx_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 -v "$VESPA_VOLUME":/opt/vespa/var vespaengine/vespa:8
|
||||
else
|
||||
docker run --detach --name onyx_vespa --hostname vespa-container --publish 8081:8081 --publish 19071:19071 vespaengine/vespa:8
|
||||
fi
|
||||
@@ -85,7 +85,7 @@ docker compose -f "$COMPOSE_FILE" -f "$COMPOSE_DEV_FILE" --profile opensearch-en
|
||||
# Start the Redis container with optional volume
|
||||
echo "Starting Redis container..."
|
||||
if [[ -n "$REDIS_VOLUME" ]]; then
|
||||
docker run --detach --name onyx_redis --publish 6379:6379 -v $REDIS_VOLUME:/data redis
|
||||
docker run --detach --name onyx_redis --publish 6379:6379 -v "$REDIS_VOLUME":/data redis
|
||||
else
|
||||
docker run --detach --name onyx_redis --publish 6379:6379 redis
|
||||
fi
|
||||
@@ -93,7 +93,7 @@ fi
|
||||
# Start the MinIO container with optional volume
|
||||
echo "Starting MinIO container..."
|
||||
if [[ -n "$MINIO_VOLUME" ]]; then
|
||||
docker run --detach --name onyx_minio --publish 9004:9000 --publish 9005:9001 -e MINIO_ROOT_USER=minioadmin -e MINIO_ROOT_PASSWORD=minioadmin -v $MINIO_VOLUME:/data minio/minio server /data --console-address ":9001"
|
||||
docker run --detach --name onyx_minio --publish 9004:9000 --publish 9005:9001 -e MINIO_ROOT_USER=minioadmin -e MINIO_ROOT_PASSWORD=minioadmin -v "$MINIO_VOLUME":/data minio/minio server /data --console-address ":9001"
|
||||
else
|
||||
docker run --detach --name onyx_minio --publish 9004:9000 --publish 9005:9001 -e MINIO_ROOT_USER=minioadmin -e MINIO_ROOT_PASSWORD=minioadmin minio/minio server /data --console-address ":9001"
|
||||
fi
|
||||
@@ -111,6 +111,7 @@ sleep 1
|
||||
|
||||
# Alembic should be configured in the virtualenv for this repo
|
||||
if [[ -f "../.venv/bin/activate" ]]; then
|
||||
# shellcheck source=/dev/null
|
||||
source ../.venv/bin/activate
|
||||
else
|
||||
echo "Warning: Python virtual environment not found at .venv/bin/activate; alembic may not work."
|
||||
|
||||
0
backend/tests/unit/onyx/connectors/web/__init__.py
Normal file
0
backend/tests/unit/onyx/connectors/web/__init__.py
Normal file
188
backend/tests/unit/onyx/connectors/web/test_slim_retrieval.py
Normal file
188
backend/tests/unit/onyx/connectors/web/test_slim_retrieval.py
Normal file
@@ -0,0 +1,188 @@
|
||||
"""Unit tests for WebConnector.retrieve_all_slim_docs (slim pruning path)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
from unittest.mock import MagicMock
|
||||
from unittest.mock import patch
|
||||
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.web.connector import WEB_CONNECTOR_VALID_SETTINGS
|
||||
from onyx.connectors.web.connector import WebConnector
|
||||
|
||||
BASE_URL = "http://example.com"
|
||||
|
||||
|
||||
def _make_context_mock(url_to_html: dict[str, str]) -> MagicMock:
|
||||
"""Return a BrowserContext mock whose pages respond based on the goto URL."""
|
||||
context = MagicMock()
|
||||
|
||||
def _new_page() -> MagicMock:
|
||||
page = MagicMock()
|
||||
visited: list[str] = []
|
||||
|
||||
def _goto(url: str, **kwargs: Any) -> MagicMock: # noqa: ARG001
|
||||
visited.append(url)
|
||||
page.url = url # no redirect
|
||||
response = MagicMock()
|
||||
response.status = 200
|
||||
response.header_value.return_value = None
|
||||
return response
|
||||
|
||||
def _content() -> str:
|
||||
return url_to_html.get(
|
||||
visited[-1] if visited else "", "<html><body></body></html>"
|
||||
)
|
||||
|
||||
page.goto.side_effect = _goto
|
||||
page.content.side_effect = _content
|
||||
return page
|
||||
|
||||
context.new_page.side_effect = _new_page
|
||||
return context
|
||||
|
||||
|
||||
def _make_playwright_mock(context: MagicMock) -> MagicMock: # noqa: ARG001
|
||||
playwright = MagicMock()
|
||||
playwright.stop = MagicMock()
|
||||
return playwright
|
||||
|
||||
|
||||
SINGLE_PAGE_HTML = (
|
||||
"<html><body><p>Content that should not appear in slim output</p></body></html>"
|
||||
)
|
||||
|
||||
RECURSIVE_ROOT_HTML = """
|
||||
<html><body>
|
||||
<a href="/page2">Page 2</a>
|
||||
<a href="/page3">Page 3</a>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
PAGE2_HTML = "<html><body><p>page 2</p></body></html>"
|
||||
PAGE3_HTML = "<html><body><p>page 3</p></body></html>"
|
||||
|
||||
|
||||
@patch("onyx.connectors.web.connector.check_internet_connection")
|
||||
@patch("onyx.connectors.web.connector.requests.head")
|
||||
@patch("onyx.connectors.web.connector.start_playwright")
|
||||
def test_slim_yields_slim_documents(
|
||||
mock_start_playwright: MagicMock,
|
||||
mock_head: MagicMock,
|
||||
_mock_check: MagicMock,
|
||||
) -> None:
|
||||
"""retrieve_all_slim_docs yields SlimDocuments with the correct URL as id."""
|
||||
context = _make_context_mock({BASE_URL + "/": SINGLE_PAGE_HTML})
|
||||
mock_start_playwright.return_value = (_make_playwright_mock(context), context)
|
||||
mock_head.return_value.headers = {"content-type": "text/html"}
|
||||
|
||||
connector = WebConnector(
|
||||
base_url=BASE_URL + "/",
|
||||
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
|
||||
)
|
||||
|
||||
docs = [doc for batch in connector.retrieve_all_slim_docs() for doc in batch]
|
||||
|
||||
assert len(docs) == 1
|
||||
assert isinstance(docs[0], SlimDocument)
|
||||
assert docs[0].id == BASE_URL + "/"
|
||||
|
||||
|
||||
@patch("onyx.connectors.web.connector.check_internet_connection")
|
||||
@patch("onyx.connectors.web.connector.requests.head")
|
||||
@patch("onyx.connectors.web.connector.start_playwright")
|
||||
def test_slim_skips_content_extraction(
|
||||
mock_start_playwright: MagicMock,
|
||||
mock_head: MagicMock,
|
||||
_mock_check: MagicMock,
|
||||
) -> None:
|
||||
"""web_html_cleanup is never called in slim mode."""
|
||||
context = _make_context_mock({BASE_URL + "/": SINGLE_PAGE_HTML})
|
||||
mock_start_playwright.return_value = (_make_playwright_mock(context), context)
|
||||
mock_head.return_value.headers = {"content-type": "text/html"}
|
||||
|
||||
connector = WebConnector(
|
||||
base_url=BASE_URL + "/",
|
||||
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
|
||||
)
|
||||
|
||||
with patch("onyx.connectors.web.connector.web_html_cleanup") as mock_cleanup:
|
||||
list(connector.retrieve_all_slim_docs())
|
||||
mock_cleanup.assert_not_called()
|
||||
|
||||
|
||||
@patch("onyx.connectors.web.connector.check_internet_connection")
|
||||
@patch("onyx.connectors.web.connector.requests.head")
|
||||
@patch("onyx.connectors.web.connector.start_playwright")
|
||||
def test_slim_discovers_links_recursively(
|
||||
mock_start_playwright: MagicMock,
|
||||
mock_head: MagicMock,
|
||||
_mock_check: MagicMock,
|
||||
) -> None:
|
||||
"""In RECURSIVE mode, internal <a href> links are followed and all URLs yielded."""
|
||||
url_to_html = {
|
||||
BASE_URL + "/": RECURSIVE_ROOT_HTML,
|
||||
BASE_URL + "/page2": PAGE2_HTML,
|
||||
BASE_URL + "/page3": PAGE3_HTML,
|
||||
}
|
||||
context = _make_context_mock(url_to_html)
|
||||
mock_start_playwright.return_value = (_make_playwright_mock(context), context)
|
||||
mock_head.return_value.headers = {"content-type": "text/html"}
|
||||
|
||||
connector = WebConnector(
|
||||
base_url=BASE_URL + "/",
|
||||
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value,
|
||||
)
|
||||
|
||||
ids = {
|
||||
doc.id
|
||||
for batch in connector.retrieve_all_slim_docs()
|
||||
for doc in batch
|
||||
if isinstance(doc, SlimDocument)
|
||||
}
|
||||
|
||||
assert ids == {
|
||||
BASE_URL + "/",
|
||||
BASE_URL + "/page2",
|
||||
BASE_URL + "/page3",
|
||||
}
|
||||
|
||||
|
||||
@patch("onyx.connectors.web.connector.check_internet_connection")
|
||||
@patch("onyx.connectors.web.connector.requests.head")
|
||||
@patch("onyx.connectors.web.connector.start_playwright")
|
||||
def test_slim_render_wait_not_called_confirmed(
|
||||
mock_start_playwright: MagicMock,
|
||||
mock_head: MagicMock,
|
||||
_mock_check: MagicMock,
|
||||
) -> None:
|
||||
"""Confirm wait_for_timeout is called in full mode but not in slim mode."""
|
||||
pages_visited: list[MagicMock] = []
|
||||
|
||||
context = MagicMock()
|
||||
|
||||
def _new_page() -> MagicMock:
|
||||
page = MagicMock()
|
||||
page.url = BASE_URL + "/"
|
||||
response = MagicMock()
|
||||
response.status = 200
|
||||
response.header_value.return_value = None
|
||||
page.goto.return_value = response
|
||||
page.content.return_value = SINGLE_PAGE_HTML
|
||||
pages_visited.append(page)
|
||||
return page
|
||||
|
||||
context.new_page.side_effect = _new_page
|
||||
mock_start_playwright.return_value = (_make_playwright_mock(context), context)
|
||||
mock_head.return_value.headers = {"content-type": "text/html"}
|
||||
|
||||
connector = WebConnector(
|
||||
base_url=BASE_URL + "/",
|
||||
web_connector_type=WEB_CONNECTOR_VALID_SETTINGS.SINGLE.value,
|
||||
)
|
||||
|
||||
pages_visited.clear()
|
||||
list(connector.retrieve_all_slim_docs())
|
||||
for page in pages_visited:
|
||||
page.wait_for_timeout.assert_not_called()
|
||||
page.wait_for_load_state.assert_not_called()
|
||||
@@ -58,8 +58,7 @@ SERVICE_ORDER=(
|
||||
validate_template() {
|
||||
local template_file=$1
|
||||
echo "Validating template: $template_file..."
|
||||
aws cloudformation validate-template --template-body file://"$template_file" --region "$AWS_REGION" > /dev/null
|
||||
if [ $? -ne 0 ]; then
|
||||
if ! aws cloudformation validate-template --template-body file://"$template_file" --region "$AWS_REGION" > /dev/null; then
|
||||
echo "Error: Validation failed for $template_file. Exiting."
|
||||
exit 1
|
||||
fi
|
||||
@@ -108,13 +107,15 @@ deploy_stack() {
|
||||
fi
|
||||
|
||||
# Create temporary parameters file for this template
|
||||
local temp_params_file=$(create_parameters_from_json "$template_file")
|
||||
local temp_params_file
|
||||
temp_params_file=$(create_parameters_from_json "$template_file")
|
||||
|
||||
# Special handling for SubnetIDs parameter if needed
|
||||
if grep -q "SubnetIDs" "$template_file"; then
|
||||
echo "Template uses SubnetIDs parameter, ensuring it's properly formatted..."
|
||||
# Make sure we're passing SubnetIDs as a comma-separated list
|
||||
local subnet_ids=$(remove_comments "$CONFIG_FILE" | jq -r '.SubnetIDs // empty')
|
||||
local subnet_ids
|
||||
subnet_ids=$(remove_comments "$CONFIG_FILE" | jq -r '.SubnetIDs // empty')
|
||||
if [ -n "$subnet_ids" ]; then
|
||||
echo "Using SubnetIDs from config: $subnet_ids"
|
||||
else
|
||||
@@ -123,15 +124,13 @@ deploy_stack() {
|
||||
fi
|
||||
|
||||
echo "Deploying stack: $stack_name with template: $template_file and generated config from: $CONFIG_FILE..."
|
||||
aws cloudformation deploy \
|
||||
if ! aws cloudformation deploy \
|
||||
--stack-name "$stack_name" \
|
||||
--template-file "$template_file" \
|
||||
--parameter-overrides file://"$temp_params_file" \
|
||||
--capabilities CAPABILITY_IAM CAPABILITY_NAMED_IAM CAPABILITY_AUTO_EXPAND \
|
||||
--region "$AWS_REGION" \
|
||||
--no-cli-auto-prompt > /dev/null
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
--no-cli-auto-prompt > /dev/null; then
|
||||
echo "Error: Deployment failed for $stack_name. Exiting."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -52,11 +52,9 @@ delete_stack() {
|
||||
--region "$AWS_REGION"
|
||||
|
||||
echo "Waiting for stack $stack_name to be deleted..."
|
||||
aws cloudformation wait stack-delete-complete \
|
||||
if aws cloudformation wait stack-delete-complete \
|
||||
--stack-name "$stack_name" \
|
||||
--region "$AWS_REGION"
|
||||
|
||||
if [ $? -eq 0 ]; then
|
||||
--region "$AWS_REGION"; then
|
||||
echo "Stack $stack_name deleted successfully."
|
||||
sleep 10
|
||||
else
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#!/bin/sh
|
||||
# fill in the template
|
||||
export ONYX_BACKEND_API_HOST="${ONYX_BACKEND_API_HOST:-api_server}"
|
||||
export ONYX_WEB_SERVER_HOST="${ONYX_WEB_SERVER_HOST:-web_server}"
|
||||
@@ -16,12 +17,15 @@ echo "Using web server host: $ONYX_WEB_SERVER_HOST"
|
||||
echo "Using MCP server host: $ONYX_MCP_SERVER_HOST"
|
||||
echo "Using nginx proxy timeouts - connect: ${NGINX_PROXY_CONNECT_TIMEOUT}s, send: ${NGINX_PROXY_SEND_TIMEOUT}s, read: ${NGINX_PROXY_READ_TIMEOUT}s"
|
||||
|
||||
# shellcheck disable=SC2016
|
||||
envsubst '$DOMAIN $SSL_CERT_FILE_NAME $SSL_CERT_KEY_FILE_NAME $ONYX_BACKEND_API_HOST $ONYX_WEB_SERVER_HOST $ONYX_MCP_SERVER_HOST $NGINX_PROXY_CONNECT_TIMEOUT $NGINX_PROXY_SEND_TIMEOUT $NGINX_PROXY_READ_TIMEOUT' < "/etc/nginx/conf.d/$1" > /etc/nginx/conf.d/app.conf
|
||||
|
||||
# Conditionally create MCP server configuration
|
||||
if [ "${MCP_SERVER_ENABLED}" = "True" ] || [ "${MCP_SERVER_ENABLED}" = "true" ]; then
|
||||
echo "MCP server is enabled, creating MCP configuration..."
|
||||
# shellcheck disable=SC2016
|
||||
envsubst '$ONYX_MCP_SERVER_HOST' < "/etc/nginx/conf.d/mcp_upstream.conf.inc.template" > /etc/nginx/conf.d/mcp_upstream.conf.inc
|
||||
# shellcheck disable=SC2016
|
||||
envsubst '$ONYX_MCP_SERVER_HOST' < "/etc/nginx/conf.d/mcp.conf.inc.template" > /etc/nginx/conf.d/mcp.conf.inc
|
||||
else
|
||||
echo "MCP server is disabled, removing MCP configuration..."
|
||||
|
||||
@@ -48,6 +48,19 @@ func runWebScript(args []string) {
|
||||
log.Fatalf("Failed to find web directory: %v", err)
|
||||
}
|
||||
|
||||
nodeModules := filepath.Join(webDir, "node_modules")
|
||||
if _, err := os.Stat(nodeModules); os.IsNotExist(err) {
|
||||
log.Info("node_modules not found, running npm install --no-save...")
|
||||
installCmd := exec.Command("npm", "install", "--no-save")
|
||||
installCmd.Dir = webDir
|
||||
installCmd.Stdout = os.Stdout
|
||||
installCmd.Stderr = os.Stderr
|
||||
installCmd.Stdin = os.Stdin
|
||||
if err := installCmd.Run(); err != nil {
|
||||
log.Fatalf("Failed to run npm install: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
scriptName := args[0]
|
||||
scriptArgs := args[1:]
|
||||
if len(scriptArgs) > 0 && scriptArgs[0] == "--" {
|
||||
|
||||
@@ -68,9 +68,7 @@ SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
|
||||
|
||||
# Run the conversion into a temp file so a failed run doesn't destroy an existing .tsx
|
||||
TMPFILE="${BASE_NAME}.tsx.tmp"
|
||||
bunx @svgr/cli "$SVG_FILE" --typescript --svgo-config "$SVGO_CONFIG" --template "${SCRIPT_DIR}/icon-template.js" > "$TMPFILE"
|
||||
|
||||
if [ $? -eq 0 ]; then
|
||||
if bunx @svgr/cli "$SVG_FILE" --typescript --svgo-config "$SVGO_CONFIG" --template "${SCRIPT_DIR}/icon-template.js" > "$TMPFILE"; then
|
||||
# Verify the temp file has content before replacing the destination
|
||||
if [ ! -s "$TMPFILE" ]; then
|
||||
rm -f "$TMPFILE"
|
||||
@@ -84,16 +82,14 @@ if [ $? -eq 0 ]; then
|
||||
# Using perl for cross-platform compatibility (works on macOS, Linux, Windows with WSL)
|
||||
# Note: perl -i returns 0 even on some failures, so we validate the output
|
||||
|
||||
perl -i -pe 's/<svg/<svg width={size} height={size}/g' "${BASE_NAME}.tsx"
|
||||
if [ $? -ne 0 ]; then
|
||||
if ! perl -i -pe 's/<svg/<svg width={size} height={size}/g' "${BASE_NAME}.tsx"; then
|
||||
echo "Error: Failed to add width/height attributes" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Icons additionally get stroke="currentColor"
|
||||
if [ "$MODE" = "icon" ]; then
|
||||
perl -i -pe 's/\{\.\.\.props\}/stroke="currentColor" {...props}/g' "${BASE_NAME}.tsx"
|
||||
if [ $? -ne 0 ]; then
|
||||
if ! perl -i -pe 's/\{\.\.\.props\}/stroke="currentColor" {...props}/g' "${BASE_NAME}.tsx"; then
|
||||
echo "Error: Failed to add stroke attribute" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -21,7 +21,6 @@ import {
|
||||
SvgExpand,
|
||||
SvgFold,
|
||||
SvgExternalLink,
|
||||
SvgAlertCircle,
|
||||
SvgRefreshCw,
|
||||
} from "@opal/icons";
|
||||
import { ADMIN_ROUTES } from "@/lib/admin-routes";
|
||||
@@ -49,7 +48,7 @@ import {
|
||||
PYTHON_TOOL_ID,
|
||||
OPEN_URL_TOOL_ID,
|
||||
} from "@/app/app/components/tools/constants";
|
||||
import { Button, Divider, Text, Card } from "@opal/components";
|
||||
import { Button, Divider, Text, Card, MessageCard } from "@opal/components";
|
||||
import Modal from "@/refresh-components/Modal";
|
||||
import Switch from "@/refresh-components/inputs/Switch";
|
||||
import useMcpServersForAgentEditor from "@/hooks/useMcpServersForAgentEditor";
|
||||
@@ -1091,14 +1090,11 @@ export default function ChatPreferencesPage() {
|
||||
)}
|
||||
</Text>
|
||||
</Section>
|
||||
<Card background="none" border="solid" padding="sm">
|
||||
<Content
|
||||
sizePreset="main-ui"
|
||||
icon={SvgAlertCircle}
|
||||
title="Modify with caution."
|
||||
description="System prompt affects all chats, agents, and projects. Significant changes may degrade response quality."
|
||||
/>
|
||||
</Card>
|
||||
<MessageCard
|
||||
title="Modify with caution."
|
||||
description="System prompt affects all chats, agents, and projects. Significant changes may degrade response quality."
|
||||
padding="xs"
|
||||
/>
|
||||
</Modal.Body>
|
||||
<Modal.Footer>
|
||||
<Button
|
||||
|
||||
@@ -28,15 +28,9 @@ import {
|
||||
ModalWrapper,
|
||||
} from "@/sections/modals/llmConfig/shared";
|
||||
import { fetchBedrockModels } from "@/lib/llmConfig/svc";
|
||||
import { Card } from "@opal/components";
|
||||
import { Card, MessageCard } from "@opal/components";
|
||||
import { Section } from "@/layouts/general-layouts";
|
||||
import { SvgAlertCircle } from "@opal/icons";
|
||||
import {
|
||||
Content,
|
||||
InputDivider,
|
||||
InputPadder,
|
||||
InputVertical,
|
||||
} from "@opal/layouts";
|
||||
import { InputDivider, InputPadder, InputVertical } from "@opal/layouts";
|
||||
import { toast } from "@/hooks/useToast";
|
||||
import { refreshLlmProviderCaches } from "@/lib/llmConfig/cache";
|
||||
|
||||
@@ -218,14 +212,10 @@ function BedrockModalInternals({
|
||||
|
||||
{authMethod === AUTH_METHOD_IAM && (
|
||||
<InputPadder>
|
||||
<Card background="none" border="solid" padding="sm">
|
||||
<Content
|
||||
icon={SvgAlertCircle}
|
||||
title="Onyx will use the IAM role attached to the environment it’s running in to authenticate."
|
||||
variant="body"
|
||||
sizePreset="main-ui"
|
||||
/>
|
||||
</Card>
|
||||
<MessageCard
|
||||
variant="info"
|
||||
title="Onyx will use the IAM role attached to the environment it’s running in to authenticate."
|
||||
/>
|
||||
</InputPadder>
|
||||
)}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user