Compare commits

..

4 Commits

Author SHA1 Message Date
Dane Urban
83558ae04c . 2026-03-11 17:52:55 -07:00
Dane Urban
005009602c Some fixes 2026-03-11 17:44:49 -07:00
Dane Urban
b93875353b . 2026-03-11 17:32:30 -07:00
Dane Urban
2290141b53 xlsx to text 2026-03-11 17:19:09 -07:00
14 changed files with 611 additions and 945 deletions

View File

@@ -1,8 +1,6 @@
from collections.abc import Generator
from typing import Any
from jira import JIRA
from jira.exceptions import JIRAError
from ee.onyx.db.external_perm import ExternalUserGroup
from onyx.connectors.jira.utils import build_jira_client
@@ -11,102 +9,107 @@ from onyx.utils.logger import setup_logger
logger = setup_logger()
_ATLASSIAN_ACCOUNT_TYPE = "atlassian"
_GROUP_MEMBER_PAGE_SIZE = 50
# The GET /group/member endpoint was introduced in Jira 6.0.
# Jira versions older than 6.0 do not have group management REST APIs at all.
_MIN_JIRA_VERSION_FOR_GROUP_MEMBER = "6.0"
def _fetch_group_member_page(
def _get_jira_group_members_email(
jira_client: JIRA,
group_name: str,
start_at: int,
) -> dict[str, Any]:
"""Fetch a single page from the non-deprecated GET /group/member endpoint.
) -> list[str]:
"""Get all member emails for a Jira group.
The old GET /group endpoint (used by jira_client.group_members()) is deprecated
and decommissioned in Jira Server 10.3+. This uses the replacement endpoint
directly via the library's internal _get_json helper, following the same pattern
as enhanced_search_ids / bulk_fetch_issues in connector.py.
There is an open PR to the library to switch to this endpoint since last year:
https://github.com/pycontribs/jira/pull/2356
so once it is merged and released, we can switch to using the library function.
Filters out app accounts (bots, integrations) and only returns real user emails.
"""
emails: list[str] = []
try:
return jira_client._get_json(
"group/member",
params={
"groupname": group_name,
"includeInactiveUsers": "false",
"startAt": start_at,
"maxResults": _GROUP_MEMBER_PAGE_SIZE,
},
)
except JIRAError as e:
if e.status_code == 404:
raise RuntimeError(
f"GET /group/member returned 404 for group '{group_name}'. "
f"This endpoint requires Jira {_MIN_JIRA_VERSION_FOR_GROUP_MEMBER}+. "
f"If you are running a self-hosted Jira instance, please upgrade "
f"to at least Jira {_MIN_JIRA_VERSION_FOR_GROUP_MEMBER}."
) from e
raise
# group_members returns an OrderedDict of account_id -> member_info
members = jira_client.group_members(group=group_name)
if not members:
logger.warning(f"No members found for group {group_name}")
return emails
def _get_group_member_emails(
jira_client: JIRA,
group_name: str,
) -> set[str]:
"""Get all member emails for a single Jira group.
for account_id, member_info in members.items():
# member_info is a dict with keys like 'fullname', 'email', 'active'
email = member_info.get("email")
Uses the non-deprecated GET /group/member endpoint which returns full user
objects including accountType, so we can filter out app/customer accounts
without making separate user() calls.
"""
emails: set[str] = set()
start_at = 0
while True:
try:
page = _fetch_group_member_page(jira_client, group_name, start_at)
except Exception as e:
logger.error(f"Error fetching members for group {group_name}: {e}")
raise
members: list[dict[str, Any]] = page.get("values", [])
for member in members:
account_type = member.get("accountType")
# On Jira DC < 9.0, accountType is absent; include those users.
# On Cloud / DC 9.0+, filter to real user accounts only.
if account_type is not None and account_type != _ATLASSIAN_ACCOUNT_TYPE:
continue
email = member.get("emailAddress")
if email:
emails.add(email)
# Skip "hidden" emails - these are typically app accounts
if email and email != "hidden":
emails.append(email)
else:
logger.warning(
f"Atlassian user {member.get('accountId', 'unknown')} "
f"in group {group_name} has no visible email address"
)
# For cloud, we might need to fetch user details separately
try:
user = jira_client.user(id=account_id)
if page.get("isLast", True) or not members:
break
start_at += len(members)
# Skip app accounts (bots, integrations, etc.)
if hasattr(user, "accountType") and user.accountType == "app":
logger.info(
f"Skipping app account {account_id} for group {group_name}"
)
continue
if hasattr(user, "emailAddress") and user.emailAddress:
emails.append(user.emailAddress)
else:
logger.warning(f"User {account_id} has no email address")
except Exception as e:
logger.warning(
f"Could not fetch email for user {account_id} in group {group_name}: {e}"
)
except Exception as e:
logger.error(f"Error fetching members for group {group_name}: {e}")
return emails
def _build_group_member_email_map(
jira_client: JIRA,
) -> dict[str, set[str]]:
"""Build a map of group names to member emails."""
group_member_emails: dict[str, set[str]] = {}
try:
# Get all groups from Jira - returns a list of group name strings
group_names = jira_client.groups()
if not group_names:
logger.warning("No groups found in Jira")
return group_member_emails
logger.info(f"Found {len(group_names)} groups in Jira")
for group_name in group_names:
if not group_name:
continue
member_emails = _get_jira_group_members_email(
jira_client=jira_client,
group_name=group_name,
)
if member_emails:
group_member_emails[group_name] = set(member_emails)
logger.debug(
f"Found {len(member_emails)} members for group {group_name}"
)
else:
logger.debug(f"No members found for group {group_name}")
except Exception as e:
logger.error(f"Error building group member email map: {e}")
return group_member_emails
def jira_group_sync(
tenant_id: str, # noqa: ARG001
cc_pair: ConnectorCredentialPair,
) -> Generator[ExternalUserGroup, None, None]:
"""Sync Jira groups and their members, yielding one group at a time.
"""
Sync Jira groups and their members.
Streams group-by-group rather than accumulating all groups in memory.
This function fetches all groups from Jira and yields ExternalUserGroup
objects containing the group ID and member emails.
"""
jira_base_url = cc_pair.connector.connector_specific_config.get("jira_base_url", "")
scoped_token = cc_pair.connector.connector_specific_config.get(
@@ -127,26 +130,12 @@ def jira_group_sync(
scoped_token=scoped_token,
)
group_names = jira_client.groups()
if not group_names:
raise ValueError(f"No groups found for cc_pair_id={cc_pair.id}")
group_member_email_map = _build_group_member_email_map(jira_client=jira_client)
if not group_member_email_map:
raise ValueError(f"No groups with members found for cc_pair_id={cc_pair.id}")
logger.info(f"Found {len(group_names)} groups in Jira")
for group_name in group_names:
if not group_name:
continue
member_emails = _get_group_member_emails(
jira_client=jira_client,
group_name=group_name,
)
if not member_emails:
logger.debug(f"No members found for group {group_name}")
continue
logger.debug(f"Found {len(member_emails)} members for group {group_name}")
for group_id, group_member_emails in group_member_email_map.items():
yield ExternalUserGroup(
id=group_name,
user_emails=list(member_emails),
id=group_id,
user_emails=list(group_member_emails),
)

View File

@@ -11,9 +11,6 @@
# lock after its cleanup which happens at most after its soft timeout.
# Constants corresponding to migrate_documents_from_vespa_to_opensearch_task.
from onyx.configs.app_configs import OPENSEARCH_MIGRATION_GET_VESPA_CHUNKS_PAGE_SIZE
MIGRATION_TASK_SOFT_TIME_LIMIT_S = 60 * 5 # 5 minutes.
MIGRATION_TASK_TIME_LIMIT_S = 60 * 6 # 6 minutes.
# The maximum time the lock can be held for. Will automatically be released
@@ -47,7 +44,7 @@ TOTAL_ALLOWABLE_DOC_MIGRATION_ATTEMPTS_BEFORE_PERMANENT_FAILURE = 15
# WARNING: Do not change these values without knowing what changes also need to
# be made to OpenSearchTenantMigrationRecord.
GET_VESPA_CHUNKS_PAGE_SIZE = OPENSEARCH_MIGRATION_GET_VESPA_CHUNKS_PAGE_SIZE
GET_VESPA_CHUNKS_PAGE_SIZE = 500
GET_VESPA_CHUNKS_SLICE_COUNT = 4
# String used to indicate in the vespa_visit_continuation_token mapping that the

View File

@@ -311,12 +311,6 @@ VERIFY_CREATE_OPENSEARCH_INDEX_ON_INIT_MT = (
os.environ.get("VERIFY_CREATE_OPENSEARCH_INDEX_ON_INIT_MT", "true").lower()
== "true"
)
OPENSEARCH_MIGRATION_GET_VESPA_CHUNKS_PAGE_SIZE = int(
os.environ.get("OPENSEARCH_MIGRATION_GET_VESPA_CHUNKS_PAGE_SIZE") or 500
)
OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES = int(
os.environ.get("OPENSEARCH_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES") or 0
)
VESPA_HOST = os.environ.get("VESPA_HOST") or "localhost"
# NOTE: this is used if and only if the vespa config server is accessible via a

View File

@@ -1,10 +1,5 @@
# Default value for the maximum number of tokens a chunk can hold, if none is
# specified when creating an index.
from onyx.configs.app_configs import (
OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
)
DEFAULT_MAX_CHUNK_SIZE = 512
# Size of the dynamic list used to consider elements during kNN graph creation.
@@ -15,43 +10,27 @@ EF_CONSTRUCTION = 256
# quality but increase memory footprint. Values typically range between 12 - 48.
M = 32 # Set relatively high for better accuracy.
# When performing hybrid search, we need to consider more candidates than the
# number of results to be returned. This is because the scoring is hybrid and
# the results are reordered due to the hybrid scoring. Higher = more candidates
# for hybrid fusion = better retrieval accuracy, but results in more computation
# per query. Imagine a simple case with a single keyword query and a single
# vector query and we want 10 final docs. If we only fetch 10 candidates from
# each of keyword and vector, they would have to have perfect overlap to get a
# good hybrid ranking for the 10 results. If we fetch 1000 candidates from each,
# we have a much higher chance of all 10 of the final desired docs showing up
# and getting scored. In worse situations, the final 10 docs don't even show up
# as the final 10 (worse than just a miss at the reranking step).
DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES = (
OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES
if OPENSEARCH_OVERRIDE_DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES > 0
else 750
)
# When performing hybrid search, we need to consider more candidates than the number of results to be returned.
# This is because the scoring is hybrid and the results are reordered due to the hybrid scoring.
# Higher = more candidates for hybrid fusion = better retrieval accuracy, but results in more computation per query.
# Imagine a simple case with a single keyword query and a single vector query and we want 10 final docs.
# If we only fetch 10 candidates from each of keyword and vector, they would have to have perfect overlap to get a good hybrid
# ranking for the 10 results. If we fetch 1000 candidates from each, we have a much higher chance of all 10 of the final desired
# docs showing up and getting scored. In worse situations, the final 10 docs don't even show up as the final 10 (worse than just
# a miss at the reranking step).
DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES = 750
# Number of vectors to examine to decide the top k neighbors for the HNSW
# method.
# NOTE: "When creating a search query, you must specify k. If you provide both k
# and ef_search, then the larger value is passed to the engine. If ef_search is
# larger than k, you can provide the size parameter to limit the final number of
# results to k." from
# https://docs.opensearch.org/latest/query-dsl/specialized/k-nn/index/#ef_search
# Number of vectors to examine for top k neighbors for the HNSW method.
EF_SEARCH = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES
# Since the titles are included in the contents, the embedding matches are
# heavily downweighted as they act as a boost rather than an independent scoring
# component.
# Since the titles are included in the contents, they are heavily downweighted as they act as a boost
# rather than an independent scoring component.
SEARCH_TITLE_VECTOR_WEIGHT = 0.1
SEARCH_CONTENT_VECTOR_WEIGHT = 0.45
# Single keyword weight for both title and content (merged from former title
# keyword + content keyword).
# Single keyword weight for both title and content (merged from former title keyword + content keyword).
SEARCH_KEYWORD_WEIGHT = 0.45
# NOTE: It is critical that the order of these weights matches the order of the
# sub-queries in the hybrid search.
# NOTE: it is critical that the order of these weights matches the order of the sub-queries in the hybrid search.
HYBRID_SEARCH_NORMALIZATION_WEIGHTS = [
SEARCH_TITLE_VECTOR_WEIGHT,
SEARCH_CONTENT_VECTOR_WEIGHT,

View File

@@ -285,16 +285,13 @@ class DocumentQuery:
hybrid_search_query: dict[str, Any] = {
"hybrid": {
"queries": hybrid_search_subqueries,
# Max results per subquery per shard before aggregation. Ensures
# keyword and vector subqueries contribute equally to the
# candidate pool for hybrid fusion.
# Max results per subquery per shard before aggregation. Ensures keyword and vector
# subqueries contribute equally to the candidate pool for hybrid fusion.
# Sources:
# https://docs.opensearch.org/latest/vector-search/ai-search/hybrid-search/pagination/
# https://opensearch.org/blog/navigating-pagination-in-hybrid-queries-with-the-pagination_depth-parameter/
"pagination_depth": DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
# Applied to all the sub-queries independently (this avoids
# subqueries having a lot of results thrown out during
# aggregation).
# Applied to all the sub-queries independently (this avoids having subqueries having a lot of results thrown out).
# Sources:
# https://docs.opensearch.org/latest/query-dsl/compound/hybrid/
# https://opensearch.org/blog/introducing-common-filter-support-for-hybrid-search-queries
@@ -377,10 +374,9 @@ class DocumentQuery:
def _get_hybrid_search_subqueries(
query_text: str,
query_vector: list[float],
# The default number of neighbors to consider for knn vector similarity
# search. This is higher than the number of results because the scoring
# is hybrid. For a detailed breakdown, see where the default value is
# set.
# The default number of neighbors to consider for knn vector similarity search.
# This is higher than the number of results because the scoring is hybrid.
# for a detailed breakdown, see where the default value is set.
vector_candidates: int = DEFAULT_NUM_HYBRID_SEARCH_CANDIDATES,
) -> list[dict[str, Any]]:
"""Returns subqueries for hybrid search.
@@ -404,27 +400,20 @@ class DocumentQuery:
in a single hybrid query. Source:
https://docs.opensearch.org/latest/query-dsl/compound/hybrid/
NOTE: Each query is independent during the search phase, there is no
backfilling of scores for missing query components. What this means is
that if a document was a good vector match but did not show up for
keyword, it gets a score of 0 for the keyword component of the hybrid
scoring. This is not as bad as just disregarding a score though as there
is normalization applied after. So really it is "increasing" the missing
score compared to if it was included and the range was renormalized.
This does however mean that between docs that have high scores for say
the vector field, the keyword scores between them are completely ignored
unless they also showed up in the keyword query as a reasonably high
match. TLDR, this is a bit of unique funky behavior but it seems ok.
NOTE: Each query is independent during the search phase, there is no backfilling of scores for missing query components.
What this means is that if a document was a good vector match but did not show up for keyword, it gets a score of 0 for
the keyword component of the hybrid scoring. This is not as bad as just disregarding a score though as there is
normalization applied after. So really it is "increasing" the missing score compared to if it was included and the range
was renormalized. This does however mean that between docs that have high scores for say the vector field, the keyword
scores between them are completely ignored unless they also showed up in the keyword query as a reasonably high match.
TLDR, this is a bit of unique funky behavior but it seems ok.
NOTE: Options considered and rejected:
- minimum_should_match: Since it's hybrid search and users often provide
semantic queries, there is often a lot of terms, and very low number
of meaningful keywords (and a low ratio of keywords).
- fuzziness AUTO: Typo tolerance (0/1/2 edit distance by term length).
It's mostly for typos as the analyzer ("english" by default) already
does some stemming and tokenization. In testing datasets, this makes
recall slightly worse. It also is less performant so not really any
reason to do it.
- minimum_should_match: Since it's hybrid search and users often provide semantic queries, there is often a lot of terms,
and very low number of meaningful keywords (and a low ratio of keywords).
- fuzziness AUTO: typo tolerance (0/1/2 edit distance by term length). It's mostly for typos as the analyzer ("english by
default") already does some stemming and tokenization. In testing datasets, this makes recall slightly worse. It also is
less performant so not really any reason to do it.
Args:
query_text: The text of the query to search for.

View File

@@ -1,3 +1,4 @@
import csv
import gc
import io
import json
@@ -19,6 +20,7 @@ from zipfile import BadZipFile
import chardet
import openpyxl
from openpyxl.worksheet.worksheet import Worksheet
from PIL import Image
from onyx.configs.constants import ONYX_METADATA_FILENAME
@@ -352,6 +354,65 @@ def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
return presentation.markdown
def _worksheet_to_matrix(
worksheet: Worksheet,
) -> list[list[str]]:
"""
Converts a singular worksheet to a matrix of values
"""
rows: list[list[str]] = []
for worksheet_row in worksheet.iter_rows(min_row=1, values_only=True):
row = ["" if cell is None else str(cell) for cell in worksheet_row]
rows.append(row)
return rows
def _clean_worksheet_matrix(matrix: list[list[str]]) -> list[list[str]]:
"""
Cleans a worksheet matrix by removing rows if there are N consecutive empty
rows and removing cols if there are M consecutive empty columns
"""
MAX_EMPTY_ROWS = 2 # Runs longer than this are capped to max_empty; shorter runs are preserved as-is
MAX_EMPTY_COLS = 2
# Row cleanup
matrix = _remove_empty_runs(matrix, max_empty=MAX_EMPTY_ROWS)
# Column cleanup (transpose, clean, transpose back)
transposed = list(map(list, zip(*matrix))) if matrix else []
transposed = _remove_empty_runs(transposed, max_empty=MAX_EMPTY_COLS)
matrix = list(map(list, zip(*transposed))) if transposed else []
return matrix
def _remove_empty_runs(
rows: list[list[str]],
max_empty: int,
) -> list[list[str]]:
"""Removes entire runs of empty rows when the run length exceeds max_empty.
Leading and trailing empty rows are always dropped regardless of run length,
since there is no adjacent non-empty row to bound the run.
"""
result: list[list[str]] = []
empty_buffer: list[list[str]] = []
for row in rows:
# Check if empty
if not any(row):
empty_buffer.append(row)
else:
# Add upto max empty rows onto the result - that's what we allow
result.extend(empty_buffer[:max_empty])
# Add the new non-empty row
result.append(row)
empty_buffer = []
return result
def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
# TODO: switch back to this approach in a few months when markitdown
# fixes their handling of excel files
@@ -390,30 +451,15 @@ def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
f"Failed to extract text from {file_name or 'xlsx file'}. This happens due to a bug in openpyxl. {e}"
)
return ""
raise e
raise
text_content = []
for sheet in workbook.worksheets:
rows = []
num_empty_consecutive_rows = 0
for row in sheet.iter_rows(min_row=1, values_only=True):
row_str = ",".join(str(cell or "") for cell in row)
# Only add the row if there are any values in the cells
if len(row_str) >= len(row):
rows.append(row_str)
num_empty_consecutive_rows = 0
else:
num_empty_consecutive_rows += 1
if num_empty_consecutive_rows > 100:
# handle massive excel sheets with mostly empty cells
logger.warning(
f"Found {num_empty_consecutive_rows} empty rows in {file_name}, skipping rest of file"
)
break
sheet_str = "\n".join(rows)
text_content.append(sheet_str)
sheet_matrix = _clean_worksheet_matrix(_worksheet_to_matrix(sheet))
buf = io.StringIO()
writer = csv.writer(buf, lineterminator="\n")
writer.writerows(sheet_matrix)
text_content.append(buf.getvalue().rstrip("\n"))
return TEXT_SECTION_SEPARATOR.join(text_content)

View File

@@ -19,7 +19,7 @@ from fastapi.testclient import TestClient
from onyx.auth.users import current_admin_user
from onyx.db.engine.sql_engine import get_session
from onyx.db.models import UserRole
from onyx.main import get_application
from onyx.main import fetch_versioned_implementation
from onyx.utils.logger import setup_logger
logger = setup_logger()
@@ -51,8 +51,11 @@ def client() -> Generator[TestClient, None, None]:
# Patch out prometheus metrics setup to avoid "Duplicated timeseries in
# CollectorRegistry" errors when multiple tests each create a new app
# (prometheus registers metrics globally and rejects duplicate names).
get_app = fetch_versioned_implementation(
module="onyx.main", attribute="get_application"
)
with patch("onyx.main.setup_prometheus_metrics"):
app: FastAPI = get_application(lifespan_override=test_lifespan)
app: FastAPI = get_app(lifespan_override=test_lifespan)
# Override the database session dependency with a mock
# (these tests don't actually need DB access)

View File

@@ -17,9 +17,6 @@ from unittest.mock import patch
import pytest
from sqlalchemy.orm import Session
from onyx.background.celery.tasks.opensearch_migration.constants import (
GET_VESPA_CHUNKS_SLICE_COUNT,
)
from onyx.background.celery.tasks.opensearch_migration.tasks import (
is_continuation_token_done_for_all_slices,
)
@@ -323,15 +320,9 @@ def test_embedding_dimension(db_session: Session) -> Generator[int, None, None]:
@pytest.fixture(scope="function")
def patch_get_vespa_chunks_page_size() -> Generator[int, None, None]:
test_page_size = 5
with (
patch(
"onyx.background.celery.tasks.opensearch_migration.tasks.GET_VESPA_CHUNKS_PAGE_SIZE",
test_page_size,
),
patch(
"onyx.background.celery.tasks.opensearch_migration.constants.GET_VESPA_CHUNKS_PAGE_SIZE",
test_page_size,
),
with patch(
"onyx.background.celery.tasks.opensearch_migration.tasks.GET_VESPA_CHUNKS_PAGE_SIZE",
test_page_size,
):
yield test_page_size # Test runs here.
@@ -591,175 +582,6 @@ class TestMigrateChunksFromVespaToOpenSearchTask:
document_chunks[document.id][opensearch_chunk.chunk_index],
)
def test_chunk_migration_visits_all_chunks_even_when_batch_size_varies(
self,
db_session: Session,
test_documents: list[Document],
vespa_document_index: VespaDocumentIndex,
opensearch_client: OpenSearchIndexClient,
test_embedding_dimension: int,
clean_migration_tables: None, # noqa: ARG002
enable_opensearch_indexing_for_onyx: None, # noqa: ARG002
) -> None:
"""
Tests that chunk migration works correctly even when the batch size
changes halfway through a migration.
Simulates task time running out my mocking the locking behavior.
"""
# Precondition.
# Index chunks into Vespa.
document_chunks: dict[str, list[dict[str, Any]]] = {
document.id: [
_create_raw_document_chunk(
document_id=document.id,
chunk_index=i,
content=f"Test content {i} for {document.id}",
embedding=_generate_test_vector(test_embedding_dimension),
now=datetime.now(),
title=f"Test title {document.id}",
title_embedding=_generate_test_vector(test_embedding_dimension),
)
for i in range(CHUNK_COUNT)
]
for document in test_documents
}
all_chunks: list[dict[str, Any]] = []
for chunks in document_chunks.values():
all_chunks.extend(chunks)
vespa_document_index.index_raw_chunks(all_chunks)
# Run the initial batch. To simulate partial progress we will mock the
# redis lock to return True for the first invocation of .owned() and
# False subsequently.
# NOTE: The batch size is currently set to 5 in
# patch_get_vespa_chunks_page_size.
mock_redis_client = Mock()
mock_lock = Mock()
mock_lock.owned.side_effect = [True, False, False]
mock_lock.acquire.return_value = True
mock_redis_client.lock.return_value = mock_lock
with patch(
"onyx.background.celery.tasks.opensearch_migration.tasks.get_redis_client",
return_value=mock_redis_client,
):
result_1 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=get_current_tenant_id()
)
assert result_1 is True
# Expire the session cache to see the committed changes from the task.
db_session.expire_all()
# Verify partial progress was saved.
tenant_record = db_session.query(OpenSearchTenantMigrationRecord).first()
assert tenant_record is not None
partial_chunks_migrated = tenant_record.total_chunks_migrated
assert partial_chunks_migrated > 0
# page_size applies per slice, so one iteration can fetch up to
# page_size * GET_VESPA_CHUNKS_SLICE_COUNT chunks total.
assert partial_chunks_migrated <= 5 * GET_VESPA_CHUNKS_SLICE_COUNT
assert tenant_record.vespa_visit_continuation_token is not None
# Slices are not necessarily evenly distributed across all document
# chunks so we can't test that every token is non-None, but certainly at
# least one must be.
assert any(json.loads(tenant_record.vespa_visit_continuation_token).values())
assert tenant_record.migration_completed_at is None
assert tenant_record.approx_chunk_count_in_vespa is not None
# Under test.
# Now patch the batch size to be some other number, like 2.
mock_redis_client = Mock()
mock_lock = Mock()
mock_lock.owned.side_effect = [True, False, False]
mock_lock.acquire.return_value = True
mock_redis_client.lock.return_value = mock_lock
with (
patch(
"onyx.background.celery.tasks.opensearch_migration.tasks.GET_VESPA_CHUNKS_PAGE_SIZE",
2,
),
patch(
"onyx.background.celery.tasks.opensearch_migration.constants.GET_VESPA_CHUNKS_PAGE_SIZE",
2,
),
patch(
"onyx.background.celery.tasks.opensearch_migration.tasks.get_redis_client",
return_value=mock_redis_client,
),
):
result_2 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=get_current_tenant_id()
)
# Postcondition.
assert result_2 is True
# Expire the session cache to see the committed changes from the task.
db_session.expire_all()
# Verify next partial progress was saved.
tenant_record = db_session.query(OpenSearchTenantMigrationRecord).first()
assert tenant_record is not None
new_partial_chunks_migrated = tenant_record.total_chunks_migrated
assert new_partial_chunks_migrated > partial_chunks_migrated
# page_size applies per slice, so one iteration can fetch up to
# page_size * GET_VESPA_CHUNKS_SLICE_COUNT chunks total.
assert new_partial_chunks_migrated <= (5 + 2) * GET_VESPA_CHUNKS_SLICE_COUNT
assert tenant_record.vespa_visit_continuation_token is not None
# Slices are not necessarily evenly distributed across all document
# chunks so we can't test that every token is non-None, but certainly at
# least one must be.
assert any(json.loads(tenant_record.vespa_visit_continuation_token).values())
assert tenant_record.migration_completed_at is None
assert tenant_record.approx_chunk_count_in_vespa is not None
# Under test.
# Run the remainder of the migration.
with (
patch(
"onyx.background.celery.tasks.opensearch_migration.tasks.GET_VESPA_CHUNKS_PAGE_SIZE",
2,
),
patch(
"onyx.background.celery.tasks.opensearch_migration.constants.GET_VESPA_CHUNKS_PAGE_SIZE",
2,
),
):
result_3 = migrate_chunks_from_vespa_to_opensearch_task(
tenant_id=get_current_tenant_id()
)
# Postcondition.
assert result_3 is True
# Expire the session cache to see the committed changes from the task.
db_session.expire_all()
# Verify completion.
tenant_record = db_session.query(OpenSearchTenantMigrationRecord).first()
assert tenant_record is not None
assert tenant_record.total_chunks_migrated > new_partial_chunks_migrated
assert tenant_record.total_chunks_migrated == len(all_chunks)
# Visit is complete so continuation token should be None.
assert tenant_record.vespa_visit_continuation_token is not None
assert is_continuation_token_done_for_all_slices(
json.loads(tenant_record.vespa_visit_continuation_token)
)
assert tenant_record.migration_completed_at is not None
assert tenant_record.approx_chunk_count_in_vespa == len(all_chunks)
# Verify chunks were indexed in OpenSearch.
for document in test_documents:
opensearch_chunks = _get_document_chunks_from_opensearch(
opensearch_client, document.id, get_current_tenant_id()
)
assert len(opensearch_chunks) == CHUNK_COUNT
opensearch_chunks.sort(key=lambda x: x.chunk_index)
for opensearch_chunk in opensearch_chunks:
_assert_chunk_matches_vespa_chunk(
opensearch_chunk,
document_chunks[document.id][opensearch_chunk.chunk_index],
)
def test_chunk_migration_empty_vespa(
self,
db_session: Session,

View File

@@ -0,0 +1,196 @@
import io
import openpyxl
from onyx.file_processing.extract_file_text import xlsx_to_text
def _make_xlsx(sheets: dict[str, list[list[str]]]) -> io.BytesIO:
"""Create an in-memory xlsx file from a dict of sheet_name -> matrix of strings."""
wb = openpyxl.Workbook()
if wb.active is not None:
wb.remove(wb.active)
for sheet_name, rows in sheets.items():
ws = wb.create_sheet(title=sheet_name)
for row in rows:
ws.append(row)
buf = io.BytesIO()
wb.save(buf)
buf.seek(0)
return buf
class TestXlsxToText:
def test_single_sheet_basic(self) -> None:
xlsx = _make_xlsx(
{
"Sheet1": [
["Name", "Age"],
["Alice", "30"],
["Bob", "25"],
]
}
)
result = xlsx_to_text(xlsx)
lines = [line for line in result.strip().split("\n") if line.strip()]
assert len(lines) == 3
assert "Name" in lines[0]
assert "Age" in lines[0]
assert "Alice" in lines[1]
assert "30" in lines[1]
assert "Bob" in lines[2]
def test_multiple_sheets_separated(self) -> None:
xlsx = _make_xlsx(
{
"Sheet1": [["a", "b"]],
"Sheet2": [["c", "d"]],
}
)
result = xlsx_to_text(xlsx)
# TEXT_SECTION_SEPARATOR is "\n\n"
assert "\n\n" in result
parts = result.split("\n\n")
assert any("a" in p for p in parts)
assert any("c" in p for p in parts)
def test_empty_cells(self) -> None:
xlsx = _make_xlsx(
{
"Sheet1": [
["a", "", "b"],
["", "c", ""],
]
}
)
result = xlsx_to_text(xlsx)
lines = [line for line in result.strip().split("\n") if line.strip()]
assert len(lines) == 2
def test_commas_in_cells_are_quoted(self) -> None:
"""Cells containing commas should be quoted in CSV output."""
xlsx = _make_xlsx(
{
"Sheet1": [
["hello, world", "normal"],
]
}
)
result = xlsx_to_text(xlsx)
assert '"hello, world"' in result
def test_empty_workbook(self) -> None:
xlsx = _make_xlsx({"Sheet1": []})
result = xlsx_to_text(xlsx)
assert result.strip() == ""
def test_long_empty_row_run_capped(self) -> None:
"""Runs of >2 empty rows should be capped to 2."""
xlsx = _make_xlsx(
{
"Sheet1": [
["header"],
[""],
[""],
[""],
[""],
["data"],
]
}
)
result = xlsx_to_text(xlsx)
lines = [line for line in result.strip().split("\n") if line.strip()]
# 4 empty rows capped to 2, so: header + 2 empty + data = 4 lines
assert len(lines) == 4
assert "header" in lines[0]
assert "data" in lines[-1]
def test_long_empty_col_run_capped(self) -> None:
"""Runs of >2 empty columns should be capped to 2."""
xlsx = _make_xlsx(
{
"Sheet1": [
["a", "", "", "", "b"],
["c", "", "", "", "d"],
]
}
)
result = xlsx_to_text(xlsx)
lines = [line for line in result.strip().split("\n") if line.strip()]
assert len(lines) == 2
# Each row should have 4 fields (a + 2 empty + b), not 5
# csv format: a,,,b (3 commas = 4 fields)
first_line = lines[0].strip()
# Count commas to verify column reduction
assert first_line.count(",") == 3
def test_short_empty_runs_kept(self) -> None:
"""Runs of <=2 empty rows/cols should be preserved."""
xlsx = _make_xlsx(
{
"Sheet1": [
["a", "b"],
["", ""],
["", ""],
["c", "d"],
]
}
)
result = xlsx_to_text(xlsx)
lines = [line for line in result.strip().split("\n") if line.strip()]
# All 4 rows preserved (2 empty rows <= threshold)
assert len(lines) == 4
def test_bad_zip_file_returns_empty(self) -> None:
bad_file = io.BytesIO(b"not a zip file")
result = xlsx_to_text(bad_file, file_name="test.xlsx")
assert result == ""
def test_bad_zip_tilde_file_returns_empty(self) -> None:
bad_file = io.BytesIO(b"not a zip file")
result = xlsx_to_text(bad_file, file_name="~$temp.xlsx")
assert result == ""
def test_large_sparse_sheet(self) -> None:
"""A sheet with data, a big empty gap, and more data — gap is capped to 2."""
rows: list[list[str]] = [["row1_data"]]
rows.extend([[""] for _ in range(10)])
rows.append(["row2_data"])
xlsx = _make_xlsx({"Sheet1": rows})
result = xlsx_to_text(xlsx)
lines = [line for line in result.strip().split("\n") if line.strip()]
# 10 empty rows capped to 2: row1_data + 2 empty + row2_data = 4
assert len(lines) == 4
assert "row1_data" in lines[0]
assert "row2_data" in lines[-1]
def test_quotes_in_cells(self) -> None:
"""Cells containing quotes should be properly escaped."""
xlsx = _make_xlsx(
{
"Sheet1": [
['say "hello"', "normal"],
]
}
)
result = xlsx_to_text(xlsx)
# csv.writer escapes quotes by doubling them
assert '""hello""' in result
def test_each_row_is_separate_line(self) -> None:
"""Each row should produce its own line (regression for writerow vs writerows)."""
xlsx = _make_xlsx(
{
"Sheet1": [
["r1c1", "r1c2"],
["r2c1", "r2c2"],
["r3c1", "r3c2"],
]
}
)
result = xlsx_to_text(xlsx)
lines = [line for line in result.strip().split("\n") if line.strip()]
assert len(lines) == 3
assert "r1c1" in lines[0] and "r1c2" in lines[0]
assert "r2c1" in lines[1] and "r2c2" in lines[1]
assert "r3c1" in lines[2] and "r3c2" in lines[2]

View File

@@ -15,9 +15,8 @@
# -f docker-compose.dev.yml up -d --wait
#
# This overlay:
# - Moves Vespa (index), both model servers, OpenSearch, MinIO,
# Redis (cache), and the background worker to profiles so they do
# not start by default
# - Moves Vespa (index), both model servers, code-interpreter, Redis (cache),
# and the background worker to profiles so they do not start by default
# - Makes depends_on references to removed services optional
# - Sets DISABLE_VECTOR_DB=true on the api_server
# - Uses PostgreSQL for caching and auth instead of Redis
@@ -28,8 +27,7 @@
# --profile inference Inference model server
# --profile background Background worker (Celery) — also needs redis
# --profile redis Redis cache
# --profile opensearch OpenSearch
# --profile s3-filestore MinIO (S3-compatible file store)
# --profile code-interpreter Code interpreter
# =============================================================================
name: onyx
@@ -40,9 +38,6 @@ services:
index:
condition: service_started
required: false
opensearch:
condition: service_started
required: false
cache:
condition: service_started
required: false
@@ -89,10 +84,4 @@ services:
inference_model_server:
profiles: ["inference"]
# OpenSearch is not needed in lite mode (no indexing).
opensearch:
profiles: ["opensearch"]
# MinIO is not needed in lite mode (Postgres handles file storage).
minio:
profiles: ["s3-filestore"]
code-interpreter: {}

View File

@@ -1,8 +1,8 @@
#!/bin/bash
set -euo pipefail
set -e
# Expected resource requirements (overridden below if --lite)
# Expected resource requirements
EXPECTED_DOCKER_RAM_GB=10
EXPECTED_DISK_GB=32
@@ -10,11 +10,6 @@ EXPECTED_DISK_GB=32
SHUTDOWN_MODE=false
DELETE_DATA_MODE=false
INCLUDE_CRAFT=false # Disabled by default, use --include-craft to enable
LITE_MODE=false # Disabled by default, use --lite to enable
USE_LOCAL_FILES=false # Disabled by default, use --local to skip downloading config files
NO_PROMPT=false
DRY_RUN=false
VERBOSE=false
while [[ $# -gt 0 ]]; do
case $1 in
@@ -30,26 +25,6 @@ while [[ $# -gt 0 ]]; do
INCLUDE_CRAFT=true
shift
;;
--lite)
LITE_MODE=true
shift
;;
--local)
USE_LOCAL_FILES=true
shift
;;
--no-prompt)
NO_PROMPT=true
shift
;;
--dry-run)
DRY_RUN=true
shift
;;
--verbose)
VERBOSE=true
shift
;;
--help|-h)
echo "Onyx Installation Script"
echo ""
@@ -57,23 +32,15 @@ while [[ $# -gt 0 ]]; do
echo ""
echo "Options:"
echo " --include-craft Enable Onyx Craft (AI-powered web app building)"
echo " --lite Deploy Onyx Lite (no Vespa, Redis, or model servers)"
echo " --local Use existing config files instead of downloading from GitHub"
echo " --shutdown Stop (pause) Onyx containers"
echo " --delete-data Remove all Onyx data (containers, volumes, and files)"
echo " --no-prompt Run non-interactively with defaults (for CI/automation)"
echo " --dry-run Show what would be done without making changes"
echo " --verbose Show detailed output for debugging"
echo " --help, -h Show this help message"
echo ""
echo "Examples:"
echo " $0 # Install Onyx"
echo " $0 --lite # Install Onyx Lite (minimal deployment)"
echo " $0 --include-craft # Install Onyx with Craft enabled"
echo " $0 --shutdown # Pause Onyx services"
echo " $0 --delete-data # Completely remove Onyx and all data"
echo " $0 --local # Re-run using existing config files on disk"
echo " $0 --no-prompt # Non-interactive install with defaults"
exit 0
;;
*)
@@ -84,129 +51,8 @@ while [[ $# -gt 0 ]]; do
esac
done
if [[ "$VERBOSE" = true ]]; then
set -x
fi
if [[ "$LITE_MODE" = true ]] && [[ "$INCLUDE_CRAFT" = true ]]; then
echo "ERROR: --lite and --include-craft cannot be used together."
echo "Craft requires services (Vespa, Redis, background workers) that lite mode disables."
exit 1
fi
# When --lite is passed as a flag, lower resource thresholds early (before the
# resource check). When lite is chosen interactively, the thresholds are adjusted
# inside the new-deployment flow, after the resource check has already passed
# with the standard thresholds — which is the safer direction.
if [[ "$LITE_MODE" = true ]]; then
EXPECTED_DOCKER_RAM_GB=4
EXPECTED_DISK_GB=16
fi
INSTALL_ROOT="${INSTALL_PREFIX:-onyx_data}"
LITE_COMPOSE_FILE="docker-compose.onyx-lite.yml"
# Build the -f flags for docker compose.
# Pass "true" as $1 to auto-detect a previously-downloaded lite overlay
# (used by shutdown/delete-data so users don't need to remember --lite).
# Without the argument, the lite overlay is only included when --lite was
# explicitly passed — preventing install/start from silently staying in
# lite mode just because the file exists on disk from a prior run.
compose_file_args() {
local auto_detect="${1:-false}"
local args="-f docker-compose.yml"
if [[ "$LITE_MODE" = true ]] || { [[ "$auto_detect" = true ]] && [[ -f "${INSTALL_ROOT}/deployment/${LITE_COMPOSE_FILE}" ]]; }; then
args="$args -f ${LITE_COMPOSE_FILE}"
fi
echo "$args"
}
# --- Downloader detection (curl with wget fallback) ---
DOWNLOADER=""
detect_downloader() {
if command -v curl &> /dev/null; then
DOWNLOADER="curl"
return 0
fi
if command -v wget &> /dev/null; then
DOWNLOADER="wget"
return 0
fi
echo "ERROR: Neither curl nor wget found. Please install one and retry."
exit 1
}
detect_downloader
download_file() {
local url="$1"
local output="$2"
if [[ "$DOWNLOADER" == "curl" ]]; then
curl -fsSL --retry 3 --retry-delay 2 --retry-connrefused -o "$output" "$url"
else
wget -q --tries=3 --timeout=20 -O "$output" "$url"
fi
}
# Ensures a required file is present. With --local, verifies the file exists on
# disk. Otherwise, downloads it from the given URL. Returns 0 on success, 1 on
# failure (caller should handle the exit).
ensure_file() {
local path="$1"
local url="$2"
local desc="$3"
if [[ "$USE_LOCAL_FILES" = true ]]; then
if [[ -f "$path" ]]; then
print_success "Using existing ${desc}"
return 0
fi
print_error "Required file missing: ${desc} (${path})"
return 1
fi
print_info "Downloading ${desc}..."
if download_file "$url" "$path" 2>/dev/null; then
print_success "${desc} downloaded"
return 0
fi
print_error "Failed to download ${desc}"
print_info "Please ensure you have internet connection and try again"
return 1
}
# --- Interactive prompt helpers ---
is_interactive() {
[[ "$NO_PROMPT" = false ]] && [[ -t 0 ]]
}
prompt_or_default() {
local prompt_text="$1"
local default_value="$2"
if is_interactive; then
read -p "$prompt_text" -r REPLY
if [[ -z "$REPLY" ]]; then
REPLY="$default_value"
fi
else
REPLY="$default_value"
fi
}
prompt_yn_or_default() {
local prompt_text="$1"
local default_value="$2"
if is_interactive; then
read -p "$prompt_text" -n 1 -r
echo ""
if [[ -z "$REPLY" ]]; then
REPLY="$default_value"
fi
else
REPLY="$default_value"
fi
}
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
@@ -265,7 +111,7 @@ if [ "$SHUTDOWN_MODE" = true ]; then
fi
# Stop containers (without removing them)
(cd "${INSTALL_ROOT}/deployment" && $COMPOSE_CMD $(compose_file_args true) stop)
(cd "${INSTALL_ROOT}/deployment" && $COMPOSE_CMD -f docker-compose.yml stop)
if [ $? -eq 0 ]; then
print_success "Onyx containers stopped (paused)"
else
@@ -294,17 +140,12 @@ if [ "$DELETE_DATA_MODE" = true ]; then
echo " • All downloaded files and configurations"
echo " • All user data and documents"
echo ""
if is_interactive; then
read -p "Are you sure you want to continue? Type 'DELETE' to confirm: " -r
echo ""
if [ "$REPLY" != "DELETE" ]; then
print_info "Operation cancelled."
exit 0
fi
else
print_error "Cannot confirm destructive operation in non-interactive mode."
print_info "Run interactively or remove the ${INSTALL_ROOT} directory manually."
exit 1
read -p "Are you sure you want to continue? Type 'DELETE' to confirm: " -r
echo ""
if [ "$REPLY" != "DELETE" ]; then
print_info "Operation cancelled."
exit 0
fi
print_info "Removing Onyx containers and volumes..."
@@ -323,7 +164,7 @@ if [ "$DELETE_DATA_MODE" = true ]; then
fi
# Stop and remove containers with volumes
(cd "${INSTALL_ROOT}/deployment" && $COMPOSE_CMD $(compose_file_args true) down -v)
(cd "${INSTALL_ROOT}/deployment" && $COMPOSE_CMD -f docker-compose.yml down -v)
if [ $? -eq 0 ]; then
print_success "Onyx containers and volumes removed"
else
@@ -345,117 +186,6 @@ if [ "$DELETE_DATA_MODE" = true ]; then
exit 0
fi
# --- Auto-install Docker (Linux only) ---
# Runs before the banner so a group-based re-exec doesn't repeat it.
install_docker_linux() {
local distro_id=""
if [[ -f /etc/os-release ]]; then
distro_id="$(. /etc/os-release && echo "${ID:-}")"
fi
case "$distro_id" in
amzn)
print_info "Detected Amazon Linux — installing Docker via package manager..."
if command -v dnf &> /dev/null; then
sudo dnf install -y docker
else
sudo yum install -y docker
fi
;;
*)
print_info "Installing Docker via get.docker.com..."
download_file "https://get.docker.com" /tmp/get-docker.sh
sudo sh /tmp/get-docker.sh
rm -f /tmp/get-docker.sh
;;
esac
sudo systemctl start docker 2>/dev/null || sudo service docker start 2>/dev/null || true
sudo systemctl enable docker 2>/dev/null || true
}
# Detect OS (including WSL)
IS_WSL=false
if [[ -n "${WSL_DISTRO_NAME:-}" ]] || grep -qi microsoft /proc/version 2>/dev/null; then
IS_WSL=true
fi
# Dry-run: show plan and exit
if [[ "$DRY_RUN" = true ]]; then
print_info "Dry run mode — showing what would happen:"
echo " • Install root: ${INSTALL_ROOT}"
echo " • Lite mode: ${LITE_MODE}"
echo " • Include Craft: ${INCLUDE_CRAFT}"
echo " • OS type: ${OSTYPE:-unknown} (WSL: ${IS_WSL})"
echo " • Downloader: ${DOWNLOADER}"
echo ""
print_success "Dry run complete (no changes made)"
exit 0
fi
if ! command -v docker &> /dev/null; then
if [[ "$OSTYPE" == "linux-gnu"* ]] || [[ -n "${WSL_DISTRO_NAME:-}" ]]; then
install_docker_linux
if ! command -v docker &> /dev/null; then
print_error "Docker installation failed."
echo " Visit: https://docs.docker.com/get-docker/"
exit 1
fi
print_success "Docker installed successfully"
fi
fi
# --- Auto-install Docker Compose plugin (Linux only) ---
if command -v docker &> /dev/null \
&& ! docker compose version &> /dev/null \
&& ! command -v docker-compose &> /dev/null \
&& { [[ "$OSTYPE" == "linux-gnu"* ]] || [[ -n "${WSL_DISTRO_NAME:-}" ]]; }; then
print_info "Docker Compose not found — installing plugin..."
COMPOSE_ARCH="$(uname -m)"
COMPOSE_URL="https://github.com/docker/compose/releases/latest/download/docker-compose-linux-${COMPOSE_ARCH}"
COMPOSE_DIR="/usr/local/lib/docker/cli-plugins"
COMPOSE_TMP="$(mktemp)"
sudo mkdir -p "$COMPOSE_DIR"
if download_file "$COMPOSE_URL" "$COMPOSE_TMP"; then
sudo mv "$COMPOSE_TMP" "$COMPOSE_DIR/docker-compose"
sudo chmod +x "$COMPOSE_DIR/docker-compose"
if docker compose version &> /dev/null; then
print_success "Docker Compose plugin installed"
else
print_error "Docker Compose plugin installed but not detected."
echo " Visit: https://docs.docker.com/compose/install/"
exit 1
fi
else
rm -f "$COMPOSE_TMP"
print_error "Failed to download Docker Compose plugin."
echo " Visit: https://docs.docker.com/compose/install/"
exit 1
fi
fi
# On Linux, ensure the current user can talk to the Docker daemon without
# sudo. If necessary, add them to the "docker" group and re-exec the
# script under that group so the rest of the install proceeds normally.
if command -v docker &> /dev/null \
&& { [[ "$OSTYPE" == "linux-gnu"* ]] || [[ -n "${WSL_DISTRO_NAME:-}" ]]; } \
&& [[ "$(id -u)" -ne 0 ]] \
&& ! docker info &> /dev/null; then
if [[ "${_ONYX_REEXEC:-}" = "1" ]]; then
print_error "Cannot connect to Docker after group re-exec."
print_info "Log out and back in, then run the script again."
exit 1
fi
if ! getent group docker &> /dev/null; then
sudo groupadd docker
fi
print_info "Adding $USER to the docker group..."
sudo usermod -aG docker "$USER"
print_info "Re-launching with docker group active..."
exec sg docker -c "_ONYX_REEXEC=1 bash $(printf '%q ' "$0" "$@")"
fi
# ASCII Art Banner
echo ""
echo -e "${BLUE}${BOLD}"
@@ -479,7 +209,8 @@ echo "2. Check your system resources (Docker, memory, disk space)"
echo "3. Guide you through deployment options (version, authentication)"
echo ""
if is_interactive; then
# Only prompt for acknowledgment if running interactively
if [ -t 0 ]; then
echo -e "${YELLOW}${BOLD}Please acknowledge and press Enter to continue...${NC}"
read -r
echo ""
@@ -529,35 +260,41 @@ else
exit 1
fi
# Returns 0 if $1 <= $2, 1 if $1 > $2
# Handles missing or non-numeric parts gracefully (treats them as 0)
# Function to compare version numbers
version_compare() {
local version1="${1:-0.0.0}"
local version2="${2:-0.0.0}"
# Returns 0 if $1 <= $2, 1 if $1 > $2
local version1=$1
local version2=$2
local v1_major v1_minor v1_patch v2_major v2_minor v2_patch
v1_major=$(echo "$version1" | cut -d. -f1)
v1_minor=$(echo "$version1" | cut -d. -f2)
v1_patch=$(echo "$version1" | cut -d. -f3)
v2_major=$(echo "$version2" | cut -d. -f1)
v2_minor=$(echo "$version2" | cut -d. -f2)
v2_patch=$(echo "$version2" | cut -d. -f3)
# Split versions into components
local v1_major=$(echo $version1 | cut -d. -f1)
local v1_minor=$(echo $version1 | cut -d. -f2)
local v1_patch=$(echo $version1 | cut -d. -f3)
# Default non-numeric or empty parts to 0
[[ "$v1_major" =~ ^[0-9]+$ ]] || v1_major=0
[[ "$v1_minor" =~ ^[0-9]+$ ]] || v1_minor=0
[[ "$v1_patch" =~ ^[0-9]+$ ]] || v1_patch=0
[[ "$v2_major" =~ ^[0-9]+$ ]] || v2_major=0
[[ "$v2_minor" =~ ^[0-9]+$ ]] || v2_minor=0
[[ "$v2_patch" =~ ^[0-9]+$ ]] || v2_patch=0
local v2_major=$(echo $version2 | cut -d. -f1)
local v2_minor=$(echo $version2 | cut -d. -f2)
local v2_patch=$(echo $version2 | cut -d. -f3)
if [ "$v1_major" -lt "$v2_major" ]; then return 0
elif [ "$v1_major" -gt "$v2_major" ]; then return 1; fi
# Compare major version
if [ "$v1_major" -lt "$v2_major" ]; then
return 0
elif [ "$v1_major" -gt "$v2_major" ]; then
return 1
fi
if [ "$v1_minor" -lt "$v2_minor" ]; then return 0
elif [ "$v1_minor" -gt "$v2_minor" ]; then return 1; fi
# Compare minor version
if [ "$v1_minor" -lt "$v2_minor" ]; then
return 0
elif [ "$v1_minor" -gt "$v2_minor" ]; then
return 1
fi
[ "$v1_patch" -le "$v2_patch" ]
# Compare patch version
if [ "$v1_patch" -le "$v2_patch" ]; then
return 0
else
return 1
fi
}
# Check Docker daemon
@@ -599,20 +336,10 @@ fi
# Convert to GB for display
if [ "$MEMORY_MB" -gt 0 ]; then
MEMORY_GB=$(awk "BEGIN {printf \"%.1f\", $MEMORY_MB / 1024}")
if [ "$(awk "BEGIN {print ($MEMORY_MB >= 1024)}")" = "1" ]; then
MEMORY_DISPLAY="~${MEMORY_GB}GB"
else
MEMORY_DISPLAY="${MEMORY_MB}MB"
fi
if [[ "$OSTYPE" == "darwin"* ]]; then
print_info "Docker memory allocation: ${MEMORY_DISPLAY}"
else
print_info "System memory: ${MEMORY_DISPLAY} (Docker uses host memory directly)"
fi
MEMORY_GB=$((MEMORY_MB / 1024))
print_info "Docker memory allocation: ~${MEMORY_GB}GB"
else
print_warning "Could not determine memory allocation"
MEMORY_DISPLAY="unknown"
print_warning "Could not determine Docker memory allocation"
MEMORY_MB=0
fi
@@ -631,7 +358,7 @@ RESOURCE_WARNING=false
EXPECTED_RAM_MB=$((EXPECTED_DOCKER_RAM_GB * 1024))
if [ "$MEMORY_MB" -gt 0 ] && [ "$MEMORY_MB" -lt "$EXPECTED_RAM_MB" ]; then
print_warning "Less than ${EXPECTED_DOCKER_RAM_GB}GB RAM available (found: ${MEMORY_DISPLAY})"
print_warning "Docker has less than ${EXPECTED_DOCKER_RAM_GB}GB RAM allocated (found: ~${MEMORY_GB}GB)"
RESOURCE_WARNING=true
fi
@@ -642,10 +369,10 @@ fi
if [ "$RESOURCE_WARNING" = true ]; then
echo ""
print_warning "Onyx recommends at least ${EXPECTED_DOCKER_RAM_GB}GB RAM and ${EXPECTED_DISK_GB}GB disk space for optimal performance in standard mode."
print_warning "Lite mode requires less resources (1-4GB RAM, 8-16GB disk depending on usage), but does not include a vector database."
print_warning "Onyx recommends at least ${EXPECTED_DOCKER_RAM_GB}GB RAM and ${EXPECTED_DISK_GB}GB disk space for optimal performance."
echo ""
read -p "Do you want to continue anyway? (y/N): " -n 1 -r
echo ""
prompt_yn_or_default "Do you want to continue anyway? (Y/n): " "y"
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
print_info "Installation cancelled. Please allocate more resources and try again."
exit 1
@@ -658,89 +385,117 @@ print_step "Creating directory structure"
if [ -d "${INSTALL_ROOT}" ]; then
print_info "Directory structure already exists"
print_success "Using existing ${INSTALL_ROOT} directory"
else
mkdir -p "${INSTALL_ROOT}/deployment"
mkdir -p "${INSTALL_ROOT}/data/nginx/local"
print_success "Directory structure created"
fi
mkdir -p "${INSTALL_ROOT}/deployment"
mkdir -p "${INSTALL_ROOT}/data/nginx/local"
print_success "Directory structure created"
# Ensure all required configuration files are present
# Download all required files
print_step "Downloading Onyx configuration files"
print_info "This step downloads all necessary configuration files from GitHub..."
echo ""
print_info "Downloading the following files:"
echo " • docker-compose.yml - Main Docker Compose configuration"
echo " • env.template - Environment variables template"
echo " • nginx/app.conf.template - Nginx web server configuration"
echo " • nginx/run-nginx.sh - Nginx startup script"
echo " • README.md - Documentation and setup instructions"
echo ""
# Download Docker Compose file
COMPOSE_FILE="${INSTALL_ROOT}/deployment/docker-compose.yml"
print_info "Downloading docker-compose.yml..."
if curl -fsSL -o "$COMPOSE_FILE" "${GITHUB_RAW_URL}/docker-compose.yml" 2>/dev/null; then
print_success "Docker Compose file downloaded successfully"
# Check if Docker Compose version is older than 2.24.0 and show warning
# Skip check for dev builds (assume they're recent enough)
if [ "$COMPOSE_VERSION" != "dev" ] && version_compare "$COMPOSE_VERSION" "2.24.0"; then
print_warning "Docker Compose version $COMPOSE_VERSION is older than 2.24.0"
echo ""
print_warning "The docker-compose.yml file uses the newer env_file format that requires Docker Compose 2.24.0 or later."
echo ""
print_info "To use this configuration with your current Docker Compose version, you have two options:"
echo ""
echo "1. Upgrade Docker Compose to version 2.24.0 or later (recommended)"
echo " Visit: https://docs.docker.com/compose/install/"
echo ""
echo "2. Manually replace all env_file sections in docker-compose.yml"
echo " Change from:"
echo " env_file:"
echo " - path: .env"
echo " required: false"
echo " To:"
echo " env_file: .env"
echo ""
print_warning "The installation will continue, but may fail if Docker Compose cannot parse the file."
echo ""
read -p "Do you want to continue anyway? (y/N): " -n 1 -r
echo ""
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
print_info "Installation cancelled. Please upgrade Docker Compose or manually edit the docker-compose.yml file."
exit 1
fi
print_info "Proceeding with installation despite Docker Compose version compatibility issues..."
fi
else
print_error "Failed to download Docker Compose file"
print_info "Please ensure you have internet connection and try again"
exit 1
fi
# Download env.template file
ENV_TEMPLATE="${INSTALL_ROOT}/deployment/env.template"
print_info "Downloading env.template..."
if curl -fsSL -o "$ENV_TEMPLATE" "${GITHUB_RAW_URL}/env.template" 2>/dev/null; then
print_success "Environment template downloaded successfully"
else
print_error "Failed to download env.template"
print_info "Please ensure you have internet connection and try again"
exit 1
fi
# Download nginx config files
NGINX_BASE_URL="https://raw.githubusercontent.com/onyx-dot-app/onyx/main/deployment/data/nginx"
if [[ "$USE_LOCAL_FILES" = true ]]; then
print_step "Verifying existing configuration files"
# Download app.conf.template
NGINX_CONFIG="${INSTALL_ROOT}/data/nginx/app.conf.template"
print_info "Downloading nginx configuration template..."
if curl -fsSL -o "$NGINX_CONFIG" "$NGINX_BASE_URL/app.conf.template" 2>/dev/null; then
print_success "Nginx configuration template downloaded"
else
print_step "Downloading Onyx configuration files"
print_info "This step downloads all necessary configuration files from GitHub..."
print_error "Failed to download nginx configuration template"
print_info "Please ensure you have internet connection and try again"
exit 1
fi
ensure_file "${INSTALL_ROOT}/deployment/docker-compose.yml" \
"${GITHUB_RAW_URL}/docker-compose.yml" "docker-compose.yml" || exit 1
# Check Docker Compose version compatibility after obtaining docker-compose.yml
if [ "$COMPOSE_VERSION" != "dev" ] && version_compare "$COMPOSE_VERSION" "2.24.0"; then
print_warning "Docker Compose version $COMPOSE_VERSION is older than 2.24.0"
echo ""
print_warning "The docker-compose.yml file uses the newer env_file format that requires Docker Compose 2.24.0 or later."
echo ""
print_info "To use this configuration with your current Docker Compose version, you have two options:"
echo ""
echo "1. Upgrade Docker Compose to version 2.24.0 or later (recommended)"
echo " Visit: https://docs.docker.com/compose/install/"
echo ""
echo "2. Manually replace all env_file sections in docker-compose.yml"
echo " Change from:"
echo " env_file:"
echo " - path: .env"
echo " required: false"
echo " To:"
echo " env_file: .env"
echo ""
print_warning "The installation will continue, but may fail if Docker Compose cannot parse the file."
echo ""
prompt_yn_or_default "Do you want to continue anyway? (Y/n): " "y"
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
print_info "Installation cancelled. Please upgrade Docker Compose or manually edit the docker-compose.yml file."
exit 1
fi
print_info "Proceeding with installation despite Docker Compose version compatibility issues..."
# Download run-nginx.sh script
NGINX_RUN_SCRIPT="${INSTALL_ROOT}/data/nginx/run-nginx.sh"
print_info "Downloading nginx startup script..."
if curl -fsSL -o "$NGINX_RUN_SCRIPT" "$NGINX_BASE_URL/run-nginx.sh" 2>/dev/null; then
chmod +x "$NGINX_RUN_SCRIPT"
print_success "Nginx startup script downloaded and made executable"
else
print_error "Failed to download nginx startup script"
print_info "Please ensure you have internet connection and try again"
exit 1
fi
# Handle lite overlay: ensure it if --lite, clean up stale copies otherwise
if [[ "$LITE_MODE" = true ]]; then
ensure_file "${INSTALL_ROOT}/deployment/${LITE_COMPOSE_FILE}" \
"${GITHUB_RAW_URL}/${LITE_COMPOSE_FILE}" "${LITE_COMPOSE_FILE}" || exit 1
elif [[ -f "${INSTALL_ROOT}/deployment/${LITE_COMPOSE_FILE}" ]]; then
if [[ -f "${INSTALL_ROOT}/deployment/.env" ]]; then
print_warning "Existing lite overlay found but --lite was not passed."
prompt_yn_or_default "Remove lite overlay and switch to standard mode? (y/N): " "n"
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
print_info "Keeping existing lite overlay. Pass --lite to keep using lite mode."
LITE_MODE=true
else
rm -f "${INSTALL_ROOT}/deployment/${LITE_COMPOSE_FILE}"
print_info "Removed lite overlay (switching to standard mode)"
fi
else
rm -f "${INSTALL_ROOT}/deployment/${LITE_COMPOSE_FILE}"
print_info "Removed previous lite overlay (switching to standard mode)"
fi
# Download README file
README_FILE="${INSTALL_ROOT}/README.md"
print_info "Downloading README.md..."
if curl -fsSL -o "$README_FILE" "${GITHUB_RAW_URL}/README.md" 2>/dev/null; then
print_success "README.md downloaded successfully"
else
print_error "Failed to download README.md"
print_info "Please ensure you have internet connection and try again"
exit 1
fi
ensure_file "${INSTALL_ROOT}/deployment/env.template" \
"${GITHUB_RAW_URL}/env.template" "env.template" || exit 1
ensure_file "${INSTALL_ROOT}/data/nginx/app.conf.template" \
"$NGINX_BASE_URL/app.conf.template" "nginx/app.conf.template" || exit 1
ensure_file "${INSTALL_ROOT}/data/nginx/run-nginx.sh" \
"$NGINX_BASE_URL/run-nginx.sh" "nginx/run-nginx.sh" || exit 1
chmod +x "${INSTALL_ROOT}/data/nginx/run-nginx.sh"
ensure_file "${INSTALL_ROOT}/README.md" \
"${GITHUB_RAW_URL}/README.md" "README.md" || exit 1
# Create empty local directory marker (if needed)
touch "${INSTALL_ROOT}/data/nginx/local/.gitkeep"
print_success "All configuration files ready"
print_success "All configuration files downloaded successfully"
# Set up deployment configuration
print_step "Setting up deployment configs"
@@ -758,7 +513,7 @@ if [ -d "${INSTALL_ROOT}/deployment" ] && [ -f "${INSTALL_ROOT}/deployment/docke
if [ -n "$COMPOSE_CMD" ]; then
# Check if any containers are running
RUNNING_CONTAINERS=$(cd "${INSTALL_ROOT}/deployment" && $COMPOSE_CMD $(compose_file_args true) ps -q 2>/dev/null | wc -l)
RUNNING_CONTAINERS=$(cd "${INSTALL_ROOT}/deployment" && $COMPOSE_CMD -f docker-compose.yml ps -q 2>/dev/null | wc -l)
if [ "$RUNNING_CONTAINERS" -gt 0 ]; then
print_error "Onyx services are currently running!"
echo ""
@@ -779,7 +534,7 @@ if [ -f "$ENV_FILE" ]; then
echo "• Press Enter to restart with current configuration"
echo "• Type 'update' to update to a newer version"
echo ""
prompt_or_default "Choose an option [default: restart]: " ""
read -p "Choose an option [default: restart]: " -r
echo ""
if [ "$REPLY" = "update" ]; then
@@ -788,30 +543,26 @@ if [ -f "$ENV_FILE" ]; then
echo "• Press Enter for latest (recommended)"
echo "• Type a specific tag (e.g., v0.1.0)"
echo ""
# If --include-craft was passed, default to craft-latest
if [ "$INCLUDE_CRAFT" = true ]; then
prompt_or_default "Enter tag [default: craft-latest]: " "craft-latest"
VERSION="$REPLY"
read -p "Enter tag [default: craft-latest]: " -r VERSION
else
prompt_or_default "Enter tag [default: latest]: " "latest"
VERSION="$REPLY"
read -p "Enter tag [default: latest]: " -r VERSION
fi
echo ""
if [ "$INCLUDE_CRAFT" = true ] && [ "$VERSION" = "craft-latest" ]; then
print_info "Selected: craft-latest (Craft enabled)"
elif [ "$VERSION" = "latest" ]; then
print_info "Selected: Latest version"
if [ -z "$VERSION" ]; then
if [ "$INCLUDE_CRAFT" = true ]; then
VERSION="craft-latest"
print_info "Selected: craft-latest (Craft enabled)"
else
VERSION="latest"
print_info "Selected: Latest version"
fi
else
print_info "Selected: $VERSION"
fi
# Reject craft image tags when running in lite mode
if [[ "$LITE_MODE" = true ]] && [[ "${VERSION:-}" == craft-* ]]; then
print_error "Cannot use a craft image tag (${VERSION}) with --lite."
print_info "Craft requires services (Vespa, Redis, background workers) that lite mode disables."
exit 1
fi
# Update .env file with new version
print_info "Updating configuration for version $VERSION..."
if grep -q "^IMAGE_TAG=" "$ENV_FILE"; then
@@ -830,67 +581,13 @@ if [ -f "$ENV_FILE" ]; then
fi
print_success "Configuration updated for upgrade"
else
# Reject restarting a craft deployment in lite mode
EXISTING_TAG=$(grep "^IMAGE_TAG=" "$ENV_FILE" | head -1 | cut -d'=' -f2 | tr -d ' "'"'"'')
if [[ "$LITE_MODE" = true ]] && [[ "${EXISTING_TAG:-}" == craft-* ]]; then
print_error "Cannot restart a craft deployment (${EXISTING_TAG}) with --lite."
print_info "Craft requires services (Vespa, Redis, background workers) that lite mode disables."
exit 1
fi
print_info "Keeping existing configuration..."
print_success "Will restart with current settings"
fi
# Ensure COMPOSE_PROFILES is cleared when running in lite mode on an
# existing .env (the template ships with s3-filestore enabled).
if [[ "$LITE_MODE" = true ]] && grep -q "^COMPOSE_PROFILES=.*s3-filestore" "$ENV_FILE" 2>/dev/null; then
sed -i.bak 's/^COMPOSE_PROFILES=.*/COMPOSE_PROFILES=/' "$ENV_FILE" 2>/dev/null || true
print_success "Cleared COMPOSE_PROFILES for lite mode"
fi
else
print_info "No existing .env file found. Setting up new deployment..."
echo ""
# Ask for deployment mode (standard vs lite) unless already set via --lite flag
if [[ "$LITE_MODE" = false ]]; then
print_info "Which deployment mode would you like?"
echo ""
echo " 1) Standard - Full deployment with search, connectors, and RAG"
echo " 2) Lite - Minimal deployment (no Vespa, Redis, or model servers)"
echo " LLM chat, tools, file uploads, and Projects still work"
echo ""
prompt_or_default "Choose a mode (1 or 2) [default: 1]: " "1"
echo ""
case "$REPLY" in
2)
LITE_MODE=true
print_info "Selected: Lite mode"
ensure_file "${INSTALL_ROOT}/deployment/${LITE_COMPOSE_FILE}" \
"${GITHUB_RAW_URL}/${LITE_COMPOSE_FILE}" "${LITE_COMPOSE_FILE}" || exit 1
;;
*)
print_info "Selected: Standard mode"
;;
esac
else
print_info "Deployment mode: Lite (set via --lite flag)"
fi
# Validate lite + craft combination (could now be set interactively)
if [[ "$LITE_MODE" = true ]] && [[ "$INCLUDE_CRAFT" = true ]]; then
print_error "--include-craft cannot be used with Lite mode."
print_info "Craft requires services (Vespa, Redis, background workers) that lite mode disables."
exit 1
fi
# Adjust resource expectations for lite mode
if [[ "$LITE_MODE" = true ]]; then
EXPECTED_DOCKER_RAM_GB=4
EXPECTED_DISK_GB=16
fi
# Ask for version
print_info "Which tag would you like to deploy?"
echo ""
@@ -898,21 +595,23 @@ else
echo "• Press Enter for craft-latest (recommended for Craft)"
echo "• Type a specific tag (e.g., craft-v1.0.0)"
echo ""
prompt_or_default "Enter tag [default: craft-latest]: " "craft-latest"
VERSION="$REPLY"
read -p "Enter tag [default: craft-latest]: " -r VERSION
else
echo "• Press Enter for latest (recommended)"
echo "• Type a specific tag (e.g., v0.1.0)"
echo ""
prompt_or_default "Enter tag [default: latest]: " "latest"
VERSION="$REPLY"
read -p "Enter tag [default: latest]: " -r VERSION
fi
echo ""
if [ "$INCLUDE_CRAFT" = true ] && [ "$VERSION" = "craft-latest" ]; then
print_info "Selected: craft-latest (Craft enabled)"
elif [ "$VERSION" = "latest" ]; then
print_info "Selected: Latest tag"
if [ -z "$VERSION" ]; then
if [ "$INCLUDE_CRAFT" = true ]; then
VERSION="craft-latest"
print_info "Selected: craft-latest (Craft enabled)"
else
VERSION="latest"
print_info "Selected: Latest tag"
fi
else
print_info "Selected: $VERSION"
fi
@@ -946,13 +645,6 @@ else
# Use basic auth by default
AUTH_SCHEMA="basic"
# Reject craft image tags when running in lite mode (must check before writing .env)
if [[ "$LITE_MODE" = true ]] && [[ "${VERSION:-}" == craft-* ]]; then
print_error "Cannot use a craft image tag (${VERSION}) with --lite."
print_info "Craft requires services (Vespa, Redis, background workers) that lite mode disables."
exit 1
fi
# Create .env file from template
print_info "Creating .env file with your selections..."
cp "$ENV_TEMPLATE" "$ENV_FILE"
@@ -962,13 +654,6 @@ else
sed -i.bak "s/^IMAGE_TAG=.*/IMAGE_TAG=$VERSION/" "$ENV_FILE"
print_success "IMAGE_TAG set to $VERSION"
# In lite mode, clear COMPOSE_PROFILES so profiled services (MinIO, etc.)
# stay disabled — the template ships with s3-filestore enabled by default.
if [[ "$LITE_MODE" = true ]]; then
sed -i.bak 's/^COMPOSE_PROFILES=.*/COMPOSE_PROFILES=/' "$ENV_FILE" 2>/dev/null || true
print_success "Cleared COMPOSE_PROFILES for lite mode"
fi
# Configure basic authentication (default)
sed -i.bak 's/^AUTH_TYPE=.*/AUTH_TYPE=basic/' "$ENV_FILE" 2>/dev/null || true
print_success "Basic authentication enabled in configuration"
@@ -1089,7 +774,7 @@ print_step "Pulling Docker images"
print_info "This may take several minutes depending on your internet connection..."
echo ""
print_info "Downloading Docker images (this may take a while)..."
(cd "${INSTALL_ROOT}/deployment" && $COMPOSE_CMD $(compose_file_args) pull --quiet)
(cd "${INSTALL_ROOT}/deployment" && $COMPOSE_CMD -f docker-compose.yml pull --quiet)
if [ $? -eq 0 ]; then
print_success "Docker images downloaded successfully"
else
@@ -1103,9 +788,9 @@ print_info "Launching containers..."
echo ""
if [ "$USE_LATEST" = true ]; then
print_info "Force pulling latest images and recreating containers..."
(cd "${INSTALL_ROOT}/deployment" && $COMPOSE_CMD $(compose_file_args) up -d --pull always --force-recreate)
(cd "${INSTALL_ROOT}/deployment" && $COMPOSE_CMD -f docker-compose.yml up -d --pull always --force-recreate)
else
(cd "${INSTALL_ROOT}/deployment" && $COMPOSE_CMD $(compose_file_args) up -d)
(cd "${INSTALL_ROOT}/deployment" && $COMPOSE_CMD -f docker-compose.yml up -d)
fi
if [ $? -ne 0 ]; then
print_error "Failed to start Onyx services"
@@ -1127,7 +812,7 @@ echo ""
# Check for restart loops
print_info "Checking container health status..."
RESTART_ISSUES=false
CONTAINERS=$(cd "${INSTALL_ROOT}/deployment" && $COMPOSE_CMD $(compose_file_args) ps -q 2>/dev/null)
CONTAINERS=$(cd "${INSTALL_ROOT}/deployment" && $COMPOSE_CMD -f docker-compose.yml ps -q 2>/dev/null)
for CONTAINER in $CONTAINERS; do
PROJECT_NAME="$(basename "$INSTALL_ROOT")_deployment_"
@@ -1156,7 +841,7 @@ if [ "$RESTART_ISSUES" = true ]; then
print_error "Some containers are experiencing issues!"
echo ""
print_info "Please check the logs for more information:"
echo " (cd \"${INSTALL_ROOT}/deployment\" && $COMPOSE_CMD $(compose_file_args) logs)"
echo " (cd \"${INSTALL_ROOT}/deployment\" && $COMPOSE_CMD -f docker-compose.yml logs)"
echo ""
print_info "If the issue persists, please contact: founders@onyx.app"
@@ -1175,12 +860,8 @@ check_onyx_health() {
echo ""
while [ $attempt -le $max_attempts ]; do
local http_code=""
if [[ "$DOWNLOADER" == "curl" ]]; then
http_code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:$port" 2>/dev/null || echo "000")
else
http_code=$(wget -q --spider -S "http://localhost:$port" 2>&1 | grep "HTTP/" | tail -1 | awk '{print $2}' || echo "000")
fi
# Check for successful HTTP responses (200, 301, 302, etc.)
local http_code=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:$port")
if echo "$http_code" | grep -qE "^(200|301|302|303|307|308)$"; then
return 0
fi
@@ -1236,18 +917,6 @@ print_info "If authentication is enabled, you can create your admin account here
echo " • Visit http://localhost:${HOST_PORT}/auth/signup to create your admin account"
echo " • The first user created will automatically have admin privileges"
echo ""
if [[ "$LITE_MODE" = true ]]; then
echo ""
print_info "Running in Lite mode — the following services are NOT started:"
echo " • Vespa (vector database)"
echo " • Redis (cache)"
echo " • Model servers (embedding/inference)"
echo " • Background workers (Celery)"
echo ""
print_info "Connectors and RAG search are disabled. LLM chat, tools, user file"
print_info "uploads, Projects, Agent knowledge, and code interpreter still work."
fi
echo ""
print_info "Refer to the README in the ${INSTALL_ROOT} directory for more information."
echo ""
print_info "For help or issues, contact: founders@onyx.app"

View File

@@ -1,7 +1,6 @@
"use client";
import { ReactNode, useState } from "react";
import { cn } from "@/lib/utils";
import { useState } from "react";
import { ChatFileType, FileDescriptor } from "@/app/app/interfaces";
import Attachment from "@/refresh-components/Attachment";
import { InMessageImage } from "@/app/app/components/files/images/InMessageImage";
@@ -10,27 +9,10 @@ import PreviewModal from "@/sections/modals/PreviewModal";
import { MinimalOnyxDocument } from "@/lib/search/interfaces";
import ExpandableContentWrapper from "@/components/tools/ExpandableContentWrapper";
interface FileContainerProps {
children: ReactNode;
className?: string;
id?: string;
}
interface FileDisplayProps {
files: FileDescriptor[];
}
function FileContainer({ children, className, id }: FileContainerProps) {
return (
<div
id={id}
className={cn("flex w-full flex-col items-end gap-2 py-2", className)}
>
{children}
</div>
);
}
export default function FileDisplay({ files }: FileDisplayProps) {
const [close, setClose] = useState(true);
const [previewingFile, setPreviewingFile] = useState<FileDescriptor | null>(
@@ -59,7 +41,7 @@ export default function FileDisplay({ files }: FileDisplayProps) {
)}
{textFiles.length > 0 && (
<FileContainer id="onyx-file">
<div id="onyx-file" className="flex flex-col items-end gap-2 py-2">
{textFiles.map((file) => (
<Attachment
key={file.id}
@@ -67,36 +49,40 @@ export default function FileDisplay({ files }: FileDisplayProps) {
open={() => setPreviewingFile(file)}
/>
))}
</FileContainer>
</div>
)}
{imageFiles.length > 0 && (
<FileContainer id="onyx-image">
<div id="onyx-image" className="flex flex-col items-end gap-2 py-2">
{imageFiles.map((file) => (
<InMessageImage key={file.id} fileId={file.id} />
))}
</FileContainer>
</div>
)}
{csvFiles.length > 0 && (
<FileContainer className="overflow-auto">
{csvFiles.map((file) =>
close ? (
<ExpandableContentWrapper
key={file.id}
fileDescriptor={file}
close={() => setClose(false)}
ContentComponent={CsvContent}
/>
) : (
<Attachment
key={file.id}
open={() => setClose(true)}
fileName={file.name || file.id}
/>
)
)}
</FileContainer>
<div className="flex flex-col items-end gap-2 py-2">
{csvFiles.map((file) => {
return (
<div key={file.id} className="w-fit">
{close ? (
<>
<ExpandableContentWrapper
fileDescriptor={file}
close={() => setClose(false)}
ContentComponent={CsvContent}
/>
</>
) : (
<Attachment
open={() => setClose(true)}
fileName={file.name || file.id}
/>
)}
</div>
);
})}
</div>
)}
</>
);

View File

@@ -40,7 +40,12 @@ export default function ExpandableContentWrapper({
};
const Content = (
<div className="w-message-default max-w-full !rounded-lg overflow-y-hidden h-full">
<div
className={cn(
!expanded ? "w-message-default" : "w-full",
"!rounded !rounded-lg overflow-y-hidden h-full"
)}
>
<CardHeader className="w-full bg-background-tint-02 top-0 p-3">
<div className="flex justify-between items-center">
<Text className="text-ellipsis line-clamp-1" text03 mainUiAction>
@@ -78,10 +83,12 @@ export default function ExpandableContentWrapper({
)}
>
<CardContent className="p-0">
<ContentComponent
fileDescriptor={fileDescriptor}
expanded={expanded}
/>
{!expanded && (
<ContentComponent
fileDescriptor={fileDescriptor}
expanded={expanded}
/>
)}
</CardContent>
</Card>
</div>