Compare commits

...

3 Commits

Author SHA1 Message Date
Chris Weaver
85311e7ac0 fix: add jira auto-sync option in UI (#5260)
* Add jira auto-sync option in UI

* Fix build
2025-08-26 13:20:20 -07:00
Sam Waddell
005f5f72b6 fix: update all log paths to reflect change related to non-root user (#5244) 2025-08-25 17:03:28 -07:00
Evan Lohn
a17e5c3333 fix: downloads are never larger than 20mb (#5247)
* fix: downloads are never larger than 20mb

* JT comments

* import to fix integration tests
2025-08-25 13:45:00 -07:00
10 changed files with 43 additions and 31 deletions

View File

@@ -14,7 +14,7 @@ logger = setup_logger()
# Only set up memory monitoring in container environment
if is_running_in_container():
# Set up a dedicated memory monitoring logger
MEMORY_LOG_DIR = "/var/log/memory"
MEMORY_LOG_DIR = "/var/log/onyx/memory"
MEMORY_LOG_FILE = os.path.join(MEMORY_LOG_DIR, "memory_usage.log")
MEMORY_LOG_MAX_BYTES = 10 * 1024 * 1024 # 10MB
MEMORY_LOG_BACKUP_COUNT = 5 # Keep 5 backup files

View File

@@ -52,6 +52,7 @@ SMART_CHIP_CHAR = "\ue907"
WEB_VIEW_LINK_KEY = "webViewLink"
MAX_RETRIEVER_EMAILS = 20
CHUNK_SIZE_BUFFER = 64 # extra bytes past the limit to read
# Mapping of Google Drive mime types to export formats
GOOGLE_MIME_TYPES_TO_EXPORT = {
@@ -97,18 +98,31 @@ def is_gdrive_image_mime_type(mime_type: str) -> bool:
return is_valid_image_type(mime_type)
def download_request(service: GoogleDriveService, file_id: str) -> bytes:
def download_request(
service: GoogleDriveService, file_id: str, size_threshold: int
) -> bytes:
"""
Download the file from Google Drive.
"""
# Build the media-download request for this file's raw content;
# the size-limited download itself is performed by _download_request.
request = service.files().get_media(fileId=file_id)
return _download_request(request, file_id, size_threshold)
def _download_request(request: Any, file_id: str, size_threshold: int) -> bytes:
response_bytes = io.BytesIO()
downloader = MediaIoBaseDownload(response_bytes, request)
downloader = MediaIoBaseDownload(
response_bytes, request, chunksize=size_threshold + CHUNK_SIZE_BUFFER
)
done = False
while not done:
_, done = downloader.next_chunk()
download_progress, done = downloader.next_chunk()
if download_progress.resumable_progress > size_threshold:
logger.warning(
f"File {file_id} exceeds size threshold of {size_threshold}. Skipping2."
)
return bytes()
response = response_bytes.getvalue()
if not response:
@@ -121,6 +135,7 @@ def _download_and_extract_sections_basic(
file: dict[str, str],
service: GoogleDriveService,
allow_images: bool,
size_threshold: int,
) -> list[TextSection | ImageSection]:
"""Extract text and images from a Google Drive file."""
file_id = file["id"]
@@ -132,7 +147,7 @@ def _download_and_extract_sections_basic(
# Use the correct API call for downloading files
# lazy evaluation to only download the file if necessary
def response_call() -> bytes:
return download_request(service, file_id)
return download_request(service, file_id, size_threshold)
if is_gdrive_image_mime_type(mime_type):
# Skip images if not explicitly enabled
@@ -162,13 +177,7 @@ def _download_and_extract_sections_basic(
request = service.files().export_media(
fileId=file_id, mimeType=export_mime_type
)
response_bytes = io.BytesIO()
downloader = MediaIoBaseDownload(response_bytes, request)
done = False
while not done:
_, done = downloader.next_chunk()
response = response_bytes.getvalue()
response = _download_request(request, file_id, size_threshold)
if not response:
logger.warning(f"Failed to export {file_name} as {export_mime_type}")
return []
@@ -467,7 +476,7 @@ def _convert_drive_item_to_document(
" aligning with basic sections"
)
basic_sections = _download_and_extract_sections_basic(
file, _get_drive_service(), allow_images
file, _get_drive_service(), allow_images, size_threshold
)
sections = align_basic_advanced(basic_sections, doc_sections)
@@ -478,7 +487,7 @@ def _convert_drive_item_to_document(
# Not Google Doc, attempt basic extraction
else:
sections = _download_and_extract_sections_basic(
file, _get_drive_service(), allow_images
file, _get_drive_service(), allow_images, size_threshold
)
# If we still don't have any sections, skip this file

View File

@@ -40,6 +40,7 @@ langchainhub==0.1.21
langgraph==0.2.72
langgraph-checkpoint==2.0.13
langgraph-sdk==0.1.44
lazy_imports==1.0.1
litellm==1.72.2
lxml==5.3.0
lxml_html_clean==0.2.2

View File

@@ -148,7 +148,7 @@ services:
max-file: "6"
# optional, only for debugging purposes
volumes:
- api_server_logs:/var/log
- api_server_logs:/var/log/onyx
background:
image: onyxdotapp/onyx-backend:${IMAGE_TAG:-latest}
@@ -286,7 +286,7 @@ services:
- "host.docker.internal:host-gateway"
# optional, only for debugging purposes
volumes:
- background_logs:/var/log
- background_logs:/var/log/onyx
logging:
driver: json-file
options:
@@ -356,7 +356,7 @@ services:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/
# optional, only for debugging purposes
- inference_model_server_logs:/var/log
- inference_model_server_logs:/var/log/onyx
logging:
driver: json-file
options:
@@ -390,7 +390,7 @@ services:
# Not necessary, this is just to reduce download time during startup
- indexing_huggingface_model_cache:/root/.cache/huggingface/
# optional, only for debugging purposes
- indexing_model_server_logs:/var/log
- indexing_model_server_logs:/var/log/onyx
logging:
driver: json-file
options:

View File

@@ -118,7 +118,7 @@ services:
max-file: "6"
volumes:
# optional, only for debugging purposes
- api_server_logs:/var/log
- api_server_logs:/var/log/onyx
background:
image: onyxdotapp/onyx-backend:${IMAGE_TAG:-latest}
@@ -232,7 +232,7 @@ services:
- "host.docker.internal:host-gateway"
# optional, only for debugging purposes
volumes:
- background_logs:/var/log
- background_logs:/var/log/onyx
logging:
driver: json-file
options:
@@ -295,7 +295,7 @@ services:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/
# optional, only for debugging purposes
- inference_model_server_logs:/var/log
- inference_model_server_logs:/var/log/onyx
logging:
driver: json-file
options:
@@ -334,7 +334,7 @@ services:
# Not necessary, this is just to reduce download time during startup
- indexing_huggingface_model_cache:/root/.cache/huggingface/
# optional, only for debugging purposes
- indexing_model_server_logs:/var/log
- indexing_model_server_logs:/var/log/onyx
logging:
driver: json-file
options:

View File

@@ -43,7 +43,7 @@ services:
max-file: "6"
volumes:
# optional, only for debugging purposes
- api_server_logs:/var/log
- api_server_logs:/var/log/onyx
background:
@@ -82,7 +82,7 @@ services:
extra_hosts:
- "host.docker.internal:host-gateway"
volumes:
- background_logs:/var/log
- background_logs:/var/log/onyx
logging:
driver: json-file
options:
@@ -136,7 +136,7 @@ services:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/
# optional, only for debugging purposes
- inference_model_server_logs:/var/log
- inference_model_server_logs:/var/log/onyx
logging:
driver: json-file
options:
@@ -166,7 +166,7 @@ services:
# Not necessary, this is just to reduce download time during startup
- indexing_huggingface_model_cache:/root/.cache/huggingface/
# optional, only for debugging purposes
- indexing_model_server_logs:/var/log
- indexing_model_server_logs:/var/log/onyx
logging:
driver: json-file
options:

View File

@@ -43,7 +43,7 @@ services:
max-size: "50m"
max-file: "6"
volumes:
- api_server_logs:/var/log
- api_server_logs:/var/log/onyx
background:
image: onyxdotapp/onyx-backend:${IMAGE_TAG:-latest}
@@ -86,7 +86,7 @@ services:
extra_hosts:
- "host.docker.internal:host-gateway"
volumes:
- background_logs:/var/log
- background_logs:/var/log/onyx
logging:
driver: json-file
options:
@@ -164,7 +164,7 @@ services:
# Not necessary, this is just to reduce download time during startup
- model_cache_huggingface:/root/.cache/huggingface/
# optional, only for debugging purposes
- inference_model_server_logs:/var/log
- inference_model_server_logs:/var/log/onyx
logging:
driver: json-file
options:
@@ -194,7 +194,7 @@ services:
# Not necessary, this is just to reduce download time during startup
- indexing_huggingface_model_cache:/root/.cache/huggingface/
# optional, only for debugging purposes
- indexing_model_server_logs:/var/log
- indexing_model_server_logs:/var/log/onyx
logging:
driver: json-file
options:

View File

@@ -74,7 +74,7 @@ services:
extra_hosts:
- "host.docker.internal:host-gateway"
volumes:
- log_store:/var/log
- log_store:/var/log/onyx
logging:
driver: json-file
options:

View File

@@ -12,6 +12,7 @@ export const autoSyncConfigBySource: Record<
>
> = {
confluence: {},
jira: {},
google_drive: {},
gmail: {},
github: {},

View File

@@ -451,6 +451,7 @@ export const federatedSourceToRegularSource = (
export const validAutoSyncSources = [
ValidSources.Confluence,
ValidSources.Jira,
ValidSources.GoogleDrive,
ValidSources.Gmail,
ValidSources.Slack,