Compare commits

..

3 Commits

Author SHA1 Message Date
Jamison Lahman
d4a96d70f3 fix(desktop): prefer native scrollbar styling (#9879) 2026-04-03 00:33:18 +00:00
Evan Lohn
5b000c2173 chore: remove unused db rows (#9869) 2026-04-02 22:17:10 +00:00
acaprau
d62af28e40 fix(opensearch): Doc IDs whose length would exceed OpenSearch's ID length are hashed (#9847) 2026-04-02 21:35:17 +00:00
7 changed files with 280 additions and 130 deletions

View File

@@ -1,20 +1,14 @@
from datetime import datetime
from datetime import timezone
from uuid import UUID
from celery import shared_task
from celery import Task
from ee.onyx.background.celery_utils import should_perform_chat_ttl_check
from ee.onyx.background.task_name_builders import name_chat_ttl_task
from onyx.configs.app_configs import JOB_TIMEOUT
from onyx.configs.constants import OnyxCeleryTask
from onyx.db.chat import delete_chat_session
from onyx.db.chat import get_chat_sessions_older_than
from onyx.db.engine.sql_engine import get_session_with_current_tenant
from onyx.db.enums import TaskStatus
from onyx.db.tasks import mark_task_as_finished_with_id
from onyx.db.tasks import register_task
from onyx.server.settings.store import load_settings
from onyx.utils.logger import setup_logger
@@ -29,26 +23,16 @@ logger = setup_logger()
trail=False,
)
def perform_ttl_management_task(
self: Task, retention_limit_days: int, *, tenant_id: str
self: Task, retention_limit_days: int, *, tenant_id: str # noqa: ARG001
) -> None:
task_id = self.request.id
if not task_id:
raise RuntimeError("No task id defined for this task; cannot identify it")
start_time = datetime.now(tz=timezone.utc)
user_id: UUID | None = None
session_id: UUID | None = None
try:
with get_session_with_current_tenant() as db_session:
# we generally want to move off this, but keeping for now
register_task(
db_session=db_session,
task_name=name_chat_ttl_task(retention_limit_days, tenant_id),
task_id=task_id,
status=TaskStatus.STARTED,
start_time=start_time,
)
old_chat_sessions = get_chat_sessions_older_than(
retention_limit_days, db_session
@@ -65,23 +49,10 @@ def perform_ttl_management_task(
hard_delete=True,
)
with get_session_with_current_tenant() as db_session:
mark_task_as_finished_with_id(
db_session=db_session,
task_id=task_id,
success=True,
)
except Exception:
logger.exception(
f"delete_chat_session exceptioned. user_id={user_id} session_id={session_id}"
)
with get_session_with_current_tenant() as db_session:
mark_task_as_finished_with_id(
db_session=db_session,
task_id=task_id,
success=False,
)
raise

View File

@@ -1,3 +1,4 @@
import hashlib
from datetime import datetime
from datetime import timezone
from typing import Any
@@ -20,9 +21,13 @@ from onyx.document_index.opensearch.constants import DEFAULT_MAX_CHUNK_SIZE
from onyx.document_index.opensearch.constants import EF_CONSTRUCTION
from onyx.document_index.opensearch.constants import EF_SEARCH
from onyx.document_index.opensearch.constants import M
from onyx.document_index.opensearch.string_filtering import DocumentIDTooLongError
from onyx.document_index.opensearch.string_filtering import (
filter_and_validate_document_id,
)
from onyx.document_index.opensearch.string_filtering import (
MAX_DOCUMENT_ID_ENCODED_LENGTH,
)
from onyx.utils.tenant import get_tenant_id_short_string
from shared_configs.configs import MULTI_TENANT
from shared_configs.contextvars import get_current_tenant_id
@@ -75,17 +80,50 @@ def get_opensearch_doc_chunk_id(
This will be the string used to identify the chunk in OpenSearch. Any direct
chunk queries should use this function.
If the document ID is too long, a hash of the ID is used instead.
"""
sanitized_document_id = filter_and_validate_document_id(document_id)
opensearch_doc_chunk_id = (
f"{sanitized_document_id}__{max_chunk_size}__{chunk_index}"
opensearch_doc_chunk_id_suffix: str = f"__{max_chunk_size}__{chunk_index}"
encoded_suffix_length: int = len(opensearch_doc_chunk_id_suffix.encode("utf-8"))
max_encoded_permissible_doc_id_length: int = (
MAX_DOCUMENT_ID_ENCODED_LENGTH - encoded_suffix_length
)
opensearch_doc_chunk_id_tenant_prefix: str = ""
if tenant_state.multitenant:
short_tenant_id: str = get_tenant_id_short_string(tenant_state.tenant_id)
# Use tenant ID because in multitenant mode each tenant has its own
# Documents table, so there is a very small chance that doc IDs are not
# actually unique across all tenants.
short_tenant_id = get_tenant_id_short_string(tenant_state.tenant_id)
opensearch_doc_chunk_id = f"{short_tenant_id}__{opensearch_doc_chunk_id}"
opensearch_doc_chunk_id_tenant_prefix = f"{short_tenant_id}__"
encoded_prefix_length: int = len(
opensearch_doc_chunk_id_tenant_prefix.encode("utf-8")
)
max_encoded_permissible_doc_id_length -= encoded_prefix_length
try:
sanitized_document_id: str = filter_and_validate_document_id(
document_id, max_encoded_length=max_encoded_permissible_doc_id_length
)
except DocumentIDTooLongError:
# If the document ID is too long, use a hash instead.
# We use blake2b because it is faster and equally secure as SHA256, and
# accepts digest_size which controls the number of bytes returned in the
# hash.
# digest_size is the size of the returned hash in bytes. Since we're
# decoding the hash bytes as a hex string, the digest_size should be
# half the max target size of the hash string.
# Subtract 1 because filter_and_validate_document_id compares on >= on
# max_encoded_length.
# 64 is the max digest_size blake2b returns.
digest_size: int = min((max_encoded_permissible_doc_id_length - 1) // 2, 64)
sanitized_document_id = hashlib.blake2b(
document_id.encode("utf-8"), digest_size=digest_size
).hexdigest()
opensearch_doc_chunk_id: str = (
f"{opensearch_doc_chunk_id_tenant_prefix}{sanitized_document_id}{opensearch_doc_chunk_id_suffix}"
)
# Do one more validation to ensure we haven't exceeded the max length.
opensearch_doc_chunk_id = filter_and_validate_document_id(opensearch_doc_chunk_id)
return opensearch_doc_chunk_id

View File

@@ -1,7 +1,15 @@
import re
MAX_DOCUMENT_ID_ENCODED_LENGTH: int = 512
def filter_and_validate_document_id(document_id: str) -> str:
class DocumentIDTooLongError(ValueError):
"""Raised when a document ID is too long for OpenSearch after filtering."""
def filter_and_validate_document_id(
document_id: str, max_encoded_length: int = MAX_DOCUMENT_ID_ENCODED_LENGTH
) -> str:
"""
Filters and validates a document ID such that it can be used as an ID in
OpenSearch.
@@ -19,9 +27,13 @@ def filter_and_validate_document_id(document_id: str) -> str:
Args:
document_id: The document ID to filter and validate.
max_encoded_length: The maximum length of the document ID after
filtering in bytes. Compared with >= for extra resilience, so
encoded values of this length will fail.
Raises:
ValueError: If the document ID is empty or too long after filtering.
DocumentIDTooLongError: If the document ID is too long after filtering.
ValueError: If the document ID is empty after filtering.
Returns:
str: The filtered document ID.
@@ -29,6 +41,8 @@ def filter_and_validate_document_id(document_id: str) -> str:
filtered_document_id = re.sub(r"[^A-Za-z0-9_.\-~]", "", document_id)
if not filtered_document_id:
raise ValueError(f"Document ID {document_id} is empty after filtering.")
if len(filtered_document_id.encode("utf-8")) >= 512:
raise ValueError(f"Document ID {document_id} is too long after filtering.")
if len(filtered_document_id.encode("utf-8")) >= max_encoded_length:
raise DocumentIDTooLongError(
f"Document ID {document_id} is too long after filtering."
)
return filtered_document_id

View File

@@ -0,0 +1,203 @@
import pytest
from onyx.document_index.interfaces_new import TenantState
from onyx.document_index.opensearch.constants import DEFAULT_MAX_CHUNK_SIZE
from onyx.document_index.opensearch.schema import get_opensearch_doc_chunk_id
from onyx.document_index.opensearch.string_filtering import (
MAX_DOCUMENT_ID_ENCODED_LENGTH,
)
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE
SINGLE_TENANT_STATE = TenantState(
tenant_id=POSTGRES_DEFAULT_SCHEMA_STANDARD_VALUE, multitenant=False
)
MULTI_TENANT_STATE = TenantState(
tenant_id="tenant_abcdef12-3456-7890-abcd-ef1234567890", multitenant=True
)
EXPECTED_SHORT_TENANT = "abcdef12"
class TestGetOpensearchDocChunkIdSingleTenant:
def test_basic(self) -> None:
result = get_opensearch_doc_chunk_id(
SINGLE_TENANT_STATE, "my-doc-id", chunk_index=0
)
assert result == f"my-doc-id__{DEFAULT_MAX_CHUNK_SIZE}__0"
def test_custom_chunk_size(self) -> None:
result = get_opensearch_doc_chunk_id(
SINGLE_TENANT_STATE, "doc1", chunk_index=3, max_chunk_size=1024
)
assert result == "doc1__1024__3"
def test_special_chars_are_stripped(self) -> None:
"""Tests characters not matching [A-Za-z0-9_.-~] are removed."""
result = get_opensearch_doc_chunk_id(
SINGLE_TENANT_STATE, "doc/with?special#chars&more%stuff", chunk_index=0
)
assert "/" not in result
assert "?" not in result
assert "#" not in result
assert result == f"docwithspecialcharsmorestuff__{DEFAULT_MAX_CHUNK_SIZE}__0"
def test_short_doc_id_not_hashed(self) -> None:
"""
Tests that a short doc ID should appear directly in the result, not as a
hash.
"""
doc_id = "short-id"
result = get_opensearch_doc_chunk_id(SINGLE_TENANT_STATE, doc_id, chunk_index=0)
assert "short-id" in result
def test_long_doc_id_is_hashed(self) -> None:
"""
Tests that a doc ID exceeding the max length should be replaced with a
blake2b hash.
"""
# Create a doc ID that will exceed max length after the suffix is
# appended.
doc_id = "a" * MAX_DOCUMENT_ID_ENCODED_LENGTH
result = get_opensearch_doc_chunk_id(SINGLE_TENANT_STATE, doc_id, chunk_index=0)
# The original doc ID should NOT appear in the result.
assert doc_id not in result
# The suffix should still be present.
assert f"__{DEFAULT_MAX_CHUNK_SIZE}__0" in result
def test_long_doc_id_hash_is_deterministic(self) -> None:
doc_id = "x" * MAX_DOCUMENT_ID_ENCODED_LENGTH
result1 = get_opensearch_doc_chunk_id(
SINGLE_TENANT_STATE, doc_id, chunk_index=5
)
result2 = get_opensearch_doc_chunk_id(
SINGLE_TENANT_STATE, doc_id, chunk_index=5
)
assert result1 == result2
def test_long_doc_id_different_inputs_produce_different_hashes(self) -> None:
doc_id_a = "a" * MAX_DOCUMENT_ID_ENCODED_LENGTH
doc_id_b = "b" * MAX_DOCUMENT_ID_ENCODED_LENGTH
result_a = get_opensearch_doc_chunk_id(
SINGLE_TENANT_STATE, doc_id_a, chunk_index=0
)
result_b = get_opensearch_doc_chunk_id(
SINGLE_TENANT_STATE, doc_id_b, chunk_index=0
)
assert result_a != result_b
def test_result_never_exceeds_max_length(self) -> None:
"""
Tests that the final result should always be under
MAX_DOCUMENT_ID_ENCODED_LENGTH bytes.
"""
doc_id = "z" * (MAX_DOCUMENT_ID_ENCODED_LENGTH * 2)
result = get_opensearch_doc_chunk_id(
SINGLE_TENANT_STATE, doc_id, chunk_index=999, max_chunk_size=99999
)
assert len(result.encode("utf-8")) < MAX_DOCUMENT_ID_ENCODED_LENGTH
def test_no_tenant_prefix_in_single_tenant(self) -> None:
result = get_opensearch_doc_chunk_id(
SINGLE_TENANT_STATE, "mydoc", chunk_index=0
)
assert not result.startswith(SINGLE_TENANT_STATE.tenant_id)
class TestGetOpensearchDocChunkIdMultiTenant:
def test_includes_tenant_prefix(self) -> None:
result = get_opensearch_doc_chunk_id(MULTI_TENANT_STATE, "mydoc", chunk_index=0)
assert result.startswith(f"{EXPECTED_SHORT_TENANT}__")
def test_format(self) -> None:
result = get_opensearch_doc_chunk_id(
MULTI_TENANT_STATE, "mydoc", chunk_index=2, max_chunk_size=256
)
assert result == f"{EXPECTED_SHORT_TENANT}__mydoc__256__2"
def test_long_doc_id_is_hashed_multitenant(self) -> None:
doc_id = "d" * MAX_DOCUMENT_ID_ENCODED_LENGTH
result = get_opensearch_doc_chunk_id(MULTI_TENANT_STATE, doc_id, chunk_index=0)
# Should still have tenant prefix.
assert result.startswith(f"{EXPECTED_SHORT_TENANT}__")
# The original doc ID should NOT appear in the result.
assert doc_id not in result
# The suffix should still be present.
assert f"__{DEFAULT_MAX_CHUNK_SIZE}__0" in result
def test_result_never_exceeds_max_length_multitenant(self) -> None:
doc_id = "q" * (MAX_DOCUMENT_ID_ENCODED_LENGTH * 2)
result = get_opensearch_doc_chunk_id(
MULTI_TENANT_STATE, doc_id, chunk_index=999, max_chunk_size=99999
)
assert len(result.encode("utf-8")) < MAX_DOCUMENT_ID_ENCODED_LENGTH
def test_different_tenants_produce_different_ids(self) -> None:
tenant_a = TenantState(
tenant_id="tenant_aaaaaaaa-0000-0000-0000-000000000000", multitenant=True
)
tenant_b = TenantState(
tenant_id="tenant_bbbbbbbb-0000-0000-0000-000000000000", multitenant=True
)
result_a = get_opensearch_doc_chunk_id(tenant_a, "same-doc", chunk_index=0)
result_b = get_opensearch_doc_chunk_id(tenant_b, "same-doc", chunk_index=0)
assert result_a != result_b
class TestGetOpensearchDocChunkIdEdgeCases:
def test_chunk_index_zero(self) -> None:
result = get_opensearch_doc_chunk_id(SINGLE_TENANT_STATE, "doc", chunk_index=0)
assert result.endswith("__0")
def test_large_chunk_index(self) -> None:
result = get_opensearch_doc_chunk_id(
SINGLE_TENANT_STATE, "doc", chunk_index=99999
)
assert result.endswith("__99999")
def test_doc_id_with_only_special_chars_raises(self) -> None:
"""
Tests that a doc ID that becomes empty after filtering should raise
ValueError.
"""
with pytest.raises(ValueError, match="empty after filtering"):
get_opensearch_doc_chunk_id(SINGLE_TENANT_STATE, "###???///", chunk_index=0)
def test_doc_id_at_boundary_length(self) -> None:
"""
Tests that a doc ID right at the boundary should not be hashed.
"""
suffix = f"__{DEFAULT_MAX_CHUNK_SIZE}__0"
suffix_len = len(suffix.encode("utf-8"))
# Max doc ID length that won't trigger hashing (must be <
# max_encoded_length).
max_doc_len = MAX_DOCUMENT_ID_ENCODED_LENGTH - suffix_len - 1
doc_id = "a" * max_doc_len
result = get_opensearch_doc_chunk_id(SINGLE_TENANT_STATE, doc_id, chunk_index=0)
assert doc_id in result
def test_doc_id_at_boundary_length_multitenant(self) -> None:
"""
Tests that a doc ID right at the boundary should not be hashed in
multitenant mode.
"""
suffix = f"__{DEFAULT_MAX_CHUNK_SIZE}__0"
suffix_len = len(suffix.encode("utf-8"))
prefix = f"{EXPECTED_SHORT_TENANT}__"
prefix_len = len(prefix.encode("utf-8"))
# Max doc ID length that won't trigger hashing (must be <
# max_encoded_length).
max_doc_len = MAX_DOCUMENT_ID_ENCODED_LENGTH - suffix_len - prefix_len - 1
doc_id = "a" * max_doc_len
result = get_opensearch_doc_chunk_id(MULTI_TENANT_STATE, doc_id, chunk_index=0)
assert doc_id in result
def test_doc_id_one_over_boundary_is_hashed(self) -> None:
"""
Tests that a doc ID one byte over the boundary should be hashed.
"""
suffix = f"__{DEFAULT_MAX_CHUNK_SIZE}__0"
suffix_len = len(suffix.encode("utf-8"))
# This length will trigger the >= check in filter_and_validate_document_id
doc_id = "a" * (MAX_DOCUMENT_ID_ENCODED_LENGTH - suffix_len)
result = get_opensearch_doc_chunk_id(SINGLE_TENANT_STATE, doc_id, chunk_index=0)
assert doc_id not in result

View File

@@ -127,7 +127,7 @@ function SidebarTab({
rightChildren={truncationSpacer}
/>
) : (
<div className="flex flex-row items-center gap-2 w-full">
<div className="flex flex-row items-center gap-2 flex-1">
{Icon && (
<div className="flex items-center justify-center p-0.5">
<Icon className="h-[1rem] w-[1rem] text-text-03" />

View File

@@ -467,6 +467,10 @@
/* Frost Overlay (for FrostedDiv component) - lighter in light mode */
--frost-overlay: var(--alpha-grey-00-10);
/* Scrollbar */
--scrollbar-track: transparent;
--scrollbar-thumb: var(--alpha-grey-100-20);
}
/* Dark Colors */
@@ -671,4 +675,8 @@
/* Frost Overlay (for FrostedDiv component) - darker in dark mode */
--frost-overlay: var(--alpha-grey-100-10);
/* Scrollbar */
--scrollbar-track: transparent;
--scrollbar-thumb: var(--alpha-grey-00-20);
}

View File

@@ -127,17 +127,8 @@
}
@layer utilities {
/* Hide scrollbar for Chrome, Safari and Opera */
.no-scrollbar::-webkit-scrollbar {
display: none;
}
/* Hide scrollbar for IE, Edge and Firefox */
.no-scrollbar {
-ms-overflow-style: none;
/* IE and Edge */
scrollbar-width: none;
/* Firefox */
}
/* SHADOWS */
@@ -362,27 +353,9 @@
/* SCROLL BAR */
.default-scrollbar::-webkit-scrollbar {
width: 6px;
}
.default-scrollbar::-webkit-scrollbar-track {
background: #f1f1f1;
}
.default-scrollbar::-webkit-scrollbar-thumb {
background: #888;
border-radius: 4px;
}
.default-scrollbar::-webkit-scrollbar-thumb:hover {
background: #555;
}
.default-scrollbar {
scrollbar-width: thin;
scrollbar-color: #888 transparent;
overflow: overlay;
overflow-y: scroll;
overflow-x: hidden;
}
@@ -392,78 +365,21 @@
height: 100%;
}
.inputscroll::-webkit-scrollbar-track {
background: #e5e7eb;
.inputscroll {
scrollbar-width: none;
}
::-webkit-scrollbar {
width: 0px;
/* Vertical scrollbar width */
height: 8px;
/* Horizontal scrollbar height */
}
::-webkit-scrollbar-track {
background: transparent;
/* background: theme("colors.scrollbar.track"); */
/* Track background color */
}
/* Style the scrollbar handle */
::-webkit-scrollbar-thumb {
background: transparent;
/* background: theme("colors.scrollbar.thumb"); */
/* Handle color */
border-radius: 10px;
}
/* Handle on hover */
::-webkit-scrollbar-thumb:hover {
background: transparent;
/* background: theme("colors.scrollbar.thumb-hover"); */
/* Handle color on hover */
}
.dark-scrollbar::-webkit-scrollbar-thumb {
background: transparent;
/* background: theme("colors.scrollbar.dark.thumb"); */
/* Handle color */
border-radius: 10px;
}
.dark-scrollbar::-webkit-scrollbar-thumb:hover {
background: transparent;
/* background: theme("colors.scrollbar.dark.thumb-hover"); */
/* Handle color on hover */
/* Ensure native scrollbars are visible */
@layer base {
* {
scrollbar-width: auto;
}
}
/* TEXTAREA */
textarea::-webkit-scrollbar {
width: 8px;
}
textarea::-webkit-scrollbar-track {
background: var(--scrollbar-track);
border-radius: 4px;
}
textarea::-webkit-scrollbar-thumb {
background: var(--scrollbar-thumb);
border-radius: 4px;
}
textarea::-webkit-scrollbar-thumb:hover {
background: var(--scrollbar-thumb-hover);
}
textarea {
resize: vertical;
}
/* For Firefox */
textarea {
scrollbar-width: thin;
scrollbar-color: var(--scrollbar-thumb) var(--scrollbar-track);
}