mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-04-14 03:12:48 +00:00
Compare commits
11 Commits
dane/csv3
...
refactor/o
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
fbebeb8173 | ||
|
|
3a448fc7d2 | ||
|
|
65f772bdd5 | ||
|
|
2e3b869fa5 | ||
|
|
fe0088b398 | ||
|
|
5820fb063c | ||
|
|
dd19348a2f | ||
|
|
9eeba7a44d | ||
|
|
ce3df44533 | ||
|
|
4f6e66c913 | ||
|
|
b24235a785 |
2
.github/workflows/pr-playwright-tests.yml
vendored
2
.github/workflows/pr-playwright-tests.yml
vendored
@@ -710,7 +710,7 @@ jobs:
|
||||
pull-requests: write
|
||||
steps:
|
||||
- name: Download visual diff summaries
|
||||
uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c
|
||||
uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3
|
||||
with:
|
||||
pattern: screenshot-diff-summary-*
|
||||
path: summaries/
|
||||
|
||||
2
.github/workflows/pr-quality-checks.yml
vendored
2
.github/workflows/pr-quality-checks.yml
vendored
@@ -38,7 +38,7 @@ jobs:
|
||||
- name: Install node dependencies
|
||||
working-directory: ./web
|
||||
run: npm ci
|
||||
- uses: j178/prek-action@cbc2f23eb5539cf20d82d1aabd0d0ecbcc56f4e3
|
||||
- uses: j178/prek-action@0bb87d7f00b0c99306c8bcb8b8beba1eb581c037 # ratchet:j178/prek-action@v1
|
||||
with:
|
||||
prek-version: '0.3.4'
|
||||
extra-args: ${{ github.event_name == 'pull_request' && format('--from-ref {0} --to-ref {1}', github.event.pull_request.base.sha, github.event.pull_request.head.sha) || github.event_name == 'merge_group' && format('--from-ref {0} --to-ref {1}', github.event.merge_group.base_sha, github.event.merge_group.head_sha) || github.ref_name == 'main' && '--all-files' || '' }}
|
||||
|
||||
@@ -49,12 +49,12 @@ Onyx uses Celery for asynchronous task processing with multiple specialized work
|
||||
|
||||
4. **Light Worker** (`light`)
|
||||
- Handles lightweight, fast operations
|
||||
- Tasks: vespa metadata sync, connector deletion, doc permissions upsert, checkpoint cleanup, index attempt cleanup
|
||||
- Tasks: vespa operations, document permissions sync, external group sync
|
||||
- Higher concurrency for quick tasks
|
||||
|
||||
5. **Heavy Worker** (`heavy`)
|
||||
- Handles resource-intensive operations
|
||||
- Tasks: connector pruning, document permissions sync, external group sync, CSV generation
|
||||
- Primary task: document pruning operations
|
||||
- Runs with 4 threads concurrency
|
||||
|
||||
6. **KG Processing Worker** (`kg_processing`)
|
||||
|
||||
@@ -102,7 +102,7 @@ def revoke_tasks_blocking_deletion(
|
||||
f"Revoked permissions sync task {permissions_sync_payload.celery_task_id}."
|
||||
)
|
||||
except Exception:
|
||||
task_logger.exception("Exception while revoking permissions sync task")
|
||||
task_logger.exception("Exception while revoking pruning task")
|
||||
|
||||
try:
|
||||
prune_payload = redis_connector.prune.payload
|
||||
@@ -110,7 +110,7 @@ def revoke_tasks_blocking_deletion(
|
||||
app.control.revoke(prune_payload.celery_task_id)
|
||||
task_logger.info(f"Revoked pruning task {prune_payload.celery_task_id}.")
|
||||
except Exception:
|
||||
task_logger.exception("Exception while revoking pruning task")
|
||||
task_logger.exception("Exception while revoking permissions sync task")
|
||||
|
||||
try:
|
||||
external_group_sync_payload = redis_connector.external_group_sync.payload
|
||||
@@ -508,11 +508,7 @@ def monitor_connector_deletion_taskset(
|
||||
db_session=db_session,
|
||||
connector_id=connector_id_to_delete,
|
||||
)
|
||||
if not connector:
|
||||
task_logger.info(
|
||||
"Connector deletion - Connector already deleted, skipping connector cleanup"
|
||||
)
|
||||
elif not len(connector.credentials):
|
||||
if not connector or not len(connector.credentials):
|
||||
task_logger.info(
|
||||
"Connector deletion - Found no credentials left for connector, deleting connector"
|
||||
)
|
||||
|
||||
@@ -171,10 +171,7 @@ class ClickupConnector(LoadConnector, PollConnector):
|
||||
document.metadata[extra_field] = task[extra_field]
|
||||
|
||||
if self.retrieve_task_comments:
|
||||
document.sections = [
|
||||
*document.sections,
|
||||
*self._get_task_comments(task["id"]),
|
||||
]
|
||||
document.sections.extend(self._get_task_comments(task["id"]))
|
||||
|
||||
doc_batch.append(document)
|
||||
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
import csv
|
||||
import io
|
||||
from typing import IO
|
||||
|
||||
from onyx.connectors.models import TabularSection
|
||||
from onyx.file_processing.extract_file_text import file_io_to_text
|
||||
from onyx.file_processing.extract_file_text import xlsx_sheet_extraction
|
||||
from onyx.file_processing.file_types import OnyxFileExtensions
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def is_tabular_file(file_name: str) -> bool:
|
||||
lowered = file_name.lower()
|
||||
return any(lowered.endswith(ext) for ext in OnyxFileExtensions.TABULAR_EXTENSIONS)
|
||||
|
||||
|
||||
def _tsv_to_csv(tsv_text: str) -> str:
|
||||
"""Re-serialize tab-separated text as CSV so downstream parsers that
|
||||
assume the default Excel dialect read the columns correctly."""
|
||||
out = io.StringIO()
|
||||
csv.writer(out, lineterminator="\n").writerows(
|
||||
csv.reader(io.StringIO(tsv_text), dialect="excel-tab")
|
||||
)
|
||||
return out.getvalue().rstrip("\n")
|
||||
|
||||
|
||||
def tabular_file_to_sections(
|
||||
file: IO[bytes],
|
||||
file_name: str,
|
||||
link: str = "",
|
||||
) -> list[TabularSection]:
|
||||
"""Convert a tabular file into one or more TabularSections.
|
||||
|
||||
- .xlsx → one TabularSection per non-empty sheet.
|
||||
- .csv / .tsv → a single TabularSection containing the full decoded
|
||||
file.
|
||||
|
||||
Returns an empty list when the file yields no extractable content.
|
||||
"""
|
||||
lowered = file_name.lower()
|
||||
|
||||
if lowered.endswith(".xlsx"):
|
||||
return [
|
||||
TabularSection(link=f"sheet:{sheet_title}", text=csv_text)
|
||||
for csv_text, sheet_title in xlsx_sheet_extraction(
|
||||
file, file_name=file_name
|
||||
)
|
||||
]
|
||||
|
||||
if not lowered.endswith((".csv", ".tsv")):
|
||||
raise ValueError(f"{file_name!r} is not a tabular file")
|
||||
|
||||
try:
|
||||
text = file_io_to_text(file).strip()
|
||||
except Exception as e:
|
||||
logger.error(f"Failure decoding {file_name}: {e}")
|
||||
raise e
|
||||
|
||||
if not text:
|
||||
return []
|
||||
if lowered.endswith(".tsv"):
|
||||
text = _tsv_to_csv(text)
|
||||
return [TabularSection(link=link or file_name, text=text)]
|
||||
@@ -1,10 +1,8 @@
|
||||
import sys
|
||||
from collections.abc import Sequence
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import Field
|
||||
@@ -35,18 +33,9 @@ class ConnectorMissingCredentialError(PermissionError):
|
||||
)
|
||||
|
||||
|
||||
class SectionType(str, Enum):
|
||||
"""Discriminator for Section subclasses."""
|
||||
|
||||
TEXT = "text"
|
||||
IMAGE = "image"
|
||||
TABULAR = "tabular"
|
||||
|
||||
|
||||
class Section(BaseModel):
|
||||
"""Base section class with common attributes"""
|
||||
|
||||
type: SectionType
|
||||
link: str | None = None
|
||||
text: str | None = None
|
||||
image_file_id: str | None = None
|
||||
@@ -55,7 +44,6 @@ class Section(BaseModel):
|
||||
class TextSection(Section):
|
||||
"""Section containing text content"""
|
||||
|
||||
type: Literal[SectionType.TEXT] = SectionType.TEXT
|
||||
text: str
|
||||
|
||||
def __sizeof__(self) -> int:
|
||||
@@ -65,25 +53,12 @@ class TextSection(Section):
|
||||
class ImageSection(Section):
|
||||
"""Section containing an image reference"""
|
||||
|
||||
type: Literal[SectionType.IMAGE] = SectionType.IMAGE
|
||||
image_file_id: str
|
||||
|
||||
def __sizeof__(self) -> int:
|
||||
return sys.getsizeof(self.image_file_id) + sys.getsizeof(self.link)
|
||||
|
||||
|
||||
class TabularSection(Section):
|
||||
"""Section containing tabular data (csv/tsv content, or one sheet of
|
||||
an xlsx workbook rendered as CSV)."""
|
||||
|
||||
type: Literal[SectionType.TABULAR] = SectionType.TABULAR
|
||||
text: str # CSV representation in a string
|
||||
link: str
|
||||
|
||||
def __sizeof__(self) -> int:
|
||||
return sys.getsizeof(self.text) + sys.getsizeof(self.link)
|
||||
|
||||
|
||||
class BasicExpertInfo(BaseModel):
|
||||
"""Basic Information for the owner of a document, any of the fields can be left as None
|
||||
Display fallback goes as follows:
|
||||
@@ -186,7 +161,7 @@ class DocumentBase(BaseModel):
|
||||
"""Used for Onyx ingestion api, the ID is inferred before use if not provided"""
|
||||
|
||||
id: str | None = None
|
||||
sections: Sequence[TextSection | ImageSection | TabularSection]
|
||||
sections: list[TextSection | ImageSection]
|
||||
source: DocumentSource | None = None
|
||||
semantic_identifier: str # displayed in the UI as the main identifier for the doc
|
||||
# TODO(andrei): Ideally we could improve this to where each value is just a
|
||||
@@ -396,9 +371,12 @@ class IndexingDocument(Document):
|
||||
)
|
||||
else:
|
||||
section_len = sum(
|
||||
len(section.text) if section.text is not None else 0
|
||||
(
|
||||
len(section.text)
|
||||
if isinstance(section, TextSection) and section.text is not None
|
||||
else 0
|
||||
)
|
||||
for section in self.sections
|
||||
if isinstance(section, (TextSection, TabularSection))
|
||||
)
|
||||
|
||||
return title_len + section_len
|
||||
|
||||
@@ -463,13 +463,29 @@ def _remove_empty_runs(
|
||||
return result
|
||||
|
||||
|
||||
def xlsx_sheet_extraction(file: IO[Any], file_name: str = "") -> list[tuple[str, str]]:
|
||||
"""
|
||||
Converts each sheet in the excel file to a csv condensed string.
|
||||
Returns a string and the worksheet title for each worksheet
|
||||
def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
|
||||
# TODO: switch back to this approach in a few months when markitdown
|
||||
# fixes their handling of excel files
|
||||
|
||||
Returns a list of (csv_text, sheet)
|
||||
"""
|
||||
# md = get_markitdown_converter()
|
||||
# stream_info = StreamInfo(
|
||||
# mimetype=SPREADSHEET_MIME_TYPE, filename=file_name or None, extension=".xlsx"
|
||||
# )
|
||||
# try:
|
||||
# workbook = md.convert(to_bytesio(file), stream_info=stream_info)
|
||||
# except (
|
||||
# BadZipFile,
|
||||
# ValueError,
|
||||
# FileConversionException,
|
||||
# UnsupportedFormatException,
|
||||
# ) as e:
|
||||
# error_str = f"Failed to extract text from {file_name or 'xlsx file'}: {e}"
|
||||
# if file_name.startswith("~"):
|
||||
# logger.debug(error_str + " (this is expected for files with ~)")
|
||||
# else:
|
||||
# logger.warning(error_str)
|
||||
# return ""
|
||||
# return workbook.markdown
|
||||
try:
|
||||
workbook = openpyxl.load_workbook(file, read_only=True)
|
||||
except BadZipFile as e:
|
||||
@@ -478,30 +494,23 @@ def xlsx_sheet_extraction(file: IO[Any], file_name: str = "") -> list[tuple[str,
|
||||
logger.debug(error_str + " (this is expected for files with ~)")
|
||||
else:
|
||||
logger.warning(error_str)
|
||||
return []
|
||||
return ""
|
||||
except Exception as e:
|
||||
if any(s in str(e) for s in KNOWN_OPENPYXL_BUGS):
|
||||
logger.error(
|
||||
f"Failed to extract text from {file_name or 'xlsx file'}. This happens due to a bug in openpyxl. {e}"
|
||||
)
|
||||
return []
|
||||
return ""
|
||||
raise
|
||||
|
||||
sheets: list[tuple[str, str]] = []
|
||||
text_content = []
|
||||
for sheet in workbook.worksheets:
|
||||
sheet_matrix = _clean_worksheet_matrix(_worksheet_to_matrix(sheet))
|
||||
buf = io.StringIO()
|
||||
writer = csv.writer(buf, lineterminator="\n")
|
||||
writer.writerows(sheet_matrix)
|
||||
csv_text = buf.getvalue().rstrip("\n")
|
||||
if csv_text.strip():
|
||||
sheets.append((csv_text, sheet.title))
|
||||
return sheets
|
||||
|
||||
|
||||
def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
|
||||
sheets = xlsx_sheet_extraction(file, file_name)
|
||||
return TEXT_SECTION_SEPARATOR.join(csv_text for csv_text, _title in sheets)
|
||||
text_content.append(buf.getvalue().rstrip("\n"))
|
||||
return TEXT_SECTION_SEPARATOR.join(text_content)
|
||||
|
||||
|
||||
def eml_to_text(file: IO[Any]) -> str:
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
from typing import cast
|
||||
|
||||
from chonkie import SentenceChunker
|
||||
|
||||
from onyx.configs.app_configs import AVERAGE_SUMMARY_EMBEDDINGS
|
||||
@@ -14,14 +16,16 @@ from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
get_metadata_keys_to_ignore,
|
||||
)
|
||||
from onyx.connectors.models import IndexingDocument
|
||||
from onyx.indexing.chunking import DocumentChunker
|
||||
from onyx.indexing.chunking import extract_blurb
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.indexing.models import DocAwareChunk
|
||||
from onyx.llm.utils import MAX_CONTEXT_TOKENS
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.text_processing import clean_text
|
||||
from onyx.utils.text_processing import shared_precompare_cleanup
|
||||
from shared_configs.configs import DOC_EMBEDDING_CONTEXT_SIZE
|
||||
from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
|
||||
|
||||
# Not supporting overlaps, we need a clean combination of chunks and it is unclear if overlaps
|
||||
# actually help quality at all
|
||||
@@ -150,6 +154,9 @@ class Chunker:
|
||||
self.tokenizer = tokenizer
|
||||
self.callback = callback
|
||||
|
||||
self.max_context = 0
|
||||
self.prompt_tokens = 0
|
||||
|
||||
# Create a token counter function that returns the count instead of the tokens
|
||||
def token_counter(text: str) -> int:
|
||||
return len(tokenizer.encode(text))
|
||||
@@ -179,12 +186,234 @@ class Chunker:
|
||||
else None
|
||||
)
|
||||
|
||||
self._document_chunker = DocumentChunker(
|
||||
tokenizer=tokenizer,
|
||||
blurb_splitter=self.blurb_splitter,
|
||||
chunk_splitter=self.chunk_splitter,
|
||||
mini_chunk_splitter=self.mini_chunk_splitter,
|
||||
def _split_oversized_chunk(self, text: str, content_token_limit: int) -> list[str]:
|
||||
"""
|
||||
Splits the text into smaller chunks based on token count to ensure
|
||||
no chunk exceeds the content_token_limit.
|
||||
"""
|
||||
tokens = self.tokenizer.tokenize(text)
|
||||
chunks = []
|
||||
start = 0
|
||||
total_tokens = len(tokens)
|
||||
while start < total_tokens:
|
||||
end = min(start + content_token_limit, total_tokens)
|
||||
token_chunk = tokens[start:end]
|
||||
chunk_text = " ".join(token_chunk)
|
||||
chunks.append(chunk_text)
|
||||
start = end
|
||||
return chunks
|
||||
|
||||
def _extract_blurb(self, text: str) -> str:
|
||||
"""
|
||||
Extract a short blurb from the text (first chunk of size `blurb_size`).
|
||||
"""
|
||||
# chunker is in `text` mode
|
||||
texts = cast(list[str], self.blurb_splitter.chunk(text))
|
||||
if not texts:
|
||||
return ""
|
||||
return texts[0]
|
||||
|
||||
def _get_mini_chunk_texts(self, chunk_text: str) -> list[str] | None:
|
||||
"""
|
||||
For "multipass" mode: additional sub-chunks (mini-chunks) for use in certain embeddings.
|
||||
"""
|
||||
if self.mini_chunk_splitter and chunk_text.strip():
|
||||
# chunker is in `text` mode
|
||||
return cast(list[str], self.mini_chunk_splitter.chunk(chunk_text))
|
||||
return None
|
||||
|
||||
# ADDED: extra param image_url to store in the chunk
|
||||
def _create_chunk(
|
||||
self,
|
||||
document: IndexingDocument,
|
||||
chunks_list: list[DocAwareChunk],
|
||||
text: str,
|
||||
links: dict[int, str],
|
||||
is_continuation: bool = False,
|
||||
title_prefix: str = "",
|
||||
metadata_suffix_semantic: str = "",
|
||||
metadata_suffix_keyword: str = "",
|
||||
image_file_id: str | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Helper to create a new DocAwareChunk, append it to chunks_list.
|
||||
"""
|
||||
new_chunk = DocAwareChunk(
|
||||
source_document=document,
|
||||
chunk_id=len(chunks_list),
|
||||
blurb=self._extract_blurb(text),
|
||||
content=text,
|
||||
source_links=links or {0: ""},
|
||||
image_file_id=image_file_id,
|
||||
section_continuation=is_continuation,
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
metadata_suffix_keyword=metadata_suffix_keyword,
|
||||
mini_chunk_texts=self._get_mini_chunk_texts(text),
|
||||
large_chunk_id=None,
|
||||
doc_summary="",
|
||||
chunk_context="",
|
||||
contextual_rag_reserved_tokens=0, # set per-document in _handle_single_document
|
||||
)
|
||||
chunks_list.append(new_chunk)
|
||||
|
||||
def _chunk_document_with_sections(
|
||||
self,
|
||||
document: IndexingDocument,
|
||||
sections: list[Section],
|
||||
title_prefix: str,
|
||||
metadata_suffix_semantic: str,
|
||||
metadata_suffix_keyword: str,
|
||||
content_token_limit: int,
|
||||
) -> list[DocAwareChunk]:
|
||||
"""
|
||||
Loops through sections of the document, converting them into one or more chunks.
|
||||
Works with processed sections that are base Section objects.
|
||||
"""
|
||||
chunks: list[DocAwareChunk] = []
|
||||
link_offsets: dict[int, str] = {}
|
||||
chunk_text = ""
|
||||
|
||||
for section_idx, section in enumerate(sections):
|
||||
# Get section text and other attributes
|
||||
section_text = clean_text(str(section.text or ""))
|
||||
section_link_text = section.link or ""
|
||||
image_url = section.image_file_id
|
||||
|
||||
# If there is no useful content, skip
|
||||
if not section_text and (not document.title or section_idx > 0):
|
||||
logger.warning(
|
||||
f"Skipping empty or irrelevant section in doc {document.semantic_identifier}, link={section_link_text}"
|
||||
)
|
||||
continue
|
||||
|
||||
# CASE 1: If this section has an image, force a separate chunk
|
||||
if image_url:
|
||||
# First, if we have any partially built text chunk, finalize it
|
||||
if chunk_text.strip():
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
chunk_text,
|
||||
link_offsets,
|
||||
is_continuation=False,
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
metadata_suffix_keyword=metadata_suffix_keyword,
|
||||
)
|
||||
chunk_text = ""
|
||||
link_offsets = {}
|
||||
|
||||
# Create a chunk specifically for this image section
|
||||
# (Using the text summary that was generated during processing)
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
section_text,
|
||||
links={0: section_link_text} if section_link_text else {},
|
||||
image_file_id=image_url,
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
metadata_suffix_keyword=metadata_suffix_keyword,
|
||||
)
|
||||
# Continue to next section
|
||||
continue
|
||||
|
||||
# CASE 2: Normal text section
|
||||
section_token_count = len(self.tokenizer.encode(section_text))
|
||||
|
||||
# If the section is large on its own, split it separately
|
||||
if section_token_count > content_token_limit:
|
||||
if chunk_text.strip():
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
chunk_text,
|
||||
link_offsets,
|
||||
False,
|
||||
title_prefix,
|
||||
metadata_suffix_semantic,
|
||||
metadata_suffix_keyword,
|
||||
)
|
||||
chunk_text = ""
|
||||
link_offsets = {}
|
||||
|
||||
# chunker is in `text` mode
|
||||
split_texts = cast(list[str], self.chunk_splitter.chunk(section_text))
|
||||
for i, split_text in enumerate(split_texts):
|
||||
# If even the split_text is bigger than strict limit, further split
|
||||
if (
|
||||
STRICT_CHUNK_TOKEN_LIMIT
|
||||
and len(self.tokenizer.encode(split_text)) > content_token_limit
|
||||
):
|
||||
smaller_chunks = self._split_oversized_chunk(
|
||||
split_text, content_token_limit
|
||||
)
|
||||
for j, small_chunk in enumerate(smaller_chunks):
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
small_chunk,
|
||||
{0: section_link_text},
|
||||
is_continuation=(j != 0),
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
metadata_suffix_keyword=metadata_suffix_keyword,
|
||||
)
|
||||
else:
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
split_text,
|
||||
{0: section_link_text},
|
||||
is_continuation=(i != 0),
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
metadata_suffix_keyword=metadata_suffix_keyword,
|
||||
)
|
||||
continue
|
||||
|
||||
# If we can still fit this section into the current chunk, do so
|
||||
current_token_count = len(self.tokenizer.encode(chunk_text))
|
||||
current_offset = len(shared_precompare_cleanup(chunk_text))
|
||||
next_section_tokens = (
|
||||
len(self.tokenizer.encode(SECTION_SEPARATOR)) + section_token_count
|
||||
)
|
||||
|
||||
if next_section_tokens + current_token_count <= content_token_limit:
|
||||
if chunk_text:
|
||||
chunk_text += SECTION_SEPARATOR
|
||||
chunk_text += section_text
|
||||
link_offsets[current_offset] = section_link_text
|
||||
else:
|
||||
# finalize the existing chunk
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
chunk_text,
|
||||
link_offsets,
|
||||
False,
|
||||
title_prefix,
|
||||
metadata_suffix_semantic,
|
||||
metadata_suffix_keyword,
|
||||
)
|
||||
# start a new chunk
|
||||
link_offsets = {0: section_link_text}
|
||||
chunk_text = section_text
|
||||
|
||||
# finalize any leftover text chunk
|
||||
if chunk_text.strip() or not chunks:
|
||||
self._create_chunk(
|
||||
document,
|
||||
chunks,
|
||||
chunk_text,
|
||||
link_offsets or {0: ""}, # safe default
|
||||
False,
|
||||
title_prefix,
|
||||
metadata_suffix_semantic,
|
||||
metadata_suffix_keyword,
|
||||
)
|
||||
return chunks
|
||||
|
||||
def _handle_single_document(
|
||||
self, document: IndexingDocument
|
||||
@@ -194,10 +423,7 @@ class Chunker:
|
||||
logger.debug(f"Chunking {document.semantic_identifier}")
|
||||
|
||||
# Title prep
|
||||
title = extract_blurb(
|
||||
document.get_title_for_document_index() or "",
|
||||
self.blurb_splitter,
|
||||
)
|
||||
title = self._extract_blurb(document.get_title_for_document_index() or "")
|
||||
title_prefix = title + RETURN_SEPARATOR if title else ""
|
||||
title_tokens = len(self.tokenizer.encode(title_prefix))
|
||||
|
||||
@@ -265,7 +491,7 @@ class Chunker:
|
||||
# Use processed_sections if available (IndexingDocument), otherwise use original sections
|
||||
sections_to_chunk = document.processed_sections
|
||||
|
||||
normal_chunks = self._document_chunker.chunk(
|
||||
normal_chunks = self._chunk_document_with_sections(
|
||||
document,
|
||||
sections_to_chunk,
|
||||
title_prefix,
|
||||
|
||||
@@ -1,7 +0,0 @@
|
||||
from onyx.indexing.chunking.document_chunker import DocumentChunker
|
||||
from onyx.indexing.chunking.section_chunker import extract_blurb
|
||||
|
||||
__all__ = [
|
||||
"DocumentChunker",
|
||||
"extract_blurb",
|
||||
]
|
||||
@@ -1,111 +0,0 @@
|
||||
from chonkie import SentenceChunker
|
||||
|
||||
from onyx.connectors.models import IndexingDocument
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import SectionType
|
||||
from onyx.indexing.chunking.image_section_chunker import ImageChunker
|
||||
from onyx.indexing.chunking.section_chunker import AccumulatorState
|
||||
from onyx.indexing.chunking.section_chunker import ChunkPayload
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunker
|
||||
from onyx.indexing.chunking.text_section_chunker import TextChunker
|
||||
from onyx.indexing.models import DocAwareChunk
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.text_processing import clean_text
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class DocumentChunker:
|
||||
"""Converts a document's processed sections into DocAwareChunks.
|
||||
|
||||
Drop-in replacement for `Chunker._chunk_document_with_sections`.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer: BaseTokenizer,
|
||||
blurb_splitter: SentenceChunker,
|
||||
chunk_splitter: SentenceChunker,
|
||||
mini_chunk_splitter: SentenceChunker | None = None,
|
||||
) -> None:
|
||||
self.blurb_splitter = blurb_splitter
|
||||
self.mini_chunk_splitter = mini_chunk_splitter
|
||||
|
||||
self._dispatch: dict[SectionType, SectionChunker] = {
|
||||
SectionType.TEXT: TextChunker(
|
||||
tokenizer=tokenizer,
|
||||
chunk_splitter=chunk_splitter,
|
||||
),
|
||||
SectionType.IMAGE: ImageChunker(),
|
||||
}
|
||||
|
||||
def chunk(
|
||||
self,
|
||||
document: IndexingDocument,
|
||||
sections: list[Section],
|
||||
title_prefix: str,
|
||||
metadata_suffix_semantic: str,
|
||||
metadata_suffix_keyword: str,
|
||||
content_token_limit: int,
|
||||
) -> list[DocAwareChunk]:
|
||||
payloads = self._collect_section_payloads(
|
||||
document=document,
|
||||
sections=sections,
|
||||
content_token_limit=content_token_limit,
|
||||
)
|
||||
|
||||
if not payloads:
|
||||
payloads.append(ChunkPayload(text="", links={0: ""}))
|
||||
|
||||
return [
|
||||
payload.to_doc_aware_chunk(
|
||||
document=document,
|
||||
chunk_id=idx,
|
||||
blurb_splitter=self.blurb_splitter,
|
||||
mini_chunk_splitter=self.mini_chunk_splitter,
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
metadata_suffix_keyword=metadata_suffix_keyword,
|
||||
)
|
||||
for idx, payload in enumerate(payloads)
|
||||
]
|
||||
|
||||
def _collect_section_payloads(
|
||||
self,
|
||||
document: IndexingDocument,
|
||||
sections: list[Section],
|
||||
content_token_limit: int,
|
||||
) -> list[ChunkPayload]:
|
||||
accumulator = AccumulatorState()
|
||||
payloads: list[ChunkPayload] = []
|
||||
|
||||
for section_idx, section in enumerate(sections):
|
||||
section_text = clean_text(str(section.text or ""))
|
||||
|
||||
if not section_text and (not document.title or section_idx > 0):
|
||||
logger.warning(
|
||||
f"Skipping empty or irrelevant section in doc "
|
||||
f"{document.semantic_identifier}, link={section.link}"
|
||||
)
|
||||
continue
|
||||
|
||||
chunker = self._select_chunker(section)
|
||||
result = chunker.chunk_section(
|
||||
section=section,
|
||||
accumulator=accumulator,
|
||||
content_token_limit=content_token_limit,
|
||||
)
|
||||
payloads.extend(result.payloads)
|
||||
accumulator = result.accumulator
|
||||
|
||||
# Final flush — any leftover buffered text becomes one last payload.
|
||||
payloads.extend(accumulator.flush_to_list())
|
||||
|
||||
return payloads
|
||||
|
||||
def _select_chunker(self, section: Section) -> SectionChunker:
|
||||
try:
|
||||
return self._dispatch[section.type]
|
||||
except KeyError:
|
||||
raise ValueError(f"No SectionChunker registered for type={section.type}")
|
||||
@@ -1,35 +0,0 @@
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.indexing.chunking.section_chunker import AccumulatorState
|
||||
from onyx.indexing.chunking.section_chunker import ChunkPayload
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunker
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunkerOutput
|
||||
from onyx.utils.text_processing import clean_text
|
||||
|
||||
|
||||
class ImageChunker(SectionChunker):
|
||||
def chunk_section(
|
||||
self,
|
||||
section: Section,
|
||||
accumulator: AccumulatorState,
|
||||
content_token_limit: int, # noqa: ARG002
|
||||
) -> SectionChunkerOutput:
|
||||
assert section.image_file_id is not None
|
||||
|
||||
section_text = clean_text(str(section.text or ""))
|
||||
section_link = section.link or ""
|
||||
|
||||
# Flush any partially built text chunks
|
||||
payloads = accumulator.flush_to_list()
|
||||
payloads.append(
|
||||
ChunkPayload(
|
||||
text=section_text,
|
||||
links={0: section_link} if section_link else {},
|
||||
image_file_id=section.image_file_id,
|
||||
is_continuation=False,
|
||||
)
|
||||
)
|
||||
|
||||
return SectionChunkerOutput(
|
||||
payloads=payloads,
|
||||
accumulator=AccumulatorState(),
|
||||
)
|
||||
@@ -1,100 +0,0 @@
|
||||
from abc import ABC
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Sequence
|
||||
from typing import cast
|
||||
|
||||
from chonkie import SentenceChunker
|
||||
from pydantic import BaseModel
|
||||
from pydantic import Field
|
||||
|
||||
from onyx.connectors.models import IndexingDocument
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.indexing.models import DocAwareChunk
|
||||
|
||||
|
||||
def extract_blurb(text: str, blurb_splitter: SentenceChunker) -> str:
|
||||
texts = cast(list[str], blurb_splitter.chunk(text))
|
||||
if not texts:
|
||||
return ""
|
||||
return texts[0]
|
||||
|
||||
|
||||
def get_mini_chunk_texts(
|
||||
chunk_text: str,
|
||||
mini_chunk_splitter: SentenceChunker | None,
|
||||
) -> list[str] | None:
|
||||
if mini_chunk_splitter and chunk_text.strip():
|
||||
return list(cast(Sequence[str], mini_chunk_splitter.chunk(chunk_text)))
|
||||
return None
|
||||
|
||||
|
||||
class ChunkPayload(BaseModel):
|
||||
"""Section-local chunk content without document-scoped fields.
|
||||
|
||||
The orchestrator upgrades these to DocAwareChunks via
|
||||
`to_doc_aware_chunk` after assigning chunk_ids and attaching
|
||||
title/metadata.
|
||||
"""
|
||||
|
||||
text: str
|
||||
links: dict[int, str]
|
||||
is_continuation: bool = False
|
||||
image_file_id: str | None = None
|
||||
|
||||
def to_doc_aware_chunk(
|
||||
self,
|
||||
document: IndexingDocument,
|
||||
chunk_id: int,
|
||||
blurb_splitter: SentenceChunker,
|
||||
title_prefix: str = "",
|
||||
metadata_suffix_semantic: str = "",
|
||||
metadata_suffix_keyword: str = "",
|
||||
mini_chunk_splitter: SentenceChunker | None = None,
|
||||
) -> DocAwareChunk:
|
||||
return DocAwareChunk(
|
||||
source_document=document,
|
||||
chunk_id=chunk_id,
|
||||
blurb=extract_blurb(self.text, blurb_splitter),
|
||||
content=self.text,
|
||||
source_links=self.links or {0: ""},
|
||||
image_file_id=self.image_file_id,
|
||||
section_continuation=self.is_continuation,
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
metadata_suffix_keyword=metadata_suffix_keyword,
|
||||
mini_chunk_texts=get_mini_chunk_texts(self.text, mini_chunk_splitter),
|
||||
large_chunk_id=None,
|
||||
doc_summary="",
|
||||
chunk_context="",
|
||||
contextual_rag_reserved_tokens=0,
|
||||
)
|
||||
|
||||
|
||||
class AccumulatorState(BaseModel):
|
||||
"""Cross-section text buffer threaded through SectionChunkers."""
|
||||
|
||||
text: str = ""
|
||||
link_offsets: dict[int, str] = Field(default_factory=dict)
|
||||
|
||||
def is_empty(self) -> bool:
|
||||
return not self.text.strip()
|
||||
|
||||
def flush_to_list(self) -> list[ChunkPayload]:
|
||||
if self.is_empty():
|
||||
return []
|
||||
return [ChunkPayload(text=self.text, links=self.link_offsets)]
|
||||
|
||||
|
||||
class SectionChunkerOutput(BaseModel):
|
||||
payloads: list[ChunkPayload]
|
||||
accumulator: AccumulatorState
|
||||
|
||||
|
||||
class SectionChunker(ABC):
|
||||
@abstractmethod
|
||||
def chunk_section(
|
||||
self,
|
||||
section: Section,
|
||||
accumulator: AccumulatorState,
|
||||
content_token_limit: int,
|
||||
) -> SectionChunkerOutput: ...
|
||||
@@ -1,129 +0,0 @@
|
||||
from typing import cast
|
||||
|
||||
from chonkie import SentenceChunker
|
||||
|
||||
from onyx.configs.constants import SECTION_SEPARATOR
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.indexing.chunking.section_chunker import AccumulatorState
|
||||
from onyx.indexing.chunking.section_chunker import ChunkPayload
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunker
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunkerOutput
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
from onyx.natural_language_processing.utils import count_tokens
|
||||
from onyx.utils.text_processing import clean_text
|
||||
from onyx.utils.text_processing import shared_precompare_cleanup
|
||||
from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
|
||||
|
||||
|
||||
class TextChunker(SectionChunker):
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer: BaseTokenizer,
|
||||
chunk_splitter: SentenceChunker,
|
||||
) -> None:
|
||||
self.tokenizer = tokenizer
|
||||
self.chunk_splitter = chunk_splitter
|
||||
|
||||
self.section_separator_token_count = count_tokens(
|
||||
SECTION_SEPARATOR,
|
||||
self.tokenizer,
|
||||
)
|
||||
|
||||
def chunk_section(
|
||||
self,
|
||||
section: Section,
|
||||
accumulator: AccumulatorState,
|
||||
content_token_limit: int,
|
||||
) -> SectionChunkerOutput:
|
||||
section_text = clean_text(str(section.text or ""))
|
||||
section_link = section.link or ""
|
||||
section_token_count = len(self.tokenizer.encode(section_text))
|
||||
|
||||
# Oversized — flush buffer and split the section
|
||||
if section_token_count > content_token_limit:
|
||||
return self._handle_oversized_section(
|
||||
section_text=section_text,
|
||||
section_link=section_link,
|
||||
accumulator=accumulator,
|
||||
content_token_limit=content_token_limit,
|
||||
)
|
||||
|
||||
current_token_count = count_tokens(accumulator.text, self.tokenizer)
|
||||
next_section_tokens = self.section_separator_token_count + section_token_count
|
||||
|
||||
# Fits — extend the accumulator
|
||||
if next_section_tokens + current_token_count <= content_token_limit:
|
||||
offset = len(shared_precompare_cleanup(accumulator.text))
|
||||
new_text = accumulator.text
|
||||
if new_text:
|
||||
new_text += SECTION_SEPARATOR
|
||||
new_text += section_text
|
||||
return SectionChunkerOutput(
|
||||
payloads=[],
|
||||
accumulator=AccumulatorState(
|
||||
text=new_text,
|
||||
link_offsets={**accumulator.link_offsets, offset: section_link},
|
||||
),
|
||||
)
|
||||
|
||||
# Doesn't fit — flush buffer and restart with this section
|
||||
return SectionChunkerOutput(
|
||||
payloads=accumulator.flush_to_list(),
|
||||
accumulator=AccumulatorState(
|
||||
text=section_text,
|
||||
link_offsets={0: section_link},
|
||||
),
|
||||
)
|
||||
|
||||
def _handle_oversized_section(
|
||||
self,
|
||||
section_text: str,
|
||||
section_link: str,
|
||||
accumulator: AccumulatorState,
|
||||
content_token_limit: int,
|
||||
) -> SectionChunkerOutput:
|
||||
payloads = accumulator.flush_to_list()
|
||||
|
||||
split_texts = cast(list[str], self.chunk_splitter.chunk(section_text))
|
||||
for i, split_text in enumerate(split_texts):
|
||||
if (
|
||||
STRICT_CHUNK_TOKEN_LIMIT
|
||||
and count_tokens(split_text, self.tokenizer) > content_token_limit
|
||||
):
|
||||
smaller_chunks = self._split_oversized_chunk(
|
||||
split_text, content_token_limit
|
||||
)
|
||||
for j, small_chunk in enumerate(smaller_chunks):
|
||||
payloads.append(
|
||||
ChunkPayload(
|
||||
text=small_chunk,
|
||||
links={0: section_link},
|
||||
is_continuation=(j != 0),
|
||||
)
|
||||
)
|
||||
else:
|
||||
payloads.append(
|
||||
ChunkPayload(
|
||||
text=split_text,
|
||||
links={0: section_link},
|
||||
is_continuation=(i != 0),
|
||||
)
|
||||
)
|
||||
|
||||
return SectionChunkerOutput(
|
||||
payloads=payloads,
|
||||
accumulator=AccumulatorState(),
|
||||
)
|
||||
|
||||
def _split_oversized_chunk(self, text: str, content_token_limit: int) -> list[str]:
|
||||
tokens = self.tokenizer.tokenize(text)
|
||||
chunks: list[str] = []
|
||||
start = 0
|
||||
total_tokens = len(tokens)
|
||||
while start < total_tokens:
|
||||
end = min(start + content_token_limit, total_tokens)
|
||||
token_chunk = tokens[start:end]
|
||||
chunk_text = " ".join(token_chunk)
|
||||
chunks.append(chunk_text)
|
||||
start = end
|
||||
return chunks
|
||||
@@ -542,7 +542,6 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
|
||||
**document.model_dump(),
|
||||
processed_sections=[
|
||||
Section(
|
||||
type=section.type,
|
||||
text=section.text if isinstance(section, TextSection) else "",
|
||||
link=section.link,
|
||||
image_file_id=(
|
||||
@@ -567,7 +566,6 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
|
||||
if isinstance(section, ImageSection):
|
||||
# Default section with image path preserved - ensure text is always a string
|
||||
processed_section = Section(
|
||||
type=section.type,
|
||||
link=section.link,
|
||||
image_file_id=section.image_file_id,
|
||||
text="", # Initialize with empty string
|
||||
@@ -611,7 +609,6 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
|
||||
# For TextSection, create a base Section with text and link
|
||||
elif isinstance(section, TextSection):
|
||||
processed_section = Section(
|
||||
type=section.type,
|
||||
text=section.text or "", # Ensure text is always a string, not None
|
||||
link=section.link,
|
||||
image_file_id=None,
|
||||
|
||||
@@ -618,7 +618,6 @@ done
|
||||
"app.kubernetes.io/managed-by": "onyx",
|
||||
"onyx.app/sandbox-id": sandbox_id,
|
||||
"onyx.app/tenant-id": tenant_id,
|
||||
"admission.datadoghq.com/enabled": "false",
|
||||
},
|
||||
),
|
||||
spec=pod_spec,
|
||||
|
||||
@@ -65,7 +65,6 @@ class Settings(BaseModel):
|
||||
anonymous_user_enabled: bool | None = None
|
||||
invite_only_enabled: bool = False
|
||||
deep_research_enabled: bool | None = None
|
||||
multi_model_chat_enabled: bool | None = None
|
||||
search_ui_enabled: bool | None = None
|
||||
|
||||
# Whether EE features are unlocked for use.
|
||||
@@ -90,8 +89,7 @@ class Settings(BaseModel):
|
||||
default=DEFAULT_USER_FILE_MAX_UPLOAD_SIZE_MB, ge=0
|
||||
)
|
||||
file_token_count_threshold_k: int | None = Field(
|
||||
default=None,
|
||||
ge=0, # thousands of tokens; None = context-aware default
|
||||
default=None, ge=0 # thousands of tokens; None = context-aware default
|
||||
)
|
||||
|
||||
# Connector settings
|
||||
|
||||
@@ -17,7 +17,6 @@ def documents_to_indexing_documents(
|
||||
processed_sections = []
|
||||
for section in document.sections:
|
||||
processed_section = Section(
|
||||
type=section.type,
|
||||
text=section.text or "",
|
||||
link=section.link,
|
||||
image_file_id=None,
|
||||
|
||||
@@ -12,7 +12,6 @@ from onyx.connectors.models import ConnectorFailure
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import HierarchyNode
|
||||
from onyx.connectors.models import ImageSection
|
||||
from onyx.connectors.models import TabularSection
|
||||
from onyx.connectors.models import TextSection
|
||||
|
||||
_ITERATION_LIMIT = 100_000
|
||||
@@ -142,15 +141,13 @@ def load_all_from_connector(
|
||||
|
||||
def to_sections(
|
||||
documents: list[Document],
|
||||
) -> Iterator[TextSection | ImageSection | TabularSection]:
|
||||
) -> Iterator[TextSection | ImageSection]:
|
||||
for doc in documents:
|
||||
for section in doc.sections:
|
||||
yield section
|
||||
|
||||
|
||||
def to_text_sections(
|
||||
sections: Iterator[TextSection | ImageSection | TabularSection],
|
||||
) -> Iterator[str]:
|
||||
def to_text_sections(sections: Iterator[TextSection | ImageSection]) -> Iterator[str]:
|
||||
for section in sections:
|
||||
if isinstance(section, TextSection):
|
||||
yield section.text
|
||||
|
||||
@@ -4,7 +4,6 @@ from typing import cast
|
||||
import openpyxl
|
||||
from openpyxl.worksheet.worksheet import Worksheet
|
||||
|
||||
from onyx.file_processing.extract_file_text import xlsx_sheet_extraction
|
||||
from onyx.file_processing.extract_file_text import xlsx_to_text
|
||||
|
||||
|
||||
@@ -197,136 +196,3 @@ class TestXlsxToText:
|
||||
assert "r1c1" in lines[0] and "r1c2" in lines[0]
|
||||
assert "r2c1" in lines[1] and "r2c2" in lines[1]
|
||||
assert "r3c1" in lines[2] and "r3c2" in lines[2]
|
||||
|
||||
|
||||
class TestXlsxSheetExtraction:
|
||||
def test_one_tuple_per_sheet(self) -> None:
|
||||
xlsx = _make_xlsx(
|
||||
{
|
||||
"Revenue": [["Month", "Amount"], ["Jan", "100"]],
|
||||
"Expenses": [["Category", "Cost"], ["Rent", "500"]],
|
||||
}
|
||||
)
|
||||
sheets = xlsx_sheet_extraction(xlsx)
|
||||
assert len(sheets) == 2
|
||||
# Order preserved from workbook sheet order
|
||||
titles = [title for _csv, title in sheets]
|
||||
assert titles == ["Revenue", "Expenses"]
|
||||
# Content present in the right tuple
|
||||
revenue_csv, _ = sheets[0]
|
||||
expenses_csv, _ = sheets[1]
|
||||
assert "Month" in revenue_csv
|
||||
assert "Jan" in revenue_csv
|
||||
assert "Category" in expenses_csv
|
||||
assert "Rent" in expenses_csv
|
||||
|
||||
def test_tuple_structure_is_csv_text_then_title(self) -> None:
|
||||
"""The tuple order is (csv_text, sheet_title) — pin it so callers
|
||||
that unpack positionally don't silently break."""
|
||||
xlsx = _make_xlsx({"MySheet": [["a", "b"]]})
|
||||
sheets = xlsx_sheet_extraction(xlsx)
|
||||
assert len(sheets) == 1
|
||||
csv_text, title = sheets[0]
|
||||
assert title == "MySheet"
|
||||
assert "a" in csv_text
|
||||
assert "b" in csv_text
|
||||
|
||||
def test_empty_sheet_is_skipped(self) -> None:
|
||||
"""A sheet whose CSV output is empty/whitespace-only should NOT
|
||||
appear in the result — the `if csv_text.strip():` guard filters
|
||||
it out."""
|
||||
xlsx = _make_xlsx(
|
||||
{
|
||||
"Data": [["a", "b"]],
|
||||
"Empty": [],
|
||||
}
|
||||
)
|
||||
sheets = xlsx_sheet_extraction(xlsx)
|
||||
assert len(sheets) == 1
|
||||
assert sheets[0][1] == "Data"
|
||||
|
||||
def test_empty_workbook_returns_empty_list(self) -> None:
|
||||
"""All sheets empty → empty list (not a list of empty tuples)."""
|
||||
xlsx = _make_xlsx({"Sheet1": [], "Sheet2": []})
|
||||
sheets = xlsx_sheet_extraction(xlsx)
|
||||
assert sheets == []
|
||||
|
||||
def test_single_sheet(self) -> None:
|
||||
xlsx = _make_xlsx({"Only": [["x", "y"], ["1", "2"]]})
|
||||
sheets = xlsx_sheet_extraction(xlsx)
|
||||
assert len(sheets) == 1
|
||||
csv_text, title = sheets[0]
|
||||
assert title == "Only"
|
||||
assert "x" in csv_text
|
||||
assert "1" in csv_text
|
||||
|
||||
def test_bad_zip_returns_empty_list(self) -> None:
|
||||
bad_file = io.BytesIO(b"not a zip file")
|
||||
sheets = xlsx_sheet_extraction(bad_file, file_name="test.xlsx")
|
||||
assert sheets == []
|
||||
|
||||
def test_bad_zip_tilde_file_returns_empty_list(self) -> None:
|
||||
"""`~$`-prefixed files are Excel lock files; failure should log
|
||||
at debug (not warning) and still return []."""
|
||||
bad_file = io.BytesIO(b"not a zip file")
|
||||
sheets = xlsx_sheet_extraction(bad_file, file_name="~$temp.xlsx")
|
||||
assert sheets == []
|
||||
|
||||
def test_csv_content_matches_xlsx_to_text_per_sheet(self) -> None:
|
||||
"""For a single-sheet workbook, xlsx_to_text output should equal
|
||||
the csv_text from xlsx_sheet_extraction — they share the same
|
||||
per-sheet CSV-ification logic."""
|
||||
single_sheet_data = [["Name", "Age"], ["Alice", "30"]]
|
||||
expected_text = xlsx_to_text(_make_xlsx({"People": single_sheet_data}))
|
||||
|
||||
sheets = xlsx_sheet_extraction(_make_xlsx({"People": single_sheet_data}))
|
||||
assert len(sheets) == 1
|
||||
csv_text, title = sheets[0]
|
||||
assert title == "People"
|
||||
assert csv_text.strip() == expected_text.strip()
|
||||
|
||||
def test_commas_in_cells_are_quoted(self) -> None:
|
||||
xlsx = _make_xlsx({"S1": [["hello, world", "normal"]]})
|
||||
sheets = xlsx_sheet_extraction(xlsx)
|
||||
assert len(sheets) == 1
|
||||
csv_text, _ = sheets[0]
|
||||
assert '"hello, world"' in csv_text
|
||||
|
||||
def test_long_empty_row_run_capped_within_sheet(self) -> None:
|
||||
"""The matrix cleanup applies per-sheet: >2 empty rows collapse
|
||||
to 2, which keeps the sheet non-empty and it still appears in
|
||||
the result."""
|
||||
xlsx = _make_xlsx(
|
||||
{
|
||||
"S1": [
|
||||
["header"],
|
||||
[""],
|
||||
[""],
|
||||
[""],
|
||||
[""],
|
||||
["data"],
|
||||
]
|
||||
}
|
||||
)
|
||||
sheets = xlsx_sheet_extraction(xlsx)
|
||||
assert len(sheets) == 1
|
||||
csv_text, _ = sheets[0]
|
||||
lines = csv_text.strip().split("\n")
|
||||
# header + 2 empty (capped) + data = 4 lines
|
||||
assert len(lines) == 4
|
||||
assert "header" in lines[0]
|
||||
assert "data" in lines[-1]
|
||||
|
||||
def test_sheet_title_with_special_chars_preserved(self) -> None:
|
||||
"""Spaces, punctuation, unicode in sheet titles are preserved
|
||||
verbatim — the title is used as a link anchor downstream."""
|
||||
xlsx = _make_xlsx(
|
||||
{
|
||||
"Q1 Revenue (USD)": [["a", "b"]],
|
||||
"Données": [["c", "d"]],
|
||||
}
|
||||
)
|
||||
sheets = xlsx_sheet_extraction(xlsx)
|
||||
titles = [title for _csv, title in sheets]
|
||||
assert "Q1 Revenue (USD)" in titles
|
||||
assert "Données" in titles
|
||||
|
||||
@@ -1,13 +1,11 @@
|
||||
import pytest
|
||||
from chonkie import SentenceChunker
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.configs.constants import SECTION_SEPARATOR
|
||||
from onyx.connectors.models import IndexingDocument
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import SectionType
|
||||
from onyx.indexing.chunking import DocumentChunker
|
||||
from onyx.indexing.chunking import text_section_chunker as text_chunker_module
|
||||
from onyx.indexing import chunker as chunker_module
|
||||
from onyx.indexing.chunker import Chunker
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
|
||||
|
||||
@@ -29,26 +27,16 @@ class CharTokenizer(BaseTokenizer):
|
||||
CHUNK_LIMIT = 200
|
||||
|
||||
|
||||
def _make_document_chunker(
|
||||
def _make_chunker(
|
||||
chunk_token_limit: int = CHUNK_LIMIT,
|
||||
) -> DocumentChunker:
|
||||
def token_counter(text: str) -> int:
|
||||
return len(text)
|
||||
|
||||
return DocumentChunker(
|
||||
enable_multipass: bool = False,
|
||||
) -> Chunker:
|
||||
return Chunker(
|
||||
tokenizer=CharTokenizer(),
|
||||
blurb_splitter=SentenceChunker(
|
||||
tokenizer_or_token_counter=token_counter,
|
||||
chunk_size=128,
|
||||
chunk_overlap=0,
|
||||
return_type="texts",
|
||||
),
|
||||
chunk_splitter=SentenceChunker(
|
||||
tokenizer_or_token_counter=token_counter,
|
||||
chunk_size=chunk_token_limit,
|
||||
chunk_overlap=0,
|
||||
return_type="texts",
|
||||
),
|
||||
enable_multipass=enable_multipass,
|
||||
enable_large_chunks=False,
|
||||
enable_contextual_rag=False,
|
||||
chunk_token_limit=chunk_token_limit,
|
||||
)
|
||||
|
||||
|
||||
@@ -74,10 +62,10 @@ def _make_doc(
|
||||
def test_empty_processed_sections_returns_single_empty_safety_chunk() -> None:
|
||||
"""No sections at all should still yield one empty chunk (the
|
||||
`or not chunks` safety branch at the end)."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
doc = _make_doc(sections=[])
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=[],
|
||||
title_prefix="TITLE\n",
|
||||
@@ -99,13 +87,13 @@ def test_empty_processed_sections_returns_single_empty_safety_chunk() -> None:
|
||||
def test_empty_section_on_first_position_without_title_is_skipped() -> None:
|
||||
"""Doc has no title, first section has empty text — the guard
|
||||
`(not document.title or section_idx > 0)` means it IS skipped."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
doc = _make_doc(
|
||||
sections=[Section(type=SectionType.TEXT, text="", link="l0")],
|
||||
sections=[Section(text="", link="l0")],
|
||||
title=None,
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -121,16 +109,16 @@ def test_empty_section_on_first_position_without_title_is_skipped() -> None:
|
||||
|
||||
def test_empty_section_on_later_position_is_skipped_even_with_title() -> None:
|
||||
"""Index > 0 empty sections are skipped regardless of title."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
doc = _make_doc(
|
||||
sections=[
|
||||
Section(type=SectionType.TEXT, text="Alpha.", link="l0"),
|
||||
Section(type=SectionType.TEXT, text="", link="l1"), # should be skipped
|
||||
Section(type=SectionType.TEXT, text="Beta.", link="l2"),
|
||||
Section(text="Alpha.", link="l0"),
|
||||
Section(text="", link="l1"), # should be skipped
|
||||
Section(text="Beta.", link="l2"),
|
||||
],
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -150,12 +138,10 @@ def test_empty_section_on_later_position_is_skipped_even_with_title() -> None:
|
||||
|
||||
|
||||
def test_single_small_text_section_becomes_one_chunk() -> None:
|
||||
dc = _make_document_chunker()
|
||||
doc = _make_doc(
|
||||
sections=[Section(type=SectionType.TEXT, text="Hello world.", link="https://a")]
|
||||
)
|
||||
chunker = _make_chunker()
|
||||
doc = _make_doc(sections=[Section(text="Hello world.", link="https://a")])
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="TITLE\n",
|
||||
@@ -179,15 +165,15 @@ def test_single_small_text_section_becomes_one_chunk() -> None:
|
||||
|
||||
|
||||
def test_multiple_small_sections_combine_into_one_chunk() -> None:
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
sections = [
|
||||
Section(type=SectionType.TEXT, text="Part one.", link="l1"),
|
||||
Section(type=SectionType.TEXT, text="Part two.", link="l2"),
|
||||
Section(type=SectionType.TEXT, text="Part three.", link="l3"),
|
||||
Section(text="Part one.", link="l1"),
|
||||
Section(text="Part two.", link="l2"),
|
||||
Section(text="Part three.", link="l3"),
|
||||
]
|
||||
doc = _make_doc(sections=sections)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -211,19 +197,19 @@ def test_multiple_small_sections_combine_into_one_chunk() -> None:
|
||||
def test_sections_overflow_into_second_chunk() -> None:
|
||||
"""Two sections that together exceed content_token_limit should
|
||||
finalize the first as one chunk and start a new one."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
# char-level: 120 char section → 120 tokens. 2 of these plus separator
|
||||
# exceed a 200-token limit, forcing a flush.
|
||||
a = "A" * 120
|
||||
b = "B" * 120
|
||||
doc = _make_doc(
|
||||
sections=[
|
||||
Section(type=SectionType.TEXT, text=a, link="la"),
|
||||
Section(type=SectionType.TEXT, text=b, link="lb"),
|
||||
Section(text=a, link="la"),
|
||||
Section(text=b, link="lb"),
|
||||
],
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -250,11 +236,10 @@ def test_sections_overflow_into_second_chunk() -> None:
|
||||
|
||||
|
||||
def test_image_only_section_produces_single_chunk_with_image_id() -> None:
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
doc = _make_doc(
|
||||
sections=[
|
||||
Section(
|
||||
type=SectionType.IMAGE,
|
||||
text="summary of image",
|
||||
link="https://img",
|
||||
image_file_id="img-abc",
|
||||
@@ -262,7 +247,7 @@ def test_image_only_section_produces_single_chunk_with_image_id() -> None:
|
||||
],
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -280,21 +265,20 @@ def test_image_only_section_produces_single_chunk_with_image_id() -> None:
|
||||
def test_image_section_flushes_pending_text_and_creates_its_own_chunk() -> None:
|
||||
"""A buffered text section followed by an image section:
|
||||
the pending text should be flushed first, then the image chunk."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
doc = _make_doc(
|
||||
sections=[
|
||||
Section(type=SectionType.TEXT, text="Pending text.", link="ltext"),
|
||||
Section(text="Pending text.", link="ltext"),
|
||||
Section(
|
||||
type=SectionType.IMAGE,
|
||||
text="image summary",
|
||||
link="limage",
|
||||
image_file_id="img-1",
|
||||
),
|
||||
Section(type=SectionType.TEXT, text="Trailing text.", link="ltail"),
|
||||
Section(text="Trailing text.", link="ltail"),
|
||||
],
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -323,19 +307,14 @@ def test_image_section_flushes_pending_text_and_creates_its_own_chunk() -> None:
|
||||
|
||||
def test_image_section_without_link_gets_empty_links_dict() -> None:
|
||||
"""If an image section has no link, links param is {} (not {0: ""})."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
doc = _make_doc(
|
||||
sections=[
|
||||
Section(
|
||||
type=SectionType.IMAGE,
|
||||
text="img",
|
||||
link=None,
|
||||
image_file_id="img-xyz",
|
||||
),
|
||||
Section(text="img", link=None, image_file_id="img-xyz"),
|
||||
],
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -346,7 +325,7 @@ def test_image_section_without_link_gets_empty_links_dict() -> None:
|
||||
|
||||
assert len(chunks) == 1
|
||||
assert chunks[0].image_file_id == "img-xyz"
|
||||
# to_doc_aware_chunk falls back to {0: ""} when given an empty dict
|
||||
# _create_chunk falls back to {0: ""} when given an empty dict
|
||||
assert chunks[0].source_links == {0: ""}
|
||||
|
||||
|
||||
@@ -357,7 +336,7 @@ def test_oversized_section_is_split_across_multiple_chunks() -> None:
|
||||
"""A section whose text exceeds content_token_limit should be passed
|
||||
through chunk_splitter and yield >1 chunks; only the first is not a
|
||||
continuation."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
# Build a section whose char-count is well over CHUNK_LIMIT (200), made
|
||||
# of many short sentences so chonkie's SentenceChunker can split cleanly.
|
||||
section_text = (
|
||||
@@ -370,10 +349,10 @@ def test_oversized_section_is_split_across_multiple_chunks() -> None:
|
||||
assert len(section_text) > CHUNK_LIMIT
|
||||
|
||||
doc = _make_doc(
|
||||
sections=[Section(type=SectionType.TEXT, text=section_text, link="big-link")],
|
||||
sections=[Section(text=section_text, link="big-link")],
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -400,7 +379,7 @@ def test_oversized_section_is_split_across_multiple_chunks() -> None:
|
||||
def test_oversized_section_flushes_pending_text_first() -> None:
|
||||
"""A buffered text section followed by an oversized section should
|
||||
flush the pending chunk first, then emit the split chunks."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
pending = "Pending buffered text."
|
||||
big = (
|
||||
"Alpha beta gamma. Delta epsilon zeta. Eta theta iota. "
|
||||
@@ -412,12 +391,12 @@ def test_oversized_section_flushes_pending_text_first() -> None:
|
||||
|
||||
doc = _make_doc(
|
||||
sections=[
|
||||
Section(type=SectionType.TEXT, text=pending, link="l-pending"),
|
||||
Section(type=SectionType.TEXT, text=big, link="l-big"),
|
||||
Section(text=pending, link="l-pending"),
|
||||
Section(text=big, link="l-big"),
|
||||
],
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -446,15 +425,15 @@ def test_oversized_section_flushes_pending_text_first() -> None:
|
||||
|
||||
|
||||
def test_title_prefix_and_metadata_propagate_to_all_chunks() -> None:
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
doc = _make_doc(
|
||||
sections=[
|
||||
Section(type=SectionType.TEXT, text="A" * 120, link="la"),
|
||||
Section(type=SectionType.TEXT, text="B" * 120, link="lb"),
|
||||
Section(text="A" * 120, link="la"),
|
||||
Section(text="B" * 120, link="lb"),
|
||||
],
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="MY_TITLE\n",
|
||||
@@ -474,16 +453,16 @@ def test_title_prefix_and_metadata_propagate_to_all_chunks() -> None:
|
||||
|
||||
|
||||
def test_chunk_ids_are_sequential_starting_at_zero() -> None:
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
doc = _make_doc(
|
||||
sections=[
|
||||
Section(type=SectionType.TEXT, text="A" * 120, link="la"),
|
||||
Section(type=SectionType.TEXT, text="B" * 120, link="lb"),
|
||||
Section(type=SectionType.TEXT, text="C" * 120, link="lc"),
|
||||
Section(text="A" * 120, link="la"),
|
||||
Section(text="B" * 120, link="lb"),
|
||||
Section(text="C" * 120, link="lc"),
|
||||
],
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -501,18 +480,18 @@ def test_chunk_ids_are_sequential_starting_at_zero() -> None:
|
||||
def test_overflow_flush_then_subsequent_section_joins_new_chunk() -> None:
|
||||
"""After an overflow flush starts a new chunk, the next fitting section
|
||||
should combine into that same new chunk (not spawn a third)."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
# 120 + 120 > 200 → first two sections produce two chunks.
|
||||
# Third section is small (20 chars) → should fit with second.
|
||||
doc = _make_doc(
|
||||
sections=[
|
||||
Section(type=SectionType.TEXT, text="A" * 120, link="la"),
|
||||
Section(type=SectionType.TEXT, text="B" * 120, link="lb"),
|
||||
Section(type=SectionType.TEXT, text="C" * 20, link="lc"),
|
||||
Section(text="A" * 120, link="la"),
|
||||
Section(text="B" * 120, link="lb"),
|
||||
Section(text="C" * 20, link="lc"),
|
||||
],
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -532,7 +511,7 @@ def test_small_section_after_oversized_starts_a_fresh_chunk() -> None:
|
||||
"""After an oversized section is emitted as its own chunks, the internal
|
||||
accumulator should be empty so a following small section starts a new
|
||||
chunk instead of being swallowed."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
big = (
|
||||
"Alpha beta gamma. Delta epsilon zeta. Eta theta iota. "
|
||||
"Kappa lambda mu. Nu xi omicron. Pi rho sigma. Tau upsilon phi. "
|
||||
@@ -542,12 +521,12 @@ def test_small_section_after_oversized_starts_a_fresh_chunk() -> None:
|
||||
assert len(big) > CHUNK_LIMIT
|
||||
doc = _make_doc(
|
||||
sections=[
|
||||
Section(type=SectionType.TEXT, text=big, link="l-big"),
|
||||
Section(type=SectionType.TEXT, text="Tail text.", link="l-tail"),
|
||||
Section(text=big, link="l-big"),
|
||||
Section(text="Tail text.", link="l-tail"),
|
||||
],
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -576,14 +555,14 @@ def test_strict_chunk_token_limit_subdivides_oversized_split(
|
||||
"""When STRICT_CHUNK_TOKEN_LIMIT is enabled and chonkie's chunk_splitter
|
||||
still produces a piece larger than content_token_limit (e.g. a single
|
||||
no-period run), the code must fall back to _split_oversized_chunk."""
|
||||
monkeypatch.setattr(text_chunker_module, "STRICT_CHUNK_TOKEN_LIMIT", True)
|
||||
dc = _make_document_chunker()
|
||||
monkeypatch.setattr(chunker_module, "STRICT_CHUNK_TOKEN_LIMIT", True)
|
||||
chunker = _make_chunker()
|
||||
# 500 non-whitespace chars with no sentence boundaries — chonkie will
|
||||
# return it as one oversized piece (>200) which triggers the fallback.
|
||||
run = "a" * 500
|
||||
doc = _make_doc(sections=[Section(type=SectionType.TEXT, text=run, link="l-run")])
|
||||
doc = _make_doc(sections=[Section(text=run, link="l-run")])
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -613,12 +592,12 @@ def test_strict_chunk_token_limit_disabled_allows_oversized_split(
|
||||
) -> None:
|
||||
"""Same pathological input, but with STRICT disabled: the oversized
|
||||
split is emitted verbatim as a single chunk (current behavior)."""
|
||||
monkeypatch.setattr(text_chunker_module, "STRICT_CHUNK_TOKEN_LIMIT", False)
|
||||
dc = _make_document_chunker()
|
||||
monkeypatch.setattr(chunker_module, "STRICT_CHUNK_TOKEN_LIMIT", False)
|
||||
chunker = _make_chunker()
|
||||
run = "a" * 500
|
||||
doc = _make_doc(sections=[Section(type=SectionType.TEXT, text=run, link="l-run")])
|
||||
doc = _make_doc(sections=[Section(text=run, link="l-run")])
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -640,18 +619,16 @@ def test_first_empty_section_with_title_is_processed_not_skipped() -> None:
|
||||
the doc has a title AND it's the first section, an empty text section
|
||||
is NOT skipped. This pins current behavior so a refactor can't silently
|
||||
change it."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
doc = _make_doc(
|
||||
sections=[
|
||||
Section(
|
||||
type=SectionType.TEXT, text="", link="l0"
|
||||
), # empty first section, kept
|
||||
Section(type=SectionType.TEXT, text="Real content.", link="l1"),
|
||||
Section(text="", link="l0"), # empty first section, kept
|
||||
Section(text="Real content.", link="l1"),
|
||||
],
|
||||
title="Has A Title",
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -674,13 +651,13 @@ def test_first_empty_section_with_title_is_processed_not_skipped() -> None:
|
||||
def test_clean_text_strips_control_chars_from_section_content() -> None:
|
||||
"""clean_text() should remove control chars before the text enters the
|
||||
accumulator — verifies the call isn't dropped by a refactor."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
# NUL + BEL are control chars below 0x20 and not \n or \t → should be
|
||||
# stripped by clean_text.
|
||||
dirty = "Hello\x00 World\x07!"
|
||||
doc = _make_doc(sections=[Section(type=SectionType.TEXT, text=dirty, link="l1")])
|
||||
doc = _make_doc(sections=[Section(text=dirty, link="l1")])
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -700,16 +677,16 @@ def test_section_with_none_text_behaves_like_empty_string() -> None:
|
||||
"""`section.text` may be None — the method coerces via
|
||||
`str(section.text or "")`, so a None-text section behaves identically
|
||||
to an empty one (skipped unless it's the first section of a titled doc)."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
doc = _make_doc(
|
||||
sections=[
|
||||
Section(type=SectionType.TEXT, text="Alpha.", link="la"),
|
||||
Section(type=SectionType.TEXT, text=None, link="lnone"), # idx 1 → skipped
|
||||
Section(type=SectionType.TEXT, text="Beta.", link="lb"),
|
||||
Section(text="Alpha.", link="la"),
|
||||
Section(text=None, link="lnone"), # idx 1 → skipped
|
||||
Section(text="Beta.", link="lb"),
|
||||
],
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -731,20 +708,15 @@ def test_no_trailing_empty_chunk_when_last_section_was_image() -> None:
|
||||
"""If the final section was an image (which emits its own chunk and
|
||||
resets chunk_text), the safety `or not chunks` branch should NOT fire
|
||||
because chunks is non-empty. Pin this explicitly."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
doc = _make_doc(
|
||||
sections=[
|
||||
Section(type=SectionType.TEXT, text="Leading text.", link="ltext"),
|
||||
Section(
|
||||
type=SectionType.IMAGE,
|
||||
text="img summary",
|
||||
link="limg",
|
||||
image_file_id="img-final",
|
||||
),
|
||||
Section(text="Leading text.", link="ltext"),
|
||||
Section(text="img summary", link="limg", image_file_id="img-final"),
|
||||
],
|
||||
)
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
@@ -764,7 +736,7 @@ def test_no_trailing_empty_chunk_when_last_section_was_image() -> None:
|
||||
def test_no_trailing_empty_chunk_when_last_section_was_oversized() -> None:
|
||||
"""Same guarantee for oversized sections: their splits fully clear the
|
||||
accumulator, and the trailing safety branch should be a no-op."""
|
||||
dc = _make_document_chunker()
|
||||
chunker = _make_chunker()
|
||||
big = (
|
||||
"Alpha beta gamma. Delta epsilon zeta. Eta theta iota. "
|
||||
"Kappa lambda mu. Nu xi omicron. Pi rho sigma. Tau upsilon phi. "
|
||||
@@ -772,9 +744,9 @@ def test_no_trailing_empty_chunk_when_last_section_was_oversized() -> None:
|
||||
"Ten eleven twelve. Thirteen fourteen fifteen. Sixteen seventeen."
|
||||
)
|
||||
assert len(big) > CHUNK_LIMIT
|
||||
doc = _make_doc(sections=[Section(type=SectionType.TEXT, text=big, link="l-big")])
|
||||
doc = _make_doc(sections=[Section(text=big, link="l-big")])
|
||||
|
||||
chunks = dc.chunk(
|
||||
chunks = chunker._chunk_document_with_sections(
|
||||
document=doc,
|
||||
sections=doc.processed_sections,
|
||||
title_prefix="",
|
||||
|
||||
@@ -193,9 +193,10 @@ hover, active, and disabled states.
|
||||
|
||||
### Disabled (`core/disabled/`)
|
||||
|
||||
A pure CSS wrapper that applies disabled visuals (`opacity-50`, `cursor-not-allowed`,
|
||||
`pointer-events: none`) to a single child element via Radix `Slot`. Has no React context —
|
||||
Interactive primitives and buttons manage their own disabled state via a `disabled` prop.
|
||||
A `<div>` wrapper that applies disabled visuals (`opacity-50`, `cursor-not-allowed`,
|
||||
`pointer-events: none`) to its children via the `data-opal-disabled` CSS attribute. Supports an
|
||||
optional `tooltip` prop (shown on hover when disabled) and `allowClick` to re-enable pointer
|
||||
events. Interactive primitives and buttons manage their own disabled state via a `disabled` prop.
|
||||
|
||||
### Hoverable (`core/animations/`)
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import type { HTMLAttributes } from "react";
|
||||
|
||||
import type { RichStr, WithoutStyles } from "@opal/types";
|
||||
import { cn } from "@opal/utils";
|
||||
import { resolveStr } from "@opal/components/text/InlineMarkdown";
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
.opal-tooltip {
|
||||
z-index: var(--z-tooltip, 1300);
|
||||
max-width: 20rem;
|
||||
@apply rounded-08 px-3 py-2 text-sm
|
||||
bg-background-neutral-dark-03 text-text-light-05
|
||||
animate-in fade-in-0 zoom-in-95
|
||||
|
||||
129
web/lib/opal/src/core/disabled/Disabled.stories.tsx
Normal file
129
web/lib/opal/src/core/disabled/Disabled.stories.tsx
Normal file
@@ -0,0 +1,129 @@
|
||||
import type { Meta, StoryObj } from "@storybook/react";
|
||||
import * as TooltipPrimitive from "@radix-ui/react-tooltip";
|
||||
import type { Decorator } from "@storybook/react";
|
||||
import { Disabled } from "@opal/core";
|
||||
import { Card } from "@opal/components";
|
||||
import { Button } from "@opal/components/buttons/button/components";
|
||||
|
||||
const withTooltipProvider: Decorator = (Story) => (
|
||||
<TooltipPrimitive.Provider>
|
||||
<Story />
|
||||
</TooltipPrimitive.Provider>
|
||||
);
|
||||
|
||||
const meta: Meta<typeof Disabled> = {
|
||||
title: "opal/core/Disabled",
|
||||
component: Disabled,
|
||||
tags: ["autodocs"],
|
||||
decorators: [withTooltipProvider],
|
||||
};
|
||||
|
||||
export default meta;
|
||||
type Story = StoryObj<typeof Disabled>;
|
||||
|
||||
const SampleContent = () => (
|
||||
<Card border="solid" padding="md">
|
||||
<div className="flex flex-col gap-2">
|
||||
<p className="text-sm font-medium">Card Title</p>
|
||||
<p className="text-xs text-text-03">Some content that can be disabled.</p>
|
||||
<Button prominence="secondary" size="sm">
|
||||
Action
|
||||
</Button>
|
||||
</div>
|
||||
</Card>
|
||||
);
|
||||
|
||||
export const Enabled: Story = {
|
||||
render: () => (
|
||||
<div className="w-80">
|
||||
<Disabled disabled={false}>
|
||||
<SampleContent />
|
||||
</Disabled>
|
||||
</div>
|
||||
),
|
||||
};
|
||||
|
||||
export const DisabledState: Story = {
|
||||
render: () => (
|
||||
<div className="w-80">
|
||||
<Disabled disabled>
|
||||
<SampleContent />
|
||||
</Disabled>
|
||||
</div>
|
||||
),
|
||||
};
|
||||
|
||||
export const WithTooltip: Story = {
|
||||
render: () => (
|
||||
<div className="w-80">
|
||||
<Disabled disabled tooltip="This feature requires a Pro plan">
|
||||
<SampleContent />
|
||||
</Disabled>
|
||||
</div>
|
||||
),
|
||||
};
|
||||
|
||||
export const TooltipSides: Story = {
|
||||
render: () => (
|
||||
<div className="flex flex-col gap-8 items-center py-16">
|
||||
{(["top", "right", "bottom", "left"] as const).map((side) => (
|
||||
<Disabled
|
||||
key={side}
|
||||
disabled
|
||||
tooltip={`Tooltip on ${side}`}
|
||||
tooltipSide={side}
|
||||
>
|
||||
<Card border="solid" padding="sm">
|
||||
<p className="text-sm">tooltipSide: {side}</p>
|
||||
</Card>
|
||||
</Disabled>
|
||||
))}
|
||||
</div>
|
||||
),
|
||||
};
|
||||
|
||||
export const WithAllowClick: Story = {
|
||||
render: () => (
|
||||
<div className="w-80">
|
||||
<Disabled disabled allowClick>
|
||||
<Card border="solid" padding="md">
|
||||
<p className="text-sm">
|
||||
Disabled visuals, but pointer events are still active.
|
||||
</p>
|
||||
<Button
|
||||
prominence="tertiary"
|
||||
size="sm"
|
||||
onClick={() => alert("Clicked!")}
|
||||
>
|
||||
Click me
|
||||
</Button>
|
||||
</Card>
|
||||
</Disabled>
|
||||
</div>
|
||||
),
|
||||
};
|
||||
|
||||
export const Comparison: Story = {
|
||||
render: () => (
|
||||
<div className="flex gap-4">
|
||||
<div className="flex flex-col gap-2 w-60">
|
||||
<p className="text-xs font-medium">Enabled</p>
|
||||
<Disabled disabled={false}>
|
||||
<SampleContent />
|
||||
</Disabled>
|
||||
</div>
|
||||
<div className="flex flex-col gap-2 w-60">
|
||||
<p className="text-xs font-medium">Disabled</p>
|
||||
<Disabled disabled>
|
||||
<SampleContent />
|
||||
</Disabled>
|
||||
</div>
|
||||
<div className="flex flex-col gap-2 w-60">
|
||||
<p className="text-xs font-medium">Disabled + Tooltip</p>
|
||||
<Disabled disabled tooltip="Not available right now">
|
||||
<SampleContent />
|
||||
</Disabled>
|
||||
</div>
|
||||
</div>
|
||||
),
|
||||
};
|
||||
42
web/lib/opal/src/core/disabled/README.md
Normal file
42
web/lib/opal/src/core/disabled/README.md
Normal file
@@ -0,0 +1,42 @@
|
||||
# Disabled
|
||||
|
||||
**Import:** `import { Disabled } from "@opal/core";`
|
||||
|
||||
Wrapper component that applies baseline disabled CSS (opacity, cursor, pointer-events) to its children. Renders a `<div>` with the `data-opal-disabled` attribute so styling cascades into all descendants. Works with any children — DOM elements, React components, or fragments.
|
||||
|
||||
## Props
|
||||
|
||||
| Prop | Type | Default | Description |
|
||||
|------|------|---------|-------------|
|
||||
| `disabled` | `boolean` | `false` | Applies disabled styling when truthy |
|
||||
| `allowClick` | `boolean` | `false` | Re-enables pointer events while keeping disabled visuals |
|
||||
| `tooltip` | `string \| RichStr` | — | Tooltip shown on hover when disabled (implies `allowClick`). Supports `markdown()`. |
|
||||
| `tooltipSide` | `"top" \| "bottom" \| "left" \| "right"` | `"right"` | Which side the tooltip appears on |
|
||||
|
||||
## CSS behavior
|
||||
|
||||
| Selector | Effect |
|
||||
|----------|--------|
|
||||
| `[data-opal-disabled]` | `cursor-not-allowed`, `select-none`, `pointer-events: none` |
|
||||
| `[data-opal-disabled]:not(.interactive)` | `opacity-50` (non-Interactive elements only) |
|
||||
| `[data-opal-disabled].interactive` | `pointer-events: auto` (Interactive elements handle their own disabled colors) |
|
||||
| `[data-opal-disabled][data-allow-click]` | `pointer-events: auto` |
|
||||
|
||||
## Usage
|
||||
|
||||
```tsx
|
||||
// Basic — disables children visually and blocks pointer events
|
||||
<Disabled disabled={!canSubmit}>
|
||||
<Card>Content</Card>
|
||||
</Disabled>
|
||||
|
||||
// With tooltip — explains why the section is disabled
|
||||
<Disabled disabled={!canSubmit} tooltip="Complete the form first">
|
||||
<Card>Content</Card>
|
||||
</Disabled>
|
||||
|
||||
// With allowClick — keeps pointer events for custom handling
|
||||
<Disabled disabled={isProcessing} allowClick>
|
||||
<MyInputBar />
|
||||
</Disabled>
|
||||
```
|
||||
@@ -1,13 +1,18 @@
|
||||
import "@opal/core/disabled/styles.css";
|
||||
import "@opal/components/tooltip.css";
|
||||
import React from "react";
|
||||
import { Slot } from "@radix-ui/react-slot";
|
||||
import * as TooltipPrimitive from "@radix-ui/react-tooltip";
|
||||
import type { TooltipSide } from "@opal/components";
|
||||
import type { RichStr, WithoutStyles } from "@opal/types";
|
||||
import { Text } from "@opal/components";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
interface DisabledProps extends React.HTMLAttributes<HTMLElement> {
|
||||
ref?: React.Ref<HTMLElement>;
|
||||
interface DisabledProps
|
||||
extends WithoutStyles<React.HTMLAttributes<HTMLDivElement>> {
|
||||
ref?: React.Ref<HTMLDivElement>;
|
||||
|
||||
/**
|
||||
* When truthy, applies disabled styling to child elements.
|
||||
@@ -16,13 +21,23 @@ interface DisabledProps extends React.HTMLAttributes<HTMLElement> {
|
||||
|
||||
/**
|
||||
* When `true`, re-enables pointer events while keeping the disabled
|
||||
* visual treatment. Useful for elements that need to show tooltips or
|
||||
* error messages on click.
|
||||
* visual treatment. Useful for elements that need to remain interactive
|
||||
* (e.g. to show tooltips or handle clicks at a higher level).
|
||||
* @default false
|
||||
*/
|
||||
allowClick?: boolean;
|
||||
|
||||
children: React.ReactElement;
|
||||
/**
|
||||
* Tooltip content shown on hover when disabled. Implies `allowClick` so that
|
||||
* the tooltip trigger can receive pointer events. Supports inline markdown
|
||||
* via `markdown()`.
|
||||
*/
|
||||
tooltip?: string | RichStr;
|
||||
|
||||
/** Which side the tooltip appears on. @default "right" */
|
||||
tooltipSide?: TooltipSide;
|
||||
|
||||
children?: React.ReactNode;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -31,35 +46,63 @@ interface DisabledProps extends React.HTMLAttributes<HTMLElement> {
|
||||
|
||||
/**
|
||||
* Wrapper component that applies baseline disabled CSS (opacity, cursor,
|
||||
* pointer-events) to its child element.
|
||||
* pointer-events) to its children.
|
||||
*
|
||||
* Uses Radix `Slot` — merges props onto the single child element without
|
||||
* adding any DOM node. Works correctly inside Radix `asChild` chains.
|
||||
* Renders a `<div>` that carries the `data-opal-disabled` attribute so the
|
||||
* CSS rules in `styles.css` take effect on the wrapper and cascade into its
|
||||
* descendants. Works with any children (DOM elements, React components, or
|
||||
* fragments).
|
||||
*
|
||||
* @example
|
||||
* ```tsx
|
||||
* <Disabled disabled={!canSubmit}>
|
||||
* <div>...</div>
|
||||
* <MyComponent />
|
||||
* </Disabled>
|
||||
*
|
||||
* <Disabled disabled={!canSubmit} tooltip="Feature not available">
|
||||
* <MyComponent />
|
||||
* </Disabled>
|
||||
* ```
|
||||
*/
|
||||
function Disabled({
|
||||
disabled,
|
||||
allowClick,
|
||||
children,
|
||||
tooltip,
|
||||
tooltipSide = "right",
|
||||
ref,
|
||||
...rest
|
||||
}: DisabledProps) {
|
||||
return (
|
||||
<Slot
|
||||
const showTooltip = disabled && tooltip;
|
||||
const enableClick = allowClick || showTooltip;
|
||||
|
||||
const wrapper = (
|
||||
<div
|
||||
ref={ref}
|
||||
className="opal-disabled"
|
||||
{...rest}
|
||||
aria-disabled={disabled || undefined}
|
||||
data-opal-disabled={disabled || undefined}
|
||||
data-allow-click={disabled && allowClick ? "" : undefined}
|
||||
>
|
||||
{children}
|
||||
</Slot>
|
||||
data-allow-click={disabled && enableClick ? "" : undefined}
|
||||
/>
|
||||
);
|
||||
|
||||
if (!showTooltip) return wrapper;
|
||||
|
||||
// TODO(@raunakab): Replace this raw Radix tooltip with the opalified
|
||||
// Tooltip component once it lands.
|
||||
return (
|
||||
<TooltipPrimitive.Root>
|
||||
<TooltipPrimitive.Trigger asChild>{wrapper}</TooltipPrimitive.Trigger>
|
||||
<TooltipPrimitive.Portal>
|
||||
<TooltipPrimitive.Content
|
||||
className="opal-tooltip"
|
||||
side={tooltipSide}
|
||||
sideOffset={4}
|
||||
>
|
||||
<Text font="secondary-body">{tooltip}</Text>
|
||||
</TooltipPrimitive.Content>
|
||||
</TooltipPrimitive.Portal>
|
||||
</TooltipPrimitive.Root>
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
/* Disabled — baseline disabled visuals via Radix Slot (no extra DOM node).
|
||||
/* Disabled — baseline disabled visuals applied via a wrapper <div>.
|
||||
*
|
||||
* [data-opal-disabled] → cursor + pointer-events for all
|
||||
* [data-opal-disabled]:not(.interactive) → opacity for non-Interactive elements
|
||||
@@ -9,6 +9,10 @@
|
||||
* re-enabled so the JS layer can suppress onClick.
|
||||
*/
|
||||
|
||||
.opal-disabled {
|
||||
@apply self-stretch;
|
||||
}
|
||||
|
||||
[data-opal-disabled] {
|
||||
@apply cursor-not-allowed select-none;
|
||||
pointer-events: none;
|
||||
|
||||
@@ -258,102 +258,23 @@ pre[class*="language-"] {
|
||||
scrollbar-color: #4b5563 #1f2937;
|
||||
}
|
||||
|
||||
/* Card wrapper — holds the background, border-radius, padding, and fade overlay.
|
||||
Does NOT scroll — the inner .markdown-table-breakout handles that. */
|
||||
.markdown-table-card {
|
||||
position: relative;
|
||||
background: var(--background-neutral-01);
|
||||
border-radius: 0.5rem;
|
||||
padding: 0.5rem 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Scrollable table container — sits inside the card.
|
||||
* Table breakout container - allows tables to extend beyond their parent's
|
||||
* constrained width to use the full container query width (100cqw).
|
||||
*
|
||||
* Requires an ancestor element with `container-type: inline-size` (@container in Tailwind).
|
||||
*
|
||||
* How the math works:
|
||||
* - width: 100cqw → expand to full container query width
|
||||
* - marginLeft: calc((100% - 100cqw) / 2) → negative margin pulls element left
|
||||
* (100% is parent width, 100cqw is larger, so result is negative)
|
||||
* - paddingLeft/Right: calc((100cqw - 100%) / 2) → padding keeps content aligned
|
||||
* with original position while allowing scroll area to extend
|
||||
*/
|
||||
.markdown-table-breakout {
|
||||
overflow-x: auto;
|
||||
|
||||
/* Always reserve scrollbar height so hover doesn't shift content.
|
||||
Thumb is transparent by default, revealed on hover. */
|
||||
scrollbar-width: thin; /* Firefox — always shows track */
|
||||
scrollbar-color: transparent transparent; /* invisible thumb + track */
|
||||
}
|
||||
.markdown-table-breakout::-webkit-scrollbar {
|
||||
height: 6px;
|
||||
}
|
||||
.markdown-table-breakout::-webkit-scrollbar-track {
|
||||
background: transparent;
|
||||
}
|
||||
.markdown-table-breakout::-webkit-scrollbar-thumb {
|
||||
background: transparent;
|
||||
border-radius: 3px;
|
||||
}
|
||||
.markdown-table-breakout:hover {
|
||||
scrollbar-color: var(--border-03) transparent; /* Firefox — reveal thumb */
|
||||
}
|
||||
.markdown-table-breakout:hover::-webkit-scrollbar-thumb {
|
||||
background: var(--border-03);
|
||||
}
|
||||
|
||||
/* Fade the right edge via an ::after overlay on the non-scrolling card.
|
||||
Stays pinned while table scrolls; doesn't affect the sticky column. */
|
||||
.markdown-table-card::after {
|
||||
content: "";
|
||||
position: absolute;
|
||||
top: 0;
|
||||
right: 0;
|
||||
bottom: 0;
|
||||
width: 2rem;
|
||||
pointer-events: none;
|
||||
z-index: 2;
|
||||
background: linear-gradient(
|
||||
to right,
|
||||
transparent,
|
||||
var(--background-neutral-01)
|
||||
);
|
||||
border-radius: 0 0.5rem 0.5rem 0;
|
||||
opacity: 0;
|
||||
transition: opacity 0.15s;
|
||||
}
|
||||
.markdown-table-card[data-overflows="true"]::after {
|
||||
opacity: 1;
|
||||
}
|
||||
|
||||
/* Sticky first column — inherits the container's background so it
|
||||
matches regardless of theme or custom wallpaper. */
|
||||
.markdown-table-breakout th:first-child,
|
||||
.markdown-table-breakout td:first-child {
|
||||
position: sticky;
|
||||
left: 0;
|
||||
z-index: 1;
|
||||
padding-left: 0.75rem;
|
||||
background: var(--background-neutral-01);
|
||||
}
|
||||
.markdown-table-breakout th:last-child,
|
||||
.markdown-table-breakout td:last-child {
|
||||
padding-right: 0.75rem;
|
||||
}
|
||||
|
||||
/* Shadow on sticky column when scrolled. Uses an ::after pseudo-element
|
||||
so it isn't clipped by the overflow container or the mask-image fade. */
|
||||
.markdown-table-breakout th:first-child::after,
|
||||
.markdown-table-breakout td:first-child::after {
|
||||
content: "";
|
||||
position: absolute;
|
||||
top: 0;
|
||||
right: -6px;
|
||||
bottom: 0;
|
||||
width: 6px;
|
||||
pointer-events: none;
|
||||
opacity: 0;
|
||||
transition: opacity 0.15s;
|
||||
box-shadow: inset 6px 0 8px -4px var(--alpha-grey-100-25);
|
||||
}
|
||||
.dark .markdown-table-breakout th:first-child::after,
|
||||
.dark .markdown-table-breakout td:first-child::after {
|
||||
box-shadow: inset 6px 0 8px -4px var(--alpha-grey-100-60);
|
||||
}
|
||||
.markdown-table-breakout[data-scrolled="true"] th:first-child::after,
|
||||
.markdown-table-breakout[data-scrolled="true"] td:first-child::after {
|
||||
opacity: 1;
|
||||
width: 100cqw;
|
||||
margin-left: calc((100% - 100cqw) / 2);
|
||||
padding-left: calc((100cqw - 100%) / 2);
|
||||
padding-right: calc((100cqw - 100%) / 2);
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import React, { useCallback, useEffect, useRef, useMemo, JSX } from "react";
|
||||
import React, { useCallback, useMemo, JSX } from "react";
|
||||
import ReactMarkdown from "react-markdown";
|
||||
import remarkGfm from "remark-gfm";
|
||||
import remarkMath from "remark-math";
|
||||
@@ -17,66 +17,6 @@ import { transformLinkUri, cn } from "@/lib/utils";
|
||||
import { InMessageImage } from "@/app/app/components/files/images/InMessageImage";
|
||||
import { extractChatImageFileId } from "@/app/app/components/files/images/utils";
|
||||
|
||||
/** Table wrapper that detects horizontal overflow and shows a fade + scrollbar. */
|
||||
interface ScrollableTableProps
|
||||
extends React.TableHTMLAttributes<HTMLTableElement> {
|
||||
children: React.ReactNode;
|
||||
}
|
||||
|
||||
export function ScrollableTable({
|
||||
className,
|
||||
children,
|
||||
...props
|
||||
}: ScrollableTableProps) {
|
||||
const scrollRef = useRef<HTMLDivElement>(null);
|
||||
const wrapRef = useRef<HTMLDivElement>(null);
|
||||
const tableRef = useRef<HTMLTableElement>(null);
|
||||
|
||||
useEffect(() => {
|
||||
const el = scrollRef.current;
|
||||
const wrap = wrapRef.current;
|
||||
const table = tableRef.current;
|
||||
if (!el || !wrap) return;
|
||||
|
||||
const check = () => {
|
||||
const overflows = el.scrollWidth > el.clientWidth;
|
||||
const atEnd = el.scrollLeft + el.clientWidth >= el.scrollWidth - 2;
|
||||
wrap.dataset.overflows = overflows && !atEnd ? "true" : "false";
|
||||
el.dataset.scrolled = el.scrollLeft > 0 ? "true" : "false";
|
||||
};
|
||||
|
||||
check();
|
||||
el.addEventListener("scroll", check, { passive: true });
|
||||
// Observe both the scroll container (parent resize) and the table
|
||||
// itself (content growth during streaming).
|
||||
const ro = new ResizeObserver(check);
|
||||
ro.observe(el);
|
||||
if (table) ro.observe(table);
|
||||
|
||||
return () => {
|
||||
el.removeEventListener("scroll", check);
|
||||
ro.disconnect();
|
||||
};
|
||||
}, []);
|
||||
|
||||
return (
|
||||
<div ref={wrapRef} className="markdown-table-card">
|
||||
<div ref={scrollRef} className="markdown-table-breakout">
|
||||
<table
|
||||
ref={tableRef}
|
||||
className={cn(
|
||||
className,
|
||||
"min-w-full !my-0 [&_th]:whitespace-nowrap [&_td]:whitespace-nowrap"
|
||||
)}
|
||||
{...props}
|
||||
>
|
||||
{children}
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Processes content for markdown rendering by handling code blocks and LaTeX
|
||||
*/
|
||||
@@ -187,9 +127,11 @@ export const useMarkdownComponents = (
|
||||
},
|
||||
table: ({ node, className, children, ...props }: any) => {
|
||||
return (
|
||||
<ScrollableTable className={className} {...props}>
|
||||
{children}
|
||||
</ScrollableTable>
|
||||
<div className="markdown-table-breakout">
|
||||
<table className={cn(className, "min-w-full")} {...props}>
|
||||
{children}
|
||||
</table>
|
||||
</div>
|
||||
);
|
||||
},
|
||||
code: ({ node, className, children }: any) => {
|
||||
|
||||
@@ -16,7 +16,7 @@ import {
|
||||
} from "../../../services/streamingModels";
|
||||
import { MessageRenderer, FullChatState } from "../interfaces";
|
||||
import { isFinalAnswerComplete } from "../../../services/packetUtils";
|
||||
import { processContent, ScrollableTable } from "../markdownUtils";
|
||||
import { processContent } from "../markdownUtils";
|
||||
import { BlinkingBar } from "../../BlinkingBar";
|
||||
import { useVoiceMode } from "@/providers/VoiceModeProvider";
|
||||
import {
|
||||
@@ -338,9 +338,11 @@ export const MessageTextRenderer: MessageRenderer<
|
||||
</li>
|
||||
),
|
||||
table: ({ className, children, ...rest }) => (
|
||||
<ScrollableTable className={className} {...rest}>
|
||||
{children}
|
||||
</ScrollableTable>
|
||||
<div className="markdown-table-breakout">
|
||||
<table className={cn(className, "min-w-full")} {...rest}>
|
||||
{children}
|
||||
</table>
|
||||
</div>
|
||||
),
|
||||
code: ({ node, className, children }) => {
|
||||
const codeText = extractCodeText(
|
||||
|
||||
@@ -516,7 +516,8 @@ export default function NRFPage({ isSidePanel = false }: NRFPageProps) {
|
||||
ref={inputRef}
|
||||
className={cn(
|
||||
"w-full flex flex-col",
|
||||
!isSidePanel && "max-w-[var(--app-page-main-content-width)]"
|
||||
!isSidePanel &&
|
||||
"max-w-[var(--app-page-main-content-width)] px-4"
|
||||
)}
|
||||
>
|
||||
{hasMessages && liveAgent && !llmManager.isLoadingProviders && (
|
||||
|
||||
@@ -644,7 +644,6 @@ export default function useChatController({
|
||||
});
|
||||
node.modelDisplayName = model.displayName;
|
||||
node.overridden_model = model.modelName;
|
||||
node.is_generating = true;
|
||||
return node;
|
||||
});
|
||||
}
|
||||
@@ -1053,14 +1052,10 @@ export default function useChatController({
|
||||
|
||||
// In multi-model mode, route per-model errors to the specific model's
|
||||
// node instead of killing the entire stream. Other models keep streaming.
|
||||
if (isMultiModel) {
|
||||
// Multi-model: isolate the error to its panel. Never throw
|
||||
// or set global error state — other models keep streaming.
|
||||
const errorModelIndex = streamingError.details?.model_index as
|
||||
| number
|
||||
| undefined;
|
||||
if (isMultiModel && streamingError.details?.model_index != null) {
|
||||
const errorModelIndex = streamingError.details
|
||||
.model_index as number;
|
||||
if (
|
||||
errorModelIndex != null &&
|
||||
errorModelIndex >= 0 &&
|
||||
errorModelIndex < initialAssistantNodes.length
|
||||
) {
|
||||
@@ -1093,15 +1088,8 @@ export default function useChatController({
|
||||
completeMessageTreeOverride: currentMessageTreeLocal,
|
||||
chatSessionId: frozenSessionId!,
|
||||
});
|
||||
} else {
|
||||
// Error without model_index in multi-model — can't route
|
||||
// to a specific panel. Log and continue; the stream loop
|
||||
// stays alive for other models.
|
||||
console.warn(
|
||||
"Multi-model error without model_index:",
|
||||
streamingError.error
|
||||
);
|
||||
}
|
||||
// Skip the normal per-packet upsert — we already upserted the error node
|
||||
continue;
|
||||
} else {
|
||||
// Single-model: kill the stream
|
||||
@@ -1257,7 +1245,6 @@ export default function useChatController({
|
||||
errorCode,
|
||||
isRetryable,
|
||||
errorDetails,
|
||||
is_generating: false,
|
||||
})
|
||||
: [
|
||||
{
|
||||
|
||||
@@ -48,7 +48,6 @@ describe("useSettings", () => {
|
||||
anonymous_user_enabled: false,
|
||||
invite_only_enabled: false,
|
||||
deep_research_enabled: true,
|
||||
multi_model_chat_enabled: true,
|
||||
temperature_override_enabled: true,
|
||||
query_history_type: QueryHistoryType.NORMAL,
|
||||
});
|
||||
@@ -66,7 +65,6 @@ describe("useSettings", () => {
|
||||
anonymous_user_enabled: false,
|
||||
invite_only_enabled: false,
|
||||
deep_research_enabled: true,
|
||||
multi_model_chat_enabled: true,
|
||||
temperature_override_enabled: true,
|
||||
query_history_type: QueryHistoryType.NORMAL,
|
||||
};
|
||||
|
||||
@@ -23,7 +23,6 @@ const DEFAULT_SETTINGS = {
|
||||
anonymous_user_enabled: false,
|
||||
invite_only_enabled: false,
|
||||
deep_research_enabled: true,
|
||||
multi_model_chat_enabled: true,
|
||||
temperature_override_enabled: true,
|
||||
query_history_type: QueryHistoryType.NORMAL,
|
||||
} satisfies Settings;
|
||||
|
||||
@@ -27,7 +27,6 @@ export interface Settings {
|
||||
query_history_type: QueryHistoryType;
|
||||
|
||||
deep_research_enabled?: boolean;
|
||||
multi_model_chat_enabled?: boolean;
|
||||
search_ui_enabled?: boolean;
|
||||
|
||||
// Image processing settings
|
||||
|
||||
@@ -9,7 +9,6 @@ import { useField, useFormikContext } from "formik";
|
||||
import { Section } from "@/layouts/general-layouts";
|
||||
import { Content } from "@opal/layouts";
|
||||
import Label from "@/refresh-components/form/Label";
|
||||
import type { TagProps } from "@opal/components/tag/components";
|
||||
|
||||
interface OrientationLayoutProps {
|
||||
name?: string;
|
||||
@@ -17,8 +16,6 @@ interface OrientationLayoutProps {
|
||||
nonInteractive?: boolean;
|
||||
children?: React.ReactNode;
|
||||
title: string | RichStr;
|
||||
/** Tag rendered inline beside the title (passed through to Content). */
|
||||
tag?: TagProps;
|
||||
description?: string | RichStr;
|
||||
suffix?: "optional" | (string & {});
|
||||
sizePreset?: "main-content" | "main-ui";
|
||||
@@ -131,7 +128,6 @@ function HorizontalInputLayout({
|
||||
children,
|
||||
center,
|
||||
title,
|
||||
tag,
|
||||
description,
|
||||
suffix,
|
||||
sizePreset = "main-content",
|
||||
@@ -148,7 +144,6 @@ function HorizontalInputLayout({
|
||||
title={title}
|
||||
description={description}
|
||||
suffix={suffix}
|
||||
tag={tag}
|
||||
sizePreset={sizePreset}
|
||||
variant="section"
|
||||
widthVariant="full"
|
||||
|
||||
@@ -6,7 +6,6 @@ import { LlmManager } from "@/lib/hooks";
|
||||
import { getModelIcon } from "@/lib/llmConfig";
|
||||
import { Button, SelectButton, OpenButton } from "@opal/components";
|
||||
import { SvgPlusCircle, SvgX } from "@opal/icons";
|
||||
import { useSettingsContext } from "@/providers/SettingsProvider";
|
||||
import { LLMOption } from "@/refresh-components/popovers/interfaces";
|
||||
import ModelListContent from "@/refresh-components/popovers/ModelListContent";
|
||||
import Separator from "@/refresh-components/Separator";
|
||||
@@ -45,12 +44,8 @@ export default function ModelSelector({
|
||||
// Virtual anchor ref — points to the clicked pill so the popover positions above it
|
||||
const anchorRef = useRef<HTMLElement | null>(null);
|
||||
|
||||
const settings = useSettingsContext();
|
||||
const multiModelAllowed =
|
||||
settings?.settings?.multi_model_chat_enabled ?? true;
|
||||
|
||||
const isMultiModel = selectedModels.length > 1;
|
||||
const atMax = selectedModels.length >= MAX_MODELS || !multiModelAllowed;
|
||||
const atMax = selectedModels.length >= MAX_MODELS;
|
||||
|
||||
const selectedKeys = useMemo(
|
||||
() => new Set(selectedModels.map((m) => modelKey(m.provider, m.modelName))),
|
||||
|
||||
@@ -595,46 +595,25 @@ function ChatPreferencesForm() {
|
||||
variant="section"
|
||||
/>
|
||||
<Card>
|
||||
<SimpleTooltip
|
||||
tooltip={
|
||||
uniqueSources.length === 0
|
||||
? "Set up connectors to use Search Mode"
|
||||
: undefined
|
||||
}
|
||||
side="top"
|
||||
<Disabled
|
||||
disabled={uniqueSources.length === 0}
|
||||
tooltip="Set up connectors to use Search Mode"
|
||||
>
|
||||
<Disabled disabled={uniqueSources.length === 0} allowClick>
|
||||
<div className="w-full">
|
||||
<InputLayouts.Horizontal
|
||||
title="Search Mode"
|
||||
description="UI mode for quick document search across your organization."
|
||||
disabled={uniqueSources.length === 0}
|
||||
nonInteractive
|
||||
>
|
||||
<Switch
|
||||
checked={s.search_ui_enabled ?? false}
|
||||
onCheckedChange={(checked) => {
|
||||
void saveSettings({ search_ui_enabled: checked });
|
||||
}}
|
||||
disabled={uniqueSources.length === 0}
|
||||
/>
|
||||
</InputLayouts.Horizontal>
|
||||
</div>
|
||||
</Disabled>
|
||||
</SimpleTooltip>
|
||||
<InputLayouts.Horizontal
|
||||
title="Multi-Model Generation"
|
||||
tag={{ title: "beta", color: "blue" }}
|
||||
description="Allow multiple models to generate responses in parallel in chat."
|
||||
nonInteractive
|
||||
>
|
||||
<Switch
|
||||
checked={s.multi_model_chat_enabled ?? true}
|
||||
onCheckedChange={(checked) => {
|
||||
void saveSettings({ multi_model_chat_enabled: checked });
|
||||
}}
|
||||
/>
|
||||
</InputLayouts.Horizontal>
|
||||
<InputLayouts.Horizontal
|
||||
title="Search Mode"
|
||||
description="UI mode for quick document search across your organization."
|
||||
disabled={uniqueSources.length === 0}
|
||||
nonInteractive
|
||||
>
|
||||
<Switch
|
||||
checked={s.search_ui_enabled ?? false}
|
||||
onCheckedChange={(checked) => {
|
||||
void saveSettings({ search_ui_enabled: checked });
|
||||
}}
|
||||
disabled={uniqueSources.length === 0}
|
||||
/>
|
||||
</InputLayouts.Horizontal>
|
||||
</Disabled>
|
||||
<InputLayouts.Horizontal
|
||||
title="Deep Research"
|
||||
description="Agentic research system that works across the web and connected sources. Uses significantly more tokens per query."
|
||||
|
||||
@@ -331,13 +331,10 @@ const ChatUI = React.memo(
|
||||
return null;
|
||||
})}
|
||||
|
||||
{/* Error banner when last message is user message or error type.
|
||||
Skip for multi-model per-panel errors — those are shown in
|
||||
their own panel, not as a global banner. */}
|
||||
{/* Error banner when last message is user message or error type */}
|
||||
{(((error !== null || loadError !== null) &&
|
||||
messages[messages.length - 1]?.type === "user") ||
|
||||
(messages[messages.length - 1]?.type === "error" &&
|
||||
!messages[messages.length - 1]?.modelDisplayName)) && (
|
||||
messages[messages.length - 1]?.type === "error") && (
|
||||
<div className={`p-4 w-full ${MSG_MAX_W} self-center`}>
|
||||
<ErrorBanner
|
||||
resubmit={onResubmit}
|
||||
|
||||
Reference in New Issue
Block a user