mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-04-13 10:52:42 +00:00
Compare commits
19 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f22bd593f6 | ||
|
|
e76ffbd4c3 | ||
|
|
696e88710d | ||
|
|
e131ce9547 | ||
|
|
828c2ded5c | ||
|
|
ee3f399cc2 | ||
|
|
0a86507cda | ||
|
|
9ab125441f | ||
|
|
0de5399303 | ||
|
|
537bf1ce1d | ||
|
|
5b7779bc78 | ||
|
|
bb1c44daff | ||
|
|
f26ecafb51 | ||
|
|
9fdb425c0d | ||
|
|
47e20e89c5 | ||
|
|
8b28c127f2 | ||
|
|
9a861a71ad | ||
|
|
b4bc12f6dc | ||
|
|
9af9148ca7 |
@@ -24,13 +24,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
&& curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
|
||||
&& apt-get install -y nodejs \
|
||||
&& install -m 0755 -d /etc/apt/keyrings \
|
||||
&& curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \
|
||||
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" > /etc/apt/sources.list.d/docker.list \
|
||||
&& curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg -o /etc/apt/keyrings/githubcli-archive-keyring.gpg \
|
||||
&& chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \
|
||||
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" > /etc/apt/sources.list.d/github-cli.list \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y --no-install-recommends docker-ce-cli docker-compose-plugin gh \
|
||||
&& apt-get install -y --no-install-recommends gh \
|
||||
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# fd-find installs as fdfind on Debian/Ubuntu — symlink to fd
|
||||
|
||||
@@ -6,7 +6,7 @@ A containerized development environment for working on Onyx.
|
||||
|
||||
- Ubuntu 26.04 base image
|
||||
- Node.js 20, uv, Claude Code
|
||||
- Docker CLI, GitHub CLI (`gh`)
|
||||
- GitHub CLI (`gh`)
|
||||
- Neovim, ripgrep, fd, fzf, jq, make, wget, unzip
|
||||
- Zsh as default shell (sources host `~/.zshrc` if available)
|
||||
- Python venv auto-activation
|
||||
@@ -73,19 +73,6 @@ user has read/write access to the bind-mounted workspace:
|
||||
To override the auto-detection, set `DEVCONTAINER_REMOTE_USER` before running
|
||||
`ods dev up`.
|
||||
|
||||
## Docker socket
|
||||
|
||||
The container mounts the host's Docker socket so you can run `docker` commands
|
||||
from inside. `ods dev` auto-detects the socket path and sets `DOCKER_SOCK`:
|
||||
|
||||
| Environment | Socket path |
|
||||
| ----------------------- | ------------------------------ |
|
||||
| Linux (rootless Docker) | `$XDG_RUNTIME_DIR/docker.sock` |
|
||||
| macOS (Docker Desktop) | `~/.docker/run/docker.sock` |
|
||||
| Linux (standard Docker) | `/var/run/docker.sock` |
|
||||
|
||||
To override, set `DOCKER_SOCK` before running `ods dev up`.
|
||||
|
||||
## Firewall
|
||||
|
||||
The container starts with a default-deny firewall (`init-firewall.sh`) that only allows outbound traffic to:
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
"image": "onyxdotapp/onyx-devcontainer@sha256:12184169c5bcc9cca0388286d5ffe504b569bc9c37bfa631b76ee8eee2064055",
|
||||
"runArgs": ["--cap-add=NET_ADMIN", "--cap-add=NET_RAW"],
|
||||
"mounts": [
|
||||
"source=${localEnv:DOCKER_SOCK},target=/var/run/docker.sock,type=bind",
|
||||
"source=${localEnv:HOME}/.claude,target=/home/dev/.claude,type=bind",
|
||||
"source=${localEnv:HOME}/.claude.json,target=/home/dev/.claude.json,type=bind",
|
||||
"source=${localEnv:HOME}/.zshrc,target=/home/dev/.zshrc.host,type=bind,readonly",
|
||||
|
||||
@@ -56,9 +56,10 @@ for domain in "${ALLOWED_DOMAINS[@]}"; do
|
||||
done
|
||||
done
|
||||
|
||||
# Detect host network
|
||||
if [[ "${DOCKER_HOST:-}" == "unix://"* ]]; then
|
||||
DOCKER_GATEWAY=$(ip -4 route show | grep "^default" | awk '{print $3}')
|
||||
# Allow traffic to the Docker gateway so the container can reach host services
|
||||
# (e.g. the Onyx stack at localhost:3000, localhost:8080, etc.)
|
||||
DOCKER_GATEWAY=$(ip -4 route show default | awk '{print $3}')
|
||||
if [ -n "$DOCKER_GATEWAY" ]; then
|
||||
if ! ipset add allowed-domains "$DOCKER_GATEWAY/32" -exist 2>&1; then
|
||||
echo "warning: failed to add Docker gateway $DOCKER_GATEWAY to allowlist" >&2
|
||||
fi
|
||||
|
||||
@@ -26,6 +26,10 @@ from onyx.configs.constants import FileOrigin
|
||||
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
process_onyx_metadata,
|
||||
)
|
||||
from onyx.connectors.cross_connector_utils.tabular_section_utils import (
|
||||
is_tabular_file,
|
||||
tabular_file_to_sections,
|
||||
)
|
||||
from onyx.connectors.exceptions import ConnectorValidationError
|
||||
from onyx.connectors.exceptions import CredentialExpiredError
|
||||
from onyx.connectors.exceptions import InsufficientPermissionsError
|
||||
@@ -451,6 +455,40 @@ class BlobStorageConnector(LoadConnector, PollConnector):
|
||||
logger.exception(f"Error processing image {key}")
|
||||
continue
|
||||
|
||||
# Handle tabular files (xlsx, csv, tsv) — produce one
|
||||
# TabularSection per sheet (or per file for csv/tsv)
|
||||
# instead of a flat TextSection.
|
||||
if is_tabular_file(file_name):
|
||||
try:
|
||||
downloaded_file = self._download_object(key)
|
||||
if downloaded_file is None:
|
||||
continue
|
||||
tabular_sections = tabular_file_to_sections(
|
||||
BytesIO(downloaded_file),
|
||||
file_name=file_name,
|
||||
link=link,
|
||||
)
|
||||
batch.append(
|
||||
Document(
|
||||
id=f"{self.bucket_type}:{self.bucket_name}:{key}",
|
||||
sections=(
|
||||
tabular_sections
|
||||
if tabular_sections
|
||||
else [TextSection(link=link, text="")]
|
||||
),
|
||||
source=DocumentSource(self.bucket_type.value),
|
||||
semantic_identifier=file_name,
|
||||
doc_updated_at=last_modified,
|
||||
metadata={},
|
||||
)
|
||||
)
|
||||
if len(batch) == self.batch_size:
|
||||
yield batch
|
||||
batch = []
|
||||
except Exception:
|
||||
logger.exception(f"Error processing tabular file {key}")
|
||||
continue
|
||||
|
||||
# Handle text and document files
|
||||
try:
|
||||
downloaded_file = self._download_object(key)
|
||||
|
||||
@@ -0,0 +1,73 @@
|
||||
"""Helpers for converting tabular files (xlsx, csv, tsv) into
|
||||
TabularSection objects.
|
||||
|
||||
This lives in `connectors/cross_connector_utils` because:
|
||||
- It imports `TabularSection` from `connectors.models` (connector-layer type).
|
||||
- It calls `file_processing` primitives (`xlsx_sheet_extraction`, `file_io_to_text`)
|
||||
but does the connector-layer wrapping here so every connector that ingests
|
||||
tabular data can share the same section shape.
|
||||
"""
|
||||
|
||||
from typing import IO
|
||||
|
||||
from onyx.connectors.models import TabularSection
|
||||
from onyx.file_processing.extract_file_text import file_io_to_text
|
||||
from onyx.file_processing.extract_file_text import xlsx_sheet_extraction
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
# Extensions routed through this helper instead of the generic
|
||||
# `extract_text_and_images` path. Keep in sync with
|
||||
# `OnyxFileExtensions.TABULAR_EXTENSIONS`.
|
||||
TABULAR_FILE_EXTENSIONS = {".xlsx", ".csv", ".tsv"}
|
||||
|
||||
|
||||
def is_tabular_file(file_name: str) -> bool:
    """Report whether `file_name` ends with one of the tabular extensions.

    The comparison is case-insensitive and is driven by the module-level
    `TABULAR_FILE_EXTENSIONS` set.
    """
    # `str.endswith` accepts a tuple of suffixes and matches if any one fits.
    suffixes = tuple(TABULAR_FILE_EXTENSIONS)
    return file_name.lower().endswith(suffixes)
|
||||
|
||||
|
||||
def _strip_blank_edge_lines(text: str) -> str:
    """Drop whitespace-only lines at both ends of `text`, plus the final line terminator."""
    lines = text.splitlines(keepends=True)
    first = 0
    last = len(lines)
    while first < last and not lines[first].strip():
        first += 1
    while last > first and not lines[last - 1].strip():
        last -= 1
    return "".join(lines[first:last]).rstrip("\r\n")


def tabular_file_to_sections(
    file: IO[bytes],
    file_name: str,
    link: str = "",
) -> list[TabularSection]:
    """Convert a tabular file into one or more TabularSections.

    - `.xlsx` → one TabularSection per sheet returned by
      `xlsx_sheet_extraction`, with `link=f"sheet:{title}"`. The `link`
      argument is not used on this path.
    - `.csv` / `.tsv` → a single TabularSection containing the decoded
      file, with `link=link` (falling back to `file_name` when the caller
      doesn't provide one — `TabularSection.link` is required).

    For csv/tsv only whitespace-only lines at the start/end of the file and
    the final line terminator are trimmed. Whitespace *inside* the first and
    last rows is preserved, because a leading or trailing tab/space there is
    cell data (e.g. a TSV header that starts with an empty cell).

    Args:
        file: Binary stream positioned at the start of the file content.
        file_name: Name used to pick the parser by extension
            (case-insensitive) and as the csv/tsv link fallback.
        link: Link to attach to the csv/tsv section.

    Returns:
        The extracted sections, or an empty list when the file yields no
        extractable content (empty workbook, empty csv, decode failure).

    Raises:
        ValueError: if `file_name` isn't a recognized tabular extension —
            callers should gate on `is_tabular_file` first.
    """
    lowered = file_name.lower()

    if lowered.endswith(".xlsx"):
        return [
            TabularSection(link=f"sheet:{sheet_title}", text=csv_text)
            for csv_text, sheet_title in xlsx_sheet_extraction(
                file, file_name=file_name
            )
        ]

    if lowered.endswith((".csv", ".tsv")):
        try:
            decoded = file_io_to_text(file)
        except Exception as e:
            # Best effort: an undecodable file yields no sections rather
            # than failing the caller.
            logger.warning(f"Failed to decode {file_name}: {e}")
            return []

        # A bare `.strip()` would also eat leading/trailing tabs and spaces
        # of the first/last row, shifting columns in delimiter-separated
        # data, so only blank edge lines are removed.
        text = _strip_blank_edge_lines(decoded)
        if not text:
            return []
        return [TabularSection(link=link or file_name, text=text)]

    raise ValueError(f"{file_name!r} is not a tabular file")
|
||||
@@ -15,6 +15,10 @@ from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
)
|
||||
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import rate_limit_builder
|
||||
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import rl_requests
|
||||
from onyx.connectors.cross_connector_utils.tabular_section_utils import (
|
||||
is_tabular_file,
|
||||
tabular_file_to_sections,
|
||||
)
|
||||
from onyx.connectors.drupal_wiki.models import DrupalWikiCheckpoint
|
||||
from onyx.connectors.drupal_wiki.models import DrupalWikiPage
|
||||
from onyx.connectors.drupal_wiki.models import DrupalWikiPageResponse
|
||||
@@ -33,6 +37,7 @@ from onyx.connectors.models import DocumentFailure
|
||||
from onyx.connectors.models import HierarchyNode
|
||||
from onyx.connectors.models import ImageSection
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.models import TabularSection
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.file_processing.extract_file_text import extract_text_and_images
|
||||
from onyx.file_processing.extract_file_text import get_file_ext
|
||||
@@ -226,7 +231,7 @@ class DrupalWikiConnector(
|
||||
Tuple of (sections, error_message). If error_message is not None, the
|
||||
sections list should be treated as invalid.
|
||||
"""
|
||||
sections: list[TextSection | ImageSection] = []
|
||||
sections: list[TextSection | ImageSection | TabularSection] = []
|
||||
|
||||
try:
|
||||
if not self._validate_attachment_filetype(attachment):
|
||||
@@ -273,6 +278,25 @@ class DrupalWikiConnector(
|
||||
|
||||
return sections, None
|
||||
|
||||
# Tabular attachments (xlsx, csv, tsv) — produce
|
||||
# TabularSections instead of a flat TextSection.
|
||||
if is_tabular_file(file_name):
|
||||
try:
|
||||
sections.extend(
|
||||
tabular_file_to_sections(
|
||||
BytesIO(raw_bytes),
|
||||
file_name=file_name,
|
||||
link=download_url,
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Failed to extract tabular sections from {file_name}: {e}"
|
||||
)
|
||||
if not sections:
|
||||
return [], f"No content extracted from tabular file {file_name}"
|
||||
return sections, None
|
||||
|
||||
image_counter = 0
|
||||
|
||||
def _store_embedded_image(image_data: bytes, image_name: str) -> None:
|
||||
|
||||
@@ -12,6 +12,10 @@ from onyx.configs.constants import FileOrigin
|
||||
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
process_onyx_metadata,
|
||||
)
|
||||
from onyx.connectors.cross_connector_utils.tabular_section_utils import (
|
||||
is_tabular_file,
|
||||
tabular_file_to_sections,
|
||||
)
|
||||
from onyx.connectors.interfaces import GenerateDocumentsOutput
|
||||
from onyx.connectors.interfaces import LoadConnector
|
||||
from onyx.connectors.models import Document
|
||||
@@ -145,6 +149,39 @@ def _process_file(
|
||||
logger.error(f"Failed to process image file {file_name}: {e}")
|
||||
return []
|
||||
|
||||
# 1b) If the file is tabular (xlsx/csv/tsv), produce one
|
||||
# TabularSection per sheet (or per file for csv/tsv) instead of
|
||||
# flattening through the generic text extractor.
|
||||
if is_tabular_file(file_name):
|
||||
file.seek(0)
|
||||
try:
|
||||
tabular_sections = tabular_file_to_sections(
|
||||
file=file,
|
||||
file_name=file_name,
|
||||
link=link or "",
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to process tabular file {file_name}: {e}")
|
||||
return []
|
||||
|
||||
if not tabular_sections:
|
||||
logger.warning(f"No content extracted from tabular file {file_name}")
|
||||
return []
|
||||
|
||||
return [
|
||||
Document(
|
||||
id=doc_id,
|
||||
sections=list(tabular_sections),
|
||||
source=source_type,
|
||||
semantic_identifier=file_display_name,
|
||||
title=title,
|
||||
doc_updated_at=time_updated,
|
||||
primary_owners=primary_owners,
|
||||
secondary_owners=secondary_owners,
|
||||
metadata=custom_tags,
|
||||
)
|
||||
]
|
||||
|
||||
# 2) Otherwise: text-based approach. Possibly with embedded images.
|
||||
file.seek(0)
|
||||
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
from onyx.connectors.cross_connector_utils.tabular_section_utils import (
|
||||
is_tabular_file,
|
||||
tabular_file_to_sections,
|
||||
)
|
||||
import io
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime
|
||||
@@ -28,15 +32,16 @@ from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import DocumentFailure
|
||||
from onyx.connectors.models import ImageSection
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.models import TabularSection
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.file_processing.extract_file_text import extract_file_text
|
||||
from onyx.file_processing.extract_file_text import get_file_ext
|
||||
from onyx.file_processing.extract_file_text import pptx_to_text
|
||||
from onyx.file_processing.extract_file_text import read_docx_file
|
||||
from onyx.file_processing.extract_file_text import read_pdf_file
|
||||
from onyx.file_processing.extract_file_text import xlsx_to_text
|
||||
from onyx.file_processing.file_types import OnyxFileExtensions
|
||||
from onyx.file_processing.file_types import OnyxMimeTypes
|
||||
from onyx.file_processing.file_types import SPREADSHEET_MIME_TYPE
|
||||
from onyx.file_processing.image_utils import store_image_and_create_section
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.variable_functionality import (
|
||||
@@ -289,7 +294,7 @@ def _download_and_extract_sections_basic(
|
||||
service: GoogleDriveService,
|
||||
allow_images: bool,
|
||||
size_threshold: int,
|
||||
) -> list[TextSection | ImageSection]:
|
||||
) -> list[TextSection | ImageSection | TabularSection]:
|
||||
"""Extract text and images from a Google Drive file."""
|
||||
file_id = file["id"]
|
||||
file_name = file["name"]
|
||||
@@ -308,7 +313,7 @@ def _download_and_extract_sections_basic(
|
||||
return []
|
||||
|
||||
# Store images for later processing
|
||||
sections: list[TextSection | ImageSection] = []
|
||||
sections: list[TextSection | ImageSection | TabularSection] = []
|
||||
try:
|
||||
section, embedded_id = store_image_and_create_section(
|
||||
image_data=response_call(),
|
||||
@@ -323,10 +328,9 @@ def _download_and_extract_sections_basic(
|
||||
logger.error(f"Failed to process image {file_name}: {e}")
|
||||
return sections
|
||||
|
||||
# For Google Docs, Sheets, and Slides, export as plain text
|
||||
# For Google Docs, Sheets, and Slides, export via the Drive API
|
||||
if mime_type in GOOGLE_MIME_TYPES_TO_EXPORT:
|
||||
export_mime_type = GOOGLE_MIME_TYPES_TO_EXPORT[mime_type]
|
||||
# Use the correct API call for exporting files
|
||||
request = service.files().export_media(
|
||||
fileId=file_id, mimeType=export_mime_type
|
||||
)
|
||||
@@ -335,6 +339,17 @@ def _download_and_extract_sections_basic(
|
||||
logger.warning(f"Failed to export {file_name} as {export_mime_type}")
|
||||
return []
|
||||
|
||||
if export_mime_type in OnyxMimeTypes.TABULAR_MIME_TYPES:
|
||||
# Synthesize an extension on the filename
|
||||
ext = ".xlsx" if export_mime_type == SPREADSHEET_MIME_TYPE else ".csv"
|
||||
return list(
|
||||
tabular_file_to_sections(
|
||||
io.BytesIO(response),
|
||||
file_name=f"{file_name}{ext}",
|
||||
link=link,
|
||||
)
|
||||
)
|
||||
|
||||
text = response.decode("utf-8")
|
||||
return [TextSection(link=link, text=text)]
|
||||
|
||||
@@ -356,9 +371,15 @@ def _download_and_extract_sections_basic(
|
||||
|
||||
elif (
|
||||
mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
or is_tabular_file(file_name)
|
||||
):
|
||||
text = xlsx_to_text(io.BytesIO(response_call()), file_name=file_name)
|
||||
return [TextSection(link=link, text=text)] if text else []
|
||||
return list(
|
||||
tabular_file_to_sections(
|
||||
io.BytesIO(response_call()),
|
||||
file_name=file_name,
|
||||
link=link,
|
||||
)
|
||||
)
|
||||
|
||||
elif (
|
||||
mime_type
|
||||
@@ -410,8 +431,9 @@ def _find_nth(haystack: str, needle: str, n: int, start: int = 0) -> int:
|
||||
|
||||
|
||||
def align_basic_advanced(
|
||||
basic_sections: list[TextSection | ImageSection], adv_sections: list[TextSection]
|
||||
) -> list[TextSection | ImageSection]:
|
||||
basic_sections: list[TextSection | ImageSection | TabularSection],
|
||||
adv_sections: list[TextSection],
|
||||
) -> list[TextSection | ImageSection | TabularSection]:
|
||||
"""Align the basic sections with the advanced sections.
|
||||
In particular, the basic sections contain all content of the file,
|
||||
including smart chips like dates and doc links. The advanced sections
|
||||
@@ -428,7 +450,7 @@ def align_basic_advanced(
|
||||
basic_full_text = "".join(
|
||||
[section.text for section in basic_sections if isinstance(section, TextSection)]
|
||||
)
|
||||
new_sections: list[TextSection | ImageSection] = []
|
||||
new_sections: list[TextSection | ImageSection | TabularSection] = []
|
||||
heading_start = 0
|
||||
for adv_ind in range(1, len(adv_sections)):
|
||||
heading = adv_sections[adv_ind].text.split(HEADING_DELIMITER)[0]
|
||||
@@ -599,7 +621,7 @@ def _convert_drive_item_to_document(
|
||||
"""
|
||||
Main entry point for converting a Google Drive file => Document object.
|
||||
"""
|
||||
sections: list[TextSection | ImageSection] = []
|
||||
sections: list[TextSection | ImageSection | TabularSection] = []
|
||||
|
||||
# Only construct these services when needed
|
||||
def _get_drive_service() -> GoogleDriveService:
|
||||
@@ -639,7 +661,9 @@ def _convert_drive_item_to_document(
|
||||
doc_id=file.get("id", ""),
|
||||
)
|
||||
if doc_sections:
|
||||
sections = cast(list[TextSection | ImageSection], doc_sections)
|
||||
sections = cast(
|
||||
list[TextSection | ImageSection | TabularSection], doc_sections
|
||||
)
|
||||
if any(SMART_CHIP_CHAR in section.text for section in doc_sections):
|
||||
logger.debug(
|
||||
f"found smart chips in {file.get('name')}, aligning with basic sections"
|
||||
|
||||
@@ -33,9 +33,19 @@ class ConnectorMissingCredentialError(PermissionError):
|
||||
)
|
||||
|
||||
|
||||
class SectionKind(str, Enum):
|
||||
"""Discriminator for Section subclasses.
|
||||
"""
|
||||
|
||||
TEXT = "text"
|
||||
IMAGE = "image"
|
||||
TABULAR = "tabular"
|
||||
|
||||
|
||||
class Section(BaseModel):
|
||||
"""Base section class with common attributes"""
|
||||
|
||||
kind: SectionKind
|
||||
link: str | None = None
|
||||
text: str | None = None
|
||||
image_file_id: str | None = None
|
||||
@@ -44,6 +54,7 @@ class Section(BaseModel):
|
||||
class TextSection(Section):
|
||||
"""Section containing text content"""
|
||||
|
||||
kind: SectionKind = SectionKind.TEXT
|
||||
text: str
|
||||
|
||||
def __sizeof__(self) -> int:
|
||||
@@ -53,12 +64,22 @@ class TextSection(Section):
|
||||
class ImageSection(Section):
|
||||
"""Section containing an image reference"""
|
||||
|
||||
kind: SectionKind = SectionKind.IMAGE
|
||||
image_file_id: str
|
||||
|
||||
def __sizeof__(self) -> int:
|
||||
return sys.getsizeof(self.image_file_id) + sys.getsizeof(self.link)
|
||||
|
||||
|
||||
class TabularSection(Section):
|
||||
"""Section containing tabular data (csv/tsv content, or one sheet of
|
||||
an xlsx workbook rendered as CSV)."""
|
||||
|
||||
kind: SectionKind = SectionKind.TABULAR
|
||||
text: str # CSV representation in a string
|
||||
link: str
|
||||
|
||||
|
||||
class BasicExpertInfo(BaseModel):
|
||||
"""Basic Information for the owner of a document, any of the fields can be left as None
|
||||
Display fallback goes as follows:
|
||||
@@ -161,7 +182,7 @@ class DocumentBase(BaseModel):
|
||||
"""Used for Onyx ingestion api, the ID is inferred before use if not provided"""
|
||||
|
||||
id: str | None = None
|
||||
sections: list[TextSection | ImageSection]
|
||||
sections: list[TextSection | ImageSection | TabularSection]
|
||||
source: DocumentSource | None = None
|
||||
semantic_identifier: str # displayed in the UI as the main identifier for the doc
|
||||
# TODO(andrei): Ideally we could improve this to where each value is just a
|
||||
|
||||
@@ -60,7 +60,12 @@ from onyx.connectors.models import ExternalAccess
|
||||
from onyx.connectors.models import HierarchyNode
|
||||
from onyx.connectors.models import ImageSection
|
||||
from onyx.connectors.models import SlimDocument
|
||||
from onyx.connectors.models import TabularSection
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.connectors.cross_connector_utils.tabular_section_utils import (
|
||||
is_tabular_file,
|
||||
tabular_file_to_sections,
|
||||
)
|
||||
from onyx.connectors.sharepoint.connector_utils import get_sharepoint_external_access
|
||||
from onyx.db.enums import HierarchyNodeType
|
||||
from onyx.file_processing.extract_file_text import extract_text_and_images
|
||||
@@ -586,7 +591,7 @@ def _convert_driveitem_to_document_with_permissions(
|
||||
driveitem, f"Failed to download via graph api: {e}", e
|
||||
)
|
||||
|
||||
sections: list[TextSection | ImageSection] = []
|
||||
sections: list[TextSection | ImageSection | TabularSection] = []
|
||||
file_ext = get_file_ext(driveitem.name)
|
||||
|
||||
if not content_bytes:
|
||||
@@ -602,6 +607,19 @@ def _convert_driveitem_to_document_with_permissions(
|
||||
)
|
||||
image_section.link = driveitem.web_url
|
||||
sections.append(image_section)
|
||||
elif is_tabular_file(driveitem.name):
|
||||
try:
|
||||
sections.extend(
|
||||
tabular_file_to_sections(
|
||||
file=io.BytesIO(content_bytes),
|
||||
file_name=driveitem.name,
|
||||
link=driveitem.web_url or "",
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Failed to extract tabular sections for '{driveitem.name}': {e}"
|
||||
)
|
||||
else:
|
||||
|
||||
def _store_embedded_image(img_data: bytes, img_name: str) -> None:
|
||||
|
||||
@@ -462,30 +462,13 @@ def _remove_empty_runs(
|
||||
|
||||
return result
|
||||
|
||||
def xlsx_sheet_extraction(file: IO[Any], file_name: str = "") -> list[tuple[str, str]]:
|
||||
"""
|
||||
Converts each sheet in the excel file to a csv condensed string.
|
||||
Returns a string and the worksheet title for each worksheet
|
||||
|
||||
def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
|
||||
# TODO: switch back to this approach in a few months when markitdown
|
||||
# fixes their handling of excel files
|
||||
|
||||
# md = get_markitdown_converter()
|
||||
# stream_info = StreamInfo(
|
||||
# mimetype=SPREADSHEET_MIME_TYPE, filename=file_name or None, extension=".xlsx"
|
||||
# )
|
||||
# try:
|
||||
# workbook = md.convert(to_bytesio(file), stream_info=stream_info)
|
||||
# except (
|
||||
# BadZipFile,
|
||||
# ValueError,
|
||||
# FileConversionException,
|
||||
# UnsupportedFormatException,
|
||||
# ) as e:
|
||||
# error_str = f"Failed to extract text from {file_name or 'xlsx file'}: {e}"
|
||||
# if file_name.startswith("~"):
|
||||
# logger.debug(error_str + " (this is expected for files with ~)")
|
||||
# else:
|
||||
# logger.warning(error_str)
|
||||
# return ""
|
||||
# return workbook.markdown
|
||||
Returns a list of (csv_text, sheet)
|
||||
"""
|
||||
try:
|
||||
workbook = openpyxl.load_workbook(file, read_only=True)
|
||||
except BadZipFile as e:
|
||||
@@ -494,23 +477,30 @@ def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
|
||||
logger.debug(error_str + " (this is expected for files with ~)")
|
||||
else:
|
||||
logger.warning(error_str)
|
||||
return ""
|
||||
return []
|
||||
except Exception as e:
|
||||
if any(s in str(e) for s in KNOWN_OPENPYXL_BUGS):
|
||||
logger.error(
|
||||
f"Failed to extract text from {file_name or 'xlsx file'}. This happens due to a bug in openpyxl. {e}"
|
||||
)
|
||||
return ""
|
||||
return []
|
||||
raise
|
||||
|
||||
text_content = []
|
||||
sheets: list[tuple[str, str]] = []
|
||||
for sheet in workbook.worksheets:
|
||||
sheet_matrix = _clean_worksheet_matrix(_worksheet_to_matrix(sheet))
|
||||
buf = io.StringIO()
|
||||
writer = csv.writer(buf, lineterminator="\n")
|
||||
writer.writerows(sheet_matrix)
|
||||
text_content.append(buf.getvalue().rstrip("\n"))
|
||||
return TEXT_SECTION_SEPARATOR.join(text_content)
|
||||
csv_text = buf.getvalue().rstrip("\n")
|
||||
if csv_text.strip():
|
||||
sheets.append((csv_text, sheet.title))
|
||||
return sheets
|
||||
|
||||
|
||||
def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
|
||||
sheets = xlsx_sheet_extraction(file, file_name)
|
||||
return TEXT_SECTION_SEPARATOR.join(csv_text for csv_text, _title in sheets)
|
||||
|
||||
|
||||
def eml_to_text(file: IO[Any]) -> str:
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
from typing import cast
|
||||
|
||||
from chonkie import SentenceChunker
|
||||
|
||||
from onyx.configs.app_configs import AVERAGE_SUMMARY_EMBEDDINGS
|
||||
@@ -15,17 +13,15 @@ from onyx.configs.constants import SECTION_SEPARATOR
|
||||
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
get_metadata_keys_to_ignore,
|
||||
)
|
||||
from onyx.indexing.chunking import DocumentChunker
|
||||
from onyx.indexing.chunking import extract_blurb
|
||||
from onyx.connectors.models import IndexingDocument
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
|
||||
from onyx.indexing.models import DocAwareChunk
|
||||
from onyx.llm.utils import MAX_CONTEXT_TOKENS
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.text_processing import clean_text
|
||||
from onyx.utils.text_processing import shared_precompare_cleanup
|
||||
from shared_configs.configs import DOC_EMBEDDING_CONTEXT_SIZE
|
||||
from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
|
||||
|
||||
# Not supporting overlaps, we need a clean combination of chunks and it is unclear if overlaps
|
||||
# actually help quality at all
|
||||
@@ -154,9 +150,6 @@ class Chunker:
|
||||
self.tokenizer = tokenizer
|
||||
self.callback = callback
|
||||
|
||||
self.max_context = 0
|
||||
self.prompt_tokens = 0
|
||||
|
||||
# Create a token counter function that returns the count instead of the tokens
|
||||
def token_counter(text: str) -> int:
|
||||
return len(tokenizer.encode(text))
|
||||
@@ -186,234 +179,12 @@ class Chunker:
|
||||
else None
|
||||
)
|
||||
|
||||
def _split_oversized_chunk(self, text: str, content_token_limit: int) -> list[str]:
|
||||
"""
|
||||
Splits the text into smaller chunks based on token count to ensure
|
||||
no chunk exceeds the content_token_limit.
|
||||
"""
|
||||
tokens = self.tokenizer.tokenize(text)
|
||||
chunks = []
|
||||
start = 0
|
||||
total_tokens = len(tokens)
|
||||
while start < total_tokens:
|
||||
end = min(start + content_token_limit, total_tokens)
|
||||
token_chunk = tokens[start:end]
|
||||
chunk_text = " ".join(token_chunk)
|
||||
chunks.append(chunk_text)
|
||||
start = end
|
||||
return chunks
|
||||
|
||||
def _extract_blurb(self, text: str) -> str:
|
||||
"""
|
||||
Extract a short blurb from the text (first chunk of size `blurb_size`).
|
||||
"""
|
||||
# chunker is in `text` mode
|
||||
texts = cast(list[str], self.blurb_splitter.chunk(text))
|
||||
if not texts:
|
||||
return ""
|
||||
return texts[0]
|
||||
|
||||
def _get_mini_chunk_texts(self, chunk_text: str) -> list[str] | None:
|
||||
"""
|
||||
For "multipass" mode: additional sub-chunks (mini-chunks) for use in certain embeddings.
|
||||
"""
|
||||
if self.mini_chunk_splitter and chunk_text.strip():
|
||||
# chunker is in `text` mode
|
||||
return cast(list[str], self.mini_chunk_splitter.chunk(chunk_text))
|
||||
return None
|
||||
|
||||
# ADDED: extra param image_url to store in the chunk
|
||||
def _create_chunk(
|
||||
self,
|
||||
document: IndexingDocument,
|
||||
chunks_list: list[DocAwareChunk],
|
||||
text: str,
|
||||
links: dict[int, str],
|
||||
is_continuation: bool = False,
|
||||
title_prefix: str = "",
|
||||
metadata_suffix_semantic: str = "",
|
||||
metadata_suffix_keyword: str = "",
|
||||
image_file_id: str | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Helper to create a new DocAwareChunk, append it to chunks_list.
|
||||
"""
|
||||
new_chunk = DocAwareChunk(
|
||||
source_document=document,
|
||||
chunk_id=len(chunks_list),
|
||||
blurb=self._extract_blurb(text),
|
||||
content=text,
|
||||
source_links=links or {0: ""},
|
||||
image_file_id=image_file_id,
|
||||
section_continuation=is_continuation,
|
||||
title_prefix=title_prefix,
|
||||
metadata_suffix_semantic=metadata_suffix_semantic,
|
||||
metadata_suffix_keyword=metadata_suffix_keyword,
|
||||
mini_chunk_texts=self._get_mini_chunk_texts(text),
|
||||
large_chunk_id=None,
|
||||
doc_summary="",
|
||||
chunk_context="",
|
||||
contextual_rag_reserved_tokens=0, # set per-document in _handle_single_document
|
||||
self._document_chunker = DocumentChunker(
|
||||
tokenizer=tokenizer,
|
||||
blurb_splitter=self.blurb_splitter,
|
||||
chunk_splitter=self.chunk_splitter,
|
||||
mini_chunk_splitter=self.mini_chunk_splitter,
|
||||
)
|
||||
chunks_list.append(new_chunk)
|
||||
|
||||
def _chunk_document_with_sections(
    self,
    document: IndexingDocument,
    sections: list[Section],
    title_prefix: str,
    metadata_suffix_semantic: str,
    metadata_suffix_keyword: str,
    content_token_limit: int,
) -> list[DocAwareChunk]:
    """
    Loops through sections of the document, converting them into one or more chunks.
    Works with processed sections that are base Section objects.

    Small text sections are accumulated into a shared buffer until adding the
    next one would exceed `content_token_limit`. Image sections and text
    sections larger than the limit flush the buffer and are emitted on their
    own. At least one chunk is always returned, even for an empty document.
    """
    chunks: list[DocAwareChunk] = []
    link_offsets: dict[int, str] = {}
    chunk_text = ""

    for section_idx, section in enumerate(sections):
        # Get section text and other attributes
        section_text = clean_text(str(section.text or ""))
        section_link_text = section.link or ""
        image_url = section.image_file_id

        # If there is no useful content, skip. An empty first section is
        # kept only when the document has a title.
        if not section_text and (not document.title or section_idx > 0):
            logger.warning(
                f"Skipping empty or irrelevant section in doc {document.semantic_identifier}, link={section_link_text}"
            )
            continue

        # CASE 1: If this section has an image, force a separate chunk
        if image_url:
            # First, if we have any partially built text chunk, finalize it
            if chunk_text.strip():
                self._create_chunk(
                    document,
                    chunks,
                    chunk_text,
                    link_offsets,
                    is_continuation=False,
                    title_prefix=title_prefix,
                    metadata_suffix_semantic=metadata_suffix_semantic,
                    metadata_suffix_keyword=metadata_suffix_keyword,
                )
                chunk_text = ""
                link_offsets = {}

            # Create a chunk specifically for this image section
            # (Using the text summary that was generated during processing)
            self._create_chunk(
                document,
                chunks,
                section_text,
                links={0: section_link_text} if section_link_text else {},
                image_file_id=image_url,
                title_prefix=title_prefix,
                metadata_suffix_semantic=metadata_suffix_semantic,
                metadata_suffix_keyword=metadata_suffix_keyword,
            )
            continue

        # CASE 2: Normal text section
        section_token_count = len(self.tokenizer.encode(section_text))

        # If the section is large on its own, split it separately
        if section_token_count > content_token_limit:
            if chunk_text.strip():
                self._create_chunk(
                    document,
                    chunks,
                    chunk_text,
                    link_offsets,
                    is_continuation=False,
                    title_prefix=title_prefix,
                    metadata_suffix_semantic=metadata_suffix_semantic,
                    metadata_suffix_keyword=metadata_suffix_keyword,
                )
                chunk_text = ""
                link_offsets = {}

            # chunker is in `text` mode
            split_texts = cast(list[str], self.chunk_splitter.chunk(section_text))

            # Only the very first piece emitted for this section starts the
            # section; every later piece continues it, including the first
            # sub-piece of a later split that had to be split again.
            is_continuation = False
            for split_text in split_texts:
                # If even the split_text is bigger than strict limit, further split
                if (
                    STRICT_CHUNK_TOKEN_LIMIT
                    and len(self.tokenizer.encode(split_text)) > content_token_limit
                ):
                    pieces = self._split_oversized_chunk(
                        split_text, content_token_limit
                    )
                else:
                    pieces = [split_text]

                for piece in pieces:
                    self._create_chunk(
                        document,
                        chunks,
                        piece,
                        {0: section_link_text},
                        is_continuation=is_continuation,
                        title_prefix=title_prefix,
                        metadata_suffix_semantic=metadata_suffix_semantic,
                        metadata_suffix_keyword=metadata_suffix_keyword,
                    )
                    is_continuation = True
            continue

        # If we can still fit this section into the current chunk, do so
        current_token_count = len(self.tokenizer.encode(chunk_text))
        current_offset = len(shared_precompare_cleanup(chunk_text))
        next_section_tokens = (
            len(self.tokenizer.encode(SECTION_SEPARATOR)) + section_token_count
        )

        if next_section_tokens + current_token_count <= content_token_limit:
            if chunk_text:
                chunk_text += SECTION_SEPARATOR
            chunk_text += section_text
            link_offsets[current_offset] = section_link_text
        else:
            # finalize the existing chunk; skip it when the buffer is blank
            # (the separator cost alone can push a section that fits the
            # limit into this branch) so no empty-content chunk is emitted
            if chunk_text.strip():
                self._create_chunk(
                    document,
                    chunks,
                    chunk_text,
                    link_offsets,
                    is_continuation=False,
                    title_prefix=title_prefix,
                    metadata_suffix_semantic=metadata_suffix_semantic,
                    metadata_suffix_keyword=metadata_suffix_keyword,
                )
            # start a new chunk
            link_offsets = {0: section_link_text}
            chunk_text = section_text

    # finalize any leftover text chunk; also guarantees at least one chunk
    if chunk_text.strip() or not chunks:
        self._create_chunk(
            document,
            chunks,
            chunk_text,
            link_offsets or {0: ""},  # safe default
            is_continuation=False,
            title_prefix=title_prefix,
            metadata_suffix_semantic=metadata_suffix_semantic,
            metadata_suffix_keyword=metadata_suffix_keyword,
        )
    return chunks
|
||||
|
||||
def _handle_single_document(
|
||||
self, document: IndexingDocument
|
||||
@@ -423,7 +194,10 @@ class Chunker:
|
||||
logger.debug(f"Chunking {document.semantic_identifier}")
|
||||
|
||||
# Title prep
|
||||
title = self._extract_blurb(document.get_title_for_document_index() or "")
|
||||
title = extract_blurb(
|
||||
document.get_title_for_document_index() or "",
|
||||
self.blurb_splitter,
|
||||
)
|
||||
title_prefix = title + RETURN_SEPARATOR if title else ""
|
||||
title_tokens = len(self.tokenizer.encode(title_prefix))
|
||||
|
||||
@@ -491,7 +265,7 @@ class Chunker:
|
||||
# Use processed_sections if available (IndexingDocument), otherwise use original sections
|
||||
sections_to_chunk = document.processed_sections
|
||||
|
||||
normal_chunks = self._chunk_document_with_sections(
|
||||
normal_chunks = self._document_chunker.chunk(
|
||||
document,
|
||||
sections_to_chunk,
|
||||
title_prefix,
|
||||
|
||||
7
backend/onyx/indexing/chunking/__init__.py
Normal file
7
backend/onyx/indexing/chunking/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from onyx.indexing.chunking.document_chunker import DocumentChunker
|
||||
from onyx.indexing.chunking.section_chunker import extract_blurb
|
||||
|
||||
__all__ = [
|
||||
"DocumentChunker",
|
||||
"extract_blurb",
|
||||
]
|
||||
115
backend/onyx/indexing/chunking/document_chunker.py
Normal file
115
backend/onyx/indexing/chunking/document_chunker.py
Normal file
@@ -0,0 +1,115 @@
|
||||
from chonkie import SentenceChunker
|
||||
|
||||
from onyx.connectors.models import IndexingDocument
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import SectionKind
|
||||
from onyx.indexing.chunking.image_section_chunker import ImageChunker
|
||||
from onyx.indexing.chunking.section_chunker import AccumulatorState
|
||||
from onyx.indexing.chunking.section_chunker import ChunkPayload
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunker
|
||||
from onyx.indexing.chunking.tabular_section_chunker import TabularChunker
|
||||
from onyx.indexing.chunking.text_section_chunker import TextChunker
|
||||
from onyx.indexing.models import DocAwareChunk
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.text_processing import clean_text
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class DocumentChunker:
    """Turns a document's processed sections into DocAwareChunks.

    Drop-in replacement for `Chunker._chunk_document_with_sections`.
    """

    def __init__(
        self,
        tokenizer: BaseTokenizer,
        blurb_splitter: SentenceChunker,
        chunk_splitter: SentenceChunker,
        mini_chunk_splitter: SentenceChunker | None = None,
    ) -> None:
        self.blurb_splitter = blurb_splitter
        self.mini_chunk_splitter = mini_chunk_splitter

        # One SectionChunker per SectionKind; `_select_chunker` reads this table.
        text_chunker = TextChunker(
            tokenizer=tokenizer,
            chunk_splitter=chunk_splitter,
        )
        image_chunker = ImageChunker()
        tabular_chunker = TabularChunker(tokenizer=tokenizer)
        self._dispatch: dict[SectionKind, SectionChunker] = {
            SectionKind.TEXT: text_chunker,
            SectionKind.IMAGE: image_chunker,
            SectionKind.TABULAR: tabular_chunker,
        }

    def chunk(
        self,
        document: IndexingDocument,
        sections: list[Section],
        title_prefix: str,
        metadata_suffix_semantic: str,
        metadata_suffix_keyword: str,
        content_token_limit: int,
    ) -> list[DocAwareChunk]:
        """Chunk `sections` and number the resulting chunks from 0.

        When the sections yield no payload at all, a single empty payload is
        used so the document still produces one chunk.
        """
        section_payloads = self._collect_section_payloads(
            document=document,
            sections=sections,
            content_token_limit=content_token_limit,
        ) or [ChunkPayload(text="", links={0: ""})]

        doc_aware_chunks: list[DocAwareChunk] = []
        for chunk_id, payload in enumerate(section_payloads):
            doc_aware_chunks.append(
                payload.to_doc_aware_chunk(
                    document=document,
                    chunk_id=chunk_id,
                    blurb_splitter=self.blurb_splitter,
                    mini_chunk_splitter=self.mini_chunk_splitter,
                    title_prefix=title_prefix,
                    metadata_suffix_semantic=metadata_suffix_semantic,
                    metadata_suffix_keyword=metadata_suffix_keyword,
                )
            )
        return doc_aware_chunks

    def _collect_section_payloads(
        self,
        document: IndexingDocument,
        sections: list[Section],
        content_token_limit: int,
    ) -> list[ChunkPayload]:
        """Run every section through its chunker, threading the accumulator
        from one section to the next, and flush what is left at the end."""
        pending = AccumulatorState()
        collected: list[ChunkPayload] = []

        for position, section in enumerate(sections):
            cleaned_text = clean_text(str(section.text or ""))

            # An empty section is only kept when it is the first section of a
            # document that has a title.
            if not cleaned_text and not (document.title and position == 0):
                logger.warning(
                    f"Skipping empty or irrelevant section in doc "
                    f"{document.semantic_identifier}, link={section.link}"
                )
                continue

            outcome = self._select_chunker(section).chunk_section(
                section=section,
                accumulator=pending,
                content_token_limit=content_token_limit,
            )
            collected.extend(outcome.payloads)
            pending = outcome.accumulator

        collected.extend(pending.flush_to_list())
        return collected

    def _select_chunker(self, section: Section) -> SectionChunker:
        """Return the chunker registered for `section.kind`.

        Raises ValueError when no chunker is registered for that kind.
        """
        kind = section.kind
        try:
            chunker = self._dispatch[kind]
        except KeyError:
            raise ValueError(
                f"No SectionChunker registered for kind={kind}"
            )
        return chunker
|
||||
34
backend/onyx/indexing/chunking/image_section_chunker.py
Normal file
34
backend/onyx/indexing/chunking/image_section_chunker.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.indexing.chunking.section_chunker import AccumulatorState
|
||||
from onyx.indexing.chunking.section_chunker import ChunkPayload
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunker
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunkerOutput
|
||||
from onyx.utils.text_processing import clean_text
|
||||
|
||||
|
||||
class ImageChunker(SectionChunker):
    """Emits every image section as its own standalone chunk."""

    def chunk_section(
        self,
        section: Section,
        accumulator: AccumulatorState,
        content_token_limit: int,  # noqa: ARG002
    ) -> SectionChunkerOutput:
        """Flush any buffered text, then append one payload for the image.

        The returned accumulator is always a fresh, empty one.
        """
        assert section.image_file_id is not None

        link = section.link or ""
        image_payload = ChunkPayload(
            text=clean_text(str(section.text or "")),
            links={0: link} if link else {},
            image_file_id=section.image_file_id,
            is_continuation=False,
        )

        return SectionChunkerOutput(
            payloads=[*accumulator.flush_to_list(), image_payload],
            accumulator=AccumulatorState(),
        )
|
||||
102
backend/onyx/indexing/chunking/section_chunker.py
Normal file
102
backend/onyx/indexing/chunking/section_chunker.py
Normal file
@@ -0,0 +1,102 @@
|
||||
from abc import ABC
|
||||
from abc import abstractmethod
|
||||
from typing import cast
|
||||
|
||||
from chonkie import SentenceChunker
|
||||
from pydantic import BaseModel
|
||||
from pydantic import Field
|
||||
|
||||
from onyx.connectors.models import IndexingDocument
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.indexing.models import DocAwareChunk
|
||||
|
||||
|
||||
def extract_blurb(text: str, blurb_splitter: SentenceChunker) -> str:
    """Return the first piece `blurb_splitter` produces for `text`, or ""
    when the splitter produces nothing."""
    pieces = cast(list[str], blurb_splitter.chunk(text))
    return pieces[0] if pieces else ""
|
||||
|
||||
|
||||
def get_mini_chunk_texts(
    chunk_text: str,
    mini_chunk_splitter: SentenceChunker | None,
) -> list[str] | None:
    """Split `chunk_text` into mini-chunks.

    Returns None when no splitter is given or `chunk_text` is blank.
    """
    if not mini_chunk_splitter or not chunk_text.strip():
        return None
    return cast(list[str], mini_chunk_splitter.chunk(chunk_text))
|
||||
|
||||
|
||||
class ChunkPayload(BaseModel):
    """Section-local chunk content without document-scoped fields.

    The orchestrator upgrades these to DocAwareChunks via
    `to_doc_aware_chunk` after assigning chunk_ids and attaching
    title/metadata.
    """

    text: str
    links: dict[int, str]
    is_continuation: bool = False
    image_file_id: str | None = None

    def to_doc_aware_chunk(
        self,
        document: IndexingDocument,
        chunk_id: int,
        blurb_splitter: SentenceChunker,
        title_prefix: str = "",
        metadata_suffix_semantic: str = "",
        metadata_suffix_keyword: str = "",
        mini_chunk_splitter: SentenceChunker | None = None,
    ) -> DocAwareChunk:
        """Build the DocAwareChunk for this payload.

        The blurb and mini-chunks are derived from `self.text`; an empty
        `links` mapping is replaced by `{0: ""}`.
        """
        blurb = extract_blurb(self.text, blurb_splitter)
        mini_chunk_texts = get_mini_chunk_texts(self.text, mini_chunk_splitter)
        source_links = self.links if self.links else {0: ""}

        return DocAwareChunk(
            source_document=document,
            chunk_id=chunk_id,
            blurb=blurb,
            content=self.text,
            source_links=source_links,
            image_file_id=self.image_file_id,
            section_continuation=self.is_continuation,
            title_prefix=title_prefix,
            metadata_suffix_semantic=metadata_suffix_semantic,
            metadata_suffix_keyword=metadata_suffix_keyword,
            mini_chunk_texts=mini_chunk_texts,
            large_chunk_id=None,
            doc_summary="",
            chunk_context="",
            contextual_rag_reserved_tokens=0,
        )
|
||||
|
||||
|
||||
class AccumulatorState(BaseModel):
    """Cross-section text buffer threaded through SectionChunkers."""

    text: str = ""
    link_offsets: dict[int, str] = Field(default_factory=dict)

    def is_empty(self) -> bool:
        """True when the buffer holds nothing but whitespace."""
        return self.text.strip() == ""

    def flush_to_list(self) -> list["ChunkPayload"]:
        """Return the buffered text as a zero- or one-element payload list.

        The accumulator itself is not modified.
        """
        flushed: list[ChunkPayload] = []
        if not self.is_empty():
            flushed.append(ChunkPayload(text=self.text, links=self.link_offsets))
        return flushed
|
||||
|
||||
|
||||
class SectionChunkerOutput(BaseModel):
    """What a SectionChunker returns after handling one section."""

    # Chunks finalized while handling the section (may be empty).
    payloads: list[ChunkPayload]
    # Buffer state to hand to the chunker of the next section.
    accumulator: AccumulatorState
|
||||
|
||||
|
||||
class SectionChunker(ABC):
    """Abstract interface for turning one section into chunk payloads."""

    @abstractmethod
    def chunk_section(
        self,
        section: Section,
        accumulator: AccumulatorState,
        content_token_limit: int,
    ) -> SectionChunkerOutput:
        """Chunk `section` given the text buffered so far in `accumulator`.

        Returns the payloads finalized by this call together with the
        accumulator state to carry into the next section.
        """
        ...
|
||||
333
backend/onyx/indexing/chunking/tabular_section_chunker.py
Normal file
333
backend/onyx/indexing/chunking/tabular_section_chunker.py
Normal file
@@ -0,0 +1,333 @@
|
||||
import csv
|
||||
import io
|
||||
from collections.abc import Callable
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.indexing.chunking.section_chunker import AccumulatorState
|
||||
from onyx.indexing.chunking.section_chunker import ChunkPayload
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunker
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunkerOutput
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
# --- Markers / separators used in emitted chunks --------------------------
|
||||
|
||||
ROWS_MARKER = "Rows:"
|
||||
COLUMNS_MARKER = "Columns:"
|
||||
FIELD_VALUE_SEPARATOR = ", "
|
||||
ROW_JOIN = "\n"
|
||||
|
||||
# Minimum per-chunk row budget, in tokens. Guards against a prelude so large
# that no row could possibly fit: the row budget never drops below this floor,
# so the chunk still carries something. When the floor applies, prelude plus
# rows may exceed the requested token limit.
|
||||
_MIN_ROW_BUDGET_TOKENS = 16
|
||||
|
||||
|
||||
# --- Parsing --------------------------------------------------------------
|
||||
|
||||
|
||||
class _ParsedSection(BaseModel):
    """A tabular section parsed into a header row and data rows."""

    # `_parse_section` fills both `sheet_name` and `link` from `section.link`.
    sheet_name: str
    link: str
    # First non-empty CSV row.
    headers: list[str]
    # Remaining non-empty CSV rows; empty for a header-only table.
    rows: list[list[str]]
|
||||
|
||||
|
||||
def _parse_section(section: Section) -> _ParsedSection | None:
    """Parse a CSV-encoded tabular section into headers + rows.

    Rows whose cells are all blank are dropped, so stray separator lines
    do not turn into ghost rows. The first remaining row becomes the header
    and the rest become data rows (possibly none, for a header-only CSV).
    Returns None when the text is blank or has no non-empty row.
    """
    raw_text = section.text or ""
    if raw_text.strip() == "":
        return None

    populated_rows: list[list[str]] = []
    for row in csv.reader(io.StringIO(raw_text)):
        if any(cell.strip() for cell in row):
            populated_rows.append(row)
    if not populated_rows:
        return None

    header_row, *data_rows = populated_rows
    link = section.link or ""
    return _ParsedSection(
        sheet_name=link,
        link=link,
        headers=header_row,
        rows=data_rows,
    )
|
||||
|
||||
|
||||
# --- Step 1: FORMATTING ---------------------------------------------------
|
||||
#
|
||||
# Converts header + row → a single formatted string. Swap these out to
|
||||
# change the textual representation of rows in chunks (e.g. JSON-line,
|
||||
# bullet-list, markdown table row, etc.) without touching packing.
|
||||
|
||||
|
||||
def format_columns_header(headers: list[str]) -> str:
    """Build the 'Columns:' line placed in every chunk's prelude."""
    joined_headers = FIELD_VALUE_SEPARATOR.join(headers)
    return f"{COLUMNS_MARKER} {joined_headers}"
|
||||
|
||||
|
||||
def format_row_field_value(headers: list[str], row: list[str]) -> str:
    """Render one row as ``col=val, col=val, ...``.

    - A row shorter than the header list is padded with empty cells; cells
      beyond the last header are ignored.
    - Blank values are left out entirely, so chunks stay dense with
      retrieval-relevant content instead of ``col=`` padding.
    """
    # Align the row to the headers: truncate extras, pad missing cells.
    cells = row[: len(headers)] + [""] * (len(headers) - len(row))
    return FIELD_VALUE_SEPARATOR.join(
        f"{header}={value}"
        for header, value in zip(headers, cells)
        if value.strip()
    )
|
||||
|
||||
|
||||
# --- Step 2: PACKING ------------------------------------------------------
|
||||
#
|
||||
# Given formatted row strings + a prelude + a token budget, emit a list of
|
||||
# chunk strings that each fit within the budget. Swap this out to change
|
||||
# the packing strategy (e.g. one-row-per-chunk, fixed-row-count, etc.)
|
||||
# without touching formatting.
|
||||
|
||||
|
||||
class _RowPacker:
    """Packs formatted rows into chunks under a token limit.

    Each emitted chunk looks like::

        <prelude>
        <row 1>
        <row 2>
        ...

    The prelude is repeated at the top of every chunk so each chunk is
    self-describing for downstream retrieval.
    """

    def __init__(
        self,
        prelude: str,
        token_counter: Callable[[str], int],
        max_tokens: int,
        strict: bool,
    ) -> None:
        self.prelude = prelude
        self.token_counter = token_counter
        self.max_tokens = max_tokens
        self.strict = strict

        # Tokens left for rows once the prelude and the newline joining it
        # to the row block are paid for, floored at the minimum row budget.
        available = max_tokens - token_counter(prelude) - 1
        if available > _MIN_ROW_BUDGET_TOKENS:
            self._row_budget = available
        else:
            self._row_budget = _MIN_ROW_BUDGET_TOKENS

    def pack(self, rows: list[str]) -> list[str]:
        """Greedily group `rows` into chunk strings; empty rows are ignored."""
        packed: list[str] = []
        pending: list[str] = []
        pending_tokens = 0

        for row in rows:
            if not row:
                continue
            row_tokens = self.token_counter(row)

            # A row that cannot fit a chunk of its own: flush what is
            # pending, then emit every split piece as a standalone chunk.
            if row_tokens > self._row_budget:
                if pending:
                    packed.append(self._assemble(pending))
                    pending, pending_tokens = [], 0
                packed.extend(
                    self._assemble([piece])
                    for piece in self._split_oversized_row(row)
                )
                continue

            # The extra 1 is the newline separating this row from the
            # rows already pending.
            if pending and pending_tokens + 1 + row_tokens > self._row_budget:
                packed.append(self._assemble(pending))
                pending, pending_tokens = [], 0

            if pending:
                pending_tokens += 1
            pending.append(row)
            pending_tokens += row_tokens

        if pending:
            packed.append(self._assemble(pending))
        return packed

    def _assemble(self, rows: list[str]) -> str:
        """Prefix the joined rows with the prelude."""
        row_block = ROW_JOIN.join(rows)
        return f"{self.prelude}{ROW_JOIN}{row_block}"

    def _split_oversized_row(self, row: str) -> list[str]:
        """Split a single over-budget row.

        The row is first cut at ``field=value`` boundaries to keep the
        column-level structure. When ``strict`` is set, any piece still
        over the row budget is then passed to the character-level splitter.
        """
        pieces = _split_by_field_boundary(
            row, self._row_budget, self.token_counter
        )
        if not self.strict:
            return pieces

        bounded: list[str] = []
        for piece in pieces:
            if self.token_counter(piece) <= self._row_budget:
                bounded.append(piece)
            else:
                bounded.extend(
                    _hard_split_by_chars(piece, self._row_budget, self.token_counter)
                )
        return bounded
|
||||
|
||||
|
||||
def _split_by_field_boundary(
    row: str,
    max_tokens: int,
    token_counter: Callable[[str], int],
) -> list[str]:
    """Greedily regroup a ``col=val, col=val, ...`` row at ``, `` boundaries.

    Fields are added to the current group until the next one would push it
    over `max_tokens`. A single field larger than `max_tokens` still forms a
    group of its own.
    """
    separator_cost = token_counter(FIELD_VALUE_SEPARATOR)
    groups: list[list[str]] = []
    current: list[str] = []
    current_tokens = 0

    for field in row.split(FIELD_VALUE_SEPARATOR):
        field_tokens = token_counter(field)
        if not current:
            # First field of a group is always accepted, with no separator.
            current, current_tokens = [field], field_tokens
            continue

        grown_tokens = current_tokens + separator_cost + field_tokens
        if grown_tokens > max_tokens:
            groups.append(current)
            current, current_tokens = [field], field_tokens
        else:
            current.append(field)
            current_tokens = grown_tokens

    if current:
        groups.append(current)
    return [FIELD_VALUE_SEPARATOR.join(group) for group in groups]
|
||||
|
||||
|
||||
def _hard_split_by_chars(
|
||||
text: str,
|
||||
max_tokens: int,
|
||||
token_counter: Callable[[str], int],
|
||||
) -> list[str]:
|
||||
"""Last-resort character split when field-level splitting can't
|
||||
reduce a piece below ``max_tokens`` (e.g. a single field contains a
|
||||
giant value). Approximates via chars-per-token from the input string
|
||||
itself, then slices."""
|
||||
total_tokens = max(1, token_counter(text))
|
||||
approx_chars_per_token = max(1, len(text) // total_tokens)
|
||||
window = max(1, max_tokens * approx_chars_per_token)
|
||||
return [text[i : i + window] for i in range(0, len(text), window)]
|
||||
|
||||
|
||||
# --- Step 3: ORCHESTRATION ------------------------------------------------
|
||||
|
||||
|
||||
class TabularChunker(SectionChunker):
    """Chunks tabular sections (csv text) into row-packed field=value chunks.

    Every emitted chunk starts with a prelude (sheet name + Rows: marker +
    Columns: header line) and is followed by as many ``col=val, col=val``
    rows as fit under ``content_token_limit``. Rows too large for a single
    chunk are split at field boundaries (and, under
    ``STRICT_CHUNK_TOKEN_LIMIT``, hard-split by characters as a fallback).
    """

    def __init__(self, tokenizer: BaseTokenizer) -> None:
        self.tokenizer = tokenizer

    def chunk_section(
        self,
        section: Section,
        accumulator: AccumulatorState,
        content_token_limit: int,
    ) -> SectionChunkerOutput:
        """Emit the section as one or more standalone table chunks.

        An unparseable section produces no payloads and leaves the
        accumulator untouched; otherwise pending text is flushed first and
        a fresh accumulator is returned.
        """
        assert section.text is not None

        parsed = _parse_section(section)
        if parsed is None:
            logger.warning(
                f"TabularChunker: skipping unparseable section (link={section.link})"
            )
            return SectionChunkerOutput(payloads=[], accumulator=accumulator)

        # Tabular sections are structurally standalone — flush any pending
        # text buffer before emitting our own chunks, matching ImageChunker.
        payloads = accumulator.flush_to_list()
        prelude = self._build_prelude(parsed)

        formatted_rows: list[str] = []
        for row in parsed.rows:
            line = format_row_field_value(parsed.headers, row)
            if line:
                formatted_rows.append(line)

        if formatted_rows:
            packer = _RowPacker(
                prelude=prelude,
                token_counter=self._count_tokens,
                max_tokens=content_token_limit,
                strict=STRICT_CHUNK_TOKEN_LIMIT,
            )
            chunk_texts = packer.pack(formatted_rows)
        else:
            # Header-only table (no non-empty rows): a single prelude-only
            # chunk keeps the column schema indexed.
            chunk_texts = [prelude]

        for index, text in enumerate(chunk_texts):
            payloads.append(
                ChunkPayload(
                    text=text,
                    links={0: parsed.link},
                    is_continuation=(index != 0),
                )
            )

        return SectionChunkerOutput(
            payloads=payloads,
            accumulator=AccumulatorState(),
        )

    def _build_prelude(self, parsed: _ParsedSection) -> str:
        """The per-chunk header: sheet name (if any) + ``Rows:`` marker
        + ``Columns:`` header line. Swap this to change the prelude shape."""
        header_lines = [ROWS_MARKER, format_columns_header(parsed.headers)]
        if parsed.sheet_name:
            header_lines.insert(0, parsed.sheet_name)
        return ROW_JOIN.join(header_lines)

    def _count_tokens(self, text: str) -> int:
        """Number of tokens the tokenizer produces for `text`."""
        token_ids = self.tokenizer.encode(text)
        return len(token_ids)
|
||||
129
backend/onyx/indexing/chunking/text_section_chunker.py
Normal file
129
backend/onyx/indexing/chunking/text_section_chunker.py
Normal file
@@ -0,0 +1,129 @@
|
||||
from typing import cast
|
||||
|
||||
from chonkie import SentenceChunker
|
||||
|
||||
from onyx.configs.constants import SECTION_SEPARATOR
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.indexing.chunking.section_chunker import AccumulatorState
|
||||
from onyx.indexing.chunking.section_chunker import ChunkPayload
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunker
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunkerOutput
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
from onyx.utils.text_processing import clean_text
|
||||
from onyx.utils.text_processing import shared_precompare_cleanup
|
||||
from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
|
||||
|
||||
|
||||
class TextChunker(SectionChunker):
    """Chunks plain-text sections.

    Small sections are merged into the shared accumulator; a section larger
    than the token limit is split and emitted on its own.
    """

    def __init__(
        self,
        tokenizer: BaseTokenizer,
        chunk_splitter: SentenceChunker,
    ) -> None:
        self.tokenizer = tokenizer
        self.chunk_splitter = chunk_splitter

    def chunk_section(
        self,
        section: Section,
        accumulator: AccumulatorState,
        content_token_limit: int,
    ) -> SectionChunkerOutput:
        """Merge the section into the accumulator, or flush and restart.

        Returns no payloads when the section fits into the current buffer,
        the flushed buffer when it does not, and the split pieces of the
        section (after the flushed buffer) when the section alone exceeds
        `content_token_limit`.
        """
        section_text = clean_text(str(section.text or ""))
        section_link = section.link or ""
        section_token_count = len(self.tokenizer.encode(section_text))

        # Oversized — flush buffer and split the section
        if section_token_count > content_token_limit:
            return self._handle_oversized_section(
                section_text=section_text,
                section_link=section_link,
                accumulator=accumulator,
                content_token_limit=content_token_limit,
            )

        current_token_count = len(self.tokenizer.encode(accumulator.text))
        # The separator is counted even when the buffer is empty.
        next_section_tokens = (
            len(self.tokenizer.encode(SECTION_SEPARATOR)) + section_token_count
        )

        # Fits — extend the accumulator
        if next_section_tokens + current_token_count <= content_token_limit:
            # The link is keyed by where this section starts in the cleaned
            # buffer text.
            offset = len(shared_precompare_cleanup(accumulator.text))
            new_text = accumulator.text
            if new_text:
                new_text += SECTION_SEPARATOR
            new_text += section_text
            return SectionChunkerOutput(
                payloads=[],
                accumulator=AccumulatorState(
                    text=new_text,
                    link_offsets={**accumulator.link_offsets, offset: section_link},
                ),
            )

        # Doesn't fit — flush buffer and restart with this section
        return SectionChunkerOutput(
            payloads=accumulator.flush_to_list(),
            accumulator=AccumulatorState(
                text=section_text,
                link_offsets={0: section_link},
            ),
        )

    def _handle_oversized_section(
        self,
        section_text: str,
        section_link: str,
        accumulator: AccumulatorState,
        content_token_limit: int,
    ) -> SectionChunkerOutput:
        """Flush the buffer, then emit the section as a run of split pieces.

        Only the first piece emitted for the section is marked as a section
        start; every later piece is a continuation, including the first
        sub-piece of a later split that had to be split again.
        """
        payloads = accumulator.flush_to_list()

        split_texts = cast(
            list[str], self.chunk_splitter.chunk(section_text)
        )
        is_continuation = False
        for split_text in split_texts:
            # Under the strict limit, a split that is still too large is cut
            # again on token boundaries.
            if (
                STRICT_CHUNK_TOKEN_LIMIT
                and len(self.tokenizer.encode(split_text)) > content_token_limit
            ):
                pieces = self._split_oversized_chunk(
                    split_text, content_token_limit
                )
            else:
                pieces = [split_text]

            for piece in pieces:
                payloads.append(
                    ChunkPayload(
                        text=piece,
                        links={0: section_link},
                        is_continuation=is_continuation,
                    )
                )
                is_continuation = True

        return SectionChunkerOutput(
            payloads=payloads,
            accumulator=AccumulatorState(),
        )

    def _split_oversized_chunk(
        self, text: str, content_token_limit: int
    ) -> list[str]:
        """Cut `text` into pieces of at most `content_token_limit` tokens.

        Tokens come from `self.tokenizer.tokenize` and are re-joined with
        single spaces.

        Raises:
            ValueError: if `content_token_limit` is not positive, since no
                window size could then make progress.
        """
        if content_token_limit <= 0:
            raise ValueError(
                f"content_token_limit must be positive, got {content_token_limit}"
            )
        tokens = self.tokenizer.tokenize(text)
        return [
            " ".join(tokens[start : start + content_token_limit])
            for start in range(0, len(tokens), content_token_limit)
        ]
|
||||
@@ -542,6 +542,7 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
|
||||
**document.model_dump(),
|
||||
processed_sections=[
|
||||
Section(
|
||||
kind=section.kind,
|
||||
text=section.text if isinstance(section, TextSection) else "",
|
||||
link=section.link,
|
||||
image_file_id=(
|
||||
@@ -566,6 +567,7 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
|
||||
if isinstance(section, ImageSection):
|
||||
# Default section with image path preserved - ensure text is always a string
|
||||
processed_section = Section(
|
||||
kind=section.kind,
|
||||
link=section.link,
|
||||
image_file_id=section.image_file_id,
|
||||
text="", # Initialize with empty string
|
||||
@@ -609,6 +611,7 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
|
||||
# For TextSection, create a base Section with text and link
|
||||
elif isinstance(section, TextSection):
|
||||
processed_section = Section(
|
||||
kind=section.kind,
|
||||
text=section.text or "", # Ensure text is always a string, not None
|
||||
link=section.link,
|
||||
image_file_id=None,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import IndexingDocument
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import SectionKind
|
||||
|
||||
|
||||
FINAL_CONTEXT_DOCUMENTS_ID = "final_context_documents"
|
||||
@@ -17,6 +18,7 @@ def documents_to_indexing_documents(
|
||||
processed_sections = []
|
||||
for section in document.sections:
|
||||
processed_section = Section(
|
||||
kind=SectionKind.TEXT,
|
||||
text=section.text or "",
|
||||
link=section.link,
|
||||
image_file_id=None,
|
||||
|
||||
@@ -4,6 +4,7 @@ from typing import cast
|
||||
import openpyxl
|
||||
from openpyxl.worksheet.worksheet import Worksheet
|
||||
|
||||
from onyx.file_processing.extract_file_text import xlsx_sheet_extraction
|
||||
from onyx.file_processing.extract_file_text import xlsx_to_text
|
||||
|
||||
|
||||
@@ -196,3 +197,136 @@ class TestXlsxToText:
|
||||
assert "r1c1" in lines[0] and "r1c2" in lines[0]
|
||||
assert "r2c1" in lines[1] and "r2c2" in lines[1]
|
||||
assert "r3c1" in lines[2] and "r3c2" in lines[2]
|
||||
|
||||
|
||||
class TestXlsxSheetExtraction:
    """Tests for ``xlsx_sheet_extraction``, which is expected to return one
    ``(csv_text, sheet_title)`` tuple per non-empty sheet."""

    def test_one_tuple_per_sheet(self) -> None:
        """Two populated sheets yield two tuples in workbook order."""
        workbook = _make_xlsx(
            {
                "Revenue": [["Month", "Amount"], ["Jan", "100"]],
                "Expenses": [["Category", "Cost"], ["Rent", "500"]],
            }
        )
        extracted = xlsx_sheet_extraction(workbook)
        assert len(extracted) == 2
        # Sheet order must follow the workbook.
        sheet_names = [name for _text, name in extracted]
        assert sheet_names == ["Revenue", "Expenses"]
        # Each sheet's cells land in its own tuple.
        (revenue_csv, _), (expenses_csv, _) = extracted
        assert "Month" in revenue_csv
        assert "Jan" in revenue_csv
        assert "Category" in expenses_csv
        assert "Rent" in expenses_csv

    def test_tuple_structure_is_csv_text_then_title(self) -> None:
        """Pins the positional layout (csv_text, sheet_title) so callers
        that unpack by position cannot break silently."""
        extracted = xlsx_sheet_extraction(_make_xlsx({"MySheet": [["a", "b"]]}))
        assert len(extracted) == 1
        body, name = extracted[0]
        assert name == "MySheet"
        for cell in ("a", "b"):
            assert cell in body

    def test_empty_sheet_is_skipped(self) -> None:
        """A sheet whose CSV is empty or whitespace-only must be filtered
        out by the `if csv_text.strip():` guard."""
        workbook = _make_xlsx(
            {
                "Data": [["a", "b"]],
                "Empty": [],
            }
        )
        extracted = xlsx_sheet_extraction(workbook)
        assert len(extracted) == 1
        _, surviving_title = extracted[0]
        assert surviving_title == "Data"

    def test_empty_workbook_returns_empty_list(self) -> None:
        """All sheets empty → empty list (not a list of empty tuples)."""
        workbook = _make_xlsx({"Sheet1": [], "Sheet2": []})
        assert xlsx_sheet_extraction(workbook) == []

    def test_single_sheet(self) -> None:
        """A one-sheet workbook yields exactly one tuple with its content."""
        extracted = xlsx_sheet_extraction(
            _make_xlsx({"Only": [["x", "y"], ["1", "2"]]})
        )
        assert len(extracted) == 1
        body, name = extracted[0]
        assert name == "Only"
        assert "x" in body
        assert "1" in body

    def test_bad_zip_returns_empty_list(self) -> None:
        """Bytes that are not a zip archive produce an empty result."""
        not_a_workbook = io.BytesIO(b"not a zip file")
        assert xlsx_sheet_extraction(not_a_workbook, file_name="test.xlsx") == []

    def test_bad_zip_tilde_file_returns_empty_list(self) -> None:
        """`~$`-prefixed files are Excel lock files; failure should log
        at debug (not warning) and still return []."""
        not_a_workbook = io.BytesIO(b"not a zip file")
        result = xlsx_sheet_extraction(not_a_workbook, file_name="~$temp.xlsx")
        assert result == []

    def test_csv_content_matches_xlsx_to_text_per_sheet(self) -> None:
        """For a single-sheet workbook the csv_text from
        xlsx_sheet_extraction should equal xlsx_to_text's output, since
        both are expected to share the per-sheet CSV logic."""
        rows = [["Name", "Age"], ["Alice", "30"]]
        reference = xlsx_to_text(_make_xlsx({"People": rows}))

        extracted = xlsx_sheet_extraction(_make_xlsx({"People": rows}))
        assert len(extracted) == 1
        body, name = extracted[0]
        assert name == "People"
        assert body.strip() == reference.strip()

    def test_commas_in_cells_are_quoted(self) -> None:
        """Cells containing a comma must be emitted as quoted CSV fields."""
        extracted = xlsx_sheet_extraction(
            _make_xlsx({"S1": [["hello, world", "normal"]]})
        )
        assert len(extracted) == 1
        body = extracted[0][0]
        assert '"hello, world"' in body

    def test_long_empty_row_run_capped_within_sheet(self) -> None:
        """Matrix cleanup is per-sheet: a run of more than 2 empty rows
        collapses to 2, the sheet stays non-empty, and it is still
        returned."""
        rows = [["header"]] + [[""] for _ in range(4)] + [["data"]]
        extracted = xlsx_sheet_extraction(_make_xlsx({"S1": rows}))
        assert len(extracted) == 1
        body, _ = extracted[0]
        kept = [row for row in body.strip().split("\n") if row.strip()]
        # header + 2 empty (capped) + data = 4 lines
        assert len(kept) == 4
        assert "header" in kept[0]
        assert "data" in kept[-1]

    def test_sheet_title_with_special_chars_preserved(self) -> None:
        """Sheet titles with spaces, punctuation or non-ASCII characters
        come back verbatim — the title is used as a link anchor
        downstream."""
        workbook = _make_xlsx(
            {
                "Q1 Revenue (USD)": [["a", "b"]],
                "Données": [["c", "d"]],
            }
        )
        sheet_names = [name for _text, name in xlsx_sheet_extraction(workbook)]
        assert "Q1 Revenue (USD)" in sheet_names
        assert "Données" in sheet_names
|
||||
|
||||
804
backend/tests/unit/onyx/indexing/test_document_chunker.py
Normal file
804
backend/tests/unit/onyx/indexing/test_document_chunker.py
Normal file
@@ -0,0 +1,804 @@
|
||||
"""Unit tests for DocumentChunker.chunk (replacement for
|
||||
Chunker._chunk_document_with_sections).
|
||||
|
||||
These tests use a fake character-level tokenizer so every char counts as
|
||||
exactly one token. This makes token-limit arithmetic deterministic and lets
|
||||
us exercise every branch of the method without pulling real embedding
|
||||
models into the test.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from chonkie import SentenceChunker
|
||||
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.configs.constants import SECTION_SEPARATOR
|
||||
from onyx.connectors.models import SectionKind
|
||||
from onyx.connectors.models import IndexingDocument
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.indexing.chunking import DocumentChunker
|
||||
from onyx.indexing.chunking import text_section_chunker as text_chunker_module
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
|
||||
|
||||
class CharTokenizer(BaseTokenizer):
    """Fake tokenizer in which every character is exactly one token, so
    token counts equal string lengths and limit arithmetic is predictable."""

    def encode(self, string: str) -> list[int]:
        # Token id == Unicode code point of the character.
        return list(map(ord, string))

    def tokenize(self, string: str) -> list[str]:
        # One single-character token per input character.
        return [char for char in string]

    def decode(self, tokens: list[int]) -> str:
        # Inverse of encode(): code points back to characters.
        return "".join(map(chr, tokens))
|
||||
|
||||
|
||||
# With a char-level tokenizer, each char is a token. 200 is comfortably
|
||||
# above BLURB_SIZE (128) so the blurb splitter won't get weird on small text.
|
||||
CHUNK_LIMIT = 200
|
||||
|
||||
|
||||
def _make_document_chunker(
    chunk_token_limit: int = CHUNK_LIMIT,
) -> DocumentChunker:
    """Build a DocumentChunker wired to the character-level fake tokenizer.

    Both sentence splitters count tokens as ``len(text)`` so they agree
    with CharTokenizer. The blurb splitter is fixed at 128; the chunk
    splitter uses ``chunk_token_limit``.
    """

    def count_chars(text: str) -> int:
        return len(text)

    def build_splitter(size: int) -> SentenceChunker:
        # Identical settings for both splitters apart from the chunk size.
        return SentenceChunker(
            tokenizer_or_token_counter=count_chars,
            chunk_size=size,
            chunk_overlap=0,
            return_type="texts",
        )

    return DocumentChunker(
        tokenizer=CharTokenizer(),
        blurb_splitter=build_splitter(128),
        chunk_splitter=build_splitter(chunk_token_limit),
    )
|
||||
|
||||
|
||||
def _make_doc(
    sections: list[Section],
    title: str | None = "Test Doc",
    doc_id: str = "doc1",
) -> IndexingDocument:
    """Create a minimal web-sourced IndexingDocument for chunker tests.

    ``sections`` is stored as ``processed_sections``; the raw ``sections``
    field is left empty because the chunker reads the processed ones.
    ``doc_id`` doubles as the semantic identifier.
    """
    return IndexingDocument(
        id=doc_id,
        semantic_identifier=doc_id,
        title=title,
        source=DocumentSource.WEB,
        metadata={},
        processed_sections=sections,
        sections=[],  # raw sections are unused by the method under test
    )
|
||||
|
||||
|
||||
# --- Empty / degenerate input -------------------------------------------------
|
||||
|
||||
|
||||
def test_empty_processed_sections_returns_single_empty_safety_chunk() -> None:
    """With no sections at all, chunk() must still emit one empty chunk
    (the `or not chunks` safety branch) carrying the prefix and suffixes."""
    chunker = _make_document_chunker()
    document = _make_doc(sections=[])

    produced = chunker.chunk(
        document=document,
        sections=[],
        title_prefix="TITLE\n",
        metadata_suffix_semantic="meta_sem",
        metadata_suffix_keyword="meta_kw",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(produced) == 1
    only = produced[0]
    assert only.content == ""
    assert only.chunk_id == 0
    assert only.title_prefix == "TITLE\n"
    assert only.metadata_suffix_semantic == "meta_sem"
    assert only.metadata_suffix_keyword == "meta_kw"
    # With nothing to link, the chunk gets the safe default offsets.
    assert only.source_links == {0: ""}
|
||||
|
||||
|
||||
def test_empty_section_on_first_position_without_title_is_skipped() -> None:
    """Untitled document whose first section is empty: the guard
    `(not document.title or section_idx > 0)` causes it to be skipped."""
    chunker = _make_document_chunker()
    blank_section = Section(kind=SectionKind.TEXT, text="", link="l0")
    document = _make_doc(sections=[blank_section], title=None)

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    # Skipped → nothing accumulated, but the safety branch still emits
    # one empty chunk.
    assert len(produced) == 1
    assert produced[0].content == ""
|
||||
|
||||
|
||||
def test_empty_section_on_later_position_is_skipped_even_with_title() -> None:
    """An empty section at index > 0 is dropped whether or not the
    document has a title."""
    chunker = _make_document_chunker()
    section_list = [
        Section(kind=SectionKind.TEXT, text="Alpha.", link="l0"),
        Section(kind=SectionKind.TEXT, text="", link="l1"),  # expected to be dropped
        Section(kind=SectionKind.TEXT, text="Beta.", link="l2"),
    ]
    document = _make_doc(sections=section_list)

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(produced) == 1
    merged = produced[0]
    assert "Alpha." in merged.content
    assert "Beta." in merged.content
    # Only l0 and l2 may appear among the link offsets; l1 was skipped.
    assert "l1" not in (merged.source_links or {}).values()
|
||||
|
||||
|
||||
# --- Single text section ------------------------------------------------------
|
||||
|
||||
|
||||
def test_single_small_text_section_becomes_one_chunk() -> None:
    """One short text section yields exactly one chunk that carries the
    section text, its link, and the supplied prefix/suffix values."""
    chunker = _make_document_chunker()
    hello_section = Section(
        kind=SectionKind.TEXT, text="Hello world.", link="https://a"
    )
    document = _make_doc(sections=[hello_section])

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="TITLE\n",
        metadata_suffix_semantic="ms",
        metadata_suffix_keyword="mk",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(produced) == 1
    (only,) = produced
    assert only.content == "Hello world."
    assert only.source_links == {0: "https://a"}
    assert only.title_prefix == "TITLE\n"
    assert only.metadata_suffix_semantic == "ms"
    assert only.metadata_suffix_keyword == "mk"
    assert only.section_continuation is False
    assert only.image_file_id is None
|
||||
|
||||
|
||||
# --- Multiple text sections combined -----------------------------------------
|
||||
|
||||
|
||||
def test_multiple_small_sections_combine_into_one_chunk() -> None:
    """Three short sections fit within the limit and are merged into one
    chunk, joined by SECTION_SEPARATOR, with one link offset each."""
    chunker = _make_document_chunker()
    document = _make_doc(
        sections=[
            Section(kind=SectionKind.TEXT, text="Part one.", link="l1"),
            Section(kind=SectionKind.TEXT, text="Part two.", link="l2"),
            Section(kind=SectionKind.TEXT, text="Part three.", link="l3"),
        ]
    )

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(produced) == 1
    merged = produced[0]
    assert merged.content == SECTION_SEPARATOR.join(
        ["Part one.", "Part two.", "Part three."]
    )

    # Offsets are the shared_precompare_cleanup length of the chunk text
    # *before* each section was appended:
    #   ""                        -> ""               -> 0
    #   "Part one."               -> "partone"        -> 7
    #   "Part one.\n\nPart two."  -> "partoneparttwo" -> 14
    assert merged.source_links == {0: "l1", 7: "l2", 14: "l3"}
|
||||
|
||||
|
||||
def test_sections_overflow_into_second_chunk() -> None:
    """When two sections together exceed content_token_limit, the first
    is finalized as its own chunk and the second starts a new one."""
    chunker = _make_document_chunker()
    # Char-level tokenizer: a 120-char section is 120 tokens. Two of them
    # plus the separator exceed the 200-token limit and force a flush.
    first_text = "A" * 120
    second_text = "B" * 120
    document = _make_doc(
        sections=[
            Section(kind=SectionKind.TEXT, text=first_text, link="la"),
            Section(kind=SectionKind.TEXT, text=second_text, link="lb"),
        ],
    )

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(produced) == 2
    head, tail = produced
    assert head.content == first_text
    assert tail.content == second_text
    # Neither chunk is a continuation: the second begins a new section.
    assert head.section_continuation is False
    assert tail.section_continuation is False
    # Ids count up from zero.
    assert head.chunk_id == 0
    assert tail.chunk_id == 1
    # Each chunk carries only its own section's link.
    assert head.source_links == {0: "la"}
    assert tail.source_links == {0: "lb"}
|
||||
|
||||
|
||||
# --- Image section handling --------------------------------------------------
|
||||
|
||||
|
||||
def test_image_only_section_produces_single_chunk_with_image_id() -> None:
    """A lone image section becomes one chunk that keeps the image file
    id, the summary text and the link."""
    chunker = _make_document_chunker()
    image_section = Section(
        kind=SectionKind.IMAGE,
        text="summary of image",
        link="https://img",
        image_file_id="img-abc",
    )
    document = _make_doc(sections=[image_section])

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(produced) == 1
    image_chunk = produced[0]
    assert image_chunk.image_file_id == "img-abc"
    assert image_chunk.content == "summary of image"
    assert image_chunk.source_links == {0: "https://img"}
|
||||
|
||||
|
||||
def test_image_section_flushes_pending_text_and_creates_its_own_chunk() -> None:
    """Text, then image, then text: the buffered text is flushed before
    the image chunk, and the trailing text starts a fresh chunk."""
    chunker = _make_document_chunker()
    document = _make_doc(
        sections=[
            Section(kind=SectionKind.TEXT, text="Pending text.", link="ltext"),
            Section(
                kind=SectionKind.IMAGE,
                text="image summary",
                link="limage",
                image_file_id="img-1",
            ),
            Section(kind=SectionKind.TEXT, text="Trailing text.", link="ltail"),
        ],
    )

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(produced) == 3
    flushed, image_chunk, trailing = produced

    # Chunk 0: the text that was pending when the image arrived.
    assert flushed.content == "Pending text."
    assert flushed.image_file_id is None
    assert flushed.source_links == {0: "ltext"}

    # Chunk 1: the image on its own.
    assert image_chunk.content == "image summary"
    assert image_chunk.image_file_id == "img-1"
    assert image_chunk.source_links == {0: "limage"}

    # Chunk 2: text after the image, accumulated from scratch.
    assert trailing.content == "Trailing text."
    assert trailing.image_file_id is None
    assert trailing.source_links == {0: "ltail"}
|
||||
|
||||
|
||||
def test_image_section_without_link_gets_empty_links_dict() -> None:
    """An image section with no link passes an empty links dict ({} rather
    than {0: ""}); the resulting chunk still ends up with {0: ""}."""
    chunker = _make_document_chunker()
    linkless_image = Section(
        kind=SectionKind.IMAGE,
        text="img",
        link=None,
        image_file_id="img-xyz",
    )
    document = _make_doc(sections=[linkless_image])

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(produced) == 1
    image_chunk = produced[0]
    assert image_chunk.image_file_id == "img-xyz"
    # to_doc_aware_chunk substitutes {0: ""} when handed an empty dict.
    assert image_chunk.source_links == {0: ""}
|
||||
|
||||
|
||||
# --- Oversized section splitting ---------------------------------------------
|
||||
|
||||
|
||||
def test_oversized_section_is_split_across_multiple_chunks() -> None:
    """A section longer than content_token_limit goes through the
    chunk_splitter and yields several chunks; every chunk after the first
    is flagged as a continuation."""
    chunker = _make_document_chunker()
    # Well over CHUNK_LIMIT (200) characters, built from many short
    # sentences so chonkie's SentenceChunker has clean split points.
    long_text = (
        "Alpha beta gamma. Delta epsilon zeta. Eta theta iota. "
        "Kappa lambda mu. Nu xi omicron. Pi rho sigma. Tau upsilon phi. "
        "Chi psi omega. One two three. Four five six. Seven eight nine. "
        "Ten eleven twelve. Thirteen fourteen fifteen. "
        "Sixteen seventeen eighteen. Nineteen twenty."
    )
    assert len(long_text) > CHUNK_LIMIT

    document = _make_doc(
        sections=[Section(kind=SectionKind.TEXT, text=long_text, link="big-link")],
    )

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(produced) >= 2
    # The first chunk opens the section; the rest continue it.
    assert produced[0].section_continuation is False
    for later in produced[1:]:
        assert later.section_continuation is True
    # The section link is attached to every piece.
    for piece in produced:
        assert piece.source_links == {0: "big-link"}
    # The pieces together should roughly cover the original text
    # (boundary whitespace may differ between chunker pieces).
    recombined = "".join(piece.content for piece in produced)
    for marker in ("Alpha", "omega", "twenty"):
        assert marker in recombined
|
||||
|
||||
|
||||
def test_oversized_section_flushes_pending_text_first() -> None:
    """Buffered text followed by an oversized section: the pending chunk
    is emitted first, then the pieces of the oversized section."""
    chunker = _make_document_chunker()
    buffered_text = "Pending buffered text."
    long_text = (
        "Alpha beta gamma. Delta epsilon zeta. Eta theta iota. "
        "Kappa lambda mu. Nu xi omicron. Pi rho sigma. Tau upsilon phi. "
        "Chi psi omega. One two three. Four five six. Seven eight nine. "
        "Ten eleven twelve. Thirteen fourteen fifteen. Sixteen seventeen."
    )
    assert len(long_text) > CHUNK_LIMIT

    document = _make_doc(
        sections=[
            Section(kind=SectionKind.TEXT, text=buffered_text, link="l-pending"),
            Section(kind=SectionKind.TEXT, text=long_text, link="l-big"),
        ],
    )

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    # The flushed pending text comes out first.
    flushed = produced[0]
    assert flushed.content == buffered_text
    assert flushed.source_links == {0: "l-pending"}
    assert flushed.section_continuation is False

    # Everything after it belongs to the oversized section.
    assert len(produced) >= 2
    for piece in produced[1:]:
        assert piece.source_links == {0: "l-big"}
    # Inside the oversized section the first piece is fresh and the
    # remaining pieces are continuations.
    assert produced[1].section_continuation is False
    for piece in produced[2:]:
        assert piece.section_continuation is True
|
||||
|
||||
|
||||
# --- Title prefix / metadata propagation -------------------------------------
|
||||
|
||||
|
||||
def test_title_prefix_and_metadata_propagate_to_all_chunks() -> None:
    """Every emitted chunk carries the same title prefix and both
    metadata suffixes that were passed to chunk()."""
    chunker = _make_document_chunker()
    document = _make_doc(
        sections=[
            Section(kind=SectionKind.TEXT, text="A" * 120, link="la"),
            Section(kind=SectionKind.TEXT, text="B" * 120, link="lb"),
        ],
    )

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="MY_TITLE\n",
        metadata_suffix_semantic="MS",
        metadata_suffix_keyword="MK",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(produced) == 2
    for piece in produced:
        assert piece.title_prefix == "MY_TITLE\n"
        assert piece.metadata_suffix_semantic == "MS"
        assert piece.metadata_suffix_keyword == "MK"
|
||||
|
||||
|
||||
# --- chunk_id monotonicity ---------------------------------------------------
|
||||
|
||||
|
||||
def test_chunk_ids_are_sequential_starting_at_zero() -> None:
    """chunk_id values run 0, 1, 2, … in emission order."""
    chunker = _make_document_chunker()
    document = _make_doc(
        sections=[
            Section(kind=SectionKind.TEXT, text="A" * 120, link="la"),
            Section(kind=SectionKind.TEXT, text="B" * 120, link="lb"),
            Section(kind=SectionKind.TEXT, text="C" * 120, link="lc"),
        ],
    )

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    observed_ids = [piece.chunk_id for piece in produced]
    assert observed_ids == list(range(len(produced)))
|
||||
|
||||
|
||||
# --- Overflow accumulation behavior ------------------------------------------
|
||||
|
||||
|
||||
def test_overflow_flush_then_subsequent_section_joins_new_chunk() -> None:
    """Once an overflow flush has opened a new chunk, a following section
    that fits must be merged into it rather than creating a third chunk."""
    chunker = _make_document_chunker()
    # 120 + 120 > 200, so the first two sections land in separate chunks.
    # The 20-char third section is small enough to share the second chunk.
    text_a, text_b, text_c = "A" * 120, "B" * 120, "C" * 20
    document = _make_doc(
        sections=[
            Section(kind=SectionKind.TEXT, text=text_a, link="la"),
            Section(kind=SectionKind.TEXT, text=text_b, link="lb"),
            Section(kind=SectionKind.TEXT, text=text_c, link="lc"),
        ],
    )

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(produced) == 2
    head, tail = produced
    assert head.content == text_a
    assert tail.content == text_b + SECTION_SEPARATOR + text_c
    # Second chunk offsets: lb at 0, lc at the precompare length of the
    # 120 "B" characters that preceded it.
    assert tail.source_links == {0: "lb", 120: "lc"}
|
||||
|
||||
|
||||
def test_small_section_after_oversized_starts_a_fresh_chunk() -> None:
    """After an oversized section has been emitted as its own chunks, the
    accumulator must be empty: a small section that follows starts a new
    chunk and is not absorbed into the oversized pieces."""
    chunker = _make_document_chunker()
    long_text = (
        "Alpha beta gamma. Delta epsilon zeta. Eta theta iota. "
        "Kappa lambda mu. Nu xi omicron. Pi rho sigma. Tau upsilon phi. "
        "Chi psi omega. One two three. Four five six. Seven eight nine. "
        "Ten eleven twelve. Thirteen fourteen fifteen. Sixteen seventeen."
    )
    assert len(long_text) > CHUNK_LIMIT
    document = _make_doc(
        sections=[
            Section(kind=SectionKind.TEXT, text=long_text, link="l-big"),
            Section(kind=SectionKind.TEXT, text="Tail text.", link="l-tail"),
        ],
    )

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    # The final chunk is the tail text, started fresh (not a
    # continuation); everything before it is the oversized section.
    assert len(produced) >= 2
    last = produced[-1]
    assert last.content == "Tail text."
    assert last.source_links == {0: "l-tail"}
    assert last.section_continuation is False
    # The tail link must not have leaked into the oversized pieces.
    for piece in produced[:-1]:
        assert piece.source_links == {0: "l-big"}
|
||||
|
||||
|
||||
# --- STRICT_CHUNK_TOKEN_LIMIT fallback path ----------------------------------
|
||||
|
||||
|
||||
def test_strict_chunk_token_limit_subdivides_oversized_split(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With STRICT_CHUNK_TOKEN_LIMIT on, a piece that chonkie's
    chunk_splitter leaves above content_token_limit (e.g. one long run
    with no period) must be subdivided via _split_oversized_chunk."""
    monkeypatch.setattr(text_chunker_module, "STRICT_CHUNK_TOKEN_LIMIT", True)
    chunker = _make_document_chunker()
    # 500 non-whitespace characters without any sentence boundary: chonkie
    # hands it back as a single piece (>200), which triggers the fallback.
    unbroken_run = "a" * 500
    run_section = Section(kind=SectionKind.TEXT, text=unbroken_run, link="l-run")
    document = _make_doc(sections=[run_section])

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    # CHUNK_LIMIT=200 over a 500-char run gives ceil(500/200)=3 sub-chunks.
    assert len(produced) == 3
    # The first is fresh; the others are continuations
    # (is_continuation=(j != 0)).
    first, second, third = produced
    assert first.section_continuation is False
    assert second.section_continuation is True
    assert third.section_continuation is True
    # The section link is attached to all of them.
    for piece in produced:
        assert piece.source_links == {0: "l-run"}
    # NOTE: this test deliberately does NOT check that each chunk is at or
    # below content_token_limit. _split_oversized_chunk joins tokens with
    # " ", so the joined content can exceed the limit when tokens are
    # short. That is a quirk of the current implementation; what is pinned
    # here is the window slicing, not the post-join length.
|
||||
|
||||
|
||||
def test_strict_chunk_token_limit_disabled_allows_oversized_split(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """The same unbroken 500-char input with STRICT disabled: the
    oversized piece is emitted unchanged as one chunk (current behavior)."""
    monkeypatch.setattr(text_chunker_module, "STRICT_CHUNK_TOKEN_LIMIT", False)
    chunker = _make_document_chunker()
    unbroken_run = "a" * 500
    run_section = Section(kind=SectionKind.TEXT, text=unbroken_run, link="l-run")
    document = _make_doc(sections=[run_section])

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(produced) == 1
    only = produced[0]
    assert only.content == unbroken_run
    assert only.section_continuation is False
|
||||
|
||||
|
||||
# --- First-section-with-empty-text-but-document-has-title edge case ----------
|
||||
|
||||
|
||||
def test_first_empty_section_with_title_is_processed_not_skipped() -> None:
    """Because of the guard `(not document.title or section_idx > 0)`, an
    empty first section of a titled document is NOT skipped. This test
    pins that behavior so a refactor cannot change it unnoticed."""
    chunker = _make_document_chunker()
    section_list = [
        Section(kind=SectionKind.TEXT, text="", link="l0"),  # empty but retained
        Section(kind=SectionKind.TEXT, text="Real content.", link="l1"),
    ]
    document = _make_doc(sections=section_list, title="Has A Title")

    produced = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(produced) == 1
    merged = produced[0]
    assert merged.content == "Real content."
    # The empty first section registers a link offset at 0. When
    # "Real content." is appended the offset is 0 again, because
    # shared_precompare_cleanup("") is still "", so l1 overwrites l0.
    # End state: {0: "l1"}
    assert merged.source_links == {0: "l1"}
|
||||
|
||||
|
||||
# --- clean_text is applied to section text -----------------------------------
|
||||
|
||||
|
||||
def test_clean_text_strips_control_chars_from_section_content() -> None:
    """Section text must pass through clean_text() before it is accumulated.

    Guards against a refactor dropping the clean_text() call: control
    characters in the input must not survive into the chunk content.
    """
    chunker = _make_document_chunker()
    # NUL (\x00) and BEL (\x07) are control chars below 0x20 that are neither
    # \n nor \t, so clean_text is expected to remove both.
    text_with_control_chars = "Hello\x00 World\x07!"
    dirty_section = Section(
        kind=SectionKind.TEXT, text=text_with_control_chars, link="l1"
    )
    document = _make_doc(sections=[dirty_section])

    chunks = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(chunks) == 1
    assert chunks[0].content == "Hello World!"
|
||||
|
||||
|
||||
# --- None-valued fields ------------------------------------------------------
|
||||
|
||||
|
||||
def test_section_with_none_text_behaves_like_empty_string() -> None:
    """A section whose text is None is treated exactly like an empty one.

    The chunker coerces with `str(section.text or "")`, so a None-text
    section is skipped unless it is the first section of a titled document.
    Here it sits at index 1 and must leave no trace in the output.
    """
    chunker = _make_document_chunker()
    alpha = Section(kind=SectionKind.TEXT, text="Alpha.", link="la")
    # Index 1 with no text: expected to be skipped.
    none_text = Section(kind=SectionKind.TEXT, text=None, link="lnone")
    beta = Section(kind=SectionKind.TEXT, text="Beta.", link="lb")
    document = _make_doc(sections=[alpha, none_text, beta])

    chunks = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    assert len(chunks) == 1
    merged_chunk = chunks[0]
    assert "Alpha." in merged_chunk.content
    assert "Beta." in merged_chunk.content
    # source_links may be None; fall back to an empty mapping before checking.
    recorded_links = merged_chunk.source_links or {}
    assert "lnone" not in recorded_links.values()
|
||||
|
||||
|
||||
# --- Trailing empty chunk suppression ----------------------------------------
|
||||
|
||||
|
||||
def test_no_trailing_empty_chunk_when_last_section_was_image() -> None:
    """No empty chunk is appended after a document that ends with an image.

    An image section emits its own chunk and resets chunk_text. Since
    `chunks` is already non-empty at that point, the trailing
    `or not chunks` safety branch must not fire.
    """
    chunker = _make_document_chunker()
    leading_text = Section(kind=SectionKind.TEXT, text="Leading text.", link="ltext")
    final_image = Section(
        kind=SectionKind.IMAGE,
        text="img summary",
        link="limg",
        image_file_id="img-final",
    )
    document = _make_doc(sections=[leading_text, final_image])

    chunks = chunker.chunk(
        document=document,
        sections=document.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    # Exactly two chunks is the key check: a third would be the unwanted
    # empty chunk appended at the end.
    assert len(chunks) == 2
    text_chunk = chunks[0]
    image_chunk = chunks[1]
    assert text_chunk.content == "Leading text."
    assert text_chunk.image_file_id is None
    assert image_chunk.content == "img summary"
    assert image_chunk.image_file_id == "img-final"
|
||||
|
||||
|
||||
def test_no_trailing_empty_chunk_when_last_section_was_oversized() -> None:
    """No empty chunk trails a document that ends with an oversized section.

    Same guarantee as for image sections: splitting an oversized section
    fully clears the accumulator, so the trailing safety branch should be a
    no-op and every emitted chunk must carry content.
    """
    dc = _make_document_chunker()
    big = (
        "Alpha beta gamma. Delta epsilon zeta. Eta theta iota. "
        "Kappa lambda mu. Nu xi omicron. Pi rho sigma. Tau upsilon phi. "
        "Chi psi omega. One two three. Four five six. Seven eight nine. "
        "Ten eleven twelve. Thirteen fourteen fifteen. Sixteen seventeen."
    )
    # Sanity check on the fixture: the section really is over the limit.
    assert len(big) > CHUNK_LIMIT
    doc = _make_doc(
        sections=[Section(kind=SectionKind.TEXT, text=big, link="l-big")]
    )

    chunks = dc.chunk(
        document=doc,
        sections=doc.processed_sections,
        title_prefix="",
        metadata_suffix_semantic="",
        metadata_suffix_keyword="",
        content_token_limit=CHUNK_LIMIT,
    )

    # all() is True for an empty iterable, so first make sure something was
    # emitted; otherwise the check below would pass vacuously.
    assert chunks, "expected at least one chunk for a non-empty section"
    # Every chunk should be non-empty — no dangling "" chunk at the tail.
    assert all(c.content.strip() for c in chunks)
|
||||
312
backend/tests/unit/onyx/indexing/test_tabular_section_chunker.py
Normal file
312
backend/tests/unit/onyx/indexing/test_tabular_section_chunker.py
Normal file
@@ -0,0 +1,312 @@
|
||||
"""End-to-end tests for `TabularChunker.chunk_section`.
|
||||
|
||||
Each test is structured as:
|
||||
INPUT — the CSV text passed to the chunker + token budget + link
|
||||
EXPECTED — the exact chunk texts the chunker should emit
|
||||
ACT — a single call to `chunk_section`
|
||||
ASSERT — literal equality against the expected chunk texts
|
||||
|
||||
A character-level tokenizer (1 char == 1 token) is used so token-budget
|
||||
arithmetic is deterministic and expected chunks can be spelled out
|
||||
exactly.
|
||||
"""
|
||||
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import SectionKind
|
||||
from onyx.indexing.chunking.section_chunker import AccumulatorState
|
||||
from onyx.indexing.chunking.tabular_section_chunker import TabularChunker
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
|
||||
|
||||
class CharTokenizer(BaseTokenizer):
    """Character-level tokenizer for tests: one character is one token."""

    def encode(self, string: str) -> list[int]:
        """Return the Unicode code point of each character, in order."""
        return list(map(ord, string))

    def tokenize(self, string: str) -> list[str]:
        """Return the characters of `string` as a list of 1-char tokens."""
        return [char for char in string]

    def decode(self, tokens: list[int]) -> str:
        """Inverse of `encode`: turn code points back into a string."""
        return "".join(map(chr, tokens))
|
||||
|
||||
|
||||
def _make_chunker() -> TabularChunker:
    """Build a `TabularChunker` that counts tokens with `CharTokenizer`."""
    char_tokenizer = CharTokenizer()
    return TabularChunker(tokenizer=char_tokenizer)
|
||||
|
||||
|
||||
def _tabular_section(text: str, link: str = "sheet:Test") -> Section:
    """Wrap `text` in a TABULAR-kind `Section` that carries `link`."""
    section = Section(kind=SectionKind.TABULAR, text=text, link=link)
    return section
|
||||
|
||||
|
||||
class TestTabularChunkerChunkSection:
    """End-to-end checks of `TabularChunker.chunk_section` on CSV text.

    Each test follows the INPUT / EXPECTED / ACT / ASSERT layout. The
    chunker is built with the character-level tokenizer, so every token
    budget below is simply a character count.
    """

    def test_simple_csv_all_rows_fit_one_chunk(self) -> None:
        """A small CSV under a generous budget yields one chunk with all rows."""
        # --- INPUT -----------------------------------------------------
        csv_text = (
            "Name,Age,City\n"
            "Alice,30,NYC\n"
            "Bob,25,SF\n"
        )
        link = "sheet:People"
        content_token_limit = 500

        # --- EXPECTED --------------------------------------------------
        expected_texts = [
            (
                "sheet:People\n"
                "Rows:\n"
                "Columns: Name, Age, City\n"
                "Name=Alice, Age=30, City=NYC\n"
                "Name=Bob, Age=25, City=SF"
            ),
        ]

        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=content_token_limit,
        )

        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts
        assert [p.is_continuation for p in out.payloads] == [False]
        assert all(p.links == {0: link} for p in out.payloads)
        assert out.accumulator.is_empty()

    def test_overflow_splits_into_two_deterministic_chunks(self) -> None:
        """Rows that exceed the row budget spill into a continuation chunk."""
        # --- INPUT -----------------------------------------------------
        # prelude = "sheet:S\nRows:\nColumns: col, val" (31 chars = 31 tokens)
        # At content_token_limit=57, row_budget = max(16, 57-31-1) = 25.
        # Each row "col=a, val=1" is 12 tokens; two rows + \n = 25 (fits),
        # three rows + 2×\n = 38 (overflows) → split after 2 rows.
        csv_text = (
            "col,val\n"
            "a,1\n"
            "b,2\n"
            "c,3\n"
            "d,4\n"
        )
        link = "sheet:S"
        content_token_limit = 57

        # --- EXPECTED --------------------------------------------------
        expected_texts = [
            (
                "sheet:S\n"
                "Rows:\n"
                "Columns: col, val\n"
                "col=a, val=1\n"
                "col=b, val=2"
            ),
            (
                "sheet:S\n"
                "Rows:\n"
                "Columns: col, val\n"
                "col=c, val=3\n"
                "col=d, val=4"
            ),
        ]

        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=content_token_limit,
        )

        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts
        # First chunk is fresh; subsequent chunks mark as continuations.
        assert [p.is_continuation for p in out.payloads] == [False, True]
        # Link carries through every chunk.
        assert all(p.links == {0: link} for p in out.payloads)

    def test_header_only_csv_produces_single_prelude_chunk(self) -> None:
        """A CSV with a header but no data rows still emits the prelude."""
        # --- INPUT -----------------------------------------------------
        csv_text = "col1,col2\n"
        link = "sheet:Headers"

        # --- EXPECTED --------------------------------------------------
        expected_texts = [
            "sheet:Headers\nRows:\nColumns: col1, col2",
        ]

        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=500,
        )

        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts

    def test_empty_cells_dropped_from_chunk_text(self) -> None:
        """Empty cells are omitted rather than rendered as `field=` pairs."""
        # --- INPUT -----------------------------------------------------
        # Alice's Age is empty; Bob's City is empty. Empty cells should
        # not appear as `field=` pairs in the output.
        csv_text = (
            "Name,Age,City\n"
            "Alice,,NYC\n"
            "Bob,25,\n"
        )
        link = "sheet:P"

        # --- EXPECTED --------------------------------------------------
        expected_texts = [
            (
                "sheet:P\n"
                "Rows:\n"
                "Columns: Name, Age, City\n"
                "Name=Alice, City=NYC\n"
                "Name=Bob, Age=25"
            ),
        ]

        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=500,
        )

        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts

    def test_quoted_commas_in_csv_preserved_as_one_field(self) -> None:
        """A quoted value containing a comma stays a single field."""
        # --- INPUT -----------------------------------------------------
        # "Hello, world" is quoted in the CSV, so it's a single field
        # value containing a comma — not two cells.
        csv_text = (
            'Name,Notes\n'
            'Alice,"Hello, world"\n'
        )
        link = "sheet:P"

        # --- EXPECTED --------------------------------------------------
        expected_texts = [
            (
                "sheet:P\n"
                "Rows:\n"
                "Columns: Name, Notes\n"
                "Name=Alice, Notes=Hello, world"
            ),
        ]

        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=500,
        )

        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts

    def test_blank_rows_in_csv_are_skipped(self) -> None:
        """Blank lines between data rows do not produce rows in the output."""
        # --- INPUT -----------------------------------------------------
        # Stray blank rows in the CSV (e.g. export artifacts) shouldn't
        # produce ghost rows in the output.
        csv_text = (
            "A,B\n"
            "\n"
            "1,2\n"
            "\n"
            "\n"
            "3,4\n"
        )
        link = "sheet:S"

        # --- EXPECTED --------------------------------------------------
        expected_texts = [
            (
                "sheet:S\n"
                "Rows:\n"
                "Columns: A, B\n"
                "A=1, B=2\n"
                "A=3, B=4"
            ),
        ]

        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=500,
        )

        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts

    def test_accumulator_flushes_before_tabular_chunks(self) -> None:
        """Pending accumulated text is emitted as its own chunk first."""
        # --- INPUT -----------------------------------------------------
        # A text accumulator was populated by the prior text section.
        # Tabular sections are structural boundaries, so the pending
        # text is flushed as its own chunk before the tabular content.
        pending_text = "prior paragraph from an earlier text section"
        pending_link = "prev-link"

        csv_text = (
            "a,b\n"
            "1,2\n"
        )
        link = "sheet:S"

        # --- EXPECTED --------------------------------------------------
        expected_texts = [
            pending_text,  # flushed accumulator
            (
                "sheet:S\n"
                "Rows:\n"
                "Columns: a, b\n"
                "a=1, b=2"
            ),
        ]

        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(
                text=pending_text,
                link_offsets={0: pending_link},
            ),
            content_token_limit=500,
        )

        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts
        # Flushed chunk keeps the prior text's link; tabular chunk uses
        # the tabular section's link.
        assert out.payloads[0].links == {0: pending_link}
        assert out.payloads[1].links == {0: link}
        # Accumulator resets — tabular section is a structural boundary.
        assert out.accumulator.is_empty()

    def test_empty_tabular_section_returns_no_payloads_and_preserves_accumulator(
        self,
    ) -> None:
        """An empty tabular section emits nothing and keeps pending text."""
        # --- INPUT -----------------------------------------------------
        # Malformed/empty tabular section should not flush the text
        # accumulator — the caller (DocumentChunker) handles skip logic;
        # we preserve the accumulator so subsequent sections can use it.
        pending_text = "prior paragraph"
        pending_link_offsets = {0: "prev-link"}

        # --- EXPECTED --------------------------------------------------
        expected_texts: list[str] = []
        expected_accumulator_text = pending_text
        # Independent copy, taken before the call: if the chunker mutated
        # the dict it was handed in place, an aliased expectation would
        # change with it and the final assertion could never fail.
        expected_accumulator_offsets = dict(pending_link_offsets)

        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section("", link="sheet:Empty"),
            AccumulatorState(
                text=pending_text,
                link_offsets=pending_link_offsets,
            ),
            content_token_limit=500,
        )

        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts
        assert out.accumulator.text == expected_accumulator_text
        assert out.accumulator.link_offsets == expected_accumulator_offsets
|
||||
@@ -5,7 +5,7 @@ home: https://www.onyx.app/
|
||||
sources:
|
||||
- "https://github.com/onyx-dot-app/onyx"
|
||||
type: application
|
||||
version: 0.4.43
|
||||
version: 0.4.44
|
||||
appVersion: latest
|
||||
annotations:
|
||||
category: Productivity
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
{{- /*
  Ingress for the MCP OAuth callback.

  Rendered only when both .Values.ingress.enabled and
  .Values.mcpServer.enabled are set. It routes the single exact path
  /mcp/oauth/callback on the API host to the webserver service, and
  declares a TLS entry for the same host.
*/ -}}
{{- if and .Values.ingress.enabled .Values.mcpServer.enabled -}}
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: {{ include "onyx.fullname" . }}-ingress-mcp-oauth-callback
  annotations:
    {{- /* Without an explicit className, fall back to the nginx class annotation. */}}
    {{- if not .Values.ingress.className }}
    kubernetes.io/ingress.class: nginx
    {{- end }}
    {{- /* NOTE(review): presumably cert-manager issues the TLS secret named below via this issuer — confirm. */}}
    cert-manager.io/cluster-issuer: {{ include "onyx.fullname" . }}-letsencrypt
spec:
  {{- if .Values.ingress.className }}
  ingressClassName: {{ .Values.ingress.className }}
  {{- end }}
  rules:
    - host: {{ .Values.ingress.api.host }}
      http:
        paths:
          {{- /* Exact match only: sub-paths of the callback are not routed by this Ingress. */}}
          - path: /mcp/oauth/callback
            pathType: Exact
            backend:
              service:
                name: {{ include "onyx.fullname" . }}-webserver
                port:
                  number: {{ .Values.webserver.service.servicePort }}
  tls:
    - hosts:
        - {{ .Values.ingress.api.host }}
      secretName: {{ include "onyx.fullname" . }}-ingress-mcp-oauth-callback-tls
{{- end }}
|
||||
@@ -63,7 +63,7 @@ func checkDevcontainerCLI() {
|
||||
}
|
||||
|
||||
// ensureDockerSock sets the DOCKER_SOCK environment variable if not already set.
|
||||
// devcontainer.json references ${localEnv:DOCKER_SOCK} for the socket mount.
|
||||
// Used by ensureRemoteUser to detect rootless Docker.
|
||||
func ensureDockerSock() {
|
||||
if os.Getenv("DOCKER_SOCK") != "" {
|
||||
return
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import "@opal/components/cards/card/styles.css";
|
||||
import type { PaddingVariants, RoundingVariants } from "@opal/types";
|
||||
import { cardPaddingVariants, cardRoundingVariants } from "@opal/shared";
|
||||
import { paddingVariants, cardRoundingVariants } from "@opal/shared";
|
||||
import { cn } from "@opal/utils";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -79,7 +79,7 @@ function Card({
|
||||
ref,
|
||||
children,
|
||||
}: CardProps) {
|
||||
const padding = cardPaddingVariants[paddingProp];
|
||||
const padding = paddingVariants[paddingProp];
|
||||
const rounding = cardRoundingVariants[roundingProp];
|
||||
|
||||
return (
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import "@opal/components/cards/select-card/styles.css";
|
||||
import type { PaddingVariants, RoundingVariants } from "@opal/types";
|
||||
import { cardPaddingVariants, cardRoundingVariants } from "@opal/shared";
|
||||
import { paddingVariants, cardRoundingVariants } from "@opal/shared";
|
||||
import { cn } from "@opal/utils";
|
||||
import { Interactive, type InteractiveStatefulProps } from "@opal/core";
|
||||
|
||||
@@ -78,7 +78,7 @@ function SelectCard({
|
||||
children,
|
||||
...statefulProps
|
||||
}: SelectCardProps) {
|
||||
const padding = cardPaddingVariants[paddingProp];
|
||||
const padding = paddingVariants[paddingProp];
|
||||
const rounding = cardRoundingVariants[roundingProp];
|
||||
|
||||
return (
|
||||
|
||||
@@ -15,6 +15,42 @@ export const Plain: Story = {
|
||||
render: () => <Divider />,
|
||||
};
|
||||
|
||||
/** Bare vertical divider placed between two labels in a fixed-height flex row. */
export const Vertical: Story = {
  render: () => {
    return (
      <div
        style={{ display: "flex", alignItems: "stretch", height: 64, gap: 16 }}
      >
        <span>Left</span>
        <Divider orientation="vertical" />
        <span>Right</span>
      </div>
    );
  },
};
|
||||
|
||||
/** Horizontal divider with both padding axes set to "fit". */
export const NoPadding: Story = {
  render: () => {
    return <Divider paddingParallel="fit" paddingPerpendicular="fit" />;
  },
};
|
||||
|
||||
/** Horizontal divider with non-default padding: "lg" along the line, "sm" across it. */
export const CustomPadding: Story = {
  render: () => {
    return <Divider paddingParallel="lg" paddingPerpendicular="sm" />;
  },
};
|
||||
|
||||
/** Vertical divider with both padding axes set to "fit", shown between two labels. */
export const VerticalNoPadding: Story = {
  render: () => {
    return (
      <div
        style={{ display: "flex", alignItems: "stretch", height: 64, gap: 16 }}
      >
        <span>Left</span>
        <Divider
          orientation="vertical"
          paddingParallel="fit"
          paddingPerpendicular="fit"
        />
        <span>Right</span>
      </div>
    );
  },
};
|
||||
|
||||
/** Horizontal divider that shows a title. */
export const WithTitle: Story = {
  render: () => {
    return <Divider title="Section" />;
  },
};
|
||||
|
||||
@@ -10,7 +10,13 @@ The component uses a discriminated union with four variants. `title` and `descri
|
||||
|
||||
### Bare divider
|
||||
|
||||
No props — renders a plain horizontal line.
|
||||
A plain line with no title or description.
|
||||
|
||||
| Prop | Type | Default | Description |
|
||||
|---|---|---|---|
|
||||
| `orientation` | `"horizontal" \| "vertical"` | `"horizontal"` | Direction of the line |
|
||||
| `paddingParallel` | `PaddingVariants` | `"sm"` | Padding along the line direction (0.5rem) |
|
||||
| `paddingPerpendicular` | `PaddingVariants` | `"xs"` | Padding perpendicular to the line (0.25rem) |
|
||||
|
||||
### Titled divider
|
||||
|
||||
@@ -40,9 +46,18 @@ No props — renders a plain horizontal line.
|
||||
```tsx
|
||||
import { Divider } from "@opal/components";
|
||||
|
||||
// Plain line
|
||||
// Plain horizontal line
|
||||
<Divider />
|
||||
|
||||
// Vertical line
|
||||
<Divider orientation="vertical" />
|
||||
|
||||
// No padding
|
||||
<Divider paddingParallel="fit" paddingPerpendicular="fit" />
|
||||
|
||||
// Custom padding
|
||||
<Divider paddingParallel="lg" paddingPerpendicular="sm" />
|
||||
|
||||
// With title
|
||||
<Divider title="Advanced" />
|
||||
|
||||
|
||||
@@ -2,16 +2,25 @@
|
||||
|
||||
import "@opal/components/divider/styles.css";
|
||||
import { useState, useCallback } from "react";
|
||||
import type { RichStr } from "@opal/types";
|
||||
import type { PaddingVariants, RichStr } from "@opal/types";
|
||||
import { Button, Text } from "@opal/components";
|
||||
import { SvgChevronRight } from "@opal/icons";
|
||||
import { Interactive } from "@opal/core";
|
||||
import { cn } from "@opal/utils";
|
||||
import { paddingXVariants, paddingYVariants } from "@opal/shared";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
interface DividerNeverFields {
|
||||
interface DividerSharedProps {
|
||||
ref?: React.Ref<HTMLDivElement>;
|
||||
title?: never;
|
||||
description?: never;
|
||||
foldable?: false;
|
||||
orientation?: never;
|
||||
paddingParallel?: never;
|
||||
paddingPerpendicular?: never;
|
||||
open?: never;
|
||||
defaultOpen?: never;
|
||||
onOpenChange?: never;
|
||||
@@ -19,36 +28,37 @@ interface DividerNeverFields {
|
||||
}
|
||||
|
||||
/** Plain line — no title, no description. */
|
||||
interface DividerBareProps extends DividerNeverFields {
|
||||
title?: never;
|
||||
description?: never;
|
||||
foldable?: false;
|
||||
ref?: React.Ref<HTMLDivElement>;
|
||||
}
|
||||
type DividerBareProps = Omit<
|
||||
DividerSharedProps,
|
||||
"orientation" | "paddingParallel" | "paddingPerpendicular"
|
||||
> & {
|
||||
/** Orientation of the line. Default: `"horizontal"`. */
|
||||
orientation?: "horizontal" | "vertical";
|
||||
/** Padding along the line direction. Default: `"sm"` (0.5rem). */
|
||||
paddingParallel?: PaddingVariants;
|
||||
/** Padding perpendicular to the line. Default: `"xs"` (0.25rem). */
|
||||
paddingPerpendicular?: PaddingVariants;
|
||||
};
|
||||
|
||||
/** Line with a title to the left. */
|
||||
interface DividerTitledProps extends DividerNeverFields {
|
||||
type DividerTitledProps = Omit<DividerSharedProps, "title"> & {
|
||||
title: string | RichStr;
|
||||
description?: never;
|
||||
foldable?: false;
|
||||
ref?: React.Ref<HTMLDivElement>;
|
||||
}
|
||||
};
|
||||
|
||||
/** Line with a description below. */
|
||||
interface DividerDescribedProps extends DividerNeverFields {
|
||||
title?: never;
|
||||
type DividerDescribedProps = Omit<DividerSharedProps, "description"> & {
|
||||
/** Description rendered below the divider line. */
|
||||
description: string | RichStr;
|
||||
foldable?: false;
|
||||
ref?: React.Ref<HTMLDivElement>;
|
||||
}
|
||||
};
|
||||
|
||||
/** Foldable — requires title, reveals children. */
|
||||
interface DividerFoldableProps {
|
||||
type DividerFoldableProps = Omit<
|
||||
DividerSharedProps,
|
||||
"title" | "foldable" | "open" | "defaultOpen" | "onOpenChange" | "children"
|
||||
> & {
|
||||
/** Title is required when foldable. */
|
||||
title: string | RichStr;
|
||||
foldable: true;
|
||||
description?: never;
|
||||
/** Controlled open state. */
|
||||
open?: boolean;
|
||||
/** Uncontrolled default open state. */
|
||||
@@ -57,8 +67,7 @@ interface DividerFoldableProps {
|
||||
onOpenChange?: (open: boolean) => void;
|
||||
/** Content revealed when open. */
|
||||
children?: React.ReactNode;
|
||||
ref?: React.Ref<HTMLDivElement>;
|
||||
}
|
||||
};
|
||||
|
||||
type DividerProps =
|
||||
| DividerBareProps
|
||||
@@ -75,12 +84,39 @@ function Divider(props: DividerProps) {
|
||||
return <FoldableDivider {...props} />;
|
||||
}
|
||||
|
||||
const { ref } = props;
|
||||
const title = "title" in props ? props.title : undefined;
|
||||
const description = "description" in props ? props.description : undefined;
|
||||
const {
|
||||
ref,
|
||||
title,
|
||||
description,
|
||||
orientation = "horizontal",
|
||||
paddingParallel = "sm",
|
||||
paddingPerpendicular = "xs",
|
||||
} = props;
|
||||
|
||||
if (orientation === "vertical") {
|
||||
return (
|
||||
<div
|
||||
ref={ref}
|
||||
className={cn(
|
||||
"opal-divider-vertical",
|
||||
paddingXVariants[paddingPerpendicular],
|
||||
paddingYVariants[paddingParallel]
|
||||
)}
|
||||
>
|
||||
<div className="opal-divider-line-vertical" />
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
return (
|
||||
<div ref={ref} className="opal-divider">
|
||||
<div
|
||||
ref={ref}
|
||||
className={cn(
|
||||
"opal-divider",
|
||||
paddingXVariants[paddingParallel],
|
||||
paddingYVariants[paddingPerpendicular]
|
||||
)}
|
||||
>
|
||||
<div className="opal-divider-row">
|
||||
{title && (
|
||||
<div className="opal-divider-title">
|
||||
|
||||
@@ -2,11 +2,13 @@
|
||||
Divider
|
||||
|
||||
A horizontal rule with optional title, foldable chevron, or description; the bare variant can also be rendered as a vertical line.
|
||||
Padding is controlled via Tailwind classes applied by the component.
|
||||
--------------------------------------------------------------------------- */
|
||||
|
||||
/* ── Horizontal ─────────────────────────────────────────────────────────────── */
|
||||
|
||||
.opal-divider {
|
||||
@apply flex flex-col w-full;
|
||||
padding: 0.25rem 0.5rem;
|
||||
gap: 0.75rem;
|
||||
}
|
||||
|
||||
@@ -29,6 +31,18 @@
|
||||
padding: 0px 2px;
|
||||
}
|
||||
|
||||
/* ── Vertical orientation ───────────────────────────────────────────────────── */
|
||||
|
||||
/* Outer wrapper of a vertical divider: a flex row that takes the full
   height of its parent. No padding is set here. */
.opal-divider-vertical {
  @apply flex flex-row h-full;
}

/* The line itself: 1px wide, background set via the border-01 utility.
   NOTE(review): flex-1 inside a row container also lets the line grow
   horizontally if the wrapper is ever wider than its padding plus 1px —
   confirm the wrapper is always shrink-wrapped by its parent. */
.opal-divider-line-vertical {
  @apply flex-1 w-px bg-border-01;
}
|
||||
|
||||
/* ── Foldable chevron ───────────────────────────────────────────────────────── */
|
||||
|
||||
/* Animates transform changes on the chevron (200ms, ease-in-out).
   Presumably this smooths the rotation when the section is toggled; the
   rule that applies the rotation is not in this part of the file. */
.opal-divider-chevron {
  @apply transition-transform duration-200 ease-in-out;
}
|
||||
|
||||
@@ -100,7 +100,7 @@ const heightVariants: Record<ExtremaSizeVariants, string> = {
|
||||
// - SelectCard (paddingVariant, roundingVariant)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const cardPaddingVariants: Record<PaddingVariants, string> = {
|
||||
const paddingVariants: Record<PaddingVariants, string> = {
|
||||
lg: "p-6",
|
||||
md: "p-4",
|
||||
sm: "p-2",
|
||||
@@ -109,6 +109,24 @@ const cardPaddingVariants: Record<PaddingVariants, string> = {
|
||||
fit: "p-0",
|
||||
};
|
||||
|
||||
/**
 * Maps each `PaddingVariants` key to a horizontal-only (`px-*`) padding
 * utility class, using the same step sizes as the `p-*` table above it.
 *
 * NOTE(review): the class names are written out as full literals,
 * presumably so the CSS tooling can detect them statically — confirm
 * before replacing this table with string concatenation.
 */
const paddingXVariants: Record<PaddingVariants, string> = {
  lg: "px-6",
  md: "px-4",
  sm: "px-2",
  xs: "px-1",
  "2xs": "px-0.5",
  fit: "px-0",
};
|
||||
|
||||
/**
 * Maps each `PaddingVariants` key to a vertical-only (`py-*`) padding
 * utility class, using the same step sizes as the `p-*` table above it.
 *
 * NOTE(review): the class names are written out as full literals,
 * presumably so the CSS tooling can detect them statically — confirm
 * before replacing this table with string concatenation.
 */
const paddingYVariants: Record<PaddingVariants, string> = {
  lg: "py-6",
  md: "py-4",
  sm: "py-2",
  xs: "py-1",
  "2xs": "py-0.5",
  fit: "py-0",
};
|
||||
|
||||
const cardRoundingVariants: Record<RoundingVariants, string> = {
|
||||
lg: "rounded-16",
|
||||
md: "rounded-12",
|
||||
@@ -122,7 +140,9 @@ export {
|
||||
type OverridableExtremaSizeVariants,
|
||||
type SizeVariants,
|
||||
containerSizeVariants,
|
||||
cardPaddingVariants,
|
||||
paddingVariants,
|
||||
paddingXVariants,
|
||||
paddingYVariants,
|
||||
cardRoundingVariants,
|
||||
widthVariants,
|
||||
heightVariants,
|
||||
|
||||
16
web/package-lock.json
generated
16
web/package-lock.json
generated
@@ -47,6 +47,7 @@
|
||||
"clsx": "^2.1.1",
|
||||
"cmdk": "^1.0.0",
|
||||
"cookies-next": "^5.1.0",
|
||||
"copy-to-clipboard": "^3.3.3",
|
||||
"date-fns": "^3.6.0",
|
||||
"docx-preview": "^0.3.7",
|
||||
"favicon-fetch": "^1.0.0",
|
||||
@@ -8843,6 +8844,15 @@
|
||||
"react": ">= 16.8.0"
|
||||
}
|
||||
},
|
||||
"node_modules/copy-to-clipboard": {
|
||||
"version": "3.3.3",
|
||||
"resolved": "https://registry.npmjs.org/copy-to-clipboard/-/copy-to-clipboard-3.3.3.tgz",
|
||||
"integrity": "sha512-2KV8NhB5JqC3ky0r9PMCAZKbUHSwtEo4CwCs0KXgruG43gX5PMqDEBbVU4OUzw2MuAWUfsuFmWvEKG5QRfSnJA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"toggle-selection": "^1.0.6"
|
||||
}
|
||||
},
|
||||
"node_modules/core-js": {
|
||||
"version": "3.46.0",
|
||||
"hasInstallScript": true,
|
||||
@@ -17426,6 +17436,12 @@
|
||||
"node": ">=8.0"
|
||||
}
|
||||
},
|
||||
"node_modules/toggle-selection": {
|
||||
"version": "1.0.6",
|
||||
"resolved": "https://registry.npmjs.org/toggle-selection/-/toggle-selection-1.0.6.tgz",
|
||||
"integrity": "sha512-BiZS+C1OS8g/q2RRbJmy59xpyghNBqrr6k5L/uKBGRsTfxmu3ffiRnd8mlGPUVayg8pvfi5urfnu8TU7DVOkLQ==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/toposort": {
|
||||
"version": "2.0.2",
|
||||
"license": "MIT"
|
||||
|
||||
@@ -65,6 +65,7 @@
|
||||
"clsx": "^2.1.1",
|
||||
"cmdk": "^1.0.0",
|
||||
"cookies-next": "^5.1.0",
|
||||
"copy-to-clipboard": "^3.3.3",
|
||||
"date-fns": "^3.6.0",
|
||||
"docx-preview": "^0.3.7",
|
||||
"favicon-fetch": "^1.0.0",
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import Modal from "@/refresh-components/Modal";
|
||||
import { Button } from "@opal/components";
|
||||
import { CloudEmbeddingModel } from "../../../../components/embedding/interfaces";
|
||||
import { markdown } from "@opal/utils";
|
||||
import { SvgCheck } from "@opal/icons";
|
||||
|
||||
export interface AlreadyPickedModalProps {
|
||||
@@ -17,7 +18,7 @@ export default function AlreadyPickedModal({
|
||||
<Modal.Content width="sm" height="sm">
|
||||
<Modal.Header
|
||||
icon={SvgCheck}
|
||||
title={`${model.model_name} already chosen`}
|
||||
title={markdown(`*${model.model_name}* already chosen`)}
|
||||
description="You can select a different one if you want!"
|
||||
onClose={onClose}
|
||||
/>
|
||||
|
||||
@@ -12,6 +12,7 @@ import {
|
||||
getFormattedProviderName,
|
||||
} from "@/components/embedding/interfaces";
|
||||
import { EMBEDDING_PROVIDERS_ADMIN_URL } from "@/lib/llmConfig/constants";
|
||||
import { markdown } from "@opal/utils";
|
||||
import { mutate } from "swr";
|
||||
import { SWR_KEYS } from "@/lib/swr-keys";
|
||||
import { testEmbedding } from "@/app/admin/embeddings/pages/utils";
|
||||
@@ -172,9 +173,11 @@ export default function ChangeCredentialsModal({
|
||||
<Modal.Content>
|
||||
<Modal.Header
|
||||
icon={SvgSettings}
|
||||
title={`Modify your ${getFormattedProviderName(
|
||||
provider.provider_type
|
||||
)} ${isProxy ? "Configuration" : "key"}`}
|
||||
title={markdown(
|
||||
`Modify your *${getFormattedProviderName(
|
||||
provider.provider_type
|
||||
)}* ${isProxy ? "configuration" : "key"}`
|
||||
)}
|
||||
onClose={onCancel}
|
||||
/>
|
||||
<Modal.Body>
|
||||
|
||||
@@ -7,6 +7,7 @@ import {
|
||||
getFormattedProviderName,
|
||||
} from "../../../../components/embedding/interfaces";
|
||||
import { SvgTrash } from "@opal/icons";
|
||||
import { markdown } from "@opal/utils";
|
||||
|
||||
export interface DeleteCredentialsModalProps {
|
||||
modelProvider: CloudEmbeddingProvider;
|
||||
@@ -24,9 +25,11 @@ export default function DeleteCredentialsModal({
|
||||
<Modal.Content width="sm" height="sm">
|
||||
<Modal.Header
|
||||
icon={SvgTrash}
|
||||
title={`Delete ${getFormattedProviderName(
|
||||
modelProvider.provider_type
|
||||
)} Credentials?`}
|
||||
title={markdown(
|
||||
`Delete *${getFormattedProviderName(
|
||||
modelProvider.provider_type
|
||||
)}* credentials?`
|
||||
)}
|
||||
onClose={onCancel}
|
||||
/>
|
||||
<Modal.Body>
|
||||
|
||||
@@ -12,6 +12,7 @@ import {
|
||||
} from "@/components/embedding/interfaces";
|
||||
import { EMBEDDING_PROVIDERS_ADMIN_URL } from "@/lib/llmConfig/constants";
|
||||
import Modal from "@/refresh-components/Modal";
|
||||
import { markdown } from "@opal/utils";
|
||||
import { SvgSettings } from "@opal/icons";
|
||||
import SimpleLoader from "@/refresh-components/loaders/SimpleLoader";
|
||||
export interface ProviderCreationModalProps {
|
||||
@@ -185,9 +186,11 @@ export default function ProviderCreationModal({
|
||||
<Modal.Content width="sm" height="sm">
|
||||
<Modal.Header
|
||||
icon={SvgSettings}
|
||||
title={`Configure ${getFormattedProviderName(
|
||||
selectedProvider.provider_type
|
||||
)}`}
|
||||
title={markdown(
|
||||
`Configure *${getFormattedProviderName(
|
||||
selectedProvider.provider_type
|
||||
)}*`
|
||||
)}
|
||||
onClose={onCancel}
|
||||
/>
|
||||
<Modal.Body>
|
||||
|
||||
@@ -2,6 +2,7 @@ import Modal from "@/refresh-components/Modal";
|
||||
import { Button } from "@opal/components";
|
||||
import Text from "@/refresh-components/texts/Text";
|
||||
import { CloudEmbeddingModel } from "@/components/embedding/interfaces";
|
||||
import { markdown } from "@opal/utils";
|
||||
import { SvgServer } from "@opal/icons";
|
||||
|
||||
export interface SelectModelModalProps {
|
||||
@@ -20,7 +21,7 @@ export default function SelectModelModal({
|
||||
<Modal.Content width="sm" height="sm">
|
||||
<Modal.Header
|
||||
icon={SvgServer}
|
||||
title={`Select ${model.model_name}`}
|
||||
title={markdown(`Select *${model.model_name}*`)}
|
||||
onClose={onCancel}
|
||||
/>
|
||||
<Modal.Body>
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"use client";
|
||||
|
||||
import { toast } from "@/hooks/useToast";
|
||||
import { markdown } from "@opal/utils";
|
||||
|
||||
import EmbeddingModelSelection from "../EmbeddingModelSelectionForm";
|
||||
import { useCallback, useEffect, useMemo, useState, useRef } from "react";
|
||||
@@ -538,7 +539,9 @@ export default function EmbeddingForm() {
|
||||
<Modal.Content>
|
||||
<Modal.Header
|
||||
icon={SvgAlertTriangle}
|
||||
title={`Are you sure you want to select ${selectedProvider.model_name}?`}
|
||||
title={markdown(
|
||||
`Are you sure you want to select *${selectedProvider.model_name}*?`
|
||||
)}
|
||||
onClose={() => setShowPoorModel(false)}
|
||||
/>
|
||||
<Modal.Body>
|
||||
|
||||
@@ -210,8 +210,10 @@ export default function MultiModelResponseView({
|
||||
const response = responses.find((r) => r.modelIndex === modelIndex);
|
||||
if (!response) return;
|
||||
|
||||
// Persist preferred response to backend + update local tree so the
|
||||
// input bar unblocks (awaitingPreferredSelection clears).
|
||||
// Persist preferred response + sync `latestChildNodeId`. Backend's
|
||||
// `set_preferred_response` updates `latest_child_message_id`; if the
|
||||
// frontend chain walk disagrees, the next follow-up fails with
|
||||
// "not on the latest mainline".
|
||||
if (parentMessage?.messageId && response.messageId && currentSessionId) {
|
||||
setPreferredResponse(parentMessage.messageId, response.messageId).catch(
|
||||
(err) => console.error("Failed to persist preferred response:", err)
|
||||
@@ -227,6 +229,7 @@ export default function MultiModelResponseView({
|
||||
updated.set(parentMessage.nodeId, {
|
||||
...userMsg,
|
||||
preferredResponseId: response.messageId,
|
||||
latestChildNodeId: response.nodeId,
|
||||
});
|
||||
updateSessionMessageTree(currentSessionId, updated);
|
||||
}
|
||||
|
||||
@@ -137,7 +137,7 @@ function DeleteConfirmModal({ hook, onDelete }: DeleteConfirmModalProps) {
|
||||
<Modal.Header
|
||||
// TODO(@raunakab): replace the colour of this SVG with red.
|
||||
icon={SvgTrash}
|
||||
title={`Delete ${hook.name}`}
|
||||
title={markdown(`Delete *${hook.name}*`)}
|
||||
onClose={onClose}
|
||||
/>
|
||||
<Modal.Body>
|
||||
|
||||
@@ -694,6 +694,25 @@ export function useLlmManager(
|
||||
prevAgentIdRef.current = liveAgent?.id;
|
||||
}, [liveAgent?.id]);
|
||||
|
||||
// Clear manual override when arriving at a *different* existing session
|
||||
// from any previously-seen defined session. Tracks only the last
|
||||
// *defined* session id so a round-trip through new-chat (A → undefined
|
||||
// → B) still resets, while A → undefined (new-chat) preserves it.
|
||||
const prevDefinedSessionIdRef = useRef<string | undefined>(undefined);
|
||||
useEffect(() => {
|
||||
const nextId = currentChatSession?.id;
|
||||
if (
|
||||
nextId !== undefined &&
|
||||
prevDefinedSessionIdRef.current !== undefined &&
|
||||
nextId !== prevDefinedSessionIdRef.current
|
||||
) {
|
||||
setUserHasManuallyOverriddenLLM(false);
|
||||
}
|
||||
if (nextId !== undefined) {
|
||||
prevDefinedSessionIdRef.current = nextId;
|
||||
}
|
||||
}, [currentChatSession?.id]);
|
||||
|
||||
function getValidLlmDescriptor(
|
||||
modelName: string | null | undefined
|
||||
): LlmDescriptor {
|
||||
@@ -715,8 +734,9 @@ export function useLlmManager(
|
||||
|
||||
if (llmProviders === undefined || llmProviders === null) {
|
||||
resolved = manualLlm;
|
||||
} else if (userHasManuallyOverriddenLLM && !currentChatSession) {
|
||||
// User has overridden in this session and switched to a new session
|
||||
} else if (userHasManuallyOverriddenLLM) {
|
||||
// Manual override wins over session's `current_alternate_model`.
|
||||
// Cleared on cross-session navigation by the effect above.
|
||||
resolved = manualLlm;
|
||||
} else if (currentChatSession?.current_alternate_model) {
|
||||
resolved = getValidLlmDescriptorForProviders(
|
||||
@@ -728,8 +748,6 @@ export function useLlmManager(
|
||||
liveAgent.llm_model_version_override,
|
||||
llmProviders
|
||||
);
|
||||
} else if (userHasManuallyOverriddenLLM) {
|
||||
resolved = manualLlm;
|
||||
} else if (user?.preferences?.default_model) {
|
||||
resolved = getValidLlmDescriptorForProviders(
|
||||
user.preferences.default_model,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"use client";
|
||||
|
||||
import { useEffect, useRef, useState } from "react";
|
||||
import copy from "copy-to-clipboard";
|
||||
import { Button, ButtonProps } from "@opal/components";
|
||||
import { SvgAlertTriangle, SvgCheck, SvgCopy } from "@opal/icons";
|
||||
|
||||
@@ -40,26 +41,19 @@ export default function CopyIconButton({
|
||||
}
|
||||
|
||||
try {
|
||||
// Check if Clipboard API is available
|
||||
if (!navigator.clipboard) {
|
||||
throw new Error("Clipboard API not available");
|
||||
}
|
||||
|
||||
// If HTML content getter is provided, copy both HTML and plain text
|
||||
if (getHtmlContent) {
|
||||
if (navigator.clipboard && getHtmlContent) {
|
||||
const htmlContent = getHtmlContent();
|
||||
const clipboardItem = new ClipboardItem({
|
||||
"text/html": new Blob([htmlContent], { type: "text/html" }),
|
||||
"text/plain": new Blob([text], { type: "text/plain" }),
|
||||
});
|
||||
await navigator.clipboard.write([clipboardItem]);
|
||||
}
|
||||
// Default: plain text only
|
||||
else {
|
||||
} else if (navigator.clipboard) {
|
||||
await navigator.clipboard.writeText(text);
|
||||
} else if (!copy(text)) {
|
||||
throw new Error("copy-to-clipboard returned false");
|
||||
}
|
||||
|
||||
// Show "copied" state
|
||||
setCopyState("copied");
|
||||
} catch (err) {
|
||||
console.error("Failed to copy:", err);
|
||||
|
||||
@@ -159,9 +159,12 @@ export default function ModelSelector({
|
||||
);
|
||||
|
||||
if (!isMultiModel) {
|
||||
// Stable key — keying on model would unmount the pill
|
||||
// on change and leave Radix's anchorRef detached,
|
||||
// flashing the closing popover at (0,0).
|
||||
return (
|
||||
<OpenButton
|
||||
key={modelKey(model.provider, model.modelName)}
|
||||
key="single-model-pill"
|
||||
icon={ProviderIcon}
|
||||
onClick={(e: React.MouseEvent) =>
|
||||
handlePillClick(index, e.currentTarget as HTMLElement)
|
||||
|
||||
@@ -425,16 +425,27 @@ export default function AppPage({ firstMessage }: ChatPageProps) {
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
}, [multiModel.isMultiModelActive]);
|
||||
|
||||
// Sync single-model selection to llmManager so the submission path
|
||||
// uses the correct provider/version (replaces the old LLMPopover sync).
|
||||
// Sync single-model selection to llmManager so the submission path uses
|
||||
// the correct provider/version. Guard against echoing derived state back
|
||||
// — only call updateCurrentLlm when the selection actually differs from
|
||||
// currentLlm, otherwise the initial [] → [currentLlmModel] sync would
|
||||
// pin `userHasManuallyOverriddenLLM=true` with whatever was resolved
|
||||
// first (often the default model before the session's alt_model loads).
|
||||
useEffect(() => {
|
||||
if (multiModel.selectedModels.length === 1) {
|
||||
const model = multiModel.selectedModels[0]!;
|
||||
llmManager.updateCurrentLlm({
|
||||
name: model.name,
|
||||
provider: model.provider,
|
||||
modelName: model.modelName,
|
||||
});
|
||||
const current = llmManager.currentLlm;
|
||||
if (
|
||||
model.provider !== current.provider ||
|
||||
model.modelName !== current.modelName ||
|
||||
model.name !== current.name
|
||||
) {
|
||||
llmManager.updateCurrentLlm({
|
||||
name: model.name,
|
||||
provider: model.provider,
|
||||
modelName: model.modelName,
|
||||
});
|
||||
}
|
||||
}
|
||||
}, [multiModel.selectedModels]);
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ import { usePathname, useRouter } from "next/navigation";
|
||||
import * as InputLayouts from "@/layouts/input-layouts";
|
||||
import { Section, AttachmentItemLayout } from "@/layouts/general-layouts";
|
||||
import { Content, ContentAction } from "@opal/layouts";
|
||||
import { markdown } from "@opal/utils";
|
||||
import { Formik, Form } from "formik";
|
||||
import * as Yup from "yup";
|
||||
import {
|
||||
@@ -1556,7 +1557,7 @@ function FederatedConnectorCard({
|
||||
{showDisconnectConfirmation && (
|
||||
<ConfirmationModalLayout
|
||||
icon={SvgUnplug}
|
||||
title={`Disconnect ${sourceMetadata.displayName}`}
|
||||
title={markdown(`Disconnect *${sourceMetadata.displayName}*`)}
|
||||
onClose={() => setShowDisconnectConfirmation(false)}
|
||||
submit={
|
||||
<Button
|
||||
|
||||
@@ -4,7 +4,7 @@ import { useCallback, useState } from "react";
|
||||
import { Button } from "@opal/components";
|
||||
// TODO(@raunakab): migrate to Opal LineItemButton once it supports danger variant
|
||||
import LineItem from "@/refresh-components/buttons/LineItem";
|
||||
import { cn } from "@opal/utils";
|
||||
import { cn, markdown } from "@opal/utils";
|
||||
import {
|
||||
SvgMoreHorizontal,
|
||||
SvgEdit,
|
||||
@@ -341,7 +341,7 @@ export default function AgentRowActions({
|
||||
{unlistOpen && (
|
||||
<ConfirmationModalLayout
|
||||
icon={SvgEyeOff}
|
||||
title={`Unlist ${agent.name}`}
|
||||
title={markdown(`Unlist *${agent.name}*`)}
|
||||
onClose={isSubmitting ? undefined : () => setUnlistOpen(false)}
|
||||
submit={
|
||||
<Button
|
||||
|
||||
@@ -347,7 +347,7 @@ export default function ImageGenerationContent() {
|
||||
{disconnectProvider && (
|
||||
<ConfirmationModalLayout
|
||||
icon={SvgUnplug}
|
||||
title={`Disconnect ${disconnectProvider.title}`}
|
||||
title={markdown(`Disconnect *${disconnectProvider.title}*`)}
|
||||
description="This will remove the stored credentials for this provider."
|
||||
onClose={() => {
|
||||
setDisconnectProvider(null);
|
||||
|
||||
@@ -201,7 +201,7 @@ function VoiceDisconnectModal({
|
||||
return (
|
||||
<ConfirmationModalLayout
|
||||
icon={SvgUnplug}
|
||||
title={`Disconnect ${disconnectTarget.providerLabel}`}
|
||||
title={markdown(`Disconnect *${disconnectTarget.providerLabel}*`)}
|
||||
description="Voice models"
|
||||
onClose={onClose}
|
||||
submit={
|
||||
|
||||
@@ -9,6 +9,7 @@ import Modal from "@/refresh-components/Modal";
|
||||
import { Button } from "@opal/components";
|
||||
|
||||
import { SvgArrowExchange } from "@opal/icons";
|
||||
import { markdown } from "@opal/utils";
|
||||
import { SvgOnyxLogo } from "@opal/logos";
|
||||
import type { IconProps } from "@opal/types";
|
||||
|
||||
@@ -81,7 +82,7 @@ export const WebProviderSetupModal = memo(
|
||||
<Modal.Content width="sm" preventAccidentalClose>
|
||||
<Modal.Header
|
||||
icon={LogoArrangement}
|
||||
title={`Set up ${providerLabel}`}
|
||||
title={markdown(`Set up *${providerLabel}*`)}
|
||||
description={description}
|
||||
onClose={onClose}
|
||||
/>
|
||||
|
||||
@@ -7,6 +7,7 @@ import Text from "@/refresh-components/texts/Text";
|
||||
import { Section } from "@/layouts/general-layouts";
|
||||
import * as SettingsLayouts from "@/layouts/settings-layouts";
|
||||
import { Content, Card } from "@opal/layouts";
|
||||
import { markdown } from "@opal/utils";
|
||||
import useSWR from "swr";
|
||||
import { errorHandlingFetcher, FetchError } from "@/lib/fetcher";
|
||||
import { SWR_KEYS } from "@/lib/swr-keys";
|
||||
@@ -146,7 +147,7 @@ function WebSearchDisconnectModal({
|
||||
return (
|
||||
<ConfirmationModalLayout
|
||||
icon={SvgUnplug}
|
||||
title={`Disconnect ${disconnectTarget.label}`}
|
||||
title={markdown(`Disconnect *${disconnectTarget.label}*`)}
|
||||
description="This will remove the stored credentials for this provider."
|
||||
onClose={onClose}
|
||||
submit={
|
||||
|
||||
@@ -5,6 +5,7 @@ import Modal from "@/refresh-components/Modal";
|
||||
import { Button } from "@opal/components";
|
||||
import Text from "@/refresh-components/texts/Text";
|
||||
import { cn } from "@/lib/utils";
|
||||
import { markdown } from "@opal/utils";
|
||||
import { SvgUnplug } from "@opal/icons";
|
||||
interface DisconnectEntityModalProps {
|
||||
isOpen: boolean;
|
||||
@@ -51,7 +52,7 @@ export default function DisconnectEntityModal({
|
||||
icon={({ className }) => (
|
||||
<SvgUnplug className={cn(className, "stroke-action-danger-05")} />
|
||||
)}
|
||||
title={`Disconnect ${name}`}
|
||||
title={markdown(`Disconnect *${name}*`)}
|
||||
onClose={onClose}
|
||||
/>
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ import InputSelect from "@/refresh-components/inputs/InputSelect";
|
||||
import InputTypeIn from "@/refresh-components/inputs/InputTypeIn";
|
||||
import PasswordInputTypeIn from "@/refresh-components/inputs/PasswordInputTypeIn";
|
||||
import { Button } from "@opal/components";
|
||||
import { markdown } from "@opal/utils";
|
||||
import CopyIconButton from "@/refresh-components/buttons/CopyIconButton";
|
||||
import Text from "@/refresh-components/texts/Text";
|
||||
import { Formik, Form } from "formik";
|
||||
@@ -317,7 +318,11 @@ export default function MCPAuthenticationModal({
|
||||
<Modal.Content width="sm" height="lg" skipOverlay={skipOverlay}>
|
||||
<Modal.Header
|
||||
icon={SvgArrowExchange}
|
||||
title={`Authenticate ${mcpServer?.name || "MCP Server"}`}
|
||||
title={
|
||||
mcpServer
|
||||
? markdown(`Authenticate *${mcpServer.name}*`)
|
||||
: "Authenticate MCP Server"
|
||||
}
|
||||
description="Authenticate your connection to start using the MCP server."
|
||||
/>
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import React, { useEffect, useRef, useState } from "react";
|
||||
import { Formik, Form, useFormikContext } from "formik";
|
||||
import type { FormikConfig } from "formik";
|
||||
import { cn } from "@/lib/utils";
|
||||
import { markdown } from "@opal/utils";
|
||||
import { Interactive } from "@opal/core";
|
||||
import { usePaidEnterpriseFeaturesEnabled } from "@/components/settings/usePaidEnterpriseFeaturesEnabled";
|
||||
import { useAgents } from "@/hooks/useAgents";
|
||||
@@ -720,7 +721,7 @@ function ModalWrapperInner({
|
||||
} = getProvider(providerName);
|
||||
|
||||
const title = llmProvider
|
||||
? `Configure "${llmProvider.name}"`
|
||||
? markdown(`Configure *${llmProvider.name}*`)
|
||||
: `Set up ${providerProductName}`;
|
||||
const description =
|
||||
descriptionOverride ??
|
||||
|
||||
Reference in New Issue
Block a user