Compare commits

..

37 Commits

Author SHA1 Message Date
Dane Urban
99671b365d . 2026-04-14 17:12:12 -07:00
Dane Urban
2facb367c2 . 2026-04-14 17:12:12 -07:00
Dane Urban
8898a5ac57 Connectors output TabularSection 2026-04-14 17:12:12 -07:00
Dane Urban
a2f7221bc4 Rework 2026-04-14 17:12:08 -07:00
Dane Urban
0d1452cc06 Change dispatcher 2026-04-14 17:11:39 -07:00
Dane Urban
c432d39ec0 . 2026-04-14 17:11:10 -07:00
Dane Urban
cdc61a155c . 2026-04-14 17:11:01 -07:00
Dane Urban
fb41378796 Tabular log 2026-04-14 17:10:27 -07:00
Dane Urban
46e6e1173e . 2026-04-14 17:07:32 -07:00
Dane Urban
b1165c7408 . 2026-04-14 16:49:12 -07:00
Dane Urban
ea5769bc4a nits 2026-04-14 16:31:17 -07:00
Dane Urban
8288483146 Reduce tokenizer usage 2026-04-14 16:26:00 -07:00
Dane Urban
7437efe808 Convert from generator to list 2026-04-14 16:06:16 -07:00
Dane Urban
b61b59ffba . 2026-04-14 15:51:27 -07:00
Dane Urban
bc3b5600bb . 2026-04-14 15:41:37 -07:00
Dane Urban
80c48ff80e refresh 2026-04-14 15:41:37 -07:00
Dane Urban
8cd205a2e5 . 2026-04-14 15:41:37 -07:00
Dane Urban
722a159e2e . 2026-04-14 15:41:37 -07:00
Dane Urban
8ac49fa587 Tabular Chunker 2026-04-14 15:41:37 -07:00
Dane Urban
b5c215ab91 . 2026-04-14 15:40:47 -07:00
Dane Urban
2c456bebf6 . 2026-04-14 15:39:34 -07:00
Dane Urban
dc413abbba Tabular log 2026-04-14 15:39:11 -07:00
Dane Urban
ff1b3e88de Rework 2026-04-14 15:37:39 -07:00
Dane Urban
ac39e2031c Change dispatcher 2026-04-14 15:37:39 -07:00
Dane Urban
70f5a906d1 . 2026-04-14 15:37:39 -07:00
Dane Urban
f2b4adc70a Refactor stuff 2026-04-14 15:37:39 -07:00
Dane Urban
390a102f8a Add kind to the Section model 2026-04-14 15:37:39 -07:00
Dane Urban
9978f5c2d5 Precommit 2026-04-14 15:37:39 -07:00
Dane Urban
cdf655e9f6 Minor fixes 2026-04-14 15:37:39 -07:00
Dane Urban
2aaf0b2f38 Rework 2026-04-14 15:37:39 -07:00
Dane Urban
5fab2900c5 Change dispatcher 2026-04-14 15:37:39 -07:00
Dane Urban
41eaa0c542 . 2026-04-14 15:37:39 -07:00
Dane Urban
a496fc27e4 . 2026-04-14 15:37:39 -07:00
Dane Urban
ae34379ba6 Refactor stuff 2026-04-14 15:37:39 -07:00
Dane Urban
8f2bbe57f3 . 2026-04-14 15:37:39 -07:00
Dane Urban
3294edbc72 Resolve nit 2026-04-14 15:37:39 -07:00
Dane Urban
efb2d11864 Add kind to the Section model 2026-04-14 15:37:39 -07:00
29 changed files with 1188 additions and 1413 deletions

View File

@@ -2,7 +2,6 @@ FROM ubuntu:26.04@sha256:cc925e589b7543b910fea57a240468940003fbfc0515245a495dd0a
RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
default-jre \
fd-find \
fzf \
git \

View File

@@ -1,6 +1,6 @@
{
"name": "Onyx Dev Sandbox",
"image": "onyxdotapp/onyx-devcontainer@sha256:0f02d9299928849c7b15f3b348dcfdcdcb64411ff7a4580cbc026a6ee7aa1554",
"image": "onyxdotapp/onyx-devcontainer@sha256:12184169c5bcc9cca0388286d5ffe504b569bc9c37bfa631b76ee8eee2064055",
"runArgs": ["--cap-add=NET_ADMIN", "--cap-add=NET_RAW"],
"mounts": [
"source=${localEnv:HOME}/.claude,target=/home/dev/.claude,type=bind",

View File

@@ -96,14 +96,11 @@ def get_model_app() -> FastAPI:
title="Onyx Model Server", version=__version__, lifespan=lifespan
)
if SENTRY_DSN:
from onyx.configs.sentry import _add_instance_tags
sentry_sdk.init(
dsn=SENTRY_DSN,
integrations=[StarletteIntegration(), FastApiIntegration()],
traces_sample_rate=0.1,
release=__version__,
before_send=_add_instance_tags,
)
logger.info("Sentry initialized")
else:

View File

@@ -63,14 +63,11 @@ logger = setup_logger()
task_logger = get_task_logger(__name__)
if SENTRY_DSN:
from onyx.configs.sentry import _add_instance_tags
sentry_sdk.init(
dsn=SENTRY_DSN,
integrations=[CeleryIntegration()],
traces_sample_rate=0.1,
release=__version__,
before_send=_add_instance_tags,
)
logger.info("Sentry initialized")
else:

View File

@@ -135,13 +135,10 @@ def _docfetching_task(
# Since connector_indexing_proxy_task spawns a new process using this function as
# the entrypoint, we init Sentry here.
if SENTRY_DSN:
from onyx.configs.sentry import _add_instance_tags
sentry_sdk.init(
dsn=SENTRY_DSN,
traces_sample_rate=0.1,
release=__version__,
before_send=_add_instance_tags,
)
logger.info("Sentry initialized")
else:

View File

@@ -5,7 +5,6 @@ from datetime import datetime
from datetime import timedelta
from datetime import timezone
import sentry_sdk
from celery import Celery
from sqlalchemy.orm import Session
@@ -557,27 +556,6 @@ def connector_document_extraction(
# save record of any failures at the connector level
if failure is not None:
if failure.exception is not None:
with sentry_sdk.new_scope() as scope:
scope.set_tag("stage", "connector_fetch")
scope.set_tag("connector_source", db_connector.source.value)
scope.set_tag("cc_pair_id", str(cc_pair_id))
scope.set_tag("index_attempt_id", str(index_attempt_id))
scope.set_tag("tenant_id", tenant_id)
if failure.failed_document:
scope.set_tag(
"doc_id", failure.failed_document.document_id
)
if failure.failed_entity:
scope.set_tag(
"entity_id", failure.failed_entity.entity_id
)
scope.fingerprint = [
"connector-fetch-failure",
db_connector.source.value,
type(failure.exception).__name__,
]
sentry_sdk.capture_exception(failure.exception)
total_failures += 1
with get_session_with_current_tenant() as db_session:
create_index_attempt_error(

View File

@@ -1,48 +0,0 @@
from typing import Any
from sentry_sdk.types import Event
from onyx.utils.logger import setup_logger
logger = setup_logger()
_instance_id_resolved = False
def _add_instance_tags(
    event: Event,
    hint: dict[str, Any],  # noqa: ARG001
) -> Event | None:
    """Sentry before_send hook that lazily attaches instance identification tags.
    On the first event, resolves the instance UUID from the KV store (requires DB)
    and sets it as a global Sentry tag. Subsequent events pick it up automatically.
    """
    global _instance_id_resolved
    # Fast path: the global tag was already registered on a previous event.
    if _instance_id_resolved:
        return event
    try:
        # Imported lazily to avoid import-time cost / cycles for callers that
        # never emit Sentry events.
        import sentry_sdk
        from shared_configs.configs import MULTI_TENANT

        if MULTI_TENANT:
            # All cloud tenants report under a single logical instance id.
            instance_id = "multi-tenant-cloud"
        else:
            from onyx.utils.telemetry import get_or_generate_uuid

            instance_id = get_or_generate_uuid()
        sentry_sdk.set_tag("instance_id", instance_id)
        # Also set on this event since set_tag won't retroactively apply
        event.setdefault("tags", {})["instance_id"] = instance_id
        # Only mark resolved after success — if DB wasn't ready, retry next event
        _instance_id_resolved = True
    except Exception:
        # Best-effort: tagging failures must never block event delivery.
        logger.debug("Failed to resolve instance_id for Sentry tagging")
    return event

View File

@@ -10,6 +10,7 @@ from typing import Optional
from urllib.parse import quote
import boto3
from backend.onyx.connectors.models import TabularSection
from botocore.client import Config
from botocore.credentials import RefreshableCredentials
from botocore.exceptions import ClientError
@@ -26,6 +27,10 @@ from onyx.configs.constants import FileOrigin
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
process_onyx_metadata,
)
from onyx.connectors.cross_connector_utils.tabular_section_utils import is_tabular_file
from onyx.connectors.cross_connector_utils.tabular_section_utils import (
tabular_file_to_sections,
)
from onyx.connectors.exceptions import ConnectorValidationError
from onyx.connectors.exceptions import CredentialExpiredError
from onyx.connectors.exceptions import InsufficientPermissionsError
@@ -451,6 +456,40 @@ class BlobStorageConnector(LoadConnector, PollConnector):
logger.exception(f"Error processing image {key}")
continue
# Handle tabular files (xlsx, csv, tsv) — produce one
# TabularSection per sheet (or per file for csv/tsv)
# instead of a flat TextSection.
if is_tabular_file(file_name):
try:
downloaded_file = self._download_object(key)
if downloaded_file is None:
continue
tabular_sections = tabular_file_to_sections(
BytesIO(downloaded_file),
file_name=file_name,
link=link,
)
batch.append(
Document(
id=f"{self.bucket_type}:{self.bucket_name}:{key}",
sections=(
tabular_sections
if tabular_sections
else [TabularSection(link=link, text="")]
),
source=DocumentSource(self.bucket_type.value),
semantic_identifier=file_name,
doc_updated_at=last_modified,
metadata={},
)
)
if len(batch) == self.batch_size:
yield batch
batch = []
except Exception:
logger.exception(f"Error processing tabular file {key}")
continue
# Handle text and document files
try:
downloaded_file = self._download_object(key)

View File

@@ -43,7 +43,7 @@ def tabular_file_to_sections(
if lowered.endswith(".xlsx"):
return [
TabularSection(link=f"sheet:{sheet_title}", text=csv_text)
TabularSection(link=f"{file_name} :: {sheet_title}", text=csv_text)
for csv_text, sheet_title in xlsx_sheet_extraction(
file, file_name=file_name
)

View File

@@ -15,6 +15,10 @@ from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
)
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import rate_limit_builder
from onyx.connectors.cross_connector_utils.rate_limit_wrapper import rl_requests
from onyx.connectors.cross_connector_utils.tabular_section_utils import is_tabular_file
from onyx.connectors.cross_connector_utils.tabular_section_utils import (
tabular_file_to_sections,
)
from onyx.connectors.drupal_wiki.models import DrupalWikiCheckpoint
from onyx.connectors.drupal_wiki.models import DrupalWikiPage
from onyx.connectors.drupal_wiki.models import DrupalWikiPageResponse
@@ -33,6 +37,7 @@ from onyx.connectors.models import DocumentFailure
from onyx.connectors.models import HierarchyNode
from onyx.connectors.models import ImageSection
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TabularSection
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import extract_text_and_images
from onyx.file_processing.extract_file_text import get_file_ext
@@ -213,7 +218,7 @@ class DrupalWikiConnector(
attachment: dict[str, Any],
page_id: int,
download_url: str,
) -> tuple[list[TextSection | ImageSection], str | None]:
) -> tuple[list[TextSection | ImageSection | TabularSection], str | None]:
"""
Process a single attachment and return generated sections.
@@ -226,7 +231,7 @@ class DrupalWikiConnector(
Tuple of (sections, error_message). If error_message is not None, the
sections list should be treated as invalid.
"""
sections: list[TextSection | ImageSection] = []
sections: list[TextSection | ImageSection | TabularSection] = []
try:
if not self._validate_attachment_filetype(attachment):
@@ -273,6 +278,25 @@ class DrupalWikiConnector(
return sections, None
# Tabular attachments (xlsx, csv, tsv) — produce
# TabularSections instead of a flat TextSection.
if is_tabular_file(file_name):
try:
sections.extend(
tabular_file_to_sections(
BytesIO(raw_bytes),
file_name=file_name,
link=download_url,
)
)
except Exception as e:
logger.warning(
f"Failed to extract tabular sections from {file_name}: {e}"
)
if not sections:
return [], f"No content extracted from tabular file {file_name}"
return sections, None
image_counter = 0
def _store_embedded_image(image_data: bytes, image_name: str) -> None:
@@ -497,7 +521,7 @@ class DrupalWikiConnector(
page_url = build_drupal_wiki_document_id(self.base_url, page.id)
# Create sections with just the page content
sections: list[TextSection | ImageSection] = [
sections: list[TextSection | ImageSection | TabularSection] = [
TextSection(text=text_content, link=page_url)
]

View File

@@ -2,6 +2,7 @@ import json
import os
from datetime import datetime
from datetime import timezone
from io import BytesIO
from pathlib import Path
from typing import Any
from typing import IO
@@ -12,11 +13,16 @@ from onyx.configs.constants import FileOrigin
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
process_onyx_metadata,
)
from onyx.connectors.cross_connector_utils.tabular_section_utils import is_tabular_file
from onyx.connectors.cross_connector_utils.tabular_section_utils import (
tabular_file_to_sections,
)
from onyx.connectors.interfaces import GenerateDocumentsOutput
from onyx.connectors.interfaces import LoadConnector
from onyx.connectors.models import Document
from onyx.connectors.models import HierarchyNode
from onyx.connectors.models import ImageSection
from onyx.connectors.models import TabularSection
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import extract_text_and_images
from onyx.file_processing.extract_file_text import get_file_ext
@@ -179,8 +185,32 @@ def _process_file(
link = onyx_metadata.link or link
# Build sections: first the text as a single Section
sections: list[TextSection | ImageSection] = []
if extraction_result.text_content.strip():
sections: list[TextSection | ImageSection | TabularSection] = []
if is_tabular_file(file_name):
# Produce TabularSections
lowered_name = file_name.lower()
if lowered_name.endswith(".xlsx"):
file.seek(0)
tabular_source: IO[bytes] = file
else:
tabular_source = BytesIO(
extraction_result.text_content.encode("utf-8", errors="replace")
)
try:
sections.extend(
tabular_file_to_sections(
file=tabular_source,
file_name=file_name,
link=link or "",
)
)
except Exception as e:
logger.error(f"Failed to process tabular file {file_name}: {e}")
return []
if not sections:
logger.warning(f"No content extracted from tabular file {file_name}")
return []
elif extraction_result.text_content.strip():
logger.debug(f"Creating TextSection for {file_name} with link: {link}")
sections.append(
TextSection(link=link, text=extraction_result.text_content.strip())

View File

@@ -13,6 +13,10 @@ from pydantic import BaseModel
from onyx.access.models import ExternalAccess
from onyx.configs.constants import DocumentSource
from onyx.configs.constants import FileOrigin
from onyx.connectors.cross_connector_utils.tabular_section_utils import is_tabular_file
from onyx.connectors.cross_connector_utils.tabular_section_utils import (
tabular_file_to_sections,
)
from onyx.connectors.google_drive.constants import DRIVE_FOLDER_TYPE
from onyx.connectors.google_drive.constants import DRIVE_SHORTCUT_TYPE
from onyx.connectors.google_drive.models import GDriveMimeType
@@ -28,15 +32,16 @@ from onyx.connectors.models import Document
from onyx.connectors.models import DocumentFailure
from onyx.connectors.models import ImageSection
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TabularSection
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.extract_file_text import get_file_ext
from onyx.file_processing.extract_file_text import pptx_to_text
from onyx.file_processing.extract_file_text import read_docx_file
from onyx.file_processing.extract_file_text import read_pdf_file
from onyx.file_processing.extract_file_text import xlsx_to_text
from onyx.file_processing.file_types import OnyxFileExtensions
from onyx.file_processing.file_types import OnyxMimeTypes
from onyx.file_processing.file_types import SPREADSHEET_MIME_TYPE
from onyx.file_processing.image_utils import store_image_and_create_section
from onyx.utils.logger import setup_logger
from onyx.utils.variable_functionality import (
@@ -289,7 +294,7 @@ def _download_and_extract_sections_basic(
service: GoogleDriveService,
allow_images: bool,
size_threshold: int,
) -> list[TextSection | ImageSection]:
) -> list[TextSection | ImageSection | TabularSection]:
"""Extract text and images from a Google Drive file."""
file_id = file["id"]
file_name = file["name"]
@@ -308,7 +313,7 @@ def _download_and_extract_sections_basic(
return []
# Store images for later processing
sections: list[TextSection | ImageSection] = []
sections: list[TextSection | ImageSection | TabularSection] = []
try:
section, embedded_id = store_image_and_create_section(
image_data=response_call(),
@@ -323,10 +328,9 @@ def _download_and_extract_sections_basic(
logger.error(f"Failed to process image {file_name}: {e}")
return sections
# For Google Docs, Sheets, and Slides, export as plain text
# For Google Docs, Sheets, and Slides, export via the Drive API
if mime_type in GOOGLE_MIME_TYPES_TO_EXPORT:
export_mime_type = GOOGLE_MIME_TYPES_TO_EXPORT[mime_type]
# Use the correct API call for exporting files
request = service.files().export_media(
fileId=file_id, mimeType=export_mime_type
)
@@ -335,6 +339,17 @@ def _download_and_extract_sections_basic(
logger.warning(f"Failed to export {file_name} as {export_mime_type}")
return []
if export_mime_type in OnyxMimeTypes.TABULAR_MIME_TYPES:
# Synthesize an extension on the filename
ext = ".xlsx" if export_mime_type == SPREADSHEET_MIME_TYPE else ".csv"
return list(
tabular_file_to_sections(
io.BytesIO(response),
file_name=f"{file_name}{ext}",
link=link,
)
)
text = response.decode("utf-8")
return [TextSection(link=link, text=text)]
@@ -356,9 +371,15 @@ def _download_and_extract_sections_basic(
elif (
mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
or is_tabular_file(file_name)
):
text = xlsx_to_text(io.BytesIO(response_call()), file_name=file_name)
return [TextSection(link=link, text=text)] if text else []
return list(
tabular_file_to_sections(
io.BytesIO(response_call()),
file_name=file_name,
link=link,
)
)
elif (
mime_type
@@ -410,8 +431,9 @@ def _find_nth(haystack: str, needle: str, n: int, start: int = 0) -> int:
def align_basic_advanced(
basic_sections: list[TextSection | ImageSection], adv_sections: list[TextSection]
) -> list[TextSection | ImageSection]:
basic_sections: list[TextSection | ImageSection | TabularSection],
adv_sections: list[TextSection],
) -> list[TextSection | ImageSection | TabularSection]:
"""Align the basic sections with the advanced sections.
In particular, the basic sections contain all content of the file,
including smart chips like dates and doc links. The advanced sections
@@ -428,7 +450,7 @@ def align_basic_advanced(
basic_full_text = "".join(
[section.text for section in basic_sections if isinstance(section, TextSection)]
)
new_sections: list[TextSection | ImageSection] = []
new_sections: list[TextSection | ImageSection | TabularSection] = []
heading_start = 0
for adv_ind in range(1, len(adv_sections)):
heading = adv_sections[adv_ind].text.split(HEADING_DELIMITER)[0]
@@ -599,7 +621,7 @@ def _convert_drive_item_to_document(
"""
Main entry point for converting a Google Drive file => Document object.
"""
sections: list[TextSection | ImageSection] = []
sections: list[TextSection | ImageSection | TabularSection] = []
# Only construct these services when needed
def _get_drive_service() -> GoogleDriveService:
@@ -639,7 +661,9 @@ def _convert_drive_item_to_document(
doc_id=file.get("id", ""),
)
if doc_sections:
sections = cast(list[TextSection | ImageSection], doc_sections)
sections = cast(
list[TextSection | ImageSection | TabularSection], doc_sections
)
if any(SMART_CHIP_CHAR in section.text for section in doc_sections):
logger.debug(
f"found smart chips in {file.get('name')}, aligning with basic sections"

View File

@@ -60,7 +60,12 @@ from onyx.connectors.models import ExternalAccess
from onyx.connectors.models import HierarchyNode
from onyx.connectors.models import ImageSection
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TabularSection
from onyx.connectors.models import TextSection
from onyx.connectors.cross_connector_utils.tabular_section_utils import (
is_tabular_file,
tabular_file_to_sections,
)
from onyx.connectors.sharepoint.connector_utils import get_sharepoint_external_access
from onyx.db.enums import HierarchyNodeType
from onyx.file_processing.extract_file_text import extract_text_and_images
@@ -586,7 +591,7 @@ def _convert_driveitem_to_document_with_permissions(
driveitem, f"Failed to download via graph api: {e}", e
)
sections: list[TextSection | ImageSection] = []
sections: list[TextSection | ImageSection | TabularSection] = []
file_ext = get_file_ext(driveitem.name)
if not content_bytes:
@@ -602,6 +607,19 @@ def _convert_driveitem_to_document_with_permissions(
)
image_section.link = driveitem.web_url
sections.append(image_section)
elif is_tabular_file(driveitem.name):
try:
sections.extend(
tabular_file_to_sections(
file=io.BytesIO(content_bytes),
file_name=driveitem.name,
link=driveitem.web_url or "",
)
)
except Exception as e:
logger.warning(
f"Failed to extract tabular sections for '{driveitem.name}': {e}"
)
else:
def _store_embedded_image(img_data: bytes, img_name: str) -> None:

View File

@@ -7,6 +7,7 @@ from onyx.indexing.chunking.image_section_chunker import ImageChunker
from onyx.indexing.chunking.section_chunker import AccumulatorState
from onyx.indexing.chunking.section_chunker import ChunkPayload
from onyx.indexing.chunking.section_chunker import SectionChunker
from onyx.indexing.chunking.tabular_section_chunker import TabularChunker
from onyx.indexing.chunking.text_section_chunker import TextChunker
from onyx.indexing.models import DocAwareChunk
from onyx.natural_language_processing.utils import BaseTokenizer
@@ -38,6 +39,7 @@ class DocumentChunker:
chunk_splitter=chunk_splitter,
),
SectionType.IMAGE: ImageChunker(),
SectionType.TABULAR: TabularChunker(tokenizer=tokenizer),
}
def chunk(

View File

@@ -0,0 +1,272 @@
import csv
import io
from collections.abc import Iterable
from pydantic import BaseModel
from onyx.connectors.models import Section
from onyx.indexing.chunking.section_chunker import AccumulatorState
from onyx.indexing.chunking.section_chunker import ChunkPayload
from onyx.indexing.chunking.section_chunker import SectionChunker
from onyx.indexing.chunking.section_chunker import SectionChunkerOutput
from onyx.natural_language_processing.utils import BaseTokenizer
from onyx.natural_language_processing.utils import count_tokens
from onyx.natural_language_processing.utils import split_text_by_tokens
from onyx.utils.logger import setup_logger
logger = setup_logger()
# Prefix for the synthesized "Columns: ..." line placed at the top of a chunk.
COLUMNS_MARKER = "Columns:"
# Separator between "field=value" pairs within a formatted row.
FIELD_VALUE_SEPARATOR = ", "
# Rows (and header lines) within a chunk are joined by single newlines.
ROW_JOIN = "\n"
# Token budget charged for each ROW_JOIN newline when packing chunks.
NEWLINE_TOKENS = 1
class _ParsedRow(BaseModel):
    """One CSV data row paired with the sheet's header row."""

    # Column names from the first non-empty CSV row of the section.
    header: list[str]
    # Cell values of this data row (may be shorter or longer than header).
    row: list[str]


class _TokenizedText(BaseModel):
    """A text fragment together with its (approximate) token count."""

    text: str
    token_count: int
def format_row(header: list[str], row: list[str]) -> str:
    """Render a header/row pair as ``field1=value1, field2=value2, field3=value3``.

    Cells whose value is blank (empty or whitespace-only) are omitted, and
    columns beyond the shorter of the two lists are ignored.
    """
    # Inlines _row_to_pairs and FIELD_VALUE_SEPARATOR (", ") so the render is
    # a single expression.
    return ", ".join(
        f"{name}={value}" for name, value in zip(header, row) if value.strip()
    )
def format_columns_header(headers: list[str]) -> str:
    """Format the column header line. Underscored headers get a
    space-substituted friendly alias in parens.

    Example:
        headers = ["id", "MTTR_hours"]
        => "Columns: id, MTTR_hours (MTTR hours)"
    """
    labels: list[str] = []
    for name in headers:
        if "_" in name:
            # Show both the raw header and a human-readable alias.
            labels.append(f'{name} ({name.replace("_", " ")})')
        else:
            labels.append(name)
    # COLUMNS_MARKER + space, pairs joined by FIELD_VALUE_SEPARATOR (inlined).
    return "Columns: " + ", ".join(labels)
def parse_section(section: Section) -> list[_ParsedRow]:
    """Parse a section's CSV text into header + data rows.

    The first non-empty row is taken as the header; rows whose cells are all
    blank are dropped. Returns an empty list for empty/unparseable content.
    """
    raw_text = section.text or ""
    if not raw_text.strip():
        return []

    rows = [
        row
        for row in csv.reader(io.StringIO(raw_text))
        if any(cell.strip() for cell in row)
    ]
    if not rows:
        return []

    header_row = rows[0]
    return [_ParsedRow(header=header_row, row=data_row) for data_row in rows[1:]]
def _row_to_pairs(headers: list[str], row: list[str]) -> list[tuple[str, str]]:
return [(h, v) for h, v in zip(headers, row) if v.strip()]
def pack_chunk(chunk: str, new_row: str) -> str:
    """Append *new_row* to *chunk* on a fresh line."""
    return f"{chunk}\n{new_row}"
def _split_row_by_pairs(
    pairs: list[tuple[str, str]],
    tokenizer: BaseTokenizer,
    max_tokens: int,
) -> list[_TokenizedText]:
    """Greedily pack pairs into max-sized pieces. Any single pair that
    itself exceeds ``max_tokens`` is token-split at id boundaries.
    No headers."""
    separator_tokens = count_tokens(FIELD_VALUE_SEPARATOR, tokenizer)
    pieces: list[_TokenizedText] = []
    current_parts: list[str] = []
    current_tokens = 0
    for pair in pairs:
        pair_str = f"{pair[0]}={pair[1]}"
        pair_tokens = count_tokens(pair_str, tokenizer)
        # First pair of a piece pays no separator cost; later pairs do.
        # NOTE: token counts are additive approximations — joining may
        # re-tokenize slightly differently.
        increment = pair_tokens if not current_parts else separator_tokens + pair_tokens
        if current_tokens + increment <= max_tokens:
            current_parts.append(pair_str)
            current_tokens += increment
            continue
        # Pair doesn't fit: flush whatever has been packed so far.
        if current_parts:
            pieces.append(
                _TokenizedText(
                    text=FIELD_VALUE_SEPARATOR.join(current_parts),
                    token_count=current_tokens,
                )
            )
            current_parts = []
            current_tokens = 0
        if pair_tokens > max_tokens:
            # A single oversized pair: hard-split at token-id boundaries and
            # emit each fragment as its own piece.
            for split_text in split_text_by_tokens(pair_str, tokenizer, max_tokens):
                pieces.append(
                    _TokenizedText(
                        text=split_text,
                        token_count=count_tokens(split_text, tokenizer),
                    )
                )
        else:
            # Start the next piece with this pair.
            current_parts = [pair_str]
            current_tokens = pair_tokens
    # Flush the trailing partial piece, if any.
    if current_parts:
        pieces.append(
            _TokenizedText(
                text=FIELD_VALUE_SEPARATOR.join(current_parts),
                token_count=current_tokens,
            )
        )
    return pieces
def _build_chunk_from_scratch(
    pairs: list[tuple[str, str]],
    formatted_row: str,
    row_tokens: int,
    column_header: str,
    column_header_tokens: int,
    sheet_header: str,
    sheet_header_tokens: int,
    tokenizer: BaseTokenizer,
    max_tokens: int,
) -> list[_TokenizedText]:
    """Start a fresh chunk from a single formatted row, prepending the column
    header and then the sheet header when each still fits in ``max_tokens``.

    Returns one piece in the common case; a row that is itself oversized is
    split into multiple header-less pieces instead.
    """
    # 1. Row alone is too large — split by pairs, no headers.
    if row_tokens > max_tokens:
        return _split_row_by_pairs(pairs, tokenizer, max_tokens)
    chunk = formatted_row
    chunk_tokens = row_tokens
    # 2. Attempt to add column header
    candidate_tokens = column_header_tokens + NEWLINE_TOKENS + chunk_tokens
    if candidate_tokens <= max_tokens:
        chunk = column_header + ROW_JOIN + chunk
        chunk_tokens = candidate_tokens
    # 3. Attempt to add sheet header
    # (prepended above the column header, so final order is
    # sheet header -> column header -> row)
    if sheet_header:
        candidate_tokens = sheet_header_tokens + NEWLINE_TOKENS + chunk_tokens
        if candidate_tokens <= max_tokens:
            chunk = sheet_header + ROW_JOIN + chunk
            chunk_tokens = candidate_tokens
    return [_TokenizedText(text=chunk, token_count=chunk_tokens)]
def parse_to_chunks(
    rows: Iterable[_ParsedRow],
    sheet_header: str,
    tokenizer: BaseTokenizer,
    max_tokens: int,
) -> list[str]:
    """Pack formatted rows into newline-joined chunk texts of at most
    ``max_tokens`` tokens each (token counts are additive approximations).

    ``sheet_header`` (e.g. the section link) and a synthesized column-header
    line are included in a chunk only when they fit the budget.
    """
    rows_list = list(rows)
    if not rows_list:
        return []
    # Headers are derived once from the first row; every row of the sheet
    # is assumed to share the same header.
    column_header = format_columns_header(rows_list[0].header)
    column_header_tokens = count_tokens(column_header, tokenizer)
    sheet_header_tokens = count_tokens(sheet_header, tokenizer) if sheet_header else 0
    chunks: list[str] = []
    current_chunk = ""
    current_chunk_tokens = 0
    for row in rows_list:
        pairs: list[tuple[str, str]] = _row_to_pairs(row.header, row.row)
        formatted = format_row(row.header, row.row)
        row_tokens = count_tokens(formatted, tokenizer)
        if current_chunk:
            # Attempt to pack it in (additive approximation)
            if current_chunk_tokens + NEWLINE_TOKENS + row_tokens <= max_tokens:
                current_chunk = pack_chunk(current_chunk, formatted)
                current_chunk_tokens += NEWLINE_TOKENS + row_tokens
                continue
            # Doesn't fit — flush and start new
            chunks.append(current_chunk)
            current_chunk = ""
            current_chunk_tokens = 0
        # Build chunk from scratch
        for piece in _build_chunk_from_scratch(
            pairs=pairs,
            formatted_row=formatted,
            row_tokens=row_tokens,
            column_header=column_header,
            column_header_tokens=column_header_tokens,
            sheet_header=sheet_header,
            sheet_header_tokens=sheet_header_tokens,
            tokenizer=tokenizer,
            max_tokens=max_tokens,
        ):
            # An oversized row can yield several pieces; each prior piece is
            # flushed and the newest becomes the open chunk.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = piece.text
            current_chunk_tokens = piece.token_count
    # Flush remaining
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
class TabularChunker(SectionChunker):
    """Chunks tabular (CSV-text) sections into token-bounded row chunks."""

    def __init__(self, tokenizer: BaseTokenizer) -> None:
        self.tokenizer = tokenizer

    def chunk_section(
        self,
        section: Section,
        accumulator: AccumulatorState,
        content_token_limit: int,
    ) -> SectionChunkerOutput:
        # Tabular content does not merge with neighbouring sections, so any
        # previously accumulated payloads are flushed up front.
        payloads = accumulator.flush_to_list()

        parsed_rows = parse_section(section)
        if not parsed_rows:
            logger.warning(
                f"TabularChunker: skipping unparseable section (link={section.link})"
            )
            return SectionChunkerOutput(
                payloads=payloads, accumulator=AccumulatorState()
            )

        # The section link (e.g. "file :: sheet") doubles as the sheet header.
        chunk_texts = parse_to_chunks(
            rows=parsed_rows,
            sheet_header=section.link or "",
            tokenizer=self.tokenizer,
            max_tokens=content_token_limit,
        )
        payloads.extend(
            ChunkPayload(
                text=chunk_text,
                links={0: section.link or ""},
                # Every chunk after the first is a continuation of the table.
                is_continuation=(index > 0),
            )
            for index, chunk_text in enumerate(chunk_texts)
        )
        return SectionChunkerOutput(payloads=payloads, accumulator=AccumulatorState())

View File

@@ -10,6 +10,7 @@ from onyx.indexing.chunking.section_chunker import SectionChunker
from onyx.indexing.chunking.section_chunker import SectionChunkerOutput
from onyx.natural_language_processing.utils import BaseTokenizer
from onyx.natural_language_processing.utils import count_tokens
from onyx.natural_language_processing.utils import split_text_by_tokens
from onyx.utils.text_processing import clean_text
from onyx.utils.text_processing import shared_precompare_cleanup
from shared_configs.configs import STRICT_CHUNK_TOKEN_LIMIT
@@ -90,8 +91,8 @@ class TextChunker(SectionChunker):
STRICT_CHUNK_TOKEN_LIMIT
and count_tokens(split_text, self.tokenizer) > content_token_limit
):
smaller_chunks = self._split_oversized_chunk(
split_text, content_token_limit
smaller_chunks = split_text_by_tokens(
split_text, self.tokenizer, content_token_limit
)
for j, small_chunk in enumerate(smaller_chunks):
payloads.append(
@@ -114,16 +115,3 @@ class TextChunker(SectionChunker):
payloads=payloads,
accumulator=AccumulatorState(),
)
def _split_oversized_chunk(self, text: str, content_token_limit: int) -> list[str]:
tokens = self.tokenizer.tokenize(text)
chunks: list[str] = []
start = 0
total_tokens = len(tokens)
while start < total_tokens:
end = min(start + content_token_limit, total_tokens)
token_chunk = tokens[start:end]
chunk_text = " ".join(token_chunk)
chunks.append(chunk_text)
start = end
return chunks

View File

@@ -3,8 +3,6 @@ from abc import ABC
from abc import abstractmethod
from collections import defaultdict
import sentry_sdk
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import ConnectorStopSignal
from onyx.connectors.models import DocumentFailure
@@ -293,13 +291,6 @@ def embed_chunks_with_failure_handling(
)
embedded_chunks.extend(doc_embedded_chunks)
except Exception as e:
with sentry_sdk.new_scope() as scope:
scope.set_tag("stage", "embedding")
scope.set_tag("doc_id", doc_id)
if tenant_id:
scope.set_tag("tenant_id", tenant_id)
scope.fingerprint = ["embedding-failure", type(e).__name__]
sentry_sdk.capture_exception(e)
logger.exception(f"Failed to embed chunks for document '{doc_id}'")
failures.append(
ConnectorFailure(

View File

@@ -5,7 +5,6 @@ from collections.abc import Iterator
from contextlib import contextmanager
from typing import Protocol
import sentry_sdk
from pydantic import BaseModel
from pydantic import ConfigDict
from sqlalchemy.orm import Session
@@ -333,13 +332,6 @@ def index_doc_batch_with_handler(
except Exception as e:
# don't log the batch directly, it's too much text
document_ids = [doc.id for doc in document_batch]
with sentry_sdk.new_scope() as scope:
scope.set_tag("stage", "indexing_pipeline")
scope.set_tag("tenant_id", tenant_id)
scope.set_tag("batch_size", str(len(document_batch)))
scope.set_extra("document_ids", document_ids)
scope.fingerprint = ["indexing-pipeline-failure", type(e).__name__]
sentry_sdk.capture_exception(e)
logger.exception(f"Failed to index document batch: {document_ids}")
index_pipeline_result = IndexingPipelineResult(

View File

@@ -6,7 +6,6 @@ from itertools import chain
from itertools import groupby
import httpx
import sentry_sdk
from onyx.connectors.models import ConnectorFailure
from onyx.connectors.models import DocumentFailure
@@ -89,12 +88,6 @@ def write_chunks_to_vector_db_with_backoff(
)
)
except Exception as e:
with sentry_sdk.new_scope() as scope:
scope.set_tag("stage", "vector_db_write")
scope.set_tag("doc_id", doc_id)
scope.set_tag("tenant_id", index_batch_params.tenant_id)
scope.fingerprint = ["vector-db-write-failure", type(e).__name__]
sentry_sdk.capture_exception(e)
logger.exception(
f"Failed to write document chunks for '{doc_id}' to vector db"
)

View File

@@ -434,14 +434,11 @@ def get_application(lifespan_override: Lifespan | None = None) -> FastAPI:
lifespan=lifespan_override or lifespan,
)
if SENTRY_DSN:
from onyx.configs.sentry import _add_instance_tags
sentry_sdk.init(
dsn=SENTRY_DSN,
integrations=[StarletteIntegration(), FastApiIntegration()],
traces_sample_rate=0.1,
release=__version__,
before_send=_add_instance_tags,
)
logger.info("Sentry initialized")
else:

View File

@@ -201,6 +201,33 @@ def count_tokens(
return total
def split_text_by_tokens(
    text: str,
    tokenizer: BaseTokenizer,
    max_tokens: int,
) -> list[str]:
    """Best-effort split of ``text`` into pieces of at most ``max_tokens``
    tokens, cutting at token-id boundaries via encode/decode.

    Note: the pieces are not strictly guaranteed to re-tokenize to
    ≤ max_tokens. BPE merges at window boundaries may drift by a few
    tokens, and a cut landing inside a multi-byte UTF-8 character yields
    replacement characters on decode. Suitable for best-effort splitting
    of oversized content, not hard limit enforcement.
    """
    if not text:
        return []

    # Encode in fixed-size character windows to bound per-call tokenizer cost.
    ids: list[int] = []
    offset = 0
    while offset < len(text):
        ids.extend(tokenizer.encode(text[offset : offset + _ENCODE_CHUNK_SIZE]))
        offset += _ENCODE_CHUNK_SIZE

    # Slice the id stream into max_tokens-sized windows and decode each.
    pieces: list[str] = []
    for begin in range(0, len(ids), max_tokens):
        pieces.append(tokenizer.decode(ids[begin : begin + max_tokens]))
    return pieces
def tokenizer_trim_content(
content: str, desired_length: int, tokenizer: BaseTokenizer
) -> str:

View File

@@ -1,88 +0,0 @@
from typing import cast
from unittest.mock import MagicMock
from unittest.mock import patch
from sentry_sdk.types import Event
import onyx.configs.sentry as sentry_module
from onyx.configs.sentry import _add_instance_tags
def _event(data: dict) -> Event:
    """Wrap a plain dict as a typed Sentry ``Event`` for use as test input."""
    return cast(Event, data)
def _reset_state() -> None:
    """Clear the module-level "already resolved" flag so each test starts fresh."""
    sentry_module._instance_id_resolved = False
class TestAddInstanceTags:
    """Tests for the ``_add_instance_tags`` Sentry ``before_send`` hook.

    Covers: tagging the first event with the resolved instance id, caching
    so resolution happens only once, graceful degradation when resolution
    raises, retry after a failed resolution, and preservation of tags
    already present on an event.
    """

    def setup_method(self) -> None:
        # Each test starts with the module-level "resolved" flag cleared.
        _reset_state()

    @patch("onyx.utils.telemetry.get_or_generate_uuid", return_value="test-uuid-1234")
    @patch("sentry_sdk.set_tag")
    def test_first_event_sets_instance_id(
        self, mock_set_tag: MagicMock, mock_uuid: MagicMock
    ) -> None:
        # First event resolves the UUID once, tags the event, and also
        # tags the global Sentry scope via sentry_sdk.set_tag.
        result = _add_instance_tags(_event({"message": "test error"}), {})
        assert result is not None
        assert result["tags"]["instance_id"] == "test-uuid-1234"
        mock_set_tag.assert_called_once_with("instance_id", "test-uuid-1234")
        mock_uuid.assert_called_once()

    @patch("onyx.utils.telemetry.get_or_generate_uuid", return_value="test-uuid-1234")
    @patch("sentry_sdk.set_tag")
    def test_second_event_skips_resolution(
        self, _mock_set_tag: MagicMock, mock_uuid: MagicMock
    ) -> None:
        _add_instance_tags(_event({"message": "first"}), {})
        result = _add_instance_tags(_event({"message": "second"}), {})
        assert result is not None
        assert "tags" not in result  # second event not modified
        mock_uuid.assert_called_once()  # only resolved once

    @patch(
        "onyx.utils.telemetry.get_or_generate_uuid",
        side_effect=Exception("DB unavailable"),
    )
    @patch("sentry_sdk.set_tag")
    def test_resolution_failure_still_returns_event(
        self, _mock_set_tag: MagicMock, _mock_uuid: MagicMock
    ) -> None:
        # A failing resolver must never drop the event: it is returned
        # unmodified, just without an instance_id tag.
        result = _add_instance_tags(_event({"message": "test error"}), {})
        assert result is not None
        assert result["message"] == "test error"
        assert "tags" not in result or "instance_id" not in result.get("tags", {})

    @patch(
        "onyx.utils.telemetry.get_or_generate_uuid",
        side_effect=Exception("DB unavailable"),
    )
    @patch("sentry_sdk.set_tag")
    def test_resolution_failure_retries_on_next_event(
        self, _mock_set_tag: MagicMock, mock_uuid: MagicMock
    ) -> None:
        """If resolution fails (e.g. DB not ready), retry on the next event."""
        _add_instance_tags(_event({"message": "first"}), {})
        _add_instance_tags(_event({"message": "second"}), {})
        assert mock_uuid.call_count == 2  # retried on second event

    @patch("onyx.utils.telemetry.get_or_generate_uuid", return_value="test-uuid-1234")
    @patch("sentry_sdk.set_tag")
    def test_preserves_existing_tags(
        self, _mock_set_tag: MagicMock, _mock_uuid: MagicMock
    ) -> None:
        # Tagging augments — never replaces — tags already on the event.
        result = _add_instance_tags(
            _event({"message": "test", "tags": {"existing": "tag"}}), {}
        )
        assert result is not None
        assert result["tags"]["existing"] == "tag"
        assert result["tags"]["instance_id"] == "test-uuid-1234"

View File

@@ -0,0 +1,551 @@
"""End-to-end tests for `TabularChunker.chunk_section`.
Each test is structured as:
INPUT — the CSV text passed to the chunker + token budget + link
EXPECTED — the exact chunk texts the chunker should emit
ACT — a single call to `chunk_section`
ASSERT — literal equality against the expected chunk texts
A character-level tokenizer (1 char == 1 token) is used so token-budget
arithmetic is deterministic and expected chunks can be spelled out
exactly.
"""
from onyx.connectors.models import Section
from onyx.connectors.models import TabularSection
from onyx.indexing.chunking.section_chunker import AccumulatorState
from onyx.indexing.chunking.tabular_section_chunker import TabularChunker
from onyx.natural_language_processing.utils import BaseTokenizer
class CharTokenizer(BaseTokenizer):
    """Deterministic 1-char == 1-token tokenizer so budget math is exact."""

    def encode(self, string: str) -> list[int]:
        return list(map(ord, string))

    def tokenize(self, string: str) -> list[str]:
        return [ch for ch in string]

    def decode(self, tokens: list[int]) -> str:
        return "".join(map(chr, tokens))
def _make_chunker() -> TabularChunker:
    """Build a TabularChunker backed by the deterministic char tokenizer."""
    return TabularChunker(tokenizer=CharTokenizer())
def _tabular_section(text: str, link: str = "sheet:Test") -> Section:
    """Construct a TabularSection carrying the given CSV text and link."""
    return TabularSection(text=text, link=link)
class TestTabularChunkerChunkSection:
    """End-to-end cases for ``TabularChunker.chunk_section``.

    Each test spells out the exact chunk texts expected under a
    char-level tokenizer (1 char == 1 token), covering row packing,
    prelude layering, oversized-row splitting, accumulator flushing,
    and CSV parsing edge cases.
    """

    def test_simple_csv_all_rows_fit_one_chunk(self) -> None:
        # --- INPUT -----------------------------------------------------
        csv_text = "Name,Age,City\n" "Alice,30,NYC\n" "Bob,25,SF\n"
        link = "sheet:People"
        content_token_limit = 500
        # --- EXPECTED --------------------------------------------------
        expected_texts = [
            (
                "sheet:People\n"
                "Columns: Name, Age, City\n"
                "Name=Alice, Age=30, City=NYC\n"
                "Name=Bob, Age=25, City=SF"
            ),
        ]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=content_token_limit,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts
        assert [p.is_continuation for p in out.payloads] == [False]
        assert all(p.links == {0: link} for p in out.payloads)
        assert out.accumulator.is_empty()

    def test_overflow_splits_into_two_deterministic_chunks(self) -> None:
        # --- INPUT -----------------------------------------------------
        # prelude = "sheet:S\nColumns: col, val" (25 chars = 25 tokens)
        # At content_token_limit=57, row_budget = max(16, 57-31-1) = 25.
        # Each row "col=a, val=1" is 12 tokens; two rows + \n = 25 (fits),
        # three rows + 2×\n = 38 (overflows) → split after 2 rows.
        csv_text = "col,val\n" "a,1\n" "b,2\n" "c,3\n" "d,4\n"
        link = "sheet:S"
        content_token_limit = 57
        # --- EXPECTED --------------------------------------------------
        expected_texts = [
            ("sheet:S\n" "Columns: col, val\n" "col=a, val=1\n" "col=b, val=2"),
            ("sheet:S\n" "Columns: col, val\n" "col=c, val=3\n" "col=d, val=4"),
        ]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=content_token_limit,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts
        # First chunk is fresh; subsequent chunks mark as continuations.
        assert [p.is_continuation for p in out.payloads] == [False, True]
        # Link carries through every chunk.
        assert all(p.links == {0: link} for p in out.payloads)

    # Add back in shortly
    # def test_header_only_csv_produces_single_prelude_chunk(self) -> None:
    #     # --- INPUT -----------------------------------------------------
    #     csv_text = "col1,col2\n"
    #     link = "sheet:Headers"
    #     # --- EXPECTED --------------------------------------------------
    #     expected_texts = [
    #         "sheet:Headers\nColumns: col1, col2",
    #     ]
    #     # --- ACT -------------------------------------------------------
    #     out = _make_chunker().chunk_section(
    #         _tabular_section(csv_text, link=link),
    #         AccumulatorState(),
    #         content_token_limit=500,
    #     )
    #     # --- ASSERT ----------------------------------------------------
    #     assert [p.text for p in out.payloads] == expected_texts

    def test_empty_cells_dropped_from_chunk_text(self) -> None:
        # --- INPUT -----------------------------------------------------
        # Alice's Age is empty; Bob's City is empty. Empty cells should
        # not appear as `field=` pairs in the output.
        csv_text = "Name,Age,City\n" "Alice,,NYC\n" "Bob,25,\n"
        link = "sheet:P"
        # --- EXPECTED --------------------------------------------------
        expected_texts = [
            (
                "sheet:P\n"
                "Columns: Name, Age, City\n"
                "Name=Alice, City=NYC\n"
                "Name=Bob, Age=25"
            ),
        ]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=500,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts

    def test_quoted_commas_in_csv_preserved_as_one_field(self) -> None:
        # --- INPUT -----------------------------------------------------
        # "Hello, world" is quoted in the CSV, so csv.reader parses it as
        # a single field. The surrounding quotes are stripped during
        # decoding, so the chunk text carries the bare value.
        csv_text = "Name,Notes\n" 'Alice,"Hello, world"\n'
        link = "sheet:P"
        # --- EXPECTED --------------------------------------------------
        expected_texts = [
            ("sheet:P\n" "Columns: Name, Notes\n" "Name=Alice, Notes=Hello, world"),
        ]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=500,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts

    def test_blank_rows_in_csv_are_skipped(self) -> None:
        # --- INPUT -----------------------------------------------------
        # Stray blank rows in the CSV (e.g. export artifacts) shouldn't
        # produce ghost rows in the output.
        csv_text = "A,B\n" "\n" "1,2\n" "\n" "\n" "3,4\n"
        link = "sheet:S"
        # --- EXPECTED --------------------------------------------------
        expected_texts = [
            ("sheet:S\n" "Columns: A, B\n" "A=1, B=2\n" "A=3, B=4"),
        ]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=500,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts

    def test_accumulator_flushes_before_tabular_chunks(self) -> None:
        # --- INPUT -----------------------------------------------------
        # A text accumulator was populated by the prior text section.
        # Tabular sections are structural boundaries, so the pending
        # text is flushed as its own chunk before the tabular content.
        pending_text = "prior paragraph from an earlier text section"
        pending_link = "prev-link"
        csv_text = "a,b\n" "1,2\n"
        link = "sheet:S"
        # --- EXPECTED --------------------------------------------------
        expected_texts = [
            pending_text,  # flushed accumulator
            ("sheet:S\n" "Columns: a, b\n" "a=1, b=2"),
        ]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(
                text=pending_text,
                link_offsets={0: pending_link},
            ),
            content_token_limit=500,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts
        # Flushed chunk keeps the prior text's link; tabular chunk uses
        # the tabular section's link.
        assert out.payloads[0].links == {0: pending_link}
        assert out.payloads[1].links == {0: link}
        # Accumulator resets — tabular section is a structural boundary.
        assert out.accumulator.is_empty()

    def test_multi_row_packing_under_budget_emits_single_chunk(self) -> None:
        # --- INPUT -----------------------------------------------------
        # Three small rows (20 tokens each) under a generous
        # content_token_limit=100 should pack into ONE chunk — prelude
        # emitted once, rows stacked beneath it.
        csv_text = (
            "x\n" "aaaaaaaaaaaaaaaaaa\n" "bbbbbbbbbbbbbbbbbb\n" "cccccccccccccccccc\n"
        )
        link = "S"
        content_token_limit = 100
        # --- EXPECTED --------------------------------------------------
        # Each formatted row "x=<18-char value>" = 20 tokens.
        # Full chunk with sheet + Columns + 3 rows =
        # 1 + 1 + 10 + 1 + (20 + 1 + 20 + 1 + 20) = 75 tokens ≤ 100.
        # Single chunk carries all three rows.
        expected_texts = [
            "S\n"
            "Columns: x\n"
            "x=aaaaaaaaaaaaaaaaaa\n"
            "x=bbbbbbbbbbbbbbbbbb\n"
            "x=cccccccccccccccccc"
        ]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=content_token_limit,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts
        assert [p.is_continuation for p in out.payloads] == [False]
        assert all(len(p.text) <= content_token_limit for p in out.payloads)

    def test_packing_reserves_prelude_budget_so_every_chunk_has_full_prelude(
        self,
    ) -> None:
        # --- INPUT -----------------------------------------------------
        # Budget (30) is large enough for all 5 bare rows (row_block =
        # 24 tokens) to pack as one chunk if the prelude were optional,
        # but [sheet] + Columns + 5_rows would be 41 tokens > 30. The
        # packing logic reserves space for the prelude: only 2 rows
        # pack per chunk (17 prelude overhead + 9 rows = 26 ≤ 30).
        # Every emitted chunk therefore carries its full prelude rather
        # than dropping Columns at emit time.
        csv_text = "x\n" "aa\n" "bb\n" "cc\n" "dd\n" "ee\n"
        link = "S"
        content_token_limit = 30
        # --- EXPECTED --------------------------------------------------
        # Prelude overhead = 'S\nColumns: x\n' = 1+1+10+1 = 13.
        # Each row "x=XX" = 4 tokens, row separator "\n" = 1.
        # 3 rows: 13 + (4+1+4+1+4) = 27 ≤ 30 ✓
        # 4 rows: 13 + (4+1+4+1+4+1+4) = 32 > 30 ✗
        # → 3 rows in the first chunk, 2 rows in the second.
        expected_texts = [
            "S\nColumns: x\nx=aa\nx=bb\nx=cc",
            "S\nColumns: x\nx=dd\nx=ee",
        ]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=content_token_limit,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts
        # Every chunk fits under the budget AND carries its full
        # prelude — that's the whole point of this check.
        assert all(len(p.text) <= content_token_limit for p in out.payloads)
        assert all("Columns: x" in p.text for p in out.payloads)

    def test_oversized_row_splits_into_field_pieces_no_prelude(self) -> None:
        # --- INPUT -----------------------------------------------------
        # Single-row CSV whose formatted form ("field 1=1, ..." = 53
        # tokens) exceeds content_token_limit (20). Per the chunker's
        # rules, oversized rows are split at field boundaries into
        # pieces each ≤ max_tokens, and no prelude is added to split
        # pieces (they already consume the full budget). A 53-token row
        # packs into 3 field-boundary pieces under a 20-token budget.
        csv_text = "field 1,field 2,field 3,field 4,field 5\n" "1,2,3,4,5\n"
        link = "S"
        content_token_limit = 20
        # --- EXPECTED --------------------------------------------------
        # Row = "field 1=1, field 2=2, field 3=3, field 4=4, field 5=5"
        # Fields @ 9 tokens each, ", " sep = 2 tokens.
        # "field 1=1, field 2=2" = 9+2+9 = 20 tokens ≤ 20 ✓
        # + ", field 3=3" = 20+2+9 = 31 > 20 → flush, start new
        # "field 3=3, field 4=4" = 9+2+9 = 20 ≤ 20 ✓
        # + ", field 5=5" = 20+2+9 = 31 > 20 → flush, start new
        # "field 5=5" = 9 ≤ 20 ✓
        # ceil(53 / 20) = 3 chunks.
        expected_texts = [
            "field 1=1, field 2=2",
            "field 3=3, field 4=4",
            "field 5=5",
        ]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=content_token_limit,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts
        # Invariant: no chunk exceeds max_tokens.
        assert all(len(p.text) <= content_token_limit for p in out.payloads)
        # is_continuation: first chunk False, rest True.
        assert [p.is_continuation for p in out.payloads] == [False, True, True]

    def test_empty_tabular_section_flushes_accumulator_and_resets_it(
        self,
    ) -> None:
        # --- INPUT -----------------------------------------------------
        # Tabular sections are structural boundaries, so any pending text
        # buffer is flushed to a chunk before parsing the tabular content
        # — even if the tabular section itself is empty. The accumulator
        # is then reset.
        pending_text = "prior paragraph"
        pending_link_offsets = {0: "prev-link"}
        # --- EXPECTED --------------------------------------------------
        expected_texts = [pending_text]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section("", link="sheet:Empty"),
            AccumulatorState(
                text=pending_text,
                link_offsets=pending_link_offsets,
            ),
            content_token_limit=500,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts
        assert out.accumulator.is_empty()

    def test_single_oversized_field_token_splits_at_id_boundaries(self) -> None:
        # --- INPUT -----------------------------------------------------
        # A single `field=value` pair that itself exceeds max_tokens can't
        # be split at field boundaries — there's only one field. The
        # chunker falls back to encoding the pair to token ids and
        # slicing at max-token-sized windows.
        #
        # CSV has one column "x" with a 50-char value. Formatted pair =
        # "x=" + 50 a's = 52 tokens. Budget = 10.
        csv_text = "x\n" + ("a" * 50) + "\n"
        link = "S"
        content_token_limit = 10
        # --- EXPECTED --------------------------------------------------
        # 52-char pair at 10 tokens per window = 6 pieces:
        # [0:10)  "x=aaaaaaaa" (10)
        # [10:20) "aaaaaaaaaa" (10)
        # [20:30) "aaaaaaaaaa" (10)
        # [30:40) "aaaaaaaaaa" (10)
        # [40:50) "aaaaaaaaaa" (10)
        # [50:52) "aa" (2)
        # Split pieces carry no prelude (they already consume the budget).
        expected_texts = [
            "x=aaaaaaaa",
            "aaaaaaaaaa",
            "aaaaaaaaaa",
            "aaaaaaaaaa",
            "aaaaaaaaaa",
            "aa",
        ]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=content_token_limit,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts
        # Every piece is ≤ max_tokens — the invariant the token-level
        # fallback exists to enforce.
        assert all(len(p.text) <= content_token_limit for p in out.payloads)

    def test_underscored_column_gets_friendly_alias_in_parens(self) -> None:
        # --- INPUT -----------------------------------------------------
        # Column headers with underscores get a space-substituted friendly
        # alias appended in parens on the `Columns:` line. Plain headers
        # pass through untouched.
        csv_text = "MTTR_hours,id,owner_name\n" "3,42,Alice\n"
        link = "sheet:M"
        # --- EXPECTED --------------------------------------------------
        expected_texts = [
            (
                "sheet:M\n"
                "Columns: MTTR_hours (MTTR hours), id, owner_name (owner name)\n"
                "MTTR_hours=3, id=42, owner_name=Alice"
            ),
        ]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=500,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts

    def test_oversized_row_between_small_rows_preserves_flanking_chunks(
        self,
    ) -> None:
        # --- INPUT -----------------------------------------------------
        # State-machine check: small row, oversized row, small row. The
        # first small row should become a preluded chunk; the oversized
        # row flushes it and emits split fragments without prelude; then
        # the last small row picks up from wherever the split left off.
        #
        # Headers a,b,c,d. Row 1 and row 3 each have only column `a`
        # populated (tiny). Row 2 is a "fat" row with all four columns
        # populated.
        csv_text = "a,b,c,d\n" "1,,,\n" "xxx,yyy,zzz,www\n" "2,,,\n"
        link = "S"
        content_token_limit = 20
        # --- EXPECTED --------------------------------------------------
        # Prelude = 'S\nColumns: a, b, c, d\n' = 1+1+19+1 = 22 > 20, so
        # sheet fits with the row but full Columns header does not.
        # Row 1 formatted = "a=1" (3). build_chunk_from_scratch:
        #   cols+row = 20+3 = 23 > 20 → skip cols. sheet+row = 1+1+3 = 5
        #   ≤ 20 → chunk = "S\na=1".
        # Row 2 formatted = "a=xxx, b=yyy, c=zzz, d=www" (26 > 20) →
        #   flush "S\na=1" and split at pair boundaries:
        #     "a=xxx, b=yyy, c=zzz" (19 ≤ 20 ✓)
        #     "d=www" (5)
        # Row 3 formatted = "a=2" (3). can_pack onto "d=www" (5):
        #   5 + 3 + 1 = 9 ≤ 20 ✓ → packs. Trailing fragment from the
        #   split absorbs the next small row, which is the current v2
        #   behavior (the fragment becomes `current_chunk` and the next
        #   small row is appended with the standard packing rules).
        expected_texts = [
            "S\na=1",
            "a=xxx, b=yyy, c=zzz",
            "d=www\na=2",
        ]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=content_token_limit,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts
        assert all(len(p.text) <= content_token_limit for p in out.payloads)

    def test_prelude_layering_column_header_fits_but_sheet_header_does_not(
        self,
    ) -> None:
        # --- INPUT -----------------------------------------------------
        # Budget lets `Columns: x\nx=y` fit but not the additional sheet
        # header on top. The chunker should add the column header and
        # drop the sheet header.
        #
        # sheet = "LongSheetName" (13), cols = "Columns: x" (10),
        # row = "x=y" (3). Budget = 15.
        #   cols + row: 10+1+3 = 14 ≤ 15 ✓
        #   sheet + cols + row: 13+1+10+1+3 = 28 > 15 ✗
        csv_text = "x\n" "y\n"
        link = "LongSheetName"
        content_token_limit = 15
        # --- EXPECTED --------------------------------------------------
        expected_texts = ["Columns: x\nx=y"]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=content_token_limit,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts

    def test_prelude_layering_sheet_header_fits_but_column_header_does_not(
        self,
    ) -> None:
        # --- INPUT -----------------------------------------------------
        # Budget is too small for the column header but leaves room for
        # the short sheet header. The chunker should fall back to just
        # sheet + row (its layered "try cols, then try sheet on top of
        # whatever we have" logic means sheet is attempted on the bare
        # row when cols didn't fit).
        #
        # sheet = "S" (1), cols = "Columns: ABC, DEF" (17),
        # row = "ABC=1, DEF=2" (12). Budget = 20.
        #   cols + row: 17+1+12 = 30 > 20 ✗
        #   sheet + row: 1+1+12 = 14 ≤ 20 ✓
        csv_text = "ABC,DEF\n" "1,2\n"
        link = "S"
        content_token_limit = 20
        # --- EXPECTED --------------------------------------------------
        expected_texts = ["S\nABC=1, DEF=2"]
        # --- ACT -------------------------------------------------------
        out = _make_chunker().chunk_section(
            _tabular_section(csv_text, link=link),
            AccumulatorState(),
            content_token_limit=content_token_limit,
        )
        # --- ASSERT ----------------------------------------------------
        assert [p.text for p in out.payloads] == expected_texts

View File

@@ -1,927 +0,0 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 72,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "This chart shows how long it takes for Onyx to crawl each source connector and collect the current list of documents. The Y axis represents duration in seconds (bucketed), and each band shows how many enumerations completed within that time range.",
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"calculate": false,
"cellGap": 1,
"color": {
"exponent": 0.5,
"fill": "dark-orange",
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Oranges",
"steps": 64
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 1e-09
},
"legend": {
"show": true
},
"rowsFrame": {
"layout": "auto"
},
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": false
},
"yAxis": {
"axisPlacement": "left",
"reverse": false,
"unit": "s"
}
},
"pluginVersion": "10.4.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(increase(onyx_pruning_enumeration_duration_seconds_bucket[30m])) by (le)",
"format": "heatmap",
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Pruning Enumeration Duration",
"type": "heatmap"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 7,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum(rate(onyx_pruning_enumeration_duration_seconds_bucket[1h])) by (le, connector_type))",
"instant": false,
"legendFormat": "{{connector_type}}",
"range": true,
"refId": "A"
}
],
"title": "Pruning Enumeration Duration p95 by Connector",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Shows how many pruning enumerations completed per hour, broken down by connector type. A low count means few connectors are successfully completing the enumeration phase. A count of 0 for a connector type that should be pruning indicates enumerations are timing out before completion.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(increase(onyx_pruning_enumeration_duration_seconds_count[1h])) by (connector_type)",
"instant": false,
"legendFormat": "{{connector_type}}",
"range": true,
"refId": "A"
}
],
"title": "Pruning Enumeration Count",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Shows the 95th percentile execution duration of pruning tasks. A rising p95 indicates pruning jobs are taking longer over time, potentially approaching the 6-hour timeout limit. Sustained values near 21600s (6 hours) indicate connectors with too many documents to prune within the allowed window.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"id": 5,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "histogram_quantile(0.95, sum(rate(onyx_celery_task_duration_seconds_bucket{task_name=~\"connector_pruning.*\"}[1h])) by (le, task_name))",
"instant": false,
"legendFormat": "{{task_name}}",
"range": true,
"refId": "A"
}
],
"title": "Pruning Task Duration p95",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Shows the number of currently executing pruning tasks on the heavy worker, broken down by task type. A value of 0 means no pruning is actively running. A sustained high count may indicate workers are saturated and new pruning jobs are queuing up.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": [
{
"__systemRef": "hideSeriesFrom",
"matcher": {
"id": "byNames",
"options": {
"mode": "exclude",
"names": [
"connector_pruning_generator_task"
],
"prefix": "All except:",
"readOnly": true
}
},
"properties": [
{
"id": "custom.hideFrom",
"value": {
"legend": false,
"tooltip": false,
"viz": true
}
}
]
}
]
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 4,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(onyx_celery_tasks_active{queue=~\"connector_pruning.*|connector_doc_permissions.*|connector_external_group.*|csv_generation|sandbox\"}) by (task_name)",
"instant": false,
"legendFormat": "{{task_name}}",
"range": true,
"refId": "A"
}
],
"title": "Heavy Worker - Active Tasks",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "This chart shows how frequently Onyx hits rate limits from source connectors during the enumeration phase. Rate limit errors slow down or stall the document crawl, directly increasing enumeration duration. A spike here for a specific connector type indicates the source API is throttling Onyx's requests, which may explain long enumeration times for that connector.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 3,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(onyx_pruning_rate_limit_errors_total[5m])) by (connector_type)",
"instant": false,
"legendFormat": "{{connector_type}}",
"range": true,
"refId": "A"
}
],
"title": "Pruning Rate Limit Errors",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Shows the rate of pruning task failures and revocations per hour. Failures indicate crashed tasks (DB errors, timeouts). Revocations indicate cancelled tasks, typically from worker restarts or deployments. Both result in orphaned fences that block future pruning attempts for affected connectors.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"id": 9,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(increase(onyx_celery_task_revoked_total{task_name=~\"connector_pruning.*\"}[1h])) by (task_name)",
"hide": false,
"instant": false,
"legendFormat": "revoked",
"range": true,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(increase(onyx_celery_task_completed_total{task_name=~\"connector_pruning.*\", outcome=\"failure\"}[1h])) by (task_name)",
"hide": false,
"instant": false,
"legendFormat": "failure",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(increase(onyx_celery_task_completed_total{task_name=~\"connector_pruning.*\", outcome=\"success\"}[1h])) by (task_name)",
"hide": false,
"instant": false,
"legendFormat": "success",
"range": true,
"refId": "C"
}
],
"title": "Heavy Worker - Pruning Task Success & Failures & Revocations",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Shows the ratio of successfully completed pruning tasks to total completed tasks. A value of 1.0 (100%) means all pruning jobs are completing cleanly. A drop indicates tasks are crashing or timing out, which leads to orphaned fences and connectors being blocked from future pruning attempts.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 24
},
"id": 8,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": " sum(rate(onyx_celery_task_completed_total{task_name=~\"connector_pruning.*\", outcome=\"success\"}[1h]))\n /\n sum(rate(onyx_celery_task_completed_total{task_name=~\"connector_pruning.*\"}[1h]))",
"instant": false,
"legendFormat": "Success Rate",
"range": true,
"refId": "A"
}
],
"title": "Heavy Worker - Pruning Task Success Rate",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "This chart shows how long it takes Onyx to compare the list of documents fetched from the source connector against what is currently indexed. The diff computes the set difference \u2014 documents that exist in the index but no longer exist in the source are flagged for removal.",
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 32
},
"id": 2,
"options": {
"calculate": false,
"cellGap": 1,
"color": {
"exponent": 0.5,
"fill": "dark-orange",
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Oranges",
"steps": 64
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 1e-09
},
"legend": {
"show": true
},
"rowsFrame": {
"layout": "auto"
},
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": false
},
"yAxis": {
"axisPlacement": "left",
"reverse": false,
"unit": "s"
}
},
"pluginVersion": "10.4.1",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(increase(onyx_pruning_diff_duration_seconds_bucket[30m])) by (le)",
"format": "heatmap",
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "A"
}
],
"title": "Pruning Diff Duration",
"type": "heatmap"
}
],
"schemaVersion": 39,
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "browser",
"title": "Indexing - Pruning",
"uid": "onyx-indexing-pruning",
"version": 10,
"weekStart": ""
}

View File

@@ -38,17 +38,4 @@ metadata:
data:
onyx-redis-queues.json: |
{{- .Files.Get "dashboards/redis-queues.json" | nindent 4 }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "onyx.fullname" . }}-indexing-pruning-dashboard
labels:
{{- include "onyx.labels" . | nindent 4 }}
grafana_dashboard: "1"
annotations:
grafana_folder: "Onyx"
data:
onyx-indexing-pruning.json: |
{{- .Files.Get "dashboards/indexing-pruning.json" | nindent 4 }}
{{- end }}

388
web/package-lock.json generated
View File

@@ -62,7 +62,7 @@
"mdast-util-find-and-replace": "^3.0.1",
"mime": "^4.1.0",
"motion": "^12.29.0",
"next": "16.2.3",
"next": "16.1.7",
"next-themes": "^0.4.4",
"postcss": "^8.5.6",
"posthog-js": "^1.176.0",
@@ -1676,9 +1676,7 @@
}
},
"node_modules/@img/colour": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/@img/colour/-/colour-1.1.0.tgz",
"integrity": "sha512-Td76q7j57o/tLVdgS746cYARfSyxk8iEfRxewL9h4OMzYhbW4TAcppl0mT4eyqXddh6L/jwoM75mo7ixa/pCeQ==",
"version": "1.0.0",
"license": "MIT",
"optional": true,
"engines": {
@@ -1790,9 +1788,9 @@
}
},
"node_modules/@img/sharp-libvips-linux-ppc64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.2.4.tgz",
"integrity": "sha512-FMuvGijLDYG6lW+b/UvyilUWu5Ayu+3r2d1S8notiGCIyYU/76eig1UfMmkZ7vwgOrzKzlQbFSuQfgm7GYUPpA==",
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-ppc64/-/sharp-libvips-linux-ppc64-1.2.3.tgz",
"integrity": "sha512-Y2T7IsQvJLMCBM+pmPbM3bKT/yYJvVtLJGfCs4Sp95SjvnFIjynbjzsa7dY1fRJX45FTSfDksbTp6AGWudiyCg==",
"cpu": [
"ppc64"
],
@@ -1805,22 +1803,6 @@
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-libvips-linux-riscv64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-riscv64/-/sharp-libvips-linux-riscv64-1.2.4.tgz",
"integrity": "sha512-oVDbcR4zUC0ce82teubSm+x6ETixtKZBh/qbREIOcI3cULzDyb18Sr/Wcyx7NRQeQzOiHTNbZFF1UwPS2scyGA==",
"cpu": [
"riscv64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"linux"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/@img/sharp-libvips-linux-s390x": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.0.4.tgz",
@@ -1930,9 +1912,9 @@
}
},
"node_modules/@img/sharp-linux-ppc64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-ppc64/-/sharp-linux-ppc64-0.34.5.tgz",
"integrity": "sha512-7zznwNaqW6YtsfrGGDA6BRkISKAAE1Jo0QdpNYXNMHu2+0dTrPflTLNkpc8l7MUP5M16ZJcUvysVWWrMefZquA==",
"version": "0.34.4",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-ppc64/-/sharp-linux-ppc64-0.34.4.tgz",
"integrity": "sha512-F4PDtF4Cy8L8hXA2p3TO6s4aDt93v+LKmpcYFLAVdkkD3hSxZzee0rh6/+94FpAynsuMpLX5h+LRsSG3rIciUQ==",
"cpu": [
"ppc64"
],
@@ -1948,29 +1930,7 @@
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linux-ppc64": "1.2.4"
}
},
"node_modules/@img/sharp-linux-riscv64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-riscv64/-/sharp-linux-riscv64-0.34.5.tgz",
"integrity": "sha512-51gJuLPTKa7piYPaVs8GmByo7/U7/7TZOq+cnXJIHZKavIRHAP77e3N2HEl3dgiqdD/w0yUfiJnII77PuDDFdw==",
"cpu": [
"riscv64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"linux"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linux-riscv64": "1.2.4"
"@img/sharp-libvips-linux-ppc64": "1.2.3"
}
},
"node_modules/@img/sharp-linux-s390x": {
@@ -2081,9 +2041,9 @@
}
},
"node_modules/@img/sharp-win32-arm64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.5.tgz",
"integrity": "sha512-WQ3AgWCWYSb2yt+IG8mnC6Jdk9Whs7O0gxphblsLvdhSpSTtmu69ZG1Gkb6NuvxsNACwiPV6cNSZNzt0KPsw7g==",
"version": "0.34.4",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-arm64/-/sharp-win32-arm64-0.34.4.tgz",
"integrity": "sha512-2Q250do/5WXTwxW3zjsEuMSv5sUU4Tq9VThWKlU2EYLm4MB7ZeMwF+SFJutldYODXF6jzc6YEOC+VfX0SZQPqA==",
"cpu": [
"arm64"
],
@@ -2937,9 +2897,9 @@
}
},
"node_modules/@next/env": {
"version": "16.2.3",
"resolved": "https://registry.npmjs.org/@next/env/-/env-16.2.3.tgz",
"integrity": "sha512-ZWXyj4uNu4GCWQw9cjRxWlbD+33mcDszIo9iQxFnBX3Wmgq9ulaSJcl6VhuWx5pCWqqD+9W6Wfz7N0lM5lYPMA==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/env/-/env-16.1.7.tgz",
"integrity": "sha512-rJJbIdJB/RQr2F1nylZr/PJzamvNNhfr3brdKP6s/GW850jbtR70QlSfFselvIBbcPUOlQwBakexjFzqLzF6pg==",
"license": "MIT"
},
"node_modules/@next/eslint-plugin-next": {
@@ -2983,9 +2943,9 @@
}
},
"node_modules/@next/swc-darwin-arm64": {
"version": "16.2.3",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-16.2.3.tgz",
"integrity": "sha512-u37KDKTKQ+OQLvY+z7SNXixwo4Q2/IAJFDzU1fYe66IbCE51aDSAzkNDkWmLN0yjTUh4BKBd+hb69jYn6qqqSg==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-16.1.7.tgz",
"integrity": "sha512-b2wWIE8sABdyafc4IM8r5Y/dS6kD80JRtOGrUiKTsACFQfWWgUQ2NwoUX1yjFMXVsAwcQeNpnucF2ZrujsBBPg==",
"cpu": [
"arm64"
],
@@ -2999,9 +2959,9 @@
}
},
"node_modules/@next/swc-darwin-x64": {
"version": "16.2.3",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-16.2.3.tgz",
"integrity": "sha512-gHjL/qy6Q6CG3176FWbAKyKh9IfntKZTB3RY/YOJdDFpHGsUDXVH38U4mMNpHVGXmeYW4wj22dMp1lTfmu/bTQ==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-16.1.7.tgz",
"integrity": "sha512-zcnVaaZulS1WL0Ss38R5Q6D2gz7MtBu8GZLPfK+73D/hp4GFMrC2sudLky1QibfV7h6RJBJs/gOFvYP0X7UVlQ==",
"cpu": [
"x64"
],
@@ -3015,9 +2975,9 @@
}
},
"node_modules/@next/swc-linux-arm64-gnu": {
"version": "16.2.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-16.2.3.tgz",
"integrity": "sha512-U6vtblPtU/P14Y/b/n9ZY0GOxbbIhTFuaFR7F4/uMBidCi2nSdaOFhA0Go81L61Zd6527+yvuX44T4ksnf8T+Q==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-16.1.7.tgz",
"integrity": "sha512-2ant89Lux/Q3VyC8vNVg7uBaFVP9SwoK2jJOOR0L8TQnX8CAYnh4uctAScy2Hwj2dgjVHqHLORQZJ2wH6VxhSQ==",
"cpu": [
"arm64"
],
@@ -3031,9 +2991,9 @@
}
},
"node_modules/@next/swc-linux-arm64-musl": {
"version": "16.2.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-16.2.3.tgz",
"integrity": "sha512-/YV0LgjHUmfhQpn9bVoGc4x4nan64pkhWR5wyEV8yCOfwwrH630KpvRg86olQHTwHIn1z59uh6JwKvHq1h4QEw==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-16.1.7.tgz",
"integrity": "sha512-uufcze7LYv0FQg9GnNeZ3/whYfo+1Q3HnQpm16o6Uyi0OVzLlk2ZWoY7j07KADZFY8qwDbsmFnMQP3p3+Ftprw==",
"cpu": [
"arm64"
],
@@ -3047,9 +3007,9 @@
}
},
"node_modules/@next/swc-linux-x64-gnu": {
"version": "16.2.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-16.2.3.tgz",
"integrity": "sha512-/HiWEcp+WMZ7VajuiMEFGZ6cg0+aYZPqCJD3YJEfpVWQsKYSjXQG06vJP6F1rdA03COD9Fef4aODs3YxKx+RDQ==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-16.1.7.tgz",
"integrity": "sha512-KWVf2gxYvHtvuT+c4MBOGxuse5TD7DsMFYSxVxRBnOzok/xryNeQSjXgxSv9QpIVlaGzEn/pIuI6Koosx8CGWA==",
"cpu": [
"x64"
],
@@ -3063,9 +3023,9 @@
}
},
"node_modules/@next/swc-linux-x64-musl": {
"version": "16.2.3",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-16.2.3.tgz",
"integrity": "sha512-Kt44hGJfZSefebhk/7nIdivoDr3Ugp5+oNz9VvF3GUtfxutucUIHfIO0ZYO8QlOPDQloUVQn4NVC/9JvHRk9hw==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-16.1.7.tgz",
"integrity": "sha512-HguhaGwsGr1YAGs68uRKc4aGWxLET+NevJskOcCAwXbwj0fYX0RgZW2gsOCzr9S11CSQPIkxmoSbuVaBp4Z3dA==",
"cpu": [
"x64"
],
@@ -3079,9 +3039,9 @@
}
},
"node_modules/@next/swc-win32-arm64-msvc": {
"version": "16.2.3",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-16.2.3.tgz",
"integrity": "sha512-O2NZ9ie3Tq6xj5Z5CSwBT3+aWAMW2PIZ4egUi9MaWLkwaehgtB7YZjPm+UpcNpKOme0IQuqDcor7BsW6QBiQBw==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-16.1.7.tgz",
"integrity": "sha512-S0n3KrDJokKTeFyM/vGGGR8+pCmXYrjNTk2ZozOL1C/JFdfUIL9O1ATaJOl5r2POe56iRChbsszrjMAdWSv7kQ==",
"cpu": [
"arm64"
],
@@ -3095,9 +3055,9 @@
}
},
"node_modules/@next/swc-win32-x64-msvc": {
"version": "16.2.3",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-16.2.3.tgz",
"integrity": "sha512-Ibm29/GgB/ab5n7XKqlStkm54qqZE8v2FnijUPBgrd67FWrac45o/RsNlaOWjme/B5UqeWt/8KM4aWBwA1D2Kw==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-16.1.7.tgz",
"integrity": "sha512-mwgtg8CNZGYm06LeEd+bNnOUfwOyNem/rOiP14Lsz+AnUY92Zq/LXwtebtUiaeVkhbroRCQ0c8GlR4UT1U+0yg==",
"cpu": [
"x64"
],
@@ -14128,12 +14088,12 @@
"license": "MIT"
},
"node_modules/next": {
"version": "16.2.3",
"resolved": "https://registry.npmjs.org/next/-/next-16.2.3.tgz",
"integrity": "sha512-9V3zV4oZFza3PVev5/poB9g0dEafVcgNyQ8eTRop8GvxZjV2G15FC5ARuG1eFD42QgeYkzJBJzHghNP8Ad9xtA==",
"version": "16.1.7",
"resolved": "https://registry.npmjs.org/next/-/next-16.1.7.tgz",
"integrity": "sha512-WM0L7WrSvKwoLegLYr6V+mz+RIofqQgVAfHhMp9a88ms0cFX8iX9ew+snpWlSBwpkURJOUdvCEt3uLl3NNzvWg==",
"license": "MIT",
"dependencies": {
"@next/env": "16.2.3",
"@next/env": "16.1.7",
"@swc/helpers": "0.5.15",
"baseline-browser-mapping": "^2.9.19",
"caniuse-lite": "^1.0.30001579",
@@ -14147,15 +14107,15 @@
"node": ">=20.9.0"
},
"optionalDependencies": {
"@next/swc-darwin-arm64": "16.2.3",
"@next/swc-darwin-x64": "16.2.3",
"@next/swc-linux-arm64-gnu": "16.2.3",
"@next/swc-linux-arm64-musl": "16.2.3",
"@next/swc-linux-x64-gnu": "16.2.3",
"@next/swc-linux-x64-musl": "16.2.3",
"@next/swc-win32-arm64-msvc": "16.2.3",
"@next/swc-win32-x64-msvc": "16.2.3",
"sharp": "^0.34.5"
"@next/swc-darwin-arm64": "16.1.7",
"@next/swc-darwin-x64": "16.1.7",
"@next/swc-linux-arm64-gnu": "16.1.7",
"@next/swc-linux-arm64-musl": "16.1.7",
"@next/swc-linux-x64-gnu": "16.1.7",
"@next/swc-linux-x64-musl": "16.1.7",
"@next/swc-win32-arm64-msvc": "16.1.7",
"@next/swc-win32-x64-msvc": "16.1.7",
"sharp": "^0.34.4"
},
"peerDependencies": {
"@opentelemetry/api": "^1.1.0",
@@ -14188,32 +14148,10 @@
"react-dom": "^16.8 || ^17 || ^18 || ^19 || ^19.0.0-rc"
}
},
"node_modules/next/node_modules/@img/sharp-darwin-arm64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.34.5.tgz",
"integrity": "sha512-imtQ3WMJXbMY4fxb/Ndp6HBTNVtWCUI0WdobyheGf5+ad6xX8VIDO8u2xE4qc/fr08CKG/7dDseFtn6M6g/r3w==",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-darwin-arm64": "1.2.4"
}
},
"node_modules/next/node_modules/@img/sharp-darwin-x64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.5.tgz",
"integrity": "sha512-YNEFAF/4KQ/PeW0N+r+aVVsoIY0/qxxikF2SWdp+NRkmMB7y9LBZAVqQ4yhGCm/H3H270OSykqmQMKLBhBJDEw==",
"version": "0.34.4",
"resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.34.4.tgz",
"integrity": "sha512-rZheupWIoa3+SOdF/IcUe1ah4ZDpKBGWcsPX6MT0lYniH9micvIU7HQkYTfrx5Xi8u+YqwLtxC/3vl8TQN6rMg==",
"cpu": [
"x64"
],
@@ -14229,29 +14167,13 @@
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-darwin-x64": "1.2.4"
}
},
"node_modules/next/node_modules/@img/sharp-libvips-darwin-arm64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.2.4.tgz",
"integrity": "sha512-zqjjo7RatFfFoP0MkQ51jfuFZBnVE2pRiaydKJ1G/rHZvnsrHAOcQALIi9sA5co5xenQdTugCvtb1cuf78Vf4g==",
"cpu": [
"arm64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"darwin"
],
"funding": {
"url": "https://opencollective.com/libvips"
"@img/sharp-libvips-darwin-x64": "1.2.3"
}
},
"node_modules/next/node_modules/@img/sharp-libvips-darwin-x64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.4.tgz",
"integrity": "sha512-1IOd5xfVhlGwX+zXv2N93k0yMONvUlANylbJw1eTah8K/Jtpi15KC+WSiaX/nBmbm2HxRM1gZ0nSdjSsrZbGKg==",
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.2.3.tgz",
"integrity": "sha512-Ju+g2xn1E2AKO6YBhxjj+ACcsPQRHT0bhpglxcEf+3uyPY+/gL8veniKoo96335ZaPo03bdDXMv0t+BBFAbmRA==",
"cpu": [
"x64"
],
@@ -14265,9 +14187,9 @@
}
},
"node_modules/next/node_modules/@img/sharp-libvips-linux-arm": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.4.tgz",
"integrity": "sha512-bFI7xcKFELdiNCVov8e44Ia4u2byA+l3XtsAj+Q8tfCwO6BQ8iDojYdvoPMqsKDkuoOo+X6HZA0s0q11ANMQ8A==",
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.2.3.tgz",
"integrity": "sha512-x1uE93lyP6wEwGvgAIV0gP6zmaL/a0tGzJs/BIDDG0zeBhMnuUPm7ptxGhUbcGs4okDJrk4nxgrmxpib9g6HpA==",
"cpu": [
"arm"
],
@@ -14281,9 +14203,9 @@
}
},
"node_modules/next/node_modules/@img/sharp-libvips-linux-arm64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.4.tgz",
"integrity": "sha512-excjX8DfsIcJ10x1Kzr4RcWe1edC9PquDRRPx3YVCvQv+U5p7Yin2s32ftzikXojb1PIFc/9Mt28/y+iRklkrw==",
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.2.3.tgz",
"integrity": "sha512-I4RxkXU90cpufazhGPyVujYwfIm9Nk1QDEmiIsaPwdnm013F7RIceaCc87kAH+oUB1ezqEvC6ga4m7MSlqsJvQ==",
"cpu": [
"arm64"
],
@@ -14297,9 +14219,9 @@
}
},
"node_modules/next/node_modules/@img/sharp-libvips-linux-s390x": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.2.4.tgz",
"integrity": "sha512-qmp9VrzgPgMoGZyPvrQHqk02uyjA0/QrTO26Tqk6l4ZV0MPWIW6LTkqOIov+J1yEu7MbFQaDpwdwJKhbJvuRxQ==",
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.2.3.tgz",
"integrity": "sha512-RgWrs/gVU7f+K7P+KeHFaBAJlNkD1nIZuVXdQv6S+fNA6syCcoboNjsV2Pou7zNlVdNQoQUpQTk8SWDHUA3y/w==",
"cpu": [
"s390x"
],
@@ -14313,9 +14235,9 @@
}
},
"node_modules/next/node_modules/@img/sharp-libvips-linux-x64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.4.tgz",
"integrity": "sha512-tJxiiLsmHc9Ax1bz3oaOYBURTXGIRDODBqhveVHonrHJ9/+k89qbLl0bcJns+e4t4rvaNBxaEZsFtSfAdquPrw==",
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.2.3.tgz",
"integrity": "sha512-3JU7LmR85K6bBiRzSUc/Ff9JBVIFVvq6bomKE0e63UXGeRw2HPVEjoJke1Yx+iU4rL7/7kUjES4dZ/81Qjhyxg==",
"cpu": [
"x64"
],
@@ -14329,9 +14251,9 @@
}
},
"node_modules/next/node_modules/@img/sharp-libvips-linuxmusl-arm64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.4.tgz",
"integrity": "sha512-FVQHuwx1IIuNow9QAbYUzJ+En8KcVm9Lk5+uGUQJHaZmMECZmOlix9HnH7n1TRkXMS0pGxIJokIVB9SuqZGGXw==",
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.2.3.tgz",
"integrity": "sha512-F9q83RZ8yaCwENw1GieztSfj5msz7GGykG/BA+MOUefvER69K/ubgFHNeSyUu64amHIYKGDs4sRCMzXVj8sEyw==",
"cpu": [
"arm64"
],
@@ -14345,9 +14267,9 @@
}
},
"node_modules/next/node_modules/@img/sharp-libvips-linuxmusl-x64": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.4.tgz",
"integrity": "sha512-+LpyBk7L44ZIXwz/VYfglaX/okxezESc6UxDSoyo2Ks6Jxc4Y7sGjpgU9s4PMgqgjj1gZCylTieNamqA1MF7Dg==",
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.2.3.tgz",
"integrity": "sha512-U5PUY5jbc45ANM6tSJpsgqmBF/VsL6LnxJmIf11kB7J5DctHgqm0SkuXzVWtIY90GnJxKnC/JT251TDnk1fu/g==",
"cpu": [
"x64"
],
@@ -14361,9 +14283,9 @@
}
},
"node_modules/next/node_modules/@img/sharp-linux-arm": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.5.tgz",
"integrity": "sha512-9dLqsvwtg1uuXBGZKsxem9595+ujv0sJ6Vi8wcTANSFpwV/GONat5eCkzQo/1O6zRIkh0m/8+5BjrRr7jDUSZw==",
"version": "0.34.4",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.34.4.tgz",
"integrity": "sha512-Xyam4mlqM0KkTHYVSuc6wXRmM7LGN0P12li03jAnZ3EJWZqj83+hi8Y9UxZUbxsgsK1qOEwg7O0Bc0LjqQVtxA==",
"cpu": [
"arm"
],
@@ -14379,13 +14301,13 @@
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linux-arm": "1.2.4"
"@img/sharp-libvips-linux-arm": "1.2.3"
}
},
"node_modules/next/node_modules/@img/sharp-linux-arm64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.5.tgz",
"integrity": "sha512-bKQzaJRY/bkPOXyKx5EVup7qkaojECG6NLYswgktOZjaXecSAeCWiZwwiFf3/Y+O1HrauiE3FVsGxFg8c24rZg==",
"version": "0.34.4",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.34.4.tgz",
"integrity": "sha512-YXU1F/mN/Wu786tl72CyJjP/Ngl8mGHN1hST4BGl+hiW5jhCnV2uRVTNOcaYPs73NeT/H8Upm3y9582JVuZHrQ==",
"cpu": [
"arm64"
],
@@ -14401,13 +14323,13 @@
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linux-arm64": "1.2.4"
"@img/sharp-libvips-linux-arm64": "1.2.3"
}
},
"node_modules/next/node_modules/@img/sharp-linux-s390x": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.5.tgz",
"integrity": "sha512-nQtCk0PdKfho3eC5MrbQoigJ2gd1CgddUMkabUj+rBevs8tZ2cULOx46E7oyX+04WGfABgIwmMC0VqieTiR4jg==",
"version": "0.34.4",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.34.4.tgz",
"integrity": "sha512-qVrZKE9Bsnzy+myf7lFKvng6bQzhNUAYcVORq2P7bDlvmF6u2sCmK2KyEQEBdYk+u3T01pVsPrkj943T1aJAsw==",
"cpu": [
"s390x"
],
@@ -14423,13 +14345,13 @@
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linux-s390x": "1.2.4"
"@img/sharp-libvips-linux-s390x": "1.2.3"
}
},
"node_modules/next/node_modules/@img/sharp-linux-x64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.5.tgz",
"integrity": "sha512-MEzd8HPKxVxVenwAa+JRPwEC7QFjoPWuS5NZnBt6B3pu7EG2Ge0id1oLHZpPJdn3OQK+BQDiw9zStiHBTJQQQQ==",
"version": "0.34.4",
"resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.34.4.tgz",
"integrity": "sha512-ZfGtcp2xS51iG79c6Vhw9CWqQC8l2Ot8dygxoDoIQPTat/Ov3qAa8qpxSrtAEAJW+UjTXc4yxCjNfxm4h6Xm2A==",
"cpu": [
"x64"
],
@@ -14445,13 +14367,13 @@
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linux-x64": "1.2.4"
"@img/sharp-libvips-linux-x64": "1.2.3"
}
},
"node_modules/next/node_modules/@img/sharp-linuxmusl-arm64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.5.tgz",
"integrity": "sha512-fprJR6GtRsMt6Kyfq44IsChVZeGN97gTD331weR1ex1c1rypDEABN6Tm2xa1wE6lYb5DdEnk03NZPqA7Id21yg==",
"version": "0.34.4",
"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.34.4.tgz",
"integrity": "sha512-8hDVvW9eu4yHWnjaOOR8kHVrew1iIX+MUgwxSuH2XyYeNRtLUe4VNioSqbNkB7ZYQJj9rUTT4PyRscyk2PXFKA==",
"cpu": [
"arm64"
],
@@ -14467,13 +14389,13 @@
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linuxmusl-arm64": "1.2.4"
"@img/sharp-libvips-linuxmusl-arm64": "1.2.3"
}
},
"node_modules/next/node_modules/@img/sharp-linuxmusl-x64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.5.tgz",
"integrity": "sha512-Jg8wNT1MUzIvhBFxViqrEhWDGzqymo3sV7z7ZsaWbZNDLXRJZoRGrjulp60YYtV4wfY8VIKcWidjojlLcWrd8Q==",
"version": "0.34.4",
"resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.34.4.tgz",
"integrity": "sha512-lU0aA5L8QTlfKjpDCEFOZsTYGn3AEiO6db8W5aQDxj0nQkVrZWmN3ZP9sYKWJdtq3PWPhUNlqehWyXpYDcI9Sg==",
"cpu": [
"x64"
],
@@ -14489,20 +14411,20 @@
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-linuxmusl-x64": "1.2.4"
"@img/sharp-libvips-linuxmusl-x64": "1.2.3"
}
},
"node_modules/next/node_modules/@img/sharp-wasm32": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.5.tgz",
"integrity": "sha512-OdWTEiVkY2PHwqkbBI8frFxQQFekHaSSkUIJkwzclWZe64O1X4UlUjqqqLaPbUpMOQk6FBu/HtlGXNblIs0huw==",
"version": "0.34.4",
"resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.34.4.tgz",
"integrity": "sha512-33QL6ZO/qpRyG7woB/HUALz28WnTMI2W1jgX3Nu2bypqLIKx/QKMILLJzJjI+SIbvXdG9fUnmrxR7vbi1sTBeA==",
"cpu": [
"wasm32"
],
"license": "Apache-2.0 AND LGPL-3.0-or-later AND MIT",
"optional": true,
"dependencies": {
"@emnapi/runtime": "^1.7.0"
"@emnapi/runtime": "^1.5.0"
},
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
@@ -14512,9 +14434,9 @@
}
},
"node_modules/next/node_modules/@img/sharp-win32-ia32": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.5.tgz",
"integrity": "sha512-FV9m/7NmeCmSHDD5j4+4pNI8Cp3aW+JvLoXcTUo0IqyjSfAZJ8dIUmijx1qaJsIiU+Hosw6xM5KijAWRJCSgNg==",
"version": "0.34.4",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.34.4.tgz",
"integrity": "sha512-3ZeLue5V82dT92CNL6rsal6I2weKw1cYu+rGKm8fOCCtJTR2gYeUfY3FqUnIJsMUPIH68oS5jmZ0NiJ508YpEw==",
"cpu": [
"ia32"
],
@@ -14531,9 +14453,9 @@
}
},
"node_modules/next/node_modules/@img/sharp-win32-x64": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.5.tgz",
"integrity": "sha512-+29YMsqY2/9eFEiW93eqWnuLcWcufowXewwSNIT6UwZdUUCrM3oFjMWH/Z6/TMmb4hlFenmfAVbpWeup2jryCw==",
"version": "0.34.4",
"resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.34.4.tgz",
"integrity": "sha512-xIyj4wpYs8J18sVN3mSQjwrw7fKUqRw+Z5rnHNCy5fYTxigBz81u5mOMPmFumwjcn8+ld1ppptMBCLic1nz6ig==",
"cpu": [
"x64"
],
@@ -14576,16 +14498,14 @@
}
},
"node_modules/next/node_modules/sharp": {
"version": "0.34.5",
"resolved": "https://registry.npmjs.org/sharp/-/sharp-0.34.5.tgz",
"integrity": "sha512-Ou9I5Ft9WNcCbXrU9cMgPBcCK8LiwLqcbywW3t4oDV37n1pzpuNLsYiAV8eODnjbtQlSDwZ2cUEeQz4E54Hltg==",
"version": "0.34.4",
"hasInstallScript": true,
"license": "Apache-2.0",
"optional": true,
"dependencies": {
"@img/colour": "^1.0.0",
"detect-libc": "^2.1.2",
"semver": "^7.7.3"
"detect-libc": "^2.1.0",
"semver": "^7.7.2"
},
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
@@ -14594,30 +14514,62 @@
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-darwin-arm64": "0.34.5",
"@img/sharp-darwin-x64": "0.34.5",
"@img/sharp-libvips-darwin-arm64": "1.2.4",
"@img/sharp-libvips-darwin-x64": "1.2.4",
"@img/sharp-libvips-linux-arm": "1.2.4",
"@img/sharp-libvips-linux-arm64": "1.2.4",
"@img/sharp-libvips-linux-ppc64": "1.2.4",
"@img/sharp-libvips-linux-riscv64": "1.2.4",
"@img/sharp-libvips-linux-s390x": "1.2.4",
"@img/sharp-libvips-linux-x64": "1.2.4",
"@img/sharp-libvips-linuxmusl-arm64": "1.2.4",
"@img/sharp-libvips-linuxmusl-x64": "1.2.4",
"@img/sharp-linux-arm": "0.34.5",
"@img/sharp-linux-arm64": "0.34.5",
"@img/sharp-linux-ppc64": "0.34.5",
"@img/sharp-linux-riscv64": "0.34.5",
"@img/sharp-linux-s390x": "0.34.5",
"@img/sharp-linux-x64": "0.34.5",
"@img/sharp-linuxmusl-arm64": "0.34.5",
"@img/sharp-linuxmusl-x64": "0.34.5",
"@img/sharp-wasm32": "0.34.5",
"@img/sharp-win32-arm64": "0.34.5",
"@img/sharp-win32-ia32": "0.34.5",
"@img/sharp-win32-x64": "0.34.5"
"@img/sharp-darwin-arm64": "0.34.4",
"@img/sharp-darwin-x64": "0.34.4",
"@img/sharp-libvips-darwin-arm64": "1.2.3",
"@img/sharp-libvips-darwin-x64": "1.2.3",
"@img/sharp-libvips-linux-arm": "1.2.3",
"@img/sharp-libvips-linux-arm64": "1.2.3",
"@img/sharp-libvips-linux-ppc64": "1.2.3",
"@img/sharp-libvips-linux-s390x": "1.2.3",
"@img/sharp-libvips-linux-x64": "1.2.3",
"@img/sharp-libvips-linuxmusl-arm64": "1.2.3",
"@img/sharp-libvips-linuxmusl-x64": "1.2.3",
"@img/sharp-linux-arm": "0.34.4",
"@img/sharp-linux-arm64": "0.34.4",
"@img/sharp-linux-ppc64": "0.34.4",
"@img/sharp-linux-s390x": "0.34.4",
"@img/sharp-linux-x64": "0.34.4",
"@img/sharp-linuxmusl-arm64": "0.34.4",
"@img/sharp-linuxmusl-x64": "0.34.4",
"@img/sharp-wasm32": "0.34.4",
"@img/sharp-win32-arm64": "0.34.4",
"@img/sharp-win32-ia32": "0.34.4",
"@img/sharp-win32-x64": "0.34.4"
}
},
"node_modules/next/node_modules/sharp/node_modules/@img/sharp-darwin-arm64": {
"version": "0.34.4",
"cpu": [
"arm64"
],
"license": "Apache-2.0",
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": "^18.17.0 || ^20.3.0 || >=21.0.0"
},
"funding": {
"url": "https://opencollective.com/libvips"
},
"optionalDependencies": {
"@img/sharp-libvips-darwin-arm64": "1.2.3"
}
},
"node_modules/next/node_modules/sharp/node_modules/@img/sharp-libvips-darwin-arm64": {
"version": "1.2.3",
"cpu": [
"arm64"
],
"license": "LGPL-3.0-or-later",
"optional": true,
"os": [
"darwin"
],
"funding": {
"url": "https://opencollective.com/libvips"
}
},
"node_modules/node-fetch": {

View File

@@ -80,7 +80,7 @@
"mdast-util-find-and-replace": "^3.0.1",
"mime": "^4.1.0",
"motion": "^12.29.0",
"next": "16.2.3",
"next": "16.1.7",
"next-themes": "^0.4.4",
"postcss": "^8.5.6",
"posthog-js": "^1.176.0",

View File

@@ -136,49 +136,32 @@ const AgentMessage = React.memo(function AgentMessage({
finalAnswerComing
);
// Merge streaming citation/document data with chatState props.
// NOTE: citationMap and documentMap from usePacketProcessor are mutated in
// place (same object reference), so we use citations.length / documentMap.size
// as change-detection proxies to bust the memo cache when new data arrives.
// Memoize merged citations separately to avoid creating new object when neither source changed
const mergedCitations = useMemo(
() => ({
...chatState.citations,
...citationMap,
}),
// eslint-disable-next-line react-hooks/exhaustive-deps
[chatState.citations, citationMap, citations.length]
[chatState.citations, citationMap]
);
// Merge streaming documentMap into chatState.docs so inline citation chips
// can resolve [1] → document even when chatState.docs is empty (multi-model).
const mergedDocs = useMemo(() => {
const propDocs = chatState.docs ?? [];
if (documentMap.size === 0) return propDocs;
const seen = new Set(propDocs.map((d) => d.document_id));
const extras = Array.from(documentMap.values()).filter(
(d) => !seen.has(d.document_id)
);
return extras.length > 0 ? [...propDocs, ...extras] : propDocs;
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [chatState.docs, documentMap, documentMap.size]);
// Create a chatState that uses streaming citations and documents for immediate rendering.
// Memoized with granular dependencies to prevent cascading re-renders.
// Create a chatState that uses streaming citations for immediate rendering
// This merges the prop citations with streaming citations, preferring streaming ones
// Memoized with granular dependencies to prevent cascading re-renders
// Note: chatState object is recreated upstream on every render, so we depend on
// individual fields instead of the whole object for proper memoization.
// individual fields instead of the whole object for proper memoization
const effectiveChatState = useMemo<FullChatState>(
() => ({
...chatState,
citations: mergedCitations,
docs: mergedDocs,
}),
[
chatState.agent,
chatState.docs,
chatState.setPresentingDocument,
chatState.overriddenModel,
chatState.researchType,
mergedCitations,
mergedDocs,
]
);