mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-04-16 06:56:51 +00:00
Compare commits
7 Commits
v3.2.0-clo
...
dane/csv6
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
957f05618e | ||
|
|
3454fa6949 | ||
|
|
c55ab0cdf6 | ||
|
|
454479016c | ||
|
|
9839e46658 | ||
|
|
a50ce50d8b | ||
|
|
818f927ba0 |
@@ -0,0 +1,5 @@
|
||||
from onyx.indexing.chunking.tabular_section_chunker.tabular_section_chunker import (
|
||||
TabularChunker,
|
||||
)
|
||||
|
||||
__all__ = ["TabularChunker"]
|
||||
@@ -0,0 +1,229 @@
|
||||
"""Per-section sheet descriptor chunk builder."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import field
|
||||
from datetime import date
|
||||
from itertools import zip_longest
|
||||
|
||||
from dateutil.parser import parse as parse_dt
|
||||
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
from onyx.natural_language_processing.utils import count_tokens
|
||||
from onyx.utils.csv_utils import parse_csv_string
|
||||
from onyx.utils.csv_utils import ParsedRow
|
||||
from onyx.utils.csv_utils import read_csv_header
|
||||
|
||||
|
||||
# Presentation caps for the generated descriptor lines — columns beyond a cap
# are silently omitted so the descriptor chunk stays compact.
MAX_NUMERIC_COLS = 12
MAX_CATEGORICAL_COLS = 6
# Only the first few categorical columns also get a "Values seen in ..." line.
MAX_CATEGORICAL_WITH_SAMPLES = 4
# At most this many distinct values are listed per "Values seen in ..." line.
MAX_DISTINCT_SAMPLES = 8
# A column counts as categorical when its distinct-value count is at most
# max(CATEGORICAL_DISTINCT_THRESHOLD, half its non-empty values) — see _analyze.
CATEGORICAL_DISTINCT_THRESHOLD = 20
# Header names (lowercased, "-" folded to "_") that mark identifier columns,
# matched as the whole name or as a "_<token>" suffix — see _is_id_name.
ID_NAME_TOKENS = {"id", "uuid", "uid", "guid", "key"}
|
||||
|
||||
|
||||
@dataclass
class SheetAnalysis:
    """Aggregate per-column statistics for one CSV sheet, produced by
    ``_analyze`` and consumed by the descriptor line builders."""

    # Number of data rows (header excluded).
    row_count: int
    # Number of header columns.
    num_cols: int
    # Indices of columns whose non-empty values all parse as numbers.
    numeric_cols: list[int] = field(default_factory=list)
    # Indices of low-cardinality (categorical) columns.
    categorical_cols: list[int] = field(default_factory=list)
    # Distinct values per categorical column index (unordered).
    categorical_values: dict[int, list[str]] = field(default_factory=dict)
    # Index of the first id-named, all-unique column, if any.
    id_col: int | None = None
    # Sheet-wide date range folded over every all-date column.
    date_min: date | None = None
    date_max: date | None = None
|
||||
|
||||
|
||||
def build_sheet_descriptor_chunks(
    section: Section,
    tokenizer: BaseTokenizer,
    max_tokens: int,
) -> list[str]:
    """Build sheet descriptor chunk(s) from a parsed CSV section.

    The descriptor lines are joined by newlines; any line that alone
    overflows ``max_tokens`` is dropped, and ``section.heading`` is
    prepended to every emitted chunk so retrieval keeps sheet context
    after a split.  Layout of the emitted text:

        {section.heading}                                     # optional
        Sheet overview.
        This sheet has {N} rows and {M} columns.
        Columns: {col1}, {col2}, ...
        Time range: {start} to {end}.                         # optional
        Numeric columns (aggregatable by sum, average, min, max): ...  # optional
        Categorical columns (groupable, can be counted by value): ...  # optional
        Identifier column: {col}.                             # optional
        Values seen in {col}: {v1}, {v2}, ...                 # optional, repeated
    """
    csv_text = section.text or ""
    rows = list(parse_csv_string(csv_text))

    # Header comes from the first parsed row when data exists; otherwise it is
    # read straight from the raw text so header-only sheets still describe.
    if rows:
        headers = rows[0].header
    else:
        headers = read_csv_header(csv_text)
    if not headers:
        return []

    analysis = _analyze(headers, rows)
    candidate_lines = (
        _overview_line(analysis),
        _columns_line(headers),
        _time_range_line(analysis),
        _numeric_cols_line(headers, analysis),
        _categorical_cols_line(headers, analysis),
        _id_col_line(headers, analysis),
        _values_seen_line(headers, analysis),
    )
    # Builders return "" for lines that do not apply — drop those.
    emitted = [candidate for candidate in candidate_lines if candidate]
    return _pack_lines(
        emitted,
        prefix=section.heading or "",
        tokenizer=tokenizer,
        max_tokens=max_tokens,
    )
|
||||
|
||||
|
||||
def _overview_line(a: SheetAnalysis) -> str:
    # Always present — anchors the descriptor with the row/column counts.
    title = "Sheet overview."
    summary = f"This sheet has {a.row_count} rows and {a.num_cols} columns."
    return title + "\n" + summary
|
||||
|
||||
|
||||
def _columns_line(headers: list[str]) -> str:
    # Every header is listed, each with its friendly alias where applicable.
    labeled = [_label(header) for header in headers]
    return "Columns: " + ", ".join(labeled)
|
||||
|
||||
|
||||
def _time_range_line(a: SheetAnalysis) -> str:
    # Only emitted when at least one all-date column contributed a range.
    if a.date_min is None or a.date_max is None:
        return ""
    return f"Time range: {a.date_min} to {a.date_max}."
|
||||
|
||||
|
||||
def _numeric_cols_line(headers: list[str], a: SheetAnalysis) -> str:
    if not a.numeric_cols:
        return ""
    # Cap the listing so one very wide sheet cannot bloat the descriptor.
    shown = a.numeric_cols[:MAX_NUMERIC_COLS]
    names = ", ".join(_label(headers[i]) for i in shown)
    return f"Numeric columns (aggregatable by sum, average, min, max): {names}"
|
||||
|
||||
|
||||
def _categorical_cols_line(headers: list[str], a: SheetAnalysis) -> str:
    if not a.categorical_cols:
        return ""
    # Cap the listing so one very wide sheet cannot bloat the descriptor.
    shown = [_label(headers[i]) for i in a.categorical_cols[:MAX_CATEGORICAL_COLS]]
    names = ", ".join(shown)
    return f"Categorical columns (groupable, can be counted by value): {names}"
|
||||
|
||||
|
||||
def _id_col_line(headers: list[str], a: SheetAnalysis) -> str:
    # Column 0 is a valid identifier index, so compare against None explicitly.
    if a.id_col is None:
        return ""
    return f"Identifier column: {_label(headers[a.id_col])}."
|
||||
|
||||
|
||||
def _values_seen_line(headers: list[str], a: SheetAnalysis) -> str:
    out: list[str] = []
    for col in a.categorical_cols[:MAX_CATEGORICAL_WITH_SAMPLES]:
        distinct = a.categorical_values.get(col, [])
        # Sort for deterministic output, then cap the sample size.
        sample = sorted(distinct)[:MAX_DISTINCT_SAMPLES]
        if not sample:
            continue
        out.append(f"Values seen in {_label(headers[col])}: " + ", ".join(sample))
    return "\n".join(out)
|
||||
|
||||
|
||||
def _label(name: str) -> str:
|
||||
return f"{name} ({name.replace('_', ' ')})" if "_" in name else name
|
||||
|
||||
|
||||
def _is_numeric(value: str) -> bool:
|
||||
try:
|
||||
float(value.replace(",", ""))
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def _try_date(value: str) -> date | None:
    """Parse *value* as a calendar date, or return None if it does not
    look like one.

    The cheap pre-filter (length >= 4 and at least one of "-", "/", "T")
    keeps obviously non-date strings — bare integers, short codes — away
    from the comparatively expensive dateutil parser.
    """
    if len(value) < 4 or not any(c in value for c in "-/T"):
        return None
    try:
        return parse_dt(value).date()
    except (ValueError, OverflowError, TypeError):
        # dateutil raises any of these on unparseable or out-of-range input.
        return None
|
||||
|
||||
|
||||
def _is_id_name(name: str) -> bool:
    # Normalize: case-insensitive, surrounding whitespace dropped, hyphens
    # treated like underscores so "user-id" matches "user_id".
    normalized = name.lower().strip().replace("-", "_")
    if normalized in ID_NAME_TOKENS:
        return True
    return any(normalized.endswith("_" + token) for token in ID_NAME_TOKENS)
|
||||
|
||||
|
||||
def _analyze(headers: list[str], parsed_rows: list[ParsedRow]) -> SheetAnalysis:
    """Classify each column (identifier / numeric / date / categorical) and
    collect the per-sheet statistics consumed by the descriptor line builders.

    Classification is first-match-wins in the order written below: a column
    that is all-numeric never reaches the date or categorical checks.
    """
    a = SheetAnalysis(row_count=len(parsed_rows), num_cols=len(headers))
    # Transpose rows into column iterators; short rows are padded with "" so
    # every column yields one entry per row.
    columns = zip_longest(*(pr.row for pr in parsed_rows), fillvalue="")
    for idx, (header, raw_values) in enumerate(zip(headers, columns)):
        # Pull the column's non-empty values; skip if the column is blank.
        values = [v.strip() for v in raw_values if v.strip()]
        if not values:
            continue

        # Identifier: id-named column whose values are all unique. Detected
        # before classification so a numeric `id` column still gets flagged.
        distinct = set(values)
        if a.id_col is None and len(distinct) == len(values) and _is_id_name(header):
            a.id_col = idx

        # Numeric: every value parses as a number.
        if all(_is_numeric(v) for v in values):
            a.numeric_cols.append(idx)
            continue

        # Date: every value parses as a date — fold into the sheet-wide range.
        dates = [_try_date(v) for v in values]
        if all(d is not None for d in dates):
            dmin = min(filter(None, dates))
            dmax = max(filter(None, dates))
            a.date_min = dmin if a.date_min is None else min(a.date_min, dmin)
            a.date_max = dmax if a.date_max is None else max(a.date_max, dmax)
            continue

        # Categorical: low-cardinality column — keep distinct values for samples.
        # "Low" scales with the data: the fixed threshold for small sheets, or
        # half the non-empty values for larger ones.
        if len(distinct) <= max(CATEGORICAL_DISTINCT_THRESHOLD, len(values) // 2):
            a.categorical_cols.append(idx)
            a.categorical_values[idx] = list(distinct)
    return a
|
||||
|
||||
|
||||
def _pack_lines(
    lines: list[str],
    prefix: str,
    tokenizer: BaseTokenizer,
    max_tokens: int,
) -> list[str]:
    """Greedily pack lines into newline-joined chunks of at most
    ``max_tokens`` tokens.  ``prefix`` is prepended to every emitted
    chunk; any line that by itself overflows the per-chunk budget
    (after accounting for the prefix) is dropped."""
    # Reserve room for the prefix plus the newline that joins it on.
    overhead = count_tokens(prefix, tokenizer) + 1 if prefix else 0
    budget = max_tokens - overhead

    packed: list[str] = []
    pending: list[str] = []
    used = 0
    for line in lines:
        cost = count_tokens(line, tokenizer)
        if cost > budget:
            # Oversize line: cannot fit even alone — skip it entirely.
            continue
        joiner = 1 if pending else 0
        if used + joiner + cost <= budget:
            pending.append(line)
            used += joiner + cost
        else:
            # Current chunk is full — flush it and start a new one.
            packed.append(_join_with_prefix(pending, prefix))
            pending = [line]
            used = cost
    if pending:
        packed.append(_join_with_prefix(pending, prefix))
    return packed
|
||||
|
||||
|
||||
def _join_with_prefix(lines: list[str], prefix: str) -> str:
|
||||
body = "\n".join(lines)
|
||||
return f"{prefix}\n{body}" if prefix else body
|
||||
@@ -1,5 +1,3 @@
|
||||
import csv
|
||||
import io
|
||||
from collections.abc import Iterable
|
||||
|
||||
from pydantic import BaseModel
|
||||
@@ -9,9 +7,14 @@ from onyx.indexing.chunking.section_chunker import AccumulatorState
|
||||
from onyx.indexing.chunking.section_chunker import ChunkPayload
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunker
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunkerOutput
|
||||
from onyx.indexing.chunking.tabular_section_chunker.sheet_descriptor import (
|
||||
build_sheet_descriptor_chunks,
|
||||
)
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
from onyx.natural_language_processing.utils import count_tokens
|
||||
from onyx.natural_language_processing.utils import split_text_by_tokens
|
||||
from onyx.utils.csv_utils import parse_csv_string
|
||||
from onyx.utils.csv_utils import ParsedRow
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
@@ -23,11 +26,6 @@ ROW_JOIN = "\n"
|
||||
NEWLINE_TOKENS = 1
|
||||
|
||||
|
||||
class _ParsedRow(BaseModel):
|
||||
header: list[str]
|
||||
row: list[str]
|
||||
|
||||
|
||||
class _TokenizedText(BaseModel):
|
||||
text: str
|
||||
token_count: int
|
||||
@@ -60,23 +58,6 @@ def format_columns_header(headers: list[str]) -> str:
|
||||
return f"{COLUMNS_MARKER} " + FIELD_VALUE_SEPARATOR.join(parts)
|
||||
|
||||
|
||||
def parse_section(section: Section) -> list[_ParsedRow]:
    """Parse CSV into headers + rows. First non-empty row is the header;
    blank rows are skipped.

    Returns [] when the section has no text or no non-blank rows.
    """
    section_text = section.text or ""
    if not section_text.strip():
        return []

    reader = csv.reader(io.StringIO(section_text))
    # A row counts as non-empty when any cell has non-whitespace content.
    non_empty_rows = [row for row in reader if any(cell.strip() for cell in row)]

    if not non_empty_rows:
        return []

    # First surviving row is the header; every later row is paired with it.
    header, *data_rows = non_empty_rows
    return [_ParsedRow(header=header, row=row) for row in data_rows]
|
||||
|
||||
|
||||
def _row_to_pairs(headers: list[str], row: list[str]) -> list[tuple[str, str]]:
|
||||
return [(h, v) for h, v in zip(headers, row) if v.strip()]
|
||||
|
||||
@@ -175,7 +156,7 @@ def _build_chunk_from_scratch(
|
||||
|
||||
|
||||
def parse_to_chunks(
|
||||
rows: Iterable[_ParsedRow],
|
||||
rows: Iterable[ParsedRow],
|
||||
sheet_header: str,
|
||||
tokenizer: BaseTokenizer,
|
||||
max_tokens: int,
|
||||
@@ -233,8 +214,13 @@ def parse_to_chunks(
|
||||
|
||||
|
||||
class TabularChunker(SectionChunker):
|
||||
def __init__(self, tokenizer: BaseTokenizer) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer: BaseTokenizer,
|
||||
ignore_metadata_chunks: bool = False,
|
||||
) -> None:
|
||||
self.tokenizer = tokenizer
|
||||
self.ignore_metadata_chunks = ignore_metadata_chunks
|
||||
|
||||
def chunk_section(
|
||||
self,
|
||||
@@ -244,8 +230,30 @@ class TabularChunker(SectionChunker):
|
||||
) -> SectionChunkerOutput:
|
||||
payloads = accumulator.flush_to_list()
|
||||
|
||||
parsed_rows = parse_section(section)
|
||||
if not parsed_rows:
|
||||
parsed_rows = list(parse_csv_string(section.text or ""))
|
||||
sheet_header = section.heading or ""
|
||||
|
||||
chunk_texts: list[str] = []
|
||||
if parsed_rows:
|
||||
chunk_texts.extend(
|
||||
parse_to_chunks(
|
||||
rows=parsed_rows,
|
||||
sheet_header=sheet_header,
|
||||
tokenizer=self.tokenizer,
|
||||
max_tokens=content_token_limit,
|
||||
)
|
||||
)
|
||||
|
||||
if not self.ignore_metadata_chunks:
|
||||
chunk_texts.extend(
|
||||
build_sheet_descriptor_chunks(
|
||||
section=section,
|
||||
tokenizer=self.tokenizer,
|
||||
max_tokens=content_token_limit,
|
||||
)
|
||||
)
|
||||
|
||||
if not chunk_texts:
|
||||
logger.warning(
|
||||
f"TabularChunker: skipping unparseable section (link={section.link})"
|
||||
)
|
||||
@@ -253,14 +261,6 @@ class TabularChunker(SectionChunker):
|
||||
payloads=payloads, accumulator=AccumulatorState()
|
||||
)
|
||||
|
||||
sheet_header = section.heading or ""
|
||||
chunk_texts = parse_to_chunks(
|
||||
rows=parsed_rows,
|
||||
sheet_header=sheet_header,
|
||||
tokenizer=self.tokenizer,
|
||||
max_tokens=content_token_limit,
|
||||
)
|
||||
|
||||
for i, text in enumerate(chunk_texts):
|
||||
payloads.append(
|
||||
ChunkPayload(
|
||||
41
backend/onyx/utils/csv_utils.py
Normal file
41
backend/onyx/utils/csv_utils.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import csv
|
||||
import io
|
||||
from collections.abc import Generator
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class ParsedRow(BaseModel):
    """One CSV data row paired with the sheet's header row."""

    # Header row shared by every ParsedRow yielded from the same CSV text.
    header: list[str]
    # The data row's cell values, positionally aligned with `header`.
    row: list[str]
|
||||
|
||||
|
||||
def read_csv_header(csv_text: str) -> list[str]:
    """Return the first non-blank row (the header) of a CSV string, or
    [] if the text has no usable header.
    """
    if not csv_text.strip():
        return []
    reader = csv.reader(io.StringIO(csv_text))
    for parsed_row in reader:
        # The first row containing any non-whitespace cell is the header.
        if any(cell.strip() for cell in parsed_row):
            return parsed_row
    return []
|
||||
|
||||
|
||||
def parse_csv_string(csv_text: str) -> Generator[ParsedRow, None, None]:
    """
    Takes in a string in the form of a CSV and yields back
    each row + header in the csv.
    """
    if not csv_text.strip():
        return

    header: list[str] | None = None
    for parsed in csv.reader(io.StringIO(csv_text)):
        # Blank rows (no non-whitespace cell) are skipped outright.
        if not any(cell.strip() for cell in parsed):
            continue
        # The first surviving row becomes the shared header; the rest pair up.
        if header is None:
            header = parsed
        else:
            yield ParsedRow(header=header, row=parsed)
|
||||
@@ -15,6 +15,9 @@ from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TabularSection
|
||||
from onyx.indexing.chunking.section_chunker import AccumulatorState
|
||||
from onyx.indexing.chunking.tabular_section_chunker import TabularChunker
|
||||
from onyx.indexing.chunking.tabular_section_chunker.sheet_descriptor import (
|
||||
build_sheet_descriptor_chunks,
|
||||
)
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
|
||||
|
||||
@@ -29,8 +32,12 @@ class CharTokenizer(BaseTokenizer):
|
||||
return "".join(chr(t) for t in tokens)
|
||||
|
||||
|
||||
def _make_chunker() -> TabularChunker:
|
||||
return TabularChunker(tokenizer=CharTokenizer())
|
||||
def _make_chunker_no_metadata() -> TabularChunker:
|
||||
return TabularChunker(tokenizer=CharTokenizer(), ignore_metadata_chunks=True)
|
||||
|
||||
|
||||
def _make_chunker_with_metadata() -> TabularChunker:
|
||||
return TabularChunker(tokenizer=CharTokenizer(), ignore_metadata_chunks=False)
|
||||
|
||||
|
||||
_DEFAULT_LINK = "https://example.com/doc"
|
||||
@@ -62,7 +69,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -91,7 +98,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -104,26 +111,35 @@ class TestTabularChunkerChunkSection:
|
||||
# Link carries through every chunk.
|
||||
assert all(p.links == {0: _DEFAULT_LINK} for p in out.payloads)
|
||||
|
||||
# Add back in shortly
|
||||
# def test_header_only_csv_produces_single_prelude_chunk(self) -> None:
|
||||
# # --- INPUT -----------------------------------------------------
|
||||
# csv_text = "col1,col2\n"
|
||||
# link = "sheet:Headers"
|
||||
def test_header_only_csv_emits_metadata_chunk_with_no_content(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# A header-only CSV has no data rows, so `parse_to_chunks` emits
|
||||
# nothing. With metadata enabled, the descriptor still fires —
|
||||
# column names alone are useful retrieval signal.
|
||||
csv_text = "col1,col2\n"
|
||||
heading = "sheet:Headers"
|
||||
content_token_limit = 500
|
||||
|
||||
# # --- EXPECTED --------------------------------------------------
|
||||
# expected_texts = [
|
||||
# "sheet:Headers\nColumns: col1, col2",
|
||||
# ]
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected_texts = [
|
||||
"sheet:Headers\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 0 rows and 2 columns.\n"
|
||||
"Columns: col1, col2",
|
||||
]
|
||||
|
||||
# # --- ACT -------------------------------------------------------
|
||||
# out = _make_chunker().chunk_section(
|
||||
# _tabular_section(csv_text, link=link),
|
||||
# AccumulatorState(),
|
||||
# content_token_limit=500,
|
||||
# )
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker_with_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
)
|
||||
|
||||
# # --- ASSERT ----------------------------------------------------
|
||||
# assert [p.text for p in out.payloads] == expected_texts
|
||||
# --- ASSERT ----------------------------------------------------
|
||||
assert [p.text for p in out.payloads] == expected_texts
|
||||
assert [p.is_continuation for p in out.payloads] == [False]
|
||||
assert all(p.links == {0: _DEFAULT_LINK} for p in out.payloads)
|
||||
assert out.accumulator.is_empty()
|
||||
|
||||
def test_empty_cells_dropped_from_chunk_text(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
@@ -143,7 +159,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=500,
|
||||
@@ -166,7 +182,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=500,
|
||||
@@ -188,7 +204,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=500,
|
||||
@@ -215,7 +231,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(
|
||||
text=pending_text,
|
||||
@@ -258,7 +274,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -296,7 +312,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -337,7 +353,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -365,7 +381,7 @@ class TestTabularChunkerChunkSection:
|
||||
expected_texts = [pending_text]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section("", heading="sheet:Empty"),
|
||||
AccumulatorState(
|
||||
text=pending_text,
|
||||
@@ -410,7 +426,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -440,7 +456,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=500,
|
||||
@@ -487,7 +503,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -517,7 +533,7 @@ class TestTabularChunkerChunkSection:
|
||||
expected_texts = ["Columns: x\nx=y"]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -548,7 +564,7 @@ class TestTabularChunkerChunkSection:
|
||||
expected_texts = ["S\nABC=1, DEF=2"]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -556,3 +572,268 @@ class TestTabularChunkerChunkSection:
|
||||
|
||||
# --- ASSERT ----------------------------------------------------
|
||||
assert [p.text for p in out.payloads] == expected_texts
|
||||
|
||||
def test_metadata_chunks_appended_after_content_when_enabled(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# With ignore_metadata_chunks=False, the descriptor chunk is
|
||||
# appended AFTER the content chunk(s). is_continuation tracks
|
||||
# the index in the combined output, so the metadata chunk is
|
||||
# marked as a continuation.
|
||||
csv_text = "Name,Age\n" "Alice,30\n" "Bob,25\n"
|
||||
heading = "sheet:T"
|
||||
content_token_limit = 500
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
content_chunk = (
|
||||
"sheet:T\n" "Columns: Name, Age\n" "Name=Alice, Age=30\n" "Name=Bob, Age=25"
|
||||
)
|
||||
metadata_chunk = (
|
||||
"sheet:T\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 2 rows and 2 columns.\n"
|
||||
"Columns: Name, Age\n"
|
||||
"Numeric columns (aggregatable by sum, average, min, max): Age\n"
|
||||
"Categorical columns (groupable, can be counted by value): Name\n"
|
||||
"Values seen in Name: Alice, Bob"
|
||||
)
|
||||
expected_texts = [content_chunk, metadata_chunk]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker_with_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
)
|
||||
|
||||
# --- ASSERT ----------------------------------------------------
|
||||
assert [p.text for p in out.payloads] == expected_texts
|
||||
# Content first, metadata second — only the first chunk is fresh.
|
||||
assert [p.is_continuation for p in out.payloads] == [False, True]
|
||||
|
||||
|
||||
class TestBuildSheetDescriptorChunks:
|
||||
"""Direct tests of `build_sheet_descriptor_chunks` — the per-section
|
||||
descriptor builder that backs the metadata chunks emitted by
|
||||
`TabularChunker` when ``ignore_metadata_chunks=False``.
|
||||
|
||||
A character-level tokenizer (1 char == 1 token) is used so the
|
||||
`_pack_lines` budget arithmetic is deterministic and expected
|
||||
chunks can be spelled out exactly.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _build(
|
||||
csv_text: str,
|
||||
heading: str | None = "sheet:T",
|
||||
max_tokens: int = 500,
|
||||
) -> list[str]:
|
||||
section = TabularSection(text=csv_text, link=_DEFAULT_LINK, heading=heading)
|
||||
return build_sheet_descriptor_chunks(
|
||||
section=section,
|
||||
tokenizer=CharTokenizer(),
|
||||
max_tokens=max_tokens,
|
||||
)
|
||||
|
||||
def test_basic_descriptor_emits_every_component(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# CSV exercises every optional descriptor line:
|
||||
# - id → numeric AND identifier (unique + id-named)
|
||||
# - Name → categorical (with sample values)
|
||||
# - Age → numeric
|
||||
# - joined_at → date column → contributes to time range
|
||||
csv_text = (
|
||||
"id,Name,Age,joined_at\n" "1,Alice,30,2024-01-15\n" "2,Bob,25,2024-02-20\n"
|
||||
)
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"sheet:T\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 2 rows and 4 columns.\n"
|
||||
"Columns: id, Name, Age, joined_at (joined at)\n"
|
||||
"Time range: 2024-01-15 to 2024-02-20.\n"
|
||||
"Numeric columns (aggregatable by sum, average, min, max): id, Age\n"
|
||||
"Categorical columns (groupable, can be counted by value): Name\n"
|
||||
"Identifier column: id.\n"
|
||||
"Values seen in Name: Alice, Bob"
|
||||
]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text) == expected
|
||||
|
||||
def test_numeric_only_omits_categorical_and_values_seen_lines(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# All-numeric CSV: no categorical line, no identifier line, no
|
||||
# values-seen lines, no time range.
|
||||
csv_text = "x,y\n1,2\n3,4\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"sheet:T\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 2 rows and 2 columns.\n"
|
||||
"Columns: x, y\n"
|
||||
"Numeric columns (aggregatable by sum, average, min, max): x, y"
|
||||
]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text) == expected
|
||||
|
||||
def test_underscored_column_names_get_friendly_alias_in_descriptor(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# Underscored headers get the same `name (name with spaces)`
|
||||
# alias used by `format_columns_header`, so retrieval matches
|
||||
# either form. The alias appears in every line that names the
|
||||
# column (Columns:, Categorical columns:, Values seen in ...).
|
||||
csv_text = "MTTR_hours,owner_name\n3,Alice\n5,Bob\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"sheet:T\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 2 rows and 2 columns.\n"
|
||||
"Columns: MTTR_hours (MTTR hours), owner_name (owner name)\n"
|
||||
"Numeric columns (aggregatable by sum, average, min, max): "
|
||||
"MTTR_hours (MTTR hours)\n"
|
||||
"Categorical columns (groupable, can be counted by value): "
|
||||
"owner_name (owner name)\n"
|
||||
"Values seen in owner_name (owner name): Alice, Bob"
|
||||
]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text) == expected
|
||||
|
||||
def test_identifier_column_detected_for_unique_id_named_column(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# `uuid` is unique AND its name is in the ID_NAME_TOKENS set, so
|
||||
# it gets flagged as the identifier column. Non-numeric values
|
||||
# also make it categorical.
|
||||
csv_text = "uuid,Name\nabc,Alice\ndef,Bob\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"sheet:T\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 2 rows and 2 columns.\n"
|
||||
"Columns: uuid, Name\n"
|
||||
"Categorical columns (groupable, can be counted by value): uuid, Name\n"
|
||||
"Identifier column: uuid.\n"
|
||||
"Values seen in uuid: abc, def\n"
|
||||
"Values seen in Name: Alice, Bob"
|
||||
]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text) == expected
|
||||
|
||||
def test_time_range_emitted_for_date_only_column(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# A column whose values all parse as dates contributes to the
|
||||
# `Time range:` line and is excluded from numeric/categorical
|
||||
# classification.
|
||||
csv_text = "joined_at\n2024-01-15\n2024-03-20\n2024-02-10\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"sheet:T\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 3 rows and 1 columns.\n"
|
||||
"Columns: joined_at (joined at)\n"
|
||||
"Time range: 2024-01-15 to 2024-03-20."
|
||||
]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text) == expected
|
||||
|
||||
def test_empty_section_returns_no_chunks(self) -> None:
|
||||
# Empty CSV text → nothing to describe.
|
||||
assert self._build("") == []
|
||||
|
||||
def test_header_only_csv_emits_descriptor_with_zero_rows(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# Header line alone, no data rows. Column names are still useful
|
||||
# retrieval signal, so a minimal descriptor is emitted with
|
||||
# row_count=0 and no numeric/categorical/values-seen lines.
|
||||
csv_text = "col1,col2\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"sheet:T\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 0 rows and 2 columns.\n"
|
||||
"Columns: col1, col2"
|
||||
]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text) == expected
|
||||
|
||||
def test_no_heading_means_no_prefix_line_in_chunks(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# heading=None → `_pack_lines` runs with prefix="", so emitted
|
||||
# chunks do not start with a heading line.
|
||||
csv_text = "Name\nAlice\nBob\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 2 rows and 1 columns.\n"
|
||||
"Columns: Name\n"
|
||||
"Categorical columns (groupable, can be counted by value): Name\n"
|
||||
"Values seen in Name: Alice, Bob"
|
||||
]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text, heading=None) == expected
|
||||
|
||||
def test_descriptor_splits_across_chunks_with_heading_repeated(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# Tight budget forces the descriptor across multiple chunks. The
|
||||
# heading is prepended to every emitted chunk so retrieval keeps
|
||||
# context after the split. Lines that exceed the budget on their
|
||||
# own are silently skipped.
|
||||
#
|
||||
# heading="S" (1 char) → prefix_tokens = 1+1 = 2; budget = 60-2 = 58.
|
||||
# Lines (and lengths under CharTokenizer):
|
||||
# overview = "Sheet overview.\nThis sheet has 5 rows and 1 columns." (52)
|
||||
# columns = "Columns: Name" (13)
|
||||
# categorical = "Categorical columns (groupable, ...): Name" (62) > 58 → SKIPPED
|
||||
# values_seen = "Values seen in Name: Alice, Bob, Charlie, Dave, Eve" (51)
|
||||
# Pack:
|
||||
# [overview(52)] → fits, current=52
|
||||
# + columns(13): 52+1+13 = 66 > 58 → flush; current=[columns], 13
|
||||
# skip categorical (oversize)
|
||||
# + values_seen(51): 13+1+51 = 65 > 58 → flush; current=[values_seen], 51
|
||||
# end → flush
|
||||
csv_text = "Name\nAlice\nBob\nCharlie\nDave\nEve\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"S\nSheet overview.\nThis sheet has 5 rows and 1 columns.",
|
||||
"S\nColumns: Name",
|
||||
"S\nValues seen in Name: Alice, Bob, Charlie, Dave, Eve",
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = self._build(csv_text, heading="S", max_tokens=60)
|
||||
|
||||
# --- ASSERT ----------------------------------------------------
|
||||
assert out == expected
|
||||
# Every emitted chunk fits the budget.
|
||||
assert all(len(c) <= 60 for c in out)
|
||||
# The dropped categorical line never makes it into output.
|
||||
assert all("Categorical columns" not in c for c in out)
|
||||
|
||||
def test_lines_exceeding_budget_are_skipped(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# heading="" (no prefix) → budget = max_tokens.
|
||||
# Lines:
|
||||
# overview = "Sheet overview.\nThis sheet has 1 rows and 1 columns." (52) > 30 → SKIPPED
|
||||
# columns = "Columns: x" (10)
|
||||
# numeric = "Numeric columns (...): x" (59) > 30 → SKIPPED
|
||||
# Only the columns line survives.
|
||||
csv_text = "x\n1\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = ["Columns: x"]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text, heading="", max_tokens=30) == expected
|
||||
|
||||
Reference in New Issue
Block a user