Compare commits

...

7 Commits

Author SHA1 Message Date
Dane Urban
957f05618e . 2026-04-15 17:50:22 -07:00
Dane Urban
3454fa6949 . 2026-04-15 17:47:54 -07:00
Dane Urban
c55ab0cdf6 Move csv parsing logic to util file 2026-04-15 17:47:08 -07:00
Dane Urban
454479016c . 2026-04-15 17:24:23 -07:00
Dane Urban
9839e46658 Add comments 2026-04-15 17:20:11 -07:00
Dane Urban
a50ce50d8b . 2026-04-15 17:14:30 -07:00
Dane Urban
818f927ba0 Add descriptor chunk 2026-04-15 17:01:00 -07:00
5 changed files with 626 additions and 70 deletions

View File

@@ -0,0 +1,5 @@
from onyx.indexing.chunking.tabular_section_chunker.tabular_section_chunker import (
TabularChunker,
)
__all__ = ["TabularChunker"]

View File

@@ -0,0 +1,229 @@
"""Per-section sheet descriptor chunk builder."""
from dataclasses import dataclass
from dataclasses import field
from datetime import date
from itertools import zip_longest
from dateutil.parser import parse as parse_dt
from onyx.connectors.models import Section
from onyx.natural_language_processing.utils import BaseTokenizer
from onyx.natural_language_processing.utils import count_tokens
from onyx.utils.csv_utils import parse_csv_string
from onyx.utils.csv_utils import ParsedRow
from onyx.utils.csv_utils import read_csv_header
MAX_NUMERIC_COLS = 12
MAX_CATEGORICAL_COLS = 6
MAX_CATEGORICAL_WITH_SAMPLES = 4
MAX_DISTINCT_SAMPLES = 8
CATEGORICAL_DISTINCT_THRESHOLD = 20
ID_NAME_TOKENS = {"id", "uuid", "uid", "guid", "key"}
@dataclass
class SheetAnalysis:
    """Aggregate per-column statistics for one parsed CSV sheet.

    Populated by `_analyze` and consumed by the descriptor line
    builders (`_overview_line`, `_numeric_cols_line`, ...).
    """

    # Number of data rows (the header row is excluded).
    row_count: int
    # Number of header columns.
    num_cols: int
    # Indices of columns whose non-empty values all parse as numbers.
    numeric_cols: list[int] = field(default_factory=list)
    # Indices of low-cardinality columns that are neither numeric nor date.
    categorical_cols: list[int] = field(default_factory=list)
    # Column index -> distinct values seen (unsorted) for sample lines.
    categorical_values: dict[int, list[str]] = field(default_factory=dict)
    # Index of the first id-named column with all-unique values, if any.
    id_col: int | None = None
    # Sheet-wide min/max across all date-typed columns, if any were found.
    date_min: date | None = None
    date_max: date | None = None
def build_sheet_descriptor_chunks(
    section: Section,
    tokenizer: BaseTokenizer,
    max_tokens: int,
) -> list[str]:
    """Produce sheet descriptor chunk(s) for a parsed CSV section.

    The descriptor is assembled from candidate lines, in order, with
    inapplicable lines dropped:

        {section.heading}                                    # optional
        Sheet overview.
        This sheet has {N} rows and {M} columns.
        Columns: {col1}, {col2}, ...
        Time range: {start} to {end}.                        # optional
        Numeric columns (aggregatable by sum, average, min, max): ...
        Categorical columns (groupable, can be counted by value): ...
        Identifier column: {col}.                            # optional
        Values seen in {col}: {v1}, {v2}, ...                # optional, repeated

    Lines are joined by "\\n" and greedily packed into chunks of at
    most ``max_tokens`` tokens; a line that overflows the budget on its
    own is skipped, and ``section.heading`` is prepended to every
    emitted chunk so retrieval keeps sheet context after a split.
    Returns [] when no header row can be found in the section text.
    """
    raw_text = section.text or ""
    rows = list(parse_csv_string(raw_text))
    # Prefer the header carried by the parsed rows; fall back to reading
    # just the header line for a data-less (header-only) sheet.
    headers = rows[0].header if rows else read_csv_header(raw_text)
    if not headers:
        return []
    analysis = _analyze(headers, rows)
    candidates = (
        _overview_line(analysis),
        _columns_line(headers),
        _time_range_line(analysis),
        _numeric_cols_line(headers, analysis),
        _categorical_cols_line(headers, analysis),
        _id_col_line(headers, analysis),
        _values_seen_line(headers, analysis),
    )
    kept = [candidate for candidate in candidates if candidate]
    return _pack_lines(
        kept,
        prefix=section.heading or "",
        tokenizer=tokenizer,
        max_tokens=max_tokens,
    )
def _overview_line(a: SheetAnalysis) -> str:
    """First descriptor line: the sheet's row and column counts."""
    counts = f"This sheet has {a.row_count} rows and {a.num_cols} columns."
    return "Sheet overview.\n" + counts
def _columns_line(headers: list[str]) -> str:
    """`Columns:` line naming every header (with underscore aliases)."""
    labels = [_label(header) for header in headers]
    return "Columns: " + ", ".join(labels)
def _time_range_line(a: SheetAnalysis) -> str:
    """`Time range:` line; empty unless both endpoints were found."""
    if a.date_min is None or a.date_max is None:
        return ""
    return f"Time range: {a.date_min} to {a.date_max}."
def _numeric_cols_line(headers: list[str], a: SheetAnalysis) -> str:
    """Line naming up to MAX_NUMERIC_COLS aggregatable numeric columns."""
    if not a.numeric_cols:
        return ""
    shown = a.numeric_cols[:MAX_NUMERIC_COLS]
    names = ", ".join(_label(headers[i]) for i in shown)
    return f"Numeric columns (aggregatable by sum, average, min, max): {names}"
def _categorical_cols_line(headers: list[str], a: SheetAnalysis) -> str:
    """Line naming up to MAX_CATEGORICAL_COLS groupable columns."""
    if not a.categorical_cols:
        return ""
    shown = a.categorical_cols[:MAX_CATEGORICAL_COLS]
    names = ", ".join(_label(headers[i]) for i in shown)
    return f"Categorical columns (groupable, can be counted by value): {names}"
def _id_col_line(headers: list[str], a: SheetAnalysis) -> str:
    """`Identifier column:` line, or "" when none was detected."""
    if a.id_col is None:
        return ""
    label = _label(headers[a.id_col])
    return f"Identifier column: {label}."
def _values_seen_line(headers: list[str], a: SheetAnalysis) -> str:
    """One `Values seen in ...` line per sampled categorical column.

    Only the first MAX_CATEGORICAL_WITH_SAMPLES categorical columns get
    sample lines, and each line shows at most MAX_DISTINCT_SAMPLES
    values, sorted so the output is deterministic.
    """
    out: list[str] = []
    for col in a.categorical_cols[:MAX_CATEGORICAL_WITH_SAMPLES]:
        sample = sorted(a.categorical_values.get(col, []))[:MAX_DISTINCT_SAMPLES]
        if not sample:
            continue
        out.append(f"Values seen in {_label(headers[col])}: " + ", ".join(sample))
    return "\n".join(out)
def _label(name: str) -> str:
return f"{name} ({name.replace('_', ' ')})" if "_" in name else name
def _is_numeric(value: str) -> bool:
try:
float(value.replace(",", ""))
return True
except ValueError:
return False
def _try_date(value: str) -> date | None:
    """Parse *value* as a calendar date, or None when it doesn't look like one.

    A cheap pre-filter runs first: candidates must be at least 4 chars
    and contain a date-ish separator ("-", "/" or "T"), which keeps
    plain numbers and short strings away from the permissive dateutil
    parser.
    """
    looks_datelike = len(value) >= 4 and any(sep in value for sep in "-/T")
    if not looks_datelike:
        return None
    try:
        return parse_dt(value).date()
    except (ValueError, OverflowError, TypeError):
        return None
def _is_id_name(name: str) -> bool:
    """True when a column name denotes an identifier (`id`, `user_id`, ...)."""
    normalized = name.lower().strip().replace("-", "_")
    if normalized in ID_NAME_TOKENS:
        return True
    return any(normalized.endswith("_" + token) for token in ID_NAME_TOKENS)
def _analyze(headers: list[str], parsed_rows: list[ParsedRow]) -> SheetAnalysis:
    """Classify every column of the sheet into the SheetAnalysis buckets.

    Rows are transposed into per-column value streams and each column is
    classified in a fixed priority order: identifier check first, then
    numeric, then date, then low-cardinality categorical. The order
    matters — each successful classification `continue`s past the later
    checks. With no rows, nothing is classified and only the counts are
    populated.
    """
    a = SheetAnalysis(row_count=len(parsed_rows), num_cols=len(headers))
    # Transpose rows into columns; zip_longest pads short (ragged) rows
    # with "" so every column sees one cell per row.
    columns = zip_longest(*(pr.row for pr in parsed_rows), fillvalue="")
    for idx, (header, raw_values) in enumerate(zip(headers, columns)):
        # Pull the column's non-empty values; skip if the column is blank.
        values = [v.strip() for v in raw_values if v.strip()]
        if not values:
            continue
        # Identifier: id-named column whose values are all unique. Detected
        # before classification so a numeric `id` column still gets flagged.
        # Only the first qualifying column wins (a.id_col is None check).
        distinct = set(values)
        if a.id_col is None and len(distinct) == len(values) and _is_id_name(header):
            a.id_col = idx
        # Numeric: every value parses as a number.
        if all(_is_numeric(v) for v in values):
            a.numeric_cols.append(idx)
            continue
        # Date: every value parses as a date — fold into the sheet-wide range.
        dates = [_try_date(v) for v in values]
        if all(d is not None for d in dates):
            dmin = min(filter(None, dates))
            dmax = max(filter(None, dates))
            a.date_min = dmin if a.date_min is None else min(a.date_min, dmin)
            a.date_max = dmax if a.date_max is None else max(a.date_max, dmax)
            continue
        # Categorical: low-cardinality column — keep distinct values for samples.
        # The threshold scales with row count (half the non-empty values), so
        # larger sheets tolerate higher-cardinality categorical columns.
        if len(distinct) <= max(CATEGORICAL_DISTINCT_THRESHOLD, len(values) // 2):
            a.categorical_cols.append(idx)
            a.categorical_values[idx] = list(distinct)
    return a
def _pack_lines(
    lines: list[str],
    prefix: str,
    tokenizer: BaseTokenizer,
    max_tokens: int,
) -> list[str]:
    """Greedily pack *lines* into newline-joined chunks of ≤ max_tokens.

    ``prefix`` (plus one newline token) is charged against every chunk's
    budget and prepended to each emitted chunk. A line that exceeds the
    remaining budget on its own is silently dropped.
    """
    overhead = count_tokens(prefix, tokenizer) + 1 if prefix else 0
    budget = max_tokens - overhead
    packed: list[str] = []
    pending: list[str] = []
    used = 0
    for line in lines:
        cost = count_tokens(line, tokenizer)
        # Oversize even standing alone — skip it entirely.
        if cost > budget:
            continue
        joiner = 1 if pending else 0
        if used + joiner + cost > budget:
            # Current chunk is full: emit it and start a fresh one.
            packed.append(_join_with_prefix(pending, prefix))
            pending = [line]
            used = cost
        else:
            pending.append(line)
            used += joiner + cost
    if pending:
        packed.append(_join_with_prefix(pending, prefix))
    return packed
def _join_with_prefix(lines: list[str], prefix: str) -> str:
body = "\n".join(lines)
return f"{prefix}\n{body}" if prefix else body

View File

@@ -1,5 +1,3 @@
import csv
import io
from collections.abc import Iterable
from pydantic import BaseModel
@@ -9,9 +7,14 @@ from onyx.indexing.chunking.section_chunker import AccumulatorState
from onyx.indexing.chunking.section_chunker import ChunkPayload
from onyx.indexing.chunking.section_chunker import SectionChunker
from onyx.indexing.chunking.section_chunker import SectionChunkerOutput
from onyx.indexing.chunking.tabular_section_chunker.sheet_descriptor import (
build_sheet_descriptor_chunks,
)
from onyx.natural_language_processing.utils import BaseTokenizer
from onyx.natural_language_processing.utils import count_tokens
from onyx.natural_language_processing.utils import split_text_by_tokens
from onyx.utils.csv_utils import parse_csv_string
from onyx.utils.csv_utils import ParsedRow
from onyx.utils.logger import setup_logger
logger = setup_logger()
@@ -23,11 +26,6 @@ ROW_JOIN = "\n"
NEWLINE_TOKENS = 1
class _ParsedRow(BaseModel):
header: list[str]
row: list[str]
class _TokenizedText(BaseModel):
text: str
token_count: int
@@ -60,23 +58,6 @@ def format_columns_header(headers: list[str]) -> str:
return f"{COLUMNS_MARKER} " + FIELD_VALUE_SEPARATOR.join(parts)
def parse_section(section: Section) -> list[_ParsedRow]:
"""Parse CSV into headers + rows. First non-empty row is the header;
blank rows are skipped."""
section_text = section.text or ""
if not section_text.strip():
return []
reader = csv.reader(io.StringIO(section_text))
non_empty_rows = [row for row in reader if any(cell.strip() for cell in row)]
if not non_empty_rows:
return []
header, *data_rows = non_empty_rows
return [_ParsedRow(header=header, row=row) for row in data_rows]
def _row_to_pairs(headers: list[str], row: list[str]) -> list[tuple[str, str]]:
return [(h, v) for h, v in zip(headers, row) if v.strip()]
@@ -175,7 +156,7 @@ def _build_chunk_from_scratch(
def parse_to_chunks(
rows: Iterable[_ParsedRow],
rows: Iterable[ParsedRow],
sheet_header: str,
tokenizer: BaseTokenizer,
max_tokens: int,
@@ -233,8 +214,13 @@ def parse_to_chunks(
class TabularChunker(SectionChunker):
def __init__(self, tokenizer: BaseTokenizer) -> None:
def __init__(
self,
tokenizer: BaseTokenizer,
ignore_metadata_chunks: bool = False,
) -> None:
self.tokenizer = tokenizer
self.ignore_metadata_chunks = ignore_metadata_chunks
def chunk_section(
self,
@@ -244,8 +230,30 @@ class TabularChunker(SectionChunker):
) -> SectionChunkerOutput:
payloads = accumulator.flush_to_list()
parsed_rows = parse_section(section)
if not parsed_rows:
parsed_rows = list(parse_csv_string(section.text or ""))
sheet_header = section.heading or ""
chunk_texts: list[str] = []
if parsed_rows:
chunk_texts.extend(
parse_to_chunks(
rows=parsed_rows,
sheet_header=sheet_header,
tokenizer=self.tokenizer,
max_tokens=content_token_limit,
)
)
if not self.ignore_metadata_chunks:
chunk_texts.extend(
build_sheet_descriptor_chunks(
section=section,
tokenizer=self.tokenizer,
max_tokens=content_token_limit,
)
)
if not chunk_texts:
logger.warning(
f"TabularChunker: skipping unparseable section (link={section.link})"
)
@@ -253,14 +261,6 @@ class TabularChunker(SectionChunker):
payloads=payloads, accumulator=AccumulatorState()
)
sheet_header = section.heading or ""
chunk_texts = parse_to_chunks(
rows=parsed_rows,
sheet_header=sheet_header,
tokenizer=self.tokenizer,
max_tokens=content_token_limit,
)
for i, text in enumerate(chunk_texts):
payloads.append(
ChunkPayload(

View File

@@ -0,0 +1,41 @@
import csv
import io
from collections.abc import Generator
from pydantic import BaseModel
class ParsedRow(BaseModel):
    """One CSV data row paired with the sheet's header row."""

    # Header row of the CSV (shared by every ParsedRow of one sheet).
    header: list[str]
    # A single data row; cells correspond positionally to `header`
    # (the row may be shorter or longer when the CSV is ragged).
    row: list[str]
def read_csv_header(csv_text: str) -> list[str]:
    """Extract the header row of *csv_text*.

    The header is the first row containing at least one non-blank cell.
    Returns [] when the text is empty/whitespace-only or no such row
    exists.
    """
    if not csv_text.strip():
        return []
    rows = csv.reader(io.StringIO(csv_text))
    return next(
        (row for row in rows if any(cell.strip() for cell in row)),
        [],
    )
def parse_csv_string(csv_text: str) -> Generator[ParsedRow, None, None]:
    """Lazily parse *csv_text* as CSV, yielding one ParsedRow per data row.

    The first non-blank row becomes the header; every later non-blank
    row is yielded together with that header. Blank rows are skipped and
    empty/whitespace-only input yields nothing.
    """
    if not csv_text.strip():
        return
    header: list[str] | None = None
    for cells in csv.reader(io.StringIO(csv_text)):
        if not any(cell.strip() for cell in cells):
            continue
        if header is None:
            header = cells
        else:
            yield ParsedRow(header=header, row=cells)

View File

@@ -15,6 +15,9 @@ from onyx.connectors.models import Section
from onyx.connectors.models import TabularSection
from onyx.indexing.chunking.section_chunker import AccumulatorState
from onyx.indexing.chunking.tabular_section_chunker import TabularChunker
from onyx.indexing.chunking.tabular_section_chunker.sheet_descriptor import (
build_sheet_descriptor_chunks,
)
from onyx.natural_language_processing.utils import BaseTokenizer
@@ -29,8 +32,12 @@ class CharTokenizer(BaseTokenizer):
return "".join(chr(t) for t in tokens)
def _make_chunker() -> TabularChunker:
return TabularChunker(tokenizer=CharTokenizer())
def _make_chunker_no_metadata() -> TabularChunker:
return TabularChunker(tokenizer=CharTokenizer(), ignore_metadata_chunks=True)
def _make_chunker_with_metadata() -> TabularChunker:
return TabularChunker(tokenizer=CharTokenizer(), ignore_metadata_chunks=False)
_DEFAULT_LINK = "https://example.com/doc"
@@ -62,7 +69,7 @@ class TestTabularChunkerChunkSection:
]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(),
content_token_limit=content_token_limit,
@@ -91,7 +98,7 @@ class TestTabularChunkerChunkSection:
]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(),
content_token_limit=content_token_limit,
@@ -104,26 +111,35 @@ class TestTabularChunkerChunkSection:
# Link carries through every chunk.
assert all(p.links == {0: _DEFAULT_LINK} for p in out.payloads)
# Add back in shortly
# def test_header_only_csv_produces_single_prelude_chunk(self) -> None:
# # --- INPUT -----------------------------------------------------
# csv_text = "col1,col2\n"
# link = "sheet:Headers"
def test_header_only_csv_emits_metadata_chunk_with_no_content(self) -> None:
# --- INPUT -----------------------------------------------------
# A header-only CSV has no data rows, so `parse_to_chunks` emits
# nothing. With metadata enabled, the descriptor still fires —
# column names alone are useful retrieval signal.
csv_text = "col1,col2\n"
heading = "sheet:Headers"
content_token_limit = 500
# # --- EXPECTED --------------------------------------------------
# expected_texts = [
# "sheet:Headers\nColumns: col1, col2",
# ]
# --- EXPECTED --------------------------------------------------
expected_texts = [
"sheet:Headers\n"
"Sheet overview.\n"
"This sheet has 0 rows and 2 columns.\n"
"Columns: col1, col2",
]
# # --- ACT -------------------------------------------------------
# out = _make_chunker().chunk_section(
# _tabular_section(csv_text, link=link),
# AccumulatorState(),
# content_token_limit=500,
# )
# --- ACT -------------------------------------------------------
out = _make_chunker_with_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(),
content_token_limit=content_token_limit,
)
# # --- ASSERT ----------------------------------------------------
# assert [p.text for p in out.payloads] == expected_texts
# --- ASSERT ----------------------------------------------------
assert [p.text for p in out.payloads] == expected_texts
assert [p.is_continuation for p in out.payloads] == [False]
assert all(p.links == {0: _DEFAULT_LINK} for p in out.payloads)
assert out.accumulator.is_empty()
def test_empty_cells_dropped_from_chunk_text(self) -> None:
# --- INPUT -----------------------------------------------------
@@ -143,7 +159,7 @@ class TestTabularChunkerChunkSection:
]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(),
content_token_limit=500,
@@ -166,7 +182,7 @@ class TestTabularChunkerChunkSection:
]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(),
content_token_limit=500,
@@ -188,7 +204,7 @@ class TestTabularChunkerChunkSection:
]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(),
content_token_limit=500,
@@ -215,7 +231,7 @@ class TestTabularChunkerChunkSection:
]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(
text=pending_text,
@@ -258,7 +274,7 @@ class TestTabularChunkerChunkSection:
]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(),
content_token_limit=content_token_limit,
@@ -296,7 +312,7 @@ class TestTabularChunkerChunkSection:
]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(),
content_token_limit=content_token_limit,
@@ -337,7 +353,7 @@ class TestTabularChunkerChunkSection:
]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(),
content_token_limit=content_token_limit,
@@ -365,7 +381,7 @@ class TestTabularChunkerChunkSection:
expected_texts = [pending_text]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section("", heading="sheet:Empty"),
AccumulatorState(
text=pending_text,
@@ -410,7 +426,7 @@ class TestTabularChunkerChunkSection:
]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(),
content_token_limit=content_token_limit,
@@ -440,7 +456,7 @@ class TestTabularChunkerChunkSection:
]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(),
content_token_limit=500,
@@ -487,7 +503,7 @@ class TestTabularChunkerChunkSection:
]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(),
content_token_limit=content_token_limit,
@@ -517,7 +533,7 @@ class TestTabularChunkerChunkSection:
expected_texts = ["Columns: x\nx=y"]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(),
content_token_limit=content_token_limit,
@@ -548,7 +564,7 @@ class TestTabularChunkerChunkSection:
expected_texts = ["S\nABC=1, DEF=2"]
# --- ACT -------------------------------------------------------
out = _make_chunker().chunk_section(
out = _make_chunker_no_metadata().chunk_section(
_tabular_section(csv_text, heading=heading),
AccumulatorState(),
content_token_limit=content_token_limit,
@@ -556,3 +572,268 @@ class TestTabularChunkerChunkSection:
# --- ASSERT ----------------------------------------------------
assert [p.text for p in out.payloads] == expected_texts
def test_metadata_chunks_appended_after_content_when_enabled(self) -> None:
    # --- INPUT -----------------------------------------------------
    # With ignore_metadata_chunks=False, the descriptor chunk is
    # appended AFTER the content chunk(s). is_continuation tracks
    # the index in the combined output, so the metadata chunk is
    # marked as a continuation.
    csv_text = "Name,Age\n" "Alice,30\n" "Bob,25\n"
    heading = "sheet:T"
    content_token_limit = 500
    # --- EXPECTED --------------------------------------------------
    # Age is all-numeric and Name is categorical, so the descriptor
    # carries both a Numeric-columns and a Categorical-columns line.
    content_chunk = (
        "sheet:T\n" "Columns: Name, Age\n" "Name=Alice, Age=30\n" "Name=Bob, Age=25"
    )
    metadata_chunk = (
        "sheet:T\n"
        "Sheet overview.\n"
        "This sheet has 2 rows and 2 columns.\n"
        "Columns: Name, Age\n"
        "Numeric columns (aggregatable by sum, average, min, max): Age\n"
        "Categorical columns (groupable, can be counted by value): Name\n"
        "Values seen in Name: Alice, Bob"
    )
    expected_texts = [content_chunk, metadata_chunk]
    # --- ACT -------------------------------------------------------
    out = _make_chunker_with_metadata().chunk_section(
        _tabular_section(csv_text, heading=heading),
        AccumulatorState(),
        content_token_limit=content_token_limit,
    )
    # --- ASSERT ----------------------------------------------------
    assert [p.text for p in out.payloads] == expected_texts
    # Content first, metadata second — only the first chunk is fresh.
    assert [p.is_continuation for p in out.payloads] == [False, True]
class TestBuildSheetDescriptorChunks:
    """Direct tests of `build_sheet_descriptor_chunks` — the per-section
    descriptor builder that backs the metadata chunks emitted by
    `TabularChunker` when ``ignore_metadata_chunks=False``.

    A character-level tokenizer (1 char == 1 token) is used so the
    `_pack_lines` budget arithmetic is deterministic and expected
    chunks can be spelled out exactly.
    """

    @staticmethod
    def _build(
        csv_text: str,
        heading: str | None = "sheet:T",
        max_tokens: int = 500,
    ) -> list[str]:
        # Shared driver: wrap the CSV in a TabularSection and run the
        # descriptor builder with the deterministic char tokenizer.
        section = TabularSection(text=csv_text, link=_DEFAULT_LINK, heading=heading)
        return build_sheet_descriptor_chunks(
            section=section,
            tokenizer=CharTokenizer(),
            max_tokens=max_tokens,
        )

    def test_basic_descriptor_emits_every_component(self) -> None:
        # --- INPUT -----------------------------------------------------
        # CSV exercises every optional descriptor line:
        # - id → numeric AND identifier (unique + id-named)
        # - Name → categorical (with sample values)
        # - Age → numeric
        # - joined_at → date column → contributes to time range
        csv_text = (
            "id,Name,Age,joined_at\n" "1,Alice,30,2024-01-15\n" "2,Bob,25,2024-02-20\n"
        )
        # --- EXPECTED --------------------------------------------------
        expected = [
            "sheet:T\n"
            "Sheet overview.\n"
            "This sheet has 2 rows and 4 columns.\n"
            "Columns: id, Name, Age, joined_at (joined at)\n"
            "Time range: 2024-01-15 to 2024-02-20.\n"
            "Numeric columns (aggregatable by sum, average, min, max): id, Age\n"
            "Categorical columns (groupable, can be counted by value): Name\n"
            "Identifier column: id.\n"
            "Values seen in Name: Alice, Bob"
        ]
        # --- ACT / ASSERT ---------------------------------------------
        assert self._build(csv_text) == expected

    def test_numeric_only_omits_categorical_and_values_seen_lines(self) -> None:
        # --- INPUT -----------------------------------------------------
        # All-numeric CSV: no categorical line, no identifier line, no
        # values-seen lines, no time range.
        csv_text = "x,y\n1,2\n3,4\n"
        # --- EXPECTED --------------------------------------------------
        expected = [
            "sheet:T\n"
            "Sheet overview.\n"
            "This sheet has 2 rows and 2 columns.\n"
            "Columns: x, y\n"
            "Numeric columns (aggregatable by sum, average, min, max): x, y"
        ]
        # --- ACT / ASSERT ---------------------------------------------
        assert self._build(csv_text) == expected

    def test_underscored_column_names_get_friendly_alias_in_descriptor(self) -> None:
        # --- INPUT -----------------------------------------------------
        # Underscored headers get the same `name (name with spaces)`
        # alias used by `format_columns_header`, so retrieval matches
        # either form. The alias appears in every line that names the
        # column (Columns:, Categorical columns:, Values seen in ...).
        csv_text = "MTTR_hours,owner_name\n3,Alice\n5,Bob\n"
        # --- EXPECTED --------------------------------------------------
        expected = [
            "sheet:T\n"
            "Sheet overview.\n"
            "This sheet has 2 rows and 2 columns.\n"
            "Columns: MTTR_hours (MTTR hours), owner_name (owner name)\n"
            "Numeric columns (aggregatable by sum, average, min, max): "
            "MTTR_hours (MTTR hours)\n"
            "Categorical columns (groupable, can be counted by value): "
            "owner_name (owner name)\n"
            "Values seen in owner_name (owner name): Alice, Bob"
        ]
        # --- ACT / ASSERT ---------------------------------------------
        assert self._build(csv_text) == expected

    def test_identifier_column_detected_for_unique_id_named_column(self) -> None:
        # --- INPUT -----------------------------------------------------
        # `uuid` is unique AND its name is in the ID_NAME_TOKENS set, so
        # it gets flagged as the identifier column. Non-numeric values
        # also make it categorical.
        csv_text = "uuid,Name\nabc,Alice\ndef,Bob\n"
        # --- EXPECTED --------------------------------------------------
        expected = [
            "sheet:T\n"
            "Sheet overview.\n"
            "This sheet has 2 rows and 2 columns.\n"
            "Columns: uuid, Name\n"
            "Categorical columns (groupable, can be counted by value): uuid, Name\n"
            "Identifier column: uuid.\n"
            "Values seen in uuid: abc, def\n"
            "Values seen in Name: Alice, Bob"
        ]
        # --- ACT / ASSERT ---------------------------------------------
        assert self._build(csv_text) == expected

    def test_time_range_emitted_for_date_only_column(self) -> None:
        # --- INPUT -----------------------------------------------------
        # A column whose values all parse as dates contributes to the
        # `Time range:` line and is excluded from numeric/categorical
        # classification.
        csv_text = "joined_at\n2024-01-15\n2024-03-20\n2024-02-10\n"
        # --- EXPECTED --------------------------------------------------
        expected = [
            "sheet:T\n"
            "Sheet overview.\n"
            "This sheet has 3 rows and 1 columns.\n"
            "Columns: joined_at (joined at)\n"
            "Time range: 2024-01-15 to 2024-03-20."
        ]
        # --- ACT / ASSERT ---------------------------------------------
        assert self._build(csv_text) == expected

    def test_empty_section_returns_no_chunks(self) -> None:
        # Empty CSV text → nothing to describe.
        assert self._build("") == []

    def test_header_only_csv_emits_descriptor_with_zero_rows(self) -> None:
        # --- INPUT -----------------------------------------------------
        # Header line alone, no data rows. Column names are still useful
        # retrieval signal, so a minimal descriptor is emitted with
        # row_count=0 and no numeric/categorical/values-seen lines.
        csv_text = "col1,col2\n"
        # --- EXPECTED --------------------------------------------------
        expected = [
            "sheet:T\n"
            "Sheet overview.\n"
            "This sheet has 0 rows and 2 columns.\n"
            "Columns: col1, col2"
        ]
        # --- ACT / ASSERT ---------------------------------------------
        assert self._build(csv_text) == expected

    def test_no_heading_means_no_prefix_line_in_chunks(self) -> None:
        # --- INPUT -----------------------------------------------------
        # heading=None → `_pack_lines` runs with prefix="", so emitted
        # chunks do not start with a heading line.
        csv_text = "Name\nAlice\nBob\n"
        # --- EXPECTED --------------------------------------------------
        expected = [
            "Sheet overview.\n"
            "This sheet has 2 rows and 1 columns.\n"
            "Columns: Name\n"
            "Categorical columns (groupable, can be counted by value): Name\n"
            "Values seen in Name: Alice, Bob"
        ]
        # --- ACT / ASSERT ---------------------------------------------
        assert self._build(csv_text, heading=None) == expected

    def test_descriptor_splits_across_chunks_with_heading_repeated(self) -> None:
        # --- INPUT -----------------------------------------------------
        # Tight budget forces the descriptor across multiple chunks. The
        # heading is prepended to every emitted chunk so retrieval keeps
        # context after the split. Lines that exceed the budget on their
        # own are silently skipped.
        #
        # heading="S" (1 char) → prefix_tokens = 1+1 = 2; budget = 60-2 = 58.
        # Lines (and lengths under CharTokenizer):
        #   overview    = "Sheet overview.\nThis sheet has 5 rows and 1 columns." (52)
        #   columns     = "Columns: Name" (13)
        #   categorical = "Categorical columns (groupable, ...): Name" (62) > 58 → SKIPPED
        #   values_seen = "Values seen in Name: Alice, Bob, Charlie, Dave, Eve" (51)
        # Pack:
        #   [overview(52)] → fits, current=52
        #   + columns(13): 52+1+13 = 66 > 58 → flush; current=[columns], 13
        #   skip categorical (oversize)
        #   + values_seen(51): 13+1+51 = 65 > 58 → flush; current=[values_seen], 51
        #   end → flush
        csv_text = "Name\nAlice\nBob\nCharlie\nDave\nEve\n"
        # --- EXPECTED --------------------------------------------------
        expected = [
            "S\nSheet overview.\nThis sheet has 5 rows and 1 columns.",
            "S\nColumns: Name",
            "S\nValues seen in Name: Alice, Bob, Charlie, Dave, Eve",
        ]
        # --- ACT -------------------------------------------------------
        out = self._build(csv_text, heading="S", max_tokens=60)
        # --- ASSERT ----------------------------------------------------
        assert out == expected
        # Every emitted chunk fits the budget.
        assert all(len(c) <= 60 for c in out)
        # The dropped categorical line never makes it into output.
        assert all("Categorical columns" not in c for c in out)

    def test_lines_exceeding_budget_are_skipped(self) -> None:
        # --- INPUT -----------------------------------------------------
        # heading="" (no prefix) → budget = max_tokens.
        # Lines:
        #   overview = "Sheet overview.\nThis sheet has 1 rows and 1 columns." (52) > 30 → SKIPPED
        #   columns  = "Columns: x" (10)
        #   numeric  = "Numeric columns (...): x" (59) > 30 → SKIPPED
        # Only the columns line survives.
        csv_text = "x\n1\n"
        # --- EXPECTED --------------------------------------------------
        expected = ["Columns: x"]
        # --- ACT / ASSERT ---------------------------------------------
        assert self._build(csv_text, heading="", max_tokens=30) == expected