mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-04-16 06:56:51 +00:00
Compare commits
7 Commits
v3.2.0-clo
...
dane/csv6
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
957f05618e | ||
|
|
3454fa6949 | ||
|
|
c55ab0cdf6 | ||
|
|
454479016c | ||
|
|
9839e46658 | ||
|
|
a50ce50d8b | ||
|
|
818f927ba0 |
@@ -0,0 +1,5 @@
|
||||
from onyx.indexing.chunking.tabular_section_chunker.tabular_section_chunker import (
|
||||
TabularChunker,
|
||||
)
|
||||
|
||||
__all__ = ["TabularChunker"]
|
||||
@@ -0,0 +1,229 @@
|
||||
"""Per-section sheet descriptor chunk builder."""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from dataclasses import field
|
||||
from datetime import date
|
||||
from itertools import zip_longest
|
||||
|
||||
from dateutil.parser import parse as parse_dt
|
||||
|
||||
from onyx.connectors.models import Section
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
from onyx.natural_language_processing.utils import count_tokens
|
||||
from onyx.utils.csv_utils import parse_csv_string
|
||||
from onyx.utils.csv_utils import ParsedRow
|
||||
from onyx.utils.csv_utils import read_csv_header
|
||||
|
||||
|
||||
# Presentation caps for the generated descriptor lines — columns beyond a cap
# are silently omitted so the descriptor chunk stays compact.
MAX_NUMERIC_COLS = 12
MAX_CATEGORICAL_COLS = 6
# Only the first few categorical columns also get a "Values seen in ..." line.
MAX_CATEGORICAL_WITH_SAMPLES = 4
# At most this many distinct values are listed per "Values seen in ..." line.
MAX_DISTINCT_SAMPLES = 8
# A column counts as categorical when its distinct-value count is at most
# max(CATEGORICAL_DISTINCT_THRESHOLD, half its non-empty values) — see _analyze.
CATEGORICAL_DISTINCT_THRESHOLD = 20
# Header names (lowercased, "-" folded to "_") that mark identifier columns,
# matched as the whole name or as a "_<token>" suffix — see _is_id_name.
ID_NAME_TOKENS = {"id", "uuid", "uid", "guid", "key"}
|
||||
|
||||
|
||||
@dataclass
class SheetAnalysis:
    """Aggregate per-column statistics for one CSV sheet, produced by
    ``_analyze`` and consumed by the descriptor line builders."""

    # Number of data rows (header excluded).
    row_count: int
    # Number of header columns.
    num_cols: int
    # Indices of columns whose non-empty values all parse as numbers.
    numeric_cols: list[int] = field(default_factory=list)
    # Indices of low-cardinality (categorical) columns.
    categorical_cols: list[int] = field(default_factory=list)
    # Distinct values per categorical column index (unordered).
    categorical_values: dict[int, list[str]] = field(default_factory=dict)
    # Index of the first id-named, all-unique column, if any.
    id_col: int | None = None
    # Sheet-wide date range folded over every all-date column.
    date_min: date | None = None
    date_max: date | None = None
|
||||
|
||||
|
||||
def build_sheet_descriptor_chunks(
    section: Section,
    tokenizer: BaseTokenizer,
    max_tokens: int,
) -> list[str]:
    """Build sheet descriptor chunk(s) from a parsed CSV section.

    The descriptor lines are joined by newlines; any line that alone
    overflows ``max_tokens`` is dropped, and ``section.heading`` is
    prepended to every emitted chunk so retrieval keeps sheet context
    after a split.  Layout of the emitted text:

        {section.heading}                                     # optional
        Sheet overview.
        This sheet has {N} rows and {M} columns.
        Columns: {col1}, {col2}, ...
        Time range: {start} to {end}.                         # optional
        Numeric columns (aggregatable by sum, average, min, max): ...  # optional
        Categorical columns (groupable, can be counted by value): ...  # optional
        Identifier column: {col}.                             # optional
        Values seen in {col}: {v1}, {v2}, ...                 # optional, repeated
    """
    csv_text = section.text or ""
    rows = list(parse_csv_string(csv_text))

    # Header comes from the first parsed row when data exists; otherwise it is
    # read straight from the raw text so header-only sheets still describe.
    if rows:
        headers = rows[0].header
    else:
        headers = read_csv_header(csv_text)
    if not headers:
        return []

    analysis = _analyze(headers, rows)
    candidate_lines = (
        _overview_line(analysis),
        _columns_line(headers),
        _time_range_line(analysis),
        _numeric_cols_line(headers, analysis),
        _categorical_cols_line(headers, analysis),
        _id_col_line(headers, analysis),
        _values_seen_line(headers, analysis),
    )
    # Builders return "" for lines that do not apply — drop those.
    emitted = [candidate for candidate in candidate_lines if candidate]
    return _pack_lines(
        emitted,
        prefix=section.heading or "",
        tokenizer=tokenizer,
        max_tokens=max_tokens,
    )
|
||||
|
||||
|
||||
def _overview_line(a: SheetAnalysis) -> str:
    # Always present — anchors the descriptor with the row/column counts.
    title = "Sheet overview."
    summary = f"This sheet has {a.row_count} rows and {a.num_cols} columns."
    return title + "\n" + summary
|
||||
|
||||
|
||||
def _columns_line(headers: list[str]) -> str:
    # Every header is listed, each with its friendly alias where applicable.
    labeled = [_label(header) for header in headers]
    return "Columns: " + ", ".join(labeled)
|
||||
|
||||
|
||||
def _time_range_line(a: SheetAnalysis) -> str:
    # Only emitted when at least one all-date column contributed a range.
    if a.date_min is None or a.date_max is None:
        return ""
    return f"Time range: {a.date_min} to {a.date_max}."
|
||||
|
||||
|
||||
def _numeric_cols_line(headers: list[str], a: SheetAnalysis) -> str:
    if not a.numeric_cols:
        return ""
    # Cap the listing so one very wide sheet cannot bloat the descriptor.
    shown = a.numeric_cols[:MAX_NUMERIC_COLS]
    names = ", ".join(_label(headers[i]) for i in shown)
    return f"Numeric columns (aggregatable by sum, average, min, max): {names}"
|
||||
|
||||
|
||||
def _categorical_cols_line(headers: list[str], a: SheetAnalysis) -> str:
    if not a.categorical_cols:
        return ""
    # Cap the listing so one very wide sheet cannot bloat the descriptor.
    shown = [_label(headers[i]) for i in a.categorical_cols[:MAX_CATEGORICAL_COLS]]
    names = ", ".join(shown)
    return f"Categorical columns (groupable, can be counted by value): {names}"
|
||||
|
||||
|
||||
def _id_col_line(headers: list[str], a: SheetAnalysis) -> str:
    # Column 0 is a valid identifier index, so compare against None explicitly.
    if a.id_col is None:
        return ""
    return f"Identifier column: {_label(headers[a.id_col])}."
|
||||
|
||||
|
||||
def _values_seen_line(headers: list[str], a: SheetAnalysis) -> str:
    out: list[str] = []
    for col in a.categorical_cols[:MAX_CATEGORICAL_WITH_SAMPLES]:
        distinct = a.categorical_values.get(col, [])
        # Sort for deterministic output, then cap the sample size.
        sample = sorted(distinct)[:MAX_DISTINCT_SAMPLES]
        if not sample:
            continue
        out.append(f"Values seen in {_label(headers[col])}: " + ", ".join(sample))
    return "\n".join(out)
|
||||
|
||||
|
||||
def _label(name: str) -> str:
|
||||
return f"{name} ({name.replace('_', ' ')})" if "_" in name else name
|
||||
|
||||
|
||||
def _is_numeric(value: str) -> bool:
|
||||
try:
|
||||
float(value.replace(",", ""))
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def _try_date(value: str) -> date | None:
    """Parse *value* as a calendar date, or return None if it does not
    look like one.

    The cheap pre-filter (length >= 4 and at least one of "-", "/", "T")
    keeps obviously non-date strings — bare integers, short codes — away
    from the comparatively expensive dateutil parser.
    """
    if len(value) < 4 or not any(c in value for c in "-/T"):
        return None
    try:
        return parse_dt(value).date()
    except (ValueError, OverflowError, TypeError):
        # dateutil raises any of these on unparseable or out-of-range input.
        return None
|
||||
|
||||
|
||||
def _is_id_name(name: str) -> bool:
    # Normalize: case-insensitive, surrounding whitespace dropped, hyphens
    # treated like underscores so "user-id" matches "user_id".
    normalized = name.lower().strip().replace("-", "_")
    if normalized in ID_NAME_TOKENS:
        return True
    return any(normalized.endswith("_" + token) for token in ID_NAME_TOKENS)
|
||||
|
||||
|
||||
def _analyze(headers: list[str], parsed_rows: list[ParsedRow]) -> SheetAnalysis:
    """Classify each column (identifier / numeric / date / categorical) and
    collect the per-sheet statistics consumed by the descriptor line builders.

    Classification is first-match-wins in the order written below: a column
    that is all-numeric never reaches the date or categorical checks.
    """
    a = SheetAnalysis(row_count=len(parsed_rows), num_cols=len(headers))
    # Transpose rows into column iterators; short rows are padded with "" so
    # every column yields one entry per row.
    columns = zip_longest(*(pr.row for pr in parsed_rows), fillvalue="")
    for idx, (header, raw_values) in enumerate(zip(headers, columns)):
        # Pull the column's non-empty values; skip if the column is blank.
        values = [v.strip() for v in raw_values if v.strip()]
        if not values:
            continue

        # Identifier: id-named column whose values are all unique. Detected
        # before classification so a numeric `id` column still gets flagged.
        distinct = set(values)
        if a.id_col is None and len(distinct) == len(values) and _is_id_name(header):
            a.id_col = idx

        # Numeric: every value parses as a number.
        if all(_is_numeric(v) for v in values):
            a.numeric_cols.append(idx)
            continue

        # Date: every value parses as a date — fold into the sheet-wide range.
        dates = [_try_date(v) for v in values]
        if all(d is not None for d in dates):
            dmin = min(filter(None, dates))
            dmax = max(filter(None, dates))
            a.date_min = dmin if a.date_min is None else min(a.date_min, dmin)
            a.date_max = dmax if a.date_max is None else max(a.date_max, dmax)
            continue

        # Categorical: low-cardinality column — keep distinct values for samples.
        # "Low" scales with the data: the fixed threshold for small sheets, or
        # half the non-empty values for larger ones.
        if len(distinct) <= max(CATEGORICAL_DISTINCT_THRESHOLD, len(values) // 2):
            a.categorical_cols.append(idx)
            a.categorical_values[idx] = list(distinct)
    return a
|
||||
|
||||
|
||||
def _pack_lines(
    lines: list[str],
    prefix: str,
    tokenizer: BaseTokenizer,
    max_tokens: int,
) -> list[str]:
    """Greedily pack lines into newline-joined chunks of at most
    ``max_tokens`` tokens.  ``prefix`` is prepended to every emitted
    chunk; any line that by itself overflows the per-chunk budget
    (after accounting for the prefix) is dropped."""
    # Reserve room for the prefix plus the newline that joins it on.
    overhead = count_tokens(prefix, tokenizer) + 1 if prefix else 0
    budget = max_tokens - overhead

    packed: list[str] = []
    pending: list[str] = []
    used = 0
    for line in lines:
        cost = count_tokens(line, tokenizer)
        if cost > budget:
            # Oversize line: cannot fit even alone — skip it entirely.
            continue
        joiner = 1 if pending else 0
        if used + joiner + cost <= budget:
            pending.append(line)
            used += joiner + cost
        else:
            # Current chunk is full — flush it and start a new one.
            packed.append(_join_with_prefix(pending, prefix))
            pending = [line]
            used = cost
    if pending:
        packed.append(_join_with_prefix(pending, prefix))
    return packed
|
||||
|
||||
|
||||
def _join_with_prefix(lines: list[str], prefix: str) -> str:
|
||||
body = "\n".join(lines)
|
||||
return f"{prefix}\n{body}" if prefix else body
|
||||
@@ -1,5 +1,3 @@
|
||||
import csv
|
||||
import io
|
||||
from collections.abc import Iterable
|
||||
|
||||
from pydantic import BaseModel
|
||||
@@ -9,9 +7,14 @@ from onyx.indexing.chunking.section_chunker import AccumulatorState
|
||||
from onyx.indexing.chunking.section_chunker import ChunkPayload
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunker
|
||||
from onyx.indexing.chunking.section_chunker import SectionChunkerOutput
|
||||
from onyx.indexing.chunking.tabular_section_chunker.sheet_descriptor import (
|
||||
build_sheet_descriptor_chunks,
|
||||
)
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
from onyx.natural_language_processing.utils import count_tokens
|
||||
from onyx.natural_language_processing.utils import split_text_by_tokens
|
||||
from onyx.utils.csv_utils import parse_csv_string
|
||||
from onyx.utils.csv_utils import ParsedRow
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
@@ -23,11 +26,6 @@ ROW_JOIN = "\n"
|
||||
NEWLINE_TOKENS = 1
|
||||
|
||||
|
||||
class _ParsedRow(BaseModel):
|
||||
header: list[str]
|
||||
row: list[str]
|
||||
|
||||
|
||||
class _TokenizedText(BaseModel):
|
||||
text: str
|
||||
token_count: int
|
||||
@@ -60,23 +58,6 @@ def format_columns_header(headers: list[str]) -> str:
|
||||
return f"{COLUMNS_MARKER} " + FIELD_VALUE_SEPARATOR.join(parts)
|
||||
|
||||
|
||||
def parse_section(section: Section) -> list[_ParsedRow]:
    """Parse CSV into headers + rows. First non-empty row is the header;
    blank rows are skipped.

    Returns [] when the section has no text or no non-blank rows.
    """
    section_text = section.text or ""
    if not section_text.strip():
        return []

    reader = csv.reader(io.StringIO(section_text))
    # A row counts as non-empty when any cell has non-whitespace content.
    non_empty_rows = [row for row in reader if any(cell.strip() for cell in row)]

    if not non_empty_rows:
        return []

    # First surviving row is the header; every later row is paired with it.
    header, *data_rows = non_empty_rows
    return [_ParsedRow(header=header, row=row) for row in data_rows]
|
||||
|
||||
|
||||
def _row_to_pairs(headers: list[str], row: list[str]) -> list[tuple[str, str]]:
|
||||
return [(h, v) for h, v in zip(headers, row) if v.strip()]
|
||||
|
||||
@@ -175,7 +156,7 @@ def _build_chunk_from_scratch(
|
||||
|
||||
|
||||
def parse_to_chunks(
|
||||
rows: Iterable[_ParsedRow],
|
||||
rows: Iterable[ParsedRow],
|
||||
sheet_header: str,
|
||||
tokenizer: BaseTokenizer,
|
||||
max_tokens: int,
|
||||
@@ -233,8 +214,13 @@ def parse_to_chunks(
|
||||
|
||||
|
||||
class TabularChunker(SectionChunker):
|
||||
def __init__(self, tokenizer: BaseTokenizer) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer: BaseTokenizer,
|
||||
ignore_metadata_chunks: bool = False,
|
||||
) -> None:
|
||||
self.tokenizer = tokenizer
|
||||
self.ignore_metadata_chunks = ignore_metadata_chunks
|
||||
|
||||
def chunk_section(
|
||||
self,
|
||||
@@ -244,8 +230,30 @@ class TabularChunker(SectionChunker):
|
||||
) -> SectionChunkerOutput:
|
||||
payloads = accumulator.flush_to_list()
|
||||
|
||||
parsed_rows = parse_section(section)
|
||||
if not parsed_rows:
|
||||
parsed_rows = list(parse_csv_string(section.text or ""))
|
||||
sheet_header = section.heading or ""
|
||||
|
||||
chunk_texts: list[str] = []
|
||||
if parsed_rows:
|
||||
chunk_texts.extend(
|
||||
parse_to_chunks(
|
||||
rows=parsed_rows,
|
||||
sheet_header=sheet_header,
|
||||
tokenizer=self.tokenizer,
|
||||
max_tokens=content_token_limit,
|
||||
)
|
||||
)
|
||||
|
||||
if not self.ignore_metadata_chunks:
|
||||
chunk_texts.extend(
|
||||
build_sheet_descriptor_chunks(
|
||||
section=section,
|
||||
tokenizer=self.tokenizer,
|
||||
max_tokens=content_token_limit,
|
||||
)
|
||||
)
|
||||
|
||||
if not chunk_texts:
|
||||
logger.warning(
|
||||
f"TabularChunker: skipping unparseable section (link={section.link})"
|
||||
)
|
||||
@@ -253,14 +261,6 @@ class TabularChunker(SectionChunker):
|
||||
payloads=payloads, accumulator=AccumulatorState()
|
||||
)
|
||||
|
||||
sheet_header = section.heading or ""
|
||||
chunk_texts = parse_to_chunks(
|
||||
rows=parsed_rows,
|
||||
sheet_header=sheet_header,
|
||||
tokenizer=self.tokenizer,
|
||||
max_tokens=content_token_limit,
|
||||
)
|
||||
|
||||
for i, text in enumerate(chunk_texts):
|
||||
payloads.append(
|
||||
ChunkPayload(
|
||||
41
backend/onyx/utils/csv_utils.py
Normal file
41
backend/onyx/utils/csv_utils.py
Normal file
@@ -0,0 +1,41 @@
|
||||
import csv
|
||||
import io
|
||||
from collections.abc import Generator
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class ParsedRow(BaseModel):
    """One CSV data row paired with the sheet's header row."""

    # Header row shared by every ParsedRow yielded from the same CSV text.
    header: list[str]
    # The data row's cell values, positionally aligned with `header`.
    row: list[str]
|
||||
|
||||
|
||||
def read_csv_header(csv_text: str) -> list[str]:
    """Return the first non-blank row (the header) of a CSV string, or
    [] if the text has no usable header.
    """
    if not csv_text.strip():
        return []
    reader = csv.reader(io.StringIO(csv_text))
    for parsed_row in reader:
        # The first row containing any non-whitespace cell is the header.
        if any(cell.strip() for cell in parsed_row):
            return parsed_row
    return []
|
||||
|
||||
|
||||
def parse_csv_string(csv_text: str) -> Generator[ParsedRow, None, None]:
    """
    Takes in a string in the form of a CSV and yields back
    each row + header in the csv.
    """
    if not csv_text.strip():
        return

    header: list[str] | None = None
    for parsed in csv.reader(io.StringIO(csv_text)):
        # Blank rows (no non-whitespace cell) are skipped outright.
        if not any(cell.strip() for cell in parsed):
            continue
        # The first surviving row becomes the shared header; the rest pair up.
        if header is None:
            header = parsed
        else:
            yield ParsedRow(header=header, row=parsed)
|
||||
@@ -15,6 +15,9 @@ from onyx.connectors.models import Section
|
||||
from onyx.connectors.models import TabularSection
|
||||
from onyx.indexing.chunking.section_chunker import AccumulatorState
|
||||
from onyx.indexing.chunking.tabular_section_chunker import TabularChunker
|
||||
from onyx.indexing.chunking.tabular_section_chunker.sheet_descriptor import (
|
||||
build_sheet_descriptor_chunks,
|
||||
)
|
||||
from onyx.natural_language_processing.utils import BaseTokenizer
|
||||
|
||||
|
||||
@@ -29,8 +32,12 @@ class CharTokenizer(BaseTokenizer):
|
||||
return "".join(chr(t) for t in tokens)
|
||||
|
||||
|
||||
def _make_chunker() -> TabularChunker:
|
||||
return TabularChunker(tokenizer=CharTokenizer())
|
||||
def _make_chunker_no_metadata() -> TabularChunker:
|
||||
return TabularChunker(tokenizer=CharTokenizer(), ignore_metadata_chunks=True)
|
||||
|
||||
|
||||
def _make_chunker_with_metadata() -> TabularChunker:
|
||||
return TabularChunker(tokenizer=CharTokenizer(), ignore_metadata_chunks=False)
|
||||
|
||||
|
||||
_DEFAULT_LINK = "https://example.com/doc"
|
||||
@@ -62,7 +69,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -91,7 +98,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -104,26 +111,35 @@ class TestTabularChunkerChunkSection:
|
||||
# Link carries through every chunk.
|
||||
assert all(p.links == {0: _DEFAULT_LINK} for p in out.payloads)
|
||||
|
||||
# Add back in shortly
|
||||
# def test_header_only_csv_produces_single_prelude_chunk(self) -> None:
|
||||
# # --- INPUT -----------------------------------------------------
|
||||
# csv_text = "col1,col2\n"
|
||||
# link = "sheet:Headers"
|
||||
def test_header_only_csv_emits_metadata_chunk_with_no_content(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# A header-only CSV has no data rows, so `parse_to_chunks` emits
|
||||
# nothing. With metadata enabled, the descriptor still fires —
|
||||
# column names alone are useful retrieval signal.
|
||||
csv_text = "col1,col2\n"
|
||||
heading = "sheet:Headers"
|
||||
content_token_limit = 500
|
||||
|
||||
# # --- EXPECTED --------------------------------------------------
|
||||
# expected_texts = [
|
||||
# "sheet:Headers\nColumns: col1, col2",
|
||||
# ]
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected_texts = [
|
||||
"sheet:Headers\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 0 rows and 2 columns.\n"
|
||||
"Columns: col1, col2",
|
||||
]
|
||||
|
||||
# # --- ACT -------------------------------------------------------
|
||||
# out = _make_chunker().chunk_section(
|
||||
# _tabular_section(csv_text, link=link),
|
||||
# AccumulatorState(),
|
||||
# content_token_limit=500,
|
||||
# )
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker_with_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
)
|
||||
|
||||
# # --- ASSERT ----------------------------------------------------
|
||||
# assert [p.text for p in out.payloads] == expected_texts
|
||||
# --- ASSERT ----------------------------------------------------
|
||||
assert [p.text for p in out.payloads] == expected_texts
|
||||
assert [p.is_continuation for p in out.payloads] == [False]
|
||||
assert all(p.links == {0: _DEFAULT_LINK} for p in out.payloads)
|
||||
assert out.accumulator.is_empty()
|
||||
|
||||
def test_empty_cells_dropped_from_chunk_text(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
@@ -143,7 +159,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=500,
|
||||
@@ -166,7 +182,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=500,
|
||||
@@ -188,7 +204,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=500,
|
||||
@@ -215,7 +231,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(
|
||||
text=pending_text,
|
||||
@@ -258,7 +274,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -296,7 +312,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -337,7 +353,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -365,7 +381,7 @@ class TestTabularChunkerChunkSection:
|
||||
expected_texts = [pending_text]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section("", heading="sheet:Empty"),
|
||||
AccumulatorState(
|
||||
text=pending_text,
|
||||
@@ -410,7 +426,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -440,7 +456,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=500,
|
||||
@@ -487,7 +503,7 @@ class TestTabularChunkerChunkSection:
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -517,7 +533,7 @@ class TestTabularChunkerChunkSection:
|
||||
expected_texts = ["Columns: x\nx=y"]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -548,7 +564,7 @@ class TestTabularChunkerChunkSection:
|
||||
expected_texts = ["S\nABC=1, DEF=2"]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker().chunk_section(
|
||||
out = _make_chunker_no_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
@@ -556,3 +572,268 @@ class TestTabularChunkerChunkSection:
|
||||
|
||||
# --- ASSERT ----------------------------------------------------
|
||||
assert [p.text for p in out.payloads] == expected_texts
|
||||
|
||||
def test_metadata_chunks_appended_after_content_when_enabled(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# With ignore_metadata_chunks=False, the descriptor chunk is
|
||||
# appended AFTER the content chunk(s). is_continuation tracks
|
||||
# the index in the combined output, so the metadata chunk is
|
||||
# marked as a continuation.
|
||||
csv_text = "Name,Age\n" "Alice,30\n" "Bob,25\n"
|
||||
heading = "sheet:T"
|
||||
content_token_limit = 500
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
content_chunk = (
|
||||
"sheet:T\n" "Columns: Name, Age\n" "Name=Alice, Age=30\n" "Name=Bob, Age=25"
|
||||
)
|
||||
metadata_chunk = (
|
||||
"sheet:T\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 2 rows and 2 columns.\n"
|
||||
"Columns: Name, Age\n"
|
||||
"Numeric columns (aggregatable by sum, average, min, max): Age\n"
|
||||
"Categorical columns (groupable, can be counted by value): Name\n"
|
||||
"Values seen in Name: Alice, Bob"
|
||||
)
|
||||
expected_texts = [content_chunk, metadata_chunk]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = _make_chunker_with_metadata().chunk_section(
|
||||
_tabular_section(csv_text, heading=heading),
|
||||
AccumulatorState(),
|
||||
content_token_limit=content_token_limit,
|
||||
)
|
||||
|
||||
# --- ASSERT ----------------------------------------------------
|
||||
assert [p.text for p in out.payloads] == expected_texts
|
||||
# Content first, metadata second — only the first chunk is fresh.
|
||||
assert [p.is_continuation for p in out.payloads] == [False, True]
|
||||
|
||||
|
||||
class TestBuildSheetDescriptorChunks:
|
||||
"""Direct tests of `build_sheet_descriptor_chunks` — the per-section
|
||||
descriptor builder that backs the metadata chunks emitted by
|
||||
`TabularChunker` when ``ignore_metadata_chunks=False``.
|
||||
|
||||
A character-level tokenizer (1 char == 1 token) is used so the
|
||||
`_pack_lines` budget arithmetic is deterministic and expected
|
||||
chunks can be spelled out exactly.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def _build(
|
||||
csv_text: str,
|
||||
heading: str | None = "sheet:T",
|
||||
max_tokens: int = 500,
|
||||
) -> list[str]:
|
||||
section = TabularSection(text=csv_text, link=_DEFAULT_LINK, heading=heading)
|
||||
return build_sheet_descriptor_chunks(
|
||||
section=section,
|
||||
tokenizer=CharTokenizer(),
|
||||
max_tokens=max_tokens,
|
||||
)
|
||||
|
||||
def test_basic_descriptor_emits_every_component(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# CSV exercises every optional descriptor line:
|
||||
# - id → numeric AND identifier (unique + id-named)
|
||||
# - Name → categorical (with sample values)
|
||||
# - Age → numeric
|
||||
# - joined_at → date column → contributes to time range
|
||||
csv_text = (
|
||||
"id,Name,Age,joined_at\n" "1,Alice,30,2024-01-15\n" "2,Bob,25,2024-02-20\n"
|
||||
)
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"sheet:T\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 2 rows and 4 columns.\n"
|
||||
"Columns: id, Name, Age, joined_at (joined at)\n"
|
||||
"Time range: 2024-01-15 to 2024-02-20.\n"
|
||||
"Numeric columns (aggregatable by sum, average, min, max): id, Age\n"
|
||||
"Categorical columns (groupable, can be counted by value): Name\n"
|
||||
"Identifier column: id.\n"
|
||||
"Values seen in Name: Alice, Bob"
|
||||
]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text) == expected
|
||||
|
||||
def test_numeric_only_omits_categorical_and_values_seen_lines(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# All-numeric CSV: no categorical line, no identifier line, no
|
||||
# values-seen lines, no time range.
|
||||
csv_text = "x,y\n1,2\n3,4\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"sheet:T\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 2 rows and 2 columns.\n"
|
||||
"Columns: x, y\n"
|
||||
"Numeric columns (aggregatable by sum, average, min, max): x, y"
|
||||
]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text) == expected
|
||||
|
||||
def test_underscored_column_names_get_friendly_alias_in_descriptor(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# Underscored headers get the same `name (name with spaces)`
|
||||
# alias used by `format_columns_header`, so retrieval matches
|
||||
# either form. The alias appears in every line that names the
|
||||
# column (Columns:, Categorical columns:, Values seen in ...).
|
||||
csv_text = "MTTR_hours,owner_name\n3,Alice\n5,Bob\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"sheet:T\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 2 rows and 2 columns.\n"
|
||||
"Columns: MTTR_hours (MTTR hours), owner_name (owner name)\n"
|
||||
"Numeric columns (aggregatable by sum, average, min, max): "
|
||||
"MTTR_hours (MTTR hours)\n"
|
||||
"Categorical columns (groupable, can be counted by value): "
|
||||
"owner_name (owner name)\n"
|
||||
"Values seen in owner_name (owner name): Alice, Bob"
|
||||
]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text) == expected
|
||||
|
||||
def test_identifier_column_detected_for_unique_id_named_column(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# `uuid` is unique AND its name is in the ID_NAME_TOKENS set, so
|
||||
# it gets flagged as the identifier column. Non-numeric values
|
||||
# also make it categorical.
|
||||
csv_text = "uuid,Name\nabc,Alice\ndef,Bob\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"sheet:T\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 2 rows and 2 columns.\n"
|
||||
"Columns: uuid, Name\n"
|
||||
"Categorical columns (groupable, can be counted by value): uuid, Name\n"
|
||||
"Identifier column: uuid.\n"
|
||||
"Values seen in uuid: abc, def\n"
|
||||
"Values seen in Name: Alice, Bob"
|
||||
]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text) == expected
|
||||
|
||||
def test_time_range_emitted_for_date_only_column(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# A column whose values all parse as dates contributes to the
|
||||
# `Time range:` line and is excluded from numeric/categorical
|
||||
# classification.
|
||||
csv_text = "joined_at\n2024-01-15\n2024-03-20\n2024-02-10\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"sheet:T\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 3 rows and 1 columns.\n"
|
||||
"Columns: joined_at (joined at)\n"
|
||||
"Time range: 2024-01-15 to 2024-03-20."
|
||||
]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text) == expected
|
||||
|
||||
def test_empty_section_returns_no_chunks(self) -> None:
|
||||
# Empty CSV text → nothing to describe.
|
||||
assert self._build("") == []
|
||||
|
||||
def test_header_only_csv_emits_descriptor_with_zero_rows(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# Header line alone, no data rows. Column names are still useful
|
||||
# retrieval signal, so a minimal descriptor is emitted with
|
||||
# row_count=0 and no numeric/categorical/values-seen lines.
|
||||
csv_text = "col1,col2\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"sheet:T\n"
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 0 rows and 2 columns.\n"
|
||||
"Columns: col1, col2"
|
||||
]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text) == expected
|
||||
|
||||
def test_no_heading_means_no_prefix_line_in_chunks(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# heading=None → `_pack_lines` runs with prefix="", so emitted
|
||||
# chunks do not start with a heading line.
|
||||
csv_text = "Name\nAlice\nBob\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"Sheet overview.\n"
|
||||
"This sheet has 2 rows and 1 columns.\n"
|
||||
"Columns: Name\n"
|
||||
"Categorical columns (groupable, can be counted by value): Name\n"
|
||||
"Values seen in Name: Alice, Bob"
|
||||
]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text, heading=None) == expected
|
||||
|
||||
def test_descriptor_splits_across_chunks_with_heading_repeated(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# Tight budget forces the descriptor across multiple chunks. The
|
||||
# heading is prepended to every emitted chunk so retrieval keeps
|
||||
# context after the split. Lines that exceed the budget on their
|
||||
# own are silently skipped.
|
||||
#
|
||||
# heading="S" (1 char) → prefix_tokens = 1+1 = 2; budget = 60-2 = 58.
|
||||
# Lines (and lengths under CharTokenizer):
|
||||
# overview = "Sheet overview.\nThis sheet has 5 rows and 1 columns." (52)
|
||||
# columns = "Columns: Name" (13)
|
||||
# categorical = "Categorical columns (groupable, ...): Name" (62) > 58 → SKIPPED
|
||||
# values_seen = "Values seen in Name: Alice, Bob, Charlie, Dave, Eve" (51)
|
||||
# Pack:
|
||||
# [overview(52)] → fits, current=52
|
||||
# + columns(13): 52+1+13 = 66 > 58 → flush; current=[columns], 13
|
||||
# skip categorical (oversize)
|
||||
# + values_seen(51): 13+1+51 = 65 > 58 → flush; current=[values_seen], 51
|
||||
# end → flush
|
||||
csv_text = "Name\nAlice\nBob\nCharlie\nDave\nEve\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = [
|
||||
"S\nSheet overview.\nThis sheet has 5 rows and 1 columns.",
|
||||
"S\nColumns: Name",
|
||||
"S\nValues seen in Name: Alice, Bob, Charlie, Dave, Eve",
|
||||
]
|
||||
|
||||
# --- ACT -------------------------------------------------------
|
||||
out = self._build(csv_text, heading="S", max_tokens=60)
|
||||
|
||||
# --- ASSERT ----------------------------------------------------
|
||||
assert out == expected
|
||||
# Every emitted chunk fits the budget.
|
||||
assert all(len(c) <= 60 for c in out)
|
||||
# The dropped categorical line never makes it into output.
|
||||
assert all("Categorical columns" not in c for c in out)
|
||||
|
||||
def test_lines_exceeding_budget_are_skipped(self) -> None:
|
||||
# --- INPUT -----------------------------------------------------
|
||||
# heading="" (no prefix) → budget = max_tokens.
|
||||
# Lines:
|
||||
# overview = "Sheet overview.\nThis sheet has 1 rows and 1 columns." (52) > 30 → SKIPPED
|
||||
# columns = "Columns: x" (10)
|
||||
# numeric = "Numeric columns (...): x" (59) > 30 → SKIPPED
|
||||
# Only the columns line survives.
|
||||
csv_text = "x\n1\n"
|
||||
|
||||
# --- EXPECTED --------------------------------------------------
|
||||
expected = ["Columns: x"]
|
||||
|
||||
# --- ACT / ASSERT ---------------------------------------------
|
||||
assert self._build(csv_text, heading="", max_tokens=30) == expected
|
||||
|
||||
Reference in New Issue
Block a user