.

Some fixes
.
2026-03-28 11:02:42 +00:00 · 2026-03-11 17:52:55 -07:00 · 2026-03-11 17:44:49 -07:00 · 2026-03-11 17:32:30 -07:00 · 2026-03-11 17:19:09 -07:00
3 changed files with 263 additions and 21 deletions
--- a/backend/onyx/file_processing/extract_file_text.py
+++ b/backend/onyx/file_processing/extract_file_text.py
@@ -1,3 +1,4 @@
+import csv
 import gc
 import io
 import json
@@ -19,6 +20,7 @@ from zipfile import BadZipFile

 import chardet
 import openpyxl
+from openpyxl.worksheet.worksheet import Worksheet
 from PIL import Image

 from onyx.configs.constants import ONYX_METADATA_FILENAME
@@ -352,6 +354,65 @@ def pptx_to_text(file: IO[Any], file_name: str = "") -> str:
    return presentation.markdown


+def _worksheet_to_matrix(
+    worksheet: Worksheet,
+) -> list[list[str]]:
+    """
+    Converts a single worksheet to a matrix of strings; None cells become ""
+    """
+    rows: list[list[str]] = []
+    for worksheet_row in worksheet.iter_rows(min_row=1, values_only=True):
+        row = ["" if cell is None else str(cell) for cell in worksheet_row]
+        rows.append(row)
+
+    return rows
+
+
+def _clean_worksheet_matrix(matrix: list[list[str]]) -> list[list[str]]:
+    """
+    Cleans a worksheet matrix by capping runs of consecutive empty rows/columns
+    at MAX_EMPTY_ROWS/MAX_EMPTY_COLS; trailing empty rows/columns are dropped
+    """
+    MAX_EMPTY_ROWS = 2  # Runs longer than this are capped to max_empty; shorter runs are preserved as-is
+    MAX_EMPTY_COLS = 2
+
+    # Row cleanup
+    matrix = _remove_empty_runs(matrix, max_empty=MAX_EMPTY_ROWS)
+
+    # Column cleanup (transpose, clean, transpose back)
+    transposed = list(map(list, zip(*matrix))) if matrix else []
+    transposed = _remove_empty_runs(transposed, max_empty=MAX_EMPTY_COLS)
+    matrix = list(map(list, zip(*transposed))) if transposed else []
+
+    return matrix
+
+
+def _remove_empty_runs(
+    rows: list[list[str]],
+    max_empty: int,
+) -> list[list[str]]:
+    """Truncates every run of empty rows to at most max_empty rows.
+
+    Leading runs are capped like interior ones. Trailing empty rows are always
+    dropped, since no later non-empty row ever flushes the buffered run.
+    """
+    result: list[list[str]] = []
+    empty_buffer: list[list[str]] = []
+
+    for row in rows:
+        # Check if empty
+        if not any(row):
+            empty_buffer.append(row)
+        else:
+            # Flush at most max_empty of the buffered empty rows ahead of this row
+            result.extend(empty_buffer[:max_empty])
+            # Add the new non-empty row
+            result.append(row)
+            empty_buffer = []
+
+    return result
+
+
 def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
    # TODO: switch back to this approach in a few months when markitdown
    # fixes their handling of excel files
@@ -390,30 +451,15 @@ def xlsx_to_text(file: IO[Any], file_name: str = "") -> str:
                f"Failed to extract text from {file_name or 'xlsx file'}. This happens due to a bug in openpyxl. {e}"
            )
            return ""
-        raise e
+        raise

    text_content = []
    for sheet in workbook.worksheets:
-        rows = []
-        num_empty_consecutive_rows = 0
-        for row in sheet.iter_rows(min_row=1, values_only=True):
-            row_str = ",".join(str(cell or "") for cell in row)
-
-            # Only add the row if there are any values in the cells
-            if len(row_str) >= len(row):
-                rows.append(row_str)
-                num_empty_consecutive_rows = 0
-            else:
-                num_empty_consecutive_rows += 1
-
-            if num_empty_consecutive_rows > 100:
-                # handle massive excel sheets with mostly empty cells
-                logger.warning(
-                    f"Found {num_empty_consecutive_rows} empty rows in {file_name}, skipping rest of file"
-                )
-                break
-        sheet_str = "\n".join(rows)
-        text_content.append(sheet_str)
+        sheet_matrix = _clean_worksheet_matrix(_worksheet_to_matrix(sheet))
+        buf = io.StringIO()
+        writer = csv.writer(buf, lineterminator="\n")
+        writer.writerows(sheet_matrix)
+        text_content.append(buf.getvalue().rstrip("\n"))
    return TEXT_SECTION_SEPARATOR.join(text_content)


--- a/backend/tests/unit/onyx/file_processing/__init__.py
+++ b/backend/tests/unit/onyx/file_processing/__init__.py
--- a/backend/tests/unit/onyx/file_processing/test_xlsx_to_text.py
+++ b/backend/tests/unit/onyx/file_processing/test_xlsx_to_text.py
@@ -0,0 +1,196 @@
+import io
+
+import openpyxl
+
+from onyx.file_processing.extract_file_text import xlsx_to_text
+
+
+def _make_xlsx(sheets: dict[str, list[list[str]]]) -> io.BytesIO:
+    """Create an in-memory xlsx file from a dict of sheet_name -> matrix of strings."""
+    wb = openpyxl.Workbook()
+    if wb.active is not None:
+        wb.remove(wb.active)
+    for sheet_name, rows in sheets.items():
+        ws = wb.create_sheet(title=sheet_name)
+        for row in rows:
+            ws.append(row)
+    buf = io.BytesIO()
+    wb.save(buf)
+    buf.seek(0)
+    return buf
+
+
+class TestXlsxToText:
+    def test_single_sheet_basic(self) -> None:
+        xlsx = _make_xlsx(
+            {
+                "Sheet1": [
+                    ["Name", "Age"],
+                    ["Alice", "30"],
+                    ["Bob", "25"],
+                ]
+            }
+        )
+        result = xlsx_to_text(xlsx)
+        lines = [line for line in result.strip().split("\n") if line.strip()]
+        assert len(lines) == 3
+        assert "Name" in lines[0]
+        assert "Age" in lines[0]
+        assert "Alice" in lines[1]
+        assert "30" in lines[1]
+        assert "Bob" in lines[2]
+
+    def test_multiple_sheets_separated(self) -> None:
+        xlsx = _make_xlsx(
+            {
+                "Sheet1": [["a", "b"]],
+                "Sheet2": [["c", "d"]],
+            }
+        )
+        result = xlsx_to_text(xlsx)
+        # TEXT_SECTION_SEPARATOR is "\n\n"
+        assert "\n\n" in result
+        parts = result.split("\n\n")
+        assert any("a" in p for p in parts)
+        assert any("c" in p for p in parts)
+
+    def test_empty_cells(self) -> None:
+        xlsx = _make_xlsx(
+            {
+                "Sheet1": [
+                    ["a", "", "b"],
+                    ["", "c", ""],
+                ]
+            }
+        )
+        result = xlsx_to_text(xlsx)
+        lines = [line for line in result.strip().split("\n") if line.strip()]
+        assert len(lines) == 2
+
+    def test_commas_in_cells_are_quoted(self) -> None:
+        """Cells containing commas should be quoted in CSV output."""
+        xlsx = _make_xlsx(
+            {
+                "Sheet1": [
+                    ["hello, world", "normal"],
+                ]
+            }
+        )
+        result = xlsx_to_text(xlsx)
+        assert '"hello, world"' in result
+
+    def test_empty_workbook(self) -> None:
+        xlsx = _make_xlsx({"Sheet1": []})
+        result = xlsx_to_text(xlsx)
+        assert result.strip() == ""
+
+    def test_long_empty_row_run_capped(self) -> None:
+        """Runs of >2 empty rows should be capped to 2."""
+        xlsx = _make_xlsx(
+            {
+                "Sheet1": [
+                    ["header"],
+                    [""],
+                    [""],
+                    [""],
+                    [""],
+                    ["data"],
+                ]
+            }
+        )
+        result = xlsx_to_text(xlsx)
+        lines = [line for line in result.strip().split("\n") if line.strip()]
+        # 4 empty rows capped to 2, so: header + 2 empty + data = 4 lines
+        assert len(lines) == 4
+        assert "header" in lines[0]
+        assert "data" in lines[-1]
+
+    def test_long_empty_col_run_capped(self) -> None:
+        """Runs of >2 empty columns should be capped to 2."""
+        xlsx = _make_xlsx(
+            {
+                "Sheet1": [
+                    ["a", "", "", "", "b"],
+                    ["c", "", "", "", "d"],
+                ]
+            }
+        )
+        result = xlsx_to_text(xlsx)
+        lines = [line for line in result.strip().split("\n") if line.strip()]
+        assert len(lines) == 2
+        # Each row should have 4 fields (a + 2 empty + b), not 5
+        # csv format: a,,,b (3 commas = 4 fields)
+        first_line = lines[0].strip()
+        # Count commas to verify column reduction
+        assert first_line.count(",") == 3
+
+    def test_short_empty_runs_kept(self) -> None:
+        """Runs of <=2 empty rows/cols should be preserved."""
+        xlsx = _make_xlsx(
+            {
+                "Sheet1": [
+                    ["a", "b"],
+                    ["", ""],
+                    ["", ""],
+                    ["c", "d"],
+                ]
+            }
+        )
+        result = xlsx_to_text(xlsx)
+        lines = [line for line in result.strip().split("\n") if line.strip()]
+        # All 4 rows preserved (2 empty rows <= threshold)
+        assert len(lines) == 4
+
+    def test_bad_zip_file_returns_empty(self) -> None:
+        bad_file = io.BytesIO(b"not a zip file")
+        result = xlsx_to_text(bad_file, file_name="test.xlsx")
+        assert result == ""
+
+    def test_bad_zip_tilde_file_returns_empty(self) -> None:
+        bad_file = io.BytesIO(b"not a zip file")
+        result = xlsx_to_text(bad_file, file_name="~$temp.xlsx")
+        assert result == ""
+
+    def test_large_sparse_sheet(self) -> None:
+        """A sheet with data, a big empty gap, and more data — gap is capped to 2."""
+        rows: list[list[str]] = [["row1_data"]]
+        rows.extend([[""] for _ in range(10)])
+        rows.append(["row2_data"])
+        xlsx = _make_xlsx({"Sheet1": rows})
+        result = xlsx_to_text(xlsx)
+        lines = [line for line in result.strip().split("\n") if line.strip()]
+        # 10 empty rows capped to 2: row1_data + 2 empty + row2_data = 4
+        assert len(lines) == 4
+        assert "row1_data" in lines[0]
+        assert "row2_data" in lines[-1]
+
+    def test_quotes_in_cells(self) -> None:
+        """Cells containing quotes should be properly escaped."""
+        xlsx = _make_xlsx(
+            {
+                "Sheet1": [
+                    ['say "hello"', "normal"],
+                ]
+            }
+        )
+        result = xlsx_to_text(xlsx)
+        # csv.writer escapes quotes by doubling them
+        assert '""hello""' in result
+
+    def test_each_row_is_separate_line(self) -> None:
+        """Each row should produce its own line (regression for writerow vs writerows)."""
+        xlsx = _make_xlsx(
+            {
+                "Sheet1": [
+                    ["r1c1", "r1c2"],
+                    ["r2c1", "r2c2"],
+                    ["r3c1", "r3c2"],
+                ]
+            }
+        )
+        result = xlsx_to_text(xlsx)
+        lines = [line for line in result.strip().split("\n") if line.strip()]
+        assert len(lines) == 3
+        assert "r1c1" in lines[0] and "r1c2" in lines[0]
+        assert "r2c1" in lines[1] and "r2c2" in lines[1]
+        assert "r3c1" in lines[2] and "r3c2" in lines[2]
Author	SHA1	Message	Date
Dane Urban	83558ae04c	.	2026-03-11 17:52:55 -07:00
Dane Urban	005009602c	Some fixes	2026-03-11 17:44:49 -07:00
Dane Urban	b93875353b	.	2026-03-11 17:32:30 -07:00
Dane Urban	2290141b53	xlsx to text	2026-03-11 17:19:09 -07:00