Compare commits

...

4 Commits

Author SHA1 Message Date
pablonyx
a475f1b328 nit 2025-03-29 12:47:06 -07:00
pablonyx
e37bb2209e Revert "quick fix"
This reverts commit 906b29bd9b.
2025-03-29 12:46:05 -07:00
pablonyx
5aba739aee quick fix 2025-03-29 12:44:56 -07:00
pablonyx
906b29bd9b quick fix 2025-03-29 11:51:53 -07:00
2 changed files with 15 additions and 28 deletions

View File

@@ -224,27 +224,6 @@ class Chunker:
)
chunks_list.append(new_chunk)
def _chunk_document(
self,
document: IndexingDocument,
title_prefix: str,
metadata_suffix_semantic: str,
metadata_suffix_keyword: str,
content_token_limit: int,
) -> list[DocAwareChunk]:
"""
Legacy method for backward compatibility.
Calls _chunk_document_with_sections with document.sections.
"""
return self._chunk_document_with_sections(
document,
document.processed_sections,
title_prefix,
metadata_suffix_semantic,
metadata_suffix_keyword,
content_token_limit,
)
def _chunk_document_with_sections(
self,
document: IndexingDocument,
@@ -264,7 +243,7 @@ class Chunker:
for section_idx, section in enumerate(sections):
# Get section text and other attributes
section_text = clean_text(section.text or "")
section_text = clean_text(str(section.text or ""))
section_link_text = section.link or ""
image_url = section.image_file_name

View File

@@ -439,7 +439,7 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
**document.dict(),
processed_sections=[
Section(
text=section.text if isinstance(section, TextSection) else None,
text=section.text if isinstance(section, TextSection) else "",
link=section.link,
image_file_name=section.image_file_name
if isinstance(section, ImageSection)
@@ -459,11 +459,11 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
for section in document.sections:
# For ImageSection, process and create base Section with both text and image_file_name
if isinstance(section, ImageSection):
# Default section with image path preserved
# Default section with image path preserved - ensure text is always a string
processed_section = Section(
link=section.link,
image_file_name=section.image_file_name,
text=None, # Will be populated if summarization succeeds
text="", # Initialize with empty string
)
# Try to get image summary
@@ -506,13 +506,21 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
# For TextSection, create a base Section with text and link
elif isinstance(section, TextSection):
processed_section = Section(
text=section.text, link=section.link, image_file_name=None
text=section.text or "", # Ensure text is always a string, not None
link=section.link,
image_file_name=None,
)
processed_sections.append(processed_section)
# If it's already a base Section (unlikely), just append it
# If it's already a base Section (unlikely), just append it with text validation
else:
processed_sections.append(section)
# Ensure text is always a string
processed_section = Section(
text=section.text if section.text is not None else "",
link=section.link,
image_file_name=section.image_file_name,
)
processed_sections.append(processed_section)
# Create IndexingDocument with original sections and processed_sections
indexed_document = IndexingDocument(