Compare commits

...

3 Commits

Author SHA1 Message Date
Richard Kuo (Danswer)
5b7f40a21c perform just one callback at the start for now 2025-01-10 13:18:39 -08:00
Richard Kuo (Danswer)
3578f47263 add detailed logging to embed chunks 2025-01-10 13:17:31 -08:00
Richard Kuo (Danswer)
229defe7bf add additional debugging for indexing callback during chunking 2025-01-10 10:53:14 -08:00
3 changed files with 24 additions and 3 deletions

View File

@@ -154,6 +154,14 @@ class Document(DocumentBase):
"""Used when logging the identity of a document"""
return f"ID: '{self.id}'; Semantic ID: '{self.semantic_identifier}'"
@property
def total_section_length(self) -> int:
    """Return the combined length of the text of every section.

    Each section contributes ``len(section.text)``; the result is 0 when
    ``self.sections`` is empty.
    """
    # The built-in sum over a generator replaces the manual accumulator loop.
    return sum(len(section.text) for section in self.sections)
@classmethod
def from_base(cls, base: DocumentBase) -> "Document":
return cls(

View File

@@ -373,10 +373,13 @@ class Chunker:
if self.callback.should_stop():
raise RuntimeError("Chunker.chunk: Stop signal detected")
self.callback.progress(
f"Chunker.chunk start: doc={document.id} "
f"total_section_length={document.total_section_length} ",
0,
)
chunks = self._handle_single_document(document)
final_chunks.extend(chunks)
if self.callback:
self.callback.progress("Chunker.chunk", len(chunks))
return final_chunks

View File

@@ -1,3 +1,4 @@
import time
from abc import ABC
from abc import abstractmethod
@@ -106,6 +107,8 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
"""Adds embeddings to the chunks, the title and metadata suffixes are added to the chunk as well
if they exist. If there is no space for it, it would have been thrown out at the chunking step.
"""
start = time.monotonic()
# All chunks at this point must have some non-empty content
flat_chunk_texts: list[str] = []
large_chunks_present = False
@@ -197,6 +200,13 @@ class DefaultIndexingEmbedder(IndexingEmbedder):
embedded_chunks.append(new_embedded_chunk)
embedding_ind_start += num_embeddings
elapsed = time.monotonic() - start
logger.info(
f"embed_chunks finished: "
f"elapsed={elapsed:.2f} "
f"chunks={len(chunks)} "
f"chunks_embedded={len(embedded_chunks)}"
)
return embedded_chunks
@classmethod