k

2026-02-16 23:35:46 +00:00 · 2024-07-20 16:48:05 -07:00
1 changed file with 21 additions and 25 deletions
--- a/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
+++ b/backend/danswer/document_index/vespa/app_config/schemas/danswer_chunk.sd
@@ -153,43 +153,41 @@ schema DANSWER_CHUNK_NAME {
            query(query_embedding) tensor<float>(x[VARIABLE_DIM])
        }

-        # This must be separate function for normalize_linear to work
-        function vector_score() {
+        function title_vector_score() {
            expression {
                # If no title, the full vector score comes from the content embedding
-                (query(title_content_ratio) * if(attribute(skip_title), closeness(field, embeddings), closeness(field, title_embedding))) +
-                ((1 - query(title_content_ratio)) * closeness(field, embeddings))
-            }
-        }
-
-        # This must be separate function for normalize_linear to work
-        function keyword_score() {
-            expression {
-                (query(title_content_ratio) * bm25(title)) +
-                ((1 - query(title_content_ratio)) * bm25(content))
+                # Not weighted here: query(title_content_ratio) is applied in global-phase to the normalize_linear output
+                if(attribute(skip_title), closeness(field, embeddings), closeness(field, title_embedding))
            }
        }

        first-phase {
-            expression: vector_score
+            expression: closeness(field, embeddings)
        }

        # Weighted average between Vector Search and BM-25
-        # Each is a weighted average between the Title and Content fields
-        # Finally each doc is boosted by it's user feedback based boost and recency
-        # If any embedding or index field is missing, it just receives a score of 0
-        # Assumptions:
-        # - For a given query + corpus, the BM-25 scores will be relatively similar in distribution
-        #   therefore not normalizing before combining.
-        # - For documents without title, it gets a score of 0 for that and this is ok as documents
-        #   without any title match should be penalized.
        global-phase {
            expression {
                (
                    # Weighted Vector Similarity Score
-                    (query(alpha) * normalize_linear(vector_score)) +
+                    (
+                        query(alpha) * (
+                            (query(title_content_ratio) * normalize_linear(title_vector_score))
+                            +
+                            ((1 - query(title_content_ratio)) * normalize_linear(closeness(field, embeddings)))
+                        )
+                    )
+
+                    +
+
                    # Weighted Keyword Similarity Score
-                    ((1 - query(alpha)) * normalize_linear(keyword_score))
+                    (
+                        (1 - query(alpha)) * (
+                            (query(title_content_ratio) * normalize_linear(bm25(title)))
+                            +
+                            ((1 - query(title_content_ratio)) * normalize_linear(bm25(content)))
+                        )
+                    )
                )
                # Boost based on user feedback
                * document_boost
@@ -204,8 +202,6 @@ schema DANSWER_CHUNK_NAME {
            bm25(content)
            closeness(field, title_embedding)
            closeness(field, embeddings)
-            keyword_score
-            vector_score
            document_boost
            recency_bias
            closest(embeddings)