Compare commits

...

1 Commit

Author SHA1 Message Date
joachim-danswer
b3953b2c2f new profile & double yql 2025-04-11 14:06:03 -07:00
2 changed files with 175 additions and 60 deletions

View File

@@ -243,6 +243,73 @@ schema DANSWER_CHUNK_NAME {
}
}
# Keyword-first hybrid ranking profile. VARIABLE_DIM is a template placeholder,
# presumably substituted with the embedding dimension when the schema is rendered
# (the query layer requests profiles named hybrid_search_kw_first_phase<dim>) — TODO confirm.
rank-profile hybrid_search_kw_first_phaseVARIABLE_DIM inherits default, default_rank {
inputs {
# Query-time embedding; its dimension must match VARIABLE_DIM.
query(query_embedding) tensor<float>(x[VARIABLE_DIM])
}
# Vector similarity for a document: the better of content closeness and title closeness.
function title_vector_score() {
expression {
# If no good matching titles, then it should use the context embeddings rather than having some
# irrelevant title have a vector score of 1. This way at least it will be the doc with the highest
# matching content score getting the full score
max(closeness(field, embeddings), closeness(field, title_embedding))
}
}
# First phase is keyword-only (BM25) scoring, weighted toward content over title.
# NOTE(review): the previous comment here claimed the first phase "must be vector",
# which contradicts the expression below — vector scores only enter in global-phase.
first-phase {
expression: 0.2 * bm25(title) + 0.8 * bm25(content)
}
# Global phase: weighted average between vector similarity and BM-25.
# query(alpha) balances vector vs keyword; query(title_content_ratio) balances title vs content.
global-phase {
expression {
(
# Weighted Vector Similarity Score
(
query(alpha) * (
(query(title_content_ratio) * normalize_linear(title_vector_score))
+
((1 - query(title_content_ratio)) * normalize_linear(closeness(field, embeddings)))
)
)
+
# Weighted Keyword Similarity Score
# Note: for the BM25 Title score, it requires decent stopword removal in the query
# This needs to be the case so there aren't irrelevant titles being normalized to a score of 1
(
(1 - query(alpha)) * (
(query(title_content_ratio) * normalize_linear(bm25(title)))
+
((1 - query(title_content_ratio)) * normalize_linear(bm25(content)))
)
)
)
# Boost based on user feedback
* document_boost
# Decay factor based on time document was last updated
* recency_bias
# Boost based on aggregated boost calculation
* aggregated_chunk_boost
}
# Only the top 1000 first-phase hits are re-ranked by the global-phase expression.
rerank-count: 1000
}
# Expose these per-hit feature values in the search response (for debugging/score inspection).
match-features {
bm25(title)
bm25(content)
closeness(field, title_embedding)
closeness(field, embeddings)
document_boost
recency_bias
aggregated_chunk_boost
closest(embeddings)
}
}
# Used when searching from the admin UI for a specific doc to hide / boost
# Very heavily prioritize title
rank-profile admin_search inherits default, default_rank {

View File

@@ -297,76 +297,124 @@ def query_vespa(
if "query" in query_params and not cast(str, query_params["query"]).strip():
raise ValueError("No/empty query received")
params = dict(
**query_params,
**(
{
"presentation.timing": True,
}
if LOG_VESPA_TIMING_INFORMATION
else {}
),
)
configured_ranking_profile = query_params.get("ranking.profile")
if not configured_ranking_profile:
raise ValueError("No ranking profile configured")
try:
with get_vespa_http_client() as http_client:
response = http_client.post(SEARCH_ENDPOINT, json=params)
response.raise_for_status()
except httpx.HTTPError as e:
error_base = "Failed to query Vespa"
logger.error(
f"{error_base}:\n"
f"Request URL: {e.request.url}\n"
f"Request Headers: {e.request.headers}\n"
f"Request Payload: {params}\n"
f"Exception: {str(e)}"
+ (
f"\nResponse: {e.response.text}"
if isinstance(e, httpx.HTTPStatusError)
else ""
)
query_profiles: list[float | int | str] = []
if (
configured_ranking_profile
and isinstance(configured_ranking_profile, str)
and configured_ranking_profile.startswith("hybrid_search")
):
dimension = configured_ranking_profile.split("hybrid_search")[1]
query_profiles = [
f"hybrid_search_kw_first_phase{dimension}",
f"hybrid_search{dimension}",
]
else:
query_profiles = [configured_ranking_profile]
inference_chunk_sets = []
mutable_params = dict(query_params)
for query_profile in query_profiles:
mutable_params["ranking.profile"] = query_profile
params = dict(
**mutable_params,
**(
{
"presentation.timing": True,
}
if LOG_VESPA_TIMING_INFORMATION
else {}
),
)
raise httpx.HTTPError(error_base) from e
response_json: dict[str, Any] = response.json()
if LOG_VESPA_TIMING_INFORMATION:
logger.debug("Vespa timing info: %s", response_json.get("timing"))
hits = response_json["root"].get("children", [])
if not hits:
logger.warning(
f"No hits found for YQL Query: {query_params.get('yql', 'No YQL Query')}"
)
logger.debug(f"Vespa Response: {response.text}")
for hit in hits:
if hit["fields"].get(CONTENT) is None:
identifier = hit["fields"].get("documentid") or hit["id"]
try:
with get_vespa_http_client() as http_client:
response = http_client.post(SEARCH_ENDPOINT, json=params)
response.raise_for_status()
except httpx.HTTPError as e:
error_base = "Failed to query Vespa"
logger.error(
f"Vespa Index with Vespa ID {identifier} has no contents. "
f"This is invalid because the vector is not meaningful and keywordsearch cannot "
f"fetch this document"
f"{error_base}:\n"
f"Request URL: {e.request.url}\n"
f"Request Headers: {e.request.headers}\n"
f"Request Payload: {params}\n"
f"Exception: {str(e)}"
+ (
f"\nResponse: {e.response.text}"
if isinstance(e, httpx.HTTPStatusError)
else ""
)
)
raise httpx.HTTPError(error_base) from e
filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]
response_json: dict[str, Any] = response.json()
inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
if LOG_VESPA_TIMING_INFORMATION:
logger.debug("Vespa timing info: %s", response_json.get("timing"))
hits = response_json["root"].get("children", [])
try:
num_retrieved_inference_chunks = len(inference_chunks)
num_retrieved_document_ids = len(
set([chunk.document_id for chunk in inference_chunks])
)
logger.debug(
f"Retrieved {num_retrieved_inference_chunks} inference chunks for {num_retrieved_document_ids} documents"
)
except Exception as e:
# Debug logging only, should not fail the retrieval
logger.error(f"Error logging retrieval statistics: {e}")
if not hits:
logger.warning(
f"No hits found for YQL Query: {query_params.get('yql', 'No YQL Query')}"
)
logger.debug(f"Vespa Response: {response.text}")
for hit in hits:
if hit["fields"].get(CONTENT) is None:
identifier = hit["fields"].get("documentid") or hit["id"]
logger.error(
f"Vespa Index with Vespa ID {identifier} has no contents. "
f"This is invalid because the vector is not meaningful and keywordsearch cannot "
f"fetch this document"
)
filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]
inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
try:
num_retrieved_inference_chunks = len(inference_chunks)
num_retrieved_document_ids = len(
set([chunk.document_id for chunk in inference_chunks])
)
logger.debug(
f"Retrieved {num_retrieved_inference_chunks} inference chunks for {num_retrieved_document_ids} documents"
)
except Exception as e:
# Debug logging only, should not fail the retrieval
logger.error(f"Error logging retrieval statistics: {e}")
inference_chunk_sets.append(inference_chunks)
flattened_inference_chunks = []
for inference_chunk_set in inference_chunk_sets:
flattened_inference_chunks.extend(inference_chunk_set)
flattened_inference_chunks.sort(key=lambda chunk: chunk.score, reverse=True)
final_chunks = []
used_document_chunk_ids = set()
for chunk in flattened_inference_chunks:
if (
chunk.document_id + "__" + str(chunk.chunk_id)
not in used_document_chunk_ids
):
final_chunks.append(chunk)
used_document_chunk_ids.add(chunk.document_id + "__" + str(chunk.chunk_id))
else:
continue
return final_chunks
# Good Debugging Spot
return inference_chunks
return flattened_inference_chunks
def _get_chunks_via_batch_search(