Compare commits

..

1 Commits

Author SHA1 Message Date
joachim-danswer
b3953b2c2f new profile & double yql 2025-04-11 14:06:03 -07:00
11 changed files with 239 additions and 135 deletions

View File

@@ -488,16 +488,6 @@ class OnyxConfluence:
old_url_suffix = url_suffix
url_suffix = cast(str, next_response.get("_links", {}).get("next", ""))
# we've observed that Confluence sometimes returns a next link despite giving
# 0 results. This is a bug with Confluence, so we need to check for it and
# stop paginating.
if url_suffix and not results:
logger.info(
f"No results found for call '{old_url_suffix}' despite next link "
"being present. Stopping pagination."
)
break
# make sure we don't update the start by more than the amount
# of results we were able to retrieve. The Confluence API has a
# weird behavior where if you pass in a limit that is too large for

View File

@@ -243,6 +243,73 @@ schema DANSWER_CHUNK_NAME {
}
}
# Keyword-first hybrid ranking profile. VARIABLE_DIM is templated to the
# embedding dimension at deploy time. Unlike the vector-first hybrid profile,
# the first phase here is pure BM25, so hits with strong keyword matches
# survive to the global phase even when their vector similarity is weak.
rank-profile hybrid_search_kw_first_phaseVARIABLE_DIM inherits default, default_rank {
inputs {
query(query_embedding) tensor<float>(x[VARIABLE_DIM])
}
# Best available vector similarity for the "title" term of the global-phase
# blend: falls back to content-embedding closeness when titles don't match.
function title_vector_score() {
expression {
# If no good matching titles, then it should use the context embeddings rather than having some
# irrelevant title have a vector score of 1. This way at least it will be the doc with the highest
# matching content score getting the full score
max(closeness(field, embeddings), closeness(field, title_embedding))
}
}
# Keyword-first variant: first phase ranks by BM25 (title 20% / content 80%)
# so only keyword-matching hits advance. Vector-only hits are covered by the
# companion vector-first hybrid profile queried alongside this one.
first-phase {
expression: 0.2 * bm25(title) + 0.8 * bm25(content)
}
# Weighted average between Vector Search and BM-25
global-phase {
expression {
(
# Weighted Vector Similarity Score
(
query(alpha) * (
(query(title_content_ratio) * normalize_linear(title_vector_score))
+
((1 - query(title_content_ratio)) * normalize_linear(closeness(field, embeddings)))
)
)
+
# Weighted Keyword Similarity Score
# Note: for the BM25 Title score, it requires decent stopword removal in the query
# This needs to be the case so there aren't irrelevant titles being normalized to a score of 1
(
(1 - query(alpha)) * (
(query(title_content_ratio) * normalize_linear(bm25(title)))
+
((1 - query(title_content_ratio)) * normalize_linear(bm25(content)))
)
)
)
# Boost based on user feedback
* document_boost
# Decay factor based on time document was last updated
* recency_bias
# Boost based on aggregated boost calculation
* aggregated_chunk_boost
}
# Apply the global-phase expression to the top 1000 first-phase hits.
rerank-count: 1000
}
# Expose the raw ranking signals on each hit so the caller can inspect
# or re-weight them client-side.
match-features {
bm25(title)
bm25(content)
closeness(field, title_embedding)
closeness(field, embeddings)
document_boost
recency_bias
aggregated_chunk_boost
closest(embeddings)
}
}
# Used when searching from the admin UI for a specific doc to hide / boost
# Very heavily prioritize title
rank-profile admin_search inherits default, default_rank {

View File

@@ -297,76 +297,124 @@ def query_vespa(
if "query" in query_params and not cast(str, query_params["query"]).strip():
raise ValueError("No/empty query received")
params = dict(
**query_params,
**(
{
"presentation.timing": True,
}
if LOG_VESPA_TIMING_INFORMATION
else {}
),
)
configured_ranking_profile = query_params.get("ranking.profile")
if not configured_ranking_profile:
raise ValueError("No ranking profile configured")
try:
with get_vespa_http_client() as http_client:
response = http_client.post(SEARCH_ENDPOINT, json=params)
response.raise_for_status()
except httpx.HTTPError as e:
error_base = "Failed to query Vespa"
logger.error(
f"{error_base}:\n"
f"Request URL: {e.request.url}\n"
f"Request Headers: {e.request.headers}\n"
f"Request Payload: {params}\n"
f"Exception: {str(e)}"
+ (
f"\nResponse: {e.response.text}"
if isinstance(e, httpx.HTTPStatusError)
else ""
)
query_profiles: list[float | int | str] = []
if (
configured_ranking_profile
and isinstance(configured_ranking_profile, str)
and configured_ranking_profile.startswith("hybrid_search")
):
dimension = configured_ranking_profile.split("hybrid_search")[1]
query_profiles = [
f"hybrid_search_kw_first_phase{dimension}",
f"hybrid_search{dimension}",
]
else:
query_profiles = [configured_ranking_profile]
inference_chunk_sets = []
mutable_params = dict(query_params)
for query_profile in query_profiles:
mutable_params["ranking.profile"] = query_profile
params = dict(
**mutable_params,
**(
{
"presentation.timing": True,
}
if LOG_VESPA_TIMING_INFORMATION
else {}
),
)
raise httpx.HTTPError(error_base) from e
response_json: dict[str, Any] = response.json()
if LOG_VESPA_TIMING_INFORMATION:
logger.debug("Vespa timing info: %s", response_json.get("timing"))
hits = response_json["root"].get("children", [])
if not hits:
logger.warning(
f"No hits found for YQL Query: {query_params.get('yql', 'No YQL Query')}"
)
logger.debug(f"Vespa Response: {response.text}")
for hit in hits:
if hit["fields"].get(CONTENT) is None:
identifier = hit["fields"].get("documentid") or hit["id"]
try:
with get_vespa_http_client() as http_client:
response = http_client.post(SEARCH_ENDPOINT, json=params)
response.raise_for_status()
except httpx.HTTPError as e:
error_base = "Failed to query Vespa"
logger.error(
f"Vespa Index with Vespa ID {identifier} has no contents. "
f"This is invalid because the vector is not meaningful and keywordsearch cannot "
f"fetch this document"
f"{error_base}:\n"
f"Request URL: {e.request.url}\n"
f"Request Headers: {e.request.headers}\n"
f"Request Payload: {params}\n"
f"Exception: {str(e)}"
+ (
f"\nResponse: {e.response.text}"
if isinstance(e, httpx.HTTPStatusError)
else ""
)
)
raise httpx.HTTPError(error_base) from e
filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]
response_json: dict[str, Any] = response.json()
inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
if LOG_VESPA_TIMING_INFORMATION:
logger.debug("Vespa timing info: %s", response_json.get("timing"))
hits = response_json["root"].get("children", [])
try:
num_retrieved_inference_chunks = len(inference_chunks)
num_retrieved_document_ids = len(
set([chunk.document_id for chunk in inference_chunks])
)
logger.debug(
f"Retrieved {num_retrieved_inference_chunks} inference chunks for {num_retrieved_document_ids} documents"
)
except Exception as e:
# Debug logging only, should not fail the retrieval
logger.error(f"Error logging retrieval statistics: {e}")
if not hits:
logger.warning(
f"No hits found for YQL Query: {query_params.get('yql', 'No YQL Query')}"
)
logger.debug(f"Vespa Response: {response.text}")
for hit in hits:
if hit["fields"].get(CONTENT) is None:
identifier = hit["fields"].get("documentid") or hit["id"]
logger.error(
f"Vespa Index with Vespa ID {identifier} has no contents. "
f"This is invalid because the vector is not meaningful and keywordsearch cannot "
f"fetch this document"
)
filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]
inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
try:
num_retrieved_inference_chunks = len(inference_chunks)
num_retrieved_document_ids = len(
set([chunk.document_id for chunk in inference_chunks])
)
logger.debug(
f"Retrieved {num_retrieved_inference_chunks} inference chunks for {num_retrieved_document_ids} documents"
)
except Exception as e:
# Debug logging only, should not fail the retrieval
logger.error(f"Error logging retrieval statistics: {e}")
inference_chunk_sets.append(inference_chunks)
flattened_inference_chunks = []
for inference_chunk_set in inference_chunk_sets:
flattened_inference_chunks.extend(inference_chunk_set)
flattened_inference_chunks.sort(key=lambda chunk: chunk.score, reverse=True)
final_chunks = []
used_document_chunk_ids = set()
for chunk in flattened_inference_chunks:
if (
chunk.document_id + "__" + str(chunk.chunk_id)
not in used_document_chunk_ids
):
final_chunks.append(chunk)
used_document_chunk_ids.add(chunk.document_id + "__" + str(chunk.chunk_id))
else:
continue
return final_chunks
# Good Debugging Spot
return inference_chunks
return flattened_inference_chunks
def _get_chunks_via_batch_search(

View File

@@ -29,11 +29,12 @@ function LLMProviderUpdateModal({
llmProviderDescriptor?.name ||
"Custom LLM Provider";
const hasAdvancedOptions = llmProviderDescriptor?.name != "azure";
return (
<Modal
title={`${llmProviderDescriptor ? "Configure" : "Setup"} ${providerName}`}
onOutsideClick={() => onClose()}
hideOverflow={true}
>
<div className="max-h-[70vh] overflow-y-auto px-4">
{llmProviderDescriptor ? (
@@ -43,6 +44,7 @@ function LLMProviderUpdateModal({
existingLlmProvider={existingLlmProvider}
shouldMarkAsDefault={shouldMarkAsDefault}
setPopup={setPopup}
hasAdvancedOptions={hasAdvancedOptions}
/>
) : (
<CustomLLMProviderUpdateForm

View File

@@ -35,12 +35,10 @@ function LLMProviderUpdateModal({
existingLlmProvider?.name ||
"Custom LLM Provider";
const hasAdvancedOptions = llmProviderDescriptor?.name != "azure";
return (
<Modal
title={`Setup ${providerName}`}
onOutsideClick={() => onClose()}
hideOverflow={true}
>
<Modal title={`Setup ${providerName}`} onOutsideClick={() => onClose()}>
<div className="max-h-[70vh] overflow-y-auto px-4">
{llmProviderDescriptor ? (
<LLMProviderUpdateForm
@@ -49,6 +47,7 @@ function LLMProviderUpdateModal({
existingLlmProvider={existingLlmProvider}
shouldMarkAsDefault={shouldMarkAsDefault}
setPopup={setPopup}
hasAdvancedOptions={hasAdvancedOptions}
/>
) : (
<CustomLLMProviderUpdateForm

View File

@@ -29,6 +29,7 @@ export function LLMProviderUpdateForm({
setPopup,
hideSuccess,
firstTimeConfiguration = false,
hasAdvancedOptions = false,
}: {
llmProviderDescriptor: WellKnownLLMProviderDescriptor;
onClose: () => void;
@@ -39,6 +40,7 @@ export function LLMProviderUpdateForm({
// Set this when this is the first time the user is setting Onyx up.
firstTimeConfiguration?: boolean;
hasAdvancedOptions?: boolean;
}) {
const { mutate } = useSWRConfig();
@@ -300,7 +302,7 @@ export function LLMProviderUpdateForm({
}
})}
{!firstTimeConfiguration && (
{hasAdvancedOptions && !firstTimeConfiguration && (
<>
<Separator />
@@ -362,49 +364,52 @@ export function LLMProviderUpdateForm({
/>
))}
<>
<Separator />
<AdvancedOptionsToggle
showAdvancedOptions={showAdvancedOptions}
setShowAdvancedOptions={setShowAdvancedOptions}
/>
{showAdvancedOptions && (
<>
{llmProviderDescriptor.llm_names.length > 0 && (
<div className="w-full">
<MultiSelectField
selectedInitially={
formikProps.values.display_model_names
}
name="display_model_names"
label="Display Models"
subtext="Select the models to make available to users. Unselected models will not be available."
options={llmProviderDescriptor.llm_names.map(
(name) => ({
value: name,
// don't clean up names here to give admins descriptive names / handle duplicates
// like us.anthropic.claude-3-7-sonnet-20250219-v1:0 and anthropic.claude-3-7-sonnet-20250219-v1:0
label: name,
})
)}
onChange={(selected) =>
formikProps.setFieldValue(
"display_model_names",
selected
)
}
/>
</div>
)}
<IsPublicGroupSelector
formikProps={formikProps}
objectName="LLM Provider"
publicToWhom="Users"
enforceGroupSelection={true}
/>
</>
)}
</>
{hasAdvancedOptions && (
<>
<Separator />
<AdvancedOptionsToggle
showAdvancedOptions={showAdvancedOptions}
setShowAdvancedOptions={setShowAdvancedOptions}
/>
{showAdvancedOptions && (
<>
{llmProviderDescriptor.llm_names.length > 0 && (
<div className="w-full">
<MultiSelectField
selectedInitially={
formikProps.values.display_model_names
}
name="display_model_names"
label="Display Models"
subtext="Select the models to make available to users. Unselected models will not be available."
options={llmProviderDescriptor.llm_names.map(
(name) => ({
value: name,
// don't clean up names here to give admins descriptive names / handle duplicates
// like us.anthropic.claude-3-7-sonnet-20250219-v1:0 and anthropic.claude-3-7-sonnet-20250219-v1:0
label: name,
})
)}
onChange={(selected) =>
formikProps.setFieldValue(
"display_model_names",
selected
)
}
/>
</div>
)}
<IsPublicGroupSelector
formikProps={formikProps}
objectName="LLM Provider"
publicToWhom="all users"
enforceGroupSelection={true}
/>
</>
)}
</>
)}
</>
)}

View File

@@ -146,10 +146,7 @@ export function ShareChatSessionModal({
setShareLink("");
onShare && onShare(false);
} else {
setPopup({
message: "Failed to delete share link",
type: "error",
});
alert("Failed to delete share link");
}
}}
size="sm"
@@ -174,10 +171,7 @@ export function ShareChatSessionModal({
const shareLink =
await generateShareLink(chatSessionId);
if (!shareLink) {
setPopup({
message: "Failed to generate share link",
type: "error",
});
alert("Failed to generate share link");
} else {
setShareLink(shareLink);
onShare && onShare(true);
@@ -237,10 +231,7 @@ export function ShareChatSessionModal({
}
} catch (e) {
console.error(e);
setPopup({
message: "Failed to generate or copy link.",
type: "error",
});
alert("Failed to generate or copy link.");
}
}}
size="sm"

View File

@@ -346,6 +346,8 @@ export const DocumentList: React.FC<DocumentListProps> = ({
// Get the hostname (domain) from the URL
const url = new URL(uploadingFile.name);
const hostname = url.hostname;
// Look for recently added files that might match this URL
const isUploaded = files.some(

View File

@@ -74,6 +74,7 @@ export const IsPublicGroupSelector = <T extends IsPublicGroupSelectorFormType>({
return (
<div>
<Separator />
{isAdmin && (
<>
<BooleanFormField

View File

@@ -24,7 +24,6 @@ interface ModalProps {
removeBottomPadding?: boolean;
removePadding?: boolean;
increasedPadding?: boolean;
hideOverflow?: boolean;
}
export function Modal({
@@ -44,7 +43,6 @@ export function Modal({
removeBottomPadding,
removePadding,
increasedPadding,
hideOverflow,
}: ModalProps) {
const modalRef = useRef<HTMLDivElement>(null);
const [isMounted, setIsMounted] = useState(false);
@@ -94,7 +92,7 @@ export function Modal({
flex
flex-col
${heightOverride ? `h-${heightOverride}` : "max-h-[90vh]"}
${hideOverflow ? "overflow-hidden" : "overflow-auto"}
overflow-auto
`}
>
{onOutsideClick && !hideCloseButton && (

View File

@@ -174,6 +174,7 @@ export const CustomTooltip = ({
: {}
}
>
{content}
</div>
</div>,