mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-02-25 19:55:47 +00:00
Compare commits
1 Commits
update
...
double_ini
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b3953b2c2f |
@@ -488,16 +488,6 @@ class OnyxConfluence:
|
||||
old_url_suffix = url_suffix
|
||||
url_suffix = cast(str, next_response.get("_links", {}).get("next", ""))
|
||||
|
||||
# we've observed that Confluence sometimes returns a next link despite giving
|
||||
# 0 results. This is a bug with Confluence, so we need to check for it and
|
||||
# stop paginating.
|
||||
if url_suffix and not results:
|
||||
logger.info(
|
||||
f"No results found for call '{old_url_suffix}' despite next link "
|
||||
"being present. Stopping pagination."
|
||||
)
|
||||
break
|
||||
|
||||
# make sure we don't update the start by more than the amount
|
||||
# of results we were able to retrieve. The Confluence API has a
|
||||
# weird behavior where if you pass in a limit that is too large for
|
||||
|
||||
@@ -243,6 +243,73 @@ schema DANSWER_CHUNK_NAME {
|
||||
}
|
||||
}
|
||||
|
||||
rank-profile hybrid_search_kw_first_phaseVARIABLE_DIM inherits default, default_rank {
|
||||
inputs {
|
||||
query(query_embedding) tensor<float>(x[VARIABLE_DIM])
|
||||
}
|
||||
|
||||
function title_vector_score() {
|
||||
expression {
|
||||
# If no good matching titles, then it should use the context embeddings rather than having some
|
||||
# irrelevant title have a vector score of 1. This way at least it will be the doc with the highest
|
||||
# matching content score getting the full score
|
||||
max(closeness(field, embeddings), closeness(field, title_embedding))
|
||||
}
|
||||
}
|
||||
|
||||
# First phase must be vector to allow hits that have no keyword matches
|
||||
first-phase {
|
||||
expression: 0.2 * bm25(title) + 0.8 * bm25(content)
|
||||
}
|
||||
|
||||
# Weighted average between Vector Search and BM-25
|
||||
global-phase {
|
||||
expression {
|
||||
(
|
||||
# Weighted Vector Similarity Score
|
||||
(
|
||||
query(alpha) * (
|
||||
(query(title_content_ratio) * normalize_linear(title_vector_score))
|
||||
+
|
||||
((1 - query(title_content_ratio)) * normalize_linear(closeness(field, embeddings)))
|
||||
)
|
||||
)
|
||||
|
||||
+
|
||||
|
||||
# Weighted Keyword Similarity Score
|
||||
# Note: for the BM25 Title score, it requires decent stopword removal in the query
|
||||
# This needs to be the case so there aren't irrelevant titles being normalized to a score of 1
|
||||
(
|
||||
(1 - query(alpha)) * (
|
||||
(query(title_content_ratio) * normalize_linear(bm25(title)))
|
||||
+
|
||||
((1 - query(title_content_ratio)) * normalize_linear(bm25(content)))
|
||||
)
|
||||
)
|
||||
)
|
||||
# Boost based on user feedback
|
||||
* document_boost
|
||||
# Decay factor based on time document was last updated
|
||||
* recency_bias
|
||||
# Boost based on aggregated boost calculation
|
||||
* aggregated_chunk_boost
|
||||
}
|
||||
rerank-count: 1000
|
||||
}
|
||||
|
||||
match-features {
|
||||
bm25(title)
|
||||
bm25(content)
|
||||
closeness(field, title_embedding)
|
||||
closeness(field, embeddings)
|
||||
document_boost
|
||||
recency_bias
|
||||
aggregated_chunk_boost
|
||||
closest(embeddings)
|
||||
}
|
||||
}
|
||||
|
||||
# Used when searching from the admin UI for a specific doc to hide / boost
|
||||
# Very heavily prioritize title
|
||||
rank-profile admin_search inherits default, default_rank {
|
||||
|
||||
@@ -297,76 +297,124 @@ def query_vespa(
|
||||
if "query" in query_params and not cast(str, query_params["query"]).strip():
|
||||
raise ValueError("No/empty query received")
|
||||
|
||||
params = dict(
|
||||
**query_params,
|
||||
**(
|
||||
{
|
||||
"presentation.timing": True,
|
||||
}
|
||||
if LOG_VESPA_TIMING_INFORMATION
|
||||
else {}
|
||||
),
|
||||
)
|
||||
configured_ranking_profile = query_params.get("ranking.profile")
|
||||
if not configured_ranking_profile:
|
||||
raise ValueError("No ranking profile configured")
|
||||
|
||||
try:
|
||||
with get_vespa_http_client() as http_client:
|
||||
response = http_client.post(SEARCH_ENDPOINT, json=params)
|
||||
response.raise_for_status()
|
||||
except httpx.HTTPError as e:
|
||||
error_base = "Failed to query Vespa"
|
||||
logger.error(
|
||||
f"{error_base}:\n"
|
||||
f"Request URL: {e.request.url}\n"
|
||||
f"Request Headers: {e.request.headers}\n"
|
||||
f"Request Payload: {params}\n"
|
||||
f"Exception: {str(e)}"
|
||||
+ (
|
||||
f"\nResponse: {e.response.text}"
|
||||
if isinstance(e, httpx.HTTPStatusError)
|
||||
else ""
|
||||
)
|
||||
query_profiles: list[float | int | str] = []
|
||||
|
||||
if (
|
||||
configured_ranking_profile
|
||||
and isinstance(configured_ranking_profile, str)
|
||||
and configured_ranking_profile.startswith("hybrid_search")
|
||||
):
|
||||
dimension = configured_ranking_profile.split("hybrid_search")[1]
|
||||
query_profiles = [
|
||||
f"hybrid_search_kw_first_phase{dimension}",
|
||||
f"hybrid_search{dimension}",
|
||||
]
|
||||
else:
|
||||
query_profiles = [configured_ranking_profile]
|
||||
|
||||
inference_chunk_sets = []
|
||||
mutable_params = dict(query_params)
|
||||
|
||||
for query_profile in query_profiles:
|
||||
mutable_params["ranking.profile"] = query_profile
|
||||
|
||||
params = dict(
|
||||
**mutable_params,
|
||||
**(
|
||||
{
|
||||
"presentation.timing": True,
|
||||
}
|
||||
if LOG_VESPA_TIMING_INFORMATION
|
||||
else {}
|
||||
),
|
||||
)
|
||||
raise httpx.HTTPError(error_base) from e
|
||||
|
||||
response_json: dict[str, Any] = response.json()
|
||||
|
||||
if LOG_VESPA_TIMING_INFORMATION:
|
||||
logger.debug("Vespa timing info: %s", response_json.get("timing"))
|
||||
hits = response_json["root"].get("children", [])
|
||||
|
||||
if not hits:
|
||||
logger.warning(
|
||||
f"No hits found for YQL Query: {query_params.get('yql', 'No YQL Query')}"
|
||||
)
|
||||
logger.debug(f"Vespa Response: {response.text}")
|
||||
|
||||
for hit in hits:
|
||||
if hit["fields"].get(CONTENT) is None:
|
||||
identifier = hit["fields"].get("documentid") or hit["id"]
|
||||
try:
|
||||
with get_vespa_http_client() as http_client:
|
||||
response = http_client.post(SEARCH_ENDPOINT, json=params)
|
||||
response.raise_for_status()
|
||||
except httpx.HTTPError as e:
|
||||
error_base = "Failed to query Vespa"
|
||||
logger.error(
|
||||
f"Vespa Index with Vespa ID {identifier} has no contents. "
|
||||
f"This is invalid because the vector is not meaningful and keywordsearch cannot "
|
||||
f"fetch this document"
|
||||
f"{error_base}:\n"
|
||||
f"Request URL: {e.request.url}\n"
|
||||
f"Request Headers: {e.request.headers}\n"
|
||||
f"Request Payload: {params}\n"
|
||||
f"Exception: {str(e)}"
|
||||
+ (
|
||||
f"\nResponse: {e.response.text}"
|
||||
if isinstance(e, httpx.HTTPStatusError)
|
||||
else ""
|
||||
)
|
||||
)
|
||||
raise httpx.HTTPError(error_base) from e
|
||||
|
||||
filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]
|
||||
response_json: dict[str, Any] = response.json()
|
||||
|
||||
inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
|
||||
if LOG_VESPA_TIMING_INFORMATION:
|
||||
logger.debug("Vespa timing info: %s", response_json.get("timing"))
|
||||
hits = response_json["root"].get("children", [])
|
||||
|
||||
try:
|
||||
num_retrieved_inference_chunks = len(inference_chunks)
|
||||
num_retrieved_document_ids = len(
|
||||
set([chunk.document_id for chunk in inference_chunks])
|
||||
)
|
||||
logger.debug(
|
||||
f"Retrieved {num_retrieved_inference_chunks} inference chunks for {num_retrieved_document_ids} documents"
|
||||
)
|
||||
except Exception as e:
|
||||
# Debug logging only, should not fail the retrieval
|
||||
logger.error(f"Error logging retrieval statistics: {e}")
|
||||
if not hits:
|
||||
logger.warning(
|
||||
f"No hits found for YQL Query: {query_params.get('yql', 'No YQL Query')}"
|
||||
)
|
||||
logger.debug(f"Vespa Response: {response.text}")
|
||||
|
||||
for hit in hits:
|
||||
if hit["fields"].get(CONTENT) is None:
|
||||
identifier = hit["fields"].get("documentid") or hit["id"]
|
||||
logger.error(
|
||||
f"Vespa Index with Vespa ID {identifier} has no contents. "
|
||||
f"This is invalid because the vector is not meaningful and keywordsearch cannot "
|
||||
f"fetch this document"
|
||||
)
|
||||
|
||||
filtered_hits = [hit for hit in hits if hit["fields"].get(CONTENT) is not None]
|
||||
|
||||
inference_chunks = [_vespa_hit_to_inference_chunk(hit) for hit in filtered_hits]
|
||||
|
||||
try:
|
||||
num_retrieved_inference_chunks = len(inference_chunks)
|
||||
num_retrieved_document_ids = len(
|
||||
set([chunk.document_id for chunk in inference_chunks])
|
||||
)
|
||||
logger.debug(
|
||||
f"Retrieved {num_retrieved_inference_chunks} inference chunks for {num_retrieved_document_ids} documents"
|
||||
)
|
||||
except Exception as e:
|
||||
# Debug logging only, should not fail the retrieval
|
||||
logger.error(f"Error logging retrieval statistics: {e}")
|
||||
|
||||
inference_chunk_sets.append(inference_chunks)
|
||||
|
||||
flattened_inference_chunks = []
|
||||
for inference_chunk_set in inference_chunk_sets:
|
||||
flattened_inference_chunks.extend(inference_chunk_set)
|
||||
|
||||
flattened_inference_chunks.sort(key=lambda chunk: chunk.score, reverse=True)
|
||||
|
||||
final_chunks = []
|
||||
used_document_chunk_ids = set()
|
||||
|
||||
for chunk in flattened_inference_chunks:
|
||||
if (
|
||||
chunk.document_id + "__" + str(chunk.chunk_id)
|
||||
not in used_document_chunk_ids
|
||||
):
|
||||
final_chunks.append(chunk)
|
||||
used_document_chunk_ids.add(chunk.document_id + "__" + str(chunk.chunk_id))
|
||||
else:
|
||||
continue
|
||||
|
||||
return final_chunks
|
||||
|
||||
# Good Debugging Spot
|
||||
return inference_chunks
|
||||
return flattened_inference_chunks
|
||||
|
||||
|
||||
def _get_chunks_via_batch_search(
|
||||
|
||||
@@ -29,11 +29,12 @@ function LLMProviderUpdateModal({
|
||||
llmProviderDescriptor?.name ||
|
||||
"Custom LLM Provider";
|
||||
|
||||
const hasAdvancedOptions = llmProviderDescriptor?.name != "azure";
|
||||
|
||||
return (
|
||||
<Modal
|
||||
title={`${llmProviderDescriptor ? "Configure" : "Setup"} ${providerName}`}
|
||||
onOutsideClick={() => onClose()}
|
||||
hideOverflow={true}
|
||||
>
|
||||
<div className="max-h-[70vh] overflow-y-auto px-4">
|
||||
{llmProviderDescriptor ? (
|
||||
@@ -43,6 +44,7 @@ function LLMProviderUpdateModal({
|
||||
existingLlmProvider={existingLlmProvider}
|
||||
shouldMarkAsDefault={shouldMarkAsDefault}
|
||||
setPopup={setPopup}
|
||||
hasAdvancedOptions={hasAdvancedOptions}
|
||||
/>
|
||||
) : (
|
||||
<CustomLLMProviderUpdateForm
|
||||
|
||||
@@ -35,12 +35,10 @@ function LLMProviderUpdateModal({
|
||||
existingLlmProvider?.name ||
|
||||
"Custom LLM Provider";
|
||||
|
||||
const hasAdvancedOptions = llmProviderDescriptor?.name != "azure";
|
||||
|
||||
return (
|
||||
<Modal
|
||||
title={`Setup ${providerName}`}
|
||||
onOutsideClick={() => onClose()}
|
||||
hideOverflow={true}
|
||||
>
|
||||
<Modal title={`Setup ${providerName}`} onOutsideClick={() => onClose()}>
|
||||
<div className="max-h-[70vh] overflow-y-auto px-4">
|
||||
{llmProviderDescriptor ? (
|
||||
<LLMProviderUpdateForm
|
||||
@@ -49,6 +47,7 @@ function LLMProviderUpdateModal({
|
||||
existingLlmProvider={existingLlmProvider}
|
||||
shouldMarkAsDefault={shouldMarkAsDefault}
|
||||
setPopup={setPopup}
|
||||
hasAdvancedOptions={hasAdvancedOptions}
|
||||
/>
|
||||
) : (
|
||||
<CustomLLMProviderUpdateForm
|
||||
|
||||
@@ -29,6 +29,7 @@ export function LLMProviderUpdateForm({
|
||||
setPopup,
|
||||
hideSuccess,
|
||||
firstTimeConfiguration = false,
|
||||
hasAdvancedOptions = false,
|
||||
}: {
|
||||
llmProviderDescriptor: WellKnownLLMProviderDescriptor;
|
||||
onClose: () => void;
|
||||
@@ -39,6 +40,7 @@ export function LLMProviderUpdateForm({
|
||||
|
||||
// Set this when this is the first time the user is setting Onyx up.
|
||||
firstTimeConfiguration?: boolean;
|
||||
hasAdvancedOptions?: boolean;
|
||||
}) {
|
||||
const { mutate } = useSWRConfig();
|
||||
|
||||
@@ -300,7 +302,7 @@ export function LLMProviderUpdateForm({
|
||||
}
|
||||
})}
|
||||
|
||||
{!firstTimeConfiguration && (
|
||||
{hasAdvancedOptions && !firstTimeConfiguration && (
|
||||
<>
|
||||
<Separator />
|
||||
|
||||
@@ -362,49 +364,52 @@ export function LLMProviderUpdateForm({
|
||||
/>
|
||||
))}
|
||||
|
||||
<>
|
||||
<Separator />
|
||||
<AdvancedOptionsToggle
|
||||
showAdvancedOptions={showAdvancedOptions}
|
||||
setShowAdvancedOptions={setShowAdvancedOptions}
|
||||
/>
|
||||
{showAdvancedOptions && (
|
||||
<>
|
||||
{llmProviderDescriptor.llm_names.length > 0 && (
|
||||
<div className="w-full">
|
||||
<MultiSelectField
|
||||
selectedInitially={
|
||||
formikProps.values.display_model_names
|
||||
}
|
||||
name="display_model_names"
|
||||
label="Display Models"
|
||||
subtext="Select the models to make available to users. Unselected models will not be available."
|
||||
options={llmProviderDescriptor.llm_names.map(
|
||||
(name) => ({
|
||||
value: name,
|
||||
// don't clean up names here to give admins descriptive names / handle duplicates
|
||||
// like us.anthropic.claude-3-7-sonnet-20250219-v1:0 and anthropic.claude-3-7-sonnet-20250219-v1:0
|
||||
label: name,
|
||||
})
|
||||
)}
|
||||
onChange={(selected) =>
|
||||
formikProps.setFieldValue(
|
||||
"display_model_names",
|
||||
selected
|
||||
)
|
||||
}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
<IsPublicGroupSelector
|
||||
formikProps={formikProps}
|
||||
objectName="LLM Provider"
|
||||
publicToWhom="Users"
|
||||
enforceGroupSelection={true}
|
||||
/>
|
||||
</>
|
||||
)}
|
||||
</>
|
||||
{hasAdvancedOptions && (
|
||||
<>
|
||||
<Separator />
|
||||
<AdvancedOptionsToggle
|
||||
showAdvancedOptions={showAdvancedOptions}
|
||||
setShowAdvancedOptions={setShowAdvancedOptions}
|
||||
/>
|
||||
{showAdvancedOptions && (
|
||||
<>
|
||||
{llmProviderDescriptor.llm_names.length > 0 && (
|
||||
<div className="w-full">
|
||||
<MultiSelectField
|
||||
selectedInitially={
|
||||
formikProps.values.display_model_names
|
||||
}
|
||||
name="display_model_names"
|
||||
label="Display Models"
|
||||
subtext="Select the models to make available to users. Unselected models will not be available."
|
||||
options={llmProviderDescriptor.llm_names.map(
|
||||
(name) => ({
|
||||
value: name,
|
||||
// don't clean up names here to give admins descriptive names / handle duplicates
|
||||
// like us.anthropic.claude-3-7-sonnet-20250219-v1:0 and anthropic.claude-3-7-sonnet-20250219-v1:0
|
||||
label: name,
|
||||
})
|
||||
)}
|
||||
onChange={(selected) =>
|
||||
formikProps.setFieldValue(
|
||||
"display_model_names",
|
||||
selected
|
||||
)
|
||||
}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
|
||||
<IsPublicGroupSelector
|
||||
formikProps={formikProps}
|
||||
objectName="LLM Provider"
|
||||
publicToWhom="all users"
|
||||
enforceGroupSelection={true}
|
||||
/>
|
||||
</>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
</>
|
||||
)}
|
||||
|
||||
|
||||
@@ -146,10 +146,7 @@ export function ShareChatSessionModal({
|
||||
setShareLink("");
|
||||
onShare && onShare(false);
|
||||
} else {
|
||||
setPopup({
|
||||
message: "Failed to delete share link",
|
||||
type: "error",
|
||||
});
|
||||
alert("Failed to delete share link");
|
||||
}
|
||||
}}
|
||||
size="sm"
|
||||
@@ -174,10 +171,7 @@ export function ShareChatSessionModal({
|
||||
const shareLink =
|
||||
await generateShareLink(chatSessionId);
|
||||
if (!shareLink) {
|
||||
setPopup({
|
||||
message: "Failed to generate share link",
|
||||
type: "error",
|
||||
});
|
||||
alert("Failed to generate share link");
|
||||
} else {
|
||||
setShareLink(shareLink);
|
||||
onShare && onShare(true);
|
||||
@@ -237,10 +231,7 @@ export function ShareChatSessionModal({
|
||||
}
|
||||
} catch (e) {
|
||||
console.error(e);
|
||||
setPopup({
|
||||
message: "Failed to generate or copy link.",
|
||||
type: "error",
|
||||
});
|
||||
alert("Failed to generate or copy link.");
|
||||
}
|
||||
}}
|
||||
size="sm"
|
||||
|
||||
@@ -346,6 +346,8 @@ export const DocumentList: React.FC<DocumentListProps> = ({
|
||||
// Get the hostname (domain) from the URL
|
||||
const url = new URL(uploadingFile.name);
|
||||
const hostname = url.hostname;
|
||||
alert("checking for " + hostname);
|
||||
alert(JSON.stringify(files));
|
||||
|
||||
// Look for recently added files that might match this URL
|
||||
const isUploaded = files.some(
|
||||
|
||||
@@ -74,6 +74,7 @@ export const IsPublicGroupSelector = <T extends IsPublicGroupSelectorFormType>({
|
||||
|
||||
return (
|
||||
<div>
|
||||
<Separator />
|
||||
{isAdmin && (
|
||||
<>
|
||||
<BooleanFormField
|
||||
|
||||
@@ -24,7 +24,6 @@ interface ModalProps {
|
||||
removeBottomPadding?: boolean;
|
||||
removePadding?: boolean;
|
||||
increasedPadding?: boolean;
|
||||
hideOverflow?: boolean;
|
||||
}
|
||||
|
||||
export function Modal({
|
||||
@@ -44,7 +43,6 @@ export function Modal({
|
||||
removeBottomPadding,
|
||||
removePadding,
|
||||
increasedPadding,
|
||||
hideOverflow,
|
||||
}: ModalProps) {
|
||||
const modalRef = useRef<HTMLDivElement>(null);
|
||||
const [isMounted, setIsMounted] = useState(false);
|
||||
@@ -94,7 +92,7 @@ export function Modal({
|
||||
flex
|
||||
flex-col
|
||||
${heightOverride ? `h-${heightOverride}` : "max-h-[90vh]"}
|
||||
${hideOverflow ? "overflow-hidden" : "overflow-auto"}
|
||||
overflow-auto
|
||||
`}
|
||||
>
|
||||
{onOutsideClick && !hideCloseButton && (
|
||||
|
||||
@@ -174,6 +174,7 @@ export const CustomTooltip = ({
|
||||
: {}
|
||||
}
|
||||
>
|
||||
lll
|
||||
{content}
|
||||
</div>
|
||||
</div>,
|
||||
|
||||
Reference in New Issue
Block a user