Compare commits

..

4 Commits

Author SHA1 Message Date
pablonyx
25d9266da4 update 2025-02-26 08:48:35 -08:00
Weves
23073d91b9 reduce number of chars to index for search 2025-02-25 19:27:50 -08:00
Chris Weaver
f767b1f476 Fix confluence permission syncing at scale (#4129)
* Fix confluence permission syncing at scale

* Remove line

* Better log message

* Adjust log
2025-02-25 19:22:52 -08:00
pablonyx
9ffc8cb2c4 k 2025-02-25 18:15:49 -08:00
5 changed files with 285 additions and 29 deletions

View File

@@ -17,10 +17,11 @@ depends_on = None
def upgrade() -> None:
# Create a basic index on the lowercase message column for direct text matching
# Limit to 1500 characters to stay well under the 2856 byte limit of btree version 4
op.execute(
"""
CREATE INDEX idx_chat_message_message_lower
ON chat_message (LOWER(message))
ON chat_message (LOWER(substring(message, 1, 1500)))
"""
)

View File

@@ -11,6 +11,8 @@ from atlassian import Confluence # type:ignore
from pydantic import BaseModel
from requests import HTTPError
from onyx.connectors.confluence.utils import get_start_param_from_url
from onyx.connectors.confluence.utils import update_param_in_path
from onyx.connectors.exceptions import ConnectorValidationError
from onyx.utils.logger import setup_logger
@@ -161,7 +163,7 @@ class OnyxConfluence(Confluence):
)
def _paginate_url(
self, url_suffix: str, limit: int | None = None
self, url_suffix: str, limit: int | None = None, auto_paginate: bool = False
) -> Iterator[dict[str, Any]]:
"""
This will paginate through the top level query.
@@ -236,9 +238,41 @@ class OnyxConfluence(Confluence):
raise e
# yield the results individually
yield from next_response.get("results", [])
results = cast(list[dict[str, Any]], next_response.get("results", []))
yield from results
url_suffix = next_response.get("_links", {}).get("next")
old_url_suffix = url_suffix
url_suffix = cast(str, next_response.get("_links", {}).get("next", ""))
# make sure we don't update the start by more than the amount
# of results we were able to retrieve. The Confluence API has a
# weird behavior where if you pass in a limit that is too large for
# the configured server, it will artificially limit the amount of
# results returned BUT will not apply this to the start parameter.
# This will cause us to miss results.
if url_suffix and "start" in url_suffix:
new_start = get_start_param_from_url(url_suffix)
previous_start = get_start_param_from_url(old_url_suffix)
if new_start - previous_start > len(results):
logger.warning(
f"Start was updated by more than the amount of results "
f"retrieved. This is a bug with Confluence. Start: {new_start}, "
f"Previous Start: {previous_start}, Len Results: {len(results)}."
)
# Update the url_suffix to use the adjusted start
adjusted_start = previous_start + len(results)
url_suffix = update_param_in_path(
url_suffix, "start", str(adjusted_start)
)
# some APIs don't properly paginate, so we need to manually update the `start` param
if auto_paginate and len(results) > 0:
previous_start = get_start_param_from_url(old_url_suffix)
updated_start = previous_start + len(results)
url_suffix = update_param_in_path(
old_url_suffix, "start", str(updated_start)
)
def paginated_cql_retrieval(
self,
@@ -298,7 +332,9 @@ class OnyxConfluence(Confluence):
url = "rest/api/search/user"
expand_string = f"&expand={expand}" if expand else ""
url += f"?cql={cql}{expand_string}"
for user_result in self._paginate_url(url, limit):
# endpoint doesn't properly paginate, so we need to manually update the `start` param
# thus the auto_paginate flag
for user_result in self._paginate_url(url, limit, auto_paginate=True):
# Example response:
# {
# 'user': {

View File

@@ -2,7 +2,10 @@ import io
from datetime import datetime
from datetime import timezone
from typing import Any
from typing import TYPE_CHECKING
from urllib.parse import parse_qs
from urllib.parse import quote
from urllib.parse import urlparse
import bs4
@@ -10,13 +13,13 @@ from onyx.configs.app_configs import (
CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD,
)
from onyx.configs.app_configs import CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD
from onyx.connectors.confluence.onyx_confluence import (
OnyxConfluence,
)
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.file_processing.html_utils import format_document_soup
from onyx.utils.logger import setup_logger
if TYPE_CHECKING:
from onyx.connectors.confluence.onyx_confluence import OnyxConfluence
logger = setup_logger()
@@ -24,7 +27,7 @@ _USER_EMAIL_CACHE: dict[str, str | None] = {}
def get_user_email_from_username__server(
confluence_client: OnyxConfluence, user_name: str
confluence_client: "OnyxConfluence", user_name: str
) -> str | None:
global _USER_EMAIL_CACHE
if _USER_EMAIL_CACHE.get(user_name) is None:
@@ -47,7 +50,7 @@ _USER_NOT_FOUND = "Unknown Confluence User"
_USER_ID_TO_DISPLAY_NAME_CACHE: dict[str, str | None] = {}
def _get_user(confluence_client: OnyxConfluence, user_id: str) -> str:
def _get_user(confluence_client: "OnyxConfluence", user_id: str) -> str:
"""Get Confluence Display Name based on the account-id or userkey value
Args:
@@ -78,7 +81,7 @@ def _get_user(confluence_client: OnyxConfluence, user_id: str) -> str:
def extract_text_from_confluence_html(
confluence_client: OnyxConfluence,
confluence_client: "OnyxConfluence",
confluence_object: dict[str, Any],
fetched_titles: set[str],
) -> str:
@@ -191,7 +194,7 @@ def validate_attachment_filetype(attachment: dict[str, Any]) -> bool:
def attachment_to_content(
confluence_client: OnyxConfluence,
confluence_client: "OnyxConfluence",
attachment: dict[str, Any],
) -> str | None:
"""If it returns None, assume that we should skip this attachment."""
@@ -279,3 +282,32 @@ def datetime_from_string(datetime_string: str) -> datetime:
datetime_object = datetime_object.astimezone(timezone.utc)
return datetime_object
def get_single_param_from_url(url: str, param: str) -> str | None:
"""Get a parameter from a url"""
parsed_url = urlparse(url)
return parse_qs(parsed_url.query).get(param, [None])[0]
def get_start_param_from_url(url: str) -> int:
    """Return the `start` query parameter of `url` as an int (0 when absent)."""
    raw_start = get_single_param_from_url(url, "start")
    return 0 if raw_start is None else int(raw_start)
def update_param_in_path(path: str, param: str, value: str) -> str:
    """Return `path` with the query parameter `param` set to `value`.

    Path should look something like:
        /api/rest/users?start=0&limit=10

    Any existing value(s) for `param` are replaced by the single `value`; if
    `param` is not present it is appended. All other query parameters are
    kept, including repeated ones (`a=1&a=2`) and blank ones (`flag=`).
    Names and values are percent-encoded again on output because `parse_qs`
    decodes them. Only the text before the first `?` and the query parameters
    are carried over; a fragment, if any, is not preserved.
    """
    base_path = path.split("?")[0]
    # keep_blank_values: without it parse_qs silently discards `name=` entries
    query_params = parse_qs(urlparse(path).query, keep_blank_values=True)
    query_params[param] = [value]
    # emit every value of every parameter, not just the first one
    encoded_pairs = [
        f"{quote(name)}={quote(item)}"
        for name, items in query_params.items()
        for item in items
    ]
    return base_path + "?" + "&".join(encoded_pairs)

View File

@@ -68,6 +68,28 @@ const nextConfig = {
},
];
},
// Proxy the documentation routes to another server. Every destination is
// built from process.env.INTERNAL_URL, falling back to
// "http://localhost:8080" when that variable is unset or empty.
async rewrites() {
return [
{
// /api/docs/<rest> -> <INTERNAL_URL>/docs/<rest>
source: "/api/docs/:path*", // catch /api/docs and /api/docs/...
destination: `${
process.env.INTERNAL_URL || "http://localhost:8080"
}/docs/:path*`,
},
{
// /api/docs -> <INTERNAL_URL>/docs
// NOTE(review): `:path*` above presumably already matches the bare
// /api/docs path, which would make this rule redundant - confirm.
source: "/api/docs", // if you also need the exact /api/docs
destination: `${
process.env.INTERNAL_URL || "http://localhost:8080"
}/docs`,
},
{
// /openapi.json -> <INTERNAL_URL>/openapi.json
// NOTE(review): presumably required because the docs page loads the
// schema from the site root - confirm.
source: "/openapi.json",
destination: `${
process.env.INTERNAL_URL || "http://localhost:8080"
}/openapi.json`,
},
];
},
};
// Sentry configuration for error monitoring:

View File

@@ -56,7 +56,6 @@ import {
Dispatch,
SetStateAction,
use,
useCallback,
useContext,
useEffect,
useLayoutEffect,
@@ -894,6 +893,24 @@ export function ChatPage({
);
const scrollDist = useRef<number>(0);
// Measures the vertical distance between the top of endDivRef's element and
// the top of inputRef's element, stores it in scrollDist, and sets
// aboveHorizon to true when that distance is greater than 500.
const updateScrollTracking = () => {
// The `!` assertions only silence the type checker: if either ref is unset,
// `top` is undefined, the subtraction yields NaN and the comparison below
// evaluates to false.
const scrollDistance =
endDivRef?.current?.getBoundingClientRect()?.top! -
inputRef?.current?.getBoundingClientRect()?.top!;
scrollDist.current = scrollDistance;
setAboveHorizon(scrollDist.current > 500);
};
useEffect(() => {
const scrollableDiv = scrollableDivRef.current;
if (scrollableDiv) {
scrollableDiv.addEventListener("scroll", updateScrollTracking);
return () => {
scrollableDiv.removeEventListener("scroll", updateScrollTracking);
};
}
}, []);
const handleInputResize = () => {
setTimeout(() => {
if (
@@ -945,12 +962,33 @@ export function ChatPage({
if (isVisible) return;
// Check if all messages are currently rendered
// If all messages are already rendered, scroll immediately
endDivRef.current.scrollIntoView({
behavior: fast ? "auto" : "smooth",
});
if (currentVisibleRange.end < messageHistory.length) {
// Update visible range to include the last messages
updateCurrentVisibleRange({
start: Math.max(
0,
messageHistory.length -
(currentVisibleRange.end - currentVisibleRange.start)
),
end: messageHistory.length,
mostVisibleMessageId: currentVisibleRange.mostVisibleMessageId,
});
setHasPerformedInitialScroll(true);
// Wait for the state update and re-render before scrolling
setTimeout(() => {
endDivRef.current?.scrollIntoView({
behavior: fast ? "auto" : "smooth",
});
setHasPerformedInitialScroll(true);
}, 100);
} else {
// If all messages are already rendered, scroll immediately
endDivRef.current.scrollIntoView({
behavior: fast ? "auto" : "smooth",
});
setHasPerformedInitialScroll(true);
}
}, 50);
// Reset waitForScrollRef after 1.5 seconds
@@ -971,6 +1009,11 @@ export function ChatPage({
handleInputResize();
}, [message]);
// tracks scrolling
useEffect(() => {
updateScrollTracking();
}, [messageHistory]);
// used for resizing of the document sidebar
const masterFlexboxRef = useRef<HTMLDivElement>(null);
const [maxDocumentSidebarWidth, setMaxDocumentSidebarWidth] = useState<
@@ -1934,6 +1977,122 @@ export function ChatPage({
// Virtualization + Scrolling related effects and functions
const scrollInitialized = useRef(false);
interface VisibleRange {
start: number;
end: number;
mostVisibleMessageId: number | null;
}
const [visibleRange, setVisibleRange] = useState<
Map<string | null, VisibleRange>
>(() => {
const initialRange: VisibleRange = {
start: 0,
end: BUFFER_COUNT,
mostVisibleMessageId: null,
};
return new Map([[chatSessionIdRef.current, initialRange]]);
});
// Function used to update current visible range. Only method for updating `visibleRange` state.
// Stores `newRange` in the `visibleRange` map under the key
// loadedIdSessionRef.current.
// The update is skipped when scrolling has already been initialized, the map
// has no entry for that key, and `forceUpdate` is not set.
const updateCurrentVisibleRange = (
newRange: VisibleRange,
forceUpdate?: boolean
) => {
if (
scrollInitialized.current &&
visibleRange.get(loadedIdSessionRef.current) == undefined &&
!forceUpdate
) {
return;
}
// Copy the previous Map so the state setter receives a new object.
setVisibleRange((prevState) => {
const newState = new Map(prevState);
newState.set(loadedIdSessionRef.current, newRange);
return newState;
});
};
// Set first value for visibleRange state on page load / refresh.
// Runs its body only while scrollInitialized.current is false and the message
// chain built from completeMessageDetail is non-empty; it then marks
// scrollInitialized as true.
const initializeVisibleRange = () => {
const upToDatemessageHistory = buildLatestMessageChain(
currentMessageMap(completeMessageDetail)
);
if (!scrollInitialized.current && upToDatemessageHistory.length > 0) {
// The range ends at the last message, or at BUFFER_COUNT when there are
// fewer messages than that, and starts BUFFER_COUNT entries earlier
// (clamped at 0).
const newEnd = Math.max(upToDatemessageHistory.length, BUFFER_COUNT);
const newStart = Math.max(0, newEnd - BUFFER_COUNT);
// When there are fewer than BUFFER_COUNT messages, newEnd - 1 is past the
// end of the array, so the optional chain yields undefined here.
const newMostVisibleMessageId =
upToDatemessageHistory[newEnd - 1]?.messageId;
// forceUpdate = true bypasses the skip condition in
// updateCurrentVisibleRange.
updateCurrentVisibleRange(
{
start: newStart,
end: newEnd,
mostVisibleMessageId: newMostVisibleMessageId,
},
true
);
scrollInitialized.current = true;
}
};
// Recomputes the visible range from the current scroll position: finds the
// highest-index message in messageHistory that passes the visibility test
// below and stores a window of BUFFER_COUNT messages on either side of it.
// Does nothing before scrollInitialized is set, when the scrollable div is
// missing, or when no message passes the test.
const updateVisibleRangeBasedOnScroll = () => {
if (!scrollInitialized.current) return;
const scrollableDiv = scrollableDivRef.current;
if (!scrollableDiv) return;
const viewportHeight = scrollableDiv.clientHeight;
let mostVisibleMessageIndex = -1;
messageHistory.forEach((message, index) => {
// Messages are looked up in the DOM by the id `message-<messageId>`;
// messages with no matching element are ignored.
const messageElement = document.getElementById(
`message-${message.messageId}`
);
if (messageElement) {
const rect = messageElement.getBoundingClientRect();
// NOTE(review): rect.bottom is measured from the browser viewport while
// viewportHeight is the scrollable div's clientHeight; these presumably
// only line up if the div starts at the top of the viewport - confirm.
const isVisible = rect.bottom <= viewportHeight && rect.bottom > 0;
if (isVisible && index > mostVisibleMessageIndex) {
mostVisibleMessageIndex = index;
}
}
});
if (mostVisibleMessageIndex !== -1) {
// Clamp the window to [0, messageHistory.length]; `end` is exclusive.
const startIndex = Math.max(0, mostVisibleMessageIndex - BUFFER_COUNT);
const endIndex = Math.min(
messageHistory.length,
mostVisibleMessageIndex + BUFFER_COUNT + 1
);
updateCurrentVisibleRange({
start: startIndex,
end: endIndex,
mostVisibleMessageId: messageHistory[mostVisibleMessageIndex].messageId,
});
}
};
useEffect(() => {
initializeVisibleRange();
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [router, messageHistory]);
useLayoutEffect(() => {
const scrollableDiv = scrollableDivRef.current;
const handleScroll = () => {
updateVisibleRangeBasedOnScroll();
};
scrollableDiv?.addEventListener("scroll", handleScroll);
return () => {
scrollableDiv?.removeEventListener("scroll", handleScroll);
};
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [messageHistory]);
const imageFileInMessageHistory = useMemo(() => {
return messageHistory
@@ -1943,6 +2102,11 @@ export function ChatPage({
);
}, [messageHistory]);
const currentVisibleRange = visibleRange.get(currentSessionId()) || {
start: 0,
end: 0,
mostVisibleMessageId: null,
};
useSendMessageToParent();
useEffect(() => {
@@ -1982,15 +2146,6 @@ export function ChatPage({
const currentPersona = alternativeAssistant || liveAssistant;
const HORIZON_DISTANCE = 800;
const handleScroll = useCallback(() => {
const scrollDistance =
endDivRef?.current?.getBoundingClientRect()?.top! -
inputRef?.current?.getBoundingClientRect()?.top!;
scrollDist.current = scrollDistance;
setAboveHorizon(scrollDist.current > HORIZON_DISTANCE);
}, []);
useEffect(() => {
const handleSlackChatRedirect = async () => {
if (!slackChatId) return;
@@ -2441,7 +2596,6 @@ export function ChatPage({
{...getRootProps()}
>
<div
onScroll={handleScroll}
className={`w-full h-[calc(100vh-160px)] flex flex-col default-scrollbar overflow-y-auto overflow-x-hidden relative`}
ref={scrollableDivRef}
>
@@ -2499,7 +2653,18 @@ export function ChatPage({
// NOTE: temporarily removing this to fix the scroll bug
// (hasPerformedInitialScroll ? "" : "invisible")
>
{messageHistory.map((message, i) => {
{(messageHistory.length < BUFFER_COUNT
? messageHistory
: messageHistory.slice(
currentVisibleRange.start,
currentVisibleRange.end
)
).map((message, fauxIndex) => {
const i =
messageHistory.length < BUFFER_COUNT
? fauxIndex
: fauxIndex + currentVisibleRange.start;
const messageMap = currentMessageMap(
completeMessageDetail
);