mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-03-28 19:12:43 +00:00
Compare commits
8 Commits
v3.1.0
...
nikg/fix-s
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
deb6c65846 | ||
|
|
1683e8f667 | ||
|
|
6ac4cacfad | ||
|
|
71415c1a49 | ||
|
|
3fb8dc1db0 | ||
|
|
fda1528174 | ||
|
|
5f34c83f0a | ||
|
|
3509e9c48c |
@@ -1,3 +1,6 @@
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import cast
|
||||
|
||||
@@ -46,6 +49,8 @@ from onyx.onyxbot.slack.utils import remove_slack_text_interactions
|
||||
from onyx.onyxbot.slack.utils import translate_vespa_highlight_to_slack
|
||||
from onyx.utils.text_processing import decode_escapes
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_MAX_BLURB_LEN = 45
|
||||
|
||||
|
||||
@@ -76,6 +81,99 @@ def get_feedback_reminder_blocks(thread_link: str, include_followup: bool) -> Bl
|
||||
return SectionBlock(text=text)
|
||||
|
||||
|
||||
@dataclass
|
||||
class CodeSnippet:
|
||||
"""A code block extracted from the answer to be uploaded as a Slack file."""
|
||||
|
||||
code: str
|
||||
language: str
|
||||
filename: str
|
||||
|
||||
|
||||
_SECTION_BLOCK_LIMIT = 3000
|
||||
|
||||
# Matches fenced code blocks: ```lang\n...\n```
|
||||
# The opening fence must start at the beginning of a line (^), may have an
|
||||
# optional language specifier (\w*), followed by a newline. The closing fence
|
||||
# is ``` at the beginning of a line. We also handle an optional trailing
|
||||
# newline on the closing fence line to be robust against different formatting.
|
||||
_CODE_FENCE_RE = re.compile(
|
||||
r"^```(\w*)\n(.*?)^```\s*$",
|
||||
re.MULTILINE | re.DOTALL,
|
||||
)
|
||||
|
||||
|
||||
def _extract_code_snippets(
|
||||
text: str, limit: int = _SECTION_BLOCK_LIMIT
|
||||
) -> tuple[str, list[CodeSnippet]]:
|
||||
"""Extract code blocks that would push the text over *limit*.
|
||||
|
||||
Returns (cleaned_text, snippets) where *cleaned_text* has large code
|
||||
blocks replaced with a placeholder and *snippets* contains the extracted
|
||||
code to be uploaded as Slack file snippets.
|
||||
|
||||
Uses a two-pass approach: first collect all matches, then decide which
|
||||
to extract based on cumulative removal so each decision accounts for
|
||||
previously extracted blocks.
|
||||
|
||||
Pass *limit=0* to force-extract ALL code blocks unconditionally.
|
||||
"""
|
||||
if limit > 0 and len(text) <= limit:
|
||||
return text, []
|
||||
|
||||
# Pass 1: collect all code-fence matches
|
||||
matches = list(_CODE_FENCE_RE.finditer(text))
|
||||
if not matches:
|
||||
return text, []
|
||||
|
||||
# Pass 2: decide which blocks to extract, accounting for cumulative removal.
|
||||
# Only extract if the text is still over the limit OR the block is very large.
|
||||
# With limit=0, extract everything unconditionally.
|
||||
extract_indices: set[int] = set()
|
||||
removed_chars = 0
|
||||
for i, match in enumerate(matches):
|
||||
full_block = match.group(0)
|
||||
if limit == 0:
|
||||
extract_indices.add(i)
|
||||
removed_chars += len(full_block)
|
||||
else:
|
||||
current_len = len(text) - removed_chars
|
||||
if current_len > limit and current_len - len(full_block) <= limit:
|
||||
extract_indices.add(i)
|
||||
removed_chars += len(full_block)
|
||||
elif len(full_block) > limit // 2:
|
||||
extract_indices.add(i)
|
||||
removed_chars += len(full_block)
|
||||
|
||||
if not extract_indices:
|
||||
return text, []
|
||||
|
||||
# Build cleaned text and snippets by processing matches in reverse
|
||||
# so character offsets remain valid.
|
||||
snippets: list[CodeSnippet] = []
|
||||
cleaned = text
|
||||
for i in sorted(extract_indices, reverse=True):
|
||||
match = matches[i]
|
||||
lang = match.group(1) or ""
|
||||
code = match.group(2)
|
||||
ext = lang if lang else "txt"
|
||||
snippets.append(
|
||||
CodeSnippet(
|
||||
code=code.strip(),
|
||||
language=lang or "text",
|
||||
filename=f"code_{len(extract_indices) - len(snippets)}.{ext}",
|
||||
)
|
||||
)
|
||||
cleaned = cleaned[: match.start()] + cleaned[match.end() :]
|
||||
|
||||
# Snippets were appended in reverse order — flip to match document order
|
||||
snippets.reverse()
|
||||
|
||||
# Clean up any triple+ blank lines left by extraction
|
||||
cleaned = re.sub(r"\n{3,}", "\n\n", cleaned).strip()
|
||||
return cleaned, snippets
|
||||
|
||||
|
||||
def _split_text(text: str, limit: int = 3000) -> list[str]:
|
||||
if len(text) <= limit:
|
||||
return [text]
|
||||
@@ -417,7 +515,7 @@ def _build_citations_blocks(
|
||||
|
||||
def _build_main_response_blocks(
|
||||
answer: ChatBasicResponse,
|
||||
) -> list[Block]:
|
||||
) -> tuple[list[Block], list[CodeSnippet]]:
|
||||
# TODO: add back in later when auto-filtering is implemented
|
||||
# if (
|
||||
# retrieval_info.applied_time_cutoff
|
||||
@@ -448,9 +546,45 @@ def _build_main_response_blocks(
|
||||
# replaces markdown links with slack format links
|
||||
formatted_answer = format_slack_message(answer.answer)
|
||||
answer_processed = decode_escapes(remove_slack_text_interactions(formatted_answer))
|
||||
answer_blocks = [SectionBlock(text=text) for text in _split_text(answer_processed)]
|
||||
|
||||
return cast(list[Block], answer_blocks)
|
||||
# Extract large code blocks as snippets to upload separately,
|
||||
# avoiding broken code fences when splitting across SectionBlocks.
|
||||
cleaned_text, code_snippets = _extract_code_snippets(answer_processed)
|
||||
logger.info(
|
||||
"Code extraction: input=%d chars, cleaned=%d chars, snippets=%d",
|
||||
len(answer_processed),
|
||||
len(cleaned_text),
|
||||
len(code_snippets),
|
||||
)
|
||||
|
||||
if len(cleaned_text) <= _SECTION_BLOCK_LIMIT:
|
||||
answer_blocks = [SectionBlock(text=cleaned_text)]
|
||||
elif "```" not in cleaned_text:
|
||||
# No code fences — safe to split at word boundaries.
|
||||
answer_blocks = [
|
||||
SectionBlock(text=text)
|
||||
for text in _split_text(cleaned_text, limit=_SECTION_BLOCK_LIMIT)
|
||||
]
|
||||
else:
|
||||
# Text still has code fences after extraction and exceeds the
|
||||
# SectionBlock limit. Splitting would break the fences, so fall
|
||||
# back to uploading the entire remaining code as another snippet
|
||||
# and keeping only the prose in the blocks.
|
||||
logger.warning(
|
||||
"Cleaned text still has code fences (%d chars); "
|
||||
"force-extracting remaining code blocks",
|
||||
len(cleaned_text),
|
||||
)
|
||||
remaining_cleaned, remaining_snippets = _extract_code_snippets(
|
||||
cleaned_text, limit=0
|
||||
)
|
||||
code_snippets.extend(remaining_snippets)
|
||||
answer_blocks = [
|
||||
SectionBlock(text=text)
|
||||
for text in _split_text(remaining_cleaned, limit=_SECTION_BLOCK_LIMIT)
|
||||
]
|
||||
|
||||
return cast(list[Block], answer_blocks), code_snippets
|
||||
|
||||
|
||||
def _build_continue_in_web_ui_block(
|
||||
@@ -531,10 +665,13 @@ def build_slack_response_blocks(
|
||||
skip_ai_feedback: bool = False,
|
||||
offer_ephemeral_publication: bool = False,
|
||||
skip_restated_question: bool = False,
|
||||
) -> list[Block]:
|
||||
) -> tuple[list[Block], list[CodeSnippet]]:
|
||||
"""
|
||||
This function is a top level function that builds all the blocks for the Slack response.
|
||||
It also handles combining all the blocks together.
|
||||
|
||||
Returns (blocks, code_snippets) where code_snippets should be uploaded
|
||||
as Slack file snippets in the same thread.
|
||||
"""
|
||||
# If called with the OnyxBot slash command, the question is lost so we have to reshow it
|
||||
if not skip_restated_question:
|
||||
@@ -544,7 +681,7 @@ def build_slack_response_blocks(
|
||||
else:
|
||||
restate_question_block = []
|
||||
|
||||
answer_blocks = _build_main_response_blocks(answer)
|
||||
answer_blocks, code_snippets = _build_main_response_blocks(answer)
|
||||
|
||||
web_follow_up_block = []
|
||||
if channel_conf and channel_conf.get("show_continue_in_web_ui"):
|
||||
@@ -610,4 +747,4 @@ def build_slack_response_blocks(
|
||||
+ follow_up_block
|
||||
)
|
||||
|
||||
return all_blocks
|
||||
return all_blocks, code_snippets
|
||||
|
||||
@@ -282,7 +282,7 @@ def handle_publish_ephemeral_message_button(
|
||||
logger.error(f"Failed to send webhook: {e}")
|
||||
|
||||
# remove handling of empheremal block and add AI feedback.
|
||||
all_blocks = build_slack_response_blocks(
|
||||
all_blocks, _ = build_slack_response_blocks(
|
||||
answer=onyx_bot_answer,
|
||||
message_info=slack_message_info,
|
||||
channel_conf=channel_conf,
|
||||
@@ -311,7 +311,7 @@ def handle_publish_ephemeral_message_button(
|
||||
elif action_id == KEEP_TO_YOURSELF_ACTION_ID:
|
||||
# Keep as ephemeral message in channel or thread, but remove the publish button and add feedback button
|
||||
|
||||
changed_blocks = build_slack_response_blocks(
|
||||
changed_blocks, _ = build_slack_response_blocks(
|
||||
answer=onyx_bot_answer,
|
||||
message_info=slack_message_info,
|
||||
channel_conf=channel_conf,
|
||||
|
||||
@@ -25,6 +25,7 @@ from onyx.db.models import User
|
||||
from onyx.db.persona import get_persona_by_id
|
||||
from onyx.db.users import get_user_by_email
|
||||
from onyx.onyxbot.slack.blocks import build_slack_response_blocks
|
||||
from onyx.onyxbot.slack.blocks import CodeSnippet
|
||||
from onyx.onyxbot.slack.constants import SLACK_CHANNEL_REF_PATTERN
|
||||
from onyx.onyxbot.slack.handlers.utils import send_team_member_message
|
||||
from onyx.onyxbot.slack.models import SlackMessageInfo
|
||||
@@ -134,6 +135,65 @@ def build_slack_context_str(
|
||||
return slack_context_str + "\n\n".join(message_strs)
|
||||
|
||||
|
||||
# Normalize common LLM language aliases to Slack's expected snippet_type values.
|
||||
# Slack silently falls back to plain text for unrecognized types, so this map
|
||||
# only needs to cover the most common mismatches.
|
||||
_SNIPPET_TYPE_MAP: dict[str, str] = {
|
||||
"py": "python",
|
||||
"js": "javascript",
|
||||
"ts": "typescript",
|
||||
"tsx": "typescript",
|
||||
"jsx": "javascript",
|
||||
"sh": "shell",
|
||||
"bash": "shell",
|
||||
"zsh": "shell",
|
||||
"yml": "yaml",
|
||||
"rb": "ruby",
|
||||
"rs": "rust",
|
||||
"cs": "csharp",
|
||||
"md": "markdown",
|
||||
"txt": "text",
|
||||
"text": "plain_text",
|
||||
}
|
||||
|
||||
|
||||
def _upload_code_snippets(
|
||||
client: WebClient,
|
||||
channel: str,
|
||||
thread_ts: str,
|
||||
snippets: list[CodeSnippet],
|
||||
logger: OnyxLoggingAdapter,
|
||||
receiver_ids: list[str] | None = None,
|
||||
send_as_ephemeral: bool | None = None,
|
||||
) -> None:
|
||||
"""Upload extracted code blocks as Slack file snippets in the thread."""
|
||||
for snippet in snippets:
|
||||
try:
|
||||
snippet_type = _SNIPPET_TYPE_MAP.get(snippet.language, snippet.language)
|
||||
client.files_upload_v2(
|
||||
channel=channel,
|
||||
thread_ts=thread_ts,
|
||||
content=snippet.code,
|
||||
filename=snippet.filename,
|
||||
snippet_type=snippet_type,
|
||||
)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
f"Failed to upload code snippet {snippet.filename}, "
|
||||
"falling back to inline code block"
|
||||
)
|
||||
# Fall back to posting as a regular message with code fences,
|
||||
# preserving the same visibility as the primary response.
|
||||
respond_in_thread_or_channel(
|
||||
client=client,
|
||||
channel=channel,
|
||||
receiver_ids=receiver_ids,
|
||||
text=f"```{snippet.language}\n{snippet.code}\n```",
|
||||
thread_ts=thread_ts,
|
||||
send_as_ephemeral=send_as_ephemeral,
|
||||
)
|
||||
|
||||
|
||||
def handle_regular_answer(
|
||||
message_info: SlackMessageInfo,
|
||||
slack_channel_config: SlackChannelConfig,
|
||||
@@ -387,7 +447,7 @@ def handle_regular_answer(
|
||||
offer_ephemeral_publication = False
|
||||
skip_ai_feedback = False
|
||||
|
||||
all_blocks = build_slack_response_blocks(
|
||||
all_blocks, code_snippets = build_slack_response_blocks(
|
||||
message_info=message_info,
|
||||
answer=answer,
|
||||
channel_conf=channel_conf,
|
||||
@@ -414,6 +474,20 @@ def handle_regular_answer(
|
||||
send_as_ephemeral=send_as_ephemeral,
|
||||
)
|
||||
|
||||
# Upload extracted code blocks as Slack file snippets so they
|
||||
# render as collapsible, syntax-highlighted blocks in the thread.
|
||||
snippet_thread_ts = target_thread_ts or message_ts_to_respond_to
|
||||
if code_snippets and snippet_thread_ts:
|
||||
_upload_code_snippets(
|
||||
client=client,
|
||||
channel=channel,
|
||||
thread_ts=snippet_thread_ts,
|
||||
snippets=code_snippets,
|
||||
logger=logger,
|
||||
receiver_ids=target_receiver_ids,
|
||||
send_as_ephemeral=send_as_ephemeral,
|
||||
)
|
||||
|
||||
# For DM (ephemeral message), we need to create a thread via a normal message so the user can see
|
||||
# the ephemeral message. This also will give the user a notification which ephemeral message does not.
|
||||
# if there is no message_ts_to_respond_to, and we have made it this far, then this is a /onyx message
|
||||
|
||||
@@ -7,6 +7,9 @@ import timeago # type: ignore
|
||||
from onyx.configs.constants import DocumentSource
|
||||
from onyx.context.search.models import SavedSearchDoc
|
||||
from onyx.onyxbot.slack.blocks import _build_documents_blocks
|
||||
from onyx.onyxbot.slack.blocks import _extract_code_snippets
|
||||
from onyx.onyxbot.slack.blocks import _split_text
|
||||
from onyx.onyxbot.slack.handlers.handle_regular_answer import _SNIPPET_TYPE_MAP
|
||||
|
||||
|
||||
def _make_saved_doc(updated_at: datetime | None) -> SavedSearchDoc:
|
||||
@@ -69,3 +72,148 @@ def test_build_documents_blocks_formats_naive_timestamp(
|
||||
formatted_timestamp: datetime = captured["doc"]
|
||||
expected_timestamp: datetime = naive_timestamp.replace(tzinfo=pytz.utc)
|
||||
assert formatted_timestamp == expected_timestamp
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _split_text tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSplitText:
|
||||
def test_short_text_returns_single_chunk(self) -> None:
|
||||
result = _split_text("hello world", limit=100)
|
||||
assert result == ["hello world"]
|
||||
|
||||
def test_splits_at_space_boundary(self) -> None:
|
||||
text = "aaa bbb ccc ddd"
|
||||
result = _split_text(text, limit=8)
|
||||
assert len(result) >= 2
|
||||
|
||||
def test_no_code_fences_splits_normally(self) -> None:
|
||||
text = "word " * 100 # 500 chars
|
||||
result = _split_text(text, limit=100)
|
||||
assert len(result) >= 5
|
||||
for chunk in result:
|
||||
assert "```" not in chunk
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _extract_code_snippets tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestExtractCodeSnippets:
|
||||
def test_short_text_no_extraction(self) -> None:
|
||||
text = "short answer with ```python\nprint('hi')\n``` inline"
|
||||
cleaned, snippets = _extract_code_snippets(text, limit=3000)
|
||||
assert cleaned == text
|
||||
assert snippets == []
|
||||
|
||||
def test_large_code_block_extracted(self) -> None:
|
||||
code = "x = 1\n" * 200 # ~1200 chars of code
|
||||
text = f"Here is the solution:\n```python\n{code}```\nHope that helps!"
|
||||
cleaned, snippets = _extract_code_snippets(text, limit=200)
|
||||
|
||||
assert len(snippets) == 1
|
||||
assert snippets[0].language == "python"
|
||||
assert snippets[0].filename == "code_1.python"
|
||||
assert "x = 1" in snippets[0].code
|
||||
# Code block should be removed from cleaned text
|
||||
assert "```" not in cleaned
|
||||
assert "Here is the solution" in cleaned
|
||||
assert "Hope that helps!" in cleaned
|
||||
|
||||
def test_multiple_code_blocks_only_large_ones_extracted(self) -> None:
|
||||
small_code = "print('hi')"
|
||||
large_code = "x = 1\n" * 300
|
||||
text = (
|
||||
f"First:\n```python\n{small_code}\n```\n"
|
||||
f"Second:\n```javascript\n{large_code}\n```\n"
|
||||
"Done!"
|
||||
)
|
||||
cleaned, snippets = _extract_code_snippets(text, limit=500)
|
||||
|
||||
# The large block should be extracted
|
||||
assert len(snippets) >= 1
|
||||
langs = [s.language for s in snippets]
|
||||
assert "javascript" in langs
|
||||
|
||||
def test_language_specifier_captured(self) -> None:
|
||||
code = "fn main() {}\n" * 100
|
||||
text = f"```rust\n{code}```"
|
||||
_, snippets = _extract_code_snippets(text, limit=100)
|
||||
|
||||
assert len(snippets) == 1
|
||||
assert snippets[0].language == "rust"
|
||||
assert snippets[0].filename == "code_1.rust"
|
||||
|
||||
def test_no_language_defaults_to_text(self) -> None:
|
||||
code = "some output\n" * 100
|
||||
text = f"```\n{code}```"
|
||||
_, snippets = _extract_code_snippets(text, limit=100)
|
||||
|
||||
assert len(snippets) == 1
|
||||
assert snippets[0].language == "text"
|
||||
assert snippets[0].filename == "code_1.txt"
|
||||
|
||||
def test_cleaned_text_has_no_triple_blank_lines(self) -> None:
|
||||
code = "x = 1\n" * 200
|
||||
text = f"Before\n\n```python\n{code}```\n\nAfter"
|
||||
cleaned, _ = _extract_code_snippets(text, limit=100)
|
||||
|
||||
assert "\n\n\n" not in cleaned
|
||||
|
||||
def test_multiple_blocks_cumulative_removal(self) -> None:
|
||||
"""When multiple code blocks exist, extraction decisions should
|
||||
account for previously extracted blocks (two-pass logic).
|
||||
Blocks must be smaller than limit//2 so the 'very large block'
|
||||
override doesn't trigger — we're testing the cumulative logic only."""
|
||||
# Each fenced block is ~103 chars (```python\n + 15*6 + ```)
|
||||
block_a = "a = 1\n" * 15 # 90 chars of code
|
||||
block_b = "b = 2\n" * 15 # 90 chars of code
|
||||
prose = "x" * 200
|
||||
# Total: ~200 + 103 + 103 + overhead ≈ 420 chars
|
||||
# limit=300, limit//2=150. Each block (~103) < 150, so only
|
||||
# the cumulative check applies. Removing block_a (~103 chars)
|
||||
# brings us to ~317 > 300, so block_b should also be extracted.
|
||||
# But with limit=350: removing block_a → ~317 ≤ 350, stop.
|
||||
text = f"{prose}\n```python\n{block_a}```\n```python\n{block_b}```\nEnd"
|
||||
cleaned, snippets = _extract_code_snippets(text, limit=350)
|
||||
|
||||
# After extracting block_a the text is ≤ 350, so block_b stays.
|
||||
assert len(snippets) == 1
|
||||
assert snippets[0].filename == "code_1.python"
|
||||
# block_b should still be in the cleaned text
|
||||
assert "b = 2" in cleaned
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _SNIPPET_TYPE_MAP tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSnippetTypeMap:
|
||||
@pytest.mark.parametrize(
|
||||
"alias,expected",
|
||||
[
|
||||
("py", "python"),
|
||||
("js", "javascript"),
|
||||
("ts", "typescript"),
|
||||
("tsx", "typescript"),
|
||||
("jsx", "javascript"),
|
||||
("sh", "shell"),
|
||||
("bash", "shell"),
|
||||
("yml", "yaml"),
|
||||
("rb", "ruby"),
|
||||
("rs", "rust"),
|
||||
("cs", "csharp"),
|
||||
("md", "markdown"),
|
||||
("text", "plain_text"),
|
||||
],
|
||||
)
|
||||
def test_common_aliases_normalized(self, alias: str, expected: str) -> None:
|
||||
assert _SNIPPET_TYPE_MAP[alias] == expected
|
||||
|
||||
def test_unknown_language_passes_through(self) -> None:
|
||||
unknown = "haskell"
|
||||
assert _SNIPPET_TYPE_MAP.get(unknown, unknown) == "haskell"
|
||||
|
||||
Reference in New Issue
Block a user