Compare commits

15 Commits

Author         SHA1        Message                                                                  Date
trial-danswer  8091efecef  Merge branch 'feat/serpa_search_provider' into feat/google_webscraper   2025-09-30 22:59:57 -07:00
trial-danswer  bc1fccb7ac  Merge branch 'main' into feat/google_webscraper                          2025-09-30 22:59:48 -07:00
trial-danswer  1c1702d50f  handle serper 500s                                                       2025-09-30 22:45:40 -07:00
trial-danswer  666f836a2f  Merge branch 'main' into feat/serpa_search_provider                      2025-09-30 22:01:42 -07:00
trial-danswer  fa70d19ace  Use scrape successful                                                    2025-09-30 21:56:47 -07:00
trial-danswer  cc2c162f3d  Resolve conflicts                                                        2025-09-30 21:54:50 -07:00
trial-danswer  fd9bb91041  .                                                                        2025-09-30 21:51:45 -07:00
trial-danswer  a283e73f47  Minor style fixes                                                        2025-09-30 12:05:01 -07:00
trial-danswer  ec282aed76  .                                                                        2025-09-30 11:57:12 -07:00
trial-danswer  acf2d73b31  Add handling for 400 responses                                           2025-09-30 11:57:12 -07:00
trial-danswer  69a107b123  Merge branch 'main' into feat/serpa_search_provider                      2025-09-30 11:28:18 -07:00
trial-danswer  033265332f  Change num worker threads                                                2025-09-30 10:35:36 -07:00
trial-danswer  ddf5c8f36c  Merge branch 'main' into feat/serpa_search_provider                      2025-09-29 20:11:12 -07:00
trial-danswer  bc7bcccd06  style changes                                                            2025-09-29 19:52:50 -07:00
trial-danswer  9b0443f027  Implement serper client                                                  2025-09-29 19:29:06 -07:00
9 changed files with 355 additions and 3 deletions

View File

@@ -0,0 +1,49 @@
from firecrawl import Firecrawl

from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetContent,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetContentInterface,
)
from onyx.configs.chat_configs import FIRECRAWL_API_KEY
from onyx.utils.retry_wrapper import retry_builder


class FirecrawlContentClient(InternetContentInterface):
    def __init__(self, api_key: str = FIRECRAWL_API_KEY):
        self.firecrawl = Firecrawl(api_key=api_key)

    @retry_builder(tries=3, delay=1, backoff=2)
    def contents(self, urls: list[str]) -> list[InternetContent]:
        if not urls:
            return []

        results = self.firecrawl.batch_scrape(urls)
        output = [
            InternetContent(
                title=(result.metadata and result.metadata.title) or "",
                link=(result.metadata and result.metadata.url) or "",
                full_content=result.markdown or "",
                published_date=None,
            )
            for result in results.data
        ]

        # Any requested URL missing from the scrape results is reported
        # as a failed scrape rather than silently dropped.
        failed_urls = set(urls) - {content.link for content in output}
        output.extend(
            [
                InternetContent(
                    title="",
                    link=url,
                    full_content="",
                    published_date=None,
                    scrape_successful=False,
                )
                for url in failed_urls
            ]
        )
        return output

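For illustration, a minimal usage sketch of this client, assuming FIRECRAWL_API_KEY is configured; the URL and print statements are hypothetical, not part of the diff:

# Hypothetical usage sketch of FirecrawlContentClient.
client = FirecrawlContentClient()
contents = client.contents(["https://example.com/post"])
for content in contents:
    if content.scrape_successful:
        print(content.title, len(content.full_content))
    else:
        print("failed to scrape:", content.link)
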
View File

@@ -0,0 +1,78 @@
import re
from datetime import datetime

from googleapiclient.discovery import build

from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchInterface,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchResult,
)
from onyx.configs.chat_configs import GOOGLE_SEARCH_API_KEY
from onyx.configs.chat_configs import GOOGLE_SEARCH_CX
from onyx.connectors.cross_connector_utils.miscellaneous_utils import datetime_to_utc
from onyx.utils.retry_wrapper import retry_builder


class GoogleSearchClient(InternetSearchInterface):
    def __init__(
        self, api_key: str = GOOGLE_SEARCH_API_KEY, cx: str = GOOGLE_SEARCH_CX
    ):
        self.cx = cx
        self.service = build("customsearch", "v1", developerKey=api_key)

    @retry_builder(tries=3, delay=1, backoff=2)
    def search(self, query: str) -> list[InternetSearchResult]:
        res = (
            self.service.cse()
            .list(
                q=query,
                cx=self.cx,
                num=10,
            )
            .execute()
        )
        items = res.get("items", [])
        return [
            InternetSearchResult(
                title=item["title"],
                link=item["link"],
                snippet=date_snippet[1],
                author=None,
                published_date=(
                    date_str_to_datetime(date_snippet[0]) if date_snippet[0] else None
                ),
            )
            for item in items
            if (date_snippet := extract_date_and_clean_snippet(item.get("snippet")))
        ]


def extract_date_and_clean_snippet(snippet: str) -> tuple[str, str]:
    """
    Google returns snippets in the format: (date ...)? (snippet)
    We want to extract the date and remove it from the snippet.
    """
    if not snippet:
        return "", ""

    # Matches a leading date followed by an ellipsis,
    # e.g. "Mar 17, 2014 ...", "Sep 14, 2025 ...", "Jul 18, 2013 ..."
    date_pattern = r"^([A-Za-z]{3}\s+\d{1,2},\s+\d{4})\s*\.{3}\s*(.*)$"
    match = re.match(date_pattern, snippet)
    if match:
        extracted_date = match.group(1)
        cleaned_snippet = match.group(2)
        return extracted_date, cleaned_snippet
    return "", snippet


def date_str_to_datetime(date_str: str) -> datetime:
    return datetime_to_utc(datetime.strptime(date_str, "%b %d, %Y"))

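A quick sketch of what extract_date_and_clean_snippet does to a raw Google snippet; the inputs below are made up for illustration:

# Hypothetical inputs showing the regex behavior.
assert extract_date_and_clean_snippet("Mar 17, 2014 ... A post about Python.") == (
    "Mar 17, 2014",
    "A post about Python.",
)
# No leading date: the snippet passes through unchanged.
assert extract_date_and_clean_snippet("A post about Python.") == (
    "",
    "A post about Python.",
)
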
View File

@@ -0,0 +1,29 @@
from onyx.agents.agent_search.dr.sub_agents.web_search.models import InternetContent
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetContentInterface,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchInterface,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchProvider,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchResult,
)


class MuxClient(InternetSearchProvider):
    def __init__(
        self,
        search_client: InternetSearchInterface,
        content_client: InternetContentInterface,
    ):
        self.search_client = search_client
        self.content_client = content_client

    def search(self, query: str) -> list[InternetSearchResult]:
        return self.search_client.search(query)

    def contents(self, urls: list[str]) -> list[InternetContent]:
        return self.content_client.contents(urls)

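MuxClient purely delegates, so search and content backends can be mixed freely. A sketch of the pairing the factory further below sets up, assuming both sets of credentials are configured; the query is illustrative:

# Illustrative composition: Google for search, Firecrawl for page contents.
provider = MuxClient(
    search_client=GoogleSearchClient(),
    content_client=FirecrawlContentClient(),
)
results = provider.search("onyx web search")          # handled by GoogleSearchClient
pages = provider.contents([r.link for r in results])  # handled by FirecrawlContentClient
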
View File

@@ -0,0 +1,147 @@
import json
from concurrent.futures import ThreadPoolExecutor

import requests

from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetContent,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchProvider,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchResult,
)
from onyx.configs.chat_configs import SERPER_API_KEY
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from onyx.utils.retry_wrapper import retry_builder

SERPER_SEARCH_URL = "https://google.serper.dev/search"
SERPER_CONTENTS_URL = "https://scrape.serper.dev"


class SerperClient(InternetSearchProvider):
    def __init__(self, api_key: str | None = SERPER_API_KEY) -> None:
        self.headers = {
            "X-API-KEY": api_key,
            "Content-Type": "application/json",
        }

    @retry_builder(tries=3, delay=1, backoff=2)
    def search(self, query: str) -> list[InternetSearchResult]:
        payload = {
            "q": query,
        }
        response = requests.post(
            SERPER_SEARCH_URL,
            headers=self.headers,
            data=json.dumps(payload),
        )
        response.raise_for_status()
        results = response.json()
        organic_results = results["organic"]
        return [
            InternetSearchResult(
                title=result["title"],
                link=result["link"],
                snippet=result["snippet"],
                author=None,
                published_date=None,
            )
            for result in organic_results
        ]

    def contents(self, urls: list[str]) -> list[InternetContent]:
        if not urls:
            return []

        # Serper regularly responds with 500s. We want to retry,
        # but in the event of persistent failure, return an unsuccessful scrape.
        def safe_get_webpage_content(url: str) -> InternetContent:
            try:
                return self._get_webpage_content(url)
            except Exception:
                return InternetContent(
                    title="",
                    link=url,
                    full_content="",
                    published_date=None,
                    scrape_successful=False,
                )

        with ThreadPoolExecutor(max_workers=min(8, len(urls))) as e:
            return list(e.map(safe_get_webpage_content, urls))

    @retry_builder(tries=3, delay=1, backoff=2)
    def _get_webpage_content(self, url: str) -> InternetContent:
        payload = {
            "url": url,
        }
        response = requests.post(
            SERPER_CONTENTS_URL,
            headers=self.headers,
            data=json.dumps(payload),
        )
        # 400 is returned when Serper cannot scrape the page; treat it as
        # a failed scrape rather than an error worth retrying.
        if response.status_code == 400:
            return InternetContent(
                title="",
                link=url,
                full_content="",
                published_date=None,
                scrape_successful=False,
            )
        response.raise_for_status()

        response_json = response.json()

        # The response only guarantees text
        text = response_json["text"]

        # metadata & jsonld are not guaranteed to be present
        metadata = response_json.get("metadata", {})
        jsonld = response_json.get("jsonld", {})

        title = extract_title_from_metadata(metadata)
        # Serper does not provide a reliable mechanism to extract the url
        response_url = url
        published_date_str = extract_published_date_from_jsonld(jsonld)
        published_date = None
        if published_date_str:
            try:
                published_date = time_str_to_utc(published_date_str)
            except Exception:
                published_date = None

        return InternetContent(
            title=title or "",
            link=response_url,
            full_content=text or "",
            published_date=published_date,
        )


def extract_title_from_metadata(metadata: dict[str, str]) -> str | None:
    keys = ["title", "og:title"]
    return extract_value_from_dict(metadata, keys)


def extract_published_date_from_jsonld(jsonld: dict[str, str]) -> str | None:
    keys = ["dateModified"]
    return extract_value_from_dict(jsonld, keys)


def extract_value_from_dict(data: dict[str, str], keys: list[str]) -> str | None:
    for key in keys:
        if key in data:
            return data[key]
    return None

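Note that the retry decorator sits on _get_webpage_content, so each URL is retried up to three times before safe_get_webpage_content converts the final failure into an unsuccessful-scrape placeholder. A sketch of the resulting behavior; the URLs are illustrative:

# Hypothetical usage: one good URL, one that Serper persistently 500s on.
client = SerperClient()
contents = client.contents(["https://example.com/ok", "https://example.com/broken"])
# The failing URL still yields a placeholder rather than raising,
# so one bad page cannot sink the whole batch.
assert all(isinstance(c, InternetContent) for c in contents)
failed = [c.link for c in contents if not c.scrape_successful]
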
View File

@@ -26,6 +26,7 @@ class InternetContent(BaseModel):
    link: str
    full_content: str
    published_date: datetime | None = None
    scrape_successful: bool = True


class InternetSearchProvider(ABC):
@@ -36,3 +37,15 @@ class InternetSearchProvider(ABC):
    @abstractmethod
    def contents(self, urls: list[str]) -> list[InternetContent]:
        pass


class InternetSearchInterface(ABC):
    @abstractmethod
    def search(self, query: str) -> list[InternetSearchResult]:
        pass


class InternetContentInterface(ABC):
    @abstractmethod
    def contents(self, urls: list[str]) -> list[InternetContent]:
        pass

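These two single-method interfaces are what MuxClient composes, so any object satisfying them plugs in. A minimal sketch of a hypothetical test double, not part of the diff:

# Hypothetical stub satisfying InternetContentInterface, e.g. for unit tests.
class FakeContentClient(InternetContentInterface):
    def contents(self, urls: list[str]) -> list[InternetContent]:
        return [
            InternetContent(
                title="stub",
                link=url,
                full_content="stub body",
                published_date=None,
            )
            for url in urls
        ]
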
View File

@@ -1,13 +1,36 @@
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.exa_client import (
    ExaClient,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.firecrawl_client import (
    FirecrawlContentClient,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.google_client import (
    GoogleSearchClient,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.mux_client import (
    MuxClient,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.serper_client import (
    SerperClient,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchProvider,
)
from onyx.configs.chat_configs import EXA_API_KEY
from onyx.configs.chat_configs import FIRECRAWL_API_KEY
from onyx.configs.chat_configs import GOOGLE_SEARCH_API_KEY
from onyx.configs.chat_configs import GOOGLE_SEARCH_CX
from onyx.configs.chat_configs import SERPER_API_KEY


def get_default_provider() -> InternetSearchProvider | None:
    if EXA_API_KEY:
        return ExaClient()
    if FIRECRAWL_API_KEY and GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_CX:
        return MuxClient(
            search_client=GoogleSearchClient(),
            content_client=FirecrawlContentClient(),
        )
    if SERPER_API_KEY:
        return SerperClient()
    return None

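Provider selection is ordered: Exa wins if configured, then the Google + Firecrawl pair, then Serper. A sketch of how a caller might consume this; the fallback handling and query are illustrative:

# Hypothetical call site: fail fast when no provider is configured.
provider = get_default_provider()
if provider is None:
    raise RuntimeError("No internet search provider configured")
results = provider.search("example query")
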
View File

@@ -34,7 +34,7 @@ def dummy_inference_section_from_internet_content(
        boost=1,
        recency_bias=1.0,
        score=1.0,
-        hidden=False,
+        hidden=(not result.scrape_successful),
        metadata={},
        match_highlights=[],
        doc_summary=truncated_content,

View File

@@ -90,6 +90,11 @@ HARD_DELETE_CHATS = os.environ.get("HARD_DELETE_CHATS", "").lower() == "true"
# Internet Search
EXA_API_KEY = os.environ.get("EXA_API_KEY") or None
FIRECRAWL_API_KEY = os.environ.get("FIRECRAWL_API_KEY") or None
GOOGLE_SEARCH_API_KEY = os.environ.get("GOOGLE_SEARCH_API_KEY") or None
GOOGLE_SEARCH_CX = os.environ.get("GOOGLE_SEARCH_CX") or None
SERPER_API_KEY = os.environ.get("SERPER_API_KEY") or None
NUM_INTERNET_SEARCH_RESULTS = int(os.environ.get("NUM_INTERNET_SEARCH_RESULTS") or 10)
NUM_INTERNET_SEARCH_CHUNKS = int(os.environ.get("NUM_INTERNET_SEARCH_CHUNKS") or 50)

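The trailing `or None` in these lookups normalizes empty strings to None, so a key exported as an empty value is treated as unset and downstream defaults such as SerperClient's api_key parameter receive None rather than "". A small sketch of the difference; the variable name is hypothetical:

import os

# With SOME_KEY exported as "" versus not exported at all:
raw = os.environ.get("SOME_KEY")                  # "" if exported empty, None if unset
normalized = os.environ.get("SOME_KEY") or None   # None in both cases
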
View File

@@ -6,6 +6,10 @@ from typing_extensions import override
from onyx.chat.prompt_builder.answer_prompt_builder import AnswerPromptBuilder
from onyx.configs.chat_configs import EXA_API_KEY
from onyx.configs.chat_configs import FIRECRAWL_API_KEY
from onyx.configs.chat_configs import GOOGLE_SEARCH_API_KEY
from onyx.configs.chat_configs import GOOGLE_SEARCH_CX
from onyx.configs.chat_configs import SERPER_API_KEY
from onyx.llm.interfaces import LLM
from onyx.llm.models import PreviousMessage
from onyx.tools.message import ToolCallSummary
@@ -49,8 +53,12 @@ class WebSearchTool(Tool[None]):
    @override
    @classmethod
    def is_available(cls, db_session: Session) -> bool:
-        """Available only if EXA API key is configured."""
-        return bool(EXA_API_KEY)
+        """Available if EXA, Serper, or Google + Firecrawl credentials are configured."""
+        return (
+            bool(EXA_API_KEY)
+            or bool(SERPER_API_KEY)
+            or bool(FIRECRAWL_API_KEY and GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_CX)
+        )

    def tool_definition(self) -> dict:
        return {