Mirror of https://github.com/onyx-dot-app/onyx.git (synced 2026-02-16 23:35:46 +00:00)

Compare commits: experiment...feat/googl (15 commits)

- 8091efecef
- bc1fccb7ac
- 1c1702d50f
- 666f836a2f
- fa70d19ace
- cc2c162f3d
- fd9bb91041
- a283e73f47
- ec282aed76
- acf2d73b31
- 69a107b123
- 033265332f
- ddf5c8f36c
- bc7bcccd06
- 9b0443f027
@@ -0,0 +1,49 @@ (new file)
```python
from firecrawl import Firecrawl

from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetContent,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetContentInterface,
)
from onyx.configs.chat_configs import FIRECRAWL_API_KEY
from onyx.utils.retry_wrapper import retry_builder


class FirecrawlContentClient(InternetContentInterface):
    def __init__(self, api_key: str = FIRECRAWL_API_KEY):
        self.firecrawl = Firecrawl(api_key=api_key)

    @retry_builder(tries=3, delay=1, backoff=2)
    def contents(self, urls: list[str]) -> list[InternetContent]:
        if not urls:
            return []

        results = self.firecrawl.batch_scrape(urls)

        output = [
            InternetContent(
                title=result.metadata and result.metadata.title or "",
                link=result.metadata and result.metadata.url or "",
                full_content=result.markdown or "",
                published_date=None,
            )
            for result in results.data
        ]

        failed_urls = set(urls) - set(map(lambda x: x.link, output))

        output.extend(
            [
                InternetContent(
                    title="",
                    link=url,
                    full_content="",
                    published_date=None,
                    scrape_successful=False,
                )
                for url in failed_urls
            ]
        )

        return output
```
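A minimal usage sketch of this content client, assuming `FIRECRAWL_API_KEY` is configured and that the module is importable at the path used by the provider factory further down in this diff; the URLs are placeholders.

```python
# Usage sketch (assumes FIRECRAWL_API_KEY is set; import path taken from the
# provider-factory hunk later in this diff).
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.firecrawl_client import (
    FirecrawlContentClient,
)

client = FirecrawlContentClient()
pages = client.contents(["https://example.com", "https://example.org"])

for page in pages:
    # URLs Firecrawl could not scrape come back as placeholder
    # InternetContent objects with scrape_successful=False.
    print(page.link, page.scrape_successful, len(page.full_content))
```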
@@ -0,0 +1,78 @@ (new file)
```python
import re
from datetime import datetime

from googleapiclient.discovery import build

from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchInterface,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchResult,
)
from onyx.configs.chat_configs import GOOGLE_SEARCH_API_KEY
from onyx.configs.chat_configs import GOOGLE_SEARCH_CX
from onyx.connectors.cross_connector_utils.miscellaneous_utils import datetime_to_utc
from onyx.utils.retry_wrapper import retry_builder


class GoogleSearchClient(InternetSearchInterface):
    def __init__(
        self, api_key: str = GOOGLE_SEARCH_API_KEY, cx: str = GOOGLE_SEARCH_CX
    ):
        self.cx = cx

        self.service = build("customsearch", "v1", developerKey=api_key)

    @retry_builder(tries=3, delay=1, backoff=2)
    def search(self, query: str) -> list[InternetSearchResult]:
        res = (
            self.service.cse()
            .list(
                q=query,
                cx=self.cx,
                num=10,
            )
            .execute()
        )

        items = res.get("items", [])

        return [
            InternetSearchResult(
                title=item["title"],
                link=item["link"],
                snippet=date_snippet[1],
                author=None,
                published_date=(
                    date_str_to_datetime(date_snippet[0]) if date_snippet[0] else None
                ),
            )
            for item in items
            if (date_snippet := extract_date_and_clean_snippet(item.get("snippet")))
        ]


def extract_date_and_clean_snippet(snippet: str) -> tuple[str, str]:
    """
    Google returns snippets in the format: ?(date ... ) (snippet)
    We want to extract the date and remove it from the snippet
    """
    if not snippet:
        return "", ""

    # Pattern match the date
    # Matches formats like: "Mar 17, 2014 ...", "Sep 14, 2025 ...", "Jul 18, 2013 ..."
    date_pattern = r"^([A-Za-z]{3}\s+\d{1,2},\s+\d{4})\s*\.{3}\s*(.*)$"

    match = re.match(date_pattern, snippet)

    if match:
        extracted_date = match.group(1)
        cleaned_snippet = match.group(2)
        return extracted_date, cleaned_snippet

    return "", snippet


def date_str_to_datetime(date_str: str) -> datetime:
    return datetime_to_utc(datetime.strptime(date_str, "%b %d, %Y"))
```
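The snippet-cleaning regex above can be checked in isolation; the sample snippet text below is invented purely for illustration, but follows the "date ... text" shape the pattern targets.

```python
import re

# Same date pattern as extract_date_and_clean_snippet above.
date_pattern = r"^([A-Za-z]{3}\s+\d{1,2},\s+\d{4})\s*\.{3}\s*(.*)$"

# Hypothetical snippet in the shape Google Custom Search returns.
sample = "Mar 17, 2014 ... An example snippet describing the page."

match = re.match(date_pattern, sample)
if match:
    print(match.group(1))  # Mar 17, 2014
    print(match.group(2))  # An example snippet describing the page.
```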
@@ -0,0 +1,29 @@ (new file)
```python
from onyx.agents.agent_search.dr.sub_agents.web_search.models import InternetContent
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetContentInterface,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchInterface,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchProvider,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchResult,
)


class MuxClient(InternetSearchProvider):
    def __init__(
        self,
        search_client: InternetSearchInterface,
        content_client: InternetContentInterface,
    ):
        self.search_client = search_client
        self.content_client = content_client

    def search(self, query: str) -> list[InternetSearchResult]:
        return self.search_client.search(query)

    def contents(self, urls: list[str]) -> list[InternetContent]:
        return self.content_client.contents(urls)
```
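MuxClient simply delegates: one backend answers search queries, another fetches page contents. A sketch of the pairing the provider factory below sets up (Google for search, Firecrawl for contents), assuming both sets of credentials are configured; the query string is a placeholder.

```python
# Composition sketch (assumes GOOGLE_SEARCH_API_KEY, GOOGLE_SEARCH_CX and
# FIRECRAWL_API_KEY are configured); mirrors what get_default_provider() below builds.
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.firecrawl_client import (
    FirecrawlContentClient,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.google_client import (
    GoogleSearchClient,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.mux_client import (
    MuxClient,
)

provider = MuxClient(
    search_client=GoogleSearchClient(),
    content_client=FirecrawlContentClient(),
)

results = provider.search("onyx open source")             # Google Custom Search
contents = provider.contents([r.link for r in results])   # Firecrawl scraping
```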
@@ -0,0 +1,147 @@ (new file)
```python
import json
from concurrent.futures import ThreadPoolExecutor

import requests

from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetContent,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchProvider,
)
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetSearchResult,
)
from onyx.configs.chat_configs import SERPER_API_KEY
from onyx.connectors.cross_connector_utils.miscellaneous_utils import time_str_to_utc
from onyx.utils.retry_wrapper import retry_builder

SERPER_SEARCH_URL = "https://google.serper.dev/search"
SERPER_CONTENTS_URL = "https://scrape.serper.dev"


class SerperClient(InternetSearchProvider):
    def __init__(self, api_key: str | None = SERPER_API_KEY) -> None:
        self.headers = {
            "X-API-KEY": api_key,
            "Content-Type": "application/json",
        }

    @retry_builder(tries=3, delay=1, backoff=2)
    def search(self, query: str) -> list[InternetSearchResult]:
        payload = {
            "q": query,
        }

        response = requests.post(
            SERPER_SEARCH_URL,
            headers=self.headers,
            data=json.dumps(payload),
        )

        response.raise_for_status()

        results = response.json()
        organic_results = results["organic"]

        return [
            InternetSearchResult(
                title=result["title"],
                link=result["link"],
                snippet=result["snippet"],
                author=None,
                published_date=None,
            )
            for result in organic_results
        ]

    def contents(self, urls: list[str]) -> list[InternetContent]:
        if not urls:
            return []

        # Serper can respond with 500s regularly. We want to retry,
        # but in the event of failure, return an unsuccessful scrape.
        def safe_get_webpage_content(url: str) -> InternetContent:
            try:
                return self._get_webpage_content(url)
            except Exception:
                return InternetContent(
                    title="",
                    link=url,
                    full_content="",
                    published_date=None,
                    scrape_successful=False,
                )

        with ThreadPoolExecutor(max_workers=min(8, len(urls))) as e:
            return list(e.map(safe_get_webpage_content, urls))

    @retry_builder(tries=3, delay=1, backoff=2)
    def _get_webpage_content(self, url: str) -> InternetContent:
        payload = {
            "url": url,
        }

        response = requests.post(
            SERPER_CONTENTS_URL,
            headers=self.headers,
            data=json.dumps(payload),
        )

        # 400 returned when serper cannot scrape
        if response.status_code == 400:
            return InternetContent(
                title="",
                link=url,
                full_content="",
                published_date=None,
                scrape_successful=False,
            )

        response.raise_for_status()

        response_json = response.json()

        # Response only guarantees text
        text = response_json["text"]

        # metadata & jsonld are not guaranteed to be present
        metadata = response_json.get("metadata", {})
        jsonld = response_json.get("jsonld", {})

        title = extract_title_from_metadata(metadata)

        # Serper does not provide a reliable mechanism to extract the url
        response_url = url
        published_date_str = extract_published_date_from_jsonld(jsonld)
        published_date = None

        if published_date_str:
            try:
                published_date = time_str_to_utc(published_date_str)
            except Exception:
                published_date = None

        return InternetContent(
            title=title or "",
            link=response_url,
            full_content=text or "",
            published_date=published_date,
        )


def extract_title_from_metadata(metadata: dict[str, str]) -> str | None:
    keys = ["title", "og:title"]
    return extract_value_from_dict(metadata, keys)


def extract_published_date_from_jsonld(jsonld: dict[str, str]) -> str | None:
    keys = ["dateModified"]
    return extract_value_from_dict(jsonld, keys)


def extract_value_from_dict(data: dict[str, str], keys: list[str]) -> str | None:
    for key in keys:
        if key in data:
            return data[key]
    return None
```
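Serper covers both halves of the provider interface on its own, so a single client can run the full search-then-scrape flow. A short sketch, assuming `SERPER_API_KEY` is configured and using a placeholder query.

```python
# End-to-end sketch (assumes SERPER_API_KEY is configured).
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.serper_client import (
    SerperClient,
)

provider = SerperClient()

results = provider.search("onyx deep research")
pages = provider.contents([r.link for r in results])

for page in pages:
    # Pages Serper could not scrape (400s, or repeated 500s after retries)
    # come back with scrape_successful=False rather than raising.
    print(page.link, page.scrape_successful)
```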
```diff
@@ -26,6 +26,7 @@ class InternetContent(BaseModel):
     link: str
     full_content: str
     published_date: datetime | None = None
+    scrape_successful: bool = True


 class InternetSearchProvider(ABC):
@@ -36,3 +37,15 @@ class InternetSearchProvider(ABC):
     @abstractmethod
     def contents(self, urls: list[str]) -> list[InternetContent]:
         pass
+
+
+class InternetSearchInterface(ABC):
+    @abstractmethod
+    def search(self, query: str) -> list[InternetSearchResult]:
+        pass
+
+
+class InternetContentInterface(ABC):
+    @abstractmethod
+    def contents(self, urls: list[str]) -> list[InternetContent]:
+        pass
```
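Splitting the model into InternetSearchInterface and InternetContentInterface means a backend only has to implement the half it supports; MuxClient above stitches two such halves into a full InternetSearchProvider. A hypothetical stub implementation, shown only to illustrate the required surface (the class name and canned values are made up):

```python
# Hypothetical stub content backend; illustrates the InternetContentInterface surface.
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
    InternetContent,
    InternetContentInterface,
)


class StaticContentClient(InternetContentInterface):
    """Returns canned content; could serve as a test double."""

    def contents(self, urls: list[str]) -> list[InternetContent]:
        return [
            InternetContent(
                title="stub",
                link=url,
                full_content="stub content",
                published_date=None,
            )
            for url in urls
        ]
```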
@@ -1,13 +1,36 @@
|
||||
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.exa_client import (
|
||||
ExaClient,
|
||||
)
|
||||
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.firecrawl_client import (
|
||||
FirecrawlContentClient,
|
||||
)
|
||||
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.google_client import (
|
||||
GoogleSearchClient,
|
||||
)
|
||||
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.mux_client import (
|
||||
MuxClient,
|
||||
)
|
||||
from onyx.agents.agent_search.dr.sub_agents.web_search.clients.serper_client import (
|
||||
SerperClient,
|
||||
)
|
||||
from onyx.agents.agent_search.dr.sub_agents.web_search.models import (
|
||||
InternetSearchProvider,
|
||||
)
|
||||
from onyx.configs.chat_configs import EXA_API_KEY
|
||||
from onyx.configs.chat_configs import FIRECRAWL_API_KEY
|
||||
from onyx.configs.chat_configs import GOOGLE_SEARCH_API_KEY
|
||||
from onyx.configs.chat_configs import GOOGLE_SEARCH_CX
|
||||
from onyx.configs.chat_configs import SERPER_API_KEY
|
||||
|
||||
|
||||
def get_default_provider() -> InternetSearchProvider | None:
|
||||
if EXA_API_KEY:
|
||||
return ExaClient()
|
||||
if FIRECRAWL_API_KEY and GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_CX:
|
||||
return MuxClient(
|
||||
search_client=GoogleSearchClient(),
|
||||
content_client=FirecrawlContentClient(),
|
||||
)
|
||||
if SERPER_API_KEY:
|
||||
return SerperClient()
|
||||
return None
|
||||
|
||||
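The factory prefers Exa, then the Google + Firecrawl pairing, then Serper, and returns None when nothing is configured, so callers need to handle the unconfigured case. A small sketch of that call site; the diff does not show the factory module's filename, so the sketch assumes `get_default_provider` is already imported, and the query is a placeholder.

```python
# Usage sketch; assumes get_default_provider from the factory module above is in scope.
provider = get_default_provider()

if provider is None:
    # None of EXA, GOOGLE + FIRECRAWL, or SERPER credentials are configured.
    raise RuntimeError("No internet search provider configured")

results = provider.search("onyx hybrid search")
pages = provider.contents([r.link for r in results])
```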
```diff
@@ -34,7 +34,7 @@ def dummy_inference_section_from_internet_content(
         boost=1,
         recency_bias=1.0,
         score=1.0,
-        hidden=False,
+        hidden=(not result.scrape_successful),
         metadata={},
         match_highlights=[],
         doc_summary=truncated_content,
```
```diff
@@ -90,6 +90,11 @@ HARD_DELETE_CHATS = os.environ.get("HARD_DELETE_CHATS", "").lower() == "true"

 # Internet Search
 EXA_API_KEY = os.environ.get("EXA_API_KEY") or None
+FIRECRAWL_API_KEY = os.environ.get("FIRECRAWL_API_KEY") or None
+
+GOOGLE_SEARCH_API_KEY = os.environ.get("GOOGLE_SEARCH_API_KEY") or None
+GOOGLE_SEARCH_CX = os.environ.get("GOOGLE_SEARCH_CX") or None
+SERPER_API_KEY = os.environ.get("SERPER_API_KEY") or None

 NUM_INTERNET_SEARCH_RESULTS = int(os.environ.get("NUM_INTERNET_SEARCH_RESULTS") or 10)
 NUM_INTERNET_SEARCH_CHUNKS = int(os.environ.get("NUM_INTERNET_SEARCH_CHUNKS") or 50)
```
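The `... or None` idiom used for these settings collapses both unset and empty-string environment variables to None, which is what the availability checks and the provider factory test against. A quick standalone illustration; the key value is made up.

```python
import os

# "" (set but empty) and unset both normalize to None with the `or None` idiom.
os.environ["FIRECRAWL_API_KEY"] = ""
print(os.environ.get("FIRECRAWL_API_KEY") or None)  # None
print(os.environ.get("NOT_SET_ANYWHERE") or None)   # None

os.environ["FIRECRAWL_API_KEY"] = "fc-123"          # made-up value
print(os.environ.get("FIRECRAWL_API_KEY") or None)  # fc-123
```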
```diff
@@ -6,6 +6,10 @@ from typing_extensions import override

 from onyx.chat.prompt_builder.answer_prompt_builder import AnswerPromptBuilder
 from onyx.configs.chat_configs import EXA_API_KEY
+from onyx.configs.chat_configs import FIRECRAWL_API_KEY
+from onyx.configs.chat_configs import GOOGLE_SEARCH_API_KEY
+from onyx.configs.chat_configs import GOOGLE_SEARCH_CX
+from onyx.configs.chat_configs import SERPER_API_KEY
 from onyx.llm.interfaces import LLM
 from onyx.llm.models import PreviousMessage
 from onyx.tools.message import ToolCallSummary
@@ -49,8 +53,12 @@ class WebSearchTool(Tool[None]):
     @override
     @classmethod
     def is_available(cls, db_session: Session) -> bool:
-        """Available only if EXA API key is configured."""
-        return bool(EXA_API_KEY)
+        """Available only if EXA or SERPER API key or GOOGLE + FIRECRAWL is configured."""
+        return (
+            bool(EXA_API_KEY)
+            or bool(SERPER_API_KEY)
+            or bool(FIRECRAWL_API_KEY and GOOGLE_SEARCH_API_KEY and GOOGLE_SEARCH_CX)
+        )

     def tool_definition(self) -> dict:
         return {
```