Compare commits

..

1 Commit

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| pablonyx | 59c454debe | k | 2025-04-03 11:45:38 -07:00 |
75 changed files with 460 additions and 7014 deletions

View File

@@ -23,10 +23,6 @@ env:
# Jira
JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }}
JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }}
GONG_ACCESS_KEY: ${{ secrets.GONG_ACCESS_KEY }}
GONG_ACCESS_KEY_SECRET: ${{ secrets.GONG_ACCESS_KEY_SECRET }}
# Google
GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR: ${{ secrets.GOOGLE_DRIVE_SERVICE_ACCOUNT_JSON_STR }}
GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1: ${{ secrets.GOOGLE_DRIVE_OAUTH_CREDENTIALS_JSON_STR_TEST_USER_1 }}

View File

@@ -30,26 +30,30 @@ Keep knowledge and access controls sync-ed across over 40 connectors like Google
Create custom AI agents with unique prompts, knowledge, and actions that the agents can take.
Onyx can be deployed securely anywhere and for any scale - on a laptop, on-premise, or to cloud.
<h3>Feature Highlights</h3>
**Deep research over your team's knowledge:**
https://private-user-images.githubusercontent.com/32520769/414509312-48392e83-95d0-4fb5-8650-a396e05e0a32.mp4?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSIsImtleSI6ImtleTUiLCJleHAiOjE3Mzk5Mjg2MzYsIm5iZiI6MTczOTkyODMzNiwicGF0aCI6Ii8zMjUyMDc2OS80MTQ1MDkzMTItNDgzOTJlODMtOTVkMC00ZmI1LTg2NTAtYTM5NmUwNWUwYTMyLm1wND9YLUFtei1BbGdvcml0aG09QVdTNC1ITUFDLVNIQTI1NiZYLUFtei1DcmVkZW50aWFsPUFLSUFWQ09EWUxTQTUzUFFLNFpBJTJGMjAyNTAyMTklMkZ1cy1lYXN0LTElMkZzMyUyRmF3czRfcmVxdWVzdCZYLUFtei1EYXRlPTIwMjUwMjE5VDAxMjUzNlomWC1BbXotRXhwaXJlcz0zMDAmWC1BbXotU2lnbmF0dXJlPWFhMzk5Njg2Y2Y5YjFmNDNiYTQ2YzM5ZTg5YWJiYTU2NWMyY2YwNmUyODE2NWUxMDRiMWQxZWJmODI4YTA0MTUmWC1BbXotU2lnbmVkSGVhZGVycz1ob3N0In0.a9D8A0sgKE9AoaoE-mfFbJ6_OKYeqaf7TZ4Han2JfW8
**Use Onyx as a secure AI Chat with any LLM:**
![Onyx Chat Silent Demo](https://github.com/onyx-dot-app/onyx/releases/download/v0.21.1/OnyxChatSilentDemo.gif)
**Easily set up connectors to your apps:**
![Onyx Connector Silent Demo](https://github.com/onyx-dot-app/onyx/releases/download/v0.21.1/OnyxConnectorSilentDemo.gif)
**Access Onyx where your team already works:**
![Onyx Bot Demo](https://github.com/onyx-dot-app/onyx/releases/download/v0.21.1/OnyxBot.png)
## Deployment
**To try it out for free and get started in seconds, check out [Onyx Cloud](https://cloud.onyx.app/signup)**.
Onyx can also be run locally (even on a laptop) or deployed on a virtual machine with a single
@@ -58,23 +62,23 @@ Onyx can also be run locally (even on a laptop) or deployed on a virtual machine
We also have built-in support for high-availability/scalable deployment on Kubernetes.
References [here](https://github.com/onyx-dot-app/onyx/tree/main/deployment).
## 🔍 Other Notable Benefits of Onyx
- Custom deep learning models for indexing and inference, available only through Onyx, which learn from user feedback.
- Flexible security features like SSO (OIDC/SAML/OAuth2), RBAC, encryption of credentials, etc.
- Knowledge curation features like document-sets, query history, usage analytics, etc.
- Scalable deployment options tested up to many tens of thousands of users and hundreds of millions of documents.
## 🚧 Roadmap
- New methods in information retrieval (StructRAG, LightGraphRAG, etc.)
- Personalized Search
- Organizational understanding and ability to locate and suggest experts from your team.
- Code Search
- SQL and structured query support
## 🔌 Connectors
Keep knowledge and access in sync across 40+ connectors:
- Google Drive
@@ -95,65 +99,19 @@ Keep knowledge and access up to sync across 40+ connectors:
See the full list [here](https://docs.onyx.app/connectors).
## 📚 Licensing
There are two editions of Onyx:
- Onyx Community Edition (CE) is available freely under the MIT Expat license. Simply follow the Deployment guide above.
- Onyx Enterprise Edition (EE) includes extra features that are primarily useful for larger organizations.
For feature details, check out [our website](https://www.onyx.app/pricing).
To try the Onyx Enterprise Edition:
1. Check out [Onyx Cloud](https://cloud.onyx.app/signup).
2. For self-hosting the Enterprise Edition, contact us at [founders@onyx.app](mailto:founders@onyx.app) or book a call with us on our [Cal](https://cal.com/team/onyx/founders).
## 💡 Contributing
Looking to contribute? Please check out the [Contribution Guide](CONTRIBUTING.md) for more details.
# YC Company Twitter Scraper
A script that scrapes YC company pages and extracts Twitter/X.com links.
## Requirements
- Python 3.7+
- Playwright
## Installation
1. Install the required packages:
```
pip install -r requirements.txt
```
2. Install Playwright browsers:
```
playwright install
```
## Usage
Run the script with default settings:
```
python scrape_yc_twitter.py
```
This will scrape the YC companies from recent batches (W23, S23, S24, F24, S22, W22) and save the Twitter links to `twitter_links.txt`.
### Custom URL and Output
```
python scrape_yc_twitter.py --url "https://www.ycombinator.com/companies?batch=W24" --output "w24_twitter.txt"
```
## How it works
1. Navigates to the specified YC companies page
2. Scrolls down to load all company cards
3. Extracts links to individual company pages
4. Visits each company page and extracts Twitter/X.com links
5. Saves the results to a text file
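The scraper script itself is not included in this diff. As a rough sketch of the flow above (the selectors, scroll counts, and URL handling are assumptions, not the actual `scrape_yc_twitter.py` implementation):
```
# Minimal sketch of the described flow -- selectors and structure are assumptions.
from playwright.sync_api import sync_playwright


def scrape_twitter_links(url: str, output: str) -> None:
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()

        # 1. Navigate to the specified YC companies page
        page.goto(url)

        # 2. Scroll to load all lazily rendered company cards
        for _ in range(20):
            page.mouse.wheel(0, 5000)
            page.wait_for_timeout(500)

        # 3. Extract links to individual company pages
        company_paths = set()
        for a in page.query_selector_all("a[href*='/companies/']"):
            href = a.get_attribute("href")
            if href:
                company_paths.add(href)

        # 4. Visit each company page and collect Twitter/X.com links
        twitter_links = []
        for path in sorted(company_paths):
            page.goto(f"https://www.ycombinator.com{path}")
            for a in page.query_selector_all("a[href*='twitter.com'], a[href*='x.com']"):
                href = a.get_attribute("href")
                if href:
                    twitter_links.append(href)

        browser.close()

    # 5. Save the results to a text file
    with open(output, "w") as f:
        f.write("\n".join(twitter_links))


if __name__ == "__main__":
    scrape_twitter_links(
        "https://www.ycombinator.com/companies?batch=W24", "w24_twitter.txt"
    )
```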

View File

@@ -1,45 +0,0 @@
# YC Company Twitter Scraper
A script that scrapes YC company pages and extracts Twitter/X.com links.
## Requirements
- Python 3.7+
- Playwright
## Installation
1. Install the required packages:
```
pip install -r requirements.txt
```
2. Install Playwright browsers:
```
playwright install
```
## Usage
Run the script with default settings:
```
python scrape_yc_twitter.py
```
This will scrape the YC companies from recent batches (W23, S23, S24, F24, S22, W22) and save the Twitter links to `twitter_links.txt`.
### Custom URL and Output
```
python scrape_yc_twitter.py --url "https://www.ycombinator.com/companies?batch=W24" --output "w24_twitter.txt"
```
## How it works
1. Navigates to the specified YC companies page
2. Scrolls down to load all company cards
3. Extracts links to individual company pages
4. Visits each company page and extracts Twitter/X.com links
5. Saves the results to a text file

View File

@@ -51,9 +51,9 @@ def _get_objects_access_for_user_email_from_salesforce(
# This is cached in the function so the first query takes an extra 0.1-0.3 seconds
# but subsequent queries by the same user are essentially instant
start_time = time.monotonic()
start_time = time.time()
user_id = get_salesforce_user_id_from_email(salesforce_client, user_email)
end_time = time.monotonic()
end_time = time.time()
logger.info(
f"Time taken to get Salesforce user ID: {end_time - start_time} seconds"
)
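Both timing calls in the hunk above measure the same elapsed interval; `time.monotonic()` is immune to system clock adjustments and is therefore the usual choice for durations, while `time.time()` reports wall-clock time. A minimal sketch of the measurement pattern, where the lookup function is a hypothetical placeholder rather than the real Salesforce helper:
```
import time


def lookup_user_id(email: str) -> str:
    # Hypothetical stand-in for get_salesforce_user_id_from_email(...)
    time.sleep(0.2)
    return "005000000000000AAA"


start_time = time.monotonic()  # monotonic clock: unaffected by wall-clock changes
user_id = lookup_user_id("user@example.com")
end_time = time.monotonic()
print(f"Time taken to get Salesforce user ID: {end_time - start_time} seconds")
```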

View File

@@ -1,6 +1,10 @@
from simple_salesforce import Salesforce
from sqlalchemy.orm import Session
from onyx.connectors.salesforce.sqlite_functions import get_user_id_by_email
from onyx.connectors.salesforce.sqlite_functions import init_db
from onyx.connectors.salesforce.sqlite_functions import NULL_ID_STRING
from onyx.connectors.salesforce.sqlite_functions import update_email_to_id_table
from onyx.db.connector_credential_pair import get_connector_credential_pair_from_id
from onyx.db.document import get_cc_pairs_for_document
from onyx.utils.logger import setup_logger
@@ -24,8 +28,6 @@ def get_any_salesforce_client_for_doc_id(
E.g. there are 2 different credential sets for 2 different salesforce cc_pairs
but only one has the permissions to access the permissions needed for the query.
"""
# NOTE: this global seems very very bad
global _ANY_SALESFORCE_CLIENT
if _ANY_SALESFORCE_CLIENT is None:
cc_pairs = get_cc_pairs_for_document(db_session, doc_id)
@@ -82,21 +84,35 @@ def get_salesforce_user_id_from_email(
salesforce database. (Around 0.1-0.3 seconds)
If it's cached or stored in the local salesforce database, it's fast (<0.001 seconds).
"""
# NOTE: this global seems bad
global _CACHED_SF_EMAIL_TO_ID_MAP
if user_email in _CACHED_SF_EMAIL_TO_ID_MAP:
if _CACHED_SF_EMAIL_TO_ID_MAP[user_email] is not None:
return _CACHED_SF_EMAIL_TO_ID_MAP[user_email]
# some caching via sqlite existed here before ... check history if interested
# ...query Salesforce and store the result in the database
user_id = _query_salesforce_user_id(sf_client, user_email)
db_exists = True
try:
# Check if the user is already in the database
user_id = get_user_id_by_email(user_email)
except Exception:
init_db()
try:
user_id = get_user_id_by_email(user_email)
except Exception as e:
logger.error(f"Error checking if user is in database: {e}")
user_id = None
db_exists = False
# If no entry is found in the database (indicated by user_id being None)...
if user_id is None:
# ...query Salesforce and store the result in the database
user_id = _query_salesforce_user_id(sf_client, user_email)
if db_exists:
update_email_to_id_table(user_email, user_id)
return user_id
elif user_id is None:
return None
elif user_id == NULL_ID_STRING:
return None
# If the found user_id is real, cache it
_CACHED_SF_EMAIL_TO_ID_MAP[user_email] = user_id
return user_id

View File

@@ -5,14 +5,12 @@ from slack_sdk import WebClient
from ee.onyx.external_permissions.slack.utils import fetch_user_id_to_email_map
from onyx.access.models import DocExternalAccess
from onyx.access.models import ExternalAccess
from onyx.connectors.credentials_provider import OnyxDBCredentialsProvider
from onyx.connectors.slack.connector import get_channels
from onyx.connectors.slack.connector import make_paginated_slack_api_call_w_retries
from onyx.connectors.slack.connector import SlackConnector
from onyx.db.models import ConnectorCredentialPair
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
from shared_configs.contextvars import get_current_tenant_id
logger = setup_logger()
@@ -103,12 +101,7 @@ def _get_slack_document_access(
callback: IndexingHeartbeatInterface | None,
) -> Generator[DocExternalAccess, None, None]:
slack_connector = SlackConnector(**cc_pair.connector.connector_specific_config)
# Use credentials provider instead of directly loading credentials
provider = OnyxDBCredentialsProvider(
get_current_tenant_id(), "slack", cc_pair.credential.id
)
slack_connector.set_credentials_provider(provider)
slack_connector.load_credentials(cc_pair.credential.credential_json)
slim_doc_generator = slack_connector.retrieve_all_slim_documents(callback=callback)

View File

@@ -51,7 +51,6 @@ def _get_slack_group_members_email(
def slack_group_sync(
tenant_id: str,
cc_pair: ConnectorCredentialPair,
) -> list[ExternalUserGroup]:
slack_client = WebClient(

View File

@@ -15,7 +15,6 @@ from ee.onyx.external_permissions.post_query_censoring import (
DOC_SOURCE_TO_CHUNK_CENSORING_FUNCTION,
)
from ee.onyx.external_permissions.slack.doc_sync import slack_doc_sync
from ee.onyx.external_permissions.slack.group_sync import slack_group_sync
from onyx.access.models import DocExternalAccess
from onyx.configs.constants import DocumentSource
from onyx.db.models import ConnectorCredentialPair
@@ -57,7 +56,6 @@ DOC_PERMISSIONS_FUNC_MAP: dict[DocumentSource, DocSyncFuncType] = {
GROUP_PERMISSIONS_FUNC_MAP: dict[DocumentSource, GroupSyncFuncType] = {
DocumentSource.GOOGLE_DRIVE: gdrive_group_sync,
DocumentSource.CONFLUENCE: confluence_group_sync,
DocumentSource.SLACK: slack_group_sync,
}

View File

@@ -1,62 +0,0 @@
from collections.abc import Hashable
from typing import cast
from langchain_core.runnables.config import RunnableConfig
from langgraph.types import Send
from onyx.agents.agent_search.dc_search_analysis.states import ObjectInformationInput
from onyx.agents.agent_search.dc_search_analysis.states import (
ObjectResearchInformationUpdate,
)
from onyx.agents.agent_search.dc_search_analysis.states import ObjectSourceInput
from onyx.agents.agent_search.dc_search_analysis.states import (
SearchSourcesObjectsUpdate,
)
from onyx.agents.agent_search.models import GraphConfig
def parallel_object_source_research_edge(
state: SearchSourcesObjectsUpdate, config: RunnableConfig
) -> list[Send | Hashable]:
"""
LangGraph edge to parallelize the research for an individual object and source
"""
search_objects = state.analysis_objects
search_sources = state.analysis_sources
object_source_combinations = [
(object, source) for object in search_objects for source in search_sources
]
return [
Send(
"research_object_source",
ObjectSourceInput(
object_source_combination=object_source_combination,
log_messages=[],
),
)
for object_source_combination in object_source_combinations
]
def parallel_object_research_consolidation_edge(
state: ObjectResearchInformationUpdate, config: RunnableConfig
) -> list[Send | Hashable]:
"""
LangGraph edge to parallelize the research for an individual object and source
"""
cast(GraphConfig, config["metadata"]["config"])
object_research_information_results = state.object_research_information_results
return [
Send(
"consolidate_object_research",
ObjectInformationInput(
object_information=object_information,
log_messages=[],
),
)
for object_information in object_research_information_results
]

View File

@@ -1,103 +0,0 @@
from langgraph.graph import END
from langgraph.graph import START
from langgraph.graph import StateGraph
from onyx.agents.agent_search.dc_search_analysis.edges import (
parallel_object_research_consolidation_edge,
)
from onyx.agents.agent_search.dc_search_analysis.edges import (
parallel_object_source_research_edge,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a1_search_objects import (
search_objects,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a2_research_object_source import (
research_object_source,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a3_structure_research_by_object import (
structure_research_by_object,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a4_consolidate_object_research import (
consolidate_object_research,
)
from onyx.agents.agent_search.dc_search_analysis.nodes.a5_consolidate_research import (
consolidate_research,
)
from onyx.agents.agent_search.dc_search_analysis.states import MainInput
from onyx.agents.agent_search.dc_search_analysis.states import MainState
from onyx.utils.logger import setup_logger
logger = setup_logger()
test_mode = False
def divide_and_conquer_graph_builder(test_mode: bool = False) -> StateGraph:
"""
LangGraph graph builder for the knowledge graph search process.
"""
graph = StateGraph(
state_schema=MainState,
input=MainInput,
)
### Add nodes ###
graph.add_node(
"search_objects",
search_objects,
)
graph.add_node(
"structure_research_by_source",
structure_research_by_object,
)
graph.add_node(
"research_object_source",
research_object_source,
)
graph.add_node(
"consolidate_object_research",
consolidate_object_research,
)
graph.add_node(
"consolidate_research",
consolidate_research,
)
### Add edges ###
graph.add_edge(start_key=START, end_key="search_objects")
graph.add_conditional_edges(
source="search_objects",
path=parallel_object_source_research_edge,
path_map=["research_object_source"],
)
graph.add_edge(
start_key="research_object_source",
end_key="structure_research_by_source",
)
graph.add_conditional_edges(
source="structure_research_by_source",
path=parallel_object_research_consolidation_edge,
path_map=["consolidate_object_research"],
)
graph.add_edge(
start_key="consolidate_object_research",
end_key="consolidate_research",
)
graph.add_edge(
start_key="consolidate_research",
end_key=END,
)
return graph

View File

@@ -1,159 +0,0 @@
from typing import cast
from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnableConfig
from langgraph.types import StreamWriter
from onyx.agents.agent_search.dc_search_analysis.ops import extract_section
from onyx.agents.agent_search.dc_search_analysis.ops import research
from onyx.agents.agent_search.dc_search_analysis.states import MainState
from onyx.agents.agent_search.dc_search_analysis.states import (
SearchSourcesObjectsUpdate,
)
from onyx.agents.agent_search.models import GraphConfig
from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
trim_prompt_piece,
)
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
from onyx.chat.models import AgentAnswerPiece
from onyx.configs.constants import DocumentSource
from onyx.prompts.agents.dc_prompts import DC_OBJECT_NO_BASE_DATA_EXTRACTION_PROMPT
from onyx.prompts.agents.dc_prompts import DC_OBJECT_SEPARATOR
from onyx.prompts.agents.dc_prompts import DC_OBJECT_WITH_BASE_DATA_EXTRACTION_PROMPT
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import run_with_timeout
logger = setup_logger()
def search_objects(
state: MainState, config: RunnableConfig, writer: StreamWriter = lambda _: None
) -> SearchSourcesObjectsUpdate:
"""
LangGraph node to start the agentic search process.
"""
graph_config = cast(GraphConfig, config["metadata"]["config"])
question = graph_config.inputs.search_request.query
search_tool = graph_config.tooling.search_tool
if search_tool is None or graph_config.inputs.search_request.persona is None:
raise ValueError("Search tool and persona must be provided for DivCon search")
try:
instructions = graph_config.inputs.search_request.persona.prompts[
0
].system_prompt
agent_1_instructions = extract_section(
instructions, "Agent Step 1:", "Agent Step 2:"
)
if agent_1_instructions is None:
raise ValueError("Agent 1 instructions not found")
agent_1_base_data = extract_section(instructions, "|Start Data|", "|End Data|")
agent_1_task = extract_section(
agent_1_instructions, "Task:", "Independent Research Sources:"
)
if agent_1_task is None:
raise ValueError("Agent 1 task not found")
agent_1_independent_sources_str = extract_section(
agent_1_instructions, "Independent Research Sources:", "Output Objective:"
)
if agent_1_independent_sources_str is None:
raise ValueError("Agent 1 Independent Research Sources not found")
document_sources = [
DocumentSource(x.strip().lower())
for x in agent_1_independent_sources_str.split(DC_OBJECT_SEPARATOR)
]
agent_1_output_objective = extract_section(
agent_1_instructions, "Output Objective:"
)
if agent_1_output_objective is None:
raise ValueError("Agent 1 output objective not found")
except Exception as e:
raise ValueError(
f"Agent 1 instructions not found or not formatted correctly: {e}"
)
# Extract objects
if agent_1_base_data is None:
# Retrieve chunks for objects
retrieved_docs = research(question, search_tool)[:10]
document_texts_list = []
for doc_num, doc in enumerate(retrieved_docs):
chunk_text = "Document " + str(doc_num) + ":\n" + doc.content
document_texts_list.append(chunk_text)
document_texts = "\n\n".join(document_texts_list)
dc_object_extraction_prompt = DC_OBJECT_NO_BASE_DATA_EXTRACTION_PROMPT.format(
question=question,
task=agent_1_task,
document_text=document_texts,
objects_of_interest=agent_1_output_objective,
)
else:
dc_object_extraction_prompt = DC_OBJECT_WITH_BASE_DATA_EXTRACTION_PROMPT.format(
question=question,
task=agent_1_task,
base_data=agent_1_base_data,
objects_of_interest=agent_1_output_objective,
)
msg = [
HumanMessage(
content=trim_prompt_piece(
config=graph_config.tooling.primary_llm.config,
prompt_piece=dc_object_extraction_prompt,
reserved_str="",
),
)
]
primary_llm = graph_config.tooling.primary_llm
# Grader
try:
llm_response = run_with_timeout(
30,
primary_llm.invoke,
prompt=msg,
timeout_override=30,
max_tokens=300,
)
cleaned_response = (
str(llm_response.content)
.replace("```json\n", "")
.replace("\n```", "")
.replace("\n", "")
)
cleaned_response = cleaned_response.split("OBJECTS:")[1]
object_list = [x.strip() for x in cleaned_response.split(";")]
except Exception as e:
raise ValueError(f"Error in search_objects: {e}")
write_custom_event(
"initial_agent_answer",
AgentAnswerPiece(
answer_piece=" Researching the individual objects for each source type... ",
level=0,
level_question_num=0,
answer_type="agent_level_answer",
),
writer,
)
return SearchSourcesObjectsUpdate(
analysis_objects=object_list,
analysis_sources=document_sources,
log_messages=["Agent 1 Task done"],
)

View File

@@ -1,185 +0,0 @@
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from typing import cast
from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnableConfig
from langgraph.types import StreamWriter
from onyx.agents.agent_search.dc_search_analysis.ops import extract_section
from onyx.agents.agent_search.dc_search_analysis.ops import research
from onyx.agents.agent_search.dc_search_analysis.states import ObjectSourceInput
from onyx.agents.agent_search.dc_search_analysis.states import (
ObjectSourceResearchUpdate,
)
from onyx.agents.agent_search.models import GraphConfig
from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
trim_prompt_piece,
)
from onyx.prompts.agents.dc_prompts import DC_OBJECT_SOURCE_RESEARCH_PROMPT
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import run_with_timeout
logger = setup_logger()
def research_object_source(
state: ObjectSourceInput,
config: RunnableConfig,
writer: StreamWriter = lambda _: None,
) -> ObjectSourceResearchUpdate:
"""
LangGraph node to start the agentic search process.
"""
datetime.now()
graph_config = cast(GraphConfig, config["metadata"]["config"])
graph_config.inputs.search_request.query
search_tool = graph_config.tooling.search_tool
question = graph_config.inputs.search_request.query
object, document_source = state.object_source_combination
if search_tool is None or graph_config.inputs.search_request.persona is None:
raise ValueError("Search tool and persona must be provided for DivCon search")
try:
instructions = graph_config.inputs.search_request.persona.prompts[
0
].system_prompt
agent_2_instructions = extract_section(
instructions, "Agent Step 2:", "Agent Step 3:"
)
if agent_2_instructions is None:
raise ValueError("Agent 2 instructions not found")
agent_2_task = extract_section(
agent_2_instructions, "Task:", "Independent Research Sources:"
)
if agent_2_task is None:
raise ValueError("Agent 2 task not found")
agent_2_time_cutoff = extract_section(
agent_2_instructions, "Time Cutoff:", "Research Topics:"
)
agent_2_research_topics = extract_section(
agent_2_instructions, "Research Topics:", "Output Objective"
)
agent_2_output_objective = extract_section(
agent_2_instructions, "Output Objective:"
)
if agent_2_output_objective is None:
raise ValueError("Agent 2 output objective not found")
except Exception:
raise ValueError(
"Agent 1 instructions not found or not formatted correctly: {e}"
)
# Populate prompt
# Retrieve chunks for objects
if agent_2_time_cutoff is not None and agent_2_time_cutoff.strip() != "":
if agent_2_time_cutoff.strip().endswith("d"):
try:
days = int(agent_2_time_cutoff.strip()[:-1])
agent_2_source_start_time = datetime.now(timezone.utc) - timedelta(
days=days
)
except ValueError:
raise ValueError(
f"Invalid time cutoff format: {agent_2_time_cutoff}. Expected format: '<number>d'"
)
else:
raise ValueError(
f"Invalid time cutoff format: {agent_2_time_cutoff}. Expected format: '<number>d'"
)
else:
agent_2_source_start_time = None
document_sources = [document_source] if document_source else None
if len(question.strip()) > 0:
research_area = f"{question} for {object}"
elif agent_2_research_topics and len(agent_2_research_topics.strip()) > 0:
research_area = f"{agent_2_research_topics} for {object}"
else:
research_area = object
retrieved_docs = research(
question=research_area,
search_tool=search_tool,
document_sources=document_sources,
time_cutoff=agent_2_source_start_time,
)
# Generate document text
document_texts_list = []
for doc_num, doc in enumerate(retrieved_docs):
chunk_text = "Document " + str(doc_num) + ":\n" + doc.content
document_texts_list.append(chunk_text)
document_texts = "\n\n".join(document_texts_list)
# Built prompt
today = datetime.now().strftime("%A, %Y-%m-%d")
dc_object_source_research_prompt = (
DC_OBJECT_SOURCE_RESEARCH_PROMPT.format(
today=today,
question=question,
task=agent_2_task,
document_text=document_texts,
format=agent_2_output_objective,
)
.replace("---object---", object)
.replace("---source---", document_source.value)
)
# Run LLM
msg = [
HumanMessage(
content=trim_prompt_piece(
config=graph_config.tooling.primary_llm.config,
prompt_piece=dc_object_source_research_prompt,
reserved_str="",
),
)
]
# fast_llm = graph_config.tooling.fast_llm
primary_llm = graph_config.tooling.primary_llm
llm = primary_llm
# Grader
try:
llm_response = run_with_timeout(
30,
llm.invoke,
prompt=msg,
timeout_override=30,
max_tokens=300,
)
cleaned_response = str(llm_response.content).replace("```json\n", "")
cleaned_response = cleaned_response.split("RESEARCH RESULTS:")[1]
object_research_results = {
"object": object,
"source": document_source.value,
"research_result": cleaned_response,
}
except Exception as e:
raise ValueError(f"Error in research_object_source: {e}")
logger.debug("DivCon Step A2 - Object Source Research - completed for an object")
return ObjectSourceResearchUpdate(
object_source_research_results=[object_research_results],
log_messages=["Agent Step 2 done for one object"],
)

View File

@@ -1,68 +0,0 @@
from collections import defaultdict
from datetime import datetime
from typing import cast
from typing import Dict
from typing import List
from langchain_core.runnables import RunnableConfig
from langgraph.types import StreamWriter
from onyx.agents.agent_search.dc_search_analysis.states import MainState
from onyx.agents.agent_search.dc_search_analysis.states import (
ObjectResearchInformationUpdate,
)
from onyx.agents.agent_search.models import GraphConfig
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
from onyx.chat.models import AgentAnswerPiece
from onyx.utils.logger import setup_logger
logger = setup_logger()
def structure_research_by_object(
state: MainState, config: RunnableConfig, writer: StreamWriter = lambda _: None
) -> ObjectResearchInformationUpdate:
"""
LangGraph node to start the agentic search process.
"""
datetime.now()
graph_config = cast(GraphConfig, config["metadata"]["config"])
graph_config.inputs.search_request.query
write_custom_event(
"initial_agent_answer",
AgentAnswerPiece(
answer_piece=" consolidating the information across source types for each object...",
level=0,
level_question_num=0,
answer_type="agent_level_answer",
),
writer,
)
object_source_research_results = state.object_source_research_results
object_research_information_results: List[Dict[str, str]] = []
object_research_information_results_list: Dict[str, List[str]] = defaultdict(list)
for object_source_research in object_source_research_results:
object = object_source_research["object"]
source = object_source_research["source"]
research_result = object_source_research["research_result"]
object_research_information_results_list[object].append(
f"Source: {source}\n{research_result}"
)
for object, information in object_research_information_results_list.items():
object_research_information_results.append(
{"object": object, "information": "\n".join(information)}
)
logger.debug("DivCon Step A3 - Object Research Information Structuring - completed")
return ObjectResearchInformationUpdate(
object_research_information_results=object_research_information_results,
log_messages=["A3 - Object Research Information structured"],
)

View File

@@ -1,107 +0,0 @@
from typing import cast
from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnableConfig
from langgraph.types import StreamWriter
from onyx.agents.agent_search.dc_search_analysis.ops import extract_section
from onyx.agents.agent_search.dc_search_analysis.states import ObjectInformationInput
from onyx.agents.agent_search.dc_search_analysis.states import ObjectResearchUpdate
from onyx.agents.agent_search.models import GraphConfig
from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
trim_prompt_piece,
)
from onyx.prompts.agents.dc_prompts import DC_OBJECT_CONSOLIDATION_PROMPT
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import run_with_timeout
logger = setup_logger()
def consolidate_object_research(
state: ObjectInformationInput,
config: RunnableConfig,
writer: StreamWriter = lambda _: None,
) -> ObjectResearchUpdate:
"""
LangGraph node to start the agentic search process.
"""
graph_config = cast(GraphConfig, config["metadata"]["config"])
graph_config.inputs.search_request.query
search_tool = graph_config.tooling.search_tool
question = graph_config.inputs.search_request.query
if search_tool is None or graph_config.inputs.search_request.persona is None:
raise ValueError("Search tool and persona must be provided for DivCon search")
instructions = graph_config.inputs.search_request.persona.prompts[0].system_prompt
agent_4_instructions = extract_section(
instructions, "Agent Step 4:", "Agent Step 5:"
)
if agent_4_instructions is None:
raise ValueError("Agent 4 instructions not found")
agent_4_output_objective = extract_section(
agent_4_instructions, "Output Objective:"
)
if agent_4_output_objective is None:
raise ValueError("Agent 4 output objective not found")
object_information = state.object_information
object = object_information["object"]
information = object_information["information"]
# Create a prompt for the object consolidation
dc_object_consolidation_prompt = DC_OBJECT_CONSOLIDATION_PROMPT.format(
question=question,
object=object,
information=information,
format=agent_4_output_objective,
)
# Run LLM
msg = [
HumanMessage(
content=trim_prompt_piece(
config=graph_config.tooling.primary_llm.config,
prompt_piece=dc_object_consolidation_prompt,
reserved_str="",
),
)
]
graph_config.tooling.primary_llm
# fast_llm = graph_config.tooling.fast_llm
primary_llm = graph_config.tooling.primary_llm
llm = primary_llm
# Grader
try:
llm_response = run_with_timeout(
30,
llm.invoke,
prompt=msg,
timeout_override=30,
max_tokens=300,
)
cleaned_response = str(llm_response.content).replace("```json\n", "")
consolidated_information = cleaned_response.split("INFORMATION:")[1]
except Exception as e:
raise ValueError(f"Error in consolidate_object_research: {e}")
object_research_results = {
"object": object,
"research_result": consolidated_information,
}
logger.debug(
"DivCon Step A4 - Object Research Consolidation - completed for an object"
)
return ObjectResearchUpdate(
object_research_results=[object_research_results],
log_messages=["Agent Source Consilidation done"],
)

View File

@@ -1,164 +0,0 @@
from datetime import datetime
from typing import cast
from langchain_core.messages import HumanMessage
from langchain_core.runnables import RunnableConfig
from langgraph.types import StreamWriter
from onyx.agents.agent_search.dc_search_analysis.ops import extract_section
from onyx.agents.agent_search.dc_search_analysis.states import MainState
from onyx.agents.agent_search.dc_search_analysis.states import ResearchUpdate
from onyx.agents.agent_search.models import GraphConfig
from onyx.agents.agent_search.shared_graph_utils.agent_prompt_ops import (
trim_prompt_piece,
)
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
from onyx.chat.models import AgentAnswerPiece
from onyx.prompts.agents.dc_prompts import DC_FORMATTING_NO_BASE_DATA_PROMPT
from onyx.prompts.agents.dc_prompts import DC_FORMATTING_WITH_BASE_DATA_PROMPT
from onyx.utils.logger import setup_logger
from onyx.utils.threadpool_concurrency import run_with_timeout
logger = setup_logger()
def consolidate_research(
state: MainState, config: RunnableConfig, writer: StreamWriter = lambda _: None
) -> ResearchUpdate:
"""
LangGraph node to start the agentic search process.
"""
datetime.now()
graph_config = cast(GraphConfig, config["metadata"]["config"])
graph_config.inputs.search_request.query
search_tool = graph_config.tooling.search_tool
write_custom_event(
"initial_agent_answer",
AgentAnswerPiece(
answer_piece=" generating the answer\n\n\n",
level=0,
level_question_num=0,
answer_type="agent_level_answer",
),
writer,
)
if search_tool is None or graph_config.inputs.search_request.persona is None:
raise ValueError("Search tool and persona must be provided for DivCon search")
# Populate prompt
instructions = graph_config.inputs.search_request.persona.prompts[0].system_prompt
try:
agent_5_instructions = extract_section(
instructions, "Agent Step 5:", "Agent End"
)
if agent_5_instructions is None:
raise ValueError("Agent 5 instructions not found")
agent_5_base_data = extract_section(instructions, "|Start Data|", "|End Data|")
agent_5_task = extract_section(
agent_5_instructions, "Task:", "Independent Research Sources:"
)
if agent_5_task is None:
raise ValueError("Agent 5 task not found")
agent_5_output_objective = extract_section(
agent_5_instructions, "Output Objective:"
)
if agent_5_output_objective is None:
raise ValueError("Agent 5 output objective not found")
except ValueError as e:
raise ValueError(
f"Instructions for Agent Step 5 were not properly formatted: {e}"
)
research_result_list = []
if agent_5_task.strip() == "*concatenate*":
object_research_results = state.object_research_results
for object_research_result in object_research_results:
object = object_research_result["object"]
research_result = object_research_result["research_result"]
research_result_list.append(f"Object: {object}\n\n{research_result}")
research_results = "\n\n".join(research_result_list)
else:
raise NotImplementedError("Only '*concatenate*' is currently supported")
# Create a prompt for the object consolidation
if agent_5_base_data is None:
dc_formatting_prompt = DC_FORMATTING_NO_BASE_DATA_PROMPT.format(
text=research_results,
format=agent_5_output_objective,
)
else:
dc_formatting_prompt = DC_FORMATTING_WITH_BASE_DATA_PROMPT.format(
base_data=agent_5_base_data,
text=research_results,
format=agent_5_output_objective,
)
# Run LLM
msg = [
HumanMessage(
content=trim_prompt_piece(
config=graph_config.tooling.primary_llm.config,
prompt_piece=dc_formatting_prompt,
reserved_str="",
),
)
]
dispatch_timings: list[float] = []
primary_model = graph_config.tooling.primary_llm
def stream_initial_answer() -> list[str]:
response: list[str] = []
for message in primary_model.stream(msg, timeout_override=30, max_tokens=None):
# TODO: in principle, the answer here COULD contain images, but we don't support that yet
content = message.content
if not isinstance(content, str):
raise ValueError(
f"Expected content to be a string, but got {type(content)}"
)
start_stream_token = datetime.now()
write_custom_event(
"initial_agent_answer",
AgentAnswerPiece(
answer_piece=content,
level=0,
level_question_num=0,
answer_type="agent_level_answer",
),
writer,
)
end_stream_token = datetime.now()
dispatch_timings.append(
(end_stream_token - start_stream_token).microseconds
)
response.append(content)
return response
try:
_ = run_with_timeout(
60,
stream_initial_answer,
)
except Exception as e:
raise ValueError(f"Error in consolidate_research: {e}")
logger.debug("DivCon Step A5 - Final Generation - completed")
return ResearchUpdate(
research_results=research_results,
log_messages=["Agent Source Consilidation done"],
)

View File

@@ -1,61 +0,0 @@
from datetime import datetime
from typing import cast
from onyx.chat.models import LlmDoc
from onyx.configs.constants import DocumentSource
from onyx.context.search.models import InferenceSection
from onyx.db.engine import get_session_with_current_tenant
from onyx.tools.models import SearchToolOverrideKwargs
from onyx.tools.tool_implementations.search.search_tool import (
FINAL_CONTEXT_DOCUMENTS_ID,
)
from onyx.tools.tool_implementations.search.search_tool import SearchTool
def research(
question: str,
search_tool: SearchTool,
document_sources: list[DocumentSource] | None = None,
time_cutoff: datetime | None = None,
) -> list[LlmDoc]:
# new db session to avoid concurrency issues
callback_container: list[list[InferenceSection]] = []
retrieved_docs: list[LlmDoc] = []
with get_session_with_current_tenant() as db_session:
for tool_response in search_tool.run(
query=question,
override_kwargs=SearchToolOverrideKwargs(
force_no_rerank=False,
alternate_db_session=db_session,
retrieved_sections_callback=callback_container.append,
skip_query_analysis=True,
document_sources=document_sources,
time_cutoff=time_cutoff,
),
):
# get retrieved docs to send to the rest of the graph
if tool_response.id == FINAL_CONTEXT_DOCUMENTS_ID:
retrieved_docs = cast(list[LlmDoc], tool_response.response)[:10]
break
return retrieved_docs
def extract_section(
text: str, start_marker: str, end_marker: str | None = None
) -> str | None:
"""Extract text between markers, returning None if markers not found"""
parts = text.split(start_marker)
if len(parts) == 1:
return None
after_start = parts[1].strip()
if not end_marker:
return after_start
extract = after_start.split(end_marker)[0]
return extract.strip()

View File

@@ -1,72 +0,0 @@
from operator import add
from typing import Annotated
from typing import Dict
from typing import TypedDict
from pydantic import BaseModel
from onyx.agents.agent_search.core_state import CoreState
from onyx.agents.agent_search.orchestration.states import ToolCallUpdate
from onyx.agents.agent_search.orchestration.states import ToolChoiceInput
from onyx.agents.agent_search.orchestration.states import ToolChoiceUpdate
from onyx.configs.constants import DocumentSource
### States ###
class LoggerUpdate(BaseModel):
log_messages: Annotated[list[str], add] = []
class SearchSourcesObjectsUpdate(LoggerUpdate):
analysis_objects: list[str] = []
analysis_sources: list[DocumentSource] = []
class ObjectSourceInput(LoggerUpdate):
object_source_combination: tuple[str, DocumentSource]
class ObjectSourceResearchUpdate(LoggerUpdate):
object_source_research_results: Annotated[list[Dict[str, str]], add] = []
class ObjectInformationInput(LoggerUpdate):
object_information: Dict[str, str]
class ObjectResearchInformationUpdate(LoggerUpdate):
object_research_information_results: Annotated[list[Dict[str, str]], add] = []
class ObjectResearchUpdate(LoggerUpdate):
object_research_results: Annotated[list[Dict[str, str]], add] = []
class ResearchUpdate(LoggerUpdate):
research_results: str | None = None
## Graph Input State
class MainInput(CoreState):
pass
## Graph State
class MainState(
# This includes the core state
MainInput,
ToolChoiceInput,
ToolCallUpdate,
ToolChoiceUpdate,
SearchSourcesObjectsUpdate,
ObjectSourceResearchUpdate,
ObjectResearchInformationUpdate,
ObjectResearchUpdate,
ResearchUpdate,
):
pass
## Graph Output State - presently not used
class MainOutput(TypedDict):
log_messages: list[str]

View File

@@ -8,10 +8,6 @@ from langgraph.graph.state import CompiledStateGraph
from onyx.agents.agent_search.basic.graph_builder import basic_graph_builder
from onyx.agents.agent_search.basic.states import BasicInput
from onyx.agents.agent_search.dc_search_analysis.graph_builder import (
divide_and_conquer_graph_builder,
)
from onyx.agents.agent_search.dc_search_analysis.states import MainInput as DCMainInput
from onyx.agents.agent_search.deep_search.main.graph_builder import (
main_graph_builder as main_graph_builder_a,
)
@@ -86,7 +82,7 @@ def _parse_agent_event(
def manage_sync_streaming(
compiled_graph: CompiledStateGraph,
config: GraphConfig,
graph_input: BasicInput | MainInput | DCMainInput,
graph_input: BasicInput | MainInput,
) -> Iterable[StreamEvent]:
message_id = config.persistence.message_id if config.persistence else None
for event in compiled_graph.stream(
@@ -100,7 +96,7 @@ def manage_sync_streaming(
def run_graph(
compiled_graph: CompiledStateGraph,
config: GraphConfig,
input: BasicInput | MainInput | DCMainInput,
input: BasicInput | MainInput,
) -> AnswerStream:
config.behavior.perform_initial_search_decomposition = (
INITIAL_SEARCH_DECOMPOSITION_ENABLED
@@ -150,16 +146,6 @@ def run_basic_graph(
return run_graph(compiled_graph, config, input)
def run_dc_graph(
config: GraphConfig,
) -> AnswerStream:
graph = divide_and_conquer_graph_builder()
compiled_graph = graph.compile()
input = DCMainInput(log_messages=[])
config.inputs.search_request.query = config.inputs.search_request.query.strip()
return run_graph(compiled_graph, config, input)
if __name__ == "__main__":
for _ in range(1):
query_start_time = datetime.now()

View File

@@ -180,35 +180,3 @@ def binary_string_test_after_answer_separator(
relevant_text = text.split(f"{separator}")[-1]
return binary_string_test(relevant_text, positive_value)
def build_dc_search_prompt(
question: str,
original_question: str,
docs: list[InferenceSection],
persona_specification: str,
config: LLMConfig,
) -> list[SystemMessage | HumanMessage | AIMessage | ToolMessage]:
system_message = SystemMessage(
content=persona_specification,
)
date_str = build_date_time_string()
docs_str = format_docs(docs)
docs_str = trim_prompt_piece(
config,
docs_str,
SUB_QUESTION_RAG_PROMPT + question + original_question + date_str,
)
human_message = HumanMessage(
content=SUB_QUESTION_RAG_PROMPT.format(
question=question,
original_question=original_question,
context=docs_str,
date_prompt=date_str,
)
)
return [system_message, human_message]

View File

@@ -10,7 +10,6 @@ from onyx.agents.agent_search.models import GraphPersistence
from onyx.agents.agent_search.models import GraphSearchConfig
from onyx.agents.agent_search.models import GraphTooling
from onyx.agents.agent_search.run_graph import run_basic_graph
from onyx.agents.agent_search.run_graph import run_dc_graph
from onyx.agents.agent_search.run_graph import run_main_graph
from onyx.chat.models import AgentAnswerPiece
from onyx.chat.models import AnswerPacket
@@ -143,18 +142,11 @@ class Answer:
yield from self._processed_stream
return
if self.graph_config.behavior.use_agentic_search:
run_langgraph = run_main_graph
elif (
self.graph_config.inputs.search_request.persona
and self.graph_config.inputs.search_request.persona.description.startswith(
"DivCon Beta Agent"
)
):
run_langgraph = run_dc_graph
else:
run_langgraph = run_basic_graph
run_langgraph = (
run_main_graph
if self.graph_config.behavior.use_agentic_search
else run_basic_graph
)
stream = run_langgraph(
self.graph_config,
)

View File

@@ -1,5 +1,4 @@
import base64
import time
from collections.abc import Generator
from datetime import datetime
from datetime import timedelta
@@ -8,8 +7,6 @@ from typing import Any
from typing import cast
import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
from onyx.configs.app_configs import CONTINUE_ON_CONNECTOR_FAILURE
from onyx.configs.app_configs import GONG_CONNECTOR_START_TIME
@@ -24,14 +21,13 @@ from onyx.connectors.models import Document
from onyx.connectors.models import TextSection
from onyx.utils.logger import setup_logger
logger = setup_logger()
GONG_BASE_URL = "https://us-34014.api.gong.io"
class GongConnector(LoadConnector, PollConnector):
BASE_URL = "https://api.gong.io"
MAX_CALL_DETAILS_ATTEMPTS = 6
CALL_DETAILS_DELAY = 30 # in seconds
def __init__(
self,
workspaces: list[str] | None = None,
@@ -45,23 +41,15 @@ class GongConnector(LoadConnector, PollConnector):
self.auth_token_basic: str | None = None
self.hide_user_info = hide_user_info
retry_strategy = Retry(
total=5,
backoff_factor=2,
status_forcelist=[429, 500, 502, 503, 504],
)
def _get_auth_header(self) -> dict[str, str]:
if self.auth_token_basic is None:
raise ConnectorMissingCredentialError("Gong")
session = requests.Session()
session.mount(GongConnector.BASE_URL, HTTPAdapter(max_retries=retry_strategy))
self._session = session
@staticmethod
def make_url(endpoint: str) -> str:
url = f"{GongConnector.BASE_URL}{endpoint}"
return url
return {"Authorization": f"Basic {self.auth_token_basic}"}
def _get_workspace_id_map(self) -> dict[str, str]:
response = self._session.get(GongConnector.make_url("/v2/workspaces"))
url = f"{GONG_BASE_URL}/v2/workspaces"
response = requests.get(url, headers=self._get_auth_header())
response.raise_for_status()
workspaces_details = response.json().get("workspaces")
@@ -78,6 +66,7 @@ class GongConnector(LoadConnector, PollConnector):
def _get_transcript_batches(
self, start_datetime: str | None = None, end_datetime: str | None = None
) -> Generator[list[dict[str, Any]], None, None]:
url = f"{GONG_BASE_URL}/v2/calls/transcript"
body: dict[str, dict] = {"filter": {}}
if start_datetime:
body["filter"]["fromDateTime"] = start_datetime
@@ -105,8 +94,8 @@ class GongConnector(LoadConnector, PollConnector):
del body["filter"]["workspaceId"]
while True:
response = self._session.post(
GongConnector.make_url("/v2/calls/transcript"), json=body
response = requests.post(
url, headers=self._get_auth_header(), json=body
)
# If no calls in the range, just break out
if response.status_code == 404:
@@ -136,14 +125,14 @@ class GongConnector(LoadConnector, PollConnector):
yield transcripts
def _get_call_details_by_ids(self, call_ids: list[str]) -> dict:
url = f"{GONG_BASE_URL}/v2/calls/extensive"
body = {
"filter": {"callIds": call_ids},
"contentSelector": {"exposedFields": {"parties": True}},
}
response = self._session.post(
GongConnector.make_url("/v2/calls/extensive"), json=body
)
response = requests.post(url, headers=self._get_auth_header(), json=body)
response.raise_for_status()
calls = response.json().get("calls")
@@ -176,74 +165,24 @@ class GongConnector(LoadConnector, PollConnector):
def _fetch_calls(
self, start_datetime: str | None = None, end_datetime: str | None = None
) -> GenerateDocumentsOutput:
num_calls = 0
for transcript_batch in self._get_transcript_batches(
start_datetime, end_datetime
):
doc_batch: list[Document] = []
transcript_call_ids = cast(
call_ids = cast(
list[str],
[t.get("callId") for t in transcript_batch if t.get("callId")],
)
call_details_map = self._get_call_details_by_ids(call_ids)
call_details_map: dict[str, Any] = {}
# There's a likely race condition in the API where a transcript will have a
# call id but the call to v2/calls/extensive will not return all of the id's
# retry with exponential backoff has been observed to mitigate this
# in ~2 minutes
current_attempt = 0
while True:
current_attempt += 1
call_details_map = self._get_call_details_by_ids(transcript_call_ids)
if set(transcript_call_ids) == set(call_details_map.keys()):
# we got all the id's we were expecting ... break and continue
break
# we are missing some id's. Log and retry with exponential backoff
missing_call_ids = set(transcript_call_ids) - set(
call_details_map.keys()
)
logger.warning(
f"_get_call_details_by_ids is missing call id's: "
f"current_attempt={current_attempt} "
f"missing_call_ids={missing_call_ids}"
)
if current_attempt >= self.MAX_CALL_DETAILS_ATTEMPTS:
raise RuntimeError(
f"Attempt count exceeded for _get_call_details_by_ids: "
f"missing_call_ids={missing_call_ids} "
f"max_attempts={self.MAX_CALL_DETAILS_ATTEMPTS}"
)
wait_seconds = self.CALL_DETAILS_DELAY * pow(2, current_attempt - 1)
logger.warning(
f"_get_call_details_by_ids waiting to retry: "
f"wait={wait_seconds}s "
f"current_attempt={current_attempt} "
f"next_attempt={current_attempt+1} "
f"max_attempts={self.MAX_CALL_DETAILS_ATTEMPTS}"
)
time.sleep(wait_seconds)
# now we can iterate per call/transcript
for transcript in transcript_batch:
call_id = transcript.get("callId")
if not call_id or call_id not in call_details_map:
# NOTE(rkuo): seeing odd behavior where call_ids from the transcript
# don't have call details. adding error debugging logs to trace.
logger.error(
f"Couldn't get call information for Call ID: {call_id}"
)
if call_id:
logger.error(
f"Call debug info: call_id={call_id} "
f"call_ids={transcript_call_ids} "
f"call_details_map={call_details_map.keys()}"
)
if not self.continue_on_fail:
raise RuntimeError(
f"Couldn't get call information for Call ID: {call_id}"
@@ -256,8 +195,7 @@ class GongConnector(LoadConnector, PollConnector):
call_time_str = call_metadata["started"]
call_title = call_metadata["title"]
logger.info(
f"{num_calls+1}: Indexing Gong call id {call_id} "
f"from {call_time_str.split('T', 1)[0]}: {call_title}"
f"Indexing Gong call from {call_time_str.split('T', 1)[0]}: {call_title}"
)
call_parties = cast(list[dict] | None, call_details.get("parties"))
@@ -316,13 +254,8 @@ class GongConnector(LoadConnector, PollConnector):
metadata={"client": call_metadata.get("system")},
)
)
num_calls += 1
yield doc_batch
logger.info(f"_fetch_calls finished: num_calls={num_calls}")
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
combined = (
f'{credentials["gong_access_key"]}:{credentials["gong_access_key_secret"]}'
@@ -330,13 +263,6 @@ class GongConnector(LoadConnector, PollConnector):
self.auth_token_basic = base64.b64encode(combined.encode("utf-8")).decode(
"utf-8"
)
if self.auth_token_basic is None:
raise ConnectorMissingCredentialError("Gong")
self._session.headers.update(
{"Authorization": f"Basic {self.auth_token_basic}"}
)
return None
def load_from_state(self) -> GenerateDocumentsOutput:

View File

@@ -20,8 +20,7 @@ from onyx.connectors.models import ConnectorMissingCredentialError
from onyx.connectors.models import Document
from onyx.connectors.models import SlimDocument
from onyx.connectors.models import TextSection
from onyx.file_processing.extract_file_text import ACCEPTED_DOCUMENT_FILE_EXTENSIONS
from onyx.file_processing.extract_file_text import ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS
from onyx.file_processing.extract_file_text import ALL_ACCEPTED_FILE_EXTENSIONS
from onyx.file_processing.extract_file_text import extract_file_text
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
@@ -85,21 +84,14 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
Populate the spot ID map with all available spots.
Keys are stored as lowercase for case-insensitive lookups.
"""
try:
spots = self.client.get_spots()
for spot in spots:
if "title" in spot and "id" in spot:
spot_name = spot["title"]
self._spot_id_map[spot_name.lower()] = spot["id"]
spots = self.client.get_spots()
for spot in spots:
if "title" in spot and "id" in spot:
spot_name = spot["title"]
self._spot_id_map[spot_name.lower()] = spot["id"]
self._all_spots_fetched = True
logger.info(f"Retrieved {len(self._spot_id_map)} spots from Highspot")
except HighspotClientError as e:
logger.error(f"Error retrieving spots from Highspot: {str(e)}")
raise
except Exception as e:
logger.error(f"Unexpected error retrieving spots from Highspot: {str(e)}")
raise
self._all_spots_fetched = True
logger.info(f"Retrieved {len(self._spot_id_map)} spots from Highspot")
def _get_all_spot_names(self) -> List[str]:
"""
@@ -159,142 +151,116 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
Batches of Document objects
"""
doc_batch: list[Document] = []
try:
# If no spots specified, get all spots
spot_names_to_process = self.spot_names
if not spot_names_to_process:
spot_names_to_process = self._get_all_spot_names()
if not spot_names_to_process:
logger.warning("No spots found in Highspot")
raise ValueError("No spots found in Highspot")
logger.info(
f"No spots specified, using all {len(spot_names_to_process)} available spots"
)
for spot_name in spot_names_to_process:
try:
spot_id = self._get_spot_id_from_name(spot_name)
if spot_id is None:
logger.warning(f"Spot ID not found for spot {spot_name}")
continue
offset = 0
has_more = True
# If no spots specified, get all spots
spot_names_to_process = self.spot_names
if not spot_names_to_process:
spot_names_to_process = self._get_all_spot_names()
logger.info(
f"No spots specified, using all {len(spot_names_to_process)} available spots"
)
while has_more:
logger.info(
f"Retrieving items from spot {spot_name}, offset {offset}"
)
response = self.client.get_spot_items(
spot_id=spot_id, offset=offset, page_size=self.batch_size
)
items = response.get("collection", [])
logger.info(f"Received Items: {items}")
if not items:
has_more = False
continue
for spot_name in spot_names_to_process:
try:
spot_id = self._get_spot_id_from_name(spot_name)
if spot_id is None:
logger.warning(f"Spot ID not found for spot {spot_name}")
continue
offset = 0
has_more = True
for item in items:
try:
item_id = item.get("id")
if not item_id:
logger.warning("Item without ID found, skipping")
continue
item_details = self.client.get_item(item_id)
if not item_details:
logger.warning(
f"Item {item_id} details not found, skipping"
)
continue
# Apply time filter if specified
if start or end:
updated_at = item_details.get("date_updated")
if updated_at:
# Convert to datetime for comparison
try:
updated_time = datetime.fromisoformat(
updated_at.replace("Z", "+00:00")
)
if (
start
and updated_time.timestamp() < start
) or (
end and updated_time.timestamp() > end
):
continue
except (ValueError, TypeError):
# Skip if date cannot be parsed
logger.warning(
f"Invalid date format for item {item_id}: {updated_at}"
)
continue
content = self._get_item_content(item_details)
title = item_details.get("title", "")
doc_batch.append(
Document(
id=f"HIGHSPOT_{item_id}",
sections=[
TextSection(
link=item_details.get(
"url",
f"https://www.highspot.com/items/{item_id}",
),
text=content,
)
],
source=DocumentSource.HIGHSPOT,
semantic_identifier=title,
metadata={
"spot_name": spot_name,
"type": item_details.get(
"content_type", ""
),
"created_at": item_details.get(
"date_added", ""
),
"author": item_details.get("author", ""),
"language": item_details.get(
"language", ""
),
"can_download": str(
item_details.get("can_download", False)
),
},
doc_updated_at=item_details.get("date_updated"),
)
)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
except HighspotClientError as e:
item_id = "ID" if not item_id else item_id
logger.error(
f"Error retrieving item {item_id}: {str(e)}"
)
except Exception as e:
item_id = "ID" if not item_id else item_id
logger.error(
f"Unexpected error for item {item_id}: {str(e)}"
)
has_more = len(items) >= self.batch_size
offset += self.batch_size
except (HighspotClientError, ValueError) as e:
logger.error(f"Error processing spot {spot_name}: {str(e)}")
except Exception as e:
logger.error(
f"Unexpected error processing spot {spot_name}: {str(e)}"
while has_more:
logger.info(
f"Retrieving items from spot {spot_name}, offset {offset}"
)
response = self.client.get_spot_items(
spot_id=spot_id, offset=offset, page_size=self.batch_size
)
items = response.get("collection", [])
logger.info(f"Received Items: {items}")
if not items:
has_more = False
continue
except Exception as e:
logger.error(f"Error in Highspot connector: {str(e)}")
raise
for item in items:
try:
item_id = item.get("id")
if not item_id:
logger.warning("Item without ID found, skipping")
continue
item_details = self.client.get_item(item_id)
if not item_details:
logger.warning(
f"Item {item_id} details not found, skipping"
)
continue
# Apply time filter if specified
if start or end:
updated_at = item_details.get("date_updated")
if updated_at:
# Convert to datetime for comparison
try:
updated_time = datetime.fromisoformat(
updated_at.replace("Z", "+00:00")
)
if (
start and updated_time.timestamp() < start
) or (end and updated_time.timestamp() > end):
continue
except (ValueError, TypeError):
# Skip if date cannot be parsed
logger.warning(
f"Invalid date format for item {item_id}: {updated_at}"
)
continue
content = self._get_item_content(item_details)
title = item_details.get("title", "")
doc_batch.append(
Document(
id=f"HIGHSPOT_{item_id}",
sections=[
TextSection(
link=item_details.get(
"url",
f"https://www.highspot.com/items/{item_id}",
),
text=content,
)
],
source=DocumentSource.HIGHSPOT,
semantic_identifier=title,
metadata={
"spot_name": spot_name,
"type": item_details.get("content_type", ""),
"created_at": item_details.get(
"date_added", ""
),
"author": item_details.get("author", ""),
"language": item_details.get("language", ""),
"can_download": str(
item_details.get("can_download", False)
),
},
doc_updated_at=item_details.get("date_updated"),
)
)
if len(doc_batch) >= self.batch_size:
yield doc_batch
doc_batch = []
except HighspotClientError as e:
item_id = "ID" if not item_id else item_id
logger.error(f"Error retrieving item {item_id}: {str(e)}")
has_more = len(items) >= self.batch_size
offset += self.batch_size
except (HighspotClientError, ValueError) as e:
logger.error(f"Error processing spot {spot_name}: {str(e)}")
if doc_batch:
yield doc_batch
@@ -320,9 +286,7 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
# Extract title and description once at the beginning
title, description = self._extract_title_and_description(item_details)
default_content = f"{title}\n{description}"
logger.info(
f"Processing item {item_id} with extension {file_extension} and file name {content_name}"
)
logger.info(f"Processing item {item_id} with extension {file_extension}")
try:
if content_type == "WebLink":
@@ -334,39 +298,30 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
elif (
is_valid_format
and (
file_extension in ACCEPTED_PLAIN_TEXT_FILE_EXTENSIONS
or file_extension in ACCEPTED_DOCUMENT_FILE_EXTENSIONS
)
and file_extension in ALL_ACCEPTED_FILE_EXTENSIONS
and can_download
):
# For documents, try to get the text content
if not item_id: # Ensure item_id is defined
return default_content
content_response = self.client.get_item_content(item_id)
# Process and extract text from binary content based on type
if content_response:
text_content = extract_file_text(
BytesIO(content_response), content_name, False
BytesIO(content_response), content_name
)
return text_content if text_content else default_content
return text_content
return default_content
else:
return default_content
except HighspotClientError as e:
error_context = f"item {item_id}" if item_id else "(item id not found)"
# Use item_id safely in the warning message
error_context = f"item {item_id}" if item_id else "item"
logger.warning(f"Could not retrieve content for {error_context}: {str(e)}")
return default_content
except ValueError as e:
error_context = f"item {item_id}" if item_id else "(item id not found)"
logger.error(f"Value error for {error_context}: {str(e)}")
return default_content
except Exception as e:
error_context = f"item {item_id}" if item_id else "(item id not found)"
logger.error(
f"Unexpected error retrieving content for {error_context}: {str(e)}"
)
return default_content
return ""
def _extract_title_and_description(
self, item_details: Dict[str, Any]
@@ -403,63 +358,55 @@ class HighspotConnector(LoadConnector, PollConnector, SlimConnector):
Batches of SlimDocument objects
"""
slim_doc_batch: list[SlimDocument] = []
try:
# If no spots specified, get all spots
spot_names_to_process = self.spot_names
if not spot_names_to_process:
spot_names_to_process = self._get_all_spot_names()
if not spot_names_to_process:
logger.warning("No spots found in Highspot")
raise ValueError("No spots found in Highspot")
logger.info(
f"No spots specified, using all {len(spot_names_to_process)} available spots for slim documents"
)
for spot_name in spot_names_to_process:
try:
spot_id = self._get_spot_id_from_name(spot_name)
offset = 0
has_more = True
# If no spots specified, get all spots
spot_names_to_process = self.spot_names
if not spot_names_to_process:
spot_names_to_process = self._get_all_spot_names()
logger.info(
f"No spots specified, using all {len(spot_names_to_process)} available spots for slim documents"
)
while has_more:
logger.info(
f"Retrieving slim documents from spot {spot_name}, offset {offset}"
)
response = self.client.get_spot_items(
spot_id=spot_id, offset=offset, page_size=self.batch_size
)
for spot_name in spot_names_to_process:
try:
spot_id = self._get_spot_id_from_name(spot_name)
offset = 0
has_more = True
items = response.get("collection", [])
if not items:
has_more = False
continue
for item in items:
item_id = item.get("id")
if not item_id:
continue
slim_doc_batch.append(
SlimDocument(id=f"HIGHSPOT_{item_id}")
)
if len(slim_doc_batch) >= _SLIM_BATCH_SIZE:
yield slim_doc_batch
slim_doc_batch = []
has_more = len(items) >= self.batch_size
offset += self.batch_size
except (HighspotClientError, ValueError) as e:
logger.error(
f"Error retrieving slim documents from spot {spot_name}: {str(e)}"
while has_more:
logger.info(
f"Retrieving slim documents from spot {spot_name}, offset {offset}"
)
response = self.client.get_spot_items(
spot_id=spot_id, offset=offset, page_size=self.batch_size
)
if slim_doc_batch:
yield slim_doc_batch
except Exception as e:
logger.error(f"Error in Highspot Slim Connector: {str(e)}")
raise
items = response.get("collection", [])
if not items:
has_more = False
continue
for item in items:
item_id = item.get("id")
if not item_id:
continue
slim_doc_batch.append(SlimDocument(id=f"HIGHSPOT_{item_id}"))
if len(slim_doc_batch) >= _SLIM_BATCH_SIZE:
yield slim_doc_batch
slim_doc_batch = []
has_more = len(items) >= self.batch_size
offset += self.batch_size
except (HighspotClientError, ValueError) as e:
logger.error(
f"Error retrieving slim documents from spot {spot_name}: {str(e)}"
)
if slim_doc_batch:
yield slim_doc_batch
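The slim-document loop above flushes a batch whenever it reaches _SLIM_BATCH_SIZE; a generic sketch of that flush-at-threshold pattern, assuming nothing beyond the standard library:

from collections.abc import Iterable, Iterator
from typing import TypeVar

T = TypeVar("T")

def batched(items: Iterable[T], size: int) -> Iterator[list[T]]:
    # Generic form of the batching used for slim documents above.
    current: list[T] = []
    for item in items:
        current.append(item)
        if len(current) >= size:
            yield current
            current = []
    if current:
        yield current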
def validate_credentials(self) -> bool:
"""

View File

@@ -1,4 +1,3 @@
import sys
from datetime import datetime
from enum import Enum
from typing import Any
@@ -41,9 +40,6 @@ class TextSection(Section):
text: str
link: str | None = None
def __sizeof__(self) -> int:
return sys.getsizeof(self.text) + sys.getsizeof(self.link)
class ImageSection(Section):
"""Section containing an image reference"""
@@ -51,9 +47,6 @@ class ImageSection(Section):
image_file_name: str
link: str | None = None
def __sizeof__(self) -> int:
return sys.getsizeof(self.image_file_name) + sys.getsizeof(self.link)
class BasicExpertInfo(BaseModel):
"""Basic Information for the owner of a document, any of the fields can be left as None
@@ -117,14 +110,6 @@ class BasicExpertInfo(BaseModel):
)
)
def __sizeof__(self) -> int:
size = sys.getsizeof(self.display_name)
size += sys.getsizeof(self.first_name)
size += sys.getsizeof(self.middle_initial)
size += sys.getsizeof(self.last_name)
size += sys.getsizeof(self.email)
return size
class DocumentBase(BaseModel):
"""Used for Onyx ingestion api, the ID is inferred before use if not provided"""
@@ -178,32 +163,6 @@ class DocumentBase(BaseModel):
attributes.append(k + INDEX_SEPARATOR + v)
return attributes
def __sizeof__(self) -> int:
size = sys.getsizeof(self.id)
for section in self.sections:
size += sys.getsizeof(section)
size += sys.getsizeof(self.source)
size += sys.getsizeof(self.semantic_identifier)
size += sys.getsizeof(self.doc_updated_at)
size += sys.getsizeof(self.chunk_count)
if self.primary_owners is not None:
for primary_owner in self.primary_owners:
size += sys.getsizeof(primary_owner)
else:
size += sys.getsizeof(self.primary_owners)
if self.secondary_owners is not None:
for secondary_owner in self.secondary_owners:
size += sys.getsizeof(secondary_owner)
else:
size += sys.getsizeof(self.secondary_owners)
size += sys.getsizeof(self.title)
size += sys.getsizeof(self.from_ingestion_api)
size += sys.getsizeof(self.additional_info)
return size
def get_text_content(self) -> str:
return " ".join([section.text for section in self.sections if section.text])
@@ -235,12 +194,6 @@ class Document(DocumentBase):
from_ingestion_api=base.from_ingestion_api,
)
def __sizeof__(self) -> int:
size = super().__sizeof__()
size += sys.getsizeof(self.id)
size += sys.getsizeof(self.source)
return size
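The removed __sizeof__ overrides approximated per-document memory by summing sys.getsizeof over the model's fields; a minimal illustration of that shallow-sizing pattern on a made-up class:

import sys

class Note:
    def __init__(self, title: str, body: str) -> None:
        self.title = title
        self.body = body

    def __sizeof__(self) -> int:
        # Shallow approximation in the style of the removed overrides:
        # sums the field sizes, does not traverse nested objects.
        return sys.getsizeof(self.title) + sys.getsizeof(self.body)

print(sys.getsizeof(Note("title", "x" * 1024)))  # reflects the 1 KiB body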
class IndexingDocument(Document):
"""Document with processed sections for indexing"""

View File

@@ -1,9 +1,4 @@
import gc
import os
import sys
import tempfile
from collections import defaultdict
from pathlib import Path
from typing import Any
from simple_salesforce import Salesforce
@@ -26,13 +21,9 @@ from onyx.connectors.salesforce.salesforce_calls import get_all_children_of_sf_t
from onyx.connectors.salesforce.sqlite_functions import get_affected_parent_ids_by_type
from onyx.connectors.salesforce.sqlite_functions import get_record
from onyx.connectors.salesforce.sqlite_functions import init_db
from onyx.connectors.salesforce.sqlite_functions import sqlite_log_stats
from onyx.connectors.salesforce.sqlite_functions import update_sf_db_with_csv
from onyx.connectors.salesforce.utils import BASE_DATA_PATH
from onyx.connectors.salesforce.utils import get_sqlite_db_path
from onyx.indexing.indexing_heartbeat import IndexingHeartbeatInterface
from onyx.utils.logger import setup_logger
from shared_configs.configs import MULTI_TENANT
logger = setup_logger()
@@ -41,8 +32,6 @@ _DEFAULT_PARENT_OBJECT_TYPES = ["Account"]
class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
MAX_BATCH_BYTES = 1024 * 1024
def __init__(
self,
batch_size: int = INDEX_BATCH_SIZE,
@@ -75,45 +64,22 @@ class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
raise ConnectorMissingCredentialError("Salesforce")
return self._sf_client
@staticmethod
def reconstruct_object_types(directory: str) -> dict[str, list[str] | None]:
"""
Scans the given directory for all CSV files and reconstructs the available object types.
Assumes filenames are formatted as "ObjectType.filename.csv" or "ObjectType.csv".
Args:
directory (str): The path to the directory containing CSV files.
Returns:
dict[str, list[str]]: A dictionary mapping object types to lists of file paths.
"""
object_types = defaultdict(list)
for filename in os.listdir(directory):
if filename.endswith(".csv"):
parts = filename.split(".", 1) # Split on the first period
object_type = parts[0] # Take the first part as the object type
object_types[object_type].append(os.path.join(directory, filename))
return dict(object_types)
@staticmethod
def _download_object_csvs(
directory: str,
parent_object_list: list[str],
sf_client: Salesforce,
def _fetch_from_salesforce(
self,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> None:
all_object_types: set[str] = set(parent_object_list)
) -> GenerateDocumentsOutput:
init_db()
all_object_types: set[str] = set(self.parent_object_list)
logger.info(
f"Parent object types: num={len(parent_object_list)} list={parent_object_list}"
)
logger.info(f"Starting with {len(self.parent_object_list)} parent object types")
logger.debug(f"Parent object types: {self.parent_object_list}")
# This takes like 20 seconds
for parent_object_type in parent_object_list:
child_types = get_all_children_of_sf_type(sf_client, parent_object_type)
for parent_object_type in self.parent_object_list:
child_types = get_all_children_of_sf_type(
self.sf_client, parent_object_type
)
all_object_types.update(child_types)
logger.debug(
f"Found {len(child_types)} child types for {parent_object_type}"
@@ -122,53 +88,20 @@ class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
# Always want to make sure user is grabbed for permissioning purposes
all_object_types.add("User")
logger.info(
f"All object types: num={len(all_object_types)} list={all_object_types}"
)
# gc.collect()
logger.info(f"Found total of {len(all_object_types)} object types to fetch")
logger.debug(f"All object types: {all_object_types}")
# checkpoint - we've found all object types, now time to fetch the data
logger.info("Fetching CSVs for all object types")
logger.info("Starting to fetch CSVs for all object types")
# This takes like 30 minutes first time and <2 minutes for updates
object_type_to_csv_path = fetch_all_csvs_in_parallel(
sf_client=sf_client,
sf_client=self.sf_client,
object_types=all_object_types,
start=start,
end=end,
target_dir=directory,
)
# print useful information
num_csvs = 0
num_bytes = 0
for object_type, csv_paths in object_type_to_csv_path.items():
if not csv_paths:
continue
for csv_path in csv_paths:
if not csv_path:
continue
file_path = Path(csv_path)
file_size = file_path.stat().st_size
num_csvs += 1
num_bytes += file_size
logger.info(
f"CSV info: object_type={object_type} path={csv_path} bytes={file_size}"
)
logger.info(f"CSV info total: total_csvs={num_csvs} total_bytes={num_bytes}")
@staticmethod
def _load_csvs_to_db(csv_directory: str, db_directory: str) -> set[str]:
updated_ids: set[str] = set()
object_type_to_csv_path = SalesforceConnector.reconstruct_object_types(
csv_directory
)
# This takes like 10 seconds
# This is for testing the rest of the functionality if data has
# already been fetched and put in sqlite
@@ -187,16 +120,10 @@ class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
# If path is None, it means it failed to fetch the csv
if csv_paths is None:
continue
# Go through each csv path and use it to update the db
for csv_path in csv_paths:
logger.debug(
f"Processing CSV: object_type={object_type} "
f"csv={csv_path} "
f"len={Path(csv_path).stat().st_size}"
)
logger.debug(f"Updating {object_type} with {csv_path}")
new_ids = update_sf_db_with_csv(
db_directory,
object_type=object_type,
csv_download_path=csv_path,
)
@@ -205,127 +132,49 @@ class SalesforceConnector(LoadConnector, PollConnector, SlimConnector):
f"Added {len(new_ids)} new/updated records for {object_type}"
)
os.remove(csv_path)
return updated_ids
def _fetch_from_salesforce(
self,
temp_dir: str,
start: SecondsSinceUnixEpoch | None = None,
end: SecondsSinceUnixEpoch | None = None,
) -> GenerateDocumentsOutput:
logger.info("_fetch_from_salesforce starting.")
if not self._sf_client:
raise RuntimeError("self._sf_client is None!")
init_db(temp_dir)
sqlite_log_stats(temp_dir)
# Step 1 - download
SalesforceConnector._download_object_csvs(
temp_dir, self.parent_object_list, self._sf_client, start, end
)
gc.collect()
# Step 2 - load CSV's to sqlite
updated_ids = SalesforceConnector._load_csvs_to_db(temp_dir, temp_dir)
gc.collect()
logger.info(f"Found {len(updated_ids)} total updated records")
logger.info(
f"Starting to process parent objects of types: {self.parent_object_list}"
)
# Step 3 - extract and index docs
batches_processed = 0
docs_processed = 0
docs_to_yield: list[Document] = []
docs_to_yield_bytes = 0
docs_processed = 0
# Takes 15-20 seconds per batch
for parent_type, parent_id_batch in get_affected_parent_ids_by_type(
temp_dir,
updated_ids=list(updated_ids),
parent_types=self.parent_object_list,
):
batches_processed += 1
logger.info(
f"Processing batch: index={batches_processed} "
f"object_type={parent_type} "
f"len={len(parent_id_batch)} "
f"processed={docs_processed} "
f"remaining={len(updated_ids) - docs_processed}"
f"Processing batch of {len(parent_id_batch)} {parent_type} objects"
)
for parent_id in parent_id_batch:
if not (parent_object := get_record(temp_dir, parent_id, parent_type)):
if not (parent_object := get_record(parent_id, parent_type)):
logger.warning(
f"Failed to get parent object {parent_id} for {parent_type}"
)
continue
doc = convert_sf_object_to_doc(
temp_dir,
sf_object=parent_object,
sf_instance=self.sf_client.sf_instance,
docs_to_yield.append(
convert_sf_object_to_doc(
sf_object=parent_object,
sf_instance=self.sf_client.sf_instance,
)
)
doc_sizeof = sys.getsizeof(doc)
docs_to_yield_bytes += doc_sizeof
docs_to_yield.append(doc)
docs_processed += 1
# memory usage is sensitive to the input length, so we're yielding immediately
# if the batch exceeds a certain byte length
if (
len(docs_to_yield) >= self.batch_size
or docs_to_yield_bytes > SalesforceConnector.MAX_BATCH_BYTES
):
if len(docs_to_yield) >= self.batch_size:
yield docs_to_yield
docs_to_yield = []
docs_to_yield_bytes = 0
# observed a memory leak / size issue with the account table if we don't gc.collect here.
gc.collect()
yield docs_to_yield
logger.info(
f"Final processing stats: "
f"processed={docs_processed} "
f"remaining={len(updated_ids) - docs_processed}"
)
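The removed yielding logic above flushed a batch when either the document count or the approximate byte total crossed MAX_BATCH_BYTES; a self-contained sketch of that pattern (generic types, not the connector's models):

import sys
from collections.abc import Iterable, Iterator

MAX_BATCH_BYTES = 1024 * 1024  # same 1 MiB threshold as the class constant above

def byte_capped_batches(
    docs: Iterable[object], batch_size: int, max_bytes: int = MAX_BATCH_BYTES
) -> Iterator[list[object]]:
    # Yield on either count or approximate byte total, mirroring the removed
    # memory-sensitive yielding (sys.getsizeof is a shallow estimate).
    batch: list[object] = []
    batch_bytes = 0
    for doc in docs:
        batch.append(doc)
        batch_bytes += sys.getsizeof(doc)
        if len(batch) >= batch_size or batch_bytes > max_bytes:
            yield batch
            batch, batch_bytes = [], 0
    if batch:
        yield batch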
def load_from_state(self) -> GenerateDocumentsOutput:
if MULTI_TENANT:
# if multi tenant, we cannot expect the sqlite db to be cached/present
with tempfile.TemporaryDirectory() as temp_dir:
return self._fetch_from_salesforce(temp_dir)
# nuke the db since we're starting from scratch
sqlite_db_path = get_sqlite_db_path(BASE_DATA_PATH)
if os.path.exists(sqlite_db_path):
logger.info(f"load_from_state: Removing db at {sqlite_db_path}.")
os.remove(sqlite_db_path)
return self._fetch_from_salesforce(BASE_DATA_PATH)
return self._fetch_from_salesforce()
def poll_source(
self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch
) -> GenerateDocumentsOutput:
if MULTI_TENANT:
# if multi tenant, we cannot expect the sqlite db to be cached/present
with tempfile.TemporaryDirectory() as temp_dir:
return self._fetch_from_salesforce(temp_dir, start=start, end=end)
if start == 0:
# nuke the db if we're starting from scratch
sqlite_db_path = get_sqlite_db_path(BASE_DATA_PATH)
if os.path.exists(sqlite_db_path):
logger.info(
f"poll_source: Starting at time 0, removing db at {sqlite_db_path}."
)
os.remove(sqlite_db_path)
return self._fetch_from_salesforce(BASE_DATA_PATH)
return self._fetch_from_salesforce(start=start, end=end)
def retrieve_all_slim_documents(
self,
@@ -360,7 +209,7 @@ if __name__ == "__main__":
"sf_security_token": os.environ["SF_SECURITY_TOKEN"],
}
)
start_time = time.monotonic()
start_time = time.time()
doc_count = 0
section_count = 0
text_count = 0
@@ -372,7 +221,7 @@ if __name__ == "__main__":
for section in doc.sections:
if isinstance(section, TextSection) and section.text is not None:
text_count += len(section.text)
end_time = time.monotonic()
end_time = time.time()
print(f"Doc count: {doc_count}")
print(f"Section count: {section_count}")

View File

@@ -124,14 +124,13 @@ def _extract_section(salesforce_object: SalesforceObject, base_url: str) -> Text
def _extract_primary_owners(
directory: str,
sf_object: SalesforceObject,
) -> list[BasicExpertInfo] | None:
object_dict = sf_object.data
if not (last_modified_by_id := object_dict.get("LastModifiedById")):
logger.warning(f"No LastModifiedById found for {sf_object.id}")
return None
if not (last_modified_by := get_record(directory, last_modified_by_id)):
if not (last_modified_by := get_record(last_modified_by_id)):
logger.warning(f"No LastModifiedBy found for {last_modified_by_id}")
return None
@@ -160,7 +159,6 @@ def _extract_primary_owners(
def convert_sf_object_to_doc(
directory: str,
sf_object: SalesforceObject,
sf_instance: str,
) -> Document:
@@ -172,8 +170,8 @@ def convert_sf_object_to_doc(
extracted_semantic_identifier = object_dict.get("Name", "Unknown Object")
sections = [_extract_section(sf_object, base_url)]
for id in get_child_ids(directory, sf_object.id):
if not (child_object := get_record(directory, id)):
for id in get_child_ids(sf_object.id):
if not (child_object := get_record(id)):
continue
sections.append(_extract_section(child_object, base_url))
@@ -183,7 +181,7 @@ def convert_sf_object_to_doc(
source=DocumentSource.SALESFORCE,
semantic_identifier=extracted_semantic_identifier,
doc_updated_at=extracted_doc_updated_at,
primary_owners=_extract_primary_owners(directory, sf_object),
primary_owners=_extract_primary_owners(sf_object),
metadata={},
)
return doc

View File

@@ -11,12 +11,13 @@ from simple_salesforce.bulk2 import SFBulk2Type
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
from onyx.connectors.salesforce.sqlite_functions import has_at_least_one_object_of_type
from onyx.connectors.salesforce.utils import get_object_type_path
from onyx.utils.logger import setup_logger
logger = setup_logger()
def _build_last_modified_time_filter_for_salesforce(
def _build_time_filter_for_salesforce(
start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
) -> str:
if start is None or end is None:
@@ -29,19 +30,6 @@ def _build_last_modified_time_filter_for_salesforce(
)
def _build_created_date_time_filter_for_salesforce(
start: SecondsSinceUnixEpoch | None, end: SecondsSinceUnixEpoch | None
) -> str:
if start is None or end is None:
return ""
start_datetime = datetime.fromtimestamp(start, UTC)
end_datetime = datetime.fromtimestamp(end, UTC)
return (
f" WHERE CreatedDate > {start_datetime.isoformat()} "
f"AND CreatedDate < {end_datetime.isoformat()}"
)
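The removed CreatedDate filter builder produces a SOQL WHERE clause from the epoch bounds; a small example of the string shape it returns, using two arbitrary timestamps:

from datetime import datetime, timezone

start = datetime.fromtimestamp(1735689600, timezone.utc)  # 2025-01-01T00:00:00+00:00
end = datetime.fromtimestamp(1735776000, timezone.utc)    # 2025-01-02T00:00:00+00:00
# Filter of the shape built above, appended to the bulk SOQL query:
print(f" WHERE CreatedDate > {start.isoformat()} AND CreatedDate < {end.isoformat()}")
# " WHERE CreatedDate > 2025-01-01T00:00:00+00:00 AND CreatedDate < 2025-01-02T00:00:00+00:00"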
def _get_sf_type_object_json(sf_client: Salesforce, type_name: str) -> Any:
sf_object = SFType(type_name, sf_client.session_id, sf_client.sf_instance)
return sf_object.describe()
@@ -121,6 +109,23 @@ def _check_if_object_type_is_empty(
return True
def _check_for_existing_csvs(sf_type: str) -> list[str] | None:
# Check if the csv already exists
if os.path.exists(get_object_type_path(sf_type)):
existing_csvs = [
os.path.join(get_object_type_path(sf_type), f)
for f in os.listdir(get_object_type_path(sf_type))
if f.endswith(".csv")
]
# If the csv already exists, return the path
# This is likely due to a previous run that failed
# after downloading the csv but before the data was
# written to the db
if existing_csvs:
return existing_csvs
return None
def _build_bulk_query(sf_client: Salesforce, sf_type: str, time_filter: str) -> str:
queryable_fields = _get_all_queryable_fields_of_sf_type(sf_client, sf_type)
query = f"SELECT {', '.join(queryable_fields)} FROM {sf_type}{time_filter}"
@@ -128,15 +133,16 @@ def _build_bulk_query(sf_client: Salesforce, sf_type: str, time_filter: str) ->
def _bulk_retrieve_from_salesforce(
sf_client: Salesforce, sf_type: str, time_filter: str, target_dir: str
sf_client: Salesforce,
sf_type: str,
time_filter: str,
) -> tuple[str, list[str] | None]:
"""Returns a tuple of
1. the salesforce object type
2. the list of CSV's
"""
if not _check_if_object_type_is_empty(sf_client, sf_type, time_filter):
return sf_type, None
if existing_csvs := _check_for_existing_csvs(sf_type):
return sf_type, existing_csvs
query = _build_bulk_query(sf_client, sf_type, time_filter)
bulk_2_handler = SFBulk2Handler(
@@ -153,33 +159,20 @@ def _bulk_retrieve_from_salesforce(
)
logger.info(f"Downloading {sf_type}")
logger.debug(f"Query: {query}")
logger.info(f"Query: {query}")
try:
# This downloads the file to a file in the target path with a random name
results = bulk_2_type.download(
query=query,
path=target_dir,
path=get_object_type_path(sf_type),
max_records=1000000,
)
# prepend each downloaded csv with the object type (delimiter = '.')
all_download_paths: list[str] = []
for result in results:
original_file_path = result["file"]
directory, filename = os.path.split(original_file_path)
new_filename = f"{sf_type}.{filename}"
new_file_path = os.path.join(directory, new_filename)
os.rename(original_file_path, new_file_path)
all_download_paths.append(new_file_path)
all_download_paths = [result["file"] for result in results]
logger.info(f"Downloaded {sf_type} to {all_download_paths}")
return sf_type, all_download_paths
except Exception as e:
logger.error(
f"Failed to download salesforce csv for object type {sf_type}: {e}"
)
logger.warning(f"Exceptioning query for object type {sf_type}: {query}")
logger.info(f"Failed to download salesforce csv for object type {sf_type}: {e}")
return sf_type, None
@@ -188,35 +181,12 @@ def fetch_all_csvs_in_parallel(
object_types: set[str],
start: SecondsSinceUnixEpoch | None,
end: SecondsSinceUnixEpoch | None,
target_dir: str,
) -> dict[str, list[str] | None]:
"""
Fetches all the csvs in parallel for the given object types
Returns a dict of (sf_type, full_download_path)
"""
# these types don't query properly and need looking at
# problem_types: set[str] = {
# "ContentDocumentLink",
# "RecordActionHistory",
# "PendingOrderSummary",
# "UnifiedActivityRelation",
# }
# these types don't have a LastModifiedDate field and instead use CreatedDate
created_date_types: set[str] = {
"AccountHistory",
"AccountTag",
"EntitySubscription",
}
last_modified_time_filter = _build_last_modified_time_filter_for_salesforce(
start, end
)
created_date_time_filter = _build_created_date_time_filter_for_salesforce(
start, end
)
time_filter = _build_time_filter_for_salesforce(start, end)
time_filter_for_each_object_type = {}
# We do this outside of the thread pool executor because this requires
# a database connection and we don't want to block the thread pool
@@ -225,11 +195,8 @@ def fetch_all_csvs_in_parallel(
"""Only add time filter if there is at least one object of the type
in the database. We aren't worried about partially completed object update runs
because this occurs after we check for existing csvs which covers this case"""
if has_at_least_one_object_of_type(target_dir, sf_type):
if sf_type in created_date_types:
time_filter_for_each_object_type[sf_type] = created_date_time_filter
else:
time_filter_for_each_object_type[sf_type] = last_modified_time_filter
if has_at_least_one_object_of_type(sf_type):
time_filter_for_each_object_type[sf_type] = time_filter
else:
time_filter_for_each_object_type[sf_type] = ""
@@ -240,7 +207,6 @@ def fetch_all_csvs_in_parallel(
sf_client=sf_client,
sf_type=object_type,
time_filter=time_filter_for_each_object_type[object_type],
target_dir=target_dir,
),
object_types,
)

View File

@@ -2,10 +2,8 @@ import csv
import json
import os
import sqlite3
import time
from collections.abc import Iterator
from contextlib import contextmanager
from pathlib import Path
from onyx.connectors.salesforce.utils import get_sqlite_db_path
from onyx.connectors.salesforce.utils import SalesforceObject
@@ -18,7 +16,6 @@ logger = setup_logger()
@contextmanager
def get_db_connection(
directory: str,
isolation_level: str | None = None,
) -> Iterator[sqlite3.Connection]:
"""Get a database connection with proper isolation level and error handling.
@@ -28,7 +25,7 @@ def get_db_connection(
can be "IMMEDIATE" or "EXCLUSIVE" for more strict isolation.
"""
# 60 second timeout for locks
conn = sqlite3.connect(get_sqlite_db_path(directory), timeout=60.0)
conn = sqlite3.connect(get_sqlite_db_path(), timeout=60.0)
if isolation_level is not None:
conn.isolation_level = isolation_level
@@ -41,41 +38,17 @@ def get_db_connection(
conn.close()
def sqlite_log_stats(directory: str) -> None:
with get_db_connection(directory, "EXCLUSIVE") as conn:
cache_pages = conn.execute("PRAGMA cache_size").fetchone()[0]
page_size = conn.execute("PRAGMA page_size").fetchone()[0]
if cache_pages >= 0:
cache_bytes = cache_pages * page_size
else:
cache_bytes = abs(cache_pages * 1024)
logger.info(
f"SQLite stats: sqlite_version={sqlite3.sqlite_version} "
f"cache_pages={cache_pages} "
f"page_size={page_size} "
f"cache_bytes={cache_bytes}"
)
def init_db(directory: str) -> None:
def init_db() -> None:
"""Initialize the SQLite database with required tables if they don't exist."""
# Create database directory if it doesn't exist
start = time.monotonic()
os.makedirs(os.path.dirname(get_sqlite_db_path()), exist_ok=True)
os.makedirs(os.path.dirname(get_sqlite_db_path(directory)), exist_ok=True)
with get_db_connection(directory, "EXCLUSIVE") as conn:
with get_db_connection("EXCLUSIVE") as conn:
cursor = conn.cursor()
db_exists = os.path.exists(get_sqlite_db_path(directory))
if db_exists:
file_path = Path(get_sqlite_db_path(directory))
file_size = file_path.stat().st_size
logger.info(f"init_db - found existing sqlite db: len={file_size}")
else:
# why is this only if the db doesn't exist?
db_exists = os.path.exists(get_sqlite_db_path())
if not db_exists:
# Enable WAL mode for better concurrent access and write performance
cursor.execute("PRAGMA journal_mode=WAL")
cursor.execute("PRAGMA synchronous=NORMAL")
@@ -170,31 +143,16 @@ def init_db(directory: str) -> None:
""",
)
elapsed = time.monotonic() - start
logger.info(f"init_db - create tables and indices: elapsed={elapsed:.2f}")
# Analyze tables to help query planner
# NOTE(rkuo): skip ANALYZE - it takes too long and we likely don't have
# complicated queries that need this
# start = time.monotonic()
# cursor.execute("ANALYZE relationships")
# cursor.execute("ANALYZE salesforce_objects")
# cursor.execute("ANALYZE relationship_types")
# cursor.execute("ANALYZE user_email_map")
# elapsed = time.monotonic() - start
# logger.info(f"init_db - analyze: elapsed={elapsed:.2f}")
cursor.execute("ANALYZE relationships")
cursor.execute("ANALYZE salesforce_objects")
cursor.execute("ANALYZE relationship_types")
cursor.execute("ANALYZE user_email_map")
# If database already existed but user_email_map needs to be populated
start = time.monotonic()
cursor.execute("SELECT COUNT(*) FROM user_email_map")
elapsed = time.monotonic() - start
logger.info(f"init_db - count user_email_map: elapsed={elapsed:.2f}")
start = time.monotonic()
if cursor.fetchone()[0] == 0:
_update_user_email_map(conn)
elapsed = time.monotonic() - start
logger.info(f"init_db - update_user_email_map: elapsed={elapsed:.2f}")
conn.commit()
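For context, the pragmas applied when the database is first created can be reproduced on a throwaway connection like this (the file name is illustrative only):

import sqlite3

conn = sqlite3.connect("salesforce_example.sqlite", timeout=60.0)
cursor = conn.cursor()
# Same pragmas set on first creation above: WAL allows readers during writes,
# synchronous=NORMAL trades some durability for faster bulk loads.
cursor.execute("PRAGMA journal_mode=WAL")
cursor.execute("PRAGMA synchronous=NORMAL")
conn.commit()
conn.close()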
@@ -282,15 +240,15 @@ def _update_user_email_map(conn: sqlite3.Connection) -> None:
def update_sf_db_with_csv(
directory: str,
object_type: str,
csv_download_path: str,
delete_csv_after_use: bool = True,
) -> list[str]:
"""Update the SF DB with a CSV file using SQLite storage."""
updated_ids = []
# Use IMMEDIATE to get a write lock at the start of the transaction
with get_db_connection(directory, "IMMEDIATE") as conn:
with get_db_connection("IMMEDIATE") as conn:
cursor = conn.cursor()
with open(csv_download_path, "r", newline="", encoding="utf-8") as f:
@@ -337,12 +295,17 @@ def update_sf_db_with_csv(
conn.commit()
if delete_csv_after_use:
# Remove the csv file after it has been used
# to successfully update the db
os.remove(csv_download_path)
return updated_ids
def get_child_ids(directory: str, parent_id: str) -> set[str]:
def get_child_ids(parent_id: str) -> set[str]:
"""Get all child IDs for a given parent ID."""
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
# Force index usage with INDEXED BY
@@ -354,9 +317,9 @@ def get_child_ids(directory: str, parent_id: str) -> set[str]:
return child_ids
def get_type_from_id(directory: str, object_id: str) -> str | None:
def get_type_from_id(object_id: str) -> str | None:
"""Get the type of an object from its ID."""
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT object_type FROM salesforce_objects WHERE id = ?", (object_id,)
@@ -369,15 +332,15 @@ def get_type_from_id(directory: str, object_id: str) -> str | None:
def get_record(
directory: str, object_id: str, object_type: str | None = None
object_id: str, object_type: str | None = None
) -> SalesforceObject | None:
"""Retrieve the record and return it as a SalesforceObject."""
if object_type is None:
object_type = get_type_from_id(directory, object_id)
object_type = get_type_from_id(object_id)
if not object_type:
return None
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT data FROM salesforce_objects WHERE id = ?", (object_id,))
result = cursor.fetchone()
@@ -389,9 +352,9 @@ def get_record(
return SalesforceObject(id=object_id, type=object_type, data=data)
def find_ids_by_type(directory: str, object_type: str) -> list[str]:
def find_ids_by_type(object_type: str) -> list[str]:
"""Find all object IDs for rows of the specified type."""
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT id FROM salesforce_objects WHERE object_type = ?", (object_type,)
@@ -400,7 +363,6 @@ def find_ids_by_type(directory: str, object_type: str) -> list[str]:
def get_affected_parent_ids_by_type(
directory: str,
updated_ids: list[str],
parent_types: list[str],
batch_size: int = 500,
@@ -412,7 +374,7 @@ def get_affected_parent_ids_by_type(
updated_ids_batches = batch_list(updated_ids, batch_size)
updated_parent_ids: set[str] = set()
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
for batch_ids in updated_ids_batches:
@@ -457,7 +419,7 @@ def get_affected_parent_ids_by_type(
yield parent_type, new_affected_ids
def has_at_least_one_object_of_type(directory: str, object_type: str) -> bool:
def has_at_least_one_object_of_type(object_type: str) -> bool:
"""Check if there is at least one object of the specified type in the database.
Args:
@@ -466,7 +428,7 @@ def has_at_least_one_object_of_type(directory: str, object_type: str) -> bool:
Returns:
bool: True if at least one object exists, False otherwise
"""
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"SELECT COUNT(*) FROM salesforce_objects WHERE object_type = ?",
@@ -481,7 +443,7 @@ def has_at_least_one_object_of_type(directory: str, object_type: str) -> bool:
NULL_ID_STRING = "N/A"
def get_user_id_by_email(directory: str, email: str) -> str | None:
def get_user_id_by_email(email: str) -> str | None:
"""Get the Salesforce User ID for a given email address.
Args:
@@ -492,7 +454,7 @@ def get_user_id_by_email(directory: str, email: str) -> str | None:
- was_found: True if the email exists in the table, False if not found
- user_id: The Salesforce User ID if exists, None otherwise
"""
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT user_id FROM user_email_map WHERE email = ?", (email,))
result = cursor.fetchone()
@@ -501,10 +463,10 @@ def get_user_id_by_email(directory: str, email: str) -> str | None:
return result[0]
def update_email_to_id_table(directory: str, email: str, id: str | None) -> None:
def update_email_to_id_table(email: str, id: str | None) -> None:
"""Update the email to ID map table with a new email and ID."""
id_to_use = id or NULL_ID_STRING
with get_db_connection(directory) as conn:
with get_db_connection() as conn:
cursor = conn.cursor()
cursor.execute(
"INSERT OR REPLACE INTO user_email_map (email, user_id) VALUES (?, ?)",

View File

@@ -30,9 +30,9 @@ class SalesforceObject:
BASE_DATA_PATH = os.path.join(os.path.dirname(__file__), "data")
def get_sqlite_db_path(directory: str) -> str:
def get_sqlite_db_path() -> str:
"""Get the path to the sqlite db file."""
return os.path.join(directory, "salesforce_db.sqlite")
return os.path.join(BASE_DATA_PATH, "salesforce_db.sqlite")
def get_object_type_path(object_type: str) -> str:

View File

@@ -255,9 +255,7 @@ _DISALLOWED_MSG_SUBTYPES = {
def default_msg_filter(message: MessageType) -> bool:
# Don't keep messages from bots
if message.get("bot_id") or message.get("app_id"):
bot_profile_name = message.get("bot_profile", {}).get("name")
print(f"bot_profile_name: {bot_profile_name}")
if bot_profile_name == "DanswerBot Testing":
if message.get("bot_profile", {}).get("name") == "OnyxConnector":
return False
return True

View File

@@ -227,13 +227,16 @@ class SearchPipeline:
# If ee is enabled, censor the chunk sections based on user access
# Otherwise, return the retrieved chunks
censored_chunks: list[InferenceChunk] = fetch_ee_implementation_or_noop(
"onyx.external_permissions.post_query_censoring",
"_post_query_chunk_censoring",
retrieved_chunks,
)(
chunks=retrieved_chunks,
user=self.user,
censored_chunks = cast(
list[InferenceChunk],
fetch_ee_implementation_or_noop(
"onyx.external_permissions.post_query_censoring",
"_post_query_chunk_censoring",
retrieved_chunks,
)(
chunks=retrieved_chunks,
user=self.user,
),
)
above = self.search_query.chunks_above

View File

@@ -2,7 +2,6 @@ import io
import json
import os
import re
import uuid
import zipfile
from collections.abc import Callable
from collections.abc import Iterator
@@ -15,7 +14,6 @@ from pathlib import Path
from typing import Any
from typing import IO
from typing import NamedTuple
from typing import Optional
import chardet
import docx # type: ignore
@@ -570,8 +568,8 @@ def extract_text_and_images(
def convert_docx_to_txt(
file: UploadFile, file_store: FileStore, file_path: Optional[str] = None
) -> str:
file: UploadFile, file_store: FileStore, file_path: str
) -> None:
"""
Helper to convert docx to a .txt file in the same filestore.
"""
@@ -583,41 +581,15 @@ def convert_docx_to_txt(
all_paras = [p.text for p in doc.paragraphs]
text_content = "\n".join(all_paras)
file_name = file.filename or f"docx_{uuid.uuid4()}"
text_file_name = docx_to_txt_filename(file_path if file_path else file_name)
txt_file_path = docx_to_txt_filename(file_path)
file_store.save_file(
file_name=text_file_name,
file_name=txt_file_path,
content=BytesIO(text_content.encode("utf-8")),
display_name=file.filename,
file_origin=FileOrigin.CONNECTOR,
file_type="text/plain",
)
return text_file_name
def docx_to_txt_filename(file_path: str) -> str:
return file_path.rsplit(".", 1)[0] + ".txt"
def convert_pdf_to_txt(file: UploadFile, file_store: FileStore, file_path: str) -> str:
"""
Helper to convert PDF to a .txt file in the same filestore.
"""
file.file.seek(0)
# Extract text from the PDF
text_content, _, _ = read_pdf_file(file.file)
text_file_name = pdf_to_txt_filename(file_path)
file_store.save_file(
file_name=text_file_name,
content=BytesIO(text_content.encode("utf-8")),
display_name=file.filename,
file_origin=FileOrigin.CONNECTOR,
file_type="text/plain",
)
return text_file_name
def pdf_to_txt_filename(file_path: str) -> str:
return file_path.rsplit(".", 1)[0] + ".txt"

View File

@@ -459,6 +459,10 @@ def process_image_sections(documents: list[Document]) -> list[IndexingDocument]:
llm = get_default_llm_with_vision()
if not llm:
logger.warning(
"No vision-capable LLM available. Image sections will not be processed."
)
# Even without LLM, we still convert to IndexingDocument with base Sections
return [
IndexingDocument(
@@ -925,12 +929,10 @@ def index_doc_batch(
for chunk_num, chunk in enumerate(chunks_with_embeddings)
]
short_descriptor_list = [
chunk.to_short_descriptor() for chunk in access_aware_chunks
]
short_descriptor_log = str(short_descriptor_list)[:1024]
logger.debug(f"Indexing the following chunks: {short_descriptor_log}")
logger.debug(
"Indexing the following chunks: "
f"{[chunk.to_short_descriptor() for chunk in access_aware_chunks]}"
)
# A document will not be spread across different batches, so all the
# documents with chunks in this set, are fully represented by the chunks
# in this set

View File

@@ -175,7 +175,7 @@ class EmbeddingModel:
embeddings: list[Embedding] = []
def process_batch(
batch_idx: int, batch_len: int, text_batch: list[str]
batch_idx: int, text_batch: list[str]
) -> tuple[int, list[Embedding]]:
if self.callback:
if self.callback.should_stop():
@@ -202,8 +202,8 @@ class EmbeddingModel:
end_time = time.time()
processing_time = end_time - start_time
logger.debug(
f"EmbeddingModel.process_batch: Batch {batch_idx}/{batch_len} processing time: {processing_time:.2f} seconds"
logger.info(
f"Batch {batch_idx} processing time: {processing_time:.2f} seconds"
)
return batch_idx, response.embeddings
@@ -215,7 +215,7 @@ class EmbeddingModel:
if num_threads >= 1 and self.provider_type and len(text_batches) > 1:
with ThreadPoolExecutor(max_workers=num_threads) as executor:
future_to_batch = {
executor.submit(process_batch, idx, len(text_batches), batch): idx
executor.submit(process_batch, idx, batch): idx
for idx, batch in enumerate(text_batches, start=1)
}
@@ -238,7 +238,7 @@ class EmbeddingModel:
else:
# Original sequential processing
for idx, text_batch in enumerate(text_batches, start=1):
_, batch_embeddings = process_batch(idx, len(text_batches), text_batch)
_, batch_embeddings = process_batch(idx, text_batch)
embeddings.extend(batch_embeddings)
if self.callback:
self.callback.progress("_batch_encode_texts", 1)
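The threaded path above submits one future per text batch and reassembles the embeddings by batch index; a minimal sketch with a stand-in embedding function:

from concurrent.futures import ThreadPoolExecutor, as_completed

def process_batch(idx: int, texts: list[str]) -> tuple[int, list[list[float]]]:
    # Stand-in for the real embedding call; returns the batch index so results
    # can be reassembled in order after parallel completion.
    return idx, [[float(len(t))] for t in texts]

text_batches = [["alpha", "beta"], ["gamma"], ["delta", "epsilon"]]
results: dict[int, list[list[float]]] = {}
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = {
        executor.submit(process_batch, idx, batch): idx
        for idx, batch in enumerate(text_batches, start=1)
    }
    for future in as_completed(futures):
        idx, embeddings = future.result()
        results[idx] = embeddings
ordered = [emb for idx in sorted(results) for emb in results[idx]]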

View File

@@ -1,147 +0,0 @@
# Standards
SEPARATOR_LINE = "-------"
SEPARATOR_LINE_LONG = "---------------"
NO_EXTRACTION = "No extraction of knowledge graph objects was feasible."
YES = "yes"
NO = "no"
DC_OBJECT_SEPARATOR = ";"
DC_OBJECT_NO_BASE_DATA_EXTRACTION_PROMPT = f"""
You are an expert in finding relevant objects/object specifications of the same type in a list of documents. \
In this case you are interested \
in generating: {{objects_of_interest}}.
You should look at the documents - in no particular order! - and extract each object you find in the documents.
{SEPARATOR_LINE}
Here are the documents you are supposed to search through:
--
{{document_text}}
{SEPARATOR_LINE}
Here are the task instructions you should use to help you find the desired objects:
{SEPARATOR_LINE}
{{task}}
{SEPARATOR_LINE}
Here is the question that may provide critical additional context for the task:
{SEPARATOR_LINE}
{{question}}
{SEPARATOR_LINE}
Please answer the question in the following format:
REASONING: <your reasoning for the classification> - OBJECTS: <the objects - just their names - that you found, \
separated by ';'>
""".strip()
DC_OBJECT_WITH_BASE_DATA_EXTRACTION_PROMPT = f"""
You are an expert in finding relevant objects/object specifications of the same type in a list of documents. \
In this case you are interested \
in generating: {{objects_of_interest}}.
You should look at the provided data - in no particular order! - and extract each object you find in the documents.
{SEPARATOR_LINE}
Here are the data provided by the user:
--
{{base_data}}
{SEPARATOR_LINE}
Here are the task instructions you should use to help you find the desired objects:
{SEPARATOR_LINE}
{{task}}
{SEPARATOR_LINE}
Here is the request that may provide critical additional context for the task:
{SEPARATOR_LINE}
{{question}}
{SEPARATOR_LINE}
Please address the request in the following format:
REASONING: <your reasoning for the classification> - OBJECTS: <the objects - just their names - that you found, \
separated by ';'>
""".strip()
DC_OBJECT_SOURCE_RESEARCH_PROMPT = f"""
Today is {{today}}. You are an expert in extracting relevant structured information from a list of documents that \
should relate to one object. (Try to make sure that you know it relates to that one object!).
You should look at the documents - in no particular order! - and extract the information asked for this task:
{SEPARATOR_LINE}
{{task}}
{SEPARATOR_LINE}
Here is the user question that may provide critical additional context for the task:
{SEPARATOR_LINE}
{{question}}
{SEPARATOR_LINE}
Here are the documents you are supposed to search through:
--
{{document_text}}
{SEPARATOR_LINE}
Note: please cite your sources inline as you generate the results! Use the format [1], etc. Infer the \
number from the provided context documents. This is very important!
Please address the task in the following format:
REASONING:
-- <your reasoning for the classification>
RESEARCH RESULTS:
{{format}}
""".strip()
DC_OBJECT_CONSOLIDATION_PROMPT = f"""
You are a helpful assistant that consolidates information about a specific object \
from multiple sources.
The object is:
{SEPARATOR_LINE}
{{object}}
{SEPARATOR_LINE}
and the information is
{SEPARATOR_LINE}
{{information}}
{SEPARATOR_LINE}
Here is the user question that may provide critical additional context for the task:
{SEPARATOR_LINE}
{{question}}
{SEPARATOR_LINE}
Please consolidate the information into a single, concise answer. The consolidated information \
for the object should be in the following format:
{SEPARATOR_LINE}
{{format}}
{SEPARATOR_LINE}
Overall, please use this structure to communicate the consolidated information:
{SEPARATOR_LINE}
REASONING: <your reasoning for consolidating the information>
INFORMATION:
<consolidated information in the proper format that you have created>
"""
DC_FORMATTING_NO_BASE_DATA_PROMPT = f"""
You are an expert in text formatting. Your task is to take a given text and convert it 100 percent accurately \
into a new format.
Here is the text you are supposed to format:
{SEPARATOR_LINE}
{{text}}
{SEPARATOR_LINE}
Here is the format you are supposed to use:
{SEPARATOR_LINE}
{{format}}
{SEPARATOR_LINE}
Please start the generation directly with the formatted text. (Note that the output should not be code, but text.)
"""
DC_FORMATTING_WITH_BASE_DATA_PROMPT = f"""
You are an expert in text formatting. Your task is to take a given text and the initial \
base data provided by the user, and convert it 100 percent accurately \
into a new format. The base data may also contain important relationships that are critical \
for the formatting.
Here is the initial data provided by the user:
{SEPARATOR_LINE}
{{base_data}}
{SEPARATOR_LINE}
Here is the text you are supposed to combine (and format) with the initial data, adhering to the \
format instructions provided later in the prompt:
{SEPARATOR_LINE}
{{text}}
{SEPARATOR_LINE}
And here are the format instructions you are supposed to use:
{SEPARATOR_LINE}
{{format}}
{SEPARATOR_LINE}
Please start the generation directly with the formatted text. (Note that the output should not be code, but text.)
"""

View File

@@ -100,7 +100,6 @@ from onyx.db.models import UserGroup__ConnectorCredentialPair
from onyx.db.search_settings import get_current_search_settings
from onyx.db.search_settings import get_secondary_search_settings
from onyx.file_processing.extract_file_text import convert_docx_to_txt
from onyx.file_processing.extract_file_text import convert_pdf_to_txt
from onyx.file_store.file_store import get_default_file_store
from onyx.key_value_store.interface import KvKeyNotFoundError
from onyx.redis.redis_connector import RedisConnector
@@ -129,7 +128,6 @@ from onyx.utils.telemetry import create_milestone_and_report
from onyx.utils.threadpool_concurrency import run_functions_tuples_in_parallel
from onyx.utils.variable_functionality import fetch_ee_implementation_or_noop
logger = setup_logger()
_GMAIL_CREDENTIAL_ID_COOKIE_NAME = "gmail_credential_id"
@@ -432,23 +430,6 @@ def upload_files(files: list[UploadFile], db_session: Session) -> FileUploadResp
)
continue
# Special handling for docx files - only store the plaintext version
if file.content_type and file.content_type.startswith(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
file_path = os.path.join(str(uuid.uuid4()), cast(str, file.filename))
text_file_path = convert_docx_to_txt(file, file_store)
deduped_file_paths.append(text_file_path)
continue
# Special handling for PDF files - only store the plaintext version
if file.content_type and file.content_type.startswith("application/pdf"):
file_path = os.path.join(str(uuid.uuid4()), cast(str, file.filename))
text_file_path = convert_pdf_to_txt(file, file_store, file_path)
deduped_file_paths.append(text_file_path)
continue
# Default handling for all other file types
file_path = os.path.join(str(uuid.uuid4()), cast(str, file.filename))
deduped_file_paths.append(file_path)
file_store.save_file(
@@ -459,6 +440,11 @@ def upload_files(files: list[UploadFile], db_session: Session) -> FileUploadResp
file_type=file.content_type or "text/plain",
)
if file.content_type and file.content_type.startswith(
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
):
convert_docx_to_txt(file, file_store, file_path)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
return FileUploadResponse(file_paths=deduped_file_paths)

View File

@@ -1,5 +1,4 @@
from collections.abc import Callable
from datetime import datetime
from typing import Any
from uuid import UUID
@@ -7,7 +6,6 @@ from pydantic import BaseModel
from pydantic import model_validator
from sqlalchemy.orm import Session
from onyx.configs.constants import DocumentSource
from onyx.context.search.enums import SearchType
from onyx.context.search.models import IndexFilters
from onyx.context.search.models import InferenceSection
@@ -77,8 +75,6 @@ class SearchToolOverrideKwargs(BaseModel):
ordering_only: bool | None = (
None # Flag for fast path when search is only needed for ordering
)
document_sources: list[DocumentSource] | None = None
time_cutoff: datetime | None = None
class Config:
arbitrary_types_allowed = True

View File

@@ -292,8 +292,6 @@ class SearchTool(Tool[SearchToolOverrideKwargs]):
user_file_ids = None
user_folder_ids = None
ordering_only = False
document_sources = None
time_cutoff = None
if override_kwargs:
force_no_rerank = use_alt_not_None(override_kwargs.force_no_rerank, False)
alternate_db_session = override_kwargs.alternate_db_session
@@ -304,8 +302,6 @@ class SearchTool(Tool[SearchToolOverrideKwargs]):
user_file_ids = override_kwargs.user_file_ids
user_folder_ids = override_kwargs.user_folder_ids
ordering_only = use_alt_not_None(override_kwargs.ordering_only, False)
document_sources = override_kwargs.document_sources
time_cutoff = override_kwargs.time_cutoff
# Fast path for ordering-only search
if ordering_only:
@@ -338,23 +334,6 @@ class SearchTool(Tool[SearchToolOverrideKwargs]):
)
retrieval_options = RetrievalDetails(filters=filters)
if document_sources or time_cutoff:
# Get retrieval_options and filters, or create if they don't exist
retrieval_options = retrieval_options or RetrievalDetails()
retrieval_options.filters = retrieval_options.filters or BaseFilters()
# Handle document sources
if document_sources:
source_types = retrieval_options.filters.source_type or []
retrieval_options.filters.source_type = list(
set(source_types + document_sources)
)
# Handle time cutoff
if time_cutoff:
# Overwrite time-cutoff should supercede existing time-cutoff, even if defined
retrieval_options.filters.time_cutoff = time_cutoff
search_pipeline = SearchPipeline(
search_request=SearchRequest(
query=query,

View File

@@ -1,44 +0,0 @@
import os
import time
from unittest.mock import MagicMock
from unittest.mock import patch
import pytest
from onyx.connectors.gong.connector import GongConnector
from onyx.connectors.models import Document
@pytest.fixture
def gong_connector() -> GongConnector:
connector = GongConnector()
connector.load_credentials(
{
"gong_access_key": os.environ["GONG_ACCESS_KEY"],
"gong_access_key_secret": os.environ["GONG_ACCESS_KEY_SECRET"],
}
)
return connector
@patch(
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
return_value=None,
)
def test_gong_basic(mock_get_api_key: MagicMock, gong_connector: GongConnector) -> None:
doc_batch_generator = gong_connector.poll_source(0, time.time())
doc_batch = next(doc_batch_generator)
with pytest.raises(StopIteration):
next(doc_batch_generator)
assert len(doc_batch) == 2
docs: list[Document] = []
for doc in doc_batch:
docs.append(doc)
assert docs[0].semantic_identifier == "test with chris"
assert docs[1].semantic_identifier == "Testing Gong"

View File

@@ -1,7 +1,6 @@
import json
import os
import time
from datetime import datetime
from pathlib import Path
from unittest.mock import MagicMock
from unittest.mock import patch
@@ -106,54 +105,6 @@ def test_highspot_connector_slim(
assert len(all_slim_doc_ids) > 0
@patch(
"onyx.file_processing.extract_file_text.get_unstructured_api_key",
return_value=None,
)
def test_highspot_connector_poll_source(
mock_get_api_key: MagicMock, highspot_connector: HighspotConnector
) -> None:
"""Test poll_source functionality with date range filtering."""
# Define date range: April 3, 2025 to April 4, 2025
start_date = datetime(2025, 4, 3, 0, 0, 0)
end_date = datetime(2025, 4, 4, 23, 59, 59)
# Convert to seconds since Unix epoch
start_time = int(time.mktime(start_date.timetuple()))
end_time = int(time.mktime(end_date.timetuple()))
# Load test data for assertions
test_data = load_test_data()
poll_source_data = test_data.get("poll_source", {})
target_doc_id = poll_source_data.get("target_doc_id")
# Call poll_source with date range
all_docs: list[Document] = []
target_doc: Document | None = None
for doc_batch in highspot_connector.poll_source(start_time, end_time):
for doc in doc_batch:
all_docs.append(doc)
if doc.id == f"HIGHSPOT_{target_doc_id}":
target_doc = doc
# Verify documents were loaded
assert len(all_docs) > 0
# Verify the specific test document was found and has correct properties
assert target_doc is not None
assert target_doc.semantic_identifier == poll_source_data.get("semantic_identifier")
assert target_doc.source == DocumentSource.HIGHSPOT
assert target_doc.metadata is not None
# Verify sections
assert len(target_doc.sections) == 1
section = target_doc.sections[0]
assert section.link == poll_source_data.get("link")
assert section.text is not None
assert len(section.text) > 0
def test_highspot_connector_validate_credentials(
highspot_connector: HighspotConnector,
) -> None:

View File

@@ -1,10 +1,5 @@
{
"target_doc_id": "67cd8eb35d3ee0487de2e704",
"semantic_identifier": "Highspot in Action _ Salesforce Integration",
"link": "https://www.highspot.com/items/67cd8eb35d3ee0487de2e704",
"poll_source": {
"target_doc_id":"67ef9edcc3f40b2bf3d816a8",
"semantic_identifier":"A Brief Introduction To AI",
"link":"https://www.highspot.com/items/67ef9edcc3f40b2bf3d816a8"
}
"link": "https://www.highspot.com/items/67cd8eb35d3ee0487de2e704"
}

View File

@@ -35,22 +35,23 @@ def salesforce_connector() -> SalesforceConnector:
connector = SalesforceConnector(
requested_objects=["Account", "Contact", "Opportunity"],
)
username = os.environ["SF_USERNAME"]
password = os.environ["SF_PASSWORD"]
security_token = os.environ["SF_SECURITY_TOKEN"]
connector.load_credentials(
{
"sf_username": username,
"sf_password": password,
"sf_security_token": security_token,
"sf_username": os.environ["SF_USERNAME"],
"sf_password": os.environ["SF_PASSWORD"],
"sf_security_token": os.environ["SF_SECURITY_TOKEN"],
}
)
return connector
# TODO: make the credentials not expire
@pytest.mark.xfail(
reason=(
"Credentials change over time, so this test will fail if run when "
"the credentials expire."
)
)
def test_salesforce_connector_basic(salesforce_connector: SalesforceConnector) -> None:
test_data = load_test_data()
target_test_doc: Document | None = None
@@ -60,26 +61,21 @@ def test_salesforce_connector_basic(salesforce_connector: SalesforceConnector) -
all_docs.append(doc)
if doc.id == test_data["id"]:
target_test_doc = doc
break
# The number of docs here seems to change actively so do a very loose check
# as of 2025-03-28 it was around 32472
assert len(all_docs) > 32000
assert len(all_docs) < 40000
assert len(all_docs) == 6
assert target_test_doc is not None
# Set of received links
received_links: set[str] = set()
# List of received text fields, which contain key-value pairs separated by newlines
received_text: list[str] = []
recieved_text: list[str] = []
# Iterate over the sections of the target test doc to extract the links and text
for section in target_test_doc.sections:
assert section.link
assert section.text
received_links.add(section.link)
received_text.append(section.text)
recieved_text.append(section.text)
# Check that the received links match the expected links from the test data json
expected_links = set(test_data["expected_links"])
@@ -89,9 +85,8 @@ def test_salesforce_connector_basic(salesforce_connector: SalesforceConnector) -
expected_text = test_data["expected_text"]
if not isinstance(expected_text, list):
raise ValueError("Expected text is not a list")
unparsed_expected_key_value_pairs: list[str] = expected_text
received_key_value_pairs = extract_key_value_pairs_to_set(received_text)
received_key_value_pairs = extract_key_value_pairs_to_set(recieved_text)
expected_key_value_pairs = extract_key_value_pairs_to_set(
unparsed_expected_key_value_pairs
)
@@ -101,21 +96,13 @@ def test_salesforce_connector_basic(salesforce_connector: SalesforceConnector) -
assert target_test_doc.source == DocumentSource.SALESFORCE
assert target_test_doc.semantic_identifier == test_data["semantic_identifier"]
assert target_test_doc.metadata == test_data["metadata"]
assert target_test_doc.primary_owners is not None
primary_owner = target_test_doc.primary_owners[0]
expected_primary_owner = test_data["primary_owners"]
assert isinstance(expected_primary_owner, dict)
assert primary_owner.email == expected_primary_owner["email"]
assert primary_owner.first_name == expected_primary_owner["first_name"]
assert primary_owner.last_name == expected_primary_owner["last_name"]
assert target_test_doc.primary_owners == test_data["primary_owners"]
assert target_test_doc.secondary_owners == test_data["secondary_owners"]
assert target_test_doc.title == test_data["title"]
# TODO: make the credentials not expire
@pytest.mark.skip(
@pytest.mark.xfail(
reason=(
"Credentials change over time, so this test will fail if run when "
"the credentials expire."

View File

@@ -1,162 +1,20 @@
{
"id": "SALESFORCE_001bm00000eu6n5AAA",
"id": "SALESFORCE_001fI000005drUcQAI",
"expected_links": [
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESpEeAAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESqd3AAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESoKiAAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvDSAA1",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrmHAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrl2AAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvejAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000EStlvAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESpPfAAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrP9AAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvlMAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESt3JAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESoBkAAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000EStw2AAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrkMAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESojKAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuLEAA1",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESoSIAA1",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESu2YAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvgSAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESurnAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrnqAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESoB5AAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuJuAAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrfyAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/001bm00000eu6n5AAA",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESpUHAA1",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESsgGAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESr7UAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESu1BAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESpqzAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESplZAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvJ3AAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESurKAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000EStSiAAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuJFAA1",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESu8xAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESqfzAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESqsrAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000EStoZAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESsIUAA1",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESsAGAA1",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESv8GAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrOKAA1",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESoUmAAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESudKAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuJ8AAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvf2AAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESw3qAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESugRAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESr18AAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESqV1AAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuLVAA1",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESpjoAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESqULAA1",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuCAAA1",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrfpAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESp5YAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrMNAA1",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000EStaUAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESt5LAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrtcAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESomaAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrtIAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESoToAAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuWLAA1",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESrWvAAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESsJEAA1",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESsxwAAD",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvUgAAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESvWjAAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000EStBuAAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESpZiAAL",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuhYAAT",
"https://danswer-dev-ed.develop.my.salesforce.com/003bm00000ESuWAAA1"
"https://customization-ruby-2195.my.salesforce.com/001fI000005drUcQAI",
"https://customization-ruby-2195.my.salesforce.com/003fI000001jiCPQAY",
"https://customization-ruby-2195.my.salesforce.com/017fI00000T7hvsQAB",
"https://customization-ruby-2195.my.salesforce.com/006fI000000rDvBQAU"
],
"expected_text": [
"IsDeleted: false\nBillingCity: Shaykh al \u00e1\u00b8\u00a8ad\u00c4\u00abd\nName: Voonder\nCleanStatus: Pending\nBillingStreet: 12 Cambridge Parkway",
"Email: eslayqzs@icio.us\nIsDeleted: false\nLastName: Slay\nIsEmailBounced: false\nFirstName: Ebeneser\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: ptweedgdh@umich.edu\nIsDeleted: false\nLastName: Tweed\nIsEmailBounced: false\nFirstName: Paulita\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: ehurnellnlx@facebook.com\nIsDeleted: false\nLastName: Hurnell\nIsEmailBounced: false\nFirstName: Eliot\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: ccarik4q4@google.it\nIsDeleted: false\nLastName: Carik\nIsEmailBounced: false\nFirstName: Chadwick\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: cvannozziina6@moonfruit.com\nIsDeleted: false\nLastName: Vannozzii\nIsEmailBounced: false\nFirstName: Christophorus\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: mikringill2kz@hugedomains.com\nIsDeleted: false\nLastName: Ikringill\nIsEmailBounced: false\nFirstName: Meghann\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: bgrinvalray@fda.gov\nIsDeleted: false\nLastName: Grinval\nIsEmailBounced: false\nFirstName: Berti\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: aollanderhr7@cam.ac.uk\nIsDeleted: false\nLastName: Ollander\nIsEmailBounced: false\nFirstName: Annemarie\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: rwhitesideq38@gravatar.com\nIsDeleted: false\nLastName: Whiteside\nIsEmailBounced: false\nFirstName: Rolando\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: vkrafthmz@techcrunch.com\nIsDeleted: false\nLastName: Kraft\nIsEmailBounced: false\nFirstName: Vidovik\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: jhillaut@4shared.com\nIsDeleted: false\nLastName: Hill\nIsEmailBounced: false\nFirstName: Janel\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: lralstonycs@discovery.com\nIsDeleted: false\nLastName: Ralston\nIsEmailBounced: false\nFirstName: Lorrayne\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: blyttlewba@networkadvertising.org\nIsDeleted: false\nLastName: Lyttle\nIsEmailBounced: false\nFirstName: Ban\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: pplummernvf@technorati.com\nIsDeleted: false\nLastName: Plummer\nIsEmailBounced: false\nFirstName: Pete\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: babrahamoffxpb@theatlantic.com\nIsDeleted: false\nLastName: Abrahamoff\nIsEmailBounced: false\nFirstName: Brander\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: ahargieym0@homestead.com\nIsDeleted: false\nLastName: Hargie\nIsEmailBounced: false\nFirstName: Aili\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: hstotthp2@yelp.com\nIsDeleted: false\nLastName: Stott\nIsEmailBounced: false\nFirstName: Hartley\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: jganniclifftuvj@blinklist.com\nIsDeleted: false\nLastName: Ganniclifft\nIsEmailBounced: false\nFirstName: Jamima\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: ldodelly8q@ed.gov\nIsDeleted: false\nLastName: Dodell\nIsEmailBounced: false\nFirstName: Lynde\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: rmilner3cp@smh.com.au\nIsDeleted: false\nLastName: Milner\nIsEmailBounced: false\nFirstName: Ralph\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: gghiriardellic19@state.tx.us\nIsDeleted: false\nLastName: Ghiriardelli\nIsEmailBounced: false\nFirstName: Garv\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: rhubatschfpu@nature.com\nIsDeleted: false\nLastName: Hubatsch\nIsEmailBounced: false\nFirstName: Rose\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: mtrenholme1ws@quantcast.com\nIsDeleted: false\nLastName: Trenholme\nIsEmailBounced: false\nFirstName: Mariejeanne\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: jmussettpbd@over-blog.com\nIsDeleted: false\nLastName: Mussett\nIsEmailBounced: false\nFirstName: Juliann\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: bgoroni145@illinois.edu\nIsDeleted: false\nLastName: Goroni\nIsEmailBounced: false\nFirstName: Bernarr\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: afalls3ph@theguardian.com\nIsDeleted: false\nLastName: Falls\nIsEmailBounced: false\nFirstName: Angelia\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: lswettjoi@go.com\nIsDeleted: false\nLastName: Swett\nIsEmailBounced: false\nFirstName: Levon\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: emullinsz38@dailymotion.com\nIsDeleted: false\nLastName: Mullins\nIsEmailBounced: false\nFirstName: Elsa\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: ibernettehco@ebay.co.uk\nIsDeleted: false\nLastName: Bernette\nIsEmailBounced: false\nFirstName: Ingrid\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: trisleybtt@simplemachines.org\nIsDeleted: false\nLastName: Risley\nIsEmailBounced: false\nFirstName: Toma\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: rgypsonqx1@goodreads.com\nIsDeleted: false\nLastName: Gypson\nIsEmailBounced: false\nFirstName: Reed\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: cposvneri28@jiathis.com\nIsDeleted: false\nLastName: Posvner\nIsEmailBounced: false\nFirstName: Culley\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: awilmut2rz@geocities.jp\nIsDeleted: false\nLastName: Wilmut\nIsEmailBounced: false\nFirstName: Andy\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: aluckwellra5@exblog.jp\nIsDeleted: false\nLastName: Luckwell\nIsEmailBounced: false\nFirstName: Andreana\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: irollings26j@timesonline.co.uk\nIsDeleted: false\nLastName: Rollings\nIsEmailBounced: false\nFirstName: Ibrahim\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: gspireqpd@g.co\nIsDeleted: false\nLastName: Spire\nIsEmailBounced: false\nFirstName: Gaelan\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: sbezleyk2y@acquirethisname.com\nIsDeleted: false\nLastName: Bezley\nIsEmailBounced: false\nFirstName: Sindee\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: icollerrr@flickr.com\nIsDeleted: false\nLastName: Coller\nIsEmailBounced: false\nFirstName: Inesita\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: kfolliott1bo@nature.com\nIsDeleted: false\nLastName: Folliott\nIsEmailBounced: false\nFirstName: Kennan\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: kroofjfo@gnu.org\nIsDeleted: false\nLastName: Roof\nIsEmailBounced: false\nFirstName: Karlik\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: lcovotti8s4@rediff.com\nIsDeleted: false\nLastName: Covotti\nIsEmailBounced: false\nFirstName: Lucho\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: gpatriskson1rs@census.gov\nIsDeleted: false\nLastName: Patriskson\nIsEmailBounced: false\nFirstName: Gardener\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: spidgleyqvw@usgs.gov\nIsDeleted: false\nLastName: Pidgley\nIsEmailBounced: false\nFirstName: Simona\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: cbecarrak0i@over-blog.com\nIsDeleted: false\nLastName: Becarra\nIsEmailBounced: false\nFirstName: Cally\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: aparkman9td@bbc.co.uk\nIsDeleted: false\nLastName: Parkman\nIsEmailBounced: false\nFirstName: Agneta\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: bboddingtonhn@quantcast.com\nIsDeleted: false\nLastName: Boddington\nIsEmailBounced: false\nFirstName: Betta\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: dcasementx0p@cafepress.com\nIsDeleted: false\nLastName: Casement\nIsEmailBounced: false\nFirstName: Dannie\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: hzornbhe@latimes.com\nIsDeleted: false\nLastName: Zorn\nIsEmailBounced: false\nFirstName: Haleigh\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: cfifieldbjb@blogspot.com\nIsDeleted: false\nLastName: Fifield\nIsEmailBounced: false\nFirstName: Christalle\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: ddewerson4t3@skype.com\nIsDeleted: false\nLastName: Dewerson\nIsEmailBounced: false\nFirstName: Dyann\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: khullock52p@sohu.com\nIsDeleted: false\nLastName: Hullock\nIsEmailBounced: false\nFirstName: Kellina\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: tfremantle32n@bandcamp.com\nIsDeleted: false\nLastName: Fremantle\nIsEmailBounced: false\nFirstName: Turner\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: sbernardtylp@nps.gov\nIsDeleted: false\nLastName: Bernardt\nIsEmailBounced: false\nFirstName: Selina\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: smcgettigan8kk@slideshare.net\nIsDeleted: false\nLastName: McGettigan\nIsEmailBounced: false\nFirstName: Sada\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: wdelafontvgn@businesswire.com\nIsDeleted: false\nLastName: Delafont\nIsEmailBounced: false\nFirstName: West\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: lbelsher9ne@indiatimes.com\nIsDeleted: false\nLastName: Belsher\nIsEmailBounced: false\nFirstName: Lou\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: cgoody27y@blogtalkradio.com\nIsDeleted: false\nLastName: Goody\nIsEmailBounced: false\nFirstName: Colene\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: cstodejzz@ucoz.ru\nIsDeleted: false\nLastName: Stode\nIsEmailBounced: false\nFirstName: Curcio\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: abromidgejb@china.com.cn\nIsDeleted: false\nLastName: Bromidge\nIsEmailBounced: false\nFirstName: Ariela\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: ldelgardilloqvp@xrea.com\nIsDeleted: false\nLastName: Delgardillo\nIsEmailBounced: false\nFirstName: Lauralee\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: dcroal9t4@businessinsider.com\nIsDeleted: false\nLastName: Croal\nIsEmailBounced: false\nFirstName: Devlin\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: dclarageqzb@wordpress.com\nIsDeleted: false\nLastName: Clarage\nIsEmailBounced: false\nFirstName: Dre\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: dthirlwall3jf@taobao.com\nIsDeleted: false\nLastName: Thirlwall\nIsEmailBounced: false\nFirstName: Dareen\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: tkeddie2lj@wiley.com\nIsDeleted: false\nLastName: Keddie\nIsEmailBounced: false\nFirstName: Tandi\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: jrimingtoni3i@istockphoto.com\nIsDeleted: false\nLastName: Rimington\nIsEmailBounced: false\nFirstName: Judy\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: gtroynet@slashdot.org\nIsDeleted: false\nLastName: Troy\nIsEmailBounced: false\nFirstName: Gail\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: ebunneyh0n@meetup.com\nIsDeleted: false\nLastName: Bunney\nIsEmailBounced: false\nFirstName: Efren\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: yhaken8p3@slate.com\nIsDeleted: false\nLastName: Haken\nIsEmailBounced: false\nFirstName: Yard\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: nolliffeq6q@biblegateway.com\nIsDeleted: false\nLastName: Olliffe\nIsEmailBounced: false\nFirstName: Nani\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: bgalia9jz@odnoklassniki.ru\nIsDeleted: false\nLastName: Galia\nIsEmailBounced: false\nFirstName: Berrie\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: djedrzej3v1@google.com\nIsDeleted: false\nLastName: Jedrzej\nIsEmailBounced: false\nFirstName: Deanne\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: mcamiesh1t@fc2.com\nIsDeleted: false\nLastName: Camies\nIsEmailBounced: false\nFirstName: Mikaela\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: csunshineqni@state.tx.us\nIsDeleted: false\nLastName: Sunshine\nIsEmailBounced: false\nFirstName: Curtis\nIsPriorityRecord: false\nCleanStatus: Pending",
"Email: fiannellib46@marriott.com\nIsDeleted: false\nLastName: Iannelli\nIsEmailBounced: false\nFirstName: Felicio\nIsPriorityRecord: false\nCleanStatus: Pending"
"BillingPostalCode: 60601\nType: Prospect\nWebsite: www.globalistindustries.com\nBillingCity: Chicago\nDescription: Globalist company\nIsDeleted: false\nIsPartner: false\nPhone: (312) 555-0456\nShippingCountry: USA\nShippingState: IL\nIsBuyer: false\nBillingCountry: USA\nBillingState: IL\nShippingPostalCode: 60601\nBillingStreet: 456 Market St\nIsCustomerPortal: false\nPersonActiveTrackerCount: 0\nShippingCity: Chicago\nShippingStreet: 456 Market St",
"FirstName: Michael\nMailingCountry: USA\nActiveTrackerCount: 0\nEmail: m.brown@globalindustries.com\nMailingState: IL\nMailingStreet: 456 Market St\nMailingCity: Chicago\nLastName: Brown\nTitle: CTO\nIsDeleted: false\nPhone: (312) 555-0456\nHasOptedOutOfEmail: false\nIsEmailBounced: false\nMailingPostalCode: 60601",
"ForecastCategory: Closed\nName: Global Industries Equipment Sale\nIsDeleted: false\nForecastCategoryName: Closed\nFiscalYear: 2024\nFiscalQuarter: 4\nIsClosed: true\nIsWon: true\nAmount: 5000000.0\nProbability: 100.0\nPushCount: 0\nHasOverdueTask: false\nStageName: Closed Won\nHasOpenActivity: false\nHasOpportunityLineItem: false",
"Field: created\nDataType: Text\nIsDeleted: false"
],
"semantic_identifier": "Voonder",
"semantic_identifier": "Unknown Object",
"metadata": {},
"primary_owners": {"email": "hagen@danswer.ai", "first_name": "Hagen", "last_name": "oneill"},
"primary_owners": null,
"secondary_owners": null,
"title": null
}

View File

@@ -444,7 +444,6 @@ class CCPairManager:
)
if group_sync_result.status_code != 409:
group_sync_result.raise_for_status()
time.sleep(2)
@staticmethod
def get_doc_sync_task(

View File

@@ -14,8 +14,9 @@ from tests.integration.connector_job_tests.slack.slack_api_utils import SlackMan
@pytest.fixture()
def slack_test_setup() -> Generator[tuple[dict[str, Any], dict[str, Any]], None, None]:
slack_client = SlackManager.get_slack_client(os.environ["SLACK_BOT_TOKEN"])
user_map = SlackManager.build_slack_user_email_id_map(slack_client)
admin_user_id = user_map["admin@onyx-test.com"]
admin_user_id = SlackManager.build_slack_user_email_id_map(slack_client)[
"admin@onyx-test.com"
]
(
public_channel,

View File

@@ -3,6 +3,8 @@ from datetime import datetime
from datetime import timezone
from typing import Any
import pytest
from onyx.connectors.models import InputType
from onyx.db.enums import AccessType
from onyx.server.documents.models import DocumentSource
@@ -23,6 +25,7 @@ from tests.integration.common_utils.vespa import vespa_fixture
from tests.integration.connector_job_tests.slack.slack_api_utils import SlackManager
@pytest.mark.xfail(reason="flaky - see DAN-789 for example", strict=False)
def test_slack_permission_sync(
reset: None,
vespa_client: vespa_fixture,
@@ -218,6 +221,7 @@ def test_slack_permission_sync(
assert private_message not in onyx_doc_message_strings
@pytest.mark.xfail(reason="flaky", strict=False)
def test_slack_group_permission_sync(
reset: None,
vespa_client: vespa_fixture,

View File

@@ -5,11 +5,8 @@ from ee.onyx.external_permissions.salesforce.postprocessing import (
)
from onyx.configs.app_configs import BLURB_SIZE
from onyx.configs.constants import DocumentSource
from onyx.connectors.salesforce.utils import BASE_DATA_PATH
from onyx.context.search.models import InferenceChunk
SQLITE_DIR = BASE_DATA_PATH
def create_test_chunk(
doc_id: str,
@@ -42,7 +39,6 @@ def create_test_chunk(
def test_validate_salesforce_access_single_object() -> None:
"""Test filtering when chunk has a single Salesforce object reference"""
section = "This is a test document about a Salesforce object."
test_content = section
test_chunk = create_test_chunk(

View File

@@ -113,18 +113,15 @@ _VALID_SALESFORCE_IDS = [
]
def _clear_sf_db(directory: str) -> None:
def _clear_sf_db() -> None:
"""
Clears the SF DB by deleting all files in the data directory.
"""
shutil.rmtree(directory, ignore_errors=True)
shutil.rmtree(BASE_DATA_PATH, ignore_errors=True)
def _create_csv_file(
directory: str,
object_type: str,
records: list[dict],
filename: str = "test_data.csv",
object_type: str, records: list[dict], filename: str = "test_data.csv"
) -> None:
"""
Creates a CSV file for the given object type and records.
@@ -152,10 +149,10 @@ def _create_csv_file(
writer.writerow(record)
# Update the database with the CSV
update_sf_db_with_csv(directory, object_type, csv_path)
update_sf_db_with_csv(object_type, csv_path)
def _create_csv_with_example_data(directory: str) -> None:
def _create_csv_with_example_data() -> None:
"""
Creates CSV files with example data, organized by object type.
"""
@@ -345,10 +342,10 @@ def _create_csv_with_example_data(directory: str) -> None:
# Create CSV files for each object type
for object_type, records in example_data.items():
_create_csv_file(directory, object_type, records)
_create_csv_file(object_type, records)
def _test_query(directory: str) -> None:
def _test_query() -> None:
"""
Tests querying functionality by verifying:
1. All expected Account IDs are found
@@ -404,7 +401,7 @@ def _test_query(directory: str) -> None:
}
# Get all Account IDs
account_ids = find_ids_by_type(directory, "Account")
account_ids = find_ids_by_type("Account")
# Verify we found all expected accounts
assert len(account_ids) == len(
@@ -416,7 +413,7 @@ def _test_query(directory: str) -> None:
# Verify each account's data
for acc_id in account_ids:
combined = get_record(directory, acc_id)
combined = get_record(acc_id)
assert combined is not None, f"Could not find account {acc_id}"
expected = expected_accounts[acc_id]
@@ -431,7 +428,7 @@ def _test_query(directory: str) -> None:
print("All query tests passed successfully!")
def _test_upsert(directory: str) -> None:
def _test_upsert() -> None:
"""
Tests upsert functionality by:
1. Updating an existing account
@@ -456,10 +453,10 @@ def _test_upsert(directory: str) -> None:
},
]
_create_csv_file(directory, "Account", update_data, "update_data.csv")
_create_csv_file("Account", update_data, "update_data.csv")
# Verify the update worked
updated_record = get_record(directory, _VALID_SALESFORCE_IDS[0])
updated_record = get_record(_VALID_SALESFORCE_IDS[0])
assert updated_record is not None, "Updated record not found"
assert updated_record.data["Name"] == "Acme Inc. Updated", "Name not updated"
assert (
@@ -467,7 +464,7 @@ def _test_upsert(directory: str) -> None:
), "Description not added"
# Verify the new record was created
new_record = get_record(directory, _VALID_SALESFORCE_IDS[2])
new_record = get_record(_VALID_SALESFORCE_IDS[2])
assert new_record is not None, "New record not found"
assert new_record.data["Name"] == "New Company Inc.", "New record name incorrect"
assert new_record.data["AnnualRevenue"] == "1000000", "New record revenue incorrect"
@@ -475,7 +472,7 @@ def _test_upsert(directory: str) -> None:
print("All upsert tests passed successfully!")
def _test_relationships(directory: str) -> None:
def _test_relationships() -> None:
"""
Tests relationship shelf updates and queries by:
1. Creating test data with relationships
@@ -516,11 +513,11 @@ def _test_relationships(directory: str) -> None:
# Create and update CSV files for each object type
for object_type, records in test_data.items():
_create_csv_file(directory, object_type, records, "relationship_test.csv")
_create_csv_file(object_type, records, "relationship_test.csv")
# Test relationship queries
# All these objects should be children of Acme Inc.
child_ids = get_child_ids(directory, _VALID_SALESFORCE_IDS[0])
child_ids = get_child_ids(_VALID_SALESFORCE_IDS[0])
assert len(child_ids) == 4, f"Expected 4 child objects, found {len(child_ids)}"
assert _VALID_SALESFORCE_IDS[13] in child_ids, "Case 1 not found in relationship"
assert _VALID_SALESFORCE_IDS[14] in child_ids, "Case 2 not found in relationship"
@@ -530,7 +527,7 @@ def _test_relationships(directory: str) -> None:
), "Opportunity not found in relationship"
# Test querying relationships for a different account (should be empty)
other_account_children = get_child_ids(directory, _VALID_SALESFORCE_IDS[1])
other_account_children = get_child_ids(_VALID_SALESFORCE_IDS[1])
assert (
len(other_account_children) == 0
), "Expected no children for different account"
@@ -538,7 +535,7 @@ def _test_relationships(directory: str) -> None:
print("All relationship tests passed successfully!")
def _test_account_with_children(directory: str) -> None:
def _test_account_with_children() -> None:
"""
Tests querying all accounts and retrieving their child objects.
This test verifies that:
@@ -547,16 +544,16 @@ def _test_account_with_children(directory: str) -> None:
3. Child object data is complete and accurate
"""
# First get all account IDs
account_ids = find_ids_by_type(directory, "Account")
account_ids = find_ids_by_type("Account")
assert len(account_ids) > 0, "No accounts found"
# For each account, get its children and verify the data
for account_id in account_ids:
account = get_record(directory, account_id)
account = get_record(account_id)
assert account is not None, f"Could not find account {account_id}"
# Get all child objects
child_ids = get_child_ids(directory, account_id)
child_ids = get_child_ids(account_id)
# For Acme Inc., verify specific relationships
if account_id == _VALID_SALESFORCE_IDS[0]: # Acme Inc.
@@ -567,7 +564,7 @@ def _test_account_with_children(directory: str) -> None:
# Get all child records
child_records = []
for child_id in child_ids:
child_record = get_record(directory, child_id)
child_record = get_record(child_id)
if child_record is not None:
child_records.append(child_record)
# Verify Cases
@@ -602,7 +599,7 @@ def _test_account_with_children(directory: str) -> None:
print("All account with children tests passed successfully!")
def _test_relationship_updates(directory: str) -> None:
def _test_relationship_updates() -> None:
"""
Tests that relationships are properly updated when a child object's parent reference changes.
This test verifies:
@@ -619,10 +616,10 @@ def _test_relationship_updates(directory: str) -> None:
"LastName": "Contact",
}
]
_create_csv_file(directory, "Contact", initial_contact, "initial_contact.csv")
_create_csv_file("Contact", initial_contact, "initial_contact.csv")
# Verify initial relationship
acme_children = get_child_ids(directory, _VALID_SALESFORCE_IDS[0])
acme_children = get_child_ids(_VALID_SALESFORCE_IDS[0])
assert (
_VALID_SALESFORCE_IDS[40] in acme_children
), "Initial relationship not created"
@@ -636,22 +633,22 @@ def _test_relationship_updates(directory: str) -> None:
"LastName": "Contact",
}
]
_create_csv_file(directory, "Contact", updated_contact, "updated_contact.csv")
_create_csv_file("Contact", updated_contact, "updated_contact.csv")
# Verify old relationship is removed
acme_children = get_child_ids(directory, _VALID_SALESFORCE_IDS[0])
acme_children = get_child_ids(_VALID_SALESFORCE_IDS[0])
assert (
_VALID_SALESFORCE_IDS[40] not in acme_children
), "Old relationship not removed"
# Verify new relationship is created
globex_children = get_child_ids(directory, _VALID_SALESFORCE_IDS[1])
globex_children = get_child_ids(_VALID_SALESFORCE_IDS[1])
assert _VALID_SALESFORCE_IDS[40] in globex_children, "New relationship not created"
print("All relationship update tests passed successfully!")
def _test_get_affected_parent_ids(directory: str) -> None:
def _test_get_affected_parent_ids() -> None:
"""
Tests get_affected_parent_ids functionality by verifying:
1. IDs that are directly in the parent_types list are included
@@ -686,13 +683,13 @@ def _test_get_affected_parent_ids(directory: str) -> None:
# Create and update CSV files for test data
for object_type, records in test_data.items():
_create_csv_file(directory, object_type, records)
_create_csv_file(object_type, records)
# Test Case 1: Account directly in updated_ids and parent_types
updated_ids = [_VALID_SALESFORCE_IDS[1]] # Parent Account 2
parent_types = ["Account"]
affected_ids_by_type = dict(
get_affected_parent_ids_by_type(directory, updated_ids, parent_types)
get_affected_parent_ids_by_type(updated_ids, parent_types)
)
assert "Account" in affected_ids_by_type, "Account type not in affected_ids_by_type"
assert (
@@ -703,7 +700,7 @@ def _test_get_affected_parent_ids(directory: str) -> None:
updated_ids = [_VALID_SALESFORCE_IDS[40]] # Child Contact
parent_types = ["Account"]
affected_ids_by_type = dict(
get_affected_parent_ids_by_type(directory, updated_ids, parent_types)
get_affected_parent_ids_by_type(updated_ids, parent_types)
)
assert "Account" in affected_ids_by_type, "Account type not in affected_ids_by_type"
assert (
@@ -714,7 +711,7 @@ def _test_get_affected_parent_ids(directory: str) -> None:
updated_ids = [_VALID_SALESFORCE_IDS[1], _VALID_SALESFORCE_IDS[40]] # Both cases
parent_types = ["Account"]
affected_ids_by_type = dict(
get_affected_parent_ids_by_type(directory, updated_ids, parent_types)
get_affected_parent_ids_by_type(updated_ids, parent_types)
)
assert "Account" in affected_ids_by_type, "Account type not in affected_ids_by_type"
affected_ids = affected_ids_by_type["Account"]
@@ -729,7 +726,7 @@ def _test_get_affected_parent_ids(directory: str) -> None:
updated_ids = [_VALID_SALESFORCE_IDS[40]] # Child Contact
parent_types = ["Opportunity"] # Wrong type
affected_ids_by_type = dict(
get_affected_parent_ids_by_type(directory, updated_ids, parent_types)
get_affected_parent_ids_by_type(updated_ids, parent_types)
)
assert len(affected_ids_by_type) == 0, "Should return empty dict when no matches"
@@ -737,15 +734,13 @@ def _test_get_affected_parent_ids(directory: str) -> None:
def test_salesforce_sqlite() -> None:
directory = BASE_DATA_PATH
_clear_sf_db(directory)
init_db(directory)
_create_csv_with_example_data(directory)
_test_query(directory)
_test_upsert(directory)
_test_relationships(directory)
_test_account_with_children(directory)
_test_relationship_updates(directory)
_test_get_affected_parent_ids(directory)
_clear_sf_db(directory)
_clear_sf_db()
init_db()
_create_csv_with_example_data()
_test_query()
_test_upsert()
_test_relationships()
_test_account_with_children()
_test_relationship_updates()
_test_get_affected_parent_ids()
_clear_sf_db()
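To summarize the refactor in this file: every sqlite helper dropped its explicit directory argument and now resolves storage from BASE_DATA_PATH internally. A hedged sketch of the resulting call pattern is below; the function names and signatures come from the new lines above, while the import path is an assumption.

# Sketch only: the module path is assumed, not confirmed by the diff.
from onyx.connectors.salesforce.sqlite_functions import (
    find_ids_by_type,
    get_child_ids,
    get_record,
    init_db,
)

init_db()  # storage location is now derived from BASE_DATA_PATH internally

for account_id in find_ids_by_type("Account"):
    account = get_record(account_id)  # combined record; fields exposed via .data
    if account is None:
        continue
    child_ids = get_child_ids(account_id)  # ids of related child objects
    print(account_id, account.data.get("Name"), len(child_ids))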

View File

@@ -1,529 +0,0 @@
Company,Link
1849-bio,https://x.com/1849bio
1stcollab,https://twitter.com/ycombinator
abundant,https://x.com/abundant_labs
activepieces,https://mobile.twitter.com/mabuaboud
acx,https://twitter.com/ycombinator
adri-ai,https://twitter.com/darshitac_
affil-ai,https://twitter.com/ycombinator
agave,https://twitter.com/moyicat
aglide,https://twitter.com/pdmcguckian
ai-2,https://twitter.com/the_yuppy
ai-sell,https://x.com/liuzjerry
airtrain-ai,https://twitter.com/neutralino1
aisdr,https://twitter.com/YuriyZaremba
alex,https://x.com/DanielEdrisian
alga-biosciences,https://twitter.com/algabiosciences
alguna,https://twitter.com/aleks_djekic
alixia,https://twitter.com/ycombinator
aminoanalytica,https://x.com/lilwuuzivert
anara,https://twitter.com/naveedjanmo
andi,https://twitter.com/MiamiAngela
andoria,https://x.com/dbudimane
andromeda-surgical,https://twitter.com/nickdamian0
anglera,https://twitter.com/ycombinator
angstrom-ai,https://twitter.com/JaviAC7
ankr-health,https://twitter.com/Ankr_us
apoxy,https://twitter.com/ycombinator
apten,https://twitter.com/dho1357
aragorn-ai,https://twitter.com/ycombinator
arc-2,https://twitter.com/DarkMirage
archilabs,https://twitter.com/ycombinator
arcimus,https://twitter.com/husseinsyed73
argovox,https://www.argovox.com/
artemis-search,https://twitter.com/ycombinator
artie,https://x.com/JacquelineSYC19
asklio,https://twitter.com/butterflock
atlas-2,https://twitter.com/jobryan
attain,https://twitter.com/aamir_hudda
autocomputer,https://twitter.com/madhavsinghal_
automat,https://twitter.com/lucas0choa
automorphic,https://twitter.com/sandkoan
autopallet-robotics,https://twitter.com/ycombinator
autumn-labs,https://twitter.com/ycombinator
aviary,https://twitter.com/ycombinator
azuki,https://twitter.com/VamptVo
banabo,https://twitter.com/ycombinator
baseline-ai,https://twitter.com/ycombinator
baserun,https://twitter.com/effyyzhang
benchify,https://www.x.com/maxvonhippel
berry,https://twitter.com/annchanyt
bifrost,https://twitter.com/0xMysterious
bifrost-orbital,https://x.com/ionkarbatra
biggerpicture,https://twitter.com/ycombinator
biocartesian,https://twitter.com/ycombinator
bland-ai,https://twitter.com/zaygranet
blast,https://x.com/useblast
blaze,https://twitter.com/larfy_rothwell
bluebirds,https://twitter.com/RohanPunamia
bluedot,https://twitter.com/selinayfilizp
bluehill-payments,https://twitter.com/HimanshuMinocha
blyss,https://twitter.com/blyssdev
bolto,https://twitter.com/mrinalsingh02?lang=en
botcity,https://twitter.com/lorhancaproni
boundo,https://twitter.com/ycombinator
bramble,https://x.com/meksikanpijha
bricksai,https://twitter.com/ycombinator
broccoli-ai,https://twitter.com/abhishekjain25
bronco-ai,https://twitter.com/dluozhang
bunting-labs,https://twitter.com/normconstant
byterat,https://twitter.com/penelopekjones_
callback,https://twitter.com/ycombinator
cambio-2,https://twitter.com/ycombinator
camfer,https://x.com/AryaBastani
campfire-2,https://twitter.com/ycombinator
campfire-applied-ai-company,https://twitter.com/siamakfr
candid,https://x.com/kesavkosana
canvas,https://x.com/essamsleiman
capsule,https://twitter.com/kelsey_pedersen
cardinal,http://twitter.com/nadavwiz
cardinal-gray,https://twitter.com/ycombinator
cargo,https://twitter.com/aureeaubert
cartage,https://twitter.com/ycombinator
cashmere,https://twitter.com/shashankbuilds
cedalio,https://twitter.com/LucianaReznik
cekura-2,https://x.com/tarush_agarwal_
central,https://twitter.com/nilaymod
champ,https://twitter.com/ycombinator
cheers,https://twitter.com/ycombinator
chequpi,https://twitter.com/sudshekhar02
chima,https://twitter.com/nikharanirghin
cinapse,https://www.twitter.com/hgphillipsiv
ciro,https://twitter.com/davidjwiner
clara,https://x.com/levinsonjon
cleancard,https://twitter.com/_tom_dot_com
clearspace,https://twitter.com/rbfasho
cobbery,https://twitter.com/Dan_The_Goodman
codeviz,https://x.com/liam_prev
coil-inc,https://twitter.com/ycombinator
coldreach,https://twitter.com/ycombinator
combinehealth,https://twitter.com/ycombinator
comfy-deploy,https://twitter.com/nicholaskkao
complete,https://twitter.com/ranimavram
conductor-quantum,https://twitter.com/BrandonSeverin
conduit,https://twitter.com/ycombinator
continue,https://twitter.com/tylerjdunn
contour,https://twitter.com/ycombinator
coperniq,https://twitter.com/abdullahzandani
corgea,https://twitter.com/asadeddin
corgi,https://twitter.com/nico_laqua?lang=en
corgi-labs,https://twitter.com/ycombinator
coris,https://twitter.com/psvinodh
cosine,https://twitter.com/AlistairPullen
courtyard-io,https://twitter.com/lejeunedall
coverage-cat,https://twitter.com/coveragecats
craftos,https://twitter.com/wa3l
craniometrix,https://craniometrix.com
ctgt,https://twitter.com/cyrilgorlla
curo,https://x.com/EnergizedAndrew
dagworks-inc,https://twitter.com/dagworks
dart,https://twitter.com/milad3malek
dashdive,https://twitter.com/micahawheat
dataleap,https://twitter.com/jh_damm
decisional-ai,https://x.com/groovetandon
decoda-health,https://twitter.com/ycombinator
deepsilicon,https://x.com/abhireddy2004
delfino-ai,https://twitter.com/ycombinator
demo-gorilla,https://twitter.com/ycombinator
demospace,https://www.twitter.com/nick_fiacco
dench-com,https://www.twitter.com/markrachapoom
denormalized,https://twitter.com/IAmMattGreen
dev-tools-ai,https://twitter.com/ycombinator
diffusion-studio,https://x.com/MatthiasRuiz22
digitalcarbon,https://x.com/CtrlGuruDelete
dimely,https://x.com/UseDimely
disputeninja,https://twitter.com/legitmaxwu
diversion,https://twitter.com/sasham1
dmodel,https://twitter.com/dmooooon
doctor-droid,https://twitter.com/TheBengaluruGuy
dodo,https://x.com/dominik_moehrle
dojah-inc,https://twitter.com/ololaday
domu-technology-inc,https://twitter.com/ycombinator
dr-treat,https://twitter.com/rakeshtondon
dreamrp,https://x.com/dreamrpofficial
drivingforce,https://twitter.com/drivingforcehq
dynamo-ai,https://twitter.com/dynamo_fl
edgebit,https://twitter.com/robszumski
educato-ai,https://x.com/FelixGabler
electric-air-2,https://twitter.com/JezOsborne
ember,https://twitter.com/hsinleiwang
ember-robotics,https://twitter.com/ycombinator
emergent,https://twitter.com/mukundjha
emobi,https://twitter.com/ycombinator
entangl,https://twitter.com/Shapol_m
envelope,https://twitter.com/joshuakcockrell
et-al,https://twitter.com/ycombinator
eugit-therapeutics,http://www.eugittx.com
eventual,https://twitter.com/sammy_sidhu
evoly,https://twitter.com/ycombinator
expand-ai,https://twitter.com/timsuchanek
ezdubs,https://twitter.com/PadmanabhanKri
fabius,https://twitter.com/adayNU
fazeshift,https://twitter.com/ycombinator
felafax,https://twitter.com/ThatNithin
fetchr,https://twitter.com/CalvinnChenn
fiber-ai,https://twitter.com/AdiAgashe
ficra,https://x.com/ficra_ai
fiddlecube,https://twitter.com/nupoor_neha
finic,https://twitter.com/jfan001
finta,https://www.twitter.com/andywang
fintool,https://twitter.com/nicbstme
finvest,https://twitter.com/shivambharuka
firecrawl,https://x.com/ericciarla
firstwork,https://twitter.com/techie_Shubham
fixa,https://x.com/jonathanzliu
flair-health,https://twitter.com/adivawhocodes
fleek,https://twitter.com/ycombinator
fleetworks,https://twitter.com/ycombinator
flike,https://twitter.com/yajmch
flint-2,https://twitter.com/hungrysohan
floworks,https://twitter.com/sarthaks92
focus-buddy,https://twitter.com/yash14700/
forerunner-ai,https://x.com/willnida0
founders,https://twitter.com/ycombinator
foundry,https://x.com/FoundryAI_
freestyle,https://x.com/benswerd
fresco,https://twitter.com/ycombinator
friday,https://x.com/AllenNaliath
frigade,https://twitter.com/FrigadeHQ
futureclinic,https://twitter.com/usamasyedmd
gait,https://twitter.com/AlexYHsia
galini,https://twitter.com/ycombinator
gauge,https://twitter.com/the1024th
gecko-security,https://x.com/jjjutla
general-analysis,https://twitter.com/ycombinator
giga-ml,https://twitter.com/varunvummadi
glade,https://twitter.com/ycombinator
glass-health,https://twitter.com/dereckwpaul
goodfin,https://twitter.com/ycombinator
grai,https://twitter.com/ycombinator
greenlite,https://twitter.com/will_lawrenceTO
grey,https://www.twitter.com/kingidee
happyrobot,https://twitter.com/pablorpalafox
haystack-software,https://x.com/AkshaySubr42403
health-harbor,https://twitter.com/AlanLiu96
healthspark,https://twitter.com/stephengrinich
hedgehog-2,https://twitter.com/ycombinator
helicone,https://twitter.com/justinstorre
heroui,https://x.com/jrgarciadev
hoai,https://twitter.com/ycombinator
hockeystack,https://twitter.com/ycombinator
hokali,https://twitter.com/hokalico
homeflow,https://twitter.com/ycombinator
hubble-network,https://twitter.com/BenWild10
humand,https://twitter.com/nicolasbenenzon
humanlayer,https://twitter.com/dexhorthy
hydra,https://twitter.com/JoeSciarrino
hyperbound,https://twitter.com/sguduguntla
ideate-xyz,https://twitter.com/nomocodes
inbuild,https://twitter.com/TySharp_iB
indexical,https://twitter.com/try_nebula
industrial-next,https://twitter.com/ycombinator
infisical,https://twitter.com/matsiiako
inkeep,https://twitter.com/nickgomezc
inlet-2,https://twitter.com/inlet_ai
innkeeper,https://twitter.com/tejasybhakta
instant,https://twitter.com/JoeAverbukh
integrated-reasoning,https://twitter.com/d4r5c2
interlock,https://twitter.com/ycombinator
intryc,https://x.com/alexmarantelos?lang=en
invert,https://twitter.com/purrmin
iollo,https://twitter.com/daniel_gomari
jamble,https://twitter.com/ycombinator
joon-health,https://twitter.com/IsaacVanEaves
juicebox,https://twitter.com/davepaffenholz
julius,https://twitter.com/0interestrates
karmen,https://twitter.com/ycombinator
kenley,https://x.com/KenleyAI
keylika,https://twitter.com/buddhachaudhuri
khoj,https://twitter.com/debanjum
kite,https://twitter.com/DerekFeehrer
kivo-health,https://twitter.com/vaughnkoch
knowtex,https://twitter.com/CarolineCZhang
koala,https://twitter.com/studioseinstein?s=11
kopra-bio,https://x.com/AF_Haddad
kura,https://x.com/kura_labs
laminar,https://twitter.com/skull8888888888
lancedb,https://twitter.com/changhiskhan
latent,https://twitter.com/ycombinator
layerup,https://twitter.com/arnavbathla20
lazyeditor,https://twitter.com/jee_cash
ledgerup,https://twitter.com/josephrjohnson
lifelike,https://twitter.com/alecxiang1
lighthouz-ai,https://x.com/srijankedia
lightski,https://www.twitter.com/hansenq
ligo-biosciences,https://x.com/ArdaGoreci/status/1830744265007480934
line-build,https://twitter.com/ycombinator
lingodotdev,https://twitter.com/maxprilutskiy
linkgrep,https://twitter.com/linkgrep
linum,https://twitter.com/schopra909
livedocs,https://twitter.com/arsalanbashir
luca,https://twitter.com/LucaPricingHq
lumenary,https://twitter.com/vivekhaz
lune,https://x.com/samuelp4rk
lynx,https://twitter.com/ycombinator
magic-loops,https://twitter.com/jumploops
manaflow,https://twitter.com/austinywang
mandel-ai,https://twitter.com/shmkkr
martin,https://twitter.com/martinvoiceai
matano,https://twitter.com/AhmedSamrose
mdhub,https://twitter.com/ealamolda
mederva-health,http://twitter.com/sabihmir
medplum,https://twitter.com/ReshmaKhilnani
melty,https://x.com/charliebholtz
mem0,https://twitter.com/taranjeetio
mercator,https://www.twitter.com/ajdstein
mercoa,https://twitter.com/Sarora27
meru,https://twitter.com/rohanarora_
metalware,https://twitter.com/ryanchowww
metriport,https://twitter.com/dimagoncharov_
mica-ai,https://twitter.com/ycombinator
middleware,https://twitter.com/laduramvishnoi
midship,https://twitter.com/_kietay
mintlify,https://twitter.com/hanwangio
minusx,https://twitter.com/nuwandavek
miracle,https://twitter.com/ycombinator
miru-ml,https://twitter.com/armelwtalla
mito-health,https://twitter.com/teemingchew
mocha,https://twitter.com/nichochar
modern-realty,https://x.com/RIsanians
modulari-t,https://twitter.com/ycombinator
mogara,https://twitter.com/ycombinator
monterey-ai,https://twitter.com/chunonline
moonglow,https://twitter.com/leilavclark
moonshine,https://x.com/useMoonshine
moreta,https://twitter.com/ycombinator
mutable-ai,https://x.com/smahsramo
myria,https://twitter.com/reyflemings
nango,https://twitter.com/rguldener
nanograb,https://twitter.com/lauhoyeung
nara,https://twitter.com/join_nara
narrative,https://twitter.com/axitkhurana
nectar,https://twitter.com/AllenWang314
neosync,https://twitter.com/evisdrenova
nerve,https://x.com/fortress_build
networkocean,https://twitter.com/sammendel4
ngrow-ai,https://twitter.com/ycombinator
no-cap,https://x.com/nocapso
nowadays,https://twitter.com/ycombinator
numeral,https://www.twitter.com/mduvall_
obento-health,https://twitter.com/ycombinator
octopipe,https://twitter.com/abhishekray07
odo,https://twitter.com/ycombinator
ofone,https://twitter.com/ycombinator
onetext,http://twitter.com/jfudem
openfunnel,https://x.com/fenilsuchak
opensight,https://twitter.com/OpenSightAI
ora-ai,https://twitter.com/ryan_rl_phelps
orchid,https://twitter.com/ycombinator
origami-agents,https://x.com/fin465
outerbase,https://www.twitter.com/burcs
outerport,https://x.com/yongyuanxi
outset,https://twitter.com/AaronLCannon
overeasy,https://twitter.com/skyflylu
overlap,https://x.com/jbaerofficial
oway,https://twitter.com/owayinc
ozone,https://twitter.com/maxvwolff
pair-ai,https://twitter.com/ycombinator
palmier,https://twitter.com/ycombinator
panora,https://twitter.com/rflih_
parabolic,https://twitter.com/ycombinator
paragon-ai,https://twitter.com/ycombinator
parahelp,https://twitter.com/ankerbachryhl
parity,https://x.com/wilson_spearman
parley,https://twitter.com/ycombinator
patched,https://x.com/rohan_sood15
pearson-labs,https://twitter.com/ycombinator
pelm,https://twitter.com/ycombinator
penguin-ai,https://twitter.com/ycombinator
peoplebox,https://twitter.com/abhichugh
permitflow,https://twitter.com/ycombinator
permitportal,https://twitter.com/rgmazilu
persana-ai,https://www.twitter.com/tweetsreez
pharos,https://x.com/felix_brann
phind,https://twitter.com/michaelroyzen
phonely,https://x.com/phonely_ai
pier,https://twitter.com/ycombinator
pierre,https://twitter.com/fat
pinnacle,https://twitter.com/SeanRoades
pipeshift,https://x.com/FerraoEnrique
pivot,https://twitter.com/raimietang
planbase,https://twitter.com/ycombinator
plover-parametrics,https://twitter.com/ycombinator
plutis,https://twitter.com/kamil_m_ali
poka-labs,https://twitter.com/ycombinator
poly,https://twitter.com/Denizen_Kane
polymath-robotics,https://twitter.com/stefanesa
ponyrun,https://twitter.com/ycombinator
poplarml,https://twitter.com/dnaliu17
posh,https://twitter.com/PoshElectric
power-to-the-brand,https://twitter.com/ycombinator
primevault,https://twitter.com/prashantupd
prohostai,https://twitter.com/bilguunu
promptloop,https://twitter.com/PeterbMangan
propaya,https://x.com/PropayaOfficial
proper,https://twitter.com/kylemaloney_
proprise,https://twitter.com/kragerDev
protegee,https://x.com/kirthibanothu
pump-co,https://www.twitter.com/spndn07/
pumpkin,https://twitter.com/SamuelCrombie
pure,https://twitter.com/collectpure
pylon-2,https://x.com/marty_kausas
pyq-ai,https://twitter.com/araghuvanshi2
query-vary,https://twitter.com/DJFinetunes
rankai,https://x.com/rankai_ai
rastro,https://twitter.com/baptiste_cumin
reactwise,https://twitter.com/ycombinator
read-bean,https://twitter.com/maggieqzhang
readily,https://twitter.com/ycombinator
redouble-ai,https://twitter.com/pneumaticdill?s=21
refine,https://twitter.com/civanozseyhan
reflex,https://twitter.com/getreflex
reforged-labs,https://twitter.com/ycombinator
relace,https://twitter.com/ycombinator
relate,https://twitter.com/chrischae__
remade,https://x.com/Christos_antono
remy,https://twitter.com/ycombinator
remy-2,https://x.com/remysearch
rentflow,https://twitter.com/ycombinator
requestly,https://twitter.com/sachinjain024
resend,https://x.com/zenorocha
respaid,https://twitter.com/johnbanr
reticular,https://x.com/nithinparsan
retrofix-ai,https://twitter.com/danieldoesdev
revamp,https://twitter.com/getrevamp_ai
revyl,https://x.com/landseerenga
reworkd,https://twitter.com/asimdotshrestha
reworks,https://twitter.com/ycombinator
rift,https://twitter.com/FilipTwarowski
riskangle,https://twitter.com/ycombinator
riskcube,https://x.com/andrei_risk
rivet,https://twitter.com/nicholaskissel
riveter-ai,https://x.com/AGrillz
roame,https://x.com/timtqin
roforco,https://x.com/brain_xiang
rome,https://twitter.com/craigzLiszt
roomplays,https://twitter.com/criyaco
rosebud-biosciences,https://twitter.com/KitchenerWilson
rowboat-labs,https://twitter.com/segmenta
rubber-ducky-labs,https://twitter.com/alexandraj777
ruleset,https://twitter.com/LoganFrederick
ryvn,https://x.com/ryvnai
safetykit,https://twitter.com/ycombinator
sage-ai,https://twitter.com/akhilmurthy20
saldor,https://x.com/notblandjacob
salient,https://twitter.com/ycombinator
schemeflow,https://x.com/browninghere
sculpt,https://twitter.com/ycombinator
seals-ai,https://x.com/luismariogm
seis,https://twitter.com/TrevMcKendrick
sensei,https://twitter.com/ycombinator
sensorsurf,https://twitter.com/noahjepstein
sepal-ai,https://www.twitter.com/katqhu1
serial,https://twitter.com/Serialmfg
serif-health,https://www.twitter.com/mfrobben
serra,https://twitter.com/ycombinator
shasta-health,https://twitter.com/SrinjoyMajumdar
shekel-mobility,https://twitter.com/ShekelMobility
shortbread,https://twitter.com/ShortbreadAI
showandtell,https://twitter.com/ycombinator
sidenote,https://twitter.com/jclin22009
sieve,https://twitter.com/mokshith_v
silkchart,https://twitter.com/afakerele
simple-ai,https://twitter.com/catheryn_li
simplehash,https://twitter.com/Alex_Kilkka
simplex,https://x.com/simplexdata
simplifine,https://x.com/egekduman
sizeless,https://twitter.com/cornelius_einem
skyvern,https://x.com/itssuchintan
slingshot,https://twitter.com/ycombinator
snowpilot,https://x.com/snowpilotai
soff,https://x.com/BernhardHausle1
solum-health,https://twitter.com/ycombinator
sonnet,https://twitter.com/ycombinator
sophys,https://twitter.com/ycombinator
sorcerer,https://x.com/big_veech
soteri-skin,https://twitter.com/SoteriSkin
sphere,https://twitter.com/nrudder_
spine-ai,https://twitter.com/BudhkarAkshay
spongecake,https://twitter.com/ycombinator
spur,https://twitter.com/sneha8sivakumar
sre-ai,https://twitter.com/ycombinator
stably,https://x.com/JinjingLiang
stack-ai,https://twitter.com/bernaceituno
stellar,https://twitter.com/ycombinator
stormy-ai-autonomous-marketing-agent,https://twitter.com/karmedge/
strada,https://twitter.com/AmirProd1
stream,https://twitter.com/ycombinator
structured-labs,https://twitter.com/amruthagujjar
studdy,https://twitter.com/mike_lamma
subscriptionflow,https://twitter.com/KashifSaleemCEO
subsets,https://twitter.com/ycombinator
supercontrast,https://twitter.com/ycombinator
supertone,https://twitter.com/trysupertone
superunit,https://x.com/peter_marler
sweep,https://twitter.com/wwzeng1
syncly,https://x.com/synclyhq
synnax,https://x.com/Emilbon99
syntheticfi,https://x.com/SyntheticFi_SF
t3-chat-prev-ping-gg,https://twitter.com/t3dotgg
tableflow,https://twitter.com/mitchpatin
tai,https://twitter.com/Tragen_ai
tandem-2,https://x.com/Tandemspace
taxgpt,https://twitter.com/ChKashifAli
taylor-ai,https://twitter.com/brian_j_kim
teamout,https://twitter.com/ycombinator
tegon,https://twitter.com/harshithb4h
terminal,https://x.com/withterminal
theneo,https://twitter.com/robakid
theya,https://twitter.com/vikasch
thyme,https://twitter.com/ycombinator
tiny,https://twitter.com/ycombinator
tola,https://twitter.com/alencvisic
trainy,https://twitter.com/TrainyAI
trendex-we-tokenize-talent,https://twitter.com/ycombinator
trueplace,https://twitter.com/ycombinator
truewind,https://twitter.com/AlexLee611
trusty,https://twitter.com/trustyhomes
truva,https://twitter.com/gaurav_aggarwal
tuesday,https://twitter.com/kai_jiabo_feng
twenty,https://twitter.com/twentycrm
twine,https://twitter.com/anandvalavalkar
two-dots,https://twitter.com/HensonOrser1
typa,https://twitter.com/sounhochung
typeless,https://twitter.com/ycombinator
unbound,https://twitter.com/ycombinator
undermind,https://twitter.com/UndermindAI
unison,https://twitter.com/maxim_xyz
unlayer,https://twitter.com/adeelraza
unstatiq,https://twitter.com/NishSingaraju
unusual,https://x.com/willwjack
upfront,https://twitter.com/KnowUpfront
vaero,https://twitter.com/ycombinator
vango-ai,https://twitter.com/vango_ai
variance,https://twitter.com/karinemellata
variant,https://twitter.com/bnj
velos,https://twitter.com/OscarMHBF
velt,https://twitter.com/rakesh_goyal
vendra,https://x.com/vendraHQ
vera-health,https://x.com/_maximall
verata,https://twitter.com/ycombinator
versive,https://twitter.com/getversive
vessel,https://twitter.com/vesselapi
vibe,https://twitter.com/ycombinator
videogen,https://twitter.com/ycombinator
vigilant,https://twitter.com/BenShumaker_
vitalize-care,https://twitter.com/nikhiljdsouza
viva-labs,https://twitter.com/vishal_the_jain
vizly,https://twitter.com/vizlyhq
vly-ai-2,https://x.com/victorxheng
vocode,https://twitter.com/kianhooshmand
void,https://x.com/parel_es
voltic,https://twitter.com/ycombinator
vooma,https://twitter.com/jessebucks
wingback,https://twitter.com/tfriehe_
winter,https://twitter.com/AzianMike
wolfia,https://twitter.com/narenmano
wordware,https://twitter.com/kozerafilip
zenbase-ai,https://twitter.com/CyrusOfEden
zeropath,https://x.com/zeropathAI
216 hydra https://twitter.com/JoeSciarrino
217 hyperbound https://twitter.com/sguduguntla
218 ideate-xyz https://twitter.com/nomocodes
219 inbuild https://twitter.com/TySharp_iB
220 indexical https://twitter.com/try_nebula
221 industrial-next https://twitter.com/ycombinator
222 infisical https://twitter.com/matsiiako
223 inkeep https://twitter.com/nickgomezc
224 inlet-2 https://twitter.com/inlet_ai
225 innkeeper https://twitter.com/tejasybhakta
226 instant https://twitter.com/JoeAverbukh
227 integrated-reasoning https://twitter.com/d4r5c2
228 interlock https://twitter.com/ycombinator
229 intryc https://x.com/alexmarantelos?lang=en
230 invert https://twitter.com/purrmin
231 iollo https://twitter.com/daniel_gomari
232 jamble https://twitter.com/ycombinator
233 joon-health https://twitter.com/IsaacVanEaves
234 juicebox https://twitter.com/davepaffenholz
235 julius https://twitter.com/0interestrates
236 karmen https://twitter.com/ycombinator
237 kenley https://x.com/KenleyAI
238 keylika https://twitter.com/buddhachaudhuri
239 khoj https://twitter.com/debanjum
240 kite https://twitter.com/DerekFeehrer
241 kivo-health https://twitter.com/vaughnkoch
242 knowtex https://twitter.com/CarolineCZhang
243 koala https://twitter.com/studioseinstein?s=11
244 kopra-bio https://x.com/AF_Haddad
245 kura https://x.com/kura_labs
246 laminar https://twitter.com/skull8888888888
247 lancedb https://twitter.com/changhiskhan
248 latent https://twitter.com/ycombinator
249 layerup https://twitter.com/arnavbathla20
250 lazyeditor https://twitter.com/jee_cash
251 ledgerup https://twitter.com/josephrjohnson
252 lifelike https://twitter.com/alecxiang1
253 lighthouz-ai https://x.com/srijankedia
254 lightski https://www.twitter.com/hansenq
255 ligo-biosciences https://x.com/ArdaGoreci/status/1830744265007480934
256 line-build https://twitter.com/ycombinator
257 lingodotdev https://twitter.com/maxprilutskiy
258 linkgrep https://twitter.com/linkgrep
259 linum https://twitter.com/schopra909
260 livedocs https://twitter.com/arsalanbashir
261 luca https://twitter.com/LucaPricingHq
262 lumenary https://twitter.com/vivekhaz
263 lune https://x.com/samuelp4rk
264 lynx https://twitter.com/ycombinator
265 magic-loops https://twitter.com/jumploops
266 manaflow https://twitter.com/austinywang
267 mandel-ai https://twitter.com/shmkkr
268 martin https://twitter.com/martinvoiceai
269 matano https://twitter.com/AhmedSamrose
270 mdhub https://twitter.com/ealamolda
271 mederva-health http://twitter.com/sabihmir
272 medplum https://twitter.com/ReshmaKhilnani
273 melty https://x.com/charliebholtz
274 mem0 https://twitter.com/taranjeetio
275 mercator https://www.twitter.com/ajdstein
276 mercoa https://twitter.com/Sarora27
277 meru https://twitter.com/rohanarora_
278 metalware https://twitter.com/ryanchowww
279 metriport https://twitter.com/dimagoncharov_
280 mica-ai https://twitter.com/ycombinator
281 middleware https://twitter.com/laduramvishnoi
282 midship https://twitter.com/_kietay
283 mintlify https://twitter.com/hanwangio
284 minusx https://twitter.com/nuwandavek
285 miracle https://twitter.com/ycombinator
286 miru-ml https://twitter.com/armelwtalla
287 mito-health https://twitter.com/teemingchew
288 mocha https://twitter.com/nichochar
289 modern-realty https://x.com/RIsanians
290 modulari-t https://twitter.com/ycombinator
291 mogara https://twitter.com/ycombinator
292 monterey-ai https://twitter.com/chunonline
293 moonglow https://twitter.com/leilavclark
294 moonshine https://x.com/useMoonshine
295 moreta https://twitter.com/ycombinator
296 mutable-ai https://x.com/smahsramo
297 myria https://twitter.com/reyflemings
298 nango https://twitter.com/rguldener
299 nanograb https://twitter.com/lauhoyeung
300 nara https://twitter.com/join_nara
301 narrative https://twitter.com/axitkhurana
302 nectar https://twitter.com/AllenWang314
303 neosync https://twitter.com/evisdrenova
304 nerve https://x.com/fortress_build
305 networkocean https://twitter.com/sammendel4
306 ngrow-ai https://twitter.com/ycombinator
307 no-cap https://x.com/nocapso
308 nowadays https://twitter.com/ycombinator
309 numeral https://www.twitter.com/mduvall_
310 obento-health https://twitter.com/ycombinator
311 octopipe https://twitter.com/abhishekray07
312 odo https://twitter.com/ycombinator
313 ofone https://twitter.com/ycombinator
314 onetext http://twitter.com/jfudem
315 openfunnel https://x.com/fenilsuchak
316 opensight https://twitter.com/OpenSightAI
317 ora-ai https://twitter.com/ryan_rl_phelps
318 orchid https://twitter.com/ycombinator
319 origami-agents https://x.com/fin465
320 outerbase https://www.twitter.com/burcs
321 outerport https://x.com/yongyuanxi
322 outset https://twitter.com/AaronLCannon
323 overeasy https://twitter.com/skyflylu
324 overlap https://x.com/jbaerofficial
325 oway https://twitter.com/owayinc
326 ozone https://twitter.com/maxvwolff
327 pair-ai https://twitter.com/ycombinator
328 palmier https://twitter.com/ycombinator
329 panora https://twitter.com/rflih_
330 parabolic https://twitter.com/ycombinator
331 paragon-ai https://twitter.com/ycombinator
332 parahelp https://twitter.com/ankerbachryhl
333 parity https://x.com/wilson_spearman
334 parley https://twitter.com/ycombinator
335 patched https://x.com/rohan_sood15
336 pearson-labs https://twitter.com/ycombinator
337 pelm https://twitter.com/ycombinator
338 penguin-ai https://twitter.com/ycombinator
339 peoplebox https://twitter.com/abhichugh
340 permitflow https://twitter.com/ycombinator
341 permitportal https://twitter.com/rgmazilu
342 persana-ai https://www.twitter.com/tweetsreez
343 pharos https://x.com/felix_brann
344 phind https://twitter.com/michaelroyzen
345 phonely https://x.com/phonely_ai
346 pier https://twitter.com/ycombinator
347 pierre https://twitter.com/fat
348 pinnacle https://twitter.com/SeanRoades
349 pipeshift https://x.com/FerraoEnrique
350 pivot https://twitter.com/raimietang
351 planbase https://twitter.com/ycombinator
352 plover-parametrics https://twitter.com/ycombinator
353 plutis https://twitter.com/kamil_m_ali
354 poka-labs https://twitter.com/ycombinator
355 poly https://twitter.com/Denizen_Kane
356 polymath-robotics https://twitter.com/stefanesa
357 ponyrun https://twitter.com/ycombinator
358 poplarml https://twitter.com/dnaliu17
359 posh https://twitter.com/PoshElectric
360 power-to-the-brand https://twitter.com/ycombinator
361 primevault https://twitter.com/prashantupd
362 prohostai https://twitter.com/bilguunu
363 promptloop https://twitter.com/PeterbMangan
364 propaya https://x.com/PropayaOfficial
365 proper https://twitter.com/kylemaloney_
366 proprise https://twitter.com/kragerDev
367 protegee https://x.com/kirthibanothu
368 pump-co https://www.twitter.com/spndn07/
369 pumpkin https://twitter.com/SamuelCrombie
370 pure https://twitter.com/collectpure
371 pylon-2 https://x.com/marty_kausas
372 pyq-ai https://twitter.com/araghuvanshi2
373 query-vary https://twitter.com/DJFinetunes
374 rankai https://x.com/rankai_ai
375 rastro https://twitter.com/baptiste_cumin
376 reactwise https://twitter.com/ycombinator
377 read-bean https://twitter.com/maggieqzhang
378 readily https://twitter.com/ycombinator
379 redouble-ai https://twitter.com/pneumaticdill?s=21
380 refine https://twitter.com/civanozseyhan
381 reflex https://twitter.com/getreflex
382 reforged-labs https://twitter.com/ycombinator
383 relace https://twitter.com/ycombinator
384 relate https://twitter.com/chrischae__
385 remade https://x.com/Christos_antono
386 remy https://twitter.com/ycombinator
387 remy-2 https://x.com/remysearch
388 rentflow https://twitter.com/ycombinator
389 requestly https://twitter.com/sachinjain024
390 resend https://x.com/zenorocha
391 respaid https://twitter.com/johnbanr
392 reticular https://x.com/nithinparsan
393 retrofix-ai https://twitter.com/danieldoesdev
394 revamp https://twitter.com/getrevamp_ai
395 revyl https://x.com/landseerenga
396 reworkd https://twitter.com/asimdotshrestha
397 reworks https://twitter.com/ycombinator
398 rift https://twitter.com/FilipTwarowski
399 riskangle https://twitter.com/ycombinator
400 riskcube https://x.com/andrei_risk
401 rivet https://twitter.com/nicholaskissel
402 riveter-ai https://x.com/AGrillz
403 roame https://x.com/timtqin
404 roforco https://x.com/brain_xiang
405 rome https://twitter.com/craigzLiszt
406 roomplays https://twitter.com/criyaco
407 rosebud-biosciences https://twitter.com/KitchenerWilson
408 rowboat-labs https://twitter.com/segmenta
409 rubber-ducky-labs https://twitter.com/alexandraj777
410 ruleset https://twitter.com/LoganFrederick
411 ryvn https://x.com/ryvnai
412 safetykit https://twitter.com/ycombinator
413 sage-ai https://twitter.com/akhilmurthy20
414 saldor https://x.com/notblandjacob
415 salient https://twitter.com/ycombinator
416 schemeflow https://x.com/browninghere
417 sculpt https://twitter.com/ycombinator
418 seals-ai https://x.com/luismariogm
419 seis https://twitter.com/TrevMcKendrick
420 sensei https://twitter.com/ycombinator
421 sensorsurf https://twitter.com/noahjepstein
422 sepal-ai https://www.twitter.com/katqhu1
423 serial https://twitter.com/Serialmfg
424 serif-health https://www.twitter.com/mfrobben
425 serra https://twitter.com/ycombinator
426 shasta-health https://twitter.com/SrinjoyMajumdar
427 shekel-mobility https://twitter.com/ShekelMobility
428 shortbread https://twitter.com/ShortbreadAI
429 showandtell https://twitter.com/ycombinator
430 sidenote https://twitter.com/jclin22009
431 sieve https://twitter.com/mokshith_v
432 silkchart https://twitter.com/afakerele
433 simple-ai https://twitter.com/catheryn_li
434 simplehash https://twitter.com/Alex_Kilkka
435 simplex https://x.com/simplexdata
436 simplifine https://x.com/egekduman
437 sizeless https://twitter.com/cornelius_einem
438 skyvern https://x.com/itssuchintan
439 slingshot https://twitter.com/ycombinator
440 snowpilot https://x.com/snowpilotai
441 soff https://x.com/BernhardHausle1
442 solum-health https://twitter.com/ycombinator
443 sonnet https://twitter.com/ycombinator
444 sophys https://twitter.com/ycombinator
445 sorcerer https://x.com/big_veech
446 soteri-skin https://twitter.com/SoteriSkin
447 sphere https://twitter.com/nrudder_
448 spine-ai https://twitter.com/BudhkarAkshay
449 spongecake https://twitter.com/ycombinator
450 spur https://twitter.com/sneha8sivakumar
451 sre-ai https://twitter.com/ycombinator
452 stably https://x.com/JinjingLiang
453 stack-ai https://twitter.com/bernaceituno
454 stellar https://twitter.com/ycombinator
455 stormy-ai-autonomous-marketing-agent https://twitter.com/karmedge/
456 strada https://twitter.com/AmirProd1
457 stream https://twitter.com/ycombinator
458 structured-labs https://twitter.com/amruthagujjar
459 studdy https://twitter.com/mike_lamma
460 subscriptionflow https://twitter.com/KashifSaleemCEO
461 subsets https://twitter.com/ycombinator
462 supercontrast https://twitter.com/ycombinator
463 supertone https://twitter.com/trysupertone
464 superunit https://x.com/peter_marler
465 sweep https://twitter.com/wwzeng1
466 syncly https://x.com/synclyhq
467 synnax https://x.com/Emilbon99
468 syntheticfi https://x.com/SyntheticFi_SF
469 t3-chat-prev-ping-gg https://twitter.com/t3dotgg
470 tableflow https://twitter.com/mitchpatin
471 tai https://twitter.com/Tragen_ai
472 tandem-2 https://x.com/Tandemspace
473 taxgpt https://twitter.com/ChKashifAli
474 taylor-ai https://twitter.com/brian_j_kim
475 teamout https://twitter.com/ycombinator
476 tegon https://twitter.com/harshithb4h
477 terminal https://x.com/withterminal
478 theneo https://twitter.com/robakid
479 theya https://twitter.com/vikasch
480 thyme https://twitter.com/ycombinator
481 tiny https://twitter.com/ycombinator
482 tola https://twitter.com/alencvisic
483 trainy https://twitter.com/TrainyAI
484 trendex-we-tokenize-talent https://twitter.com/ycombinator
485 trueplace https://twitter.com/ycombinator
486 truewind https://twitter.com/AlexLee611
487 trusty https://twitter.com/trustyhomes
488 truva https://twitter.com/gaurav_aggarwal
489 tuesday https://twitter.com/kai_jiabo_feng
490 twenty https://twitter.com/twentycrm
491 twine https://twitter.com/anandvalavalkar
492 two-dots https://twitter.com/HensonOrser1
493 typa https://twitter.com/sounhochung
494 typeless https://twitter.com/ycombinator
495 unbound https://twitter.com/ycombinator
496 undermind https://twitter.com/UndermindAI
497 unison https://twitter.com/maxim_xyz
498 unlayer https://twitter.com/adeelraza
499 unstatiq https://twitter.com/NishSingaraju
500 unusual https://x.com/willwjack
501 upfront https://twitter.com/KnowUpfront
502 vaero https://twitter.com/ycombinator
503 vango-ai https://twitter.com/vango_ai
504 variance https://twitter.com/karinemellata
505 variant https://twitter.com/bnj
506 velos https://twitter.com/OscarMHBF
507 velt https://twitter.com/rakesh_goyal
508 vendra https://x.com/vendraHQ
509 vera-health https://x.com/_maximall
510 verata https://twitter.com/ycombinator
511 versive https://twitter.com/getversive
512 vessel https://twitter.com/vesselapi
513 vibe https://twitter.com/ycombinator
514 videogen https://twitter.com/ycombinator
515 vigilant https://twitter.com/BenShumaker_
516 vitalize-care https://twitter.com/nikhiljdsouza
517 viva-labs https://twitter.com/vishal_the_jain
518 vizly https://twitter.com/vizlyhq
519 vly-ai-2 https://x.com/victorxheng
520 vocode https://twitter.com/kianhooshmand
521 void https://x.com/parel_es
522 voltic https://twitter.com/ycombinator
523 vooma https://twitter.com/jessebucks
524 wingback https://twitter.com/tfriehe_
525 winter https://twitter.com/AzianMike
526 wolfia https://twitter.com/narenmano
527 wordware https://twitter.com/kozerafilip
528 zenbase-ai https://twitter.com/CyrusOfEden
529 zeropath https://x.com/zeropathAI

View File

@@ -1,29 +0,0 @@
import csv

companies = {}
with open("twitter_links.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        parts = line.split(":", 1)
        if len(parts) != 2:
            continue
        company, url = parts
        url = url.strip()
        # Store only the first URL for each company
        if company not in companies:
            companies[company] = url

# Write to CSV
with open("company_links.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Company", "Link"])
    for company, url in sorted(companies.items()):
        writer.writerow([company, url])

print(f"Deduped {len(companies)} companies to company_links.csv")

View File

@@ -1,68 +0,0 @@
# Onyx AWS ECS Fargate CloudFormation Deployment
This directory contains CloudFormation templates and scripts to deploy Onyx on AWS ECS Fargate.
## Configuration
All configuration parameters are stored in a single JSON file: `onyx_config.json`. This file contains all the parameters needed for the different CloudFormation stacks.
Example:
```json
{
  "OnyxNamespace": "onyx",
  "Environment": "production",
  "EFSName": "onyx-efs",
  "AWSRegion": "us-east-2",
  "VpcID": "YOUR_VPC_ID",
  "SubnetIDs": "YOUR_SUBNET_ID1,YOUR_SUBNET_ID2",
  "DomainName": "YOUR_DOMAIN, e.g. ecs.onyx.app",
  "ValidationMethod": "DNS",
  "HostedZoneId": ""
}
```
### Required Parameters
- `Environment`: Used to prefix all stack names during deployment. This is required.
- `OnyxNamespace`: Namespace for the Onyx deployment.
- `EFSName`: Name for the Elastic File System.
- `AWSRegion`: AWS region where resources will be deployed.
- `VpcID`: ID of the VPC where Onyx will be deployed.
- `SubnetIDs`: Comma-separated list of subnet IDs for deployment.
- `DomainName`: Domain name for the Onyx deployment.
- `ValidationMethod`: Method for domain validation (typically "DNS").
- [optional] `HostedZoneId`: Route 53 hosted zone ID (only if using Route 53 for DNS).
The deployment script automatically extracts the needed parameters for each CloudFormation template based on the parameter names defined in the templates.
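For illustration, the core of that extraction can be reproduced as a one-liner. This is a minimal sketch, assuming `jq` is installed and the config file is named `onyx_config.json`; `deploy.sh` produces the same ParameterKey/ParameterValue list per template, with additional validation and temporary parameter files:

```bash
# Strip // comments, then emit the parameter list expected by
# `aws cloudformation deploy --parameter-overrides file://...`
sed 's|//.*$||' onyx_config.json | jq '
  [ to_entries[]
    | select(.value != null and .value != "")
    | { ParameterKey: .key, ParameterValue: (.value | tostring) } ]'
```
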
## Deployment Order
The deployment follows this order:
1. Infrastructure stacks:
   - EFS
   - Cluster
   - ACM
2. Service stacks:
   - Postgres
   - Redis
   - Vespa Engine
   - Model Server (Indexing)
   - Model Server (Inference)
   - Backend API Server
   - Backend Background Server
   - Web Server
   - Nginx
## Usage
To deploy:
```bash
./deploy.sh
```
To uninstall:
```bash
./uninstall.sh
```
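
`deploy.sh` shells out to the AWS CLI and `jq`, so both need to be installed and AWS credentials configured before running it. For example (the profile name below is illustrative, not something defined by this repo):

```bash
# Deploy with an explicit AWS CLI profile; the region is read from
# AWSRegion in onyx_config.json, falling back to us-east-2.
AWS_PROFILE=onyx-deploy ./deploy.sh
```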

View File

@@ -1,194 +0,0 @@
#!/bin/bash
# Function to remove comments from JSON and output valid JSON
remove_comments() {
sed 's/\/\/.*$//' "$1" | grep -v '^[[:space:]]*$'
}
# Variables
TEMPLATE_DIR="$(pwd)"
SERVICE_DIR="$TEMPLATE_DIR/services"
# Unified config file
CONFIG_FILE="onyx_config.json"
# Try to get AWS_REGION from config, fallback to default if not found
AWS_REGION_FROM_CONFIG=$(remove_comments "$CONFIG_FILE" | jq -r '.AWSRegion // empty')
if [ -n "$AWS_REGION_FROM_CONFIG" ]; then
AWS_REGION="$AWS_REGION_FROM_CONFIG"
else
AWS_REGION="${AWS_REGION:-us-east-2}"
fi
# Get environment from config file
ENVIRONMENT=$(remove_comments "$CONFIG_FILE" | jq -r '.Environment')
if [ -z "$ENVIRONMENT" ] || [ "$ENVIRONMENT" == "null" ]; then
echo "Missing Environment in $CONFIG_FILE. Please add the Environment field."
exit 1
fi
# Try to get S3_BUCKET from config, fallback to default if not found
S3_BUCKET_FROM_CONFIG=$(remove_comments "$CONFIG_FILE" | jq -r '.S3Bucket // empty')
if [ -n "$S3_BUCKET_FROM_CONFIG" ]; then
S3_BUCKET="$S3_BUCKET_FROM_CONFIG"
else
S3_BUCKET="${S3_BUCKET:-onyx-ecs-fargate-configs}"
fi
INFRA_ORDER=(
"onyx_efs_template.yaml"
"onyx_cluster_template.yaml"
"onyx_acm_template.yaml"
)
# Deployment order for services
SERVICE_ORDER=(
"onyx_postgres_service_template.yaml"
"onyx_redis_service_template.yaml"
"onyx_vespaengine_service_template.yaml"
"onyx_model_server_indexing_service_template.yaml"
"onyx_model_server_inference_service_template.yaml"
"onyx_backend_api_server_service_template.yaml"
"onyx_backend_background_server_service_template.yaml"
"onyx_web_server_service_template.yaml"
"onyx_nginx_service_template.yaml"
)
# Function to validate a CloudFormation template
validate_template() {
local template_file=$1
echo "Validating template: $template_file..."
aws cloudformation validate-template --template-body file://"$template_file" --region "$AWS_REGION" > /dev/null
if [ $? -ne 0 ]; then
echo "Error: Validation failed for $template_file. Exiting."
exit 1
fi
echo "Validation succeeded for $template_file."
}
# Function to create CloudFormation parameters from JSON
create_parameters_from_json() {
local template_file=$1
local temp_params_file="${template_file%.yaml}_parameters.json"
# Convert the config file contents to CloudFormation parameter format
echo "[" > "$temp_params_file"
# Process all key-value pairs from the config file
local first=true
remove_comments "$CONFIG_FILE" | jq -r 'to_entries[] | select(.value != null and .value != "") | "\(.key)|\(.value)"' | while IFS='|' read -r key value; do
if [ "$first" = true ]; then
first=false
else
echo "," >> "$temp_params_file"
fi
echo " {\"ParameterKey\": \"$key\", \"ParameterValue\": \"$value\"}" >> "$temp_params_file"
done
echo "]" >> "$temp_params_file"
# Debug output - display the created parameters file
echo "Generated parameters file: $temp_params_file" >&2
echo "Contents:" >&2
cat "$temp_params_file" >&2
# Return just the filename
echo "$temp_params_file"
}
# Function to deploy a CloudFormation stack
deploy_stack() {
local stack_name=$1
local template_file=$2
echo "Checking if stack $stack_name exists..."
if aws cloudformation describe-stacks --stack-name "$stack_name" --region "$AWS_REGION" > /dev/null 2>&1; then
echo "Stack $stack_name already exists. Skipping deployment."
return 0
fi
# Create temporary parameters file for this template
local temp_params_file=$(create_parameters_from_json "$template_file")
# Special handling for SubnetIDs parameter if needed
if grep -q "SubnetIDs" "$template_file"; then
echo "Template uses SubnetIDs parameter, ensuring it's properly formatted..."
# Make sure we're passing SubnetIDs as a comma-separated list
local subnet_ids=$(remove_comments "$CONFIG_FILE" | jq -r '.SubnetIDs // empty')
if [ -n "$subnet_ids" ]; then
echo "Using SubnetIDs from config: $subnet_ids"
else
echo "Warning: SubnetIDs not found in config but template requires it."
fi
fi
echo "Deploying stack: $stack_name with template: $template_file and generated config from: $CONFIG_FILE..."
aws cloudformation deploy \
--stack-name "$stack_name" \
--template-file "$template_file" \
--parameter-overrides file://"$temp_params_file" \
--capabilities CAPABILITY_IAM CAPABILITY_NAMED_IAM CAPABILITY_AUTO_EXPAND \
--region "$AWS_REGION" \
--no-cli-auto-prompt > /dev/null
if [ $? -ne 0 ]; then
echo "Error: Deployment failed for $stack_name. Exiting."
exit 1
fi
# Clean up temporary parameter file
rm "$temp_params_file"
echo "Stack deployed successfully: $stack_name."
}
convert_underscores_to_hyphens() {
local input_string="$1"
local converted_string="${input_string//_/-}"
echo "$converted_string"
}
deploy_infra_stacks() {
for template_name in "${INFRA_ORDER[@]}"; do
# Skip ACM template if HostedZoneId is not set
if [[ "$template_name" == "onyx_acm_template.yaml" ]]; then
HOSTED_ZONE_ID=$(remove_comments "$CONFIG_FILE" | jq -r '.HostedZoneId')
if [ -z "$HOSTED_ZONE_ID" ] || [ "$HOSTED_ZONE_ID" == "" ] || [ "$HOSTED_ZONE_ID" == "null" ]; then
echo "Skipping ACM template deployment because HostedZoneId is not set in $CONFIG_FILE"
continue
fi
fi
template_file="$template_name"
stack_name="$ENVIRONMENT-$(basename "$template_name" _template.yaml)"
stack_name=$(convert_underscores_to_hyphens "$stack_name")
if [ -f "$template_file" ]; then
validate_template "$template_file"
deploy_stack "$stack_name" "$template_file"
else
echo "Warning: Template file $template_file not found. Skipping."
fi
done
}
deploy_services_stacks() {
for template_name in "${SERVICE_ORDER[@]}"; do
template_file="$SERVICE_DIR/$template_name"
stack_name="$ENVIRONMENT-$(basename "$template_name" _template.yaml)"
stack_name=$(convert_underscores_to_hyphens "$stack_name")
if [ -f "$template_file" ]; then
validate_template "$template_file"
deploy_stack "$stack_name" "$template_file"
else
echo "Warning: Template file $template_file not found. Skipping."
fi
done
}
echo "Starting deployment of Onyx to ECS Fargate Cluster..."
deploy_infra_stacks
deploy_services_stacks
echo "All templates validated and deployed successfully."

View File

@@ -1,31 +0,0 @@
AWSTemplateFormatVersion: '2010-09-09'
Description: CloudFormation template to create an ACM Certificate.
Parameters:
DomainName:
Type: String
Description: The primary domain name for the certificate (e.g., example.com).
Default: example.com
Environment:
Type: String
Default: production
ValidationMethod:
Type: String
Default: DNS
Resources:
Certificate:
Type: AWS::CertificateManager::Certificate
Properties:
DomainName: !Ref DomainName
ValidationMethod: !Ref ValidationMethod
Tags:
- Key: env
Value: !Ref Environment
Outputs:
OutputAcm:
Description: ACM Cert Id
Value: !Ref Certificate
Export:
Name: !Sub ${AWS::StackName}-OnyxCertificate

View File

@@ -1,156 +0,0 @@
AWSTemplateFormatVersion: "2010-09-09"
Description: The template used to create an ECS Cluster from the ECS Console.
Parameters:
Environment:
Type: String
Description: The environment that is used in the name of the cluster as well.
OnyxNamespace:
Type: String
Default: onyx
VpcID:
Type: String
Default: vpc-098cfa79d637dabff
Resources:
ECSCluster:
Type: AWS::ECS::Cluster
Properties:
ClusterName: !Sub ${Environment}-onyx-cluster
CapacityProviders:
- FARGATE
- FARGATE_SPOT
ClusterSettings:
- Name: containerInsights
Value: enhanced
ServiceConnectDefaults:
Namespace: !Sub ${Environment}-onyx-cluster
Tags:
- Key: env
Value: !Ref Environment
- Key: app
Value: onyx
S3Bucket:
Type: AWS::S3::Bucket
Properties:
BucketName: !Sub ${Environment}-onyx-ecs-fargate-configs
AccessControl: Private
BucketEncryption:
ServerSideEncryptionConfiguration:
- ServerSideEncryptionByDefault:
SSEAlgorithm: AES256
PublicAccessBlockConfiguration:
BlockPublicAcls: true
BlockPublicPolicy: true
IgnorePublicAcls: true
RestrictPublicBuckets: true
PrivateDnsNamespace:
Type: AWS::ServiceDiscovery::PrivateDnsNamespace
Properties:
Description: AWS Cloud Map private DNS namespace for Onyx ECS service resources.
Vpc: !Ref VpcID
Name: !Ref OnyxNamespace
Properties:
DnsProperties:
SOA:
TTL: 50
ECSTaskRole:
Type: AWS::IAM::Role
Properties:
RoleName: !Sub ${Environment}-OnyxEcsTaskRole
AssumeRolePolicyDocument:
Version: "2012-10-17"
Statement:
- Effect: Allow
Principal:
Service: ecs-tasks.amazonaws.com
Action: sts:AssumeRole
Policies:
- PolicyName: "EFSPolicy"
PolicyDocument:
Version: "2012-10-17"
Statement:
- Sid: "VisualEditor0"
Effect: Allow
Action:
- "elasticfilesystem:*"
Resource:
- !Sub "arn:aws:elasticfilesystem:*:${AWS::AccountId}:access-point/*"
- !Sub "arn:aws:elasticfilesystem:*:${AWS::AccountId}:file-system/*"
- Sid: "VisualEditor1"
Effect: Allow
Action: "elasticfilesystem:*"
Resource: "*"
- PolicyName: "S3Policy"
PolicyDocument:
Version: "2012-10-17"
Statement:
- Sid: "VisualEditor0"
Effect: Allow
Action:
- "s3:GetObject"
- "s3:ListBucket"
Resource:
- !Sub "arn:aws:s3:::${Environment}-onyx-ecs-fargate-configs/*"
- !Sub "arn:aws:s3:::${Environment}-onyx-ecs-fargate-configs"
ECSTaskExecutionRole:
Type: AWS::IAM::Role
Properties:
RoleName: !Sub ${Environment}-OnyxECSTaskExecutionRole
AssumeRolePolicyDocument:
Version: "2012-10-17"
Statement:
- Effect: Allow
Principal:
Service: ecs-tasks.amazonaws.com
Action: sts:AssumeRole
ManagedPolicyArns:
- arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy
Policies:
- PolicyName: "CloudWatchLogsPolicy"
PolicyDocument:
Version: "2012-10-17"
Statement:
- Sid: "VisualEditor0"
Effect: Allow
Action: "logs:CreateLogGroup"
Resource: !Sub "arn:aws:logs:*:${AWS::AccountId}:log-group:*"
- PolicyName: "SecretsManagerPolicy"
PolicyDocument:
Version: "2012-10-17"
Statement:
- Effect: Allow
Action:
- secretsmanager:GetSecretValue
Resource: !Sub arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${Environment}/postgres/user/password-*
Outputs:
OutputEcsCluster:
Description: Onyx ECS Cluster
Value: !Ref ECSCluster
Export:
Name: !Sub ${AWS::StackName}-ECSClusterName
OutputECSTaskRole:
Description: Onyx ECS Task Role
Value: !Ref ECSTaskRole
Export:
Name: !Sub ${AWS::StackName}-ECSTaskRole
OutputECSTaskExecutionRole:
Description: Onyx ECS TaskExecutionRole
Value: !Ref ECSTaskExecutionRole
Export:
Name: !Sub ${AWS::StackName}-ECSTaskExecutionRole
OutputOnyxNamespace:
Description: Onyx CloudMap namespace ID for ECS service discovery.
Value: !Ref PrivateDnsNamespace
Export:
Name: !Sub ${AWS::StackName}-OnyxNamespace
OutputOnyxNamespaceName:
Description: Onyx CloudMap namespace domain name for ECS service discovery.
Value: !Ref OnyxNamespace
Export:
Name: !Sub ${AWS::StackName}-OnyxNamespaceName

View File

@@ -1,16 +0,0 @@
{
  // Naming, likely doesn't need to be changed
  "OnyxNamespace": "onyx",
  "Environment": "production",
  "EFSName": "onyx-efs",
  // Region and VPC Stuff
  "AWSRegion": "us-east-2",
  "VpcID": "YOUR_VPC_ID",
  "SubnetIDs": "YOUR_SUBNET_ID1,YOUR_SUBNET_ID2",
  // Domain and ACM Stuff
  "DomainName": "YOUR_DOMAIN, e.g. ecs.onyx.app",
  "ValidationMethod": "DNS",
  "HostedZoneId": "" // Only specify if using Route 53 for DNS
}

View File

@@ -1,128 +0,0 @@
Parameters:
EFSName:
Type: String
Default: onyx-efs
Environment:
Type: String
Default: production
VpcID:
Type: String
Default: vpc-0f230ca52bb04c722
SubnetIDs:
Type: CommaDelimitedList
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
Resources:
OnyxEfs:
Type: AWS::EFS::FileSystem
Properties:
BackupPolicy:
Status: ENABLED
Encrypted: True
PerformanceMode: generalPurpose
FileSystemTags:
- Key: Name
Value: !Sub ${Environment}-${EFSName}-${AWS::Region}-${AWS::AccountId}
FileSystemProtection:
ReplicationOverwriteProtection: ENABLED
ThroughputMode: elastic
VespaEngineTmpEfsAccessPoint:
Type: AWS::EFS::AccessPoint
Properties:
AccessPointTags:
- Key: Name
Value: vespaengine-tmp
FileSystemId: !Ref OnyxEfs
RootDirectory:
CreationInfo:
OwnerGid: "1000"
OwnerUid: "1000"
Permissions: "0755"
Path: /var/tmp
VespaEngineDataEfsAccessPoint:
Type: AWS::EFS::AccessPoint
Properties:
AccessPointTags:
- Key: Name
Value: vespaengine-data
FileSystemId: !Ref OnyxEfs
RootDirectory:
CreationInfo:
OwnerGid: "1000"
OwnerUid: "1000"
Permissions: "0755"
Path: /opt/vespa/var
PostgresDataEfsAccessPoint:
Type: AWS::EFS::AccessPoint
Properties:
AccessPointTags:
- Key: Name
Value: postgres-data
FileSystemId: !Ref OnyxEfs
RootDirectory:
CreationInfo:
OwnerGid: "1000"
OwnerUid: "1000"
Permissions: "0755"
Path: /var/lib/postgresql/data
EFSMountTarget1:
DependsOn: OnyxEfs
Type: AWS::EFS::MountTarget
Properties:
FileSystemId: !Ref OnyxEfs
SubnetId: !Select [0, !Ref SubnetIDs]
SecurityGroups:
- !Ref EFSSecurityGroupMountTargets
EFSMountTarget2:
DependsOn: OnyxEfs
Type: AWS::EFS::MountTarget
Properties:
FileSystemId: !Ref OnyxEfs
SubnetId: !Select [1, !Ref SubnetIDs]
SecurityGroups:
- !Ref EFSSecurityGroupMountTargets
EFSSecurityGroupMountTargets:
Type: AWS::EC2::SecurityGroup
Properties:
GroupDescription: Security Group for EFS Mount Targets
VpcId: !Ref VpcID
SecurityGroupIngress:
- IpProtocol: tcp
FromPort: 2049
ToPort: 2049
CidrIp: 0.0.0.0/0
Outputs:
OutputOnyxEfsId:
Description: Onyx Filesystem Id
Value: !Ref OnyxEfs
Export:
Name: !Sub ${AWS::StackName}-OnyxEfsId
OutputVespaEngineTmpEfsAccessPoint:
Description: VespaEngine Tmp AP
Value: !Ref VespaEngineTmpEfsAccessPoint
Export:
Name: !Sub ${AWS::StackName}-VespaEngineTmpEfsAccessPoint
OutputVespaEngineDataEfsAccessPoint:
Description: VespaEngine Data Ap
Value: !Ref VespaEngineDataEfsAccessPoint
Export:
Name: !Sub ${AWS::StackName}-VespaEngineDataEfsAccessPoint
OutputPostgresDataEfsAccessPoint:
Description: Postgres Data AP
Value: !Ref PostgresDataEfsAccessPoint
Export:
Name: !Sub ${AWS::StackName}-PostgresDataEfsAccessPoint
OutputEFSSecurityGroupMountTargets:
Description: EFS Security Group
Value: !Ref EFSSecurityGroupMountTargets
Export:
Name: !Sub ${AWS::StackName}-EFSSecurityGroupMountTargets

View File

@@ -1,216 +0,0 @@
AWSTemplateFormatVersion: "2010-09-09"
Description: CloudFormation template for Onyx Backend Api Server TaskDefinition
Parameters:
Environment:
Type: String
SubnetIDs:
Type: CommaDelimitedList
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
VpcID:
Type: String
Default: vpc-098cfa79d637dabff
ServiceName:
Type: String
Default: onyx-backend-api-server
TaskCpu:
Type: String
Default: "2048"
TaskMemory:
Type: String
Default: "4096"
TaskDesiredCount:
Type: Number
Default: 1
Resources:
ECSService:
Type: AWS::ECS::Service
Properties:
Cluster:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
CapacityProviderStrategy:
- CapacityProvider: FARGATE
Base: 0
Weight: 1
TaskDefinition: !Ref TaskDefinition
ServiceName: !Sub ${Environment}-${ServiceName}-service
SchedulingStrategy: REPLICA
DesiredCount: !Ref TaskDesiredCount
AvailabilityZoneRebalancing: ENABLED
NetworkConfiguration:
AwsvpcConfiguration:
AssignPublicIp: ENABLED
SecurityGroups:
- Ref: SecurityGroup
Subnets: !Ref SubnetIDs
PlatformVersion: LATEST
DeploymentConfiguration:
MaximumPercent: 200
MinimumHealthyPercent: 100
DeploymentCircuitBreaker:
Enable: true
Rollback: true
DeploymentController:
Type: ECS
ServiceConnectConfiguration:
Enabled: false
ServiceRegistries:
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
Tags:
- Key: app
Value: onyx
- Key: service
Value: !Ref ServiceName
- Key: env
Value: !Ref Environment
EnableECSManagedTags: true
SecurityGroup:
Type: AWS::EC2::SecurityGroup
Properties:
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
GroupName: !Sub ${Environment}-ecs-${ServiceName}
VpcId: !Ref VpcID
SecurityGroupIngress:
- FromPort: 8080
ToPort: 8080
IpProtocol: tcp
CidrIp: 0.0.0.0/0
- FromPort: 8080
ToPort: 8080
IpProtocol: tcp
CidrIpv6: "::/0"
ServiceDiscoveryService:
Type: "AWS::ServiceDiscovery::Service"
Properties:
Name: !Sub ${Environment}-${ServiceName}-service
DnsConfig:
DnsRecords:
- Type: "A"
TTL: 15
NamespaceId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
HealthCheckCustomConfig:
FailureThreshold: 1
TaskDefinition:
Type: AWS::ECS::TaskDefinition
Properties:
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
TaskRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
ExecutionRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
NetworkMode: awsvpc
RequiresCompatibilities:
- FARGATE
Cpu: !Ref TaskCpu
Memory: !Ref TaskMemory
RuntimePlatform:
CpuArchitecture: ARM64
OperatingSystemFamily: LINUX
ContainerDefinitions:
- Name: onyx-backend
Image: onyxdotapp/onyx-backend:latest
Cpu: 0
Essential: true
Command:
- "/bin/sh"
- "-c"
- |
alembic upgrade head && echo "Starting Onyx Api Server" && uvicorn onyx.main:app --host 0.0.0.0 --port 8080
PortMappings:
- Name: backend
ContainerPort: 8080
HostPort: 8080
Protocol: tcp
AppProtocol: http
LogConfiguration:
LogDriver: awslogs
Options:
awslogs-group: !Sub /ecs/${Environment}-${ServiceName}
mode: non-blocking
awslogs-create-group: "true"
max-buffer-size: "25m"
awslogs-region: !Ref AWS::Region
awslogs-stream-prefix: ecs
Environment:
- Name: REDIS_HOST
Value: !Sub
- "${Environment}-onyx-redis-service.${ImportedNamespace}"
- ImportedNamespace: !ImportValue
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
- Name: MODEL_SERVER_HOST
Value: !Sub
- "${Environment}-onyx-model-server-inference-service.${ImportedNamespace}"
- ImportedNamespace: !ImportValue
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
- Name: VESPA_HOST
Value: !Sub
- "${Environment}-onyx-vespaengine-service.${ImportedNamespace}"
- ImportedNamespace: !ImportValue
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
- Name: POSTGRES_HOST
Value: !Sub
- "${Environment}-onyx-postgres-service.${ImportedNamespace}"
- ImportedNamespace: !ImportValue
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
- Name: INDEXING_MODEL_SERVER_HOST
Value: !Sub
- "${Environment}-onyx-model-server-indexing-service.${ImportedNamespace}"
- ImportedNamespace: !ImportValue
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
- Name: AUTH_TYPE
Value: disabled
Secrets:
- Name: POSTGRES_PASSWORD
ValueFrom: !Sub arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${Environment}/postgres/user/password
VolumesFrom: []
SystemControls: []
ECSAutoScalingTarget:
Type: AWS::ApplicationAutoScaling::ScalableTarget
DependsOn: ECSService
Properties:
MaxCapacity: 5
MinCapacity: 1
ResourceId: !Sub
- "service/${ImportedCluster}/${Environment}-${ServiceName}-service"
- ImportedCluster: !ImportValue
'Fn::Sub': "${Environment}-onyx-cluster-ECSClusterName"
ServiceName: !Ref ServiceName
Environment: !Ref Environment
ScalableDimension: ecs:service:DesiredCount
ServiceNamespace: ecs
ECSAutoScalingPolicy:
Type: AWS::ApplicationAutoScaling::ScalingPolicy
Properties:
PolicyName: !Sub ${Environment}-${ServiceName}-service-cpu-scaleout
ScalingTargetId: !Ref ECSAutoScalingTarget
PolicyType: TargetTrackingScaling
TargetTrackingScalingPolicyConfiguration:
TargetValue: 75
PredefinedMetricSpecification:
PredefinedMetricType: ECSServiceAverageCPUUtilization
ScaleOutCooldown: 60
ScaleInCooldown: 60
ECSAutoScalingPolicyMemory:
Type: AWS::ApplicationAutoScaling::ScalingPolicy
Properties:
PolicyName: !Sub ${Environment}-${ServiceName}-service-mem-scaleout
ScalingTargetId: !Ref ECSAutoScalingTarget
PolicyType: TargetTrackingScaling
TargetTrackingScalingPolicyConfiguration:
TargetValue: 80
PredefinedMetricSpecification:
PredefinedMetricType: ECSServiceAverageMemoryUtilization
ScaleOutCooldown: 60
ScaleInCooldown: 60

View File

@@ -1,174 +0,0 @@
AWSTemplateFormatVersion: "2010-09-09"
Description: CloudFormation template for Onyx Backend Background Server TaskDefinition
Parameters:
Environment:
Type: String
SubnetIDs:
Type: CommaDelimitedList
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
VpcID:
Type: String
Default: vpc-098cfa79d637dabff
ServiceName:
Type: String
Default: onyx-backend-background-server
TaskCpu:
Type: String
Default: "2048"
TaskMemory:
Type: String
Default: "4096"
TaskDesiredCount:
Type: Number
Default: 1
Resources:
ECSService:
Type: AWS::ECS::Service
Properties:
Cluster:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
CapacityProviderStrategy:
- CapacityProvider: FARGATE
Base: 0
Weight: 1
TaskDefinition: !Ref TaskDefinition
ServiceName: !Sub ${Environment}-${ServiceName}-service
SchedulingStrategy: REPLICA
DesiredCount: !Ref TaskDesiredCount
AvailabilityZoneRebalancing: ENABLED
NetworkConfiguration:
AwsvpcConfiguration:
AssignPublicIp: ENABLED
SecurityGroups:
- Ref: SecurityGroup
Subnets: !Ref SubnetIDs
PlatformVersion: LATEST
DeploymentConfiguration:
MaximumPercent: 200
MinimumHealthyPercent: 100
DeploymentCircuitBreaker:
Enable: true
Rollback: true
DeploymentController:
Type: ECS
ServiceConnectConfiguration:
Enabled: false
ServiceRegistries:
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
Tags:
- Key: app
Value: onyx
- Key: service
Value: !Ref ServiceName
- Key: env
Value: !Ref Environment
EnableECSManagedTags: true
SecurityGroup:
Type: AWS::EC2::SecurityGroup
Properties:
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
GroupName: !Sub ${Environment}-ecs-${ServiceName}
VpcId: !Ref VpcID
SecurityGroupIngress:
- FromPort: 8080
ToPort: 8080
IpProtocol: tcp
CidrIp: 0.0.0.0/0
- FromPort: 8080
ToPort: 8080
IpProtocol: tcp
CidrIpv6: "::/0"
ServiceDiscoveryService:
Type: "AWS::ServiceDiscovery::Service"
Properties:
Name: !Sub ${Environment}-${ServiceName}-service
DnsConfig:
DnsRecords:
- Type: "A"
TTL: 15
NamespaceId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
HealthCheckCustomConfig:
FailureThreshold: 1
TaskDefinition:
Type: AWS::ECS::TaskDefinition
Properties:
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
TaskRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
ExecutionRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
NetworkMode: awsvpc
RequiresCompatibilities:
- FARGATE
Cpu: !Ref TaskCpu
Memory: !Ref TaskMemory
RuntimePlatform:
CpuArchitecture: ARM64
OperatingSystemFamily: LINUX
ContainerDefinitions:
- Name: onyx-backend-background
Image: onyxdotapp/onyx-backend:latest
Cpu: 0
Essential: true
Command:
- "/usr/bin/supervisord"
- "-c"
- "/etc/supervisor/conf.d/supervisord.conf"
PortMappings:
- Name: backend
ContainerPort: 8080
HostPort: 8080
Protocol: tcp
AppProtocol: http
LogConfiguration:
LogDriver: awslogs
Options:
awslogs-group: !Sub /ecs/${Environment}-${ServiceName}
mode: non-blocking
awslogs-create-group: "true"
max-buffer-size: "25m"
awslogs-region: !Ref AWS::Region
awslogs-stream-prefix: ecs
Environment:
- Name: REDIS_HOST
Value: !Sub
- "${Environment}-onyx-redis-service.${ImportedNamespace}"
- ImportedNamespace: !ImportValue
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
- Name: MODEL_SERVER_HOST
Value: !Sub
- "${Environment}-onyx-model-server-inference-service.${ImportedNamespace}"
- ImportedNamespace: !ImportValue
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
- Name: VESPA_HOST
Value: !Sub
- "${Environment}-onyx-vespaengine-service.${ImportedNamespace}"
- ImportedNamespace: !ImportValue
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
- Name: POSTGRES_HOST
Value: !Sub
- "${Environment}-onyx-postgres-service.${ImportedNamespace}"
- ImportedNamespace: !ImportValue
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
- Name: INDEXING_MODEL_SERVER_HOST
Value: !Sub
- "${Environment}-onyx-model-server-indexing-service.${ImportedNamespace}"
- ImportedNamespace: !ImportValue
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
- Name: AUTH_TYPE
Value: disabled
Secrets:
- Name: POSTGRES_PASSWORD
ValueFrom: !Sub arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${Environment}/postgres/user/password
VolumesFrom: []
SystemControls: []

View File

@@ -1,163 +0,0 @@
AWSTemplateFormatVersion: "2010-09-09"
Description: CloudFormation template for Onyx Model Server Indexing TaskDefinition
Parameters:
Environment:
Type: String
SubnetIDs:
Type: CommaDelimitedList
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
VpcID:
Type: String
Default: vpc-098cfa79d637dabff
ServiceName:
Type: String
Default: onyx-model-server-indexing
TaskCpu:
Type: String
Default: "2048"
TaskMemory:
Type: String
Default: "4096"
TaskDesiredCount:
Type: Number
Default: 1
Resources:
ECSService:
Type: AWS::ECS::Service
Properties:
Cluster:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
CapacityProviderStrategy:
- CapacityProvider: FARGATE
Base: 0
Weight: 1
TaskDefinition: !Ref TaskDefinition
ServiceName: !Sub ${Environment}-${ServiceName}-service
SchedulingStrategy: REPLICA
DesiredCount: !Ref TaskDesiredCount
AvailabilityZoneRebalancing: ENABLED
NetworkConfiguration:
AwsvpcConfiguration:
AssignPublicIp: ENABLED
SecurityGroups:
- Ref: SecurityGroup
Subnets: !Ref SubnetIDs
PlatformVersion: LATEST
DeploymentConfiguration:
MaximumPercent: 200
MinimumHealthyPercent: 100
DeploymentCircuitBreaker:
Enable: true
Rollback: true
DeploymentController:
Type: ECS
ServiceConnectConfiguration:
Enabled: false
ServiceRegistries:
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
Tags:
- Key: app
Value: onyx
- Key: service
Value: !Ref ServiceName
- Key: env
Value: !Ref Environment
EnableECSManagedTags: true
SecurityGroup:
Type: AWS::EC2::SecurityGroup
Properties:
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
GroupName: !Sub ${Environment}-ecs-${ServiceName}
VpcId: !Ref VpcID
SecurityGroupIngress:
- FromPort: 9000
ToPort: 9000
IpProtocol: tcp
CidrIp: 0.0.0.0/0
- FromPort: 9000
ToPort: 9000
IpProtocol: tcp
CidrIpv6: "::/0"
ServiceDiscoveryService:
Type: "AWS::ServiceDiscovery::Service"
Properties:
Name: !Sub ${Environment}-${ServiceName}-service
DnsConfig:
DnsRecords:
- Type: "A"
TTL: 15
NamespaceId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
HealthCheckCustomConfig:
FailureThreshold: 1
TaskDefinition:
Type: AWS::ECS::TaskDefinition
Properties:
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
TaskRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
ExecutionRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
NetworkMode: awsvpc
RequiresCompatibilities:
- FARGATE
Cpu: !Ref TaskCpu
Memory: !Ref TaskMemory
RuntimePlatform:
CpuArchitecture: ARM64
OperatingSystemFamily: LINUX
ContainerDefinitions:
- Name: onyx-model-server-indexing
Image: onyxdotapp/onyx-model-server:latest
Cpu: 0
Essential: true
Command:
- "/bin/sh"
- "-c"
- >
if [ "${DISABLE_MODEL_SERVER:-false}" = "True" ]; then echo 'Skipping service...';
exit 0; else exec uvicorn model_server.main:app --host 0.0.0.0 --port 9000; fi
PortMappings:
- Name: model-server
ContainerPort: 9000
HostPort: 9000
Protocol: tcp
AppProtocol: http
Environment:
- Name: LOG_LEVEL
Value: info
- Name: INDEXING_ONLY
Value: True
- Name: VESPA_SEARCHER_THREADS
Value: "1"
MountPoints:
- SourceVolume: efs-volume
ContainerPath: /root/.cache/huggingface/
ReadOnly: false
VolumesFrom: []
LogConfiguration:
LogDriver: awslogs
Options:
awslogs-group: !Sub /ecs/${Environment}-${ServiceName}
mode: non-blocking
awslogs-create-group: "true"
max-buffer-size: "25m"
awslogs-region: !Ref AWS::Region
awslogs-stream-prefix: "ecs"
SystemControls: []
Volumes:
- Name: efs-volume
EFSVolumeConfiguration:
FilesystemId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-efs-OnyxEfsId"
RootDirectory: "/"

View File

@@ -1,200 +0,0 @@
AWSTemplateFormatVersion: "2010-09-09"
Description: CloudFormation template for Onyx Model Server Inference TaskDefinition
Parameters:
Environment:
Type: String
SubnetIDs:
Type: CommaDelimitedList
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
VpcID:
Type: String
Default: vpc-098cfa79d637dabff
ServiceName:
Type: String
Default: onyx-model-server-inference
TaskCpu:
Type: String
Default: "2048"
TaskMemory:
Type: String
Default: "4096"
TaskDesiredCount:
Type: Number
Default: 1
Resources:
ECSService:
Type: AWS::ECS::Service
Properties:
Cluster:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
CapacityProviderStrategy:
- CapacityProvider: FARGATE
Base: 0
Weight: 1
TaskDefinition: !Ref TaskDefinition
ServiceName: !Sub ${Environment}-${ServiceName}-service
SchedulingStrategy: REPLICA
DesiredCount: !Ref TaskDesiredCount
AvailabilityZoneRebalancing: ENABLED
NetworkConfiguration:
AwsvpcConfiguration:
AssignPublicIp: ENABLED
SecurityGroups:
- Ref: SecurityGroup
Subnets: !Ref SubnetIDs
PlatformVersion: LATEST
DeploymentConfiguration:
MaximumPercent: 200
MinimumHealthyPercent: 100
DeploymentCircuitBreaker:
Enable: true
Rollback: true
DeploymentController:
Type: ECS
ServiceConnectConfiguration:
Enabled: false
ServiceRegistries:
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
Tags:
- Key: app
Value: onyx
- Key: service
Value: !Ref ServiceName
- Key: env
Value: !Ref Environment
EnableECSManagedTags: true
SecurityGroup:
Type: AWS::EC2::SecurityGroup
Properties:
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
GroupName: !Sub ${Environment}-ecs-${ServiceName}
VpcId: !Ref VpcID
SecurityGroupIngress:
- FromPort: 9000
ToPort: 9000
IpProtocol: tcp
CidrIp: 0.0.0.0/0
- FromPort: 9000
ToPort: 9000
IpProtocol: tcp
CidrIpv6: "::/0"
ServiceDiscoveryService:
Type: "AWS::ServiceDiscovery::Service"
Properties:
Name: !Sub ${Environment}-${ServiceName}-service
DnsConfig:
DnsRecords:
- Type: "A"
TTL: 15
NamespaceId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
HealthCheckCustomConfig:
FailureThreshold: 1
TaskDefinition:
Type: AWS::ECS::TaskDefinition
Properties:
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
TaskRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
ExecutionRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
NetworkMode: awsvpc
RequiresCompatibilities:
- FARGATE
Cpu: !Ref TaskCpu
Memory: !Ref TaskMemory
RuntimePlatform:
CpuArchitecture: ARM64
OperatingSystemFamily: LINUX
ContainerDefinitions:
- Name: onyx-model-server-inference
Image: onyxdotapp/onyx-model-server:latest
Cpu: 0
Essential: true
Command:
- "/bin/sh"
- "-c"
- >
if [ "${DISABLE_MODEL_SERVER:-false}" = "True" ]; then echo 'Skipping service...';
exit 0; else exec uvicorn model_server.main:app --host 0.0.0.0 --port 9000; fi
PortMappings:
- Name: model-server
ContainerPort: 9000
HostPort: 9000
Protocol: tcp
AppProtocol: http
Environment:
- Name: LOG_LEVEL
Value: info
MountPoints:
- SourceVolume: efs-volume
ContainerPath: /root/.cache/huggingface/
ReadOnly: false
VolumesFrom: []
LogConfiguration:
LogDriver: awslogs
Options:
awslogs-group: !Sub /ecs/${Environment}-${ServiceName}
mode: non-blocking
awslogs-create-group: "true"
max-buffer-size: "25m"
awslogs-region: !Ref AWS::Region
awslogs-stream-prefix: "ecs"
SystemControls: []
Volumes:
- Name: efs-volume
EFSVolumeConfiguration:
FilesystemId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-efs-OnyxEfsId"
RootDirectory: "/"
ECSAutoScalingTarget:
Type: AWS::ApplicationAutoScaling::ScalableTarget
DependsOn: ECSService
Properties:
MaxCapacity: 5
MinCapacity: 1
ResourceId: !Sub
- "service/${ImportedCluster}/${Environment}-${ServiceName}-service"
- ImportedCluster: !ImportValue
'Fn::Sub': "${Environment}-onyx-cluster-ECSClusterName"
ServiceName: !Ref ServiceName
Environment: !Ref Environment
ScalableDimension: ecs:service:DesiredCount
ServiceNamespace: ecs
ECSAutoScalingPolicy:
Type: AWS::ApplicationAutoScaling::ScalingPolicy
Properties:
PolicyName: !Sub ${Environment}-${ServiceName}-service-cpu-scaleout
ScalingTargetId: !Ref ECSAutoScalingTarget
PolicyType: TargetTrackingScaling
TargetTrackingScalingPolicyConfiguration:
TargetValue: 75
PredefinedMetricSpecification:
PredefinedMetricType: ECSServiceAverageCPUUtilization
ScaleOutCooldown: 60
ScaleInCooldown: 60
ECSAutoScalingPolicyMemory:
Type: AWS::ApplicationAutoScaling::ScalingPolicy
Properties:
PolicyName: !Sub ${Environment}-${ServiceName}-service-memory-scaleout
ScalingTargetId: !Ref ECSAutoScalingTarget
PolicyType: TargetTrackingScaling
TargetTrackingScalingPolicyConfiguration:
TargetValue: 80
PredefinedMetricSpecification:
PredefinedMetricType: ECSServiceAverageMemoryUtilization
ScaleOutCooldown: 60
ScaleInCooldown: 60

View File

@@ -1,288 +0,0 @@
AWSTemplateFormatVersion: "2010-09-09"
Description: "The template used to create an ECS Service from the ECS Console."
Parameters:
SubnetIDs:
Type: CommaDelimitedList
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
VpcID:
Type: String
Default: vpc-098cfa79d637dabff
HostedZoneId:
Type: String
Default: ''
DomainName:
Type: String
Default: demo.danswer.ai
Environment:
Type: String
ServiceName:
Type: String
Default: onyx-nginx
OnyxNamespace:
Type: String
Default: onyx
OnyxBackendApiServiceName:
Type: String
Default: onyx-backend-api-server-service
OnyxWebServerServiceName:
Type: String
Default: onyx-web-server-service
TaskCpu:
Type: String
Default: "512"
TaskMemory:
Type: String
Default: "1024"
TaskDesiredCount:
Type: Number
Default: 1
GitHubConfigUrl:
Type: String
Default: "https://raw.githubusercontent.com/onyx-dot-app/onyx/main/deployment/data/nginx/app.conf.template.dev"
Description: "URL to the nginx configuration file on GitHub"
GitHubRunScriptUrl:
Type: String
Default: "https://raw.githubusercontent.com/onyx-dot-app/onyx/main/deployment/data/nginx/run-nginx.sh"
Description: "URL to the nginx run script on GitHub"
Conditions:
CreateRoute53: !Not
- !Equals
- !Ref HostedZoneId
- ''
Resources:
ECSService:
Type: "AWS::ECS::Service"
DependsOn: LoadBalancer
Properties:
Cluster:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
CapacityProviderStrategy:
- CapacityProvider: "FARGATE"
Base: 0
Weight: 1
TaskDefinition: !Ref TaskDefinition
ServiceName: !Sub ${Environment}-${ServiceName}
SchedulingStrategy: "REPLICA"
DesiredCount: !Ref TaskDesiredCount
AvailabilityZoneRebalancing: "ENABLED"
NetworkConfiguration:
AwsvpcConfiguration:
AssignPublicIp: "ENABLED"
SecurityGroups:
- !Ref SecurityGroup
Subnets: !Ref SubnetIDs
PlatformVersion: "LATEST"
DeploymentConfiguration:
MaximumPercent: 200
MinimumHealthyPercent: 100
DeploymentCircuitBreaker:
Enable: true
Rollback: true
DeploymentController:
Type: "ECS"
ServiceConnectConfiguration:
Enabled: false
ServiceRegistries:
- RegistryArn: !GetAtt
- "ServiceDiscoveryService"
- "Arn"
Tags:
- Key: app
Value: onyx
- Key: service
Value: !Ref ServiceName
- Key: env
Value: !Ref Environment
EnableECSManagedTags: true
LoadBalancers:
- ContainerName: nginx
ContainerPort: 80
TargetGroupArn: !Ref TargetGroup
TaskDefinition:
Type: AWS::ECS::TaskDefinition
Properties:
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
ContainerDefinitions:
- Name: nginx
Image: nginx:1.23.4-alpine
Cpu: 0
PortMappings:
- Name: nginx-80-tcp
ContainerPort: 80
HostPort: 80
Protocol: tcp
Essential: true
Command:
- /bin/sh
- -c
- dos2unix /etc/nginx/conf.d/run-nginx.sh && /etc/nginx/conf.d/run-nginx.sh app.conf.template.dev
Environment:
- Name: EMAIL
Value: ""
- Name: DOMAIN
Value: !Ref DomainName
- Name: ONYX_BACKEND_API_HOST
Value: !Sub ${Environment}-${OnyxBackendApiServiceName}.${OnyxNamespace}
- Name: ONYX_WEB_SERVER_HOST
Value: !Sub ${Environment}-${OnyxWebServerServiceName}.${OnyxNamespace}
MountPoints:
- SourceVolume: efs-volume
ContainerPath: /etc/nginx/conf.d
VolumesFrom: []
DependsOn:
- ContainerName: github-sync-container
Condition: SUCCESS
LogConfiguration:
LogDriver: awslogs
Options:
awslogs-group: !Sub /ecs/${Environment}-OnyxNginxTaskDefinition
mode: non-blocking
awslogs-create-group: "true"
max-buffer-size: 25m
awslogs-region: !Ref AWS::Region
awslogs-stream-prefix: ecs
SystemControls: []
- Name: github-sync-container
Image: curlimages/curl:latest
Cpu: 128
MemoryReservation: 256
PortMappings: []
Essential: false
Command:
- sh
- -c
- !Sub |
curl -L ${GitHubConfigUrl} -o /etc/nginx/conf.d/app.conf.template.dev &&
curl -L ${GitHubRunScriptUrl} -o /etc/nginx/conf.d/run-nginx.sh &&
chmod 644 /etc/nginx/conf.d/app.conf.template.dev &&
chmod 755 /etc/nginx/conf.d/run-nginx.sh &&
exit 0 || exit 1
MountPoints:
- SourceVolume: efs-volume
ContainerPath: /etc/nginx/conf.d
VolumesFrom: []
LogConfiguration:
LogDriver: awslogs
Options:
awslogs-group: !Sub /ecs/${Environment}-github-sync-configs-TaskDefinition
mode: non-blocking
awslogs-create-group: "true"
max-buffer-size: 25m
awslogs-region: !Ref AWS::Region
awslogs-stream-prefix: ecs
SystemControls: []
TaskRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
ExecutionRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
NetworkMode: awsvpc
Volumes:
- Name: efs-volume
EFSVolumeConfiguration:
FilesystemId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-efs-OnyxEfsId"
RootDirectory: /
PlacementConstraints: []
RequiresCompatibilities:
- FARGATE
Cpu: !Ref TaskCpu
Memory: !Ref TaskMemory
EnableFaultInjection: false
SecurityGroup:
Type: "AWS::EC2::SecurityGroup"
Properties:
GroupDescription: !Sub "Security group for ${ServiceName}"
GroupName: !Sub ${Environment}-ecs-${ServiceName}
VpcId: !Ref VpcID
SecurityGroupIngress:
- FromPort: 80
ToPort: 80
IpProtocol: "tcp"
CidrIp: "0.0.0.0/0"
- FromPort: 80
ToPort: 80
IpProtocol: "tcp"
CidrIpv6: "::/0"
ServiceDiscoveryService:
Type: "AWS::ServiceDiscovery::Service"
Properties:
Name: !Ref ServiceName
DnsConfig:
DnsRecords:
- Type: "A"
TTL: 15
NamespaceId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
HealthCheckCustomConfig:
FailureThreshold: 1
LoadBalancer:
Type: AWS::ElasticLoadBalancingV2::LoadBalancer
DependsOn: SecurityGroup
Properties:
Type: application
Scheme: internet-facing
Subnets: !Ref SubnetIDs
SecurityGroups:
- !Ref SecurityGroup
LoadBalancerListener:
Type: AWS::ElasticLoadBalancingV2::Listener
Properties:
LoadBalancerArn: !Ref LoadBalancer
Port: 80
Protocol: HTTP
DefaultActions:
- Type: forward
TargetGroupArn: !Ref TargetGroup
TargetGroup:
Type: AWS::ElasticLoadBalancingV2::TargetGroup
Properties:
HealthCheckEnabled: True
HealthCheckIntervalSeconds: 30
HealthCheckPort: 80
HealthCheckPath: /api/health
HealthCheckProtocol: HTTP
HealthCheckTimeoutSeconds: 20
HealthyThresholdCount: 3
Port: 80
Protocol: HTTP
ProtocolVersion: HTTP1
VpcId: !Ref VpcID
TargetType: ip
Route53Record:
Type: AWS::Route53::RecordSet
Condition: CreateRoute53
Properties:
HostedZoneId: !Ref HostedZoneId
Name: !Ref DomainName
Type: A
AliasTarget:
DNSName: !GetAtt LoadBalancer.DNSName
HostedZoneId: !GetAtt LoadBalancer.CanonicalHostedZoneID
EvaluateTargetHealth: false
Outputs:
ECSService:
Description: "The created service."
Value: !Ref "ECSService"
ServiceDiscoveryService:
Value: !Ref "ServiceDiscoveryService"
OutputOnyxLoadBalancerDNSName:
Description: LoadBalancer DNSName
Value: !GetAtt LoadBalancer.DNSName
Export:
Name: !Sub ${AWS::StackName}-OnyxLoadBalancerDNSName

@@ -1,177 +0,0 @@
AWSTemplateFormatVersion: '2010-09-09'
Parameters:
Environment:
Type: String
Default: production
SubnetIDs:
Type: CommaDelimitedList
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
VpcID:
Type: String
Default: vpc-098cfa79d637dabff
ServiceName:
Type: String
Default: onyx-postgres
TaskCpu:
Type: String
Default: "1024"
TaskMemory:
Type: String
Default: "2048"
TaskDesiredCount:
Type: Number
Default: 1
Resources:
ECSService:
Type: AWS::ECS::Service
Properties:
Cluster:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
CapacityProviderStrategy:
- CapacityProvider: FARGATE
Base: 0
Weight: 1
TaskDefinition: !Ref TaskDefinition
ServiceName: !Sub ${Environment}-${ServiceName}-service
SchedulingStrategy: REPLICA
DesiredCount: !Ref TaskDesiredCount
AvailabilityZoneRebalancing: DISABLED
NetworkConfiguration:
AwsvpcConfiguration:
AssignPublicIp: ENABLED
SecurityGroups:
- !Ref SecurityGroup
Subnets: !Ref SubnetIDs
PlatformVersion: LATEST
DeploymentConfiguration:
MaximumPercent: 100
MinimumHealthyPercent: 0
DeploymentCircuitBreaker:
Enable: true
Rollback: true
DeploymentController:
Type: ECS
ServiceConnectConfiguration:
Enabled: false
ServiceRegistries:
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
Tags:
- Key: app
Value: onyx
- Key: service
Value: !Ref ServiceName
- Key: env
Value: !Ref Environment
EnableECSManagedTags: true
SecurityGroup:
Type: AWS::EC2::SecurityGroup
Properties:
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
GroupName: !Sub ${Environment}-${ServiceName}
VpcId: !Ref VpcID
SecurityGroupIngress:
- FromPort: 5432
ToPort: 5432
IpProtocol: tcp
CidrIp: 0.0.0.0/0
- FromPort: 5432
ToPort: 5432
IpProtocol: tcp
CidrIpv6: "::/0"
- FromPort: 2049
ToPort: 2049
IpProtocol: tcp
SourceSecurityGroupId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-efs-EFSSecurityGroupMountTargets"
ServiceDiscoveryService:
Type: "AWS::ServiceDiscovery::Service"
Properties:
Name: !Sub ${Environment}-${ServiceName}-service
DnsConfig:
DnsRecords:
- Type: "A"
TTL: 15
NamespaceId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
HealthCheckCustomConfig:
FailureThreshold: 1
TaskDefinition:
Type: AWS::ECS::TaskDefinition
Properties:
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
TaskRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
ExecutionRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
NetworkMode: awsvpc
RequiresCompatibilities:
- FARGATE
Cpu: !Ref TaskCpu
Memory: !Ref TaskMemory
RuntimePlatform:
CpuArchitecture: ARM64
OperatingSystemFamily: LINUX
Volumes:
- Name: efs-volume-data
EFSVolumeConfiguration:
FilesystemId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-efs-OnyxEfsId"
RootDirectory: "/"
TransitEncryption: ENABLED
AuthorizationConfig:
AccessPointId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-efs-PostgresDataEfsAccessPoint"
ContainerDefinitions:
- Name: !Ref ServiceName
Image: postgres:15.2-alpine
Cpu: 0
Essential: true
StopTimeout: 30
Command:
- "-c"
- "max_connections=250"
PortMappings:
- Name: postgres
ContainerPort: 5432
HostPort: 5432
Protocol: tcp
AppProtocol: http
Environment:
- Name: POSTGRES_USER
Value: postgres
- Name: PGSSLMODE
Value: require
- Name: POSTGRES_DB
Value: postgres
Secrets:
- Name: POSTGRES_PASSWORD
ValueFrom: !Sub arn:aws:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:${Environment}/postgres/user/password
MountPoints:
- SourceVolume: efs-volume-data
ContainerPath: /var/lib/postgresql/data
ReadOnly: false
- SourceVolume: efs-volume-data
ContainerPath: /var/lib/postgresql
ReadOnly: false
User: "1000"
LogConfiguration:
LogDriver: awslogs
Options:
awslogs-group: /ecs/OnyxPostgresTaskDefinition
mode: non-blocking
awslogs-create-group: "true"
max-buffer-size: "25m"
awslogs-region: !Ref AWS::Region
awslogs-stream-prefix: ecs

@@ -1,146 +0,0 @@
AWSTemplateFormatVersion: "2010-09-09"
Description: CloudFormation template for Onyx Redis TaskDefinition
Parameters:
Environment:
Type: String
SubnetIDs:
Type: CommaDelimitedList
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
VpcID:
Type: String
Default: vpc-098cfa79d637dabff
ServiceName:
Type: String
Default: onyx-redis
TaskCpu:
Type: String
Default: "1024"
TaskMemory:
Type: String
Default: "2048"
TaskDesiredCount:
Type: Number
Default: 1
Resources:
ECSService:
Type: AWS::ECS::Service
Properties:
Cluster:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
CapacityProviderStrategy:
- CapacityProvider: FARGATE
Base: 0
Weight: 1
TaskDefinition: !Ref TaskDefinition
ServiceName: !Sub ${Environment}-${ServiceName}-service
SchedulingStrategy: REPLICA
DesiredCount: !Ref TaskDesiredCount
AvailabilityZoneRebalancing: ENABLED
NetworkConfiguration:
AwsvpcConfiguration:
AssignPublicIp: ENABLED
SecurityGroups:
- Ref: SecurityGroup
Subnets: !Ref SubnetIDs
PlatformVersion: LATEST
DeploymentConfiguration:
MaximumPercent: 200
MinimumHealthyPercent: 100
DeploymentCircuitBreaker:
Enable: true
Rollback: true
DeploymentController:
Type: ECS
ServiceConnectConfiguration:
Enabled: false
ServiceRegistries:
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
Tags:
- Key: app
Value: onyx
- Key: service
Value: !Ref ServiceName
- Key: env
Value: !Ref Environment
EnableECSManagedTags: true
SecurityGroup:
Type: AWS::EC2::SecurityGroup
Properties:
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
GroupName: !Sub ${Environment}-ecs-${ServiceName}
VpcId: !Ref VpcID
SecurityGroupIngress:
- FromPort: 6379
ToPort: 6379
IpProtocol: tcp
CidrIp: 0.0.0.0/0
- FromPort: 6379
ToPort: 6379
IpProtocol: tcp
CidrIpv6: "::/0"
ServiceDiscoveryService:
Type: "AWS::ServiceDiscovery::Service"
Properties:
Name: !Sub ${Environment}-${ServiceName}-service
DnsConfig:
DnsRecords:
- Type: "A"
TTL: 15
NamespaceId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
HealthCheckCustomConfig:
FailureThreshold: 1
TaskDefinition:
Type: AWS::ECS::TaskDefinition
Properties:
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
TaskRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
ExecutionRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
NetworkMode: awsvpc
RequiresCompatibilities:
- FARGATE
Cpu: !Ref TaskCpu
Memory: !Ref TaskMemory
RuntimePlatform:
CpuArchitecture: ARM64
OperatingSystemFamily: LINUX
ContainerDefinitions:
- Name: redis
Image: redis:7.4-alpine
Cpu: 0
Essential: true
Command:
- "redis-server"
- "--save"
- "\"\""
- "--appendonly"
- "no"
PortMappings:
- Name: redis_port
ContainerPort: 6379
HostPort: 6379
Protocol: tcp
AppProtocol: http
LogConfiguration:
LogDriver: awslogs
Options:
awslogs-group: !Sub /ecs/${Environment}-${ServiceName}
mode: non-blocking
awslogs-create-group: "true"
max-buffer-size: "25m"
awslogs-region: !Ref AWS::Region
awslogs-stream-prefix: ecs
Environment: []
VolumesFrom: []
SystemControls: []

@@ -1,190 +0,0 @@
AWSTemplateFormatVersion: "2010-09-09"
Description: CloudFormation template for Onyx Vespa Engine TaskDefinition
Parameters:
Environment:
Type: String
SubnetIDs:
Type: CommaDelimitedList
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
VpcID:
Type: String
Default: vpc-098cfa79d637dabff
ServiceName:
Type: String
Default: onyx-vespaengine
TaskCpu:
Type: String
Default: "4096"
TaskMemory:
Type: String
Default: "16384"
TaskDesiredCount:
Type: Number
Default: 1
Resources:
ECSService:
Type: AWS::ECS::Service
Properties:
Cluster:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
CapacityProviderStrategy:
- CapacityProvider: FARGATE
Base: 0
Weight: 1
TaskDefinition: !Ref TaskDefinition
ServiceName: !Sub ${Environment}-${ServiceName}-service
SchedulingStrategy: REPLICA
DesiredCount: !Ref TaskDesiredCount
AvailabilityZoneRebalancing: ENABLED
NetworkConfiguration:
AwsvpcConfiguration:
AssignPublicIp: ENABLED
SecurityGroups:
- Ref: SecurityGroup
Subnets: !Ref SubnetIDs
PlatformVersion: LATEST
DeploymentConfiguration:
MaximumPercent: 200
MinimumHealthyPercent: 100
DeploymentCircuitBreaker:
Enable: true
Rollback: true
DeploymentController:
Type: ECS
ServiceConnectConfiguration:
Enabled: false
ServiceRegistries:
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
Tags:
- Key: app
Value: onyx
- Key: service
Value: !Ref ServiceName
- Key: env
Value: !Ref Environment
EnableECSManagedTags: true
SecurityGroup:
Type: AWS::EC2::SecurityGroup
Properties:
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
GroupName: !Sub ${Environment}-ecs-${ServiceName}
VpcId: !Ref VpcID
SecurityGroupIngress:
- FromPort: 19071
ToPort: 19071
IpProtocol: tcp
CidrIp: 0.0.0.0/0
- FromPort: 19071
ToPort: 19071
IpProtocol: tcp
CidrIpv6: "::/0"
- FromPort: 8081
ToPort: 8081
IpProtocol: tcp
CidrIp: 0.0.0.0/0
- FromPort: 8081
ToPort: 8081
IpProtocol: tcp
CidrIpv6: "::/0"
- FromPort: 2049
ToPort: 2049
IpProtocol: tcp
SourceSecurityGroupId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-efs-EFSSecurityGroupMountTargets"
ServiceDiscoveryService:
Type: "AWS::ServiceDiscovery::Service"
Properties:
Name: !Sub ${Environment}-${ServiceName}-service
DnsConfig:
DnsRecords:
- Type: "A"
TTL: 15
NamespaceId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
HealthCheckCustomConfig:
FailureThreshold: 1
TaskDefinition:
Type: AWS::ECS::TaskDefinition
Properties:
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
TaskRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
ExecutionRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
NetworkMode: awsvpc
RequiresCompatibilities:
- FARGATE
Cpu: !Ref TaskCpu
Memory: !Ref TaskMemory
RuntimePlatform:
CpuArchitecture: ARM64
OperatingSystemFamily: LINUX
ContainerDefinitions:
- Name: vespaengine
Image: vespaengine/vespa:8.277.17
Cpu: 0
Essential: true
PortMappings:
- Name: vespaengine_port
ContainerPort: 19071
HostPort: 19071
Protocol: tcp
AppProtocol: http
- Name: vespaengine_port2
ContainerPort: 8081
HostPort: 8081
Protocol: tcp
AppProtocol: http
MountPoints:
- SourceVolume: efs-volume-data
ContainerPath: /opt/vespa/var
ReadOnly: false
- SourceVolume: efs-volume-tmp
ContainerPath: /var/tmp
ReadOnly: false
LogConfiguration:
LogDriver: awslogs
Options:
awslogs-group: /ecs/OnyxVespaEngineTaskDefinition
mode: non-blocking
awslogs-create-group: "true"
max-buffer-size: "25m"
awslogs-region: !Ref AWS::Region
awslogs-stream-prefix: ecs
User: "1000"
Environment: []
VolumesFrom: []
SystemControls: []
Volumes:
- Name: efs-volume-tmp
EFSVolumeConfiguration:
FilesystemId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-efs-OnyxEfsId"
RootDirectory: "/"
TransitEncryption: ENABLED
AuthorizationConfig:
AccessPointId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-efs-VespaEngineTmpEfsAccessPoint"
- Name: efs-volume-data
EFSVolumeConfiguration:
FilesystemId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-efs-OnyxEfsId"
RootDirectory: "/"
TransitEncryption: ENABLED
AuthorizationConfig:
AccessPointId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-efs-VespaEngineDataEfsAccessPoint"

@@ -1,190 +0,0 @@
AWSTemplateFormatVersion: "2010-09-09"
Description: CloudFormation template for Onyx Web Server TaskDefinition
Parameters:
Environment:
Type: String
SubnetIDs:
Type: CommaDelimitedList
Description: "Comma-delimited list of at least two subnet IDs in different Availability Zones"
VpcID:
Type: String
Default: vpc-098cfa79d637dabff
ServiceName:
Type: String
Default: onyx-web-server
TaskCpu:
Type: String
Default: "1024"
TaskMemory:
Type: String
Default: "2048"
TaskDesiredCount:
Type: Number
Default: 1
Resources:
ECSService:
Type: AWS::ECS::Service
Properties:
Cluster:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSClusterName"
CapacityProviderStrategy:
- CapacityProvider: FARGATE
Base: 0
Weight: 1
TaskDefinition: !Ref TaskDefinition
ServiceName: !Sub ${Environment}-${ServiceName}-service
SchedulingStrategy: REPLICA
DesiredCount: !Ref TaskDesiredCount
AvailabilityZoneRebalancing: ENABLED
NetworkConfiguration:
AwsvpcConfiguration:
AssignPublicIp: ENABLED
SecurityGroups:
- Ref: SecurityGroup
Subnets: !Ref SubnetIDs
PlatformVersion: LATEST
DeploymentConfiguration:
MaximumPercent: 200
MinimumHealthyPercent: 100
DeploymentCircuitBreaker:
Enable: true
Rollback: true
DeploymentController:
Type: ECS
ServiceConnectConfiguration:
Enabled: false
ServiceRegistries:
- RegistryArn: !GetAtt ServiceDiscoveryService.Arn
Tags:
- Key: app
Value: onyx
- Key: service
Value: !Ref ServiceName
- Key: env
Value: !Ref Environment
EnableECSManagedTags: true
SecurityGroup:
Type: AWS::EC2::SecurityGroup
Properties:
GroupDescription: !Sub Onyx SecurityGroup access to EFS mount and ${ServiceName}.
GroupName: !Sub ${Environment}-ecs-${ServiceName}
VpcId: !Ref VpcID
SecurityGroupIngress:
- FromPort: 3000
ToPort: 3000
IpProtocol: tcp
CidrIp: 0.0.0.0/0
- FromPort: 3000
ToPort: 3000
IpProtocol: tcp
CidrIpv6: "::/0"
ServiceDiscoveryService:
Type: "AWS::ServiceDiscovery::Service"
Properties:
Name: !Sub ${Environment}-${ServiceName}-service
DnsConfig:
DnsRecords:
- Type: "A"
TTL: 15
NamespaceId:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespace"
HealthCheckCustomConfig:
FailureThreshold: 1
TaskDefinition:
Type: AWS::ECS::TaskDefinition
Properties:
Family: !Sub ${Environment}-${ServiceName}-TaskDefinition
TaskRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskRole"
ExecutionRoleArn:
Fn::ImportValue:
Fn::Sub: "${Environment}-onyx-cluster-ECSTaskExecutionRole"
NetworkMode: awsvpc
RequiresCompatibilities:
- FARGATE
Cpu: !Ref TaskCpu
Memory: !Ref TaskMemory
RuntimePlatform:
CpuArchitecture: ARM64
OperatingSystemFamily: LINUX
ContainerDefinitions:
- Name: onyx-webserver
Image: onyxdotapp/onyx-web-server:latest
Cpu: 0
Essential: true
PortMappings:
- Name: webserver
ContainerPort: 3000
HostPort: 3000
Protocol: tcp
Environment:
- Name: NEXT_PUBLIC_DISABLE_STREAMING
Value: "false"
- Name: NEXT_PUBLIC_NEW_CHAT_DIRECTS_TO_SAME_PERSONA
Value: "false"
- Name: INTERNAL_URL
Value: !Sub
- "http://${Environment}-onyx-backend-api-server-service.${ImportedNamespace}:8080"
- ImportedNamespace: !ImportValue
Fn::Sub: "${Environment}-onyx-cluster-OnyxNamespaceName"
LogConfiguration:
LogDriver: awslogs
Options:
awslogs-group: !Sub /ecs/${Environment}-${ServiceName}
mode: non-blocking
awslogs-create-group: "true"
max-buffer-size: "25m"
awslogs-region: !Ref AWS::Region
awslogs-stream-prefix: ecs
User: "1000"
VolumesFrom: []
SystemControls: []
ECSAutoScalingTarget:
Type: AWS::ApplicationAutoScaling::ScalableTarget
DependsOn: ECSService
Properties:
MaxCapacity: 5
MinCapacity: 1
ResourceId: !Sub
- "service/${ImportedCluster}/${Environment}-${ServiceName}-service"
- ImportedCluster: !ImportValue
'Fn::Sub': "${Environment}-onyx-cluster-ECSClusterName"
ServiceName: !Ref ServiceName
Environment: !Ref Environment
ScalableDimension: ecs:service:DesiredCount
ServiceNamespace: ecs
ECSAutoScalingPolicy:
Type: AWS::ApplicationAutoScaling::ScalingPolicy
Properties:
PolicyName: !Sub ${Environment}-${ServiceName}-service-cpu-scaleout
ScalingTargetId: !Ref ECSAutoScalingTarget
PolicyType: TargetTrackingScaling
TargetTrackingScalingPolicyConfiguration:
TargetValue: 75
PredefinedMetricSpecification:
PredefinedMetricType: ECSServiceAverageCPUUtilization
ScaleOutCooldown: 60
ScaleInCooldown: 60
ECSAutoScalingPolicyMemory:
Type: AWS::ApplicationAutoScaling::ScalingPolicy
Properties:
PolicyName: !Sub ${Environment}-${ServiceName}-service-memory-scaleout
ScalingTargetId: !Ref ECSAutoScalingTarget
PolicyType: TargetTrackingScaling
TargetTrackingScalingPolicyConfiguration:
TargetValue: 80
PredefinedMetricSpecification:
PredefinedMetricType: ECSServiceAverageMemoryUtilization
ScaleOutCooldown: 60
ScaleInCooldown: 60

@@ -1,76 +0,0 @@
#!/bin/bash
AWS_REGION="${AWS_REGION:-us-west-1}"
# Reference to consolidated config
CONFIG_FILE="onyx_config.json"
# Get environment from config file
ENVIRONMENT=$(jq -r '.Environment' "$CONFIG_FILE")
if [ -z "$ENVIRONMENT" ] || [ "$ENVIRONMENT" == "null" ]; then
echo "Missing Environment in $CONFIG_FILE. Please add the Environment field."
exit 1
fi
# Try to get S3_BUCKET from config, fallback to default if not found
S3_BUCKET_FROM_CONFIG=$(jq -r '.S3Bucket // empty' "$CONFIG_FILE")
if [ -n "$S3_BUCKET_FROM_CONFIG" ]; then
S3_BUCKET="$S3_BUCKET_FROM_CONFIG"
else
S3_BUCKET="${S3_BUCKET:-onyx-ecs-fargate-configs}"
fi
STACK_NAMES=(
"${ENVIRONMENT}-onyx-nginx-service"
"${ENVIRONMENT}-onyx-web-server-service"
"${ENVIRONMENT}-onyx-backend-background-server-service"
"${ENVIRONMENT}-onyx-backend-api-server-service"
"${ENVIRONMENT}-onyx-model-server-inference-service"
"${ENVIRONMENT}-onyx-model-server-indexing-service"
"${ENVIRONMENT}-onyx-vespaengine-service"
"${ENVIRONMENT}-onyx-redis-service"
"${ENVIRONMENT}-onyx-postgres-service"
"${ENVIRONMENT}-onyx-cluster"
"${ENVIRONMENT}-onyx-acm"
"${ENVIRONMENT}-onyx-efs"
)
delete_stack() {
local stack_name=$1
if [ "$stack_name" == "${ENVIRONMENT}-onyx-cluster" ]; then
echo "Removing all objects and directories from the onyx config s3 bucket."
aws s3 rm "s3://${ENVIRONMENT}-${S3_BUCKET}" --recursive
sleep 5
fi
echo "Checking if stack $stack_name exists..."
if aws cloudformation describe-stacks --stack-name "$stack_name" --region "$AWS_REGION" > /dev/null 2>&1; then
echo "Deleting stack: $stack_name..."
aws cloudformation delete-stack \
--stack-name "$stack_name" \
--region "$AWS_REGION"
echo "Waiting for stack $stack_name to be deleted..."
aws cloudformation wait stack-delete-complete \
--stack-name "$stack_name" \
--region "$AWS_REGION"
if [ $? -eq 0 ]; then
echo "Stack $stack_name deleted successfully."
sleep 10
else
echo "Failed to delete stack $stack_name. Exiting."
exit 1
fi
else
echo "Stack $stack_name does not exist, skipping."
return 0
fi
}
for stack_name in "${STACK_NAMES[@]}"; do
delete_stack "$stack_name"
done
echo "All stacks deleted successfully."

@@ -31,11 +31,11 @@ upstream api_server {
# for a TCP configuration
# TODO: use gunicorn to manage multiple processes
server ${ONYX_BACKEND_API_HOST}:8080 fail_timeout=0;
server api_server:8080 fail_timeout=0;
}
upstream web_server {
server ${ONYX_WEB_SERVER_HOST}:3000 fail_timeout=0;
server web_server:3000 fail_timeout=0;
}
server {

@@ -1,8 +1,8 @@
# Log format to include request latency
# Override log format to include request latency
log_format custom_main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for" '
'rt=$request_time';
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for" '
'rt=$request_time';
upstream api_server {
# fail_timeout=0 means we always retry an upstream even if it failed
@@ -13,17 +13,17 @@ upstream api_server {
# for a TCP configuration
# TODO: use gunicorn to manage multiple processes
server ${ONYX_BACKEND_API_HOST}:8080 fail_timeout=0;
server api_server:8080 fail_timeout=0;
}
upstream web_server {
server ${ONYX_WEB_SERVER_HOST}:3000 fail_timeout=0;
server web_server:3000 fail_timeout=0;
}
server {
listen 80 default_server;
client_max_body_size 5G; # Maximum upload size
client_max_body_size 5G; # Maximum upload size
access_log /var/log/nginx/access.log custom_main;
@@ -66,5 +66,5 @@ server {
proxy_redirect off;
proxy_pass http://web_server;
}
}

@@ -1,8 +1,5 @@
# fill in the template
ONYX_BACKEND_API_HOST="${ONYX_BACKEND_API_HOST:-api_server}"
ONYX_WEB_SERVER_HOST="${ONYX_WEB_SERVER_HOST:-web_server}"
envsubst '$DOMAIN $SSL_CERT_FILE_NAME $SSL_CERT_KEY_FILE_NAME $ONYX_BACKEND_API_HOST $ONYX_WEB_SERVER_HOST' < "/etc/nginx/conf.d/$1" > /etc/nginx/conf.d/app.conf
envsubst '$DOMAIN $SSL_CERT_FILE_NAME $SSL_CERT_KEY_FILE_NAME' < "/etc/nginx/conf.d/$1" > /etc/nginx/conf.d/app.conf
# wait for the api_server to be ready
echo "Waiting for API server to boot up; this may take a minute or two..."
@@ -13,7 +10,7 @@ echo
while true; do
# Use curl to send a request and capture the HTTP status code
status_code=$(curl -o /dev/null -s -w "%{http_code}\n" "http://${ONYX_BACKEND_API_HOST}:8080/health")
status_code=$(curl -o /dev/null -s -w "%{http_code}\n" "http://api_server:8080/health")
# Check if the status code is 200
if [ "$status_code" -eq 200 ]; then

@@ -1 +0,0 @@
playwright==1.42.0

@@ -1,161 +0,0 @@
#!/usr/bin/env python3
import argparse
import asyncio
from playwright.async_api import async_playwright
async def scrape_twitter_links(url):
async with async_playwright() as p:
browser = await p.chromium.launch(
headless=False
) # Use non-headless for better scrolling
page = await browser.new_page(viewport={"width": 1280, "height": 800})
print(f"Navigating to main page: {url}")
await page.goto(url)
await page.wait_for_load_state("networkidle")
# More aggressive scrolling to load all company cards
company_links = set() # Use a set for automatic deduplication
no_new_links_count = 0
print("Starting to scroll and collect company links...")
# First, try scrolling to the very bottom
await scroll_to_bottom(page)
# Then collect all links
prev_size = 0
while True:
# Get all company links
elements = await page.query_selector_all('a[href^="/companies/"]')
for element in elements:
href = await element.get_attribute("href")
if href and "/companies/" in href and "?" not in href:
company_url = f"https://www.ycombinator.com{href}"
company_links.add(company_url)
current_size = len(company_links)
print(f"Found {current_size} unique company links so far...")
if current_size == prev_size:
no_new_links_count += 1
if no_new_links_count >= 3:
print("No new links found after multiple attempts, ending scroll.")
break
else:
no_new_links_count = 0
prev_size = current_size
# Try to click "Load More" button if it exists
try:
load_more = await page.query_selector('button:has-text("Load More")')
if load_more:
await load_more.click()
print("Clicked 'Load More' button")
await page.wait_for_timeout(3000)
await scroll_to_bottom(page)
continue
except Exception as e:
print(f"Error clicking Load More: {str(e)}")
# Scroll more
try:
await scroll_to_bottom(page)
except Exception as e:
print(f"Error scrolling: {str(e)}")
break
print(f"Found {len(company_links)} total unique company links after scrolling")
# Visit each company page and extract Twitter links
twitter_data = []
for i, company_url in enumerate(sorted(company_links)):
print(f"Processing company {i+1}/{len(company_links)}: {company_url}")
try:
await page.goto(company_url)
await page.wait_for_load_state("networkidle")
# Extract company name from URL
company_name = company_url.split("/")[-1]
# Find all links on the page
all_links = await page.query_selector_all("a")
twitter_links = []
for link in all_links:
href = await link.get_attribute("href")
if href and ("twitter.com" in href or "x.com" in href):
twitter_links.append(href)
if twitter_links:
for twitter_link in twitter_links:
twitter_data.append(f"{company_name}: {twitter_link}")
else:
twitter_data.append(f"{company_name}: No Twitter/X link found")
except Exception as e:
print(f"Error processing {company_url}: {str(e)}")
await browser.close()
return twitter_data
async def scroll_to_bottom(page):
"""Aggressively scroll to the bottom of the page."""
print("Scrolling to bottom...")
# Get the current height of the page
await page.evaluate("document.body.scrollHeight")
# while True:
# Scroll to bottom
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(2000) # Wait for content to load
# Check if we've reached the bottom
await page.evaluate("document.body.scrollHeight")
# if current_height == prev_height:
# break
# Additional scrolls for extra measure
for _ in range(3):
await page.keyboard.press("End")
await page.wait_for_timeout(500)
async def main():
parser = argparse.ArgumentParser(
description="Scrape Twitter links from YC company pages"
)
parser.add_argument(
"--url",
default="https://www.ycombinator.com/companies?batch=W23&batch=S23&batch=S24&batch=F24&batch=S22&batch=W22&query=San%20Francisco",
help="URL to scrape (default: YC companies from recent batches)",
)
parser.add_argument(
"--output",
default="twitter_links.txt",
help="Output file name (default: twitter_links.txt)",
)
parser.add_argument(
"--headless", action="store_true", help="Run in headless mode (default: False)"
)
args = parser.parse_args()
twitter_links = await scrape_twitter_links(args.url)
# Save to file
with open(args.output, "w") as f:
f.write("\n".join(twitter_links))
print(f"Saved {len(twitter_links)} results to {args.output}")
if __name__ == "__main__":
asyncio.run(main())

File diff suppressed because it is too large

@@ -3018,11 +3018,7 @@ export function ChatPage({
currentAlternativeAssistant
}
messageId={message.messageId}
content={
userFiles
? message.message
: "message.message"
}
content={message.message}
files={message.files}
query={
messageHistory[i]?.query || undefined

@@ -508,11 +508,7 @@ export const AIMessage = ({
userKnowledgeFiles={userKnowledgeFiles}
/>
)}
{userKnowledgeFiles ? (
<div className="h-10 w-10 rounded-full bg-black" />
) : (
<div className="h-10 w-10 rounded-full bg-red-400" />
)}
{!userKnowledgeFiles &&
toolCall &&
!TOOLS_WITH_CUSTOM_HANDLING.includes(

@@ -43,13 +43,13 @@ const DropdownOption: React.FC<DropdownOptionProps> = ({
if (href) {
return (
<Link
<a
href={href}
target={openInNewTab ? "_blank" : undefined}
rel={openInNewTab ? "noopener noreferrer" : undefined}
>
{content}
</Link>
</a>
);
} else {
return <div onClick={onClick}>{content}</div>;

@@ -377,10 +377,7 @@ export function listSourceMetadata(): SourceMetadata[] {
display in the Add Connector page */
const entries = Object.entries(SOURCE_METADATA_MAP)
.filter(
([source, _]) =>
source !== "not_applicable" &&
source !== "ingestion_api" &&
source !== "mock_connector"
      ([source, _]) => source !== "not_applicable" && source !== "ingestion_api"
)
.map(([source, metadata]) => {
return fillSourceMetadata(metadata, source as ValidSources);